让我的C++程序直接阅读网页(5) 一个简单的例子

王朝c/c++·作者佚名  2006-02-01
窄屏简体版  字體: |||超大  

5. 一个简单的例子:这里用一个简单的示例说明如何利用前面几个工具函数,编写出能够从 Web 网页中自动抽取信息的程序。

这个示例程序直接使用 google.com 和 baidu.com 的 Web 搜索网页,检索关键词“思维信息”,选择各自前100个结果,然后把结果输出到 search.html 中。

在 search.html 中,分列显示“Google 搜索结果”和“Baidu 搜索结果”。

事实上 google.com, baidu.com 都能一次显示超过10个的搜索结果,这个例子仅仅是为了演示,编码比较随意, 未必具有实际的价值.

说明:

函数google_search 执行在google.com上搜索并返回前10页 (默认是10/页) 结果

函数baidu_search执行在baidu.com上搜索并返回前10页 (默认是10/页) 结果

函数url_encode 是返回url编码格式的字符串

搜索的关键词直接用 string keyword 设定,搜索请求也是直接拼接在 string path中

通过编译的环境:MS Win2000/XP + MS VC++ 6.0

需要连接的库:winhttp.lib

代码:

// example of accessing Web pages by C++ program

////////////////////////////////////////////////////////////////////////////////////////////////

#include <windows.h>

#include <winhttp.h>

#include <algorithm>

#include <fstream>

#include <iostream>

#include <stack>

#include <string>

using namespace std;

// Number of wchar_t elements in the buffers that google_search() and
// baidu_search() allocate to hold the wide-character host name.
#define WSTR_LENGTH 500

// Type of the numeric error codes caught by main(): the search functions
// throw the value returned by ::GetLastError() and main() catches UInt32.
// NOTE(review): a catch clause matches on exact type, so this presumably has
// to stay the same type as GetLastError()'s return value (DWORD) - confirm
// before changing it to a fixed-width typedef.
typedef unsigned long UInt32;

// Three-way selector with the values TEXT (0), THIS (1) and NEXT (2).
//
// NOTE(review): nothing in this file references the enumerators; they are
// presumably consumed by the HTML parsing helpers from the earlier parts of
// this series - confirm their meaning there before relying on it.
//
// Declared as a plain enum: the former "typedef enum Name { ... };" had no
// declarator, so the typedef keyword was ignored and only caused a warning.
enum Enum_HTMLTagUnpair
{
    HTML_TAGUNPAIR_TEXT,    // 0
    HTML_TAGUNPAIR_THIS,    // 1
    HTML_TAGUNPAIR_NEXT     // 2
};

// Selects between the two HTML form submission methods, GET (0) and POST (1).
//
// NOTE(review): not referenced anywhere in this file; presumably used by the
// form-handling helpers from the earlier parts of this series.
//
// Declared as a plain enum: the former "typedef enum Name { ... };" had no
// declarator, so the typedef keyword was ignored and only caused a warning.
enum Enum_HTMLFormMethod
{
    HTML_FORM_GET,      // 0
    HTML_FORM_POST      // 1
};

// Lookup table of the sixteen uppercase hexadecimal digits, indexed by a
// 4-bit value (0-15). The explicit "\0" is redundant - the literal already
// carries a terminator - and makes the array one element longer than needed.
const char hex_char[]= "0123456789ABCDEF\0";

// The six ASCII whitespace characters: tab, line feed, vertical tab,
// form feed, carriage return and space.
// NOTE(review): not referenced in this file; presumably meant as the
// character set for find_first_of()/find_first_not_of() in the HTML helpers.
const string whitespaces( "\t\n\v\f\r " );

// Encodes a byte string so it can be placed in a URL query string.
//
//   data    - raw bytes to encode (treated as opaque bytes, in whatever
//             multi-byte encoding the caller used).
//   returns - a new string in which
//               * ASCII letters and digits are copied unchanged,
//               * a space (0x20) becomes '+',
//               * every other byte becomes "%XX" with two uppercase
//                 hexadecimal digits.
//
// The output is identical to the previous implementation; the rewrite only
// removes the narrowing of size() to unsigned int, sizes the reservation from
// the input length instead of its capacity, and keeps the digit table local
// so the function does not depend on file-scope data.
std::string
url_encode( const std::string & data )
{
    static const char hex_digits[] = "0123456789ABCDEF";

    const std::string::size_type length = data.size();

    std::string encoded;
    encoded.reserve( length );      // lower bound: escapes only make it longer

    for ( std::string::size_type pos = 0; pos < length; ++pos )
    {
        // Work on the unsigned value so that bytes >= 0x80 yield a valid
        // table index instead of a negative one.
        const unsigned char octet = static_cast<unsigned char>( data[pos] );

        const bool is_digit = ( octet >= '0' && octet <= '9' );
        const bool is_upper = ( octet >= 'A' && octet <= 'Z' );
        const bool is_lower = ( octet >= 'a' && octet <= 'z' );

        if ( is_digit || is_upper || is_lower )
        {
            encoded += static_cast<char>( octet );
        }
        else if ( octet == 0x20 )
        {
            encoded += '+';
        }
        else
        {
            encoded += '%';
            encoded += hex_digits[octet >> 4];
            encoded += hex_digits[octet & 0x0F];
        }
    }

    return encoded;
}

// main

////////////////////////////////////////////////////////////////////////////////////////////////

// Forward declarations of the two search routines defined after main().
// Both append the HTML they extract to the caller-supplied string `res`.
void

google_search( string & res );

void

baidu_search( string & res );

// WinHTTP session handle shared by the whole program: main() opens it with
// WinHttpOpen and closes it before returning; google_search() and
// baidu_search() pass it to WinHttpConnect. NULL while no session is open.
HINTERNET h_inet = NULL;

void

main()

{

ofstream f( "search.html" );

try

{

cout << "Example of accessing Web pages by C++ program\n" << endl;

h_inet = ::WinHttpOpen( L"MyInternet",

WINHTTP_ACCESS_TYPE_DEFAULT_PROXY,

NULL,

NULL,

0 );

if ( h_inet == NULL )

throw GetLastError();

f << "<html><body>\n";

f << "<table width=\"100%\" border=\"1\" cellspacing=\"1\" cellpadding=\"1\"><tr><td width=\"50%\">Google 搜索结果</td><td width=\"50%\">Baidu 搜索结果</td></tr><tr valign=\"top\"><td width=\"50%\">";

string res;

res.reserve( 100000 );

google_search( res );

f << res;

f << "</td><td width=\"50%\">";

res.erase();

baidu_search( res );

f << res;

f << "</td></tr></table>\n";

f << "</body></html>";

}

catch ( UInt32 ex )

{

cout << "Error : " << ex << endl;

}

catch ( std::exception & ex )

{

cout << "Exception : " << ex.what() << endl;

}

catch ( ... )

{

cout << "Exception : unknown\n";

}

cout << endl;

f.close();

if ( h_inet != NULL )

::WinHttpCloseHandle( h_inet );

}

void

google_search( string & res )

{

HINTERNET h_site = NULL;

wchar_t *w_str = new wchar_t[WSTR_LENGTH];

try

{

string obj = "思维信息";

string site = "www.google.com";

string path = "/search?hl=zh-CN";

path += "&q=" + url_encode( obj );

path += "&bntG=" + url_encode( "Google 搜索" );

path += "&lr=";

int i = ::mbstowcs( w_str, site.c_str(), site.size() );

w_str[i] = L'\0';

h_site = ::WinHttpConnect( h_inet,

w_str,

INTERNET_DEFAULT_HTTP_PORT,

0 );

if ( h_site == NULL )

throw ::GetLastError();

string read_page;

read_page.reserve( 20000 );

int loop = 10;

bool b_fin = false;

while ( ! b_fin )

{

read_page.erase();

getHttpFile( h_site,

read_page,

site,

path );

if ( read_page.empty() )

throw std::exception( "mainPageEmpty" );

getHTMLObject( obj,

read_page,

"div",

1 );

res.append( obj );

getHTMLLink( path,

read_page,

1,

"下一页" );

cout << path << endl;

b_fin = ( path.empty() || ! --loop );

}

::WinHttpCloseHandle( h_site );

delete [] w_str;

}

catch ( ... )

{

delete [] w_str;

if ( h_site != NULL )

::WinHttpCloseHandle( h_site );

throw;

}

}

void

baidu_search( string & res )

{

HINTERNET h_site = NULL;

wchar_t *w_str = new wchar_t[WSTR_LENGTH];

try

{

string obj = "思维信息";

string site = "www.baidu.com";

string path = "/s?";

path += "wd=" + url_encode( obj );

path += "&cl=3";

int i = ::mbstowcs( w_str, site.c_str(), site.size() );

w_str[i] = L'\0';

h_site = ::WinHttpConnect( h_inet,

w_str,

INTERNET_DEFAULT_HTTP_PORT,

0 );

if ( h_site == NULL )

throw ::GetLastError();

string read_page;

read_page.reserve( 20000 );

int loop = 10;

bool b_fin = false;

while ( ! b_fin )

{

read_page.erase();

getHttpFile( h_site,

read_page,

site,

path );

if ( read_page.empty() )

throw std::exception( "mainPageEmpty" );

int I = 15;

for ( i = 5; i < I; ++i )

{

getHTMLObject( obj,

read_page,

"table",

i );

res.append( obj );

res.append( "<br>" );

}

getHTMLLink( path,

read_page,

1,

"下一页" );

cout << path << endl;

b_fin = ( path.empty() || ! --loop );

}

::WinHttpCloseHandle( h_site );

delete [] w_str;

}

catch ( ... )

{

delete [] w_str;

if ( h_site != NULL )

::WinHttpCloseHandle( h_site );

throw;

}

}

 
 
 
免责声明:本文为网络用户发布,其观点仅代表作者个人观点,与本站无关,本站仅提供信息存储服务。文中陈述内容未经本站证实,其真实性、完整性、及时性本站不作任何保证或承诺,请读者仅作参考,并请自行核实相关内容。
 
 
© 2005- 王朝網路 版權所有 導航