让我的C++程序直接阅读网页(6) 小结

6. 小结：“让我的C++程序直接阅读网页”这个系列记录了我以前编写、目前仍在使用的部分 Web 工具的编程经历，以此代替文档的编写整理。同时也对一些需要改进完善的地方作个小结，以利提高。

(1) 从示例的 baidu_search 中可以看出，在搜索结果网页里每条搜索结果用一个 table 显示，所以每次 get table 都要从网页第一个字符开始扫描，这是一个比较严重的效率陷阱。

改进的方法：

一种是从前次检索过的位置开始新的检索，实现比较简单，只要 getHTMLObject 返回一个 end position。但是只在顺序检索时有效，随机检索时陷阱仍然存在。

另一种是建立 element index, 实现比较复杂，但有许多好处，更接近于 parser engine。方法是改进 canonHTML，在扫描全文的同时建立tag的索引，索引至少需要这几个信息

index tuple = ( name, start position, end position, tag begin, tag end )

然后 getHTMLObject 可以直接从索引表中检索，可以随机地检索各个元素

(2) 到目前为止，除了getHTMLLink中的 href, HTML的各种元素名称、关键词都没有出现过，基本上是按照W3C XML的标准在做。省略了许多东西，如基本的namespace，reference等等。所以实际上是一个简易通用的XML文档检索，当然带来一个好处是也能检索 RSS

—— 也许可以 DIY RSS Reader

(3) 在使用WinHttp 时，基本上是引用MS给的示例代码，没有改进。其中频繁地分配/释放缓冲区的作法是低效的。最好的方法是避免使用缓冲区，避免进行内存拷贝，直接输出结果。

具体解决时可以参照HTTP协议文本RFC2616

(4) HTTP Cookies

有些应用场合可能会需要使用Cookie, 下面是一段用 WinHttp读出Cookie的代码，可以直接嵌入 getHTTPFile函数，作为参考

wchar_t *p_wbuf = NULL;

UInt32 h_id = WINHTTP_NO_HEADER_INDEX;

{

l_0 = 0;

l_1 = h_id;

b_res = ::WinHttpQueryHeaders( h_file,

WINHTTP_QUERY_SET_COOKIE,

WINHTTP_HEADER_NAME_BY_INDEX,

NULL,

&l_0,

&h_id );

if ( ! b_res

|| l_0 == 0 )

continue;

p_wbuf = new wchar_t[l_0 / sizeof(wchar_t) + 1];

h_id = l_1;

b_res = ::WinHttpQueryHeaders( h_file,

WINHTTP_QUERY_SET_COOKIE,

WINHTTP_HEADER_NAME_BY_INDEX,

p_wbuf,

&l_0,

&h_id );

if ( b_res )

{

l_0 = ::wcslen( p_wbuf ) + 1;

char *p_str = new char[l_0];

::wcstombs( p_str, p_wbuf, l_0 );

// 保存 http_cookie( p_str ) ;

delete [] p_str;

}

delete [] p_wbuf;

} while ( b_res );

http_cookie类的示例：

// Holds the pieces of one HTTP "Set-Cookie" header value:
//   name=value[; Domain=...][; Path=...][; Expires=...][; Secure]
class http_cookie
{
public:

    // Creates an empty cookie, or parses p when it is not NULL.
    // A failed parse is not reported here; call parse() directly to get the result.
    http_cookie( const char * p = NULL )
        : name( "" ), value( "" ), domain( "" ), path( "" ), expire( "" ), secure( false )
    {
        if ( p != NULL )
            parse( p );
    }

    // Placeholder: matching is not implemented yet and always reports "no match".
    bool match()
    {
        return false;
    }

    // Parses a Set-Cookie header value into the public fields below.
    //
    // text    : the header value, e.g. "sid=1; Domain=.a.com; Path=/; Secure".
    // returns : false when text has no '=' or when '=' is its last character;
    //           true otherwise. All fields are reset before parsing, so after a
    //           failed parse the cookie is empty.
    //
    // Attribute names are compared case-insensitively. "Expires" is accepted,
    // and so is the non-standard spelling "expire". A repeated attribute
    // overwrites the earlier one (RFC 6265 also uses the last occurrence).
    // Unknown attributes are ignored. A cookie without any attribute ("a=b")
    // is valid and parses successfully.
    bool parse( const std::string & text )
    {
        typedef std::string::size_type size_type;

        name.clear();
        value.clear();
        domain.clear();
        path.clear();
        expire.clear();
        secure = false;

        const size_type t_sz = text.size();

        // parse name/value
        const size_type eq = text.find( '=' );

        if ( eq == std::string::npos
             || eq + 1 == t_sz )
            return false;

        size_type end = text.find( ';', eq );

        if ( end == std::string::npos )
            end = t_sz;     // no attributes: the value runs to the end of text

        name.assign( text, 0, eq );
        value.assign( text, eq + 1, end - eq - 1 );

        // parse attributes: one ';'-separated token per pass
        while ( end < t_sz )
        {
            size_type begin = end + 1;

            end = text.find( ';', begin );

            if ( end == std::string::npos )
                end = t_sz;

            // skip the blank(s) that normally follow ';'
            while ( begin < end && is_blank( text[begin] ) )
                ++begin;

            // split the token at its own '=' (a flag such as Secure has none)
            size_type sep = text.find( '=', begin );

            if ( sep == std::string::npos || sep > end )
                sep = end;

            size_type key_end = sep;

            while ( key_end > begin && is_blank( text[key_end - 1] ) )
                --key_end;

            const std::string key = lower_ascii( text.substr( begin, key_end - begin ) );
            const std::string val = ( sep < end )
                                    ? text.substr( sep + 1, end - sep - 1 )
                                    : std::string();

            if ( key == "domain" )
                domain = val;
            else if ( key == "path" )
                path = val;
            else if ( key == "expires" || key == "expire" )
                expire = val;
            else if ( key == "secure" )
                secure = true;
        }

        return true;
    }

    // elements
    std::string name;
    std::string value;
    std::string domain;
    std::string path;
    std::string expire;
    bool secure;

private:

    // True for the blanks allowed around an attribute name.
    static bool is_blank( char c )
    {
        return c == ' ' || c == '\t';
    }

    // Returns s with the ASCII letters A-Z folded to lower case.
    static std::string lower_ascii( std::string s )
    {
        for ( std::string::size_type i = 0; i < s.size(); ++i )
        {
            if ( s[i] >= 'A' && s[i] <= 'Z' )
                s[i] = static_cast<char>( s[i] - 'A' + 'a' );
        }

        return s;
    }
};