函数:
getHTMLObject
getHTMLObjectText
原型:
void
getHTMLObject( string & obj,
const string & htmltext,
const string & tag,
UInt32 tagindex,
UInt32 unpair = HTML_TAGUNPAIR_TEXT )
void
getHTMLObjectText( string & obj,
const string & htmltext,
const string & tag,
UInt32 tagindex,
UInt32 unpair = HTML_TAGUNPAIR_TEXT )
参数:
obj [out]
返回检索的元素/元素内容, 见说明
htmltext [in]
需要检索的HTML文本, htmltext应经过规范化处理
tag [in]
需要检索的HTML元素名称,如 html, body, div, table, form, a 等等各种HTML文档中使用的元素标签。
tagindex [in]
需要检索的HTML元素的序号,即从文本起始的第tagindex个标签
取值为 1,2,3,......
unpair [in]
配对设定。
当在htmltext中检索到不配对的元素标签, 即没有对应的</tag>时,返回的内容根据此参数设定为:
* HTML_TAGUNPAIR_TEXT
返回元素的内容文本,舍弃元素定义部分
* HTML_TAGUNPAIR_THIS
返回元素定义部分,舍弃元素内容
* HTML_TAGUNPAIR_NEXT
返回元素定义和内容,一直到下一个同名的元素起始
即, 如果是 <tag>content<tag>, 则返回 <tag>content
说明:
这两个函数的功能类似于 DHTML 中的 outerHTML (getHTMLObject) 和 innerHTML (getHTMLObjectText); 返回内容中嵌套的标签不会被去除, 因此与 outerText/innerText 并不完全相同
getHTMLObject, 读取的内容包括元素本身定义,形式如 <tag>content</tag>
getHTMLObjectText, 读取的内容不包括元素本身定义,对形式如 <tag>content</tag>的元素,返回content,并删除前后的空白
函数对输入的HTML文本没有作HTML有效性检验(validation)
代码:
void
getHTMLObject( string & obj,
const string & htmltext,
const string & tag,
UInt32 tagindex,
UInt32 unpair )
{
if ( htmltext.empty()
|| tag.empty() )
return;
string start_tag( "<" + tag );
UInt32 e_pos = string::npos;
UInt32 s_pos = string::npos;
UInt32 tpos = 0;
// find the start tag
for ( UInt32 i = 1; i <= tagindex; i++ )
{
tpos = htmltext.find( start_tag.c_str(), tpos );
if ( tpos != string::npos )
tpos += start_tag.size() ;
else
i = tagindex + 1;
}
UInt32 h_sz = htmltext.size();
if ( tpos != string::npos )
{
s_pos = tpos - start_tag.size();
++tpos;
// find the end tag
stack<UInt32> s_tags;
s_tags.push( s_pos );
bool found = false;
while ( !found )
{
tpos += tag.size();
tpos = htmltext.find( tag, tpos );
if ( tpos != string::npos )
{
--tpos;
switch ( htmltext[tpos] )
{
case '<':
s_tags.push( tpos );
break;
case '/':
if ( htmltext[tpos - 1] == '<'
&& htmltext[tpos + tag.size() + 1] == '>' )
{
s_tags.pop();
found = s_tags.empty();
}
break;
default:
break;
}
++tpos;
}
else
found = true;
}
if ( tpos != string::npos )
e_pos = tpos + tag.size() + 1;
else
{
e_pos = h_sz;
switch ( unpair )
{
case HTML_TAGUNPAIR_TEXT:
break;
case HTML_TAGUNPAIR_THIS:
tpos = htmltext.find( ">", s_pos + start_tag.size() );
if ( tpos != string::npos )
e_pos = tpos + 1;
break;
case HTML_TAGUNPAIR_NEXT:
while ( s_tags.size() > 1 )
{
tpos = s_tags.top();
s_tags.pop();
}
if ( tpos != string::npos )
e_pos = tpos;
break;
default:
break;
}
}
}
if ( s_pos != string::npos )
obj = htmltext.substr( s_pos, e_pos - s_pos );
else
obj.resize( 0 );
}
void
getHTMLObjectText( string & obj,
const string & htmltext,
const string & tag,
UInt32 tagindex,
UInt32 unpair )
{
if ( htmltext.empty()
|| tag.empty() )
return;
string whitespaces( "\t\n\v\f\r " );
string start_tag( "<" + tag );
UInt32 e_pos = string::npos;
UInt32 s_pos = string::npos;
UInt32 tpos = 0;
// find the start tag
for ( UInt32 i = 1; i <= tagindex; i++ )
{
tpos = htmltext.find( start_tag.c_str(), tpos );
if ( tpos != string::npos )
tpos += start_tag.size();
else
i = tagindex + 1;
}
UInt32 h_sz = htmltext.size();
if ( tpos != string::npos )
{
tpos = htmltext.find( ">", tpos );
if ( tpos != string::npos )
{
++tpos;
s_pos = tpos;
// find the end tag
stack<UInt32> s_tags;
s_tags.push( s_pos );
tpos -= tag.size();
bool found = false;
while ( !found )
{
tpos += tag.size();
tpos = htmltext.find( tag, tpos );
if ( tpos != string::npos )
{
--tpos;
switch ( htmltext[tpos] )
{
case '<':
s_tags.push( tpos );
break;
case '/':
if ( htmltext[tpos - 1] == '<'
&& htmltext[tpos + tag.size() + 1] == '>' )
{
s_tags.pop();
found = s_tags.empty();
}
break;
default:
break;
}
++tpos;
}
else
found = true;
}
if ( tpos != string::npos )
e_pos = tpos - 2;
else
{
e_pos = h_sz;
switch ( unpair )
{
case HTML_TAGUNPAIR_TEXT:
break;
case HTML_TAGUNPAIR_THIS:
e_pos = s_pos;
break;
case HTML_TAGUNPAIR_NEXT:
while ( s_tags.size() > 1 )
{
tpos = s_tags.top();
s_tags.pop();
}
if ( tpos != string::npos )
e_pos = tpos;
break;
default:
break;
}
}
}
}
if ( s_pos != string::npos
&& s_pos != e_pos )
{
obj = htmltext.substr( s_pos, e_pos - s_pos );
// trim left whitespaces
tpos = obj.find_first_not_of( whitespaces, 0 );
if ( tpos > 0 )
obj.erase( obj.begin(), obj.begin() + tpos );
// trim right whitespaces
tpos = obj.find_last_not_of( whitespaces );
if ( tpos < obj.size() - 1 )
obj.erase( obj.begin() + tpos + 1, obj.end() );
}
else
obj.resize( 0 );
}