HtmlStreamTokenizer 是一个纯 Java 编写的 HTML 解析器,它把 HTML 拆分成三种类型的记号:标签(tags)、注释(comments)和文本(text)。它类似于 StreamTokenizer 类,但 HtmlStreamTokenizer 处理的是 HTML 数据流,可以用来解析 HTML 文件。
下面是一个使用示例:
import adc.parser.*;
//
// Walk the HTML input token by token and print every tag, text run and comment.
HtmlStreamTokenizer tokenizer = new HtmlStreamTokenizer(inputstream);
// A single HtmlTag instance is reused; it is handed to parseTag for each tag token.
HtmlTag parsedTag = new HtmlTag();
while (true) {
    // Advance to the next token; stop once the tokenizer reports end of input.
    if (tokenizer.nextToken() == HtmlStreamTokenizer.TT_EOF) {
        break;
    }

    final int kind = tokenizer.getTokenType();

    if (kind == HtmlStreamTokenizer.TT_TAG) {
        // Parse the raw tag string into parsedTag, then print its string form.
        tokenizer.parseTag(tokenizer.getStringValue(), parsedTag);
        String rendered = parsedTag.toString();
        System.out.println("tag: " + rendered);
        continue;
    }

    if (kind == HtmlStreamTokenizer.TT_TEXT) {
        String text = tokenizer.getStringValue();
        System.out.println("text: " + text);
        continue;
    }

    if (kind == HtmlStreamTokenizer.TT_COMMENT) {
        // The comment delimiters are added back around the token value when printing.
        String body = tokenizer.getStringValue();
        System.out.println("comment: <!--" + body + "-->");
    }
    // Any other token type is ignored.
}
下载地址