分享
 
 
 

Delphi通过MSHTML实现一个HTML解析类

王朝学院·作者佚名  2010-01-07
窄屏简体版  字體: |||超大  

最近经常会模拟网页提交返回网页源码,然后获得网页中相应的元素,于是需要常常解析Html中相应的各种元素,网络是个好东西,搜索一番,就找到了好几个Delphi版本的HtmlParser的类库,试着使用了几个,发现解析起来都不完整,或多或少地会出现一些问题!于是想到了如果界面上有一个浏览器,我们可以通过WebBrowser的Document接口对网页元素进行操作,很是方便!但是模拟网页提交,界面上是不一定要出现WebBrowser的,肯定有办法,不通过WebBrowser就直接解析HTML的,那便是我不要WebBrowser这个外壳,只要他里面的Document文档接口对象就能实现对Html的解析了,查找了一番MSDN,然后Google一下,果然可行,构建方法如下:

// Create the IHTMLDocument2 interface: ask COM for an HTMLDocument object and store it in FHtmlDoc

CoCreateInstance(CLASS_HTMLDocument, nil, CLSCTX_INPROC_SERVER, IID_IHTMLDocument2, FHtmlDoc);

接口创建好了之后就能够对文档元素进行解析了,很是爽快!

结合了我自己的特有操作,我对Combobox,Table,Frame等一些网页元素做了相应的封装,实现了一个HTMLParser,大致代码如下:

这里只给出声明,代码请在最后下载

代码

(******************************************************)

(* 得闲工作室 *)

(* 网页元素操作类库 *)

(* *)

(* DxHtmlElement Unit *)

(* Copyright(c) 2008-2010 不得闲 *)

(* email:appleak46@yahoo.com.cn QQ:75492895 *)

(******************************************************)

unit DxHtmlElement;

interface

uses Windows,sysUtils,Clipbrd,MSHTML,ActiveX,OleCtrls,Graphics,TypInfo;

{ Element-type predicates.
  Each function takes an IHTMLElement and returns a Boolean. Only the
  declarations appear in this listing; the implementations are not shown, so
  the exact matching rules cannot be confirmed here.
  NOTE(review): judging by the names, each presumably reports whether the
  element is of the named kind (select, password input, text input, table,
  checkbox, radio button, ...) - verify against the implementation. }

function IsSelectElement(eleElement: IHTMLElement): Boolean;

function IsPwdElement(eleElement: IHTMLElement): Boolean;

function IsTextElement(element: IHTMLElement): boolean;

function IsTableElement(element: IHTMLElement): Boolean;

function IsElementCollection(element: IHTMLElement): Boolean;

function IsChkElement(element: IHTMLElement): boolean;

function IsRadioBtnElement(element: IHTMLElement): boolean;

function IsMemoElement(element: IHTMLElement): boolean;

function IsFormElement(element: IHTMLElement): boolean;

function IsIMGElement(element: IHTMLElement): boolean;

function IsInIMGElement(element: IHTMLElement): boolean;

function IsLabelElement(element: IHTMLElement): boolean;

function IsLinkElement(element: IHTMLElement): boolean;

function IsListElement(element: IHTMLElement): boolean;

function IsControlElement(element: IHTMLElement): boolean;

function IsObjectElement(element: IHTMLElement): boolean;

function IsFrameElement(element: IHTMLElement): boolean;

function IsInPutBtnElement(element: IHTMLElement): boolean;

function IsInHiddenElement(element: IHTMLElement): boolean;

function IsSubmitElement(element: IHTMLElement): boolean;

{ Image-element helpers (declarations only; implementations are not shown).
  All of them take the document to search as their first parameter.
  NOTE(review): judging by the names and parameters, GetPicIndex and
  GetPicElement look up an image by name/src/alt, and the three GetRegCodePic
  overloads return an image as a TPicture selected by name/src/alt, by index,
  or from an image element directly - confirm against the implementation,
  including who owns (and must free) the returned TPicture. }

function GetPicIndex(doc: IHTMLDocument2; Src: string; Alt: string): Integer;

function GetPicElement(doc: IHTMLDocument2;imgName: string;src: string;Alt: string): IHTMLImgElement;

function GetRegCodePic(doc: IHTMLDocument2;ImgName: string; Src: string; Alt: string): TPicture; overload;

function GetRegCodePic(doc: IHTMLDocument2;Index: integer): TPicture; overload;

function GetRegCodePic(doc: IHTMLDocument2;element: IHTMLIMGElement): TPicture;overload;

type

// Function-pointer type with the stdcall signature (LRESULT, IID, WPARAM, out object) -> HRESULT.
// NOTE(review): presumably used to call the ObjectFromLresult API through a dynamically
// loaded entry point - confirm against the implementation, which is not part of this listing.
TObjectFromLResult = function(LRESULT: lResult;const IID: TIID; WPARAM: wParam;out pObject): HRESULT; stdcall;

// Element kinds; this is the result type of GetElementType.
// The identifiers (including the spelling ELE_UNKNOW) are part of the public interface.
TEleMentType = (ELE_UNKNOW,ELE_TEXT,ELE_PWD,ELE_SELECT,ELE_CHECKBOX,ELE_RADIOBTN,ELE_MEMO,ELE_FORM,ELE_IMAGE,

ELE_LABEL,ELE_LINK,ELE_LIST,ELE_CONTROL,ELE_OBJECT,ELE_FRAME,ELE_INPUTBTN,ELE_INIMAGE,ELE_INHIDDEN);

// Classifies an element as one of the TEleMentType values (declaration only).
function GetElementType(element: IHTMLELEMENT): TEleMentType;

// Returns a string for the element's type (declaration only; the exact text is not visible here).
function GetElementTypeName(element: IHTMLELEMENT): string;

// Table helpers (declarations only). The first group works on an IHTMLTable;
// the GetWebBrowserHtmlTable* group takes a document plus table/row/column
// indexes, returns a Boolean and delivers its text through the var parameter ResValue.
// NOTE(review): presumably the Boolean signals whether the lookup succeeded and the
// indexes are zero-based - confirm against the implementation.
function GetHtmlTableCell(aTable: IHTMLTable;aRow,aCol: Integer): IHTMLElement;

function GetHtmlTable(aDoc: IHTMLDocument2; aIndex: Integer): IHTMLTable;

function GetWebBrowserHtmlTableCellText(Doc: IHTMLDocument2;

const TableIndex, RowIndex, ColIndex: Integer;var ResValue: string): Boolean;

function GetHtmlTableRowHtml(aTable: IHTMLTable; aRow: Integer): IHTMLElement;

function GetWebBrowserHtmlTableCellHtml(Doc: IHTMLDocument2;

const TableIndex,RowIndex,ColIndex: Integer;var ResValue: string): Boolean;

// NOTE(review): "GeHtmlTableHtml" looks like a typo for "GetHtmlTableHtml"; the name is
// left unchanged because callers outside this listing may depend on it.
function GeHtmlTableHtml(aTable: IHTMLTable; aRow: Integer): IHTMLElement;

function GetWebBrowserHtmlTableHtml(Doc: IHTMLDocument2;

const TableIndex,RowIndex: Integer;var ResValue: string): Boolean;

type

// Forward declarations: TDxWebFrame refers to both collection classes before they are declared.
TDxWebFrameCollection = class;

TDxWebElementCollection = class;

// Load state reported by TDxWebFrame.ReadyState.
TLoadState = (Doc_Loading,Doc_Completed,Doc_Invalidate);

{ Wrapper around an IHTMLWINDOW2 frame interface (field FFrame, exposed
  read/write through the Frame property). All other properties are read-only
  and served by the private Get* methods; their implementations are not part
  of this listing. }
TDxWebFrame = class

private

// The wrapped frame window interface.
FFrame: IHTMLWINDOW2;

// NOTE(review): presumably lazily created helper objects returned by the
// ElementCollections / Frames properties and owned by this object - confirm.
FElementCollections: TDxWebElementCollection;

FWebFrameCollections: TDxWebFrameCollection;

function GetSrc: string;

function GetElementCount: integer;

function GetWebFrameCollections: TDxWebFrameCollection;

function GetElementCollections: TDxWebElementCollection;

function GetDocument: IHTMLDOCUMENT2;

function GetReadState: TLoadState;

function GetIsLoaded: boolean;

procedure SetFrame(const Value: IHTMLWINDOW2);

function GetName: string;

public

// Takes the frame window interface to wrap.
Constructor Create(IFrame: IHTMLWINDOW2);

Destructor Destroy;override;

property Frame: IHTMLWINDOW2 read FFrame write SetFrame;

property Src: string read GetSrc;

// The frame's document interface.
property Document: IHTMLDOCUMENT2 read GetDocument;

property Name: string read GetName;

// Child frames, as a TDxWebFrameCollection.
property Frames: TDxWebFrameCollection read GetWebFrameCollections;

property ElementCount: integer read GetElementCount;

property ElementCollections: TDxWebElementCollection read GetElementCollections;

property ReadyState: TLoadState read GetReadState;

property IsLoaded: boolean read GetIsLoaded;

end;

{ Wrapper around an IHTMLFramesCollection2. Frames can be fetched either as
  raw IHTMLWINDOW2 interfaces or as TDxWebFrame objects, by index or by name.
  Declaration only; the accessor implementations are not part of this listing. }
TDxWebFrameCollection = Class

private

// The wrapped frames collection interface.
FFrameCollection: IHTMLFramesCollection2;

// NOTE(review): a single TDxWebFrame field - presumably one shared wrapper that
// FrameByIndex / FrameByName re-point on every call, so a returned object may
// change when the property is read again. Confirm against the implementation.
Frame: TDxWebFrame;

function GetCount: integer;

function GetFrameInterfaceByIndex(index: integer): IHTMLWINDOW2;

function GetFrameInterfaceByName(Name: string): IHTMLWINDOW2;

function GetFrameByIndex(index: integer): TDxWebFrame;

function GetFrameByName(Name: string): TDxWebFrame;

procedure SetFrameCollection(const Value: IHTMLFramesCollection2);

public

// Takes the frames collection interface to wrap.
Constructor Create(ACollection: IHTMLFramesCollection2);

Destructor Destroy;override;

property FrameCollection: IHTMLFramesCollection2 read FFrameCollection write SetFrameCollection;

property Count: integer read GetCount;

// Raw interface access.
property FrameInterfaceByIndex[index: integer]: IHTMLWINDOW2 read GetFrameInterfaceByIndex;

property FrameInterfaceByName[Name: string]: IHTMLWINDOW2 read GetFrameInterfaceByName;

// Wrapped access.
property FrameByIndex[index: integer]: TDxWebFrame read GetFrameByIndex;

property FrameByName[Name: string]: TDxWebFrame read GetFrameByName;

end;

{ Wrapper around an IHTMLElementCollection (field FCollection, exposed
  read/write through the Collection property). Elements can be looked up by
  name, by index, or by name plus index. Declaration only; the accessor
  implementations are not part of this listing. }
TDxWebElementCollection = class

private

// The wrapped element collection interface.
FCollection: IHTMLElementCollection;

// NOTE(review): presumably a single shared child wrapper returned by
// ChildElementCollection and owned by this object - confirm.
FChildCollection: TDxWebElementCollection;

function GetCollection(index: String): TDxWebElementCollection;

function GetCount: integer;

function GetElement(itemName: string; index: integer): IHTMLElement;

function GetElementByName(itemName: string): IHTMLELEMENT;

function GetElementByIndex(index: integer): IHTMLELEMENT;

procedure SetCollection(const Value: IHTMLElementCollection);

public

// Takes the element collection interface to wrap.
Constructor Create(ACollection: IHTMLElementCollection);

Destructor Destroy;override;

property Collection: IHTMLElementCollection read FCollection write SetCollection;

// Note: the index of this property is a string, not an integer.
property ChildElementCollection[index: String]: TDxWebElementCollection read GetCollection;

property ElementCount: integer read GetCount;

property Element[itemName: string;index: integer]: IHTMLElement read GetElement;

property ElementByName[itemName: string]: IHTMLELEMENT read GetElementByName;

property ElementByIndex[index: integer]: IHTMLELEMENT read GetElementByIndex;

end;

// Descendant of TDxWebElementCollection that adds no members of its own.
TLinkCollection = class(TDxWebElementCollection)

end;

// Forward declaration: TDxTableCollection refers to TDxWebTable before it is declared.
TDxWebTable = class;

{ Access to the tables of an IHTMLDOCUMENT2. Tables can be fetched either as
  raw IHTMLTABLE interfaces or as TDxWebTable objects, by name or by index.
  Declaration only; the accessor implementations are not part of this listing. }
TDxTableCollection = class

private

// NOTE(review): presumably the collection of table elements taken from FDocument - confirm.
FTableCollection: IHTMLElementCollection;

// The document whose tables are exposed (read/write through the Document property).
FDocument: IHTMLDOCUMENT2;

// NOTE(review): a single TDxWebTable field - presumably one shared wrapper that
// TableByName / TableByIndex re-point on every call. Confirm against the implementation.
FWebTable: TDxWebTable;

function GetTableInterfaceByName(AName: string): IHTMLTABLE;

procedure SetDocument(Value: IHTMLDOCUMENT2);

function GetTableInterfaceByIndex(index: integer): IHTMLTABLE;

function GetCount: integer;

function GetTableByIndex(index: integer): TDxWebTable;

function GetTableByName(AName: string): TDxWebTable;

public

// Takes the document whose tables should be exposed.
Constructor Create(Doc: IHTMLDOCUMENT2);

destructor Destroy;override;

// Raw interface access.
property TableInterfaceByName[AName: string]: IHTMLTABLE read GetTableInterfaceByName;

property TableInterfaceByIndex[index: integer]: IHTMLTABLE read GetTableInterfaceByIndex;

// Wrapped access.
property TableByName[AName: string]: TDxWebTable read GetTableByName;

property TableByIndex[index: integer]: TDxWebTable read GetTableByIndex;

property Document: IHTMLDOCUMENT2 read FDocument write SetDocument;

property Count: integer read GetCount;

end;

{ Wrapper around an IHTMLTABLE interface (field FTableInterface, exposed
  read/write through the TableInterface property). Declaration only; the
  accessor implementations are not part of this listing.
  The class declares no destructor of its own. }
TDxWebTable = class

private

// The wrapped table interface.
FTableInterface: IHTMLTABLE;

function GetRowCount: integer;

procedure SetTableInterface(const Value: IHTMLTABLE);

function GetCell(ACol, ARow: integer): string;

function GetRowColCount(RowIndex: integer): integer;

function GetInnerHtml: string;

function GetInnerText: string;

function GetCellElement(ACol, ARow: Integer): IHTMLTableCell;

public

// Takes the table interface to wrap.
Constructor Create(ATable: IHTMLTABLE);

property TableInterface: IHTMLTABLE read FTableInterface write SetTableInterface;

property RowCount: integer read GetRowCount;

// Cell content as a string. Note the index order: column first, then row.
property Cell[ACol: integer;ARow: integer]: string read GetCell;

// The cell as an IHTMLTableCell interface; same index order (column, row).
property CellElement[ACol: Integer;ARow: Integer]: IHTMLTableCell read GetCellElement;

// Column count of one row, selected by RowIndex.
property RowColCount[RowIndex: integer]: integer read GetRowColCount;

property InnerHtml: string read GetInnerHtml;

property InnerText: string read GetInnerText;

end;

{ Wrapper around an IHTMLSelectElement (field FHtmlSelect, exposed read/write
  through the CombInterface property), offering a combobox-like API: item
  count, selected index, item lookup, and add/insert/remove.
  Declaration only; the method implementations are not part of this listing.
  The class declares no destructor of its own. }
TDxWebCombobox = class

private

// The wrapped select element interface.
FHtmlSelect: IHTMLSelectElement;

function GetCount: Integer;

procedure SetItemIndex(const Value: Integer);

function GetItemIndex: Integer;

function GetName: string;

procedure SetName(const Value: string);

function GetValue: string;

procedure SetValue(const Value: string);

procedure SetCombInterface(const Value: IHTMLSelectElement);

function GetItemByName(EleName: string): string;

function GetItemByIndex(index: integer): string;

function GetItemAttribute(index: Integer; AttribName: string): OleVariant;

public

// Takes the select element interface to wrap.
constructor Create(AWebCombo: IHTMLSelectElement);

// Item management; the items are passed as IHTMLElement.
// NOTE(review): presumably option elements, with zero-based indexes - confirm.
procedure Add(Ele: IHTMLElement);

procedure Insert(Ele: IHTMLElement;Index: Integer);

procedure Remove(index: Integer);

property CombInterface: IHTMLSelectElement read FHtmlSelect write SetCombInterface;

property Count: Integer read GetCount;

property ItemIndex: Integer read GetItemIndex write SetItemIndex;

// Item lookup returning a string (which text of the item is returned is not visible here).
property ItemByIndex[index: integer]: string read GetItemByIndex;

property ItemByName[EleName: string]: string read GetItemByName;

// A named attribute of the item at the given index, as an OleVariant.
property ItemAttribute[index: Integer;AttribName: string]: OleVariant read GetItemAttribute;

property Name: string read GetName write SetName;

property value: string read GetValue write SetValue;

end;

implementation

end.

HTMLParser解析类的代码实现单元

代码

(******************************************************)

(* 得闲工作室 *)

(* HTML解析单元库 *)

(* *)

(* DxHtmlParser Unit *)

(* Copyright(c) 2008-2010 不得闲 *)

(* email:appleak46@yahoo.com.cn QQ:75492895 *)

(******************************************************)

unit DxHtmlParser;

interface

uses Windows,MSHTML,ActiveX,ComObj,DxHtmlElement,Forms;

type

{ HTML parser built on a standalone MSHTML document object (no TWebBrowser).
  Assign markup to the HTML property, then inspect it through WebTables,
  WebElements and WebCombobox. The helper objects returned by these properties
  are created in the constructor and freed in the destructor, so callers must
  not free them. }
TDxHtmlParser = class

private

// The MSHTML document created in the constructor.
FHtmlDoc: IHTMLDocument2;

// The markup last assigned through the HTML property.
FHTML: string;

// Table access object, created in the constructor for FHtmlDoc.
FWebTables: TDxTableCollection;

// Element access object; created with a nil collection, SetHTML assigns FHtmlDoc.all to it.
FWebElements: TDxWebElementCollection;

// The one combobox wrapper handed out by WebCombobox; GetWebCombobox re-points it on every call.
FWebComb: TDxWebCombobox;

procedure SetHTML(const Value: string);

function GetWebCombobox(AName: string): TDxWebCombobox;

public

constructor Create;

destructor Destroy;override;

// Writing a value different from the current one loads it into the document body.
property HTML: string read FHTML write SetHTML;

property WebTables: TDxTableCollection read FWebTables;

property WebElements: TDxWebElementCollection read FWebElements;

// Looks up an element by name and returns it through the shared combobox wrapper;
// returns nil while WebElements.Collection is nil. Because the same wrapper object
// is returned every time, a later read re-points a previously returned reference.
property WebCombobox[Name: string]: TDxWebCombobox read GetWebCombobox;

end;

implementation

{ TDxHtmlParser }

{ Builds a parser around a standalone MSHTML document (no TWebBrowser host).

  Steps: initialize COM for the calling thread, create the HTMLDocument
  object, switch it to design mode, pump messages until the document reports
  readyState 'complete', then create the helper wrappers.

  Raises EOleSysError (through OleCheck) when the HTMLDocument object cannot
  be created. The previous version relied on Assert alone, which is compiled
  out when assertions are disabled and would then fail later with an access
  violation on a nil interface. }
constructor TDxHtmlParser.Create;
begin
  inherited Create;

  // The result is deliberately not inspected: Destroy calls CoUninitialize
  // unconditionally, which balances both the S_OK and the S_FALSE outcome.
  // NOTE(review): a failing CoInitialize (e.g. thread already initialized with
  // a different concurrency model) is still not tracked - confirm whether
  // that case matters for callers.
  CoInitialize(nil);

  // Create the IHTMLDocument2 interface; a failing HRESULT becomes an exception.
  OleCheck(CoCreateInstance(CLASS_HTMLDocument, nil, CLSCTX_INPROC_SERVER, IID_IHTMLDocument2, FHtmlDoc));
  Assert(FHtmlDoc<>nil,'构建HTMLDocument接口失败');

  FHtmlDoc.Set_designMode('On'); // design mode, so scripts are not executed

  // Wait until the document reports 'complete', pumping messages meanwhile.
  // Note that this loop has no timeout.
  while not (FHtmlDoc.readyState = 'complete') do
  begin
    Sleep(1);
    Application.ProcessMessages;
  end;

  FWebTables := TDxTableCollection.Create(FHtmlDoc);
  // No element collection yet; SetHTML assigns one once markup is loaded.
  FWebElements := TDxWebElementCollection.Create(nil);
  // Shared wrapper that GetWebCombobox re-points on demand.
  FWebComb := TDxWebCombobox.Create(nil);
end;

{ Frees the helper wrappers, releases the MSHTML document and then closes the
  COM library for this thread.

  The document reference is cleared explicitly before CoUninitialize. An
  interface field is otherwise released only after the destructor body has
  finished, i.e. after COM has already been shut down for the thread, and
  releasing a COM object at that point can crash. }
destructor TDxHtmlParser.Destroy;
begin
  // The wrappers hold interface references into the document, so they go first.
  FWebTables.Free;
  FWebElements.Free;
  FWebComb.Free;

  // Drop our own document reference while COM is still initialized.
  FHtmlDoc := nil;

  // Balances the CoInitialize call made in the constructor.
  CoUninitialize;

  inherited;
end;

{ Returns the shared combobox wrapper pointed at the element named AName.

  AName: name used to look the element up in FWebElements.
  Result: nil while FWebElements has no collection; otherwise FWebComb. Its
  CombInterface is the select element found, or nil when no element of that
  name exists or the element is not a select element.

  The previous version used "as IHTMLSelectElement", which raises
  EIntfCastError when the named element exists but is of another kind. That
  case is now reported exactly like a missing element.

  The returned object is owned by the parser and is re-pointed by every call,
  so callers must not free it or keep it across calls. }
function TDxHtmlParser.GetWebCombobox(AName: string): TDxWebCombobox;
var
  Found: IHTMLElement;
  SelectIntf: IHTMLSelectElement;
begin
  Result := nil;
  if FWebElements.Collection = nil then
    Exit;

  Found := FWebElements.ElementByName[AName];

  // QueryInterface reports an unsupported interface through its HRESULT
  // instead of raising, so a non-select element simply yields nil here.
  SelectIntf := nil;
  if Found <> nil then
    if not Succeeded(Found.QueryInterface(IHTMLSelectElement, SelectIntf)) then
      SelectIntf := nil;

  FWebComb.CombInterface := SelectIntf;
  Result := FWebComb;
end;

{ Loads new markup into the document and refreshes the element collection.

  Value: the HTML to parse. Assigning the value that is already cached is a
  no-op.

  The cached text (FHTML) is updated only after the document has accepted the
  markup. The previous version stored it first, so if the DOM update raised,
  the cache no longer matched the document and a retry with the same string
  was silently skipped.

  NOTE(review): FWebTables is not touched here; it was created for the same
  document object - confirm that it reads the tables on demand. }
procedure TDxHtmlParser.SetHTML(const Value: string);
begin
  if FHTML = Value then
    Exit;

  // Replace the body content with the new markup.
  FHtmlDoc.body.innerHTML := Value;
  // Point the element wrapper at the document's current "all" collection.
  FWebElements.Collection := FHtmlDoc.all;

  // Remember the markup only once the document reflects it.
  FHTML := Value;
end;

end.

 
 
 
免责声明:本文为网络用户发布,其观点仅代表作者个人观点,与本站无关,本站仅提供信息存储服务。文中陈述内容未经本站证实,其真实性、完整性、及时性本站不作任何保证或承诺,请读者仅作参考,并请自行核实相关内容。
2023年上半年GDP全球前十五强
 百态   2023-10-24
美众议院议长启动对拜登的弹劾调查
 百态   2023-09-13
上海、济南、武汉等多地出现不明坠落物
 探索   2023-09-06
印度或要将国名改为“巴拉特”
 百态   2023-09-06
男子为女友送行,买票不登机被捕
 百态   2023-08-20
手机地震预警功能怎么开?
 干货   2023-08-06
女子4年卖2套房花700多万做美容:不但没变美脸,面部还出现变形
 百态   2023-08-04
住户一楼被水淹 还冲来8头猪
 百态   2023-07-31
女子体内爬出大量瓜子状活虫
 百态   2023-07-25
地球连续35年收到神秘规律性信号,网友:不要回答!
 探索   2023-07-21
全球镓价格本周大涨27%
 探索   2023-07-09
钱都流向了那些不缺钱的人,苦都留给了能吃苦的人
 探索   2023-07-02
倩女手游刀客魅者强控制(强混乱强眩晕强睡眠)和对应控制抗性的关系
 百态   2020-08-20
美国5月9日最新疫情:美国确诊人数突破131万
 百态   2020-05-09
荷兰政府宣布将集体辞职
 干货   2020-04-30
倩女幽魂手游师徒任务情义春秋猜成语答案逍遥观:鹏程万里
 干货   2019-11-12
倩女幽魂手游师徒任务情义春秋猜成语答案神机营:射石饮羽
 干货   2019-11-12
倩女幽魂手游师徒任务情义春秋猜成语答案昆仑山:拔刀相助
 干货   2019-11-12
倩女幽魂手游师徒任务情义春秋猜成语答案天工阁:鬼斧神工
 干货   2019-11-12
倩女幽魂手游师徒任务情义春秋猜成语答案丝路古道:单枪匹马
 干货   2019-11-12
倩女幽魂手游师徒任务情义春秋猜成语答案镇郊荒野:与虎谋皮
 干货   2019-11-12
倩女幽魂手游师徒任务情义春秋猜成语答案镇郊荒野:李代桃僵
 干货   2019-11-12
倩女幽魂手游师徒任务情义春秋猜成语答案镇郊荒野:指鹿为马
 干货   2019-11-12
倩女幽魂手游师徒任务情义春秋猜成语答案金陵:小鸟依人
 干货   2019-11-12
倩女幽魂手游师徒任务情义春秋猜成语答案金陵:千金买邻
 干货   2019-11-12
 
推荐阅读
 
 
 
>>返回首頁<<
 
靜靜地坐在廢墟上,四周的荒凉一望無際,忽然覺得,淒涼也很美
© 2005- 王朝網路 版權所有