WEB开发网
开发学院WEB开发ASP Delphi通过MSHTML实现一个HTML解析类 阅读

Delphi通过MSHTML实现一个HTML解析类

 2010-01-06 10:43:09 来源:WEB开发网   
核心提示:最近经常会模拟网页提交返回网页源码,然后获得网页中相应的元素,于是需要常常解析Html中相应的各种元素。网络是个好东西,搜索一番,就找到了好几个Delphi版本的HtmlParser的类库。本文介绍Delphi通过MSHTML实现一个HTML解析类(HTMLParser),大致代码如下,这里只给出声明。
最近经常会模拟网页提交返回网页源码,然后获得网页中相应的元素,于是需要常常解析Html中相应的各种元素。网络是个好东西,搜索一番,就找到了好几个Delphi版本的HtmlParser的类库,试着使用了几个,发现解析起来都不完整,或多或少地会出现一些问题!于是想到:如果界面上有一个浏览器,我们可以通过WebBrowser的Document接口对网页元素进行操作,很是方便!但是模拟网页提交时,界面上不一定要出现WebBrowser,肯定有办法不通过WebBrowser就直接解析HTML,那便是不要WebBrowser这个外壳,只要它里面的Document文档接口对象,就能实现对Html的解析了。查找了一番MSDN,然后Google一下,果然可行,构建方法如下:

//创建IHTMLDocument2接口
 CoCreateInstance(CLASS_HTMLDocument, nil, CLSCTX_INPROC_SERVER, IID_IHTMLDocument2, FHtmlDoc);

接口创建好了之后就能够对文档元素进行解析了,很是爽快!

结合了我自己的特有操作,我对Combobox,Table,Frame等一些网页元素做了相应的封装,实现了一个HTMLParser,大致代码如下:

这里只给出声明,代码请在最后下载

代码
(******************************************************)
(*        得闲工作室             *)
(*       网页元素操作类库           *)
(*                          *)
(*       DxHtmlElement Unit          *)
(*  Copyright(c) 2008-2010 不得闲         *)
(*  email:appleak46@yahoo.com.cn   QQ:75492895  *)
(******************************************************)
unit DxHtmlElement;

interface
uses Windows,sysUtils,Clipbrd,MSHTML,ActiveX,OleCtrls,Graphics,TypInfo;

{ Element-type predicates.
  Each function takes an IHTMLElement and returns a Boolean. Only the
  declarations appear in this listing, so the exact matching rule of each
  predicate cannot be confirmed here; presumably each tests for the element
  kind named in its identifier -- verify against the implementation. }
function IsSelectElement(eleElement: IHTMLElement): Boolean;
function IsPwdElement(eleElement: IHTMLElement): Boolean;
function IsTextElement(element: IHTMLElement): boolean;
function IsTableElement(element: IHTMLElement): Boolean;
function IsElementCollection(element: IHTMLElement): Boolean;
function IsChkElement(element: IHTMLElement): boolean;
function IsRadioBtnElement(element: IHTMLElement): boolean;
function IsMemoElement(element: IHTMLElement): boolean;
function IsFormElement(element: IHTMLElement): boolean;
function IsIMGElement(element: IHTMLElement): boolean;
function IsInIMGElement(element: IHTMLElement): boolean;
function IsLabelElement(element: IHTMLElement): boolean;
function IsLinkElement(element: IHTMLElement): boolean;
function IsListElement(element: IHTMLElement): boolean;
function IsControlElement(element: IHTMLElement): boolean;
function IsObjectElement(element: IHTMLElement): boolean;
function IsFrameElement(element: IHTMLElement): boolean;
function IsInPutBtnElement(element: IHTMLElement): boolean;
function IsInHiddenElement(element: IHTMLElement): boolean;
function IsSubmitElement(element: IHTMLElement): boolean;
{ Image-element helpers.
  All of them take the IHTMLDocument2 to search. The ImgName/Src/Alt string
  parameters are presumably matched against the corresponding attributes of
  image elements -- TODO confirm against the implementation. }
function GetPicIndex(doc: IHTMLDocument2; Src: string; Alt: string): Integer;
function GetPicElement(doc: IHTMLDocument2;imgName: string;src: string;Alt: string): IHTMLImgElement;
// Three overloads returning a TPicture: located by (ImgName, Src, Alt), by
// Index, or taken directly from an IHTMLIMGElement.
// NOTE(review): ownership of the returned TPicture is not visible here --
// confirm whether the caller must free it.
function GetRegCodePic(doc: IHTMLDocument2;ImgName: string; Src: string; Alt: string): TPicture; overload;
function GetRegCodePic(doc: IHTMLDocument2;Index: integer): TPicture; overload;
function GetRegCodePic(doc: IHTMLDocument2;element: IHTMLIMGElement): TPicture;overload;

type
 // Procedural type for a stdcall function that receives an LRESULT, an IID and
 // a WPARAM, returns an HRESULT and yields an object through the untyped out
 // parameter.
 // NOTE(review): presumably used to bind a dynamically loaded
 // ObjectFromLresult-style export -- confirm against the implementation.
 TObjectFromLResult = function(LRESULT: lResult;const IID: TIID; WPARAM: wParam;out pObject): HRESULT; stdcall;
 // Enumeration of element kinds returned by GetElementType. ELE_UNKNOW is the
 // first (ordinal 0) value; the identifier spelling is part of the public
 // interface and is kept as is.
 TEleMentType = (ELE_UNKNOW,ELE_TEXT,ELE_PWD,ELE_SELECT,ELE_CHECKBOX,ELE_RADIOBTN,ELE_MEMO,ELE_FORM,ELE_IMAGE,
 ELE_LABEL,ELE_LINK,ELE_LIST,ELE_CONTROL,ELE_OBJECT,ELE_FRAME,ELE_INPUTBTN,ELE_INIMAGE,ELE_INHIDDEN);


// Classifies an element as one of the TEleMentType values.
function GetElementType(element: IHTMLELEMENT): TEleMentType;
// Returns a string for the element's type.
// NOTE(review): the exact text returned is not visible in this listing.
function GetElementTypeName(element: IHTMLELEMENT): string;
// Table access helpers. Note the parameter order here is row first, then column.
function GetHtmlTableCell(aTable: IHTMLTable;aRow,aCol: Integer): IHTMLElement;
// Returns a table of the document selected by aIndex.
// NOTE(review): whether aIndex is 0-based is not visible here -- confirm.
function GetHtmlTable(aDoc: IHTMLDocument2; aIndex: Integer): IHTMLTable;
// The GetWebBrowser... functions return a Boolean and deliver their string
// result through the var parameter ResValue; presumably the Boolean signals
// whether the requested table/row/cell was found -- TODO confirm.
function GetWebBrowserHtmlTableCellText(Doc: IHTMLDocument2;
     const TableIndex, RowIndex, ColIndex: Integer;var ResValue: string):  Boolean;
function GetHtmlTableRowHtml(aTable: IHTMLTable; aRow: Integer): IHTMLElement;

function GetWebBrowserHtmlTableCellHtml(Doc: IHTMLDocument2;
     const TableIndex,RowIndex,ColIndex: Integer;var ResValue: string):  Boolean;
// NOTE(review): "GeHtmlTableHtml" looks like a typo for "GetHtmlTableHtml";
// the name is exported from the unit, so it is left unchanged for callers.
function GeHtmlTableHtml(aTable: IHTMLTable; aRow: Integer): IHTMLElement;
function GetWebBrowserHtmlTableHtml(Doc: IHTMLDocument2;
     const TableIndex,RowIndex: Integer;var ResValue: string):  Boolean;

type
 // Forward declarations: TDxWebFrame refers to both collection classes before
 // they are fully declared.
 TDxWebFrameCollection = class;
 TDxWebElementCollection = class;


 // Load state reported by TDxWebFrame.ReadyState.
 // NOTE(review): how these values map onto the document's readyState string is
 // not visible in this listing -- confirm against the implementation.
 TLoadState = (Doc_Loading,Doc_Completed,Doc_Invalidate);

 // Object wrapper around a frame window (IHTMLWINDOW2).
 // Exposes the frame's source, name, document, nested frames and element
 // collection through read-only properties; only Frame itself is writable.
 TDxWebFrame = class
 private
  FFrame: IHTMLWINDOW2;                          // wrapped frame window
  // Helper objects backing the ElementCollections and Frames properties.
  // NOTE(review): presumably created and freed by this class -- confirm.
  FElementCollections: TDxWebElementCollection;
  FWebFrameCollections: TDxWebFrameCollection;
  function GetSrc: string;
  function GetElementCount: integer;
  function GetWebFrameCollections: TDxWebFrameCollection;
  function GetElementCollections: TDxWebElementCollection;
  function GetDocument: IHTMLDOCUMENT2;
  function GetReadState: TLoadState;             // getter for ReadyState (note the spelling differs from the property)
  function GetIsLoaded: boolean;
  procedure SetFrame(const Value: IHTMLWINDOW2);
  function GetName: string;
 public
  // IFrame: the frame window to wrap.
  Constructor Create(IFrame: IHTMLWINDOW2);
  Destructor Destroy;override;
  property Frame: IHTMLWINDOW2 read FFrame write SetFrame;
  property Src: string read GetSrc;
  property Document: IHTMLDOCUMENT2 read GetDocument;
  property Name: string read GetName;
  property Frames: TDxWebFrameCollection read GetWebFrameCollections;
  property ElementCount: integer read GetElementCount;
  property ElementCollections: TDxWebElementCollection read GetElementCollections;
  property ReadyState: TLoadState read GetReadState;
  property IsLoaded: boolean read GetIsLoaded; 
 end;


 // Object wrapper around an IHTMLFramesCollection2.
 // Frames can be fetched by index or by name, either as the raw IHTMLWINDOW2
 // interface or as a TDxWebFrame wrapper.
 TDxWebFrameCollection = Class
 private
  FFrameCollection: IHTMLFramesCollection2;      // wrapped frames collection
  // Single TDxWebFrame field (named without the usual "F" prefix).
  // NOTE(review): presumably one shared wrapper that FrameByIndex/FrameByName
  // re-point on every access, so an earlier result would change when the next
  // frame is requested -- confirm against the implementation.
  Frame: TDxWebFrame;
  function GetCount: integer;
  function GetFrameInterfaceByIndex(index: integer): IHTMLWINDOW2;
  function GetFrameInterfaceByName(Name: string): IHTMLWINDOW2;
  function GetFrameByIndex(index: integer): TDxWebFrame;
  function GetFrameByName(Name: string): TDxWebFrame;
  procedure SetFrameCollection(const Value: IHTMLFramesCollection2);
 public
  // ACollection: the frames collection to wrap.
  Constructor Create(ACollection: IHTMLFramesCollection2);
  Destructor Destroy;override;
  property FrameCollection: IHTMLFramesCollection2 read FFrameCollection write SetFrameCollection;
  property Count: integer read GetCount;
  // Raw interface access.
  property FrameInterfaceByIndex[index: integer]: IHTMLWINDOW2 read GetFrameInterfaceByIndex;
  property FrameInterfaceByName[Name: string]: IHTMLWINDOW2 read GetFrameInterfaceByName;

  // Wrapped access.
  property FrameByIndex[index: integer]: TDxWebFrame read GetFrameByIndex;
  property FrameByName[Name: string]: TDxWebFrame read GetFrameByName;
 end;
 
 // Object wrapper around an IHTMLElementCollection.
 // Elements can be looked up by name, by index, or by name plus index; a
 // sub-collection can be obtained through ChildElementCollection.
 TDxWebElementCollection = class
 private
  FCollection: IHTMLElementCollection;           // wrapped collection; may be nil (the parser constructs this class with nil)
  // NOTE(review): presumably a single wrapper reused for every
  // ChildElementCollection access -- confirm against the implementation.
  FChildCollection: TDxWebElementCollection;
  function GetCollection(index: String): TDxWebElementCollection;
  function GetCount: integer;
  function GetElement(itemName: string; index: integer): IHTMLElement;
  function GetElementByName(itemName: string): IHTMLELEMENT;
  function GetElementByIndex(index: integer): IHTMLELEMENT;
  procedure SetCollection(const Value: IHTMLElementCollection);
 public
  // ACollection: the collection to wrap.
  Constructor Create(ACollection: IHTMLElementCollection);
  Destructor Destroy;override;
  property Collection: IHTMLElementCollection read FCollection write SetCollection;
  // Note: despite its name, the "index" parameter of this property is a string.
  property ChildElementCollection[index: String]: TDxWebElementCollection read GetCollection;
  property ElementCount: integer read GetCount;
  property Element[itemName: string;index: integer]: IHTMLElement read GetElement;
  property ElementByName[itemName: string]: IHTMLELEMENT read GetElementByName;
  property ElementByIndex[index: integer]: IHTMLELEMENT read GetElementByIndex;
 end;

 // Descendant of TDxWebElementCollection that adds no members of its own.
 TLinkCollection = class(TDxWebElementCollection)
 
 end;
 // Forward declaration: TDxTableCollection refers to TDxWebTable before it is
 // fully declared.
 TDxWebTable = class;

 // Gives access to the tables of an IHTMLDOCUMENT2, by name or by index,
 // either as the raw IHTMLTABLE interface or as a TDxWebTable wrapper.
 TDxTableCollection = class
 private
  FTableCollection: IHTMLElementCollection;      // NOTE(review): presumably the document's table elements -- confirm
  FDocument: IHTMLDOCUMENT2;                     // document being inspected
  // NOTE(review): a single TDxWebTable field suggests TableByName/TableByIndex
  // hand out one shared wrapper that is re-pointed on each access -- confirm.
  FWebTable: TDxWebTable;
  function GetTableInterfaceByName(AName: string): IHTMLTABLE;
  procedure SetDocument(Value: IHTMLDOCUMENT2);
  function GetTableInterfaceByIndex(index: integer): IHTMLTABLE;
  function GetCount: integer;
  function GetTableByIndex(index: integer): TDxWebTable;
  function GetTableByName(AName: string): TDxWebTable;
 public
  // Doc: the document whose tables are exposed.
  Constructor Create(Doc: IHTMLDOCUMENT2);
  destructor Destroy;override;
  // Raw interface access.
  property TableInterfaceByName[AName: string]: IHTMLTABLE read GetTableInterfaceByName;
  property TableInterfaceByIndex[index: integer]: IHTMLTABLE read GetTableInterfaceByIndex;

  // Wrapped access.
  property TableByName[AName: string]: TDxWebTable read GetTableByName;
  property TableByIndex[index: integer]: TDxWebTable read GetTableByIndex;
  
  property Document: IHTMLDOCUMENT2 read FDocument write SetDocument;
  property Count: integer read GetCount;
 end;

 // Object wrapper around a single IHTMLTABLE.
 // Declares no destructor; its only field is an interface reference.
 TDxWebTable = class
 private
  FTableInterface: IHTMLTABLE;                   // wrapped table
  function GetRowCount: integer;
  procedure SetTableInterface(const Value: IHTMLTABLE);
  function GetCell(ACol, ARow: integer): string;
  function GetRowColCount(RowIndex: integer): integer;
  function GetInnerHtml: string;
  function GetInnerText: string;
  function GetCellElement(ACol, ARow: Integer): IHTMLTableCell;
 public
  // ATable: the table to wrap.
  Constructor Create(ATable: IHTMLTABLE);
  property TableInterface: IHTMLTABLE read FTableInterface write SetTableInterface;
  property RowCount: integer read GetRowCount;
  // Cell and CellElement are addressed column first, then row.
  // NOTE(review): whether Cell yields the cell's text or its HTML is not
  // visible in this listing -- confirm.
  property Cell[ACol: integer;ARow: integer]: string read GetCell;
  property CellElement[ACol: Integer;ARow: Integer]: IHTMLTableCell read GetCellElement;
  // Per-row column count, indexed by row.
  property RowColCount[RowIndex: integer]: integer read GetRowColCount;
  property InnerHtml: string read GetInnerHtml;
  property InnerText: string read GetInnerText;
 end;

 // Combobox-style wrapper around an IHTMLSelectElement.
 // The wrapped interface can be replaced at any time through CombInterface and
 // may be nil (the parser constructs this class with nil).
 TDxWebCombobox = class
 private
  FHtmlSelect: IHTMLSelectElement;               // wrapped select element
  function GetCount: Integer;
  procedure SetItemIndex(const Value: Integer);
  function GetItemIndex: Integer;
  function GetName: string;
  procedure SetName(const Value: string);
  function GetValue: string;
  procedure SetValue(const Value: string);
  procedure SetCombInterface(const Value: IHTMLSelectElement);
  function GetItemByName(EleName: string): string;
  function GetItemByIndex(index: integer): string;
  function GetItemAttribute(index: Integer; AttribName: string): OleVariant;
 public
  // AWebCombo: the select element to wrap.
  constructor Create(AWebCombo: IHTMLSelectElement);
  // Item management; Ele is the element to add or insert.
  // NOTE(review): presumably Ele must be an option element -- confirm.
  procedure Add(Ele: IHTMLElement);
  procedure Insert(Ele: IHTMLElement;Index: Integer);
  procedure Remove(index: Integer);

  property CombInterface: IHTMLSelectElement read FHtmlSelect write SetCombInterface;
  property Count: Integer read GetCount;
  property ItemIndex: Integer read GetItemIndex write SetItemIndex;
  // NOTE(review): whether these return an item's text or its value is not
  // visible in this listing -- confirm.
  property ItemByIndex[index: integer]: string read GetItemByIndex;
  property ItemByName[EleName: string]: string read GetItemByName;
  // Attribute AttribName of the item at index, as an OleVariant.
  property ItemAttribute[index: Integer;AttribName: string]: OleVariant read GetItemAttribute;
  property Name: string read GetName write SetName;
  property value: string read GetValue write SetValue;
 end;

implementation
end.



HTMLParser解析类的代码实现单元

代码
(******************************************************)
(*        得闲工作室             *)
(*       HTML解析单元库            *)
(*                          *)
(*       DxHtmlParser Unit           *)
(*  Copyright(c) 2008-2010 不得闲         *)
(*  email:appleak46@yahoo.com.cn   QQ:75492895  *)
(******************************************************)
unit DxHtmlParser;

interface
uses Windows,MSHTML,ActiveX,ComObj,DxHtmlElement,Forms;

type
 // HTML parser that needs no visible browser control: it creates a standalone
 // MSHTML document, loads markup into it through the HTML property and exposes
 // the resulting elements, tables and select elements.
 TDxHtmlParser = class
 private
  FHtmlDoc: IHTMLDocument2;                      // MSHTML document created in the constructor
  FHTML: string;                                 // markup most recently assigned through HTML
  FWebTables: TDxTableCollection;                // created in the constructor, freed in the destructor
  FWebElements: TDxWebElementCollection;         // created with a nil collection; filled by SetHTML
  FWebComb: TDxWebCombobox;                      // single wrapper shared by every WebCombobox[] access
  procedure SetHTML(const Value: string);
  function GetWebCombobox(AName: string): TDxWebCombobox;
 public
  constructor Create;
  destructor Destroy;override;
  // Assigning a value different from the current one loads it into the
  // document body and refreshes WebElements.
  property HTML: string read FHTML write SetHTML;
  property WebTables: TDxTableCollection read FWebTables;
  property WebElements: TDxWebElementCollection read FWebElements;
  // Returns the shared combobox wrapper pointed at the element called Name, or
  // nil while no HTML has been loaded. The returned object is owned by the
  // parser and is re-pointed by the next access, so do not free or cache it.
  property WebCombobox[Name: string]: TDxWebCombobox read GetWebCombobox;
 end;
implementation

{ TDxHtmlParser }

{ Creates the parser.
  Initialises COM for the calling thread, creates a standalone MSHTML document,
  switches it to design mode, pumps messages until the document reports
  'complete', and then creates the helper objects behind WebTables,
  WebElements and WebCombobox.
  Raises EOleSysError when the HTML document object cannot be created. }
constructor TDxHtmlParser.Create;
begin
 inherited Create;
 // The destructor balances this call unconditionally.
 // NOTE(review): if CoInitialize fails (e.g. RPC_E_CHANGED_MODE) that
 // CoUninitialize is unbalanced; tracking the result needs a new field.
 CoInitialize(nil);
 // Create the IHTMLDocument2 interface. OleCheck converts a failed HRESULT
 // into an exception, so a failure is reported even when assertions are
 // compiled out instead of surfacing later as a nil-interface access violation.
 OleCheck(CoCreateInstance(CLASS_HTMLDocument, nil, CLSCTX_INPROC_SERVER, IID_IHTMLDocument2, FHtmlDoc));
 Assert(FHtmlDoc<>nil,'构建HTMLDocument接口失败');
 FHtmlDoc.Set_designMode('On'); // design mode, so that scripts are not executed
 // The document becomes ready asynchronously; keep the message loop running
 // until it does.
 while not (FHtmlDoc.readyState = 'complete') do
 begin
  sleep(1);
  application.ProcessMessages;
 end;
 FWebTables := TDxTableCollection.Create(FHtmlDoc);
 // Both start empty; SetHTML and GetWebCombobox fill them in later.
 FWebElements := TDxWebElementCollection.Create(nil);
 FWebComb := TDxWebCombobox.Create(nil);
end;

{ Frees the helper objects, releases the MSHTML document and then
  uninitialises COM for the thread. }
destructor TDxHtmlParser.Destroy;
begin
 FWebTables.Free;
 FWebElements.Free;
 FWebComb.Free;
 // Release the document explicitly. Interface fields are otherwise released
 // only after the destructor body has finished, which would be after
 // CoUninitialize has already shut COM down for this thread.
 FHtmlDoc := nil;
 CoUninitialize;
 inherited;
end;

{ Getter for WebCombobox[].
  Returns nil while no element collection has been loaded. Otherwise points
  the parser's shared combobox wrapper at the element named AName (cast to
  IHTMLSelectElement) and returns that wrapper. }
function TDxHtmlParser.GetWebCombobox(AName: string): TDxWebCombobox;
begin
  Result := nil;
  // Nothing has been parsed yet, so there is nothing to look up.
  if FWebElements.Collection = nil then
    Exit;
  FWebComb.CombInterface := FWebElements.ElementByName[AName] as IHTMLSelectElement;
  Result := FWebComb;
end;

{ Setter for HTML.
  Does nothing when Value equals the markup already loaded. Otherwise writes
  Value into the document body and points WebElements at the document's
  "all" collection. }
procedure TDxHtmlParser.SetHTML(const Value: string);
begin
 if FHTML = Value then
  Exit;
 FHtmlDoc.body.innerHTML := Value;
 FWebElements.Collection := FHtmlDoc.all;
 // Remember the markup only after the document has accepted it. If one of the
 // assignments above raises, FHTML still describes what is really loaded and
 // a retry with the same value is not skipped by the equality check.
 FHTML := Value;
end;

end.

Tags:Delphi 通过 MSHTML

编辑录入:爽爽 [复制链接] [打 印]
赞助商链接