用PHP抓取网页

王朝php·作者佚名  2006-12-16
窄屏简体版  字體: |||超大  

抓取网页,并将文字和图片存入数据库中;通过 getimg.php?id= 读取数据库中的图片,通过 getarticle.php?id= 读取文档。

<?

/**建表文档 articletype对应的类型1:oracle,2:java,3:system

CREATE TABLE article (

id int(6) NOT NULL auto_increment,

title varchar(80) default NULL,

content text,

url varchar(80) default NULL,

joindate varchar(12) default NULL,

articletype int(2) not null,

PRIMARY KEY (id)

) ;

CREATE TABLE images (

id int(4) NOT NULL auto_increment,

bin_data longblob,

filetype varchar(50) default NULL,

title varchar(50) default NULL,

articleid int(6) NOT NULL,

PRIMARY KEY (id)

) ENGINE=MyISAM;

*/

/**
 * Fetches a web page, stores its HTML in the `article` table and the images it
 * references in the `images` table, and rewrites the stored markup so that
 * <img> tags point at the local image script and <a> tags carry absolute URLs.
 *
 * Written to run on PHP 5.6 through 8.x; database access uses mysqli prepared
 * statements (the original mysql_* API no longer exists since PHP 7.0).
 */
class SaveWeb
{
    /** Article title; also written to images.title to tag the images of one run. */
    public $title;

    /** URL of the page; used both for fetching and as the base for relative links. */
    public $url;

    /** Value stored in article.articletype. */
    public $typeid;

    /** HTML supplied through setContent(), used instead of fetching $url. */
    public $content;

    /** True: webSave() downloads $url. False: webSave() uses $content. */
    public $getUrl = true;

    /** Prefix of the local image URL; the images.id value is appended to it. */
    public $getimg = "getimg.php?id=";

    // NOTE(review): credentials are hard-coded defaults kept only for backward
    // compatibility; override these properties from configuration before use.
    public $dbuser = "root";

    public $dbpassword = "whf76128";

    public $dbname = "tech";

    public $dbhost = "127.0.0.1";

    /**
     * @param string $title  article title
     * @param string $url    source URL of the page
     * @param int    $typeid article type id
     */
    public function __construct($title, $url, $typeid)
    {
        $this->title = $title;
        $this->url = $url;
        $this->typeid = $typeid;
    }

    /**
     * Legacy PHP 4 style initialiser, kept so existing explicit calls still work.
     * It is declared after __construct() so that __construct() is the constructor
     * on every PHP version (PHP 8 no longer treats this method as one).
     */
    public function SaveWeb($title, $url, $typeid)
    {
        $this->__construct($title, $url, $typeid);
    }

    /**
     * Supplies the HTML directly and switches webSave() off network fetching.
     *
     * @param string $html
     */
    public function setContent($html)
    {
        $this->content = $html;
        $this->getUrl = false;
    }

    /**
     * Stores $content as plain paragraph text (newlines become <br>) without
     * parsing it for images or links.
     *
     * @return int|false new article id, or false when the insert failed
     */
    public function saveContent()
    {
        return $this->insertArticle(nl2br((string) $this->content));
    }

    /**
     * Fetches (or takes) the page, localises its images, absolutises its links
     * and stores the result.
     *
     * @return int|false new article id, or false when title/url are missing,
     *                   the page could not be read, or the insert failed
     */
    public function webSave()
    {
        if ($this->title == "" || $this->url == "") {
            return false;
        }

        $html = $this->getUrl ? $this->getHtml($this->url) : $this->content;
        if ($html === false) {
            // Nothing was downloaded; do not store an empty article.
            return false;
        }

        $id = $this->saveHtml($this->parserHtml($html));
        if ($id) {
            $this->updateImgPID($id, $this->title);
        }
        // Whatever is still unattached (articleid = 0) is an orphan of a failed run.
        $this->delImg();

        return $id;
    }

    /**
     * Case-insensitive search for $strchild in $strobj starting at offset $int.
     *
     * @param string $strobj   haystack
     * @param string $strchild needle
     * @param int    $int      start offset (negative values are treated as 0)
     * @return int|false byte position of the first match (0 is a valid match,
     *                   so compare the result with === false), or false
     */
    public function strfind($strobj, $strchild, $int)
    {
        $haystack = (string) $strobj;
        $needle = (string) $strchild;
        $offset = max(0, (int) $int);

        // stripos() rejects an empty needle or an offset past the end on some
        // PHP versions; both simply mean "not found" here.
        if ($needle === '' || $offset > strlen($haystack)) {
            return false;
        }

        return stripos($haystack, $needle, $offset);
    }

    /**
     * Reads the whole resource at $url.
     *
     * @param string $url anything fopen() accepts
     * @return string|false the contents, or false when opening or reading failed
     */
    public function getHtml($url)
    {
        // "rb": the same method downloads images, so the read must be binary-safe.
        $fp = fopen($url, "rb");
        if ($fp === false) {
            // The URL may come from scraped markup, so escape it before echoing.
            echo "<font color=red>读取失败,文件位置:"
                . htmlspecialchars((string) $url, ENT_QUOTES | ENT_SUBSTITUTE)
                . "</font><br>";
            return false;
        }

        // stream_get_contents() stops on error instead of looping on feof().
        $data = stream_get_contents($fp);
        fclose($fp);

        return $data;
    }

    /** Deletes every image row that is not attached to an article (articleid = 0). */
    public function delImg()
    {
        $this->runStatement("DELETE FROM images WHERE articleid = 0");
    }

    /**
     * Attaches the not-yet-attached images tagged with $title to article $id.
     * Only rows with articleid = 0 are touched, so images of an older article
     * that happens to share the title keep their owner.
     *
     * @param int    $id    article id
     * @param string $title title the images were stored under
     */
    public function updateImgPID($id, $title)
    {
        $this->runStatement(
            "UPDATE images SET articleid = ? WHERE title = ? AND articleid = 0",
            "is",
            [(int) $id, (string) $title]
        );
    }

    /**
     * Stores already-processed HTML as a new article row.
     *
     * @param string $data HTML to store
     * @return int|false new article id, or false when the insert failed
     */
    public function saveHtml($data)
    {
        return $this->insertArticle($data);
    }

    /**
     * Downloads one image and stores it unattached (articleid = 0).
     *
     * @param string $url absolute image URL
     * @return int|false new image id, or false when the download or insert failed
     */
    public function saveImg($url)
    {
        $data = $this->getHtml($url);
        if ($data === false || $data === '') {
            // Do not create an empty image row for a failed download.
            return false;
        }

        return $this->runStatement(
            "INSERT INTO images (bin_data, filetype, title, articleid) VALUES (?, ?, ?, 0)",
            "sss",
            [$data, $this->getContentType($url), (string) $this->title]
        );
    }

    /**
     * @param string $inFileName path or URL
     * @return string the trailing name component
     */
    public function getContentName($inFileName)
    {
        return basename($inFileName);
    }

    /**
     * Guesses a MIME type from the file extension of a path or URL.
     * The query string and fragment are ignored and the extension is matched
     * case-insensitively.
     *
     * @param string $inFileName path or URL
     * @return string MIME type; "application/octet-stream" when unknown
     */
    public function getContentType($inFileName)
    {
        $fallback = "application/octet-stream";
        $types = [
            ".gif"  => "image/gif",
            ".gz"   => "application/x-gzip",
            ".htm"  => "text/html",
            ".html" => "text/html",
            ".jpg"  => "image/jpeg",
            ".jpeg" => "image/jpeg",
            ".tar"  => "application/x-tar",
            ".txt"  => "text/plain",
            ".zip"  => "application/zip",
            ".png"  => "image/png",
            ".bmp"  => "image/bmp",
        ];

        // Drop "?query" / "#fragment" first so "a.jpg?x=b.gif" is seen as a .jpg.
        $name = basename((string) preg_replace('/[?#].*$/s', '', (string) $inFileName));
        $extension = strrchr($name, ".");
        if ($extension === false) {
            return $fallback;
        }

        $extension = strtolower($extension);

        return isset($types[$extension]) ? $types[$extension] : $fallback;
    }

    /**
     * Rewrites the markup for local storage:
     *  - every <img src> that is not already a local image URL is downloaded
     *    via saveImg() and pointed at "$getimg<id>"; when the download fails
     *    the original src is left untouched;
     *  - every relative <a href> is made absolute against $url.
     * Only the attribute value inside the tag is replaced; other text that
     * happens to contain the same string is not modified.
     *
     * @param string $text HTML
     * @return string rewritten HTML
     */
    public function parserHtml($text)
    {
        $text = (string) $text;
        $base = $this->parseBaseUrl();
        $localPath = parse_url((string) $this->getimg, PHP_URL_PATH);
        $stored = []; // absolute image URL => local URL (or false), so duplicates are fetched once

        $text = $this->rewriteAttribute($text, 'img', 'src', function ($value) use ($base, $localPath, &$stored) {
            if ($value === '') {
                return $value;
            }
            // Already points at the local image script: nothing to do.
            if (is_string($localPath) && $localPath !== '' && stripos($value, $localPath) !== false) {
                return $value;
            }
            $absolute = $this->resolveUrl($value, $base);
            // Only fetch over HTTP(S); never hand file:, php:, data: etc. to fopen().
            if (!preg_match('#^https?://#i', $absolute)) {
                return $value;
            }
            if (!isset($stored[$absolute])) {
                $id = $this->saveImg($absolute);
                $stored[$absolute] = $id ? $this->getimg . $id : false;
            }

            return $stored[$absolute] === false ? $value : $stored[$absolute];
        });

        $text = $this->rewriteAttribute($text, 'a', 'href', function ($value) use ($base) {
            return $this->resolveUrl($value, $base);
        });

        return $text;
    }

    /**
     * Inserts one article row built from the object's title/url/typeid and the
     * current UTC date.
     *
     * @param string $content value for article.content
     * @return int|false new article id, or false on failure
     */
    private function insertArticle($content)
    {
        return $this->runStatement(
            "INSERT INTO article (title, content, url, joindate, articletype) VALUES (?, ?, ?, ?, ?)",
            "ssssi",
            [
                (string) $this->title,
                (string) $content,
                (string) $this->url,
                gmdate("Y-m-d"),
                (int) $this->typeid,
            ]
        );
    }

    /**
     * Opens a connection, runs one prepared statement and closes the connection.
     *
     * @param string $sql    statement with "?" placeholders
     * @param string $types  mysqli bind_param type string ("" when no parameters)
     * @param array  $params values for the placeholders, in order
     * @return int|false the connection's insert id after the statement (0 for
     *                   statements that generate none), or false on any failure
     */
    private function runStatement($sql, $types = '', array $params = [])
    {
        try {
            $db = new mysqli($this->dbhost, $this->dbuser, $this->dbpassword, $this->dbname);
            if ($db->connect_errno) {
                throw new RuntimeException($db->connect_error);
            }

            $stmt = $db->prepare($sql);
            if ($stmt === false) {
                throw new RuntimeException($db->error);
            }
            if ($types !== '') {
                $stmt->bind_param($types, ...$params);
            }
            if (!$stmt->execute()) {
                throw new RuntimeException($stmt->error);
            }

            $insertId = $db->insert_id;
            $stmt->close();
            $db->close();

            return $insertId;
        } catch (Exception $e) {
            // Covers both the exceptions thrown above and mysqli_sql_exception
            // (mysqli's default error mode since PHP 8.1).
            trigger_error('SaveWeb: database operation failed: ' . $e->getMessage(), E_USER_WARNING);
            return false;
        }
    }

    /**
     * Splits $url into the pieces needed to resolve relative references.
     *
     * @return array|null keys: scheme, origin ("scheme://host[:port]"),
     *                    path (document path), dir (path up to and including
     *                    the last "/"); null when $url has no host
     */
    private function parseBaseUrl()
    {
        $parts = parse_url((string) $this->url);
        if (!is_array($parts) || empty($parts['host'])) {
            return null;
        }

        $scheme = isset($parts['scheme']) ? strtolower($parts['scheme']) : 'http';
        $origin = $scheme . '://' . $parts['host'];
        if (isset($parts['port'])) {
            $origin .= ':' . $parts['port'];
        }

        $path = (isset($parts['path']) && $parts['path'] !== '') ? $parts['path'] : '/';
        $slash = strrpos($path, '/');
        $dir = ($slash === false) ? '/' : substr($path, 0, $slash + 1);

        return ['scheme' => $scheme, 'origin' => $origin, 'path' => $path, 'dir' => $dir];
    }

    /**
     * Resolves a reference found in the page against the base URL.
     * References that already carry a scheme (http:, https:, mailto:, ...),
     * empty references and pure "#fragment" references are returned unchanged.
     * "." and ".." segments are not collapsed.
     *
     * @param string     $url  reference as written in the markup
     * @param array|null $base result of parseBaseUrl()
     * @return string
     */
    private function resolveUrl($url, $base)
    {
        $url = (string) $url;
        if ($base === null || $url === '' || $url[0] === '#') {
            return $url;
        }
        if (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $url)) {
            return $url;
        }
        if (strncmp($url, '//', 2) === 0) {
            return $base['scheme'] . ':' . $url;
        }
        if ($url[0] === '/') {
            return $base['origin'] . $url;
        }
        if ($url[0] === '?') {
            return $base['origin'] . $base['path'] . $url;
        }

        return $base['origin'] . $base['dir'] . $url;
    }

    /**
     * Applies $rewrite to the value of attribute $attr in every <$tag ...> tag.
     * The value may be double-quoted, single-quoted or unquoted; the original
     * quoting is preserved. Tags without the attribute are left untouched.
     *
     * @param string   $html
     * @param string   $tag     tag name, e.g. "img"
     * @param string   $attr    attribute name, e.g. "src"
     * @param callable $rewrite function (string $value): string
     * @return string
     */
    private function rewriteAttribute($html, $tag, $attr, $rewrite)
    {
        // "\s" after the tag name keeps "<a" from matching <abbr>, <area>, ...
        $tagPattern = '/<' . preg_quote($tag, '/') . '\s[^>]*>/i';
        // Leading "\s" keeps "src" from matching inside "data-src".
        $attrPattern = '/(\s' . preg_quote($attr, '/') . '\s*=\s*)(?:(["\'])(.*?)\2|([^\s"\'>]+))/is';

        $result = preg_replace_callback($tagPattern, function ($tagMatch) use ($attrPattern, $rewrite) {
            $tagOut = preg_replace_callback($attrPattern, function ($m) use ($rewrite) {
                $quote = $m[2];
                $value = ($quote !== '') ? $m[3] : $m[4];

                return $m[1] . $quote . $rewrite($value) . $quote;
            }, $tagMatch[0], 1);

            return ($tagOut === null) ? $tagMatch[0] : $tagOut;
        }, $html);

        // preg_replace_callback() yields null on a PCRE error; keep the input then.
        return ($result === null) ? $html : $result;
    }
}

?>

 
 
 
免责声明:本文为网络用户发布,其观点仅代表作者个人观点,与本站无关,本站仅提供信息存储服务。文中陈述内容未经本站证实,其真实性、完整性、及时性本站不作任何保证或承诺,请读者仅作参考,并请自行核实相关内容。
 
 
© 2005- 王朝網路 版權所有 導航