分享
 
 
 

用PHP抓取网页

王朝php·作者佚名  2006-12-16
窄屏简体版  字體: |||超大  

抓取网页,并将文字和图片存入数据库中;利用 getimg.php?id= 读取数据库中的图片,

利用 getarticle.php?id= 读取文档。

<?

/**建表文档 articletype对应的类型1:oracle,2:java,3:system

CREATE TABLE article (

id int(6) NOT NULL auto_increment,

title varchar(80) default NULL,

content text,

url varchar(80) default NULL,

joindate varchar(12) default NULL,

articletype int(2) not null,

PRIMARY KEY (id)

) ;

CREATE TABLE images (

id int(4) NOT NULL auto_increment,

bin_data longblob,

filetype varchar(50) default NULL,

title varchar(50) default NULL,

articleid int(6) NOT NULL,

PRIMARY KEY (id)

) ENGINE=MyISAM;

*/

/**
 * Fetches a web page, stores its HTML in the `article` table and stores the
 * images it references in the `images` table.
 *
 * <img> sources are rewritten to "$getimg<image id>" and <a> hrefs are made
 * absolute against the page URL before the HTML is saved.
 *
 * Requires PHP 5.6+ with the mysqli extension. Database access uses prepared
 * statements, so no value is ever interpolated into SQL text.
 */
class SaveWeb
{
    /** Article title; also used as the key that links freshly saved images to the article. */
    public $title;

    /** URL of the page being saved; the base for resolving relative links. */
    public $url;

    /** Value written to article.articletype. */
    public $typeid;

    /** HTML supplied through setContent() instead of being downloaded. */
    public $content;

    /** When true, webSave() downloads $url; setContent() switches this off. */
    public $getUrl = true;

    /** Prefix of the rewritten image URLs; the image id is appended to it. */
    public $getimg = "getimg.php?id=";

    // NOTE(review): credentials are hard-coded defaults kept only for backward
    // compatibility; override these properties from configuration before use.
    public $dbuser = "root";

    public $dbpassword = "whf76128";

    public $dbname = "tech";

    public $dbhost = "127.0.0.1";

    /**
     * @param string $title  Article title.
     * @param string $url    Source URL of the page.
     * @param int    $typeid Article type id.
     */
    public function __construct($title, $url, $typeid)
    {
        $this->title = $title;
        $this->url = $url;
        $this->typeid = $typeid;
    }

    /**
     * Legacy PHP 4 style initialiser, kept so existing explicit calls such as
     * $obj->SaveWeb(...) keep working. PHP 8 no longer treats a method named
     * after the class as the constructor, hence __construct() above.
     */
    public function SaveWeb($title, $url, $typeid)
    {
        $this->__construct($title, $url, $typeid);
    }

    /**
     * Supplies the HTML directly, so webSave() will not download $url.
     *
     * @param string $html
     */
    public function setContent($html)
    {
        $this->content = $html;
        $this->getUrl = false;
    }

    /**
     * Stores $content as a plain-text article, converting newlines to <br>.
     *
     * @return int|string|false New article id, or false when the insert failed.
     */
    public function saveContent()
    {
        return $this->insertArticle(nl2br((string) $this->content));
    }

    /**
     * Downloads (or takes the supplied) page, rewrites its links, stores it and
     * attaches the images saved along the way to the new article.
     *
     * @return int|string|false New article id; false when title/url are empty,
     *                          the page could not be read or the insert failed.
     */
    public function webSave()
    {
        if ($this->title == "" || $this->url == "") {
            return false;
        }

        $text = ($this->getUrl == true) ? $this->getHtml($this->url) : $this->content;
        if ($text === false) {
            // Download failed: do not store an empty article.
            return false;
        }

        $id = $this->saveHtml($this->parserHtml($text));
        if ($id == false) {
            // The article was not stored; drop the images saved for it.
            $this->delImg();
            return false;
        }

        $this->updateImgPID($id, $this->title);
        // Remove images that were never attached to an article.
        $this->delImg();

        return $id;
    }

    /**
     * Case-insensitive search for $strchild in $strobj starting at offset $int.
     *
     * @param string $strobj   Haystack.
     * @param string $strchild Needle.
     * @param int    $int      Start offset; negative values are treated as 0.
     *
     * @return int|false Byte offset of the match (may be 0, so compare with
     *                   === false), or false when not found, when the needle
     *                   is empty or when the offset lies past the haystack.
     */
    public function strfind($strobj, $strchild, $int)
    {
        $strobj = (string) $strobj;
        $strchild = (string) $strchild;
        $int = max(0, (int) $int);

        if ($strchild === '' || $int > strlen($strobj)) {
            return false;
        }

        return stripos($strobj, $strchild, $int);
    }

    /**
     * Reads the whole resource at $url (any path or URL fopen() accepts).
     * On failure an HTML error line is echoed, as before.
     *
     * @param string $url
     *
     * @return string|false Raw contents, or false when it could not be read.
     */
    public function getHtml($url)
    {
        $url = (string) $url;
        // fopen('') throws on PHP 8, so treat an empty URL as a failed read.
        $fp = ($url === '') ? false : fopen($url, "rb");

        if ($fp === false) {
            $shown = htmlspecialchars($url, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
            echo "<font color=red>读取失败,文件位置:" . $shown . "</font><br>";
            return false;
        }

        // Avoids the feof() polling loop, which can spin on a broken stream.
        $data = stream_get_contents($fp);
        fclose($fp);

        return $data;
    }

    /**
     * Deletes every image row that is not attached to an article (articleid = 0).
     */
    public function delImg()
    {
        $this->runStatement("delete from images where articleid = 0");
    }

    /**
     * Attaches all images stored under $title to article $id.
     *
     * @param int    $id    Article id.
     * @param string $title Title the images were saved with.
     */
    public function updateImgPID($id, $title)
    {
        $this->runStatement(
            "update images set articleid = ? where title = ?",
            "is",
            array($id, $title)
        );
    }

    /**
     * Stores already-processed HTML as a new article.
     *
     * @param string $data HTML to store.
     *
     * @return int|string|false New article id, or false when the insert failed.
     */
    public function saveHtml($data)
    {
        return $this->insertArticle((string) $data);
    }

    /**
     * Downloads one image and stores it, unattached (articleid = 0), under the
     * current title.
     *
     * @param string $url Absolute image URL.
     *
     * @return int|string|false New image id; false when the download or the
     *                          insert failed (no empty image row is created).
     */
    public function saveImg($url)
    {
        $data = $this->getHtml($url);
        if ($data === false) {
            return false;
        }

        return $this->runStatement(
            "INSERT INTO images (bin_data,filetype,title,articleid) VALUES (?,?,?,0)",
            "sss",
            array($data, $this->getContentType($url), $this->title)
        );
    }

    /**
     * @param string $inFileName Path or URL.
     *
     * @return string Its last path component.
     */
    public function getContentName($inFileName)
    {
        return basename($inFileName);
    }

    /**
     * Maps a file name or URL to a MIME type by its extension.
     *
     * The extension is matched case-insensitively and any query string or
     * fragment is ignored; unknown or missing extensions yield
     * "application/octet-stream".
     *
     * @param string $inFileName Path or URL.
     *
     * @return string MIME type.
     */
    public function getContentType($inFileName)
    {
        $types = array(
            ".gif"  => "image/gif",
            ".gz"   => "application/x-gzip",
            ".htm"  => "text/html",
            ".html" => "text/html",
            ".jpg"  => "image/jpeg",
            ".jpeg" => "image/jpeg",
            ".tar"  => "application/x-tar",
            ".txt"  => "text/plain",
            ".zip"  => "application/zip",
            ".png"  => "image/png",
            ".bmp"  => "image/bmp",
        );

        // Drop "?query" / "#fragment" so "a.gif?v=1" is still recognised.
        $name = basename((string) preg_replace('/[?#].*$/s', '', (string) $inFileName));
        $extension = strrchr($name, ".");
        if ($extension === false) {
            return "application/octet-stream";
        }

        $extension = strtolower($extension);

        return isset($types[$extension]) ? $types[$extension] : "application/octet-stream";
    }

    /**
     * Rewrites the links of an HTML document.
     *
     * - Every <img src> that is not already a "$getimg" link is downloaded via
     *   saveImg() and its src replaced by "$getimg<id>". The same image URL is
     *   downloaded only once per call. Images that cannot be resolved to an
     *   absolute "scheme://" URL, or whose download fails, are left untouched.
     * - Every relative <a href> is made absolute against $url.
     *
     * Only the attribute value inside the tag being processed is replaced, so
     * identical text elsewhere in the document is never altered.
     *
     * @param string $text HTML.
     *
     * @return string Rewritten HTML.
     */
    public function parserHtml($text)
    {
        $text = (string) $text;
        $base = $this->baseParts();
        $getimgPath = parse_url((string) $this->getimg, PHP_URL_PATH);
        $saved = array(); // absolute image URL => rewritten URL

        // Rewrite <img> tags.
        $int = 0;
        while (($tag = $this->nextTag($text, 'img', $int)) !== false) {
            list($tagStart, $tagText) = $tag;
            $int = $tagStart + 1;

            $attr = $this->findAttribute($tagText, 'src');
            if ($attr === false || $attr[1] === '') {
                continue;
            }
            list($valueOffset, $imgUrl) = $attr;

            // Already points at the image script: nothing to do.
            if (is_string($getimgPath) && $getimgPath !== '' && stripos($imgUrl, $getimgPath) !== false) {
                continue;
            }

            $absUrl = $this->absoluteUrl($imgUrl, $base);
            if (!preg_match('/^[a-z][a-z0-9+.\-]*:\/\//i', $absUrl)) {
                // data: URIs and unresolvable relative paths cannot be fetched.
                continue;
            }

            if (!isset($saved[$absUrl])) {
                $id = $this->saveImg($absUrl);
                if ($id == false) {
                    continue;
                }
                $saved[$absUrl] = $this->getimg . $id;
            }

            $text = substr_replace($text, $saved[$absUrl], $tagStart + $valueOffset, strlen($imgUrl));
        }

        // Rewrite <a> tags.
        $int = 0;
        while (($tag = $this->nextTag($text, 'a', $int)) !== false) {
            list($tagStart, $tagText) = $tag;
            $int = $tagStart + 1;

            $attr = $this->findAttribute($tagText, 'href');
            if ($attr === false) {
                continue;
            }
            list($valueOffset, $href) = $attr;

            $absUrl = $this->absoluteUrl($href, $base);
            if ($absUrl !== $href) {
                $text = substr_replace($text, $absUrl, $tagStart + $valueOffset, strlen($href));
            }
        }

        return $text;
    }

    /**
     * Inserts one row into `article` for the current title/url/typeid.
     *
     * @param string $content Article body.
     *
     * @return int|string|false New article id, or false on failure.
     */
    private function insertArticle($content)
    {
        return $this->runStatement(
            "INSERT INTO article (title,content,url,joindate,articletype) VALUES (?,?,?,?,?)",
            "ssssi",
            array($content === null ? '' : $content, $this->title, $this->url, gmdate("Y-m-d"), $this->typeid),
            array(1, 0, 2, 3, 4)
        );
    }

    /**
     * Opens a connection, runs one prepared statement and closes everything.
     *
     * @param string $sql    SQL with "?" placeholders.
     * @param string $types  mysqli type string, one letter per parameter.
     * @param array  $params Parameter values (list).
     * @param array  $order  Optional list of indexes into $params giving the
     *                       placeholder order; defaults to $params as given.
     *
     * @return int|string|false mysqli_insert_id() of the connection on success
     *                          (0 for statements that generate no id), false
     *                          on any connection or statement failure.
     */
    private function runStatement($sql, $types = '', array $params = array(), array $order = array())
    {
        if (!function_exists('mysqli_connect')) {
            return false;
        }

        if ($order) {
            $ordered = array();
            foreach ($order as $index) {
                $ordered[] = $params[$index];
            }
            $params = $ordered;
        }
        $params = array_values($params);

        $db = null;
        $stmt = null;
        try {
            $db = mysqli_connect($this->dbhost, $this->dbuser, $this->dbpassword, $this->dbname);
            if (!$db) {
                return false;
            }

            $stmt = mysqli_prepare($db, $sql);
            if (!$stmt) {
                return false;
            }

            if ($types !== '') {
                mysqli_stmt_bind_param($stmt, $types, ...$params);
            }

            if (!mysqli_stmt_execute($stmt)) {
                return false;
            }

            return mysqli_insert_id($db);
        } catch (Exception $e) {
            // mysqli throws by default on PHP 8.1+; keep the "false on failure" contract.
            return false;
        } finally {
            if ($stmt) {
                mysqli_stmt_close($stmt);
            }
            if ($db) {
                mysqli_close($db);
            }
        }
    }

    /**
     * Splits $url into the pieces needed to resolve relative links.
     *
     * @return array 'scheme' (defaults to "http"), 'root' ("scheme://host[:port]",
     *               or "" when $url has no host) and 'dir' (root plus the
     *               directory of the path, always ending in "/").
     */
    private function baseParts()
    {
        $parts = parse_url((string) $this->url);
        if (!is_array($parts)) {
            $parts = array();
        }

        $scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';
        $root = '';
        if (isset($parts['host']) && $parts['host'] !== '') {
            $root = $scheme . '://' . $parts['host'];
            if (isset($parts['port'])) {
                $root .= ':' . $parts['port'];
            }
        }

        $path = isset($parts['path']) ? str_replace("\\", "/", $parts['path']) : '/';
        $slash = strrpos($path, '/');
        $dir = ($slash === false) ? '/' : substr($path, 0, $slash + 1);

        return array('scheme' => $scheme, 'root' => $root, 'dir' => $root . $dir);
    }

    /**
     * Resolves $url against the base produced by baseParts().
     *
     * URLs that already carry a scheme, fragment-only links, empty strings and
     * anything that cannot be resolved (base without host) are returned as-is.
     *
     * @param string $url
     * @param array  $base
     *
     * @return string
     */
    private function absoluteUrl($url, array $base)
    {
        if ($url === '' || $url[0] === '#' || $base['root'] === '') {
            return $url;
        }
        if (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $url)) {
            return $url;
        }
        if (substr($url, 0, 2) === '//') {
            return $base['scheme'] . ':' . $url;
        }
        if ($url[0] === '/') {
            return $base['root'] . $url;
        }

        return $base['dir'] . $url;
    }

    /**
     * Finds the next "<$name ...>" tag at or after offset $from.
     *
     * The tag name must be followed by whitespace, "/" or ">", so searching
     * for "a" does not match "<abbr>" or "<area>".
     *
     * @param string $text
     * @param string $name Tag name without "<".
     * @param int    $from Start offset.
     *
     * @return array|false array(start offset, tag text including "<" and ">"),
     *                     or false when there is no further complete tag.
     */
    private function nextTag($text, $name, $from)
    {
        $needle = '<' . $name;
        $needleLength = strlen($needle);
        $pos = $from;

        while (($pos = $this->strfind($text, $needle, $pos)) !== false) {
            $next = substr($text, $pos + $needleLength, 1);
            if ($next !== false && $next !== '' && strpos(" \t\r\n\f/>", $next) !== false) {
                $close = strpos($text, '>', $pos);
                if ($close === false) {
                    return false;
                }
                return array($pos, substr($text, $pos, $close - $pos + 1));
            }
            $pos++;
        }

        return false;
    }

    /**
     * Extracts an attribute value (double-quoted, single-quoted or bare) from
     * the text of a single tag.
     *
     * @param string $tagText Tag text as returned by nextTag().
     * @param string $attr    Attribute name, matched case-insensitively.
     *
     * @return array|false array(offset of the value inside $tagText, value),
     *                     or false when the attribute is absent or malformed.
     */
    private function findAttribute($tagText, $attr)
    {
        $pattern = '/(?<![\w\-])' . preg_quote($attr, '/')
            . '\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|([^\s"\'>][^\s>]*))/i';

        if (!preg_match($pattern, $tagText, $m, PREG_OFFSET_CAPTURE)) {
            return false;
        }

        // Exactly one of the three alternatives took part; the others report offset -1.
        for ($group = 1; $group <= 3; $group++) {
            if (isset($m[$group]) && $m[$group][1] >= 0) {
                return array($m[$group][1], $m[$group][0]);
            }
        }

        return false;
    }
}

?>

 
 
 
免责声明:本文为网络用户发布,其观点仅代表作者个人观点,与本站无关,本站仅提供信息存储服务。文中陈述内容未经本站证实,其真实性、完整性、及时性本站不作任何保证或承诺,请读者仅作参考,并请自行核实相关内容。
2023年上半年GDP全球前十五强
 百态   2023-10-24
美众议院议长启动对拜登的弹劾调查
 百态   2023-09-13
上海、济南、武汉等多地出现不明坠落物
 探索   2023-09-06
印度或要将国名改为“巴拉特”
 百态   2023-09-06
男子为女友送行,买票不登机被捕
 百态   2023-08-20
手机地震预警功能怎么开?
 干货   2023-08-06
女子4年卖2套房花700多万做美容:不但没变美脸,面部还出现变形
 百态   2023-08-04
住户一楼被水淹 还冲来8头猪
 百态   2023-07-31
女子体内爬出大量瓜子状活虫
 百态   2023-07-25
地球连续35年收到神秘规律性信号,网友:不要回答!
 探索   2023-07-21
全球镓价格本周大涨27%
 探索   2023-07-09
钱都流向了那些不缺钱的人,苦都留给了能吃苦的人
 探索   2023-07-02
倩女手游刀客魅者强控制(强混乱强眩晕强睡眠)和对应控制抗性的关系
 百态   2020-08-20
美国5月9日最新疫情:美国确诊人数突破131万
 百态   2020-05-09
荷兰政府宣布将集体辞职
 干货   2020-04-30
倩女幽魂手游师徒任务情义春秋猜成语答案逍遥观:鹏程万里
 干货   2019-11-12
倩女幽魂手游师徒任务情义春秋猜成语答案神机营:射石饮羽
 干货   2019-11-12
倩女幽魂手游师徒任务情义春秋猜成语答案昆仑山:拔刀相助
 干货   2019-11-12
倩女幽魂手游师徒任务情义春秋猜成语答案天工阁:鬼斧神工
 干货   2019-11-12
倩女幽魂手游师徒任务情义春秋猜成语答案丝路古道:单枪匹马
 干货   2019-11-12
倩女幽魂手游师徒任务情义春秋猜成语答案镇郊荒野:与虎谋皮
 干货   2019-11-12
倩女幽魂手游师徒任务情义春秋猜成语答案镇郊荒野:李代桃僵
 干货   2019-11-12
倩女幽魂手游师徒任务情义春秋猜成语答案镇郊荒野:指鹿为马
 干货   2019-11-12
倩女幽魂手游师徒任务情义春秋猜成语答案金陵:小鸟依人
 干货   2019-11-12
倩女幽魂手游师徒任务情义春秋猜成语答案金陵:千金买邻
 干货   2019-11-12
 
推荐阅读
 
 
 
>>返回首頁<<
 
靜靜地坐在廢墟上,四周的荒凉一望無際,忽然覺得,淒涼也很美
© 2005- 王朝網路 版權所有