Java 根据关键字抓取 Google 新闻网络数据

王朝学院·作者佚名  2009-11-20
窄屏简体版  字體: |||超大  

用户要求统计所提供的关键字在网络上出现的新闻。下面是实现该功能的 NetTools 类,其中包含一个用于测试的 main 方法。

package com.net;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Fetches a news search result page for a set of keywords and extracts the
 * {@code <h2 class="title">} headline elements from the returned HTML.
 *
 * <p>Instances hold no state, so every {@link #getNews(String, String[])} call is
 * independent of earlier calls.
 *
 * <p>NetTools.java, created on Nov 18, 2009. Copyright (c) 2009 by ThinkIT.
 *
 * @author Jack He, jackhexl@gmail.com
 * @version 1.0
 */
public class NetTools {

    /** Search endpoint used when the caller passes a null or empty URL. */
    private static final String DEFAULT_SEARCH_URL =
            "http://news.google.cn/news/search?cf=all&scoring=n&pz=1&cf=all&ned=ccn&hl=zh-CN&q=";

    /** Charset used both to encode the query and to decode the response body. */
    private static final String CHARSET = "UTF-8";

    /**
     * Matches one complete headline element. {@code .*?} is reluctant so that each
     * match stops at the nearest closing tag, and DOTALL lets a headline span lines.
     */
    private static final Pattern TITLE_PATTERN = Pattern.compile(
            "<h2 class=\"title\">.*?</h2>",
            Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /**
     * Manual smoke test: searches the default endpoint for one keyword and prints
     * every headline element found.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        NetTools nt = new NetTools();
        try {
            // The keyword is written with Unicode escapes so that compilation does
            // not depend on the source file encoding.
            List<String> list = nt.getNews("", new String[] {"\u73af\u4fdd\u5c40"});
            for (String news : list) {
                System.out.println(news);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Requests the search page for the given keywords and returns the headline
     * elements it contains.
     *
     * @param url      address the URL-encoded query is appended to; the built-in
     *                 default search address is used when this is null or empty
     * @param keywords search keywords; they are joined with single spaces
     * @return a new list holding each matched {@code <h2 class="title">...</h2>}
     *         element (tags included), in document order; empty when none match
     * @throws IOException if the request cannot be sent or the response cannot be read
     */
    public List<String> getNews(String url, String[] keywords) throws IOException {
        String requestAddress = buildRequestUrl(url, keywords);
        // Same progress message as before (Chinese text followed by the request URL).
        System.out.println("\u8bf7\u6c42\u5730\u5740\u4e3a:" + requestAddress);

        HttpURLConnection connection =
                (HttpURLConnection) new URL(requestAddress).openConnection();
        connection.setRequestProperty(
                "User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
        try {
            connection.connect();
            InputStream is = connection.getInputStream();
            String content;
            try {
                content = readAll(is);
            } finally {
                // Close even when reading fails so the stream is never leaked.
                is.close();
            }
            return extractTitles(content);
        } finally {
            connection.disconnect();
        }
    }

    /**
     * Builds the full request address: the base URL followed by the URL-encoded,
     * space-separated keywords.
     *
     * @param url      base address, or null/empty to use the default search address
     * @param keywords keywords to join; null or empty entries are skipped and a
     *                 null array yields an empty query
     * @return the request address
     * @throws IOException if the UTF-8 encoding is reported as unsupported
     */
    static String buildRequestUrl(String url, String[] keywords) throws IOException {
        // Null must be tested first; calling equals on a null url would throw.
        String base = (url == null || url.equals("")) ? DEFAULT_SEARCH_URL : url;

        StringBuilder query = new StringBuilder();
        if (keywords != null) {
            for (String keyword : keywords) {
                if (keyword == null || keyword.length() == 0) {
                    continue;
                }
                if (query.length() > 0) {
                    query.append(' ');
                }
                query.append(keyword);
            }
        }
        return base + URLEncoder.encode(query.toString(), CHARSET);
    }

    /**
     * Reads the stream to its end and decodes the bytes as UTF-8.
     *
     * <p>All bytes are collected before decoding so that a multi-byte character
     * split across two reads is still decoded correctly.
     *
     * @param in stream to drain; it is not closed by this method
     * @return the decoded text
     * @throws IOException if reading fails
     */
    static String readAll(InputStream in) throws IOException {
        ByteArrayOutputStream collected = new ByteArrayOutputStream();
        byte[] chunk = new byte[8192];
        int read;
        while ((read = in.read(chunk)) != -1) {
            collected.write(chunk, 0, read);
        }
        return collected.toString(CHARSET);
    }

    /**
     * Finds every {@code <h2 class="title">...</h2>} element in the given HTML.
     *
     * @param html page text to scan; null is treated as empty
     * @return a new list of the matched elements (tags included), in order of appearance
     */
    static List<String> extractTitles(String html) {
        List<String> titles = new ArrayList<String>();
        if (html == null) {
            return titles;
        }
        Matcher matcher = TITLE_PATTERN.matcher(html);
        while (matcher.find()) {
            titles.add(matcher.group());
        }
        return titles;
    }
}

 
 
 
免责声明:本文为网络用户发布,其观点仅代表作者个人观点,与本站无关,本站仅提供信息存储服务。文中陈述内容未经本站证实,其真实性、完整性、及时性本站不作任何保证或承诺,请读者仅作参考,并请自行核实相关内容。
 
 
© 2005- 王朝網路 版權所有 導航