开发学院 WEB开发 Jsp Java 根据关键字抓取google 新闻网络数据阅读

Java 根据关键字抓取 Google 新闻数据

2009-11-19 21:02:25　来源：WEB开发网

核心提示：用户要求统计所提供的关键字在网络新闻中出现的情况。本文给出一个根据关键字抓取 Google 新闻搜索结果的 Java 工具类（com.net.NetTools），并附带一个用于测试的 main 方法。

用户要求统计所提供的关键字在网络新闻中出现的情况。下面是根据关键字抓取新闻的实现，其中包含一个用于测试的 main 方法。

package com.net;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Fetches a news search result page for a set of keywords and extracts the
 * headline elements ({@code <h2 class="title">...</h2>}) from the returned HTML.
 *
 * <p>NetTools.java, created on Nov 18, 2009 4:55:57 PM
 *
 * <p>Copyright (c) 2009 by ThinkIT
 *
 * <p>Instances hold no state: every call to {@link #getNews(String, String[])}
 * builds its own request and returns a fresh result list.
 *
 * @author Jack He ,jackhexl@Gmail.com
 * @version 1.0
 */
public class NetTools {

    /** Search endpoint used when the caller passes a null or empty URL. */
    private static final String DEFAULT_SEARCH_URL =
            "http://news.google.cn/news/search?cf=all&scoring=n&pz=1&cf=all&ned=ccn&hl=zh-CN&q=";

    /** User-Agent header sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)";

    /** Charset used both to URL-encode the query and to decode the response body. */
    private static final String CHARSET = "UTF-8";

    /**
     * Matches one headline element including its tags. The reluctant {@code .*?}
     * stops at the first closing tag, and DOTALL lets a headline span line breaks.
     */
    private static final Pattern TITLE_PATTERN = Pattern.compile(
            "<h2 class=\"title\">.*?</h2>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /**
     * Demo entry point: searches the default endpoint for one keyword and prints
     * every headline element found.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        NetTools nt = new NetTools();
        try {
            List<String> list = nt.getNews("", new String[] {"环保局"});
            for (int i = 0; i < list.size(); i++) {
                System.out.println(list.get(i));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Requests the search page for the given keywords and returns the headline
     * elements found in it.
     *
     * @param url      search URL prefix that the encoded query is appended to;
     *                 {@code null} or {@code ""} selects the built-in default
     * @param keywords search keywords, joined with single spaces to form the query
     * @return a new list holding each matched {@code <h2 class="title">} element
     *         (tags included), in page order; empty if nothing matched
     * @throws IOException if the URL is malformed or the request or read fails
     */
    public List<String> getNews(String url, String[] keywords) throws IOException {
        String requestAddress = buildRequestUrl(url, keywords);
        System.out.println("请求地址为：" + requestAddress);

        HttpURLConnection connection =
                (HttpURLConnection) new URL(requestAddress).openConnection();
        connection.setRequestProperty("User-Agent", USER_AGENT);
        try {
            connection.connect();
            InputStream is = connection.getInputStream();
            String content;
            try {
                content = readAll(is);
            } finally {
                // Close even when reading fails so the socket is not leaked.
                is.close();
            }
            return extractTitles(content);
        } finally {
            connection.disconnect();
        }
    }

    /**
     * Builds the full request address: the URL prefix followed by the URL-encoded,
     * space-joined keywords. Package-private so it can be tested without network access.
     *
     * @param url      URL prefix; {@code null} or {@code ""} selects the default endpoint
     * @param keywords keywords to join; {@code null} (array or element) is skipped
     * @return the request address
     */
    static String buildRequestUrl(String url, String[] keywords) {
        // Null must be tested first; calling equals() on a null url would throw.
        String base = (url == null || url.equals("")) ? DEFAULT_SEARCH_URL : url;

        StringBuilder query = new StringBuilder();
        if (keywords != null) {
            for (int i = 0; i < keywords.length; i++) {
                if (keywords[i] == null) {
                    continue;
                }
                if (query.length() > 0) {
                    query.append(' ');
                }
                query.append(keywords[i]);
            }
        }

        try {
            return base + URLEncoder.encode(query.toString(), CHARSET);
        } catch (UnsupportedEncodingException e) {
            // Every Java platform is required to support UTF-8.
            throw new IllegalStateException(CHARSET + " is not supported by this JVM", e);
        }
    }

    /**
     * Reads the stream to its end and decodes all bytes as UTF-8. The bytes are
     * collected first and decoded once, so a multi-byte character that straddles
     * two reads is not corrupted. The stream is not closed here.
     *
     * @param in stream to drain
     * @return the decoded content
     * @throws IOException if reading fails
     */
    static String readAll(InputStream in) throws IOException {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        byte[] chunk = new byte[8192];
        int n;
        while ((n = in.read(chunk)) != -1) {
            bytes.write(chunk, 0, n);
        }
        return bytes.toString(CHARSET);
    }

    /**
     * Extracts every headline element from an HTML document.
     *
     * @param content HTML text to scan
     * @return a new list of matched elements (tags included), in document order
     */
    static List<String> extractTitles(String content) {
        List<String> titles = new ArrayList<String>();
        Matcher matcher = TITLE_PATTERN.matcher(content);
        while (matcher.find()) {
            titles.add(matcher.group());
        }
        return titles;
    }
}