Java 根据关键字抓取 Google 新闻网络数据
2009-11-19 21:02:25 来源:WEB开发网 核心提示:用户要求统计所提供关键字在网络上出现的新闻,下面给出一个带测试用 main 方法的示例(Java 根据关键字抓取 Google 新闻网络数据),完整代码见下文。
用户要求统计所提供关键字在网络上出现的新闻。下面是一个示例类,其中包含一个用于测试的 main 方法。
package com.net;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Queries a news search page for a set of keywords and extracts every
 * {@code <h2 class="title">...</h2>} element from the returned HTML.
 *
 * <p>NetTools.java, created on Nov 18, 2009 4:55:57 PM.
 * Copyright (c) 2009 by ThinkIT.
 *
 * <p>Instances hold no state: every call to {@link #getNews(String, String[])}
 * builds its own request and returns a fresh result list.
 *
 * @author Jack He, jackhexl@Gmail.com
 * @version 1.0
 */
public class NetTools {
    /** Search endpoint used when the caller supplies no URL; the encoded query is appended to it. */
    private static final String DEFAULT_SEARCH_URL =
            "http://news.google.cn/news/search?cf=all&scoring=n&pz=1&cf=all&ned=ccn&hl=zh-CN&q=";

    /** Character encoding used for the query string and for decoding the response body. */
    private static final String ENCODING = "UTF-8";

    /**
     * Matches one headline element. The reluctant {@code .*?} stops at the first closing tag,
     * and DOTALL lets a headline span several lines.
     */
    private static final Pattern TITLE_PATTERN = Pattern.compile(
            "<h2 class=\"title\">.*?</h2>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /**
     * Demo entry point: searches for a sample keyword and prints each matched headline element.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        NetTools nt = new NetTools();
        try {
            List<?> list = nt.getNews("", new String[]{"环保局"});
            for (int i = 0; i < list.size(); i++) {
                System.out.println(list.get(i).toString());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Fetches the search page for the given keywords and returns the headline elements found in it.
     *
     * @param url      base request URL ending where the query value belongs; when {@code null}
     *                 or empty the default news search endpoint is used
     * @param keywords search keywords, joined with single spaces; {@code null} or empty entries
     *                 are skipped
     * @return a new list of {@code String}s, one per matched {@code <h2 class="title">} element
     *         (tags included), in document order; empty when nothing matched
     * @throws IOException if the connection cannot be opened or the response cannot be read
     */
    public List getNews(String url, String[] keywords) throws IOException {
        // Test for null before calling equals, otherwise a null URL throws NullPointerException.
        String base = (url == null || url.equals("")) ? DEFAULT_SEARCH_URL : url;
        String requestAddress = base + java.net.URLEncoder.encode(buildQuery(keywords), ENCODING);
        System.out.println("请求地址为:" + requestAddress);

        HttpURLConnection connection = (HttpURLConnection) new URL(requestAddress).openConnection();
        connection.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
        try {
            connection.connect();
            InputStream is = connection.getInputStream();
            try {
                return extractTitles(readFully(is, ENCODING));
            } finally {
                is.close();
            }
        } finally {
            connection.disconnect();
        }
    }

    /**
     * Joins the keywords with single spaces, skipping {@code null} and empty entries.
     * Package-private so it can be tested without network access.
     *
     * @param keywords keywords to join; may be {@code null}
     * @return the joined query, or an empty string when there is nothing to join
     */
    static String buildQuery(String[] keywords) {
        StringBuilder query = new StringBuilder();
        if (keywords == null) {
            return "";
        }
        for (int i = 0; i < keywords.length; i++) {
            String word = keywords[i];
            if (word == null || word.length() == 0) {
                continue;
            }
            if (query.length() > 0) {
                query.append(' ');
            }
            query.append(word);
        }
        return query.toString();
    }

    /**
     * Reads the stream to its end and decodes all bytes in one step.
     * Decoding once, rather than per chunk, keeps multi-byte characters that straddle a
     * chunk boundary intact. Package-private so it can be tested without network access.
     *
     * @param in          stream to drain; not closed by this method
     * @param charsetName name of the charset used to decode the bytes
     * @return the decoded content
     * @throws IOException if reading fails or the charset name is not supported
     */
    static String readFully(InputStream in, String charsetName) throws IOException {
        java.io.ByteArrayOutputStream collected = new java.io.ByteArrayOutputStream();
        byte[] chunk = new byte[8192];
        int count;
        // read() reports how many bytes it actually delivered; only those are copied.
        while ((count = in.read(chunk)) != -1) {
            collected.write(chunk, 0, count);
        }
        return collected.toString(charsetName);
    }

    /**
     * Collects every {@code <h2 class="title">...</h2>} element found in the given HTML.
     * Package-private so it can be tested without network access.
     *
     * @param content HTML text to scan; {@code null} is treated as empty
     * @return a new list with the matched elements (tags included), in document order
     */
    static List<String> extractTitles(String content) {
        List<String> titles = new ArrayList<String>();
        if (content == null) {
            return titles;
        }
        Matcher matcher = TITLE_PATTERN.matcher(content);
        while (matcher.find()) {
            titles.add(matcher.group());
        }
        return titles;
    }
}
package com.net;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Queries a news search page for a set of keywords and extracts every
 * {@code <h2 class="title">...</h2>} element from the returned HTML.
 *
 * <p>NetTools.java, created on Nov 18, 2009 4:55:57 PM.
 * Copyright (c) 2009 by ThinkIT.
 *
 * <p>Instances hold no state: every call to {@link #getNews(String, String[])}
 * builds its own request and returns a fresh result list.
 *
 * @author Jack He, jackhexl@Gmail.com
 * @version 1.0
 */
public class NetTools {
    /** Search endpoint used when the caller supplies no URL; the encoded query is appended to it. */
    private static final String DEFAULT_SEARCH_URL =
            "http://news.google.cn/news/search?cf=all&scoring=n&pz=1&cf=all&ned=ccn&hl=zh-CN&q=";

    /** Character encoding used for the query string and for decoding the response body. */
    private static final String ENCODING = "UTF-8";

    /**
     * Matches one headline element. The reluctant {@code .*?} stops at the first closing tag,
     * and DOTALL lets a headline span several lines.
     */
    private static final Pattern TITLE_PATTERN = Pattern.compile(
            "<h2 class=\"title\">.*?</h2>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /**
     * Demo entry point: searches for a sample keyword and prints each matched headline element.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        NetTools nt = new NetTools();
        try {
            List<?> list = nt.getNews("", new String[]{"环保局"});
            for (int i = 0; i < list.size(); i++) {
                System.out.println(list.get(i).toString());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Fetches the search page for the given keywords and returns the headline elements found in it.
     *
     * @param url      base request URL ending where the query value belongs; when {@code null}
     *                 or empty the default news search endpoint is used
     * @param keywords search keywords, joined with single spaces; {@code null} or empty entries
     *                 are skipped
     * @return a new list of {@code String}s, one per matched {@code <h2 class="title">} element
     *         (tags included), in document order; empty when nothing matched
     * @throws IOException if the connection cannot be opened or the response cannot be read
     */
    public List getNews(String url, String[] keywords) throws IOException {
        // Test for null before calling equals, otherwise a null URL throws NullPointerException.
        String base = (url == null || url.equals("")) ? DEFAULT_SEARCH_URL : url;
        String requestAddress = base + java.net.URLEncoder.encode(buildQuery(keywords), ENCODING);
        System.out.println("请求地址为:" + requestAddress);

        HttpURLConnection connection = (HttpURLConnection) new URL(requestAddress).openConnection();
        connection.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
        try {
            connection.connect();
            InputStream is = connection.getInputStream();
            try {
                return extractTitles(readFully(is, ENCODING));
            } finally {
                is.close();
            }
        } finally {
            connection.disconnect();
        }
    }

    /**
     * Joins the keywords with single spaces, skipping {@code null} and empty entries.
     * Package-private so it can be tested without network access.
     *
     * @param keywords keywords to join; may be {@code null}
     * @return the joined query, or an empty string when there is nothing to join
     */
    static String buildQuery(String[] keywords) {
        StringBuilder query = new StringBuilder();
        if (keywords == null) {
            return "";
        }
        for (int i = 0; i < keywords.length; i++) {
            String word = keywords[i];
            if (word == null || word.length() == 0) {
                continue;
            }
            if (query.length() > 0) {
                query.append(' ');
            }
            query.append(word);
        }
        return query.toString();
    }

    /**
     * Reads the stream to its end and decodes all bytes in one step.
     * Decoding once, rather than per chunk, keeps multi-byte characters that straddle a
     * chunk boundary intact. Package-private so it can be tested without network access.
     *
     * @param in          stream to drain; not closed by this method
     * @param charsetName name of the charset used to decode the bytes
     * @return the decoded content
     * @throws IOException if reading fails or the charset name is not supported
     */
    static String readFully(InputStream in, String charsetName) throws IOException {
        java.io.ByteArrayOutputStream collected = new java.io.ByteArrayOutputStream();
        byte[] chunk = new byte[8192];
        int count;
        // read() reports how many bytes it actually delivered; only those are copied.
        while ((count = in.read(chunk)) != -1) {
            collected.write(chunk, 0, count);
        }
        return collected.toString(charsetName);
    }

    /**
     * Collects every {@code <h2 class="title">...</h2>} element found in the given HTML.
     * Package-private so it can be tested without network access.
     *
     * @param content HTML text to scan; {@code null} is treated as empty
     * @return a new list with the matched elements (tags included), in document order
     */
    static List<String> extractTitles(String content) {
        List<String> titles = new ArrayList<String>();
        if (content == null) {
            return titles;
        }
        Matcher matcher = TITLE_PATTERN.matcher(content);
        while (matcher.find()) {
            titles.add(matcher.group());
        }
        return titles;
    }
}
更多精彩
赞助商链接