Google爬取天气预报代码
2009-09-17 00:00:00 来源:WEB开发网 之前用了国内的几个打着中央气象局幌子的 Web Service 天气预报服务,本来用得好好的,过了几天却发现不能调用了,原来是超过了服务器的请求次数限制。Free 到这种程度,让人对国内的免费服务深感失望,心想还是用一直信赖的 Google 吧,所以就有了以下利用 HTTP 请求爬取 Google 天气预报的代码,并将请求过的城市天气预报按天缓存一下:
所有代码如下:
Java代码
/**
 * Parses the document located by {@code htmlUrl} (decoded as GBK) and returns
 * the children of the last {@code <div>} whose {@code class} attribute is
 * exactly {@code "e"}.
 *
 * @param htmlUrl resource locator handed to the HTML parser
 * @return the child nodes of the last matching div, or {@code null} when no
 *         div matches or a {@code ParserException} is raised (the exception
 *         is printed, not rethrown)
 */
public NodeList getWeatherDiv(String htmlUrl) {
    NodeList weatherChildren = null;
    try {
        Parser pageParser = new Parser(htmlUrl);
        pageParser.setEncoding("GBK");

        OrFilter divOnly = new OrFilter();
        divOnly.setPredicates(new NodeFilter[] { new NodeClassFilter(Div.class) });

        Node[] candidates = pageParser.parse(divOnly).toNodeArray();
        for (Node candidate : candidates) {
            if (!(candidate instanceof Div)) {
                continue;
            }
            Div div = (Div) candidate;
            // No early exit: a later match overwrites an earlier one.
            if ("e".equals(div.getAttribute("class"))) {
                weatherChildren = div.getChildren();
            }
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return weatherChildren;
}
/**
 * Starts the cache-purging timer; calls after the first one are no-ops
 * (guarded by the {@code isStart} flag).
 *
 * <p>The task runs immediately and then every 24 hours. On each run it drops
 * every {@code hmCache} entry whose key does NOT contain today's date in
 * {@code yyyyMMdd} form. Cache keys are built as {@code city + yyyyMMdd}, so
 * this discards entries from previous days and keeps the ones that are still
 * valid today.
 *
 * <p>NOTE(review): neither the {@code isStart} check nor the iteration over
 * {@code hmCache} is synchronized here — confirm how the cache is shared
 * between request threads and the timer thread.
 */
public static void cleanCache() {
    if (isStart) {
        return;
    }
    isStart = true;

    TimerTask purgeTask = new TimerTask() {
        @Override
        public void run() {
            // Loop-invariant: format the date once per sweep.
            String today = DateTimeUtil.format(new Date(), "yyyyMMdd");
            Iterator<?> keys = hmCache.keySet().iterator();
            while (keys.hasNext()) {
                Object key = keys.next();
                // Stale entry: its key was built with an earlier day's date.
                if (String.valueOf(key).indexOf(today) < 0) {
                    // Iterator.remove is the only safe removal during iteration.
                    keys.remove();
                }
            }
        }
    };

    Timer timer = new Timer();
    timer.schedule(purgeTask, Calendar.getInstance().getTime(), 24 * 3600 * 1000);
}
/**
 * Extracts one day's forecast from an HTML fragment and stores it in
 * {@code json}.
 *
 * <p>Keys share a prefix: {@code "t"} when {@code flag} is 0, otherwise
 * {@code "t" + flag}. For each image tag found, {@code prefix_res} and
 * {@code prefix_result} receive its {@code title} attribute and
 * {@code prefix_tp} receives {@code "http://www.google.cn"} followed by the
 * image URL. For each text node containing {@code "°C"} at an index greater
 * than 0, the text is stored under the bare prefix. Later nodes overwrite
 * earlier ones.
 *
 * @param json        object that receives the extracted values
 * @param flag        day index used to build the key prefix
 * @param htmlContent HTML fragment to scan (parsed as GBK)
 */
private void addWeatherDay(JSONObject json, int flag, String htmlContent) {
    String keyPrefix;
    if (flag == 0) {
        keyPrefix = "t";
    } else {
        keyPrefix = "t" + flag;
    }

    try {
        Parser fragmentParser = Parser.createParser(htmlContent, "GBK");

        OrFilter textOrImage = new OrFilter();
        textOrImage.setPredicates(new NodeFilter[] {
            new NodeClassFilter(TextNode.class),
            new NodeClassFilter(ImageTag.class)
        });

        for (Node node : fragmentParser.parse(textOrImage).toNodeArray()) {
            if (node instanceof ImageTag) {
                ImageTag image = (ImageTag) node;
                json.put(keyPrefix + "_res", image.getAttribute("title"));
                json.put(keyPrefix + "_result", image.getAttribute("title"));
                json.put(keyPrefix + "_tp", ("http://www.google.cn" + image.getImageURL()));
                continue;
            }
            if (node instanceof TextNode) {
                String content = ((TextNode) node).getText();
                // Strictly greater than 0: text that starts with "°C" is skipped.
                if (content.indexOf("°C") > 0) {
                    json.put(keyPrefix, content);
                }
            }
        }
    } catch (Exception ex) {
        ex.printStackTrace();
    }
}
/**
 * Scans an HTML fragment for per-day forecast divs and feeds each one to
 * {@link #addWeatherDay}.
 *
 * <p>A div qualifies when its {@code class} attribute is absent/empty and its
 * {@code align} attribute equals {@code "center"}. Qualifying divs are
 * numbered 0, 1, 2, … in document order, and that number is passed to
 * {@code addWeatherDay} as the day index.
 *
 * <p>Assumes {@code StrCharUtil.formatNullStr} maps {@code null} to an empty
 * string — TODO confirm against that helper.
 *
 * @param json        object that receives the extracted forecast values
 * @param htmlContent HTML fragment to scan (parsed as GBK)
 */
private void getDivText(JSONObject json, String htmlContent) {
    try {
        Parser parser = Parser.createParser(htmlContent, "GBK");
        Node[] nodes = parser.parse(new NodeClassFilter(Div.class)).toNodeArray();

        int dayIndex = 0;
        for (int i = 0; i < nodes.length; i++) {
            if (!(nodes[i] instanceof Div)) {
                continue;
            }
            Div div = (Div) nodes[i];
            String className = StrCharUtil.formatNullStr(div.getAttribute("class"));
            String align = StrCharUtil.formatNullStr(div.getAttribute("align"));
            if (className.equals("") && align.equals("center")) {
                addWeatherDay(json, dayIndex, div.getChildrenHTML());
                dayIndex++;
            }
        }
    } catch (ParserException pe) {
        pe.printStackTrace();
    }
}
/**
 * Returns the weather forecast for {@code city}, served from the per-day
 * cache when available and otherwise scraped from a Google search page.
 *
 * <p>The result always contains the key {@code "zhishu"} (empty string);
 * forecast keys are added by {@code getDivText} when the page yields a
 * weather div. Any exception during the fetch is printed and the partially
 * filled object is still cached and returned.
 *
 * <p>The cache is looked up and filled under {@code city + yyyyMMdd} using
 * the name exactly as passed in, so a repeated call with the same argument
 * hits the cache even when {@code getCityName} maps it to a different name.
 * The result is additionally stored under the resolved name's key.
 *
 * @param city city name as supplied by the caller
 * @return the cached or freshly built forecast object, never {@code null}
 */
public JSONObject getWeather(String city) {
    String today = DateTimeUtil.format(new Date(), "yyyyMMdd");
    final String requestedKey = city + today;

    JSONObject cached = hmCache.get(requestedKey);
    if (cached != null) {
        return cached;
    }

    JSONObject hm = new JSONObject();
    hm.put("zhishu", "");

    String resolvedCity = city;
    try {
        resolvedCity = getCityName(city);
        final String googleWeatherURL = "http://www.google.cn/search?hl=zh-CN&newwindow=1&q=tq+"
                + URLEncoder.encode(resolvedCity, "UTF-8") + "&aq=f&oq=";
        NodeList nodeListDiv = getWeatherDiv(googleWeatherURL);
        if (nodeListDiv != null) {
            getDivText(hm, nodeListDiv.toHtml());
        }
    } catch (Exception ex) {
        ex.printStackTrace();
    }

    // Store under the key that was looked up, so the next identical call hits.
    hmCache.put(requestedKey, hm);
    // Also keep the resolved-name entry so either spelling is served from cache.
    String resolvedKey = resolvedCity + today;
    if (!resolvedKey.equals(requestedKey)) {
        hmCache.put(resolvedKey, hm);
    }
    return hm;
}
- ››Google搜索引擎的奥秘
- ››Google测试搜索结果页面右侧内容更丰富的信息栏
- ››Google Dart精粹:应用构建,快照和隔离体
- ››google的代码审查
- ››google analytics清晰追踪爬虫的爬行信息
- ››Google+中文用户在两千万Google+大军中是少数派
- ››Google AdWords最昂贵点击成本的20种关键词分类
- ››Google运作经理Bryan Power给出的GOOGLE求职意见
- ››Google用户体验的十大设计原则
- ››Google Analytics(分析)能为网站带来什么
- ››代码隐藏文件如何与页面链接
- ››Google goggles图片搜索 如何优化一个wap网站
更多精彩
赞助商链接