博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
java网页数据抓取实例
阅读量:6032 次
发布时间:2019-06-20

本文共 16436 字,大约阅读时间需要 54 分钟。

网页上面数据如下:

如果想要获取上图所示网页的数据,代码如下:

(1)调度类,主要调用工具类中的方法获取数据并入库

package com.jointsky.jointframe.weather.jobservice;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.springframework.transaction.annotation.Transactional;

import com.jointsky.jointframe.scheduler.exception.ExecutionException;
import com.jointsky.jointframe.scheduler.quartz.JobService;
import com.jointsky.jointframe.weather.entity.ActuallyForecastWeather;
import com.jointsky.jointframe.weather.entity.ActuallyWeather;
import com.jointsky.jointframe.weather.service.ActuallyForecastWeatherManager;
import com.jointsky.jointframe.weather.service.ActuallyWeatherManager;
import com.jointsky.jointframe.weather.utils.UrlInfo;

/**
 * Scheduled job that fetches the observed ("today") and forecast weather records for every
 * county code returned by {@link UrlInfo#getCountyCodes()} and stores them through the two
 * weather managers.
 *
 * <p>For each record the job looks up an existing row by forecast time and place name; when an
 * id is found the row is updated, otherwise the record is saved as a new row.
 *
 * <p>NOTE(review): the generic type arguments in this class ({@code Map<String, Object>},
 * {@code List<ActuallyWeather>}, ...) were missing from the available copy of the file and are
 * reconstructed from how the values are used; confirm them against {@code JobService},
 * {@code UrlInfo} and the manager interfaces.
 */
@Transactional
public class ActuallyWeatherJobService implements JobService {

    /** Manager used to look up, save and update today's weather records. */
    private ActuallyWeatherManager actuallyWeatherManager;

    /** Last today's-weather record handled by {@link #execute(Map)}; exposed via accessors. */
    private ActuallyWeather actuallyWeather;

    /** Manager used to look up, save and update forecast weather records. */
    private ActuallyForecastWeatherManager actuallyForecastWeatherManager;

    /** Last forecast record handled by {@link #execute(Map)}; exposed via accessors. */
    private ActuallyForecastWeather actuallyForecastWeather;

    /**
     * Parameter map bean property. It is kept for its accessors only; lookups now build their
     * own map per record so that keys cannot leak from one record to the next.
     */
    private Map<String, Object> map = new HashMap<String, Object>();

    /**
     * Runs the import for every configured county code.
     *
     * <p>A failure while processing one county is reported on stderr and does not stop the
     * remaining counties from being processed.
     *
     * @param arg0 job parameters; not read by this implementation
     * @throws ExecutionException declared by the {@code JobService} contract
     */
    @Override
    public void execute(Map<String, Object> arg0) throws ExecutionException {
        System.out.println("实况天气资料数据获取调度成功");
        String[] countyCodes = UrlInfo.getCountyCodes().split(",");
        for (String countyCode : countyCodes) {
            if (StringUtils.isEmpty(countyCode)) {
                continue;
            }
            try {
                importCounty(countyCode);
            } catch (Exception e) {
                // Best effort per county: report which county failed and carry on.
                System.err.println("实况天气资料数据获取失败,区县编码:" + countyCode);
                e.printStackTrace();
            }
        }
    }

    /**
     * Fetches and stores the today and forecast records of one county.
     *
     * @param countyCode non-empty county code
     * @throws Exception if fetching, parsing or persisting fails
     */
    private void importCounty(String countyCode) throws Exception {
        String url = UrlInfo.getUrl(countyCode);
        Map<String, Object> district = UrlInfo.getDistrict(countyCode);
        // City-level name
        String cityLevel = (String) district.get("cityLevel");
        // County-level name
        String countyLevel = (String) district.get("countyLevel");

        List<ActuallyWeather> todayRecords = UrlInfo.getURLInfoOfActully(url, "utf-8");
        List<ActuallyForecastWeather> forecastRecords = UrlInfo.getURLInfoOfForecast(url, "utf-8");

        for (ActuallyWeather record : todayRecords) {
            actuallyWeather = record;
            record.setCityLevel(cityLevel);
            record.setCountyLevel(countyLevel);
            String existingId = actuallyWeatherManager.findIdByParams(
                    lookupParams(record.getForecastTime(), record.getPlaceName()));
            if (StringUtils.isNotEmpty(existingId)) {
                record.setId(existingId);
                actuallyWeatherManager.updateWeather(record);
            } else {
                actuallyWeatherManager.save(record);
            }
        }

        for (ActuallyForecastWeather record : forecastRecords) {
            actuallyForecastWeather = record;
            record.setCityLevel(cityLevel);
            record.setCountyLevel(countyLevel);
            String existingId = actuallyForecastWeatherManager.findIdByParams(
                    lookupParams(record.getForecastTime(), record.getPlaceName()));
            if (StringUtils.isNotEmpty(existingId)) {
                record.setId(existingId);
                actuallyForecastWeatherManager.updateForecastWeather(record);
            } else {
                actuallyForecastWeatherManager.save(record);
            }
        }
    }

    /**
     * Builds the lookup parameters for one record. A key is only added when its value is
     * non-empty.
     *
     * @param forecastTime forecast time of the record, may be empty
     * @param placeName place name of the record, may be empty
     * @return a new map holding the non-empty values under "forecastTime" and "placeName"
     */
    private static Map<String, Object> lookupParams(String forecastTime, String placeName) {
        Map<String, Object> params = new HashMap<String, Object>();
        if (StringUtils.isNotEmpty(forecastTime)) {
            params.put("forecastTime", forecastTime);
        }
        if (StringUtils.isNotEmpty(placeName)) {
            params.put("placeName", placeName);
        }
        return params;
    }

    public ActuallyWeatherManager getActuallyWeatherManager() {
        return actuallyWeatherManager;
    }

    public void setActuallyWeatherManager(ActuallyWeatherManager actuallyWeatherManager) {
        this.actuallyWeatherManager = actuallyWeatherManager;
    }

    public ActuallyWeather getActuallyWeather() {
        return actuallyWeather;
    }

    public void setActuallyWeather(ActuallyWeather actuallyWeather) {
        this.actuallyWeather = actuallyWeather;
    }

    public Map<String, Object> getMap() {
        return map;
    }

    public void setMap(Map<String, Object> map) {
        this.map = map;
    }

    public ActuallyForecastWeatherManager getActuallyForecastWeatherManager() {
        return actuallyForecastWeatherManager;
    }

    public void setActuallyForecastWeatherManager(
            ActuallyForecastWeatherManager actuallyForecastWeatherManager) {
        this.actuallyForecastWeatherManager = actuallyForecastWeatherManager;
    }

    public ActuallyForecastWeather getActuallyForecastWeather() {
        return actuallyForecastWeather;
    }

    public void setActuallyForecastWeather(ActuallyForecastWeather actuallyForecastWeather) {
        this.actuallyForecastWeather = actuallyForecastWeather;
    }
}
View Code

(2)工具类,主要为一些执行查询数据的实现方法

package com.jointsky.jointframe.weather.utils;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;

import com.jointsky.jointframe.weather.entity.ActuallyForecastWeather;
import com.jointsky.jointframe.weather.entity.ActuallyWeather;

/**
 * Utility class that downloads the weather page of a county and extracts today's and the
 * forecast weather records from its HTML with regular expressions.
 *
 * <p>NOTE(review): in the available copy of this file several string literals and all generic
 * type arguments had lost their {@code <...>} parts. Generic types and the truncated hour
 * condition are reconstructed here; the literals that could not be recovered are left exactly
 * as found and are marked individually below. The parser will not extract anything useful
 * until those literals are restored.
 */
public class UrlInfo {

    /** Charset used when the caller does not supply one. */
    private static final String DEFAULT_CHARSET = "utf-8";

    /** Milliseconds in one day; a long so that day offsets cannot overflow int arithmetic. */
    private static final long MILLIS_PER_DAY = 86400000L;

    /** Text of the cell that marks the start of a data module ("wind power" column header). */
    private static final String MODULE_HEADER = "风力";

    /**
     * Delimiter that splits the page into segments.
     * NOTE(review): empty in the available copy; presumably an HTML tag was stripped - restore.
     */
    private static final String SEGMENT_DELIMITER = "";

    /**
     * Delimiter that splits one segment into cells.
     * NOTE(review): empty in the available copy; presumably an HTML tag was stripped - restore.
     */
    private static final String CELL_DELIMITER = "";

    /**
     * Removes link tags from a line.
     * NOTE(review): only the tail {@code ]*>} of this pattern survived in the available copy;
     * {@code <a[^>]*>} is a reconstruction based on the original "handle links" comment.
     */
    private static final Pattern LINK_TAG = Pattern.compile("<a[^>]*>");

    /** Matches an opening tag with attributes so it can be reduced to the bare tag name. */
    private static final Pattern TAG_WITH_ATTRIBUTES = Pattern.compile("<(\\w+)[^>]*>");

    /**
     * Pattern whose first group captures the content of one cell.
     * NOTE(review): the surrounding tags appear to have been stripped from this literal in the
     * available copy; as written it always matches the empty string.
     */
    private static Pattern proInfo = Pattern.compile("(.*?)", Pattern.DOTALL);

    /**
     * Comma-separated county codes to fetch (Yinchuan: municipal districts 53614, Helan county
     * 53610, Yongning county 53618, Lingwu city 53619).
     */
    private static String countyCodes = "53614,53610,53618,53619";

    /**
     * Downloads the page and returns the records of today's remaining time slots.
     *
     * @param urlInfo page URL
     * @param charset charset used to decode the page; empty or null means utf-8
     * @return today's records, never null
     * @throws Exception if the page cannot be downloaded or a cell cannot be parsed
     */
    public static List<ActuallyWeather> getURLInfoOfActully(String urlInfo, String charset)
            throws Exception {
        return parseToday(getUrlInfo(urlInfo, charset));
    }

    /**
     * Downloads the page and returns the records of the forecast days after today.
     *
     * @param urlInfo page URL
     * @param charset charset used to decode the page; empty or null means utf-8
     * @return forecast records, never null
     * @throws Exception if the page cannot be downloaded or a cell cannot be parsed
     */
    public static List<ActuallyForecastWeather> getURLInfoOfForecast(String urlInfo, String charset)
            throws Exception {
        return parseForecast(getUrlInfo(urlInfo, charset));
    }

    /**
     * Downloads a page decoded as utf-8. See {@link #getUrlInfo(String, String)}.
     *
     * @param urlInfo page URL
     * @return the normalized page source
     * @throws Exception if the page cannot be downloaded
     */
    public static String getUrlInfo(String urlInfo) throws Exception {
        return getUrlInfo(urlInfo, DEFAULT_CHARSET);
    }

    /**
     * Downloads a page and returns its source joined into one string without line breaks,
     * with link tags removed and every opening tag reduced to its bare name.
     *
     * @param urlInfo page URL
     * @param charset charset used to decode the page; empty or null means utf-8
     * @return the normalized, trimmed page source
     * @throws Exception if the connection fails or the charset is not supported
     */
    public static String getUrlInfo(String urlInfo, String charset) throws Exception {
        String encoding = StringUtils.isEmpty(charset) ? DEFAULT_CHARSET : charset;
        HttpURLConnection connection = (HttpURLConnection) new URL(urlInfo).openConnection();
        InputStream in = null;
        BufferedReader reader = null;
        try {
            in = connection.getInputStream();
            reader = new BufferedReader(new InputStreamReader(in, encoding));
            StringBuilder html = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                // Strip links
                line = LINK_TAG.matcher(line).replaceAll("");
                // Strip styling: drop the attributes of every opening tag
                line = TAG_WITH_ATTRIBUTES.matcher(line).replaceAll("<$1>");
                html.append(line);
            }
            return html.toString().trim();
        } finally {
            // Close on every path; closing the reader also closes the wrapped stream.
            if (reader != null) {
                reader.close();
            } else if (in != null) {
                in.close();
            }
        }
    }

    /**
     * Extracts today's records: the rows of the first {@code t} modules, where {@code t} is
     * the number of time slots left today.
     */
    private static List<ActuallyWeather> parseToday(String html) {
        List<ActuallyWeather> result = new ArrayList<ActuallyWeather>();
        Date now = new Date();
        SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd");
        int todayModules = todayModuleCount(now);
        int moduleIndex = 0;
        for (String segment : html.split(SEGMENT_DELIMITER)) {
            if (!isModuleHeader(segment)) {
                continue;
            }
            moduleIndex++;
            if (moduleIndex > todayModules) {
                break; // everything from here on belongs to the forecast
            }
            String forecastTime = todayForecastTime(todayModules - moduleIndex, now, dayFormat);
            for (Row row : parseRows(segment.split(CELL_DELIMITER))) {
                ActuallyWeather weather = new ActuallyWeather();
                weather.setPlaceName(row.placeName);
                weather.setWeatherStatus(row.weatherStatus);
                weather.setMaxTemperature(row.maxTemperature);
                weather.setMinTemperature(row.minTemperature);
                weather.setWindPower(row.windPower);
                // Publication date
                weather.setPubTime(dayFormat.format(now));
                weather.setForecastTime(forecastTime);
                result.add(weather);
            }
        }
        return result;
    }

    /**
     * Extracts the forecast records: the rows of every module after today's modules. Each
     * further module is dated one more day ahead, at hour "00".
     */
    private static List<ActuallyForecastWeather> parseForecast(String html) {
        List<ActuallyForecastWeather> result = new ArrayList<ActuallyForecastWeather>();
        Date now = new Date();
        SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd");
        int todayModules = todayModuleCount(now);
        int moduleIndex = 0;
        for (String segment : html.split(SEGMENT_DELIMITER)) {
            if (!isModuleHeader(segment)) {
                continue;
            }
            moduleIndex++;
            if (moduleIndex <= todayModules) {
                continue; // still one of today's modules
            }
            // Long arithmetic: the int product overflowed from 25 days onwards.
            Date day = new Date(now.getTime() + (moduleIndex - todayModules) * MILLIS_PER_DAY);
            String forecastTime = dayFormat.format(day) + " 00";
            for (Row row : parseRows(segment.split(CELL_DELIMITER))) {
                ActuallyForecastWeather forecast = new ActuallyForecastWeather();
                forecast.setPlaceName(row.placeName);
                forecast.setWeatherStatus(row.weatherStatus);
                forecast.setMaxTemperature(row.maxTemperature);
                forecast.setMinTemperature(row.minTemperature);
                forecast.setWindPower(row.windPower);
                // Publication date
                forecast.setPubTime(dayFormat.format(now));
                forecast.setForecastTime(forecastTime);
                result.add(forecast);
            }
        }
        return result;
    }

    /**
     * Returns how many of today's time-slot modules the page is expected to contain:
     * 4 up to hour 12 (06-12, 12-18, 18-24, 00-06 next day), 3 up to hour 18, otherwise 2.
     */
    private static int todayModuleCount(Date now) {
        int hour = Integer.parseInt(new SimpleDateFormat("HH").format(now));
        if (hour <= 12) {
            return 4;
        }
        if (hour <= 18) {
            return 3;
        }
        return 2;
    }

    /**
     * Returns the forecast time of one of today's modules.
     *
     * @param modulesAfterThis number of today's modules that follow this one (0 for the last)
     * @param now reference time
     * @param dayFormat formatter producing the date part
     * @return date plus hour: "06", "12" or "18" today, or "00" of the next day for the last
     *         module
     */
    private static String todayForecastTime(int modulesAfterThis, Date now,
            SimpleDateFormat dayFormat) {
        String today = dayFormat.format(now);
        switch (modulesAfterThis) {
            case 3:
                return today + " 06";
            case 2:
                return today + " 12";
            case 1:
                return today + " 18";
            case 0:
                // Early morning of the next day
                return dayFormat.format(new Date(now.getTime() + MILLIS_PER_DAY)) + " 00";
            default:
                return today + " 00";
        }
    }

    /** Tells whether the first cell found in the segment is the module header cell. */
    private static boolean isModuleHeader(String segment) {
        Matcher matcher = proInfo.matcher(segment);
        return matcher.find() && MODULE_HEADER.equals(cellText(matcher));
    }

    /**
     * Returns the text of a matched cell: group 1 trimmed, spaces removed, cut at the first
     * {@code >}. Returns an empty string when nothing is left.
     * NOTE(review): the removed " " may have been a non-breaking space or {@code &nbsp;} in
     * the original source - confirm.
     */
    private static String cellText(Matcher matched) {
        String[] parts = matched.group(1).trim().replace(" ", "").split(">");
        return parts.length == 0 ? "" : parts[0];
    }

    /**
     * Groups the cells of one module into rows of four. Cell 0 is the header cell; after it
     * the cells repeat as place name, weather status, "max/min" temperature and wind power,
     * and a row is emitted after each wind power cell. A temperature cell without "/" raises
     * ArrayIndexOutOfBoundsException.
     */
    private static List<Row> parseRows(String[] cells) {
        List<Row> rows = new ArrayList<Row>();
        Row row = new Row();
        for (int i = 0; i < cells.length; i++) {
            int column = i % 4;
            Matcher matcher = proInfo.matcher(cells[i]);
            if (matcher.find()) {
                String text = cellText(matcher);
                if (column == 1) {
                    row.placeName = text;
                } else if (column == 2) {
                    row.weatherStatus = text;
                } else if (column == 3) {
                    String[] temperatures = text.split("/");
                    row.maxTemperature = temperatures[0] + "℃";
                    row.minTemperature = temperatures[1];
                } else if (i != 0) {
                    row.windPower = text;
                }
            }
            if (column == 0 && i != 0) {
                rows.add(row);
                row = new Row();
            }
        }
        return rows;
    }

    /**
     * Builds the page URL for a county.
     *
     * @param countyCode county code; when empty the base URL is returned unchanged
     * @return the base URL, with {@code ?sd=<countyCode>} appended for a non-empty code
     * @throws Exception never thrown by this implementation; kept for callers
     */
    public static String getUrl(String countyCode) throws Exception {
        String url = "http://3g.nx121.com/pc/tqybxzb.aspx";
        if (StringUtils.isNotEmpty(countyCode)) {
            url = url + "?sd=" + countyCode;
        }
        return url;
    }

    /**
     * Looks up the city-level and county-level district names of a county code.
     *
     * @param countyCode county code
     * @return a map with the keys "cityLevel" and "countyLevel", or an empty map for a code
     *         that is not known
     * @throws Exception never thrown by this implementation; kept for callers
     */
    public static Map<String, Object> getDistrict(String countyCode) throws Exception {
        Map<String, Object> district = new HashMap<String, Object>();
        String countyLevel = null;
        if ("53614".equals(countyCode)) {
            countyLevel = "市辖区";
        } else if ("53610".equals(countyCode)) {
            countyLevel = "贺兰县";
        } else if ("53618".equals(countyCode)) {
            countyLevel = "永宁县";
        } else if ("53619".equals(countyCode)) {
            countyLevel = "灵武市";
        }
        if (countyLevel != null) {
            // City-level district
            district.put("cityLevel", "银川市");
            // County-level district
            district.put("countyLevel", countyLevel);
        }
        return district;
    }

    public static Pattern getProInfo() {
        return proInfo;
    }

    public static void setProInfo(Pattern proInfo) {
        UrlInfo.proInfo = proInfo;
    }

    public static String getCountyCodes() {
        return countyCodes;
    }

    public static void setCountyCodes(String countyCodes) {
        UrlInfo.countyCodes = countyCodes;
    }

    /** Values of one table row before they are copied into an entity. */
    private static final class Row {
        private String placeName;
        private String weatherStatus;
        private String maxTemperature;
        private String minTemperature;
        private String windPower;
    }
}
View Code

(3)实体类,用于存放数据的bean

package com.jointsky.jointframe.weather.entity;import javax.persistence.Column;import javax.persistence.Entity;import javax.persistence.Table;import org.apache.commons.lang.builder.EqualsBuilder;import org.apache.commons.lang.builder.HashCodeBuilder;import org.apache.commons.lang.builder.ToStringBuilder;import org.apache.commons.lang.builder.ToStringStyle;import org.hibernate.annotations.Cache;import org.hibernate.annotations.CacheConcurrencyStrategy;import com.jointsky.jointframe.core.entity.IdEntity;/** * 

Description:实况天气资料(当天)Entity类

*/@Entity@Table(name = "t_actually_weather")@Cache(usage = CacheConcurrencyStrategy.READ_WRITE)public class ActuallyWeather extends IdEntity { /** * */ private static final long serialVersionUID = -5324072662712469478L; /** * 市级行政区(名称) */ private String cityLevel; /** * 区县级行政区名(名称) */ private String countyLevel; /** * 发布时间 */ private String pubTime; /** * 地名 */ private String placeName; /** * 天气状态:多云、晴、小雨...... */ private String weatherStatus; /** * 最高温度 */ private String maxTemperature; /** * 最低温度 */ private String minTemperature; /** * 风力 */ private String windPower; /** * 预报时间 */ private String forecastTime; @Column(name="pub_time",length=50) public String getPubTime() { return pubTime; } public void setPubTime(String pubTime) { this.pubTime = pubTime; } @Column(name="place_name",length=50) public String getPlaceName() { return placeName; } public void setPlaceName(String placeName) { this.placeName = placeName; } @Column(name="weather_status",length=50) public String getWeatherStatus() { return weatherStatus; } public void setWeatherStatus(String weatherStatus) { this.weatherStatus = weatherStatus; } @Column(name="max_temperature",length=50) public String getMaxTemperature() { return maxTemperature; } public void setMaxTemperature(String maxTemperature) { this.maxTemperature = maxTemperature; } @Column(name="min_temperature",length=50) public String getMinTemperature() { return minTemperature; } public void setMinTemperature(String minTemperature) { this.minTemperature = minTemperature; } @Column(name="wind_power",length=50) public String getWindPower() { return windPower; } public void setWindPower(String windPower) { this.windPower = windPower; } public static long getSerialversionuid() { return serialVersionUID; } @Column(name="forecast_time",length=50) public String getForecastTime() { return forecastTime; } public void setForecastTime(String forecastTime) { this.forecastTime = forecastTime; } @Override public String toString() { return new 
ToStringBuilder(this, ToStringStyle.MULTI_LINE_STYLE) .append("id", id).toString(); } @Column(name="city_level",length=50) public String getCityLevel() { return cityLevel; } public void setCityLevel(String cityLevel) { this.cityLevel = cityLevel; } @Column(name="county_level",length=50) public String getCountyLevel() { return countyLevel; } public void setCountyLevel(String countyLevel) { this.countyLevel = countyLevel; } //MeetingArrangement是当前实体 @Override public boolean equals(Object o) { boolean equal = false; if (o != null && ActuallyWeather.class.isAssignableFrom(o.getClass())) { ActuallyWeather actuallyWeather = (ActuallyWeather) o; equal = (new EqualsBuilder().append(this.id, actuallyWeather.id)).isEquals(); } return equal; } @Override public int hashCode() { return new HashCodeBuilder(17, 37).append(id).toHashCode(); }}
View Code

预报实体类(ActuallyForecastWeather)的字段与当天实体类(ActuallyWeather)完全一致,只是映射的表名不同。

参考文档:

转载地址:http://gdzhx.baihongyu.com/

你可能感兴趣的文章
the archive which is referenced by ...问题解决方案
查看>>
西部开源学习笔记BOOK2-《unit 4》
查看>>
C++下的DLL编程入门
查看>>
oracle中怎样查询用户权限
查看>>
s5pv210运行裸机程序的方法之在SDRAM(DDR2)中运行
查看>>
吊炸天的 PHP 7 ,你值得拥有
查看>>
java循环练习:水仙花数
查看>>
HTTPS 接入优化建议
查看>>
线程安全与可重入函数的区别及联系
查看>>
python-简单测试wsgi
查看>>
C语言 飞机
查看>>
三台主机分别部署LAMP
查看>>
驱动学习之led-class.c源码分析
查看>>
Linux 6.8 root密码丢失找回
查看>>
『中级篇』docker之wordpress容器SSL(番外篇)(78)
查看>>
『高级篇』docker之DockerSwarm的集群环境搭建(28)
查看>>
Python 实现扫码二维码登录
查看>>
ASP.NET Core配置环境变量和启动设置
查看>>
百度云盘上传文件和下载文件慢的解决办法
查看>>
机器大神 Michael Jordan 教授主题演讲:机器学习——创新视角,直面挑战》
查看>>