技术标签: ssm框架 jsoup httpclient java爬虫
首先访问京东,搜索手机,分析页面,我们抓取以下商品数据:
商品图片、价格、标题、商品详情页
技术要求:springboot spring data jpa httpclient jsoup
数据库准备:
-- Select the schema that holds the crawler's tables.
use crawler;

-- One row per crawled JD SKU. `sku` carries a secondary BTREE index;
-- `id` is the auto-increment primary key.
CREATE TABLE `jd_item` (
  `id` bigint(10) NOT NULL AUTO_INCREMENT COMMENT '主键 id',
  `spu` bigint(15) DEFAULT NULL COMMENT '商品集合 id',
  `sku` bigint(15) DEFAULT NULL COMMENT '商品最小品类单元 id',
  `title` varchar(100) DEFAULT NULL COMMENT '商品标题',
  `price` bigint(10) DEFAULT NULL COMMENT '商品价格',
  `pic` varchar(200) DEFAULT NULL COMMENT '商品图片',
  `url` varchar(200) DEFAULT NULL COMMENT '商品详情地址',
  `created` datetime DEFAULT NULL COMMENT '创建时间',
  `updated` datetime DEFAULT NULL COMMENT '更新时间',
  PRIMARY KEY (`id`),
  KEY `sku` (`sku`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COMMENT='京东商品表';
1.创建springboot 工程 ,导入依赖
<parent>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-parent</artifactId>
    <version>2.0.1.RELEASE</version>
</parent>
<groupId>com.itheima</groupId>
<artifactId>crawler</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    <!-- Spring Data JPA: required by the @Entity class and the JpaRepository-based DAO
         used in this project; version is managed by the Spring Boot parent -->
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-data-jpa</artifactId>
    </dependency>
    <!-- HttpClient -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.3</version>
    </dependency>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
    </dependency>
    <!-- Logging -->
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-log4j12</artifactId>
        <version>1.7.25</version>
    </dependency>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.12</version>
    </dependency>
    <!-- Jsoup -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.10.3</version>
    </dependency>
    <!-- Utilities -->
    <dependency>
        <groupId>org.apache.commons</groupId>
        <artifactId>commons-lang3</artifactId>
        <version>3.7</version>
    </dependency>
    <dependency>
        <groupId>commons-io</groupId>
        <artifactId>commons-io</artifactId>
        <version>2.6</version>
    </dependency>
</dependencies>
2.编写配置文件
3.编写启动类
@SpringBootApplication
public class SpringBootRun {
public static void main(String[] args) {
SpringApplication.run(SpringBootRun.class,args);
}
}
4.编写持久化类
/**
 * JPA entity mapped to the {@code jd_item} table: one crawled product variant (SKU).
 * A plain mutable bean; every property has a getter and a setter and nothing else.
 */
@Entity
@Table(name = "jd_item")
public class Item {

    /** Primary key, generated by the database identity column. */
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private Long id;

    /** Identifier of the product group (SPU) this variant belongs to. */
    private String spu;

    /** Identifier of the individual product variant (SKU). */
    private String sku;

    /** Product title. */
    private String title;

    /** Product price. */
    private Long price;

    /** Product picture. */
    private String pic;

    /** Address of the product detail page. */
    private String url;

    /** Creation time of the row. */
    private Date created;

    /** Last update time of the row. */
    private Date updated;

    // ---- id ----

    public Long getId() {
        return this.id;
    }

    public void setId(Long id) {
        this.id = id;
    }

    // ---- spu / sku ----

    public String getSpu() {
        return this.spu;
    }

    public void setSpu(String spu) {
        this.spu = spu;
    }

    public String getSku() {
        return this.sku;
    }

    public void setSku(String sku) {
        this.sku = sku;
    }

    // ---- descriptive data ----

    public String getTitle() {
        return this.title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public Long getPrice() {
        return this.price;
    }

    public void setPrice(Long price) {
        this.price = price;
    }

    public String getPic() {
        return this.pic;
    }

    public void setPic(String pic) {
        this.pic = pic;
    }

    public String getUrl() {
        return this.url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    // ---- timestamps ----

    public Date getCreated() {
        return this.created;
    }

    public void setCreated(Date created) {
        this.created = created;
    }

    public Date getUpdated() {
        return this.updated;
    }

    public void setUpdated(Date updated) {
        this.updated = updated;
    }
}
5.编写dao接口
/**
 * Spring Data JPA repository for {@link Item} entities with {@code Long} ids.
 * Declares no methods of its own; all operations are inherited from {@link JpaRepository}.
 */
public interface ItemDao extends JpaRepository<Item, Long> {
}
6.编写service接口以及实现类
/**
 * {@link ItemService} implementation that delegates to {@link ItemDao}.
 * The class-level {@code @Transactional} applies to both public methods.
 */
@Service
@Transactional
public class ItemServiceImpl implements ItemService {

    @Autowired
    private ItemDao itemDao;

    /**
     * Persists the given item through the repository.
     *
     * @param item the item to store
     */
    @Override
    public void save(Item item) {
        itemDao.save(item);
    }

    /**
     * Conditional query: uses the given item as a query-by-example probe.
     *
     * @param item probe object describing the rows to look for
     * @return the items the repository returns for that probe
     */
    @Override
    public List<Item> findAll(Item item) {
        // Typed Example instead of the raw type, so the call is checked by the compiler.
        Example<Item> example = Example.of(item);
        return this.itemDao.findAll(example);
    }
}
封装HttpClient
package com.itheima.utils;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.springframework.context.annotation.ComponentScan;
import org.springframework.stereotype.Component;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.UUID;
/**
 * HTTP helper built on Apache HttpClient: fetches page source and downloads images,
 * sharing one static connection pool between all requests.
 *
 * Registered with {@code @Component} so that it can be injected with {@code @Autowired};
 * {@code @ComponentScan} does not turn a class into a bean.
 */
@Component
public class HttpUtil {

    // Shared HTTP connection pool.
    private static PoolingHttpClientConnectionManager pool;

    static {
        pool = new PoolingHttpClientConnectionManager();
        pool.setMaxTotal(100);
        pool.setDefaultMaxPerRoute(50);
    }

    /**
     * Fetches the body of a page as a UTF-8 string.
     *
     * @param url address to request with GET
     * @return the response body when the status is 200; {@code null} for any other status
     *         or when an {@link IOException} occurs
     */
    public String getHtml(String url) {
        // The client is deliberately not closed here: it is backed by the shared pool.
        CloseableHttpClient build = HttpClients.custom().setConnectionManager(pool).build();
        // The target URL must be handed to the request; an HttpGet without a URI cannot be executed.
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36");
        httpGet.setConfig(this.getConfig());
        // try-with-resources closes the response on every path, including non-200 statuses.
        try (CloseableHttpResponse response = build.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                return EntityUtils.toString(response.getEntity(), "UTF-8");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Downloads an image into {@code F:\img\} under a random UUID-based file name that keeps
     * the suffix (everything from the last '.') of the original URL.
     *
     * @param imgUrl address of the image
     * @return the new file name when the status is 200; {@code null} for any other status
     *         or when an {@link IOException} occurs
     */
    public String getImg(String imgUrl) {
        CloseableHttpClient build = HttpClients.custom().setConnectionManager(pool).build();
        HttpGet httpGet = new HttpGet(imgUrl);
        httpGet.setConfig(this.getConfig());
        try (CloseableHttpResponse response = build.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                // Suffix such as .jpg or .png
                String suffix = imgUrl.substring(imgUrl.lastIndexOf("."));
                String newImg = UUID.randomUUID() + suffix;
                // Write the image to disk; the stream is closed even if the copy fails.
                try (FileOutputStream fileOutputStream = new FileOutputStream(new File("F:\\img\\" + newImg))) {
                    response.getEntity().writeTo(fileOutputStream);
                }
                return newImg;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Same as {@link #getImg(String)}; kept so that callers using the original method name
     * continue to work.
     *
     * @param imgUrl address of the image
     * @return the new file name, or {@code null} on failure
     */
    public String getImage(String imgUrl) {
        return getImg(imgUrl);
    }

    /**
     * Builds the per-request configuration: 500 ms connection-request timeout,
     * 500 ms connect timeout and 10 s socket timeout.
     */
    private RequestConfig getConfig() {
        return RequestConfig.custom()
                .setConnectionRequestTimeout(500)
                .setConnectTimeout(500)
                .setSocketTimeout(1000 * 10)
                .build();
    }
}
使用定时任务编写页面抓取代码
package com.itheima.utils;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.itheima.po.Item;
import com.itheima.service.ItemService;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
import java.io.IOException;
import java.util.Date;
@Component
public class ItemTask {
@Autowired
private ItemService itemService;
@Autowired
private HttpUtil httpUtil;
public static final ObjectMapper MAPPER = new ObjectMapper();
//设置定时任务,间隔100秒执行一次
@Scheduled(fixedDelay = 1000 * 50)
public void process(){
//京东的url 地址
String url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&s=59&click=0&page=";
for (int i = 1; i < 10; i=i+2) {
String html = this.httpUtil.getHtml(url+i);
//解析页面数据保存到数据库
this.parseHtml(html);
}
System.out.println("执行完成");
}
//解析页面数据库保存到数据库
private void parseHtml(String html) {
//通过Jsoup解析文件
Document document = Jsoup.parse(html);
//获取spu 的dom 数据
Elements spuelements = document.select("div#J_goodsList li.gl-item");
for (Element spuelement : spuelements) {
String spuId = spuelement.attr("data-spu");
Elements skuelements = spuelement.select("div.p-scroll li.ps-item");
for (Element skuelement : skuelements) {
Item item = new Item();
item.setSpu(spuId);
//获得skuid
String skuid = skuelement.select("img").attr("data-sku");
item.setSku(skuid);
//获得图片的url路径
String skuUrl = "https://item.jd.com/"+skuid+".html";
item.setUrl(skuUrl);
//发送请求获得商品详情页的数据
String skuHtml = httpUtil.getHtml(skuUrl);
//获得详情页的dom树
Document skuDocumet = Jsoup.parse(skuHtml);
String skuTitle = skuDocumet.select("div.sku-name").text();
item.setTitle(skuTitle);
//注意:因为价格是异步请求的,所以我们通过ajax获得
String priceUrl = "https://p.3.cn/prices/mgets?skuIds=J_"+skuid;
String parceHtml = httpUtil.getHtml(priceUrl);
ObjectMapper objectMapper = new ObjectMapper();
try {
long price = objectMapper.readTree(priceUrl).get(0).get("p").asLong();
item.setPrice(price);
} catch (IOException e) {
e.printStackTrace();
}
//获取路径
String imgUrl = skuelement.select("img").attr("src");
if (StringUtils.isEmpty(imgUrl)){
imgUrl = skuelement.select("img").attr("data-lazy-img");
}
imgUrl = imgUrl.replace("/n9/","/n7/");
String imageNewName= httpUtil.getImg("http:" + imgUrl);
item.setPic(imageNewName);
item.setCreated(new Date());
itemService.save(item);
}
}
}
}
使用json解析获得的页面
package com.itheima.utils;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.springframework.stereotype.Component;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.UUID;
/**
 * HTTP helper built on Apache HttpClient: fetches page source and downloads images,
 * sharing one static connection pool between all requests.
 */
@Component
public class HttpUtil {

    // Shared HTTP connection pool.
    private static PoolingHttpClientConnectionManager pool;

    static {
        pool = new PoolingHttpClientConnectionManager();
        pool.setMaxTotal(200);
        pool.setDefaultMaxPerRoute(50);
    }

    /**
     * Fetches a page with HttpClient and returns its body as a UTF-8 string.
     *
     * @param url address to request with GET
     * @return the response body when the status is 200; {@code null} for any other status
     *         or when an {@link IOException} occurs
     */
    public String getHtml(String url) {
        // The client is deliberately not closed here: it is backed by the shared pool.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(pool).build();
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36");
        httpGet.setConfig(this.getConfig());
        // try-with-resources closes the response on every path, including non-200 statuses.
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                return EntityUtils.toString(response.getEntity(), "UTF-8");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Downloads an image and stores it in {@code F:\img\} under a random UUID-based file
     * name that keeps the suffix (everything from the last '.') of the original URL.
     *
     * @param imgUrl address of the image
     * @return the new file name when the status is 200; {@code null} for any other status
     *         or when an {@link IOException} occurs
     */
    public String getImg(String imgUrl) {
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(pool).build();
        HttpGet httpGet = new HttpGet(imgUrl);
        httpGet.setConfig(this.getConfig());
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // Image suffix such as .jpg or .png
            String suffix = imgUrl.substring(imgUrl.lastIndexOf("."));
            // New file name
            String imgNewName = UUID.randomUUID() + suffix;
            if (response.getStatusLine().getStatusCode() == 200) {
                // Stream the image to disk; the stream is closed even if the copy fails.
                try (FileOutputStream outputStream = new FileOutputStream("F:\\img\\" + imgNewName)) {
                    response.getEntity().writeTo(outputStream);
                }
                return imgNewName;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Builds the per-request configuration: 1 s connection-request timeout,
     * 1 s connect timeout and 10 s socket timeout.
     *
     * @return the request configuration
     */
    private RequestConfig getConfig() {
        return RequestConfig.custom()
                .setConnectionRequestTimeout(1000)
                .setConnectTimeout(1000)
                .setSocketTimeout(1000 * 10)
                .build();
    }
}
文章浏览阅读2w次,点赞7次,收藏51次。四个步骤1.创建C++ Win32项目动态库dll 2.在Win32项目动态库中添加 外部依赖项 lib头文件和lib库3.导出C接口4.c#调用c++动态库开始你的表演...①创建一个空白的解决方案,在解决方案中添加 Visual C++ , Win32 项目空白解决方案的创建:添加Visual C++ , Win32 项目这......_c#调用lib
文章浏览阅读4.6k次。苹方字体是苹果系统上的黑体,挺好看的。注重颜值的网站都会使用,例如知乎:font-family: -apple-system, BlinkMacSystemFont, Helvetica Neue, PingFang SC, Microsoft YaHei, Source Han Sans SC, Noto Sans CJK SC, W..._ubuntu pingfang
文章浏览阅读159次。表单表单概述表单标签表单域按钮控件demo表单标签表单标签基本语法结构<form action="处理数据程序的url地址“ method=”get|post“ name="表单名称”></form><!--action,当提交表单时,向何处发送表单中的数据,地址可以是相对地址也可以是绝对地址--><!--method将表单中的数据传送给服务器处理,get方式直接显示在url地址中,数据可以被缓存,且长度有限制;而post方式数据隐藏传输,_html表单的处理程序有那些
文章浏览阅读1.2k次。使用说明:开启Google的登陆二步验证(即Google Authenticator服务)后用户登陆时需要输入额外由手机客户端生成的一次性密码。实现Google Authenticator功能需要服务器端和客户端的支持。服务器端负责密钥的生成、验证一次性密码是否正确。客户端记录密钥后生成一次性密码。下载谷歌验证类库文件放到项目合适位置(我这边放在项目Vender下面)https://github.com/PHPGangsta/GoogleAuthenticatorPHP代码示例://引入谷_php otp 验证器
文章浏览阅读4.3k次,点赞5次,收藏11次。matplotlib.plot画图横坐标混乱及间隔处理_matplotlib更改横轴间距
文章浏览阅读2.2k次。①Storage driver 处理各镜像层及容器层的处理细节,实现了多层数据的堆叠,为用户 提供了多层数据合并后的统一视图②所有 Storage driver 都使用可堆叠图像层和写时复制(CoW)策略③docker info 命令可查看当系统上的 storage driver主要用于测试目的,不建议用于生成环境。_docker 保存容器
文章浏览阅读834次,点赞27次,收藏13次。网络拓扑结构是指计算机网络中各组件(如计算机、服务器、打印机、路由器、交换机等设备)及其连接线路在物理布局或逻辑构型上的排列形式。这种布局不仅描述了设备间的实际物理连接方式,也决定了数据在网络中流动的路径和方式。不同的网络拓扑结构影响着网络的性能、可靠性、可扩展性及管理维护的难易程度。_网络拓扑csdn
文章浏览阅读1.8k次,点赞5次,收藏8次。IOS系统Date的坑要创建一个指定时间的new Date对象时,通常的做法是:new Date("2020-09-21 11:11:00")这行代码在 PC 端和安卓端都是正常的,而在 iOS 端则会提示 Invalid Date 无效日期。在IOS年月日中间的横岗许换成斜杠,也就是new Date("2020/09/21 11:11:00")通常为了兼容IOS的这个坑,需要做一些额外的特殊处理,笔者在开发的时候经常会忘了兼容IOS系统。所以就想试着重写Date函数,一劳永逸,避免每次ne_date.prototype 将所有 ios
文章浏览阅读5.3k次。方法一:用PLSQL Developer工具。 1 在PLSQL Developer的sql window里输入select * from test for update; 2 按F8执行 3 打开锁, 再按一下加号. 鼠标点到第一列的列头,使全列成选中状态,然后粘贴,最后commit提交即可。(前提..._excel导入pl/sql
文章浏览阅读83次。Git常用命令速查手册1、初始化仓库git init2、将文件添加到仓库git add 文件名 # 将工作区的某个文件添加到暂存区 git add -u # 添加所有被tracked文件中被修改或删除的文件信息到暂存区,不处理untracked的文件git add -A # 添加所有被tracked文件中被修改或删除的文件信息到暂存区,包括untracked的文件...
文章浏览阅读202次。分享119个ASP.NET源码总有一个是你想要的_千博二手车源码v2023 build 1120
文章浏览阅读1.8k次。版权声明:转载请注明出处 http://blog.csdn.net/irean_lau。目录(?)[+]1、缺省构造函数。2、缺省拷贝构造函数。3、 缺省析构函数。4、缺省赋值运算符。5、缺省取址运算符。6、 缺省取址运算符 const。[cpp] view plain copy_空类默认产生哪些类成员函数