爬取京东商品数据
我把项目部署到了linux中,进行爬取,爬到了3000条手机信息,只是爬了一些简单的文本信息.
本文爬取的数据为京东手机信息
准备工作
- 导入爬取数据需要的依赖包
- 编写httpClient工具类
- 编写pojo类
- 编写dao
<!-- Maven dependencies required by the crawler classes shown in this post. -->
<dependencies>
<!-- Apache HttpClient: used by HttpClientUtils to issue the GET/POST requests. -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.4</version>
</dependency>
<!-- jsoup: parses the fetched HTML and evaluates the CSS selectors in JDPhone/JDPC. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.3</version>
</dependency>
<!-- MySQL JDBC driver: provides com.mysql.jdbc.Driver, which ProductDao configures. -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.38</version>
</dependency>
<!-- Spring JDBC: provides JdbcTemplate, the base class of ProductDao. -->
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-jdbc</artifactId>
<version>4.2.4.RELEASE</version>
</dependency>
<!-- c3p0: provides ComboPooledDataSource, the connection pool created in ProductDao. -->
<dependency>
<groupId>c3p0</groupId>
<artifactId>c3p0</artifactId>
<version>0.9.1.2</version>
</dependency>
<!-- Gson: decodes the JSON returned by the price endpoint in parsePid. -->
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.8.1</version>
</dependency>
</dependencies>
package com.hrh.utils;
import com.hrh.pojo.Product;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
 * Static helpers for issuing HTTP GET and POST requests through a pooled
 * Apache HttpClient and returning the response body as a UTF-8 string.
 *
 * <p>One client backed by one connection pool is shared by all callers, and
 * every response is closed after its body has been read so that the pooled
 * connection is always handed back, even when reading the body fails.
 */
public class HttpClientUtils {
    /** Browser-like User-Agent sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0";

    /** Charset used to decode response bodies and to encode POST form fields. */
    private static final String CHARSET = "utf-8";

    /** Timeouts: 5 s to lease a pooled connection, 5 s to connect, 10 s of socket inactivity. */
    private static final RequestConfig REQUEST_CONFIG = RequestConfig.custom()
            .setConnectionRequestTimeout(5000)
            .setConnectTimeout(5000)
            .setSocketTimeout(10 * 1000)
            .build();

    /** Connection pool shared by all requests. */
    private static final PoolingHttpClientConnectionManager connectionManager;

    /** Single client instance; built once instead of once per request. */
    private static final CloseableHttpClient HTTP_CLIENT;

    static {
        connectionManager = new PoolingHttpClientConnectionManager();
        // At most 200 connections in the whole pool.
        connectionManager.setMaxTotal(200);
        // At most 20 connections per route (target host).
        connectionManager.setDefaultMaxPerRoute(20);
        HTTP_CLIENT = HttpClients.custom().setConnectionManager(connectionManager).build();
    }

    /**
     * Returns the shared client.
     *
     * @return the client bound to the shared connection pool
     */
    private static CloseableHttpClient getCloseableHttpClient() {
        return HTTP_CLIENT;
    }

    /**
     * Sends the request with the default User-Agent and timeouts and reads the body.
     *
     * @param httpRequestBase the prepared GET or POST request
     * @return the response body decoded as UTF-8
     * @throws IOException if the request fails or the body cannot be read
     */
    private static String execute(HttpRequestBase httpRequestBase) throws IOException {
        httpRequestBase.setHeader("User-Agent", USER_AGENT);
        httpRequestBase.setConfig(REQUEST_CONFIG);
        // try-with-resources closes the response, which releases the pooled
        // connection even if reading the entity throws (e.g. on a socket timeout).
        try (CloseableHttpResponse response = getCloseableHttpClient().execute(httpRequestBase)) {
            return EntityUtils.toString(response.getEntity(), CHARSET);
        }
    }

    /**
     * Performs an HTTP GET.
     *
     * @param url the absolute URL to fetch
     * @return the response body decoded as UTF-8
     * @throws IOException if the request fails or the body cannot be read
     */
    public static String doGet(String url) throws IOException {
        return execute(new HttpGet(url));
    }

    /**
     * Performs an HTTP POST with the given parameters as a URL-encoded form.
     *
     * @param url the absolute URL to post to
     * @param params form fields; {@code null} is treated as an empty form
     * @return the response body decoded as UTF-8
     * @throws IOException if the request fails or the body cannot be read
     */
    public static String doPost(String url, Map<String,String> params) throws IOException {
        HttpPost httpPost = new HttpPost(url);
        List<BasicNameValuePair> list = new ArrayList<>();
        if (params != null) {
            for (Map.Entry<String, String> field : params.entrySet()) {
                list.add(new BasicNameValuePair(field.getKey(), field.getValue()));
            }
        }
        // Encode explicitly as UTF-8; the single-argument constructor falls back
        // to ISO-8859-1, which corrupts non-ASCII (e.g. Chinese) values.
        httpPost.setEntity(new UrlEncodedFormEntity(list, CHARSET));
        return execute(httpPost);
    }
}
package com.hrh.pojo;
/**
 * Row model for one crawled product: its id, page title, brand, product
 * name and price, all kept as plain strings.
 */
public class Product {
    /** Product id as taken from the listing page. */
    private String pid;
    /** Title shown on the product detail page. */
    private String title;
    /** Brand name. */
    private String brand;
    /** Product (model) name. */
    private String pname;
    /** Price text as returned by the price endpoint. */
    private String price;

    /** @return the product id */
    public String getPid() {
        return this.pid;
    }

    /** @param pid the product id */
    public void setPid(String pid) {
        this.pid = pid;
    }

    /** @return the detail-page title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the detail-page title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the brand name */
    public String getBrand() {
        return this.brand;
    }

    /** @param brand the brand name */
    public void setBrand(String brand) {
        this.brand = brand;
    }

    /** @return the product name */
    public String getPname() {
        return this.pname;
    }

    /** @param pname the product name */
    public void setPname(String pname) {
        this.pname = pname;
    }

    /** @return the price text */
    public String getPrice() {
        return this.price;
    }

    /** @param price the price text */
    public void setPrice(String price) {
        this.price = price;
    }

    /**
     * Renders all fields; title, brand and pname are single-quoted while pid
     * and price are printed bare.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Product{");
        text.append("pid=").append(pid);
        text.append(", title='").append(title).append('\'');
        text.append(", brand='").append(brand).append('\'');
        text.append(", pname='").append(pname).append('\'');
        text.append(", price=").append(price);
        text.append('}');
        return text.toString();
    }
}
package com.hrh.dao;
import com.hrh.pojo.Product;
import com.mchange.v2.c3p0.ComboPooledDataSource;
import org.springframework.jdbc.core.JdbcTemplate;
import java.beans.PropertyVetoException;
/**
 * JdbcTemplate-based DAO that stores crawled products in the {@code jd_phone}
 * table, using a c3p0 connection pool that it configures for itself.
 */
public class ProductDao extends JdbcTemplate{
    /** Positional insert; one placeholder per Product field. */
    private static final String INSERT_SQL = "insert into jd_phone values (?,?,?,?,?)";

    /**
     * Builds the c3p0 pool and installs it as this template's data source.
     *
     * @throws IllegalStateException if the pool rejects the configuration
     */
    public ProductDao(){
        // c3p0 connection pool.
        // NOTE(review): credentials and JDBC URL are hard-coded; consider externalising them.
        ComboPooledDataSource ds = new ComboPooledDataSource();
        try {
            ds.setDriverClass("com.mysql.jdbc.Driver");
            ds.setUser("root");
            ds.setPassword("123");
            ds.setJdbcUrl("jdbc:mysql://localhost:3306/crawler?characterEncoding=utf-8");
        } catch (PropertyVetoException e) {
            // Fail fast: continuing would install a half-configured pool and
            // defer the failure to the first insert, far from its cause.
            throw new IllegalStateException("Failed to configure the c3p0 data source", e);
        }
        super.setDataSource(ds);
    }

    /**
     * Inserts one product row.
     *
     * @param product the product to store
     */
    public void addProduct(Product product){
        // Values are bound in the order pid, title, pname, brand, price.
        // NOTE(review): the statement names no columns, so this assumes the
        // jd_phone table declares its columns in exactly that order - confirm.
        super.update(INSERT_SQL,
                product.getPid(),product.getTitle(),product.getPname(),product.getBrand(),product.getPrice());
    }
}
核心代码
创建线程池和队列 开启线程 等待队列中的数据并进行分析
博客: 线程池和队列的基本使用。在获得手机列表时,如果pid一个一个地解析,效率太低: 必须解析完一个页面的所有pid,才能进入下一页继续解析。所以要引入多线程。
- 线程池的使用: 提高程序执行效率
- 如果使用线程池,就要考虑线程安全问题
- pid在存储时,要放到线程安全的容器中, 并且容器是FIFO的
- 队列的使用: 线程安全(阻塞队列)
基本流程 :
- 确定手机列表页的URL 进行分析 得到本页中所有手机的pid
- 将pid 放入阻塞队列中 等待线程的解析
- 根据pid确定具体手机的URL 进行解析 将数据封装到product对象中
- 调用dao 将product对象存到数据库中
package com.hrh.test;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.hrh.dao.ProductDao;
import com.hrh.pojo.Product;
import com.hrh.utils.HttpClientUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.concurrent.*;
/**
 * Crawls JD phone listings.
 *
 * <p>The main thread walks the search result pages and pushes every product
 * id (pid) it finds into a bounded blocking queue. Worker threads take pids
 * from the queue, fetch the detail page and the price, and store the result
 * through {@link ProductDao}. A further thread prints the queue size once a
 * second.
 *
 * <p>A failing pid is retried by the worker that holds it, a bounded number of
 * times, instead of being put back on the queue: re-queueing with a blocking
 * put can deadlock against the producer when the queue is full, and a pid
 * that always fails would circulate forever. When every queued pid has been
 * handled the pool is shut down so the JVM can exit.
 *
 * <p>NOTE(review): the host names in the original literals read
 * {@code search.jd}, {@code item.jd} and {@code p.3}, which are not resolvable
 * hosts; they are restored here to the public JD hosts - confirm.
 */
public class JDPhone {
    /** Number of search result pages to walk. */
    private static final int PAGE_COUNT = 100;
    /** Number of worker threads that consume pids. */
    private static final int WORKER_COUNT = 10;
    /** How many times one pid is tried before it is given up. */
    private static final int MAX_ATTEMPTS = 3;

    /** Search page URL without the trailing page number. */
    private static final String SEARCH_URL =
            "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&page=";
    /** Detail page URL prefix; the pid and ".html" are appended. */
    private static final String ITEM_URL_PREFIX = "https://item.jd.com/";
    /** Price endpoint prefix; a random pduid and the sku id are appended. */
    private static final String PRICE_URL_PREFIX = "https://p.3.cn/prices/mgets?pduid=";

    /** Shared JSON decoder. */
    private static final Gson GSON = new GsonBuilder().create();

    // DAO used to persist parsed products.
    static ProductDao productDao = new ProductDao();
    // Thread pool running the monitor and the workers.
    static ExecutorService threadPool = Executors.newFixedThreadPool(20);
    // Bounded blocking queue of pids, capacity 1000.
    static BlockingQueue<String> queue=new ArrayBlockingQueue<String>(1000);

    /**
     * Pids that were enqueued and are not yet finished (stored or given up).
     * Fully qualified because the file imports only java.util.concurrent.*.
     */
    private static final java.util.concurrent.atomic.AtomicInteger pending =
            new java.util.concurrent.atomic.AtomicInteger();

    /**
     * Starts the monitor and the workers, walks the list pages, waits for the
     * queue to drain and then stops the pool.
     *
     * @param args unused
     * @throws IOException kept for signature compatibility; page failures are logged and skipped
     * @throws InterruptedException if the main thread is interrupted while enqueueing or waiting
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        // Thread that reports the queue size.
        threadPool.execute(new Runnable() {
            @Override
            public void run() {
                reportQueueSize();
            }
        });
        // Workers that resolve the pids collected from the list pages.
        for (int i = 1; i <= WORKER_COUNT; i++) {
            threadPool.execute(new Runnable() {
                @Override
                public void run() {
                    consume();
                }
            });
        }
        try {
            for (int i = 1; i <= PAGE_COUNT; i++) {
                // JD numbers its result pages 1, 3, 5, 7 ... for page one, two, three ...
                String url = SEARCH_URL + (2 * i - 1);
                try {
                    parseIndex(HttpClientUtils.doGet(url));
                } catch (IOException e) {
                    // One unreachable list page must not abort the remaining pages.
                    System.err.println("list page " + i + " failed: " + e);
                }
            }
            // Wait until every enqueued pid has been stored or given up.
            while (pending.get() > 0) {
                Thread.sleep(500);
            }
        } finally {
            // Interrupts the monitor and the idle workers so the JVM can exit.
            threadPool.shutdownNow();
        }
    }

    /** Prints the queue size once a second until interrupted. */
    private static void reportQueueSize() {
        while (true) {
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return;
            }
            System.out.println("当前队列中有"+queue.size()+"个pid");
        }
    }

    /** Worker loop: takes pids from the queue and processes them until interrupted. */
    private static void consume() {
        while (true) {
            String pid;
            try {
                pid = queue.take();
            } catch (InterruptedException e) {
                // No pid was taken, so there is nothing to give back; just stop.
                Thread.currentThread().interrupt();
                return;
            }
            try {
                store(pid);
            } finally {
                pending.decrementAndGet();
            }
        }
    }

    /** Fetches and stores one pid, retrying with a short back-off on failure. */
    private static void store(String pid) {
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            try {
                productDao.addProduct(parsePid(pid));
                return;
            } catch (Exception e) {
                System.err.println("pid " + pid + " failed (attempt " + attempt + "/" + MAX_ATTEMPTS + "): " + e);
            }
            if (attempt < MAX_ATTEMPTS) {
                try {
                    Thread.sleep(1000L * attempt);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    return;
                }
            }
        }
        System.err.println("giving up on pid " + pid);
    }

    /**
     * Parses one list page and enqueues the pid of every listed product.
     *
     * @param html the list page markup
     * @throws IOException declared for signature compatibility
     * @throws InterruptedException if interrupted while waiting for queue space
     */
    private static void parseIndex(String html) throws IOException, InterruptedException {
        Document document = Jsoup.parse(html);
        // One <li> per listed product.
        Elements elements = document.select("#J_goodsList>ul>li");
        if (elements == null || elements.isEmpty()) {
            return;
        }
        for (Element element : elements) {
            String pid = element.attr("data-pid");
            if (pid.isEmpty()) {
                // attr() yields "" when the attribute is absent; such an item has no detail page.
                continue;
            }
            pending.incrementAndGet();
            try {
                queue.put(pid);
            } catch (InterruptedException e) {
                pending.decrementAndGet();
                throw e;
            }
        }
    }

    /**
     * Fetches the detail page and the price of one product.
     *
     * @param pid the product id
     * @return the populated product
     * @throws IOException if a request fails or no price entry is returned
     */
    private static Product parsePid(String pid) throws IOException {
        // Detail page of this product.
        String productHtml = HttpClientUtils.doGet(ITEM_URL_PREFIX + pid + ".html");
        Document document = Jsoup.parse(productHtml);
        Product product = new Product();
        product.setPid(pid);
        // Title.
        Elements titleElements = document.select("div.sku-name");
        if (!titleElements.isEmpty()) {
            product.setTitle(titleElements.get(0).text());
        }
        // Brand.
        product.setBrand(document.select("#parameter-brand li").attr("title"));
        // Product name.
        product.setPname(document.select("[class=parameter2 p-parameter-list] li:first-child").attr("title"));
        // The price is loaded asynchronously and is not part of the detail
        // page markup, so it is requested from the price endpoint, which
        // answers with JSON. A random pduid is sent because JD limits requests
        // per IP; per the original author this yields more results but still
        // only part of the data.
        String priceUrl = PRICE_URL_PREFIX + Math.random() + "&skuIds=J_" + pid;
        String priceJson = HttpClientUtils.doGet(priceUrl);
        System.out.println(priceJson);
        product.setPrice(extractPrice(priceJson, pid));
        return product;
    }

    /** Reads field "p" of the first entry of the price response. */
    private static String extractPrice(String priceJson, String pid) throws IOException {
        @SuppressWarnings("unchecked") // Gson decodes a JSON array of objects into a List of Maps.
        List<Map<String, Object>> entries = GSON.fromJson(priceJson, List.class);
        if (entries == null || entries.isEmpty() || entries.get(0) == null) {
            throw new IOException("no price entry returned for pid " + pid);
        }
        Object price = entries.get(0).get("p");
        return price == null ? null : String.valueOf(price);
    }
}
出现的问题:
- SocketTimeoutException 超时异常: 因为jd对IP进行了限制,请求次数太多时会被限制。以后的文章会解决这个问题…
---------------------------------------------------------------------更新…-------------------------------------------------------------
爬笔记本数据的代码(只是更换了URL)
package com.hrh.test;
import com.google.gson.Gson;
import com.hrh.dao.ProductDao;
import com.hrh.pojo.Product;
import com.hrh.utils.HttpClientUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
/**
 * Crawls JD laptop listings.
 *
 * <p>The main thread walks the search result pages and pushes every sku id it
 * finds into a bounded blocking queue. Worker threads take ids from the queue,
 * fetch the detail page and the price, and store the result through
 * {@link ProductDao}. A further thread prints the queue size once a second.
 *
 * <p>A failing id is retried by the worker that holds it, a bounded number of
 * times, instead of being put back on the queue: re-queueing with a blocking
 * put can deadlock against the producer when the queue is full, and an id
 * that always fails would circulate forever. When every queued id has been
 * handled the pool is shut down so the JVM can exit.
 *
 * <p>NOTE(review): the host names in the original literals read
 * {@code search.jd}, {@code item.jd} and {@code p.3}, which are not resolvable
 * hosts; they are restored here to the public JD hosts - confirm.
 *
 * @author QuietHR
 * @create 2018/9/22
 **/
public class JDPC {
    /** Number of search result pages to walk. */
    private static final int PAGE_COUNT = 100;
    /** Number of worker threads that consume ids. */
    private static final int WORKER_COUNT = 30;
    /** How many times one id is tried before it is given up. */
    private static final int MAX_ATTEMPTS = 3;

    /** Search page URL without the trailing page number. */
    private static final String SEARCH_URL =
            "https://search.jd.com/Search?keyword=%E7%AC%94%E8%AE%B0%E6%9C%AC&enc=utf-8&page=";
    /** Detail page URL prefix; the id and ".html" are appended. */
    private static final String ITEM_URL_PREFIX = "https://item.jd.com/";
    /** Price endpoint prefix; a random pduid and the sku id are appended. */
    private static final String PRICE_URL_PREFIX = "https://p.3.cn/prices/mgets?pduid=";

    /** Shared JSON decoder. */
    private static final Gson GSON = new Gson();

    private static BlockingQueue<String> queue=new ArrayBlockingQueue<String>(1000);
    private static ExecutorService executorService = Executors.newFixedThreadPool(50);
    private static ProductDao productDao=new ProductDao();

    /**
     * Ids that were enqueued and are not yet finished (stored or given up).
     * Fully qualified because this file does not import the atomic package.
     */
    private static final java.util.concurrent.atomic.AtomicInteger pending =
            new java.util.concurrent.atomic.AtomicInteger();

    /**
     * Starts the monitor and the workers, walks the list pages, waits for the
     * queue to drain and then stops the pool.
     *
     * @param args unused
     * @throws Exception if the main thread is interrupted while enqueueing or waiting
     */
    public static void main(String[] args) throws Exception {
        executorService.execute(new Runnable() {
            @Override
            public void run() {
                reportQueueSize();
            }
        });
        for (int i = 0; i < WORKER_COUNT; i++) {
            executorService.execute(new Runnable() {
                @Override
                public void run() {
                    consume();
                }
            });
        }
        try {
            page();
            // Wait until every enqueued id has been stored or given up.
            while (pending.get() > 0) {
                Thread.sleep(500);
            }
        } finally {
            // Interrupts the monitor and the idle workers so the JVM can exit.
            executorService.shutdownNow();
        }
    }

    /** Prints the queue size once a second until interrupted. */
    private static void reportQueueSize() {
        while (true) {
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return;
            }
            System.out.println("当前队列中有"+queue.size()+"个pid");
        }
    }

    /** Worker loop: takes ids from the queue and processes them until interrupted. */
    private static void consume() {
        while (true) {
            String pid;
            try {
                pid = queue.take();
            } catch (InterruptedException e) {
                // No id was taken, so there is nothing to give back; just stop.
                Thread.currentThread().interrupt();
                return;
            }
            try {
                store(pid);
            } finally {
                pending.decrementAndGet();
            }
        }
    }

    /** Fetches and stores one id, retrying with a short back-off on failure. */
    private static void store(String pid) {
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            try {
                productDao.addProduct(parsePid(pid));
                return;
            } catch (Exception e) {
                System.err.println("pid " + pid + " failed (attempt " + attempt + "/" + MAX_ATTEMPTS + "): " + e);
            }
            if (attempt < MAX_ATTEMPTS) {
                try {
                    Thread.sleep(1000L * attempt);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    return;
                }
            }
        }
        System.err.println("giving up on pid " + pid);
    }

    /**
     * Walks the search result pages and enqueues the ids found on each.
     *
     * @throws Exception if interrupted while waiting for queue space
     */
    private static void page() throws Exception {
        for (int i = 1; i <= PAGE_COUNT; i++) {
            // JD numbers its result pages 1, 3, 5, 7 ... for page one, two, three ...
            String url = SEARCH_URL + (2 * i - 1);
            String html;
            try {
                html = HttpClientUtils.doGet(url);
            } catch (java.io.IOException e) {
                // One unreachable list page must not abort the remaining pages.
                System.err.println("list page " + i + " failed: " + e);
                continue;
            }
            parseIndex(html);
        }
    }

    /**
     * Parses one list page and enqueues the sku id of every listed product.
     *
     * @param html the list page markup
     * @throws InterruptedException if interrupted while waiting for queue space
     */
    private static void parseIndex(String html) throws InterruptedException {
        Document document = Jsoup.parse(html);
        Elements liEl = document.select("[class=gl-warp clearfix]>li");
        for (Element li : liEl) {
            String sku = li.attr("data-sku");
            if (sku.isEmpty()) {
                // attr() yields "" when the attribute is absent; such an item has no detail page.
                continue;
            }
            pending.incrementAndGet();
            try {
                queue.put(sku);
            } catch (InterruptedException e) {
                pending.decrementAndGet();
                throw e;
            }
        }
    }

    /**
     * Fetches the detail page and the price of one product.
     *
     * @param pid the product (sku) id
     * @return the populated product
     * @throws Exception if a request fails or no price entry is returned
     */
    private static Product parsePid(String pid) throws Exception {
        String html = HttpClientUtils.doGet(ITEM_URL_PREFIX + pid + ".html");
        Document document = Jsoup.parse(html);
        Product product = new Product();
        product.setPid(pid);
        product.setTitle(document.select("[class=sku-name]").text());
        product.setBrand(document.select("#parameter-brand>li").attr("title"));
        product.setPname(document.select("[class=parameter2 p-parameter-list]>li:first-child").attr("title"));
        // The price is not read from the detail page markup; it is requested
        // from the price endpoint, whose JSON response is decoded below.
        String priceUrl = PRICE_URL_PREFIX + Math.random() + "&skuIds=J_" + pid;
        String json = HttpClientUtils.doGet(priceUrl);
        @SuppressWarnings("unchecked") // Gson decodes a JSON array of objects into a List of Maps.
        List<Map<String, Object>> entries = GSON.fromJson(json, List.class);
        if (entries == null || entries.isEmpty() || entries.get(0) == null) {
            throw new IllegalStateException("no price entry returned for pid " + pid);
        }
        Object price = entries.get(0).get("p");
        product.setPrice(price == null ? null : String.valueOf(price));
        return product;
    }
}
更多推荐
Java爬取京东商品数据
发布评论