Java实现爬取网页数据:PhantomJS+Webdriver

编程入门 行业动态 更新时间:2024-10-06 15:19:55

Java实现爬取<a href="https://www.elefans.com/category/jswz/34/1771338.html">网页数据</a>:PhantomJS+Webdriver

Java实现爬取网页数据:PhantomJS+Webdriver

   本文根据工作中爬取数据需要所做工作整理而来。最初我使用了HttpClient+Jsoup,然而这种最简单的方式只能采集普通的静态页面数据,以及暴露在浏览器F12调试窗口中的可见URL所返回的数据;对于需要模仿浏览器行为(比如点击事件),或者页面采用了JS框架进行重新布局的情况,就无能为力了。因此,对于此类情况,经过摸索,最后得到了这个比较好一点的实践方式。下面废话不多说,来一个具体实践:抓取sosobtc网页(原文此处的链接已丢失)上的数据。


第一步:创建Maven工程:mycrawler

第二步:导入Maven依赖:

<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.szzc.crawler</groupId>
    <artifactId>mycrawler</artifactId>
    <version>0.0.1-SNAPSHOT</version>

    <dependencies>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.3.2</version>
        </dependency>
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.8.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.3.5</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.2</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-java</artifactId>
            <version>2.53.0</version>
        </dependency>
        <!-- Version and exclusions come from the dependencyManagement section below. -->
        <dependency>
            <groupId>com.opera</groupId>
            <artifactId>operadriver</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-exec</artifactId>
            <version>1.3</version>
        </dependency>
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.17</version>
        </dependency>
        <dependency>
            <groupId>com.github.detro</groupId>
            <artifactId>phantomjsdriver</artifactId>
            <version>1.2.0</version>
        </dependency>
    </dependencies>

    <dependencyManagement>
        <dependencies>
            <dependency>
                <groupId>com.opera</groupId>
                <artifactId>operadriver</artifactId>
                <version>0.16</version>
                <exclusions>
                    <exclusion>
                        <groupId>org.seleniumhq.selenium</groupId>
                        <artifactId>selenium-remote-driver</artifactId>
                    </exclusion>
                </exclusions>
            </dependency>
        </dependencies>
    </dependencyManagement>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.3</version>
                <!-- The crawler classes use the diamond operator, which needs at least Java 7;
                     without an explicit level this plugin version compiles at an older default. -->
                <configuration>
                    <source>1.7</source>
                    <target>1.7</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>

第三步:封装的实体类CoinData:

package com.szzc;public class CoinData {private Integer rowId;private String marketName;//交易市场private String CurrentPrice;//最新价格private String platformPrice;//平台价格private String highestPrice;//最高价private String lowestPrice;//最低价private String upsAndDowns;//涨跌private String increment;//涨幅private String trading;//成交量public Integer getRowId() {return rowId;}public void setRowId(Integer rowId) {this.rowId = rowId;}public String getMarketName() {return marketName;}public void setMarketName(String marketName) {this.marketName = marketName;}public String getCurrentPrice() {return CurrentPrice;}public void setCurrentPrice(String currentPrice) {CurrentPrice = currentPrice;}public String getPlatformPrice() {return platformPrice;}public void setPlatformPrice(String platformPrice) {this.platformPrice = platformPrice;}public String getHighestPrice() {return highestPrice;}public void setHighestPrice(String highestPrice) {this.highestPrice = highestPrice;}public String getLowestPrice() {return lowestPrice;}public void setLowestPrice(String lowestPrice) {this.lowestPrice = lowestPrice;}public String getUpsAndDowns() {return upsAndDowns;}public void setUpsAndDowns(String upsAndDowns) {this.upsAndDowns = upsAndDowns;}public String getIncrement() {return increment;}public void setIncrement(String increment) {this.increment = increment;}public String getTrading() {return trading;}public void setTrading(String trading) {this.trading = trading;}@Overridepublic String toString() {return "CoinData [rowId=" + rowId + ", marketName=" + marketName + ", CurrentPrice=" + CurrentPrice+ ", platformPrice=" + platformPrice + ", highestPrice=" + highestPrice + ", lowestPrice=" + lowestPrice+ ", upsAndDowns=" + upsAndDowns + ", increment=" + increment + ", trading=" + trading + "]";}}
第四步:抓取数据的Main方法所在的类:

package com.szzc;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;import org.openqa.selenium.By;   
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;   
public class FirstTest {public static final String TR = "tr";public static final String TD = "td";public static Integer ROWID = 1;private static String[] tableDiv = null;private static String[] liIds = null;static {tableDiv = new String[4];tableDiv[0] = "default_market_tabs-pane-btc";tableDiv[1] = "default_market_tabs-pane-ltc";tableDiv[2] = "default_market_tabs-pane-eth";tableDiv[3] = "default_market_tabs-pane-etc";liIds = new String[4];liIds[0] = "default_market_tabs-tab-btc";liIds[1] = "default_market_tabs-tab-ltc";liIds[2] = "default_market_tabs-tab-eth";liIds[3] = "default_market_tabs-tab-etc";}public static void main(String[] args) throws Exception {    //加载Chrome的驱动并打开浏览器   System.setProperty("webdriver.chrome.driver","D:/Google/chromedriver.exe");  ChromeOptions options = new ChromeOptions();options.addArguments("--start-maximized", "allow-running-insecure-content", "--test-type");WebDriver driver = new ChromeDriver(options);          //打开sosobtc页面    driver.get("/"); //给浏览器初始化页面响应时间Thread.sleep(5000);  //定义一个Map来存储获取到的四个币种的数据Map<String,List<CoinData>> data = new HashMap<>();String[] coinName = {"btc","ltc","eth","etc"};//依次点击页面的li标签,并获取数据for (int i = 0; i < liIds.length; i++) {List<CoinData> coidDataList = getCoidData(driver, liIds[i], tableDiv[i]);data.put(coinName[i], coidDataList);}for (String coinname : data.keySet()) {List<CoinData> list = data.get(coinname);for (CoinData coinData : list) {System.out.println(coinData);}}//关闭浏览器driver.quit();}/*** * @Description:* @param driver* @param liId 切换数据表格的li标签的id* @param id 存储数据的div的id* @throws Exception* @version 1.0* @return * @time 2017年7月9日下午9:28:20*/public static List<CoinData> getCoidData(WebDriver driver,String liId,String id) throws Exception {//点击切换li标签来显式不同币种的数据driver.findElement(By.id(liId)).click();//给数据响应的时间Thread.sleep(500L);//获取存储数据的table所在的divWebElement div = driver.findElement(By.id(id));//获得所有的行对象List<WebElement> trs = div.findElements(By.tagName(TR));//定义一个list来存储数据,每个元素代表一行List<CoinData> 
coinDataList = new ArrayList<>();for (WebElement tr : trs) {//获取一个列对象列表List<WebElement> tds = tr.findElements(By.tagName(TD));//获取的列对象集合不为空时,开始封装对象if (tds != null && tds.size() > 0) {CoinData coinData = new CoinData();coinData.setRowId(ROWID++);coinData.setMarketName(tds.get(0).getText());coinData.setCurrentPrice(tds.get(1).getText());coinData.setPlatformPrice(tds.get(2).getText());coinData.setHighestPrice(tds.get(3).getText());coinData.setLowestPrice(tds.get(4).getText());coinData.setUpsAndDowns(tds.get(5).getText());coinData.setIncrement(tds.get(6).getText());coinData.setTrading(tds.get(7).getText());coinDataList.add(coinData);}}//切换币种时,重新从第一个市场名字开始计数ROWID = 1;return coinDataList;}
}


现在可以抓取到页面的不同标签下的数据了。但是不完美的是,每次运行程序还会有一个浏览器的窗口弹出来比较讨厌。我们可以使用PhantomJS来实现无界面的浏览器效果:

实现抓取的Main函数所在的类:

package com.szzc;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;import org.openqa.selenium.By;   
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.phantomjs.PhantomJSDriver;
import org.openqa.selenium.remote.DesiredCapabilities;   
public class SecondTest {public static final String TR = "tr";public static final String TD = "td";public static Integer ROWID = 1;private static String[] tableDiv = null;private static String[] liIds = null;static {tableDiv = new String[4];tableDiv[0] = "default_market_tabs-pane-btc";tableDiv[1] = "default_market_tabs-pane-ltc";tableDiv[2] = "default_market_tabs-pane-eth";tableDiv[3] = "default_market_tabs-pane-etc";liIds = new String[4];liIds[0] = "default_market_tabs-tab-btc";liIds[1] = "default_market_tabs-tab-ltc";liIds[2] = "default_market_tabs-tab-eth";liIds[3] = "default_market_tabs-tab-etc";}public static void main(String[] args) throws Exception {    //加载Chrome的驱动并打开浏览器   //System.setProperty("webdriver.chrome.driver","D:/Google/chromedriver.exe");System.setProperty("phantomjs.binary.path", "/usr/bin/phantomjs");System.setProperty("phantomjs.binary.path", "./phantomjs/win/phantomjs.exe");DesiredCapabilities desiredCapabilities = DesiredCapabilities.phantomjs();//此处可以设置一些desiredCapabilities的属性(浏览器的头信息)WebDriver driver = new PhantomJSDriver(desiredCapabilities);//打开sosobtc页面    driver.get("/"); //给浏览器初始化页面响应时间Thread.sleep(5000);  //定义一个Map来存储获取到的四个币种的数据Map<String,List<CoinData>> data = new HashMap<>();String[] coinName = {"btc","ltc","eth","etc"};//依次点击页面的li标签,并获取数据for (int i = 0; i < liIds.length; i++) {List<CoinData> coidDataList = getCoidData(driver, liIds[i], tableDiv[i]);data.put(coinName[i], coidDataList);}for (String coinname : data.keySet()) {List<CoinData> list = data.get(coinname);for (CoinData coinData : list) {System.out.println(coinData);}}//关闭浏览器driver.quit();}/*** * @Description:* @param driver* @param liId 切换数据表格的li标签的id* @param id 存储数据的div的id* @throws Exception* @version 1.0* @return * @time 2017年7月9日下午9:28:20*/public static List<CoinData> getCoidData(WebDriver driver,String liId,String id) throws Exception {//点击切换li标签来显式不同币种的数据driver.findElement(By.id(liId)).click();//给数据响应的时间Thread.sleep(500L);//获取存储数据的table所在的divWebElement div = 
driver.findElement(By.id(id));//获得所有的行对象List<WebElement> trs = div.findElements(By.tagName(TR));//定义一个list来存储数据,每个元素代表一行List<CoinData> coinDataList = new ArrayList<>();for (WebElement tr : trs) {//获取一个列对象列表List<WebElement> tds = tr.findElements(By.tagName(TD));//获取的列对象集合不为空时,开始封装对象if (tds != null && tds.size() > 0) {CoinData coinData = new CoinData();coinData.setRowId(ROWID++);coinData.setMarketName(tds.get(0).getText());coinData.setCurrentPrice(tds.get(1).getText());coinData.setPlatformPrice(tds.get(2).getText());coinData.setHighestPrice(tds.get(3).getText());coinData.setLowestPrice(tds.get(4).getText());coinData.setUpsAndDowns(tds.get(5).getText());coinData.setIncrement(tds.get(6).getText());coinData.setTrading(tds.get(7).getText());coinDataList.add(coinData);}}//切换币种时,重新从第一个市场名字开始计数ROWID = 1;return coinDataList;}
}

至此我们已经可以完美地模仿一个浏览器的行为,来简单抓取一些网页的数据了。


更多推荐

Java实现爬取网页数据:PhantomJS+Webdriver

本文发布于:2024-02-28 10:41:19,感谢您对本站的认可!
本文链接:https://www.elefans.com/category/jswz/34/1769338.html
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。
本文标签:网页   数据   Java   PhantomJS   Webdriver

发布评论

评论列表 (有 0 条评论)
草根站长

>www.elefans.com

编程频道|电子爱好者 - 技术资讯及电子产品介绍!