import os
import random
import re
import time
from time import sleep

import requests
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.service import Service

from get_user_agent import get_user_agent_of_pc
def roll_down(chrome, max_pause=1.0):
    """Scroll the page to 10%, 40%, 70% and 100% of its height.

    Before each scroll step the function sleeps for a random duration in
    ``[0, max_pause)`` seconds.

    Args:
        chrome: Driver-like object exposing ``execute_script(js)``.
        max_pause: Upper bound, in seconds, of the random pause taken before
            each step. The default keeps the original ``random.random()``
            pause.
    """
    for tenths in range(1, 11, 3):
        time.sleep(random.random() * max_pause)
        fraction = tenths / 10
        # `scrollTop` is the writable scroll offset. Assigning to `scroll`
        # (a method) only overwrote that property and never moved the page.
        js = (
            "document.documentElement.scrollTop="
            "document.documentElement.scrollHeight*%s" % fraction
        )
        chrome.execute_script(js)
def _first(nodes, default=''):
    """Return the first XPath result, or *default* when nothing matched."""
    return nodes[0] if nodes else default


def _absolute_url(url):
    """Prefix a protocol-relative URL (``//host/...``) with ``https:``."""
    return 'https:' + url if url.startswith('//') else url


def _safe_filename(name):
    """Turn a product name into a string usable as a file name."""
    name = re.sub(r'/+', '-', name)
    name = re.sub(r'\|+', '_', name)
    # Drop the remaining characters that Windows forbids in file names.
    return re.sub(r'[\\?*:"<>]+', '', name)


def parse_html(html, headers=None):
    """Extract products from a search-result page, log them, save their images.

    One tab-separated line per product is appended to ``新.txt`` and each
    product image is written to ``image/<sanitised product name>.jpg``.
    Products are read one ``<li>`` at a time so that a product missing one
    field cannot shift the fields of the products after it.

    Args:
        html: Page source of a search-result page.
        headers: Optional HTTP headers for the image requests. When omitted,
            the module-level ``headers`` (set by the ``__main__`` block) is
            used if it exists; otherwise a User-Agent header is built.
    """
    if headers is None:
        # The module-level `headers` only exists when the file runs as a
        # script, so look it up defensively instead of assuming it is there.
        headers = globals().get('headers') or {
            "User-Agent": get_user_agent_of_pc()
        }

    tree = etree.HTML(html)
    if tree is None:
        # Nothing parseable in the document.
        return

    os.makedirs('image', exist_ok=True)
    items = tree.xpath('//div[@id="J_goodsList"]/ul/li')

    with open('新.txt', "a+", encoding='utf-8') as log:
        for item in items:
            name_nodes = item.xpath('./div/div[3]/a/em')
            if not name_nodes:
                # Entry without a product name: nothing useful to record.
                continue
            name = re.sub(r'\s+', '', name_nodes[0].xpath('string(.)'))
            price = _first(item.xpath('./div/div[2]/strong/i/text()'))
            shop = _first(item.xpath('./div/div[5]/span/a/text()'))
            href = _first(item.xpath('./div/div[1]/a/@href'))
            comment = _first(item.xpath('./div/div[4]/strong/a/text()'))
            # NOTE(review): the site appears to keep not-yet-loaded image
            # URLs in `data-lazy-img`; confirm against the live markup.
            img_url = _first(
                item.xpath('./div/div[1]/a/img/@src')
                or item.xpath('./div/div[1]/a/img/@data-lazy-img')
            )
            href = _absolute_url(href) if href else ''
            img_url = _absolute_url(img_url) if img_url else ''

            log.write("商品名:" + name + '\t' +
                      '价格:' + price + '\t' +
                      '商品链接:' + href + '\t' +
                      "卖家:" + shop + '\t' +
                      "图片地址" + img_url + '\t' +
                      "评论数" + comment + '\n')

            if not img_url:
                continue
            # Download the image; one failed download must not abort the page.
            try:
                response = requests.get(img_url, headers=headers, timeout=10)
                response.raise_for_status()
            except requests.RequestException as exc:
                print('图片下载失败: {} ({})'.format(img_url, exc))
                continue
            image_path = os.path.join('image', _safe_filename(name) + '.jpg')
            with open(image_path, 'wb') as image_file:
                image_file.write(response.content)
def JD_Spider(url):
    """Crawl a search-result listing page by page, starting at *url*.

    Opens *url* in headless Chrome, scrolls the page, hands the page source
    to ``parse_html``, then keeps clicking the "next page" link until the
    link is reported as disabled or clicking it fails.

    Args:
        url: Address of the first search-result page.
    """
    # Local imports keep the new names inside this function.
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.chrome.service import Service as ChromeService

    # Raw string: the path contains backslashes.
    chrome_driver = r'F:\daolun\chromedriver.exe'
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('user-agent=' + get_user_agent_of_pc())
    options.add_argument('disable-infobars')
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Selenium 4 takes the driver path through a Service object.
    chrome = webdriver.Chrome(
        service=ChromeService(executable_path=chrome_driver),
        options=options,
    )

    # Evaluates to 0 while "next page" is enabled, 1 once it is disabled.
    next_disabled_js = (
        'return document.getElementsByClassName("pn-next disabled").length'
    )
    try:
        chrome.get(url)
        roll_down(chrome)
        print('正在爬取{}页...'.format(1))
        parse_html(chrome.page_source)

        num = 1
        while chrome.execute_script(next_disabled_js) == 0:
            try:
                # `By.XPATH` is the locator constant; `By.xpath` does not
                # exist and used to end the crawl after the first page.
                next_page_button = chrome.find_element(
                    by=By.XPATH, value='//a[@class="pn-next"]'
                )
                next_page_button.click()
            except WebDriverException as exc:
                print('翻页失败,停止爬取: {}'.format(exc))
                break
            num += 1
            print('正在爬取第{}页...'.format(num))
            roll_down(chrome)
            sleep(3 + random.random())
            parse_html(chrome.page_source)
    finally:
        # Always shut the browser and driver process down.
        chrome.quit()
if __name__ == '__main__':
    # Kept at module level on purpose: parse_html looks up `headers` as a
    # global when it downloads product images.
    headers = {
        "User-Agent": get_user_agent_of_pc()
    }
    # The host must be `search.jd.com`; `search.jd` is not a resolvable host.
    first_page = 'https://search.jd.com/Search?keyword=macbook&enc=utf-8&wq=macbook&pvid=0798b177abbc445e9b25431224c3c63b'
    JD_Spider(first_page)
更多推荐
这个爬虫为什么只能爬一页?而且为什么提示 image 没有被定义?
发布评论