Web Scraping: The Templates You Need to Know (with Source Code)

As a Ctrl+C/Ctrl+V programmer, the dream is building airplanes; the reality is tightening screws.

An ordinary scraper is really just a fixed template. The usual variants are listed below, and a minimal skeleton of the shared flow follows the list.

  1. Locating elements with XPath or BeautifulSoup
  2. An AJAX/axios-style endpoint that returns data, parsed as JSON
  3. Selenium browser automation
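
All of the templates in this post share the same request, parse, save loop. Here is a minimal sketch of that skeleton; the URL, the id/text field names, and the result.csv output file are placeholders for illustration, not any real site:

import csv
import time
import requests

headers = {'user-agent': 'Mozilla/5.0'}  # pretend to be a normal browser

def crawl():
    for page in range(1, 4):                      # 1. paginate by changing a query parameter
        res = requests.get('https://example.com/api?page=' + str(page), headers=headers)
        for item in res.json()['data']:           # 2. parse (JSON here; XPath/bs4 for HTML)
            yield {'id': item['id'], 'text': item['text']}
        time.sleep(1)                             # be polite between pages

if __name__ == '__main__':
    with open('result.csv', 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, ['id', 'text'])  # 3. save rows as they arrive
        writer.writeheader()
        for row in crawl():
            writer.writerow(row)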

JSON parsing version

import requests
import codecs,csv
import time
import random

headers = {
    # Anti-hotlink referer header
    'referer': '=7200628769755876648',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.63',
    # Some sites only return data when a cookie is sent
    # 'cookie': ''
}

# Proxy pool (note: requests expects lowercase scheme keys such as 'http')
ip_list = [
    {'http': '116.9.163.205:58080'},
    {'http': '61.216.185.88:60808'},
    {'http': '182.34.102.50:9999'},
    {'http': '183.236.232.160:8080'},
    {'http': '117.94.124.21:9000'},
    {'http': '210.5.10.87:53281'},
    {'http': '121.13.252.58:41564'},
    {'http': '121.13.252.60:41564'},
    {'http': '117.114.149.66:55443'},
    {'http': '112.14.47.6:52024'}
]

# Pick a random proxy
# ip = ip_list.pop(random.randint(0, len(ip_list) - 1))
ip = random.choice(ip_list)


def getPlayUrl():
    for i in range(0, 1000):
        try:
            print('Scraping page ' + str(i))
            # Build the request URL; usually changing one query parameter is enough to page through
            url = ''
            res = requests.get(url, headers=headers, proxies=ip)
            # print(res.text)
            print(res)
            # The response is JSON, so the fields can be extracted directly
            data = res.json()['comments']
            # print(data)
            for item in data:
                comments = {}
                comments['cid'] = item['cid']
                create_time = item['create_time']
                comments['time'] = time.strftime("%Y-%m-%d %H:%M", time.localtime(create_time))
                # comments['user'] = item['user']['nickname']
                comments['comment'] = item['text']
                yield comments
            time.sleep(1)
        except Exception as e:
            print(e)
            break


if __name__ == '__main__':
    # Save the data
    f = codecs.open('抖音评论_1.csv', 'a+', encoding='utf-8-sig')
    filename = ['cid', 'time', 'comment']
    writer = csv.DictWriter(f, filename)
    writer.writeheader()
    for i in getPlayUrl():
        print(i)
        writer.writerow(i)
    # getPlayUrl()
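
The template above draws one proxy at startup and reuses it for every page, so the whole run stops at the first exception if that proxy dies. A small variant, sketched here with a hypothetical get_with_retry helper that is not part of the original code, is to draw a fresh proxy per request and retry a few times:

import random
import requests

def get_with_retry(url, headers, ip_list, retries=3):
    """Try the request with a freshly drawn proxy, retrying on failure."""
    for _ in range(retries):
        proxy = random.choice(ip_list)  # new proxy for each attempt
        try:
            return requests.get(url, headers=headers, proxies=proxy, timeout=10)
        except requests.RequestException as e:
            print('proxy failed, retrying:', e)
    # last resort: try once without any proxy
    return requests.get(url, headers=headers, timeout=10)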

Element locating

BeautifulSoup

from bs4 import BeautifulSoup
import requests
import time
import codecs, csv

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}

f = codecs.open('歌单_1.csv', 'w+', encoding='utf-8-sig')
filename = ['歌单详情页地址', '歌单标题', '歌单播放量', '歌单贡献者名字']
writer = csv.DictWriter(f, filename)
writer.writeheader()

page = 0
for i in range(0, 656, 35):
    time.sleep(2)
    print('Scraping page ' + str(page))
    url = '/?&order=hot&limit=35&offset=' + str(i)
    response = requests.get(url=url, headers=headers)
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')
    # Tags containing the playlist detail-page links
    ids = soup.select('.dec a')
    # Tags containing the playlist entries on the index page
    lis = soup.select('#m-pl-container li')
    print(len(lis))
    for j in range(len(lis)):
        # Playlist detail-page URL
        url = '' + ids[j]['href']
        # Playlist title
        title = ids[j]['title']
        # Playlist play count
        play = lis[j].select('.nb')[0].get_text()
        # Playlist creator name
        user = lis[j].select('p')[1].select('a')[0].get_text()
        # Assemble one row of index-page info
        obj = {'歌单详情页地址': url, '歌单标题': title, '歌单播放量': play, '歌单贡献者名字': user}
        print(obj)
        # Write the row to the CSV file
        writer.writerow(obj)
    page += 1
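
In the inner loop above, ids and lis are two parallel lists walked by index. A slightly tidier pattern, sketched here using the same soup and writer objects as the template, is to zip them so each detail link is paired with its list item directly:

for link, li in zip(soup.select('.dec a'), soup.select('#m-pl-container li')):
    obj = {
        '歌单详情页地址': link['href'],
        '歌单标题': link['title'],
        '歌单播放量': li.select('.nb')[0].get_text(),
        '歌单贡献者名字': li.select('p')[1].select('a')[0].get_text(),
    }
    writer.writerow(obj)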

XPath

import requests
from lxml import etree
import json
import re
import pprint
import codecs,csv
import time
import random

headers = {
    'referer': '.html',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.44',
    'cookie': ''
}


def getSpot():
    for i in range(0, 16):
        try:
            print('Scraping page ' + str(i) + '...')
            url = ''
            data = {'mddid': '10183', 'page': i}
            res = requests.post(url=url, headers=headers, data=data, proxies=ip)
            # print(res)
            # print(res.text)
            data = json.loads(res.text)
            # The JSON contains an HTML fragment, so parse it with lxml and query with XPath
            text = data['list']
            html = etree.HTML(text)
            total = html.xpath("//li[@class='item ']")
            for item in total:
                info = {}
                info['地址'] = item.xpath(".//div[@class='title']/text()")[0]
                info['累计游玩人数'] = item.xpath(".//div[@class='nums']/b/text()")[0]
                info['url'] = '/' + item.xpath('.//a/@href')[0]
                yield info
            time.sleep(2)
        except Exception as e:
            print(e)
            break


if __name__ == "__main__":
    # Proxy pool (note: requests expects lowercase scheme keys such as 'http')
    ip_list = [
        {'http': '61.164.39.68:53281'},
        {'http': '27.42.168.46:55481'},
        {'http': '116.9.163.205:58080'},
        {'http': '182.34.102.50:9999'},
        {'http': '183.236.232.160:8080'},
        {'http': '113.124.86.24:9999'},
        {'http': '210.5.10.87:53281'}
    ]
    # Pick a random proxy
    ip = ip_list.pop(random.randint(0, len(ip_list) - 1))
    f = open('日本旅游目的地.csv', 'a+', newline='', encoding='utf-8-sig')
    filename = ['地址', '累计游玩人数', 'url']
    writer = csv.DictWriter(f, filename)
    # writer.writeheader()
    for comment in getSpot():
        print(comment)
        writer.writerow(comment)

Personally I find XPath simpler and easier to get comfortable with; a quick side-by-side with BeautifulSoup is sketched below.
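
To illustrate the difference, here is a tiny self-contained comparison (the HTML snippet is made up for demonstration): both libraries pull out the same title text, but the XPath query packs the whole path into a single expression.

from bs4 import BeautifulSoup
from lxml import etree

html = '<ul id="m-pl-container"><li><div class="title">Tokyo</div></li></ul>'

# BeautifulSoup: CSS selector, then read the text from the tag object
soup = BeautifulSoup(html, 'html.parser')
print(soup.select('#m-pl-container li .title')[0].get_text())   # Tokyo

# lxml + XPath: one expression returns the text nodes directly
tree = etree.HTML(html)
print(tree.xpath('//ul[@id="m-pl-container"]//div[@class="title"]/text()')[0])  # Tokyo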

Selenium version

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import pandas as pd
import time

data = pd.read_csv('zhiwu.csv')
names = data['name']
urls = data['link']


def add_options():
    print("—————————— options ——————————")
    # Create the Chrome driver options object
    chrome_options = webdriver.ChromeOptions()
    # Do not load images
    # prefs = {"profile.managed_default_content_settings.images": 2}
    # chrome_options.add_experimental_option("prefs", prefs)
    # Headless mode (no browser window)
    chrome_options.add_argument('--headless')
    # Incognito (private browsing) mode
    chrome_options.add_argument('--incognito')
    # Disable GPU acceleration
    chrome_options.add_argument('--disable-gpu')
    return chrome_options


# Configure the Selenium ChromeDriver
# service = Service('path/to/chromedriver')
# driver = webdriver.Chrome()
driver = webdriver.Chrome(options=add_options())
# Set the wait timeout
wait = WebDriverWait(driver, 10)

# Scrape each page in turn
for name, url in zip(names, urls):
    # Open the link and extract the data
    driver.get(url)
    # Extract evaluation info, morphological features, geographic distribution, and so on
    try:
        eval_info = driver.find_element(By.XPATH, '//*[@id="swx"]').text
    except:
        eval_info = ''
    try:
        morpho_feature = driver.find_element(By.XPATH, '//*[@id="tezheng"]').text
        # print(morpho_feature)
    except:
        morpho_feature = ''
    try:
        geo_distribution = driver.find_element(By.XPATH, '//*[@id="chandi"]').text
        # print(geo_distribution)
    except:
        geo_distribution = ''
    try:
        func_application = driver.find_element(By.XPATH, '//*[@id="gongneng"]').text
        # print(func_application)
    except:
        func_application = ''
    try:
        protection_value = driver.find_element(By.XPATH, '//*[@id="protvalue"]').text
        # print(protection_value)
    except:
        protection_value = ''
    try:
        protection_measure = driver.find_element(By.XPATH, '//*[@id="protway"]').text
        # print(protection_measure)
    except:
        protection_measure = ''
    try:
        cultivation_points = driver.find_element(By.XPATH, '//*[@id="growway"]').text
        # print(cultivation_points)
    except:
        cultivation_points = ''
    try:
        # The image link is inside an iframe, so switch into it before locating the element
        iframe = driver.find_element(By.XPATH, '//*[@id="Label1"]/iframe')
        driver.switch_to.frame(iframe)
        img_url = driver.find_element(By.XPATH, '//*[@id="pinfo"]/a').get_attribute('href')
        print(img_url)
    except:
        img_url = ''
    # Save each record to its own CSV file named after the plant's Chinese name
    with open(f'{name}.csv', 'w', newline='', encoding='utf-8-sig') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['评估信息', '形态特征', '地理分布', '功能用途', '保护价值', '保护措施', '栽培要点', '图片链接'])
        writer.writerow([eval_info, morpho_feature, geo_distribution, func_application, protection_value,
                         protection_measure, cultivation_points, img_url])
    time.sleep(1)

# Close the browser
driver.quit()
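
WebDriverWait and expected_conditions are imported and a wait object is created above, but the template never actually uses them: each find_element fails immediately if the page has not finished rendering. A sketch of how that wait object could be put to work, reusing the same id-based locator the template already uses:

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# Block for up to 10 seconds until the element exists, instead of failing right away
try:
    elem = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="tezheng"]')))
    morpho_feature = elem.text
except Exception:
    morpho_feature = ''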

News version + detail pages

import requests
import codecs,csv
import time
import json
from lxml import etree
from pprint import pprint

headers = {
    # Anti-hotlink referer header
    'referer': '/',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.63',
}


def getInfo():
    for i in range(1, 31):
        try:
            print('Scraping page ' + str(i))
            # Build the request URL; paging only requires changing the page parameter
            url = '=121&lid=1356&num=20&versionNumber=1.2.4&page=' + str(i) + '&encode=utf-8&callback=feedCardJsonpCallback'
            res = requests.get(url, headers=headers)
            # print(res.text)
            print(res)
            # The response is JSONP: strip the callback wrapper, then parse the JSON payload
            text = res.text.split('try{feedCardJsonpCallback(')[1].split(');}catch(e){};')[0]
            # print(text)
            data = json.loads(text)
            # pprint(data)
            for item in data['result']['data']:
                info = {}
                info['标题'] = item['title']
                info['简介'] = item['intro']
                create_time = int(item['ctime'])
                info['发布时间'] = time.strftime("%Y-%m-%d %H:%M", time.localtime(create_time))
                info['主题'] = item['keywords']
                try:
                    info['评论数'] = item['comment_total']
                except:
                    info['评论数'] = ''
                # Fetch the detail page and extract the article body with XPath
                d_url = item['url']
                res_1 = requests.get(d_url, headers=headers)
                res_1.encoding = 'utf-8'
                html = etree.HTML(res_1.text)
                info['详情'] = html.xpath('//*[@id="article"]')[0].xpath('string(.)').replace('\n', '').replace('\t', '')
                yield info
            time.sleep(2)
        except Exception as e:
            print(e)
            break


if __name__ == '__main__':
    # Save the data
    f = codecs.open('新浪新闻.csv', 'w+', encoding='utf-8-sig')
    filename = ['标题', '简介', '发布时间', '主题', '评论数', '详情']
    writer = csv.DictWriter(f, filename)
    writer.writeheader()
    for info in getInfo():
        print(info)
        writer.writerow(info)
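
The split-based JSONP unwrapping above breaks as soon as the wrapper text changes even slightly. A more tolerant variant, sketched here as a hypothetical strip_jsonp helper that keys only on the callback name this feed uses, is to pull out whatever sits between the callback's parentheses with a regular expression:

import json
import re

def strip_jsonp(body, callback='feedCardJsonpCallback'):
    """Return the JSON payload wrapped in a JSONP callback, or None if absent."""
    match = re.search(re.escape(callback) + r'\((.*)\);', body, re.S)
    if not match:
        return None
    return json.loads(match.group(1))

Inside getInfo() this would replace the two split calls with data = strip_jsonp(res.text).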
