Scraping Beijing Second-Hand House Prices from Ganji with BeautifulSoup and XPath

To start, let me share a line I love: is there anything cooler in this world than having code you wrote run on a hundred million computers? If there is, it's making that number ten times bigger!

1. BeautifulSoup implementation

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 12 17:41:06 2018
Scraping with Beautiful Soup
@author: Macbook
"""
import requests
import re
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
import csv
import time

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}

def get_one_page(url):
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def parse_one_page(content):
    try:
        soup = BeautifulSoup(content, 'html.parser')
        items = soup.find('div', class_=re.compile('js-tips-list'))
        for div in items.find_all('div', class_=re.compile('ershoufang-list')):
            # Some listings are missing information (e.g. no decoration or floor
            # details), so guard each one; otherwise one bad entry aborts the crawl.
            try:
                size = div.find('dd', class_=re.compile('size'))
                yield {
                    'Name': div.find('a', class_=re.compile('js-title')).text,
                    # tag.contents returns the tag's child nodes as a list
                    'Type': size.contents[1].text,
                    'Area': size.contents[5].text,
                    'Towards': size.contents[9].text,
                    'Floor': size.contents[13].text.replace('\n', ''),
                    'Decorate': size.contents[17].text,
                    'Address': div.find('span', class_=re.compile('area')).text.strip().replace(' ', '').replace('\n', ''),
                    'TotalPrice': div.find('span', class_=re.compile('js-price')).text
                                  + div.find('span', class_=re.compile('yue')).text,
                    'Price': div.find('div', class_=re.compile('time')).text
                }
            except (AttributeError, IndexError):
                continue  # skip listings with missing fields
    except Exception:
        return None

def main():
    for i in range(1, 500):
        # NOTE: the domain was lost when this post was extracted;
        # bj.ganji.com is the assumed host for the listing pages.
        url = 'http://bj.ganji.com/fang5/o{}/'.format(i)
        content = get_one_page(url)
        print('Page {} fetched'.format(i))
        # Data.csv goes to the working directory; use a full path to store it elsewhere.
        with open('Data.csv', 'a', encoding='gbk', newline='') as f:
            fieldnames = ['Name', 'Type', 'Area', 'Towards', 'Floor',
                          'Decorate', 'Address', 'TotalPrice', 'Price']
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            if i == 1:
                writer.writeheader()  # write the header row only once
            for item in parse_one_page(content):
                print(item)
                writer.writerow(item)
        time.sleep(2)  # throttle the crawl; going too fast triggers the site's verification page

if __name__ == '__main__':
    main()
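
The indices 1, 5, 9, 13, 17 passed to .contents above look arbitrary, but they fall out of how BeautifulSoup models the DOM: whitespace and separator text between tags become NavigableString children, so the element children sit at spaced-out positions in the list. Here is a minimal sketch with simplified stand-in markup (not Ganji's real page) that shows the effect:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Minimal sketch: why element children of a tag land at non-consecutive
# indices of .contents. The HTML below is an invented stand-in for
# Ganji's markup, not the actual page.
from bs4 import BeautifulSoup

html = """
<dd class="size">
    <span>2室1厅</span> |
    <span>89㎡</span> |
    <span>南北</span>
</dd>
"""
dd = BeautifulSoup(html, 'html.parser').find('dd')

# The whitespace/separator runs between the spans are children too:
print([type(c).__name__ for c in dd.contents])
# ['NavigableString', 'Tag', 'NavigableString', 'Tag', 'NavigableString', 'Tag', 'NavigableString']

print(dd.contents[1].text, dd.contents[3].text, dd.contents[5].text)
# 2室1厅 89㎡ 南北

In this toy markup the data spans land at every second index; on the real page, extra separator nodes presumably sit between the spans, which is why the script above steps by four instead.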

2. XPath implementation

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 12 15:37:50 2018
Scrape Beijing second-hand housing data from Ganji
Mainly practices using XPath to check whether certain elements exist,
so that a missing element does not abort the crawl
@author: Macbook
"""
import requests
from lxml import etree
from requests.exceptions import RequestException
import multiprocessing  # imported in the original script but not used in this version
import time

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}

def get_one_page(url):
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def parse_one_page(content):
    try:
        selector = etree.HTML(content)
        ALL = selector.xpath('//*[@id="f_mew_list"]/div[6]/div[1]/div[3]/div[1]/div')
        for div in ALL:
            # Guard each listing: if one of the fields is absent, the [0]
            # index raises IndexError and we skip that listing instead of
            # letting the whole crawl stop.
            try:
                yield {
                    'Name': div.xpath('dl/dd[1]/a/text()')[0],
                    'Type': div.xpath('dl/dd[2]/span[1]/text()')[0],
                    'Area': div.xpath('dl/dd[2]/span[3]/text()')[0],
                    'Towards': div.xpath('dl/dd[2]/span[5]/text()')[0],
                    'Floor': div.xpath('dl/dd[2]/span[7]/text()')[0].strip().replace('\n', ''),
                    'Decorate': div.xpath('dl/dd[2]/span[9]/text()')[0],
                    # the address spans several text nodes and needs special handling
                    'Address': div.xpath('dl/dd[3]//text()')[1]
                               + div.xpath('dl/dd[3]//text()')[3].replace('\n', '')
                               + div.xpath('dl/dd[3]//text()')[4].strip(),
                    'TotalPrice': div.xpath('dl/dd[5]/div[1]/span[1]/text()')[0]
                                  + div.xpath('dl/dd[5]/div[1]/span[2]/text()')[0],
                    'Price': div.xpath('dl/dd[5]/div[2]/text()')[0]
                }
            except IndexError:
                continue
    except Exception:
        return None

def main():
    for i in range(1, 500):  # crawl pages 1 through 499
        # NOTE: the domain was garbled to "/com/fang5/o{}/" when this post
        # was extracted; bj.ganji.com is the assumed host.
        url = 'http://bj.ganji.com/fang5/o{}/'.format(i)
        content = get_one_page(url)
        print('Page {} fetched'.format(i))
        for div in parse_one_page(content):
            print(div)

if __name__ == '__main__':
    main()
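
The docstring above says the script mainly practices checking whether an element exists before using it. With lxml the idiom is simple: xpath() always returns a list, so an empty list is the natural "element missing" signal. Below is a minimal sketch of such a guard; the first_or_default helper and the toy markup are illustrative inventions, not part of the original script:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Minimal sketch of an XPath existence check: substitute a default value
# when a node is absent instead of letting result[0] raise IndexError.
# The markup is a simplified stand-in for Ganji's page, not the real HTML.
from lxml import etree

html = '<dl><dd class="size"><span>2室1厅</span></dd></dl>'
root = etree.HTML(html)

def first_or_default(node, path, default='null'):
    # take the first matched text node, or the default when nothing matches
    result = node.xpath(path)
    return result[0] if result else default

print(first_or_default(root, '//dd[@class="size"]/span/text()'))      # 2室1厅
print(first_or_default(root, '//dd[@class="decorate"]/span/text()'))  # null

Wiring a helper like this into parse_one_page would let each missing field fall back to 'null', as the original author's comment intended, rather than skipping the whole listing.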

Keep at it, programmers!
