省市区的行政代码编号
直接爬取个省市区的行政代码编号
今天的工作内容,需要获取某省级所有的行政区域编码,由于数据量太多,又懒得逐条整理,索性花费一点时间,写了一个爬虫。
由于又懒得定位某省的,索性全国的编码都获取下来,至于剩下的查看某省份的信息,那就交给其他同事好了。
再懒,也得说一下代码的结构,这是乌龟的屁股(规定),O(∩_∩)O~
一、爬取的内容以csv文件存储,
二、爬取的层数:
1层:爬取省份的信息
2层:爬取市的信息
3层:爬取区的信息
下面直接上代码:
# coding: utf-8
# auth = 'carl_DJ'
import requests

from lxml import etree
import csv, time
import pandas as pd
from queue import Queue
from threading import Thread


# Fetch one page and return its decoded text.
def getUrl(url, num_retries=5):
    """Download *url* and return the response body as text.

    Retries up to *num_retries* times (sleeping 5 s between attempts)
    when the request fails; implicitly returns None once retries are
    exhausted, so callers should be prepared for a None result.
    """
    headers = {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"}
    try:
        response = requests.get(url, headers=headers)
        # NOTE(review): pages are decoded as GBK — presumably the target
        # site serves GBK; confirm against the actual server headers.
        response.encoding = 'GBK'
        data = response.text
        return data
    # BUG FIX: the original read `except Ellipsis as e:`. `Ellipsis` is not
    # an exception class, so the handler itself would raise TypeError on any
    # failure and the retry logic never ran. Catch the requests exception
    # hierarchy instead (covers connection errors, timeouts, etc.).
    except requests.RequestException as e:
        if num_retries > 0:
            time.sleep(5)
            print(url)
            print("request fail,retry!")
            return getUrl(url, num_retries - 1)  # recursive retry
        else:
            print("retry fail!")
            print("errors:%s" % e + "" + url)
# Scrape province names and links from the top-level index page.
def getProvice(url):
    """Return a list of ``{'name', 'link'}`` dicts, one per province.

    *url* is the index page URL; the province links in the page are
    relative, so each is joined onto the index URL with its trailing
    'index.html' (10 characters) stripped off.
    """
    provice = []
    data = getUrl(url)
    selector = etree.HTML(data)
    proviceList = selector.xpath('//tr[@class="provincetr"]')
    for row in proviceList:
        proviceName = row.xpath('td/a/text()')
        proviceLink = row.xpath('td/a/@href')
        # BUG FIX: the original indexed proviceName[j] by the length of
        # proviceLink, which raises IndexError if the lists ever differ in
        # length; zip keeps the name/link pairs aligned safely.
        for name, link in zip(proviceName, proviceLink):
            proviceURL = url[:-10] + link  # absolute URL for this province
            provice.append({'name': name, 'link': proviceURL})
    return provice
# Scrape city-level code/name/link from every province page.
def getCity(url_list):
    """Return a list of ``{'name', 'code', 'link'}`` dicts for all cities.

    *url_list* is an iterable of province page URLs (e.g. the 'link'
    column of the DataFrame built from getProvice's result).
    """
    city_all = []
    for url in url_list:
        data = getUrl(url)
        selector = etree.HTML(data)
        cityList = selector.xpath('//tr[@class="citytr"]')
        for row in cityList:
            cityCode = row.xpath('td[1]/a/text()')
            cityLink = row.xpath('td[1]/a/@href')
            cityName = row.xpath('td[2]/a/text()')
            # BUG FIX: the original iterated range(len(cityLink)) and indexed
            # the other two lists with it — an IndexError waiting to happen
            # if the lists differ in length; zip keeps the triples aligned.
            for code, link, name in zip(cityCode, cityLink, cityName):
                # province pages end in 'NN.html' (7 chars); strip that and
                # append the relative city link to build the absolute URL
                cityURL = url[:-7] + link
                city_all.append({'name': name, 'code': code, 'link': cityURL})
    return city_all
# Scrape county-level info from all city pages, using worker threads.
def getCounty(url_list):
    """Return a list of ``{'code', 'link', 'name'}`` dicts for all counties.

    City pages are downloaded by a pool of worker threads pulling URLs
    from a shared queue. Results are appended to one shared list —
    list.append is atomic in CPython, so no extra locking is needed.
    """
    from queue import Empty  # local import: only the workers need it

    queue_county = Queue()   # work queue of city-page URLs
    thread_num = 10          # number of worker THREADS (not processes)
    county = []              # collected county dicts, shared across workers

    def produce_url(url_list):
        # Fill the queue completely before any worker starts.
        for url in url_list:
            queue_county.put(url)

    def getData():
        while True:
            # BUG FIX: the original did `while not queue.empty(): get()`.
            # Between empty() and get() another worker can drain the queue,
            # leaving this thread blocked on get() forever. get_nowait()
            # plus catching Empty is race-free.
            try:
                url = queue_county.get_nowait()
            except Empty:
                break
            data = getUrl(url=url)
            selector = etree.HTML(data)
            countyList = selector.xpath('//tr[@class="countytr"]')
            for row in countyList:
                countyCode = row.xpath('td[1]/a/text()')
                countyLink = row.xpath('td[1]/a/@href')
                countyName = row.xpath('td[2]/a/text()')
                # zip keeps code/link/name triples aligned (the original
                # indexed by range(len(countyLink)))
                for code, link, name in zip(countyCode, countyLink, countyName):
                    # city pages end in 'NN/NNNN.html'-style tails; drop the
                    # last 9 chars and append the relative county link
                    countyURL = url[:-9] + link
                    county.append({'code': code, 'link': countyURL, 'name': name})

    def run(url_list):
        produce_url(url_list)
        workers = []
        for _ in range(thread_num):
            th = Thread(target=getData)
            th.start()
            workers.append(th)
        for th in workers:
            th.join()

    run(url_list)
    return county
# ---- script driver: scrape all three levels and export each to CSV ----
# NOTE(review): the start URL below (".html") looks truncated by the blog
# platform — it should be the full index URL of the administrative-division
# code site. Restore the real URL before running.

# province level
pro = getProvice(".html")
df_province = pd.DataFrame(pro)
df_province.info()
df_province.to_csv('province.csv', sep=',', header=True, index=False)

# city level
city = getCity(df_province['link'])
df_city = pd.DataFrame(city)
df_city.info()
df_city.to_csv('city.csv', sep=',', header=True, index=False)

# county level
county = getCounty(df_city['link'])
df_county = pd.DataFrame(county)
df_county.info()
df_county.to_csv('county.csv', sep=',', header=True, index=False)
更多推荐
直接爬取个省市区的行政代码编号
发布评论