爬取微博头条的数据并将数据保存到Mysql和mongodb的数据库中

编程入门行业动态更新时间:2024-10-08 13:34:02

爬取微博头条的数据并将数据保存到Mysql和mongodb的数据库中

爬取微博头条的数据并将数据保存到Mysql和mongodb的数据库中

分析微博中属于头条的那一栏数据为ajax数据，分析接口获取动态加载页面的链接，分析json数据从而拿到每条数据的子链接，进入二级爬取页面获取待爬取的数据。

import requests
import json
import re
from wenben import SaveMysqlModule,mongoSaveModule
#wenben是一个py文件，我自己写的mysql和mongodb保存的模块就放在这个文件中
from lxml import etree
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'}
class WeiBo(object):
    """Crawl the Weibo headline feed and pass each article record to a sink.

    The feed is loaded through an AJAX endpoint that returns JSON. The
    ``data`` field of that JSON is an HTML fragment from which the article
    links are extracted; every ``https`` link is then fetched and parsed
    into a record that is handed to ``fname.write(keys, data)``.

    Args:
        fname: Sink object exposing ``write(keys, data)``. When ``None``,
            records are only printed, not stored.
        cookies: Optional raw ``Cookie`` header string
            (``"name=value; name2=value2"``) sent with article requests.
            Defaults to the session captured in ``DEFAULT_COOKIE``.
    """

    # NOTE(review): in the published source this literal began at
    # "=6&category=..." (scheme, host and path were missing), which makes
    # requests raise MissingSchema. The prefix below was reconstructed as
    # Weibo's "loadingmoreunlogin" endpoint -- confirm in the browser's
    # network tab before relying on it.
    AJAX_URL = ('https://weibo.com/a/aj/transform/loadingmoreunlogin?ajwvr'
                '=6&category=1760&page=%d&lefnav=0&cursor=&__rnd=1566899070160')

    # NOTE(review): this is a captured login session reproduced unchanged
    # from the source. Pass your own value through ``cookies`` instead of
    # relying on it.
    DEFAULT_COOKIE = '_s_tentry=passport.weibo; Apache=3590400122564.0547.1566897853524; SINAGLOBAL=3590400122564.0547.1566897853524; ULV=1566897853569:1:1:1:3590400122564.0547.1566897853524:; YF-V5-G0=95d69db6bf5dfdb71f82a9b7f3eb261a; login_sid_t=a29303dc49d6981b41e8d5b8c7e90010; cross_origin_proto=SSL; Ugrow-G0=e1a5a1aae05361d646241e28c550f987; UOR=,,www.baidu; wb_view_log=1366*7681; TC-V5-G0=4e714161a27175839f5a8e7411c8b98c; WBtopGlobal_register_version=307744aa77dd5677; SUB=_2A25wYX-zDeRhGeNP6FcW9SjEwzSIHXVTF9Z7rDV8PUNbmtAKLULnkW9NToBs74JFI4dmGFSRJhdgJ2P4Jj_uG65R; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWZ9iWxfzriauyb41dI1CC45JpX5o275NHD95QfeKefS0-c1hnRWs4Dqcjgi--fiKnNiKn4i--ciKLhiKnRdsLu9Btt; SUHB=0rr7n67ulfCL08; ALF=1567509093; SSOLoginState=1566904293; un=15013122417; YF-Page-G0=f1e19cba80f4eeaeea445d7b50e14ebb|1566904305|1566904175; TC-Page-G0=7f6863db1952ff8adac0858ad5825a3b|1566912201|1566912199'

    # Column order handed to the sink together with every record.
    FIELDS = ('链接', '标题', '发布时间', '作者', '转发数', '点赞数')

    # Seconds to wait for a response, so a stalled connection cannot hang
    # the crawl indefinitely.
    TIMEOUT = 10

    # Compiled once; these are applied to every page / article.
    _LINK_RE = re.compile('<a.*?href="(.*?)".*?target="_blank".*?</a>', re.S)
    _TIME_RE = re.compile(r'(\d{2}\-\d{4}\:\d{2})')

    def __init__(self, fname=None, cookies=None):
        self.fname = fname
        self.url = self.AJAX_URL  # URL template of the AJAX feed
        raw_cookie = self.DEFAULT_COOKIE if cookies is None else cookies
        self.cookies = self._parse_cookie_header(raw_cookie)

    @staticmethod
    def _parse_cookie_header(cook):
        """Turn a ``"k=v; k2=v2"`` cookie header into a dict."""
        return dict(pair.split('=', 1) for pair in cook.split('; ') if pair)

    @classmethod
    def _extract_time(cls, text):
        """Return the publish-time token found in *text*, or ``''`` if absent.

        All whitespace is removed before matching, so a timestamp such as
        ``"08-27 18:30"`` is returned as ``"08-2718:30"``.
        """
        match = cls._TIME_RE.search(re.sub(r'\s', '', text))
        return match.group() if match else ''

    def startspider(self, pages=40):
        """Crawl feed pages ``0 .. pages-1``."""
        for page in range(pages):
            self.parseurl(self.url % page)

    def parseurl(self, url):
        """Fetch one feed page and crawl every ``https`` article link in it."""
        response = requests.get(
            url, headers=headers, timeout=self.TIMEOUT).content.decode()
        res = json.loads(response)
        # 'data' carries the HTML fragment holding the article anchors.
        text = res.get('data') or ''
        for src in self._LINK_RE.findall(text):
            if src.startswith('https'):
                print(src)
                self.parse_src(src)

    def parse_src(self, src):
        """Fetch the article at *src*, build its record and save it.

        Requests without login cookies are recognised as a crawler, so the
        cookies of a logged-in session are sent along.
        """
        response = requests.get(
            url=src, headers=headers, cookies=self.cookies,
            timeout=self.TIMEOUT).content.decode()
        tree = etree.HTML(response)
        if tree is None:
            # lxml yields None for an empty document; nothing to extract.
            return

        title = ''.join(tree.xpath('//*[@id="plc_main"]/div/div/div/div[2]/div[1]/text()'))
        title = re.sub(r'\W', '', title)
        zuozhe = ''.join(tree.xpath('//*[@id="plc_main"]/div/div/div/div[2]/div[2]/div[1]/span[1]/a/em/text()'))
        zhuanfa = ''.join(tree.xpath('//*[@id="plc_main"]/div/div/div/div[5]/div/div/div[1]/div/ul/li[1]/a/span/span/text()'))
        dianzan = ''.join(tree.xpath('//*[@id="plc_main"]/div/div/div/div[5]/div/div/div[1]/div/ul/li[3]/a/span/span/span/em/text()'))
        wenben = ' '.join(tree.xpath('//*[@id="plc_main"]/div/div/div/div[2]/div[2]/div[1]//text()'))

        item = {
            '链接': src,
            '标题': title,
            # Empty string instead of an AttributeError when no time is found.
            '发布时间': self._extract_time(wenben),
            '作者': zuozhe,
            '转发数': zhuanfa,
            '点赞数': dianzan,
        }
        print(item)
        self.saveData(item)

    def saveData(self, data):
        """Hand *data* to the configured sink along with the field order.

        The keys are passed separately so that a SQL sink can build its
        placeholder list from them. Does nothing when no sink was given.
        """
        if self.fname is None:
            return
        self.fname.write(list(self.FIELDS), data)
if __name__ == '__main__':
    # Script entry point: open both storage back-ends, crawl, and always
    # release the connections afterwards (they were previously never closed).
    sqlconfig = {'host': 'localhost', 'user': 'root', 'password': None, 'db': 'weibo'}
    # MySQL sink: 'toutiao' is the table name, 'weibo' the database name.
    t = SaveMysqlModule('toutiao', **sqlconfig)
    try:
        # MongoDB sink: 'weibo' is the database name, 'toutiao' the collection.
        t1 = mongoSaveModule('weibo', 'toutiao')
        try:
            # Only the MongoDB sink is handed to the spider; pass ``t``
            # instead to store the records in MySQL.
            spider = WeiBo(t1)
            spider.startspider()  # start crawling
        finally:
            t1.close()
    finally:
        t.close()

以下是数据库的类，保存在另外一个py文件中，在上面的代码中需要用的时候直接导入即可。代码中用到的数据库、表、集合都要自己事先创建。

class SaveMysqlModule(object):
    """Sink that appends records to an existing MySQL table via pymysql.

    Args:
        tbname: Name of the target table.
        **kwargs: Connection arguments forwarded to ``pymysql.connect``
            (host="", user="", password="", db="", port="").
    """

    def __init__(self, tbname, **kwargs):
        # Function-scope import: no module-level import of pymysql is
        # visible alongside this class.
        import pymysql
        self.db = pymysql.connect(charset='utf8', **kwargs)
        self.cur = self.db.cursor()
        self.tbname = tbname

    def write(self, keys, data):
        """Insert one row built from *data*, ordered by *keys*.

        One ``%s`` placeholder is generated per key; keys missing from
        *data* are stored as the string ``"None"``. The insert is committed
        on success and rolled back (then re-raised) on failure.
        """
        values = [data.get(key, "None") for key in keys]
        placeholders = ','.join(['%s'] * len(values))
        sql = 'insert into %s values(%s)' % (self.tbname, placeholders)
        try:
            self.cur.execute(sql, values)
            # The source read "self.dbmit()", a garbled "self.db.commit()".
            self.db.commit()
        except Exception:
            # Leave the connection usable for the next record.
            self.db.rollback()
            raise

    def close(self):
        """Close the cursor and the connection."""
        self.cur.close()
        self.db.close()


class mongoSaveModule(object):
    """Sink that inserts records into a MongoDB collection via pymongo.

    Args:
        db: Database name.
        collection: Collection name.
        host: MongoDB host.
        port: MongoDB port.
    """

    def __init__(self, db, collection, host='localhost', port=27017):
        # Function-scope import: no module-level import of pymongo is
        # visible alongside this class.
        import pymongo
        self.client = pymongo.MongoClient(host=host, port=port)
        self.db = self.client[db]
        self.col = self.db[collection]

    def write(self, keys, data):
        """Insert one document holding exactly the fields named in *keys*.

        Keys missing from *data* are stored as the string ``"None"``;
        entries of *data* not listed in *keys* are dropped.
        """
        vals = {key: data.get(key, "None") for key in keys}
        self.col.insert_one(vals)

    def close(self):
        """Close the client connection."""
        self.client.close()

更多推荐

爬取微博头条的数据并将数据保存到Mysql和mongodb的数据库中

本文发布于:2024-02-07 08:57:39，感谢您对本站的认可！

本文链接:https://www.elefans.com/category/jswz/34/1755814.html