Python crawler: saving the Maoyan Top 100 to CSV, MySQL, and MongoDB


Version 1: write the results to a CSV file

import requests
import re
import csv


class MaoYanSpider:
    def __init__(self):
        self.headers = {"User-Agent": "Mozilla/4.0"}
        self.page = 1
        self.offset = 0
        # The URL was stripped from the original post; the Maoyan Top 100
        # board is presumably https://maoyan.com/board/4?offset=
        self.baseurl = "https://maoyan.com/board/4?offset="

    # Fetch the HTML source
    def getPage(self, url):
        res = requests.get(url, headers=self.headers)
        res.encoding = "utf-8"
        return res.text

    # Parse the HTML source with a regular expression
    def parsePage(self, html):
        p = re.compile(
            '<div class="movie-item-info">.*?title="(.*?)"'
            '.*?class="star">(.*?)</p>'
            '.*?class="releasetime">(.*?)</p>.*?</div>', re.S)
        # e.g. [("霸王别姬", "张国荣", "1993"), ...]
        return p.findall(html)

    # Append one page of results to the local CSV file
    def writeToCSV(self, content_list):
        with open("猫眼电影.csv", "a", newline="", encoding="gb18030") as f:
            writer = csv.writer(f)
            for r_t in content_list:
                # [5:9] slices the 4-digit year out of "上映时间:1993-01-01"
                L = [r_t[0].strip(), r_t[1].strip(), r_t[2].strip()[5:9]]
                writer.writerow(L)

    # Main loop
    def workOn(self):
        # Write the header row (note: the file is opened in append mode,
        # so rerunning the script adds another header)
        with open("猫眼电影.csv", "a", newline="", encoding="gb18030") as f:
            csv.writer(f).writerow(["电影名称", "主演", "上映时间"])
        while True:
            url = self.baseurl + str(self.offset)
            print("Crawling page %d ..." % self.page)
            html = self.getPage(url)
            content_list = self.parsePage(html)
            self.writeToCSV(content_list)
            print("Page %d saved" % self.page)
            c = input("Continue? (y/n): ")
            if c.strip().lower() == "y":
                self.offset += 10
                self.page += 1
            else:
                print("Done, thanks for using!")
                break


if __name__ == "__main__":
    spider = MaoYanSpider()
    spider.workOn()
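To sanity-check the CSV output, the file can be read straight back with the csv module. A minimal sketch, assuming the same 猫眼电影.csv file name and gb18030 encoding used above:

import csv

# Print the header plus the first five data rows of the spider's output
with open("猫眼电影.csv", newline="", encoding="gb18030") as f:
    for i, row in enumerate(csv.reader(f)):
        print(row)  # e.g. ['霸王别姬', '主演:张国荣,张丰毅,巩俐', '1993']
        if i >= 5:
            break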
---------------------------

Version 2: write the results to MongoDB

import requests
import re
import pymongo


class MaoYanSpider:
    def __init__(self):
        self.headers = {"User-Agent": "Mozilla/4.0"}
        self.page = 1
        self.offset = 0
        # Presumably https://maoyan.com/board/4?offset= (stripped in the original post)
        self.baseurl = "https://maoyan.com/board/4?offset="
        self.conn = pymongo.MongoClient("localhost", 27017)
        self.db = self.conn.MaoYan
        self.myset = self.db.film

    # Fetch the HTML source
    def getPage(self, url):
        res = requests.get(url, headers=self.headers)
        res.encoding = "utf-8"
        return res.text

    # Parse the HTML source with a regular expression
    def parsePage(self, html):
        p = re.compile(
            '<div class="movie-item-info">.*?title="(.*?)"'
            '.*?class="star">(.*?)</p>'
            '.*?class="releasetime">(.*?)</p>.*?</div>', re.S)
        # e.g. [("霸王别姬", "张国荣", "1993"), ...]
        return p.findall(html)

    # Insert one page of results into MongoDB
    def writeToMongo(self, content_list):
        for r_t in content_list:
            d = {
                "name": r_t[0].strip(),
                "star": r_t[1].strip(),
                "time": r_t[2].strip()[5:9],
            }
            # insert() was removed from pymongo; insert_one() is the current API
            self.myset.insert_one(d)

    # Main loop
    def workOn(self):
        while True:
            url = self.baseurl + str(self.offset)
            print("Crawling page %d ..." % self.page)
            html = self.getPage(url)
            content_list = self.parsePage(html)
            self.writeToMongo(content_list)
            print("Page %d saved" % self.page)
            c = input("Continue? (y/n): ")
            if c.strip().lower() == "y":
                self.offset += 10
                self.page += 1
            else:
                print("Done, thanks for using!")
                break


if __name__ == "__main__":
    spider = MaoYanSpider()
    spider.workOn()
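To confirm the documents landed, the collection can be queried back with pymongo. A minimal sketch against the same local MaoYan.film collection used above:

import pymongo

conn = pymongo.MongoClient("localhost", 27017)
myset = conn.MaoYan.film

# Total number of stored films, then every film released in 1993
print("documents stored:", myset.count_documents({}))
for doc in myset.find({"time": "1993"}, {"_id": 0}):
    print(doc)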
---------------------------

Version 3: write the results to MySQL

import requests
import re
import warnings
from pymysql import connect


class MaoYanSpider:
    def __init__(self):
        self.headers = {"User-Agent": "Mozilla/4.0"}
        self.page = 1
        self.offset = 0
        # Presumably https://maoyan.com/board/4?offset= (stripped in the original post)
        self.baseurl = "https://maoyan.com/board/4?offset="
        # Replace the placeholder with your own MySQL password
        self.db = connect(host="localhost", user="root",
                          password="your_mysql_password", charset="utf8")
        self.cursor = self.db.cursor()

    # Fetch the HTML source
    def getPage(self, url):
        res = requests.get(url, headers=self.headers)
        res.encoding = "utf-8"
        return res.text

    # Parse the HTML source with a regular expression
    def parsePage(self, html):
        p = re.compile(
            '<div class="movie-item-info">.*?title="(.*?)"'
            '.*?class="star">(.*?)</p>'
            '.*?class="releasetime">(.*?)</p>.*?</div>', re.S)
        # e.g. [("霸王别姬", "张国荣", "1993"), ...]
        return p.findall(html)

    # Create the database/table if needed, then insert one page of results
    def writeToMysql(self, content_list):
        c_db = "create database if not exists MaoYan;"
        u_db = "use MaoYan;"
        c_tab = ("create table if not exists film("
                 "id int primary key auto_increment,"
                 "name varchar(30),"
                 "star varchar(50),"
                 "time year)charset=utf8;")
        # MySQL emits a warning (not an error) when the database or table
        # already exists; escalate warnings so they can be caught and ignored
        warnings.filterwarnings("error")
        try:
            self.cursor.execute(c_db)
        except Warning:
            pass
        self.cursor.execute(u_db)
        try:
            self.cursor.execute(c_tab)
        except Warning:
            pass
        for r_t in content_list:
            # Parameterized insert; the original built the SQL with %-string
            # formatting, which breaks on quotes in titles
            args = (r_t[0].strip(), r_t[1].strip(), r_t[2].strip()[5:9])
            print("inserting:", args)
            self.cursor.execute(
                "insert into film(name,star,time) values(%s,%s,%s)", args)
        self.db.commit()

    # Main loop
    def workOn(self):
        while True:
            url = self.baseurl + str(self.offset)
            print("Crawling page %d ..." % self.page)
            html = self.getPage(url)
            content_list = self.parsePage(html)
            self.writeToMysql(content_list)
            print("Page %d saved" % self.page)
            c = input("Continue? (y/n): ")
            if c.strip().lower() == "y":
                self.offset += 10
                self.page += 1
            else:
                self.cursor.close()
                self.db.close()
                print("Done, thanks for using!")
                break


if __name__ == "__main__":
    spider = MaoYanSpider()
    spider.workOn()
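The MySQL variant can be verified the same way. A minimal sketch, assuming the connection settings above (the password placeholder is yours to fill in):

from pymysql import connect

db = connect(host="localhost", user="root",
             password="your_mysql_password", database="MaoYan",
             charset="utf8")
cursor = db.cursor()

# Row count, then the ten most recent films by release year
cursor.execute("select count(*) from film;")
print("rows stored:", cursor.fetchone()[0])
cursor.execute("select name,star,time from film order by time desc limit 10;")
for name, star, year in cursor.fetchall():
    print(name, star, year)

cursor.close()
db.close()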
