python 苹果app爬虫

编程入门 行业动态 更新时间:2024-10-09 10:25:53

python 苹果app爬虫

python 苹果app爬虫

#-*- coding: utf-8 -*-

import urllib2

import urllib

import re

import time

import MySQLdb

import time,datetime

#from datetime import date

#----------- APP store 排行榜 -----------

class Spider_Model:
    """Scraper for Apple App Store ranking pages.

    Fetches a ranking page, extracts per-app fields with regexes, follows each
    app's detail page for two extra fields (release date, compatibility), and
    inserts one row per app into a MySQL table.

    NOTE(review): the HTML fragments inside the regex literals below were
    destroyed when this file was scraped from the web; only the capture groups
    survive. Restore the real patterns from the original source before running.
    """

    def __init__(self):
        # Pagination/state flags (kept for interface compatibility; startWork
        # does not currently read them).
        self.page = 1
        self.pages = []
        self.enable = False

    def startWork(self, url, tabName):
        """Crawl the ranking page at `url` and insert each app into `tabName`.

        url     -- ranking-page URL to fetch
        tabName -- name of the MySQL table receiving one row per app
        """
        nowtime = int(time.time())      # crawl timestamp, epoch seconds
        content = self.GetCon(url)
        oneItems = self.Match(content)  # first-level fields, one list per app
        time.sleep(1)                   # be polite between requests
        for index, item in enumerate(oneItems):
            # item[1] is the app's detail-page URL; fetch two extra fields.
            content_two = self.GetCon(item[1])
            twoItems = self.Match_two(content_two)
            oneItems[index].append([twoItems[0], twoItems[1]])
            if oneItems[index][6][0] == '0':
                # Detail page carried no release date; store literal '0'.
                fabutime = '0'
            else:
                # Parse a date like '2014年01月01日' into epoch seconds.
                fabutime = int(time.mktime(time.strptime(
                    oneItems[index][6][0].strip(), '%Y年%m月%d日')))
            # Table names cannot be bound as query parameters, so tabName is
            # still concatenated; all scraped VALUES go through placeholders
            # instead of string splicing (the original built the whole
            # statement by hand, which is SQL-injectable from page content).
            sql = ("INSERT INTO " + tabName +
                   "(`rank`,`detailurl`,`logo`,`name`,`type`,`appid`,"
                   "`appstoretime`,`compatible`,`ctime`) "
                   "values(%s,%s,%s,%s,%s,%s,%s,%s,%s)")
            params = (oneItems[index][0], oneItems[index][1],
                      oneItems[index][2], oneItems[index][3],
                      oneItems[index][4], oneItems[index][5],
                      fabutime, oneItems[index][6][1], nowtime)
            self.contentDb(sql, params)
            time.sleep(1)

    def GetCon(self, url):
        """Fetch `url` with browser-like headers and return the raw body.

        Returns '' when the server rejects the request with an HTTP error
        (the original code printed the error body and then crashed reading a
        never-assigned global response).
        """
        # Browser-like headers: some servers answer 403 to obvious bots.
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        }
        req = urllib2.Request(url, headers=headers)
        try:
            myResponse = urllib2.urlopen(req)
        except urllib2.HTTPError as e:
            # HTTPError is itself a file-like response; show its body and
            # give the caller an empty page instead of crashing.
            print(e.fp.read())
            return ''
        return myResponse.read()

    def Match(self, con):
        """Extract first-level app fields from a ranking page.

        Returns a list of [rank, detailurl, logo, name, type, appid] lists.
        TODO(review): the HTML tag text inside both patterns was lost in
        extraction; only the capture-group skeleton below survives.
        """
        # re.S lets '.' match newlines; re.U enables Unicode matching.
        pattenA = re.compile(r'(.*?)', re.U | re.S)
        pattenB = re.compile(r'(.*?)(.*?)(.*?)(.*?)(.*?)', re.U | re.S)
        match = re.findall(pattenA, con)
        myItems = re.findall(pattenB, match[0])
        items = []
        for item in myItems:
            items.append([
                item[0].replace("\n", ""),
                item[1].replace("\n", ""),
                item[2].replace("\n", ""),
                item[3].replace("\n", "").split('-')[0],   # type before '-'
                item[4].replace("\n", ""),
                item[1].split('id')[1].split('?')[0],      # appid from URL
            ])
        return items

    def Match_two(self, con):
        """Extract [release-date, compatibility] from an app detail page.

        Either element is '0' when its pattern finds nothing.
        TODO(review): patterns garbled by extraction — see class docstring.
        """
        pattenTwoA = re.compile(r'.*?(.*?)', re.U | re.S)
        pattenTwoB = re.compile(r'.*?(.*?)', re.U | re.S)
        matchTwoA = self.is_empty(re.findall(pattenTwoA, con))
        matchTwoB = self.is_empty(re.findall(pattenTwoB, con))
        return [matchTwoA, matchTwoB]

    def is_empty(self, param):
        """Return the first element of `param`, or '0' when it is empty."""
        return param[0] if param else '0'

    def contentDb(self, sql, params=None):
        """Execute `sql` against MySQL and commit.

        params -- optional placeholder values for a parameterized query
                  (None keeps the original execute-a-literal behavior).
        """
        try:
            conn = MySQLdb.connect(host="主机", user="用户", passwd="密码",
                                   db="表名", charset='utf8')
            try:
                cur = conn.cursor()
                cur.execute(sql, params)
                conn.commit()   # original 'connmit()' was a scraping typo
            finally:
                conn.close()    # original leaked the connection
        except MySQLdb.Error as e:
            print("Mysql Error %d: %s" % (e.args[0], e.args[1]))

# (url, table) pairs: one ranking page per MySQL target table.
# NOTE(review): the real App Store ranking URLs were stripped to "/" when this
# file was scraped from the web; restore them before running.
addArr = [
    ["/", 'cg_jp_free'],   # free-app ranking
    ["/", 'cg_jp_paid'],   # paid-app ranking
]

if __name__ == '__main__':
    # Guarded so importing this module no longer kicks off a crawl.
    myModel = Spider_Model()
    for url, table in addArr:
        myModel.startWork(url, table)

初识Python 代码写得有点烂,自知罪孽深重......

python版本:2.7.5  测试环境:Linux、Windows

望高手拍砖 带我一起装逼!一起飞!

更多推荐

python 苹果app爬虫

本文发布于:2024-02-06 14:14:39,感谢您对本站的认可!
本文链接:https://www.elefans.com/category/jswz/34/1749429.html
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。
本文标签:爬虫   苹果   python   app

发布评论

评论列表 (有 0 条评论)
草根站长

>www.elefans.com

编程频道|电子爱好者 - 技术资讯及电子产品介绍!