爬虫"/>
Python crawler for Apple App Store apps
#-*- coding: utf-8 -*-
import urllib2
import re
import time
import MySQLdb

#----------- App Store ranking list -----------
class Spider_Model:

    def __init__(self):
        self.page = 1
        self.pages = []
        self.enable = False

    def startWork(self, url, tabName):
        nowtime = int(time.time())
        content = self.GetCon(url)
        oneItems = self.Match(content)  # extract the first-level fields from the ranking page
        time.sleep(1)
        for index, item in enumerate(oneItems):
            content_two = self.GetCon(item[1])       # fetch the app detail page
            twoItems = self.Match_two(content_two)   # release date and compatibility
            oneItems[index].append([twoItems[0], twoItems[1]])
            if oneItems[index][6][0] == '0':
                fabutime = '0'
            else:
                # the detail page shows a Chinese date such as 2014年01月01日; convert it to a unix timestamp
                fabutime = int(time.mktime(time.strptime(oneItems[index][6][0].strip(), '%Y年%m月%d日')))
            sql = "INSERT INTO " + tabName + "(`rank`,`detailurl`,`logo`,`name`,`type`,`appid`,`appstoretime`,`compatible`,`ctime`) values(%s,%s,%s,%s,%s,%s,%s,%s,%s)" % (
                '"' + oneItems[index][0] + '"', '"' + oneItems[index][1] + '"', '"' + oneItems[index][2] + '"',
                '"' + oneItems[index][3] + '"', '"' + oneItems[index][4] + '"', '"' + oneItems[index][5] + '"',
                fabutime, '"' + oneItems[index][6][1] + '"', nowtime)
            self.contentDb(sql)
            time.sleep(1)

    def GetCon(self, url):
        myUrl = url
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
        }
        # the site blocks plain crawlers; the headers above make the request look like a normal browser
        req = urllib2.Request(myUrl, headers=headers)
        global myResponse
        try:
            myResponse = urllib2.urlopen(req)
        except urllib2.HTTPError, e:
            print e.fp.read()
        # the try/except has to be there; without it the request still comes back 403 even with the browser headers (reason unknown...)
        myPage = myResponse.read()
        # encode() turns a unicode object into a byte string in the given encoding
        # decode() turns a byte string in some encoding into a unicode object
        #unicodePage = myPage.decode('utf-8').encode('gbk','ignore')
        #unicodePage = myPage.decode('utf-8','ignore')
        return myPage

    def Match(self, con):
        # find the div block that holds the ranking list, then pull each app entry out of it
        # re.S makes . match newlines as well
        # NOTE: the HTML literals inside these two patterns were lost when the original post was published;
        # only the capture groups survive, so the markup of the target page has to be filled back in.
        pattenA = re.compile(r'(.*?)', re.U | re.S)
        pattenB = re.compile(r'(.*?).(.*?)(.*?).*?', re.U | re.S)
        match = re.findall(pattenA, con)
        myItems = re.findall(pattenB, match[0])
        items = []
        for item in myItems:
            # rank, detail url, logo url, name, type, appid (parsed out of the detail url)
            items.append([item[0].replace("\n", ""),
                          item[1].replace("\n", ""),
                          item[2].replace("\n", ""),
                          (item[3].replace("\n", "")).split('-')[0],
                          item[4].replace("\n", ""),
                          (item[1].split('id')[1]).split('?')[0]])
        return items

    def Match_two(self, con):
        # release date and compatibility string from the detail page
        # NOTE: as above, the HTML literals in these patterns were lost in the original post
        pattenTwoA = re.compile(r'.*?(.*?)', re.U | re.S)
        pattenTwoB = re.compile(r'.*?(.*?)', re.U | re.S)
        matchTwoA = self.is_empty(re.findall(pattenTwoA, con))
        matchTwoB = self.is_empty(re.findall(pattenTwoB, con))
        itemsTwo = [matchTwoA, matchTwoB]
        return itemsTwo

    def is_empty(self, param):
        # return the first match, or '0' when the pattern matched nothing
        if len(param):
            res = param[0]
        else:
            res = '0'
        return res

    def contentDb(self, sql):
        try:
            conn = MySQLdb.connect(host="host", user="user", passwd="password", db="dbname", charset='utf8')
            cur = conn.cursor()
            result = cur.execute(sql)
            conn.commit()
        except MySQLdb.Error, e:
            print "Mysql Error %d: %s" % (e.args[0], e.args[1])


# the ranking-page URLs were stripped from the original post; "/" is only a placeholder
addArr = [["/", 'cg_jp_free'],
          ["/", 'cg_jp_paid']]
myModel = Spider_Model()
for val in addArr:
    myModel.startWork(val[0], val[1])
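
The INSERT built in startWork() assumes that cg_jp_free and cg_jp_paid already exist with the nine columns it writes. The original post does not include the table definition, so the following is only a sketch of a schema that would accept those rows; the column types and the connection placeholders are assumptions.

# -*- coding: utf-8 -*-
# Hypothetical DDL for the two ranking tables; the column types are guesses
# inferred from the values assembled in startWork(), not taken from the original post.
import MySQLdb

CREATE_SQL = """
CREATE TABLE IF NOT EXISTS %s (
  `id` INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
  `rank` VARCHAR(8) NOT NULL,          -- position in the ranking list
  `detailurl` VARCHAR(255) NOT NULL,   -- link to the app detail page
  `logo` VARCHAR(255) NOT NULL,        -- icon URL
  `name` VARCHAR(255) NOT NULL,        -- app name
  `type` VARCHAR(64) NOT NULL,         -- category (text before the '-')
  `appid` VARCHAR(32) NOT NULL,        -- numeric id parsed from detailurl
  `appstoretime` INT NOT NULL,         -- release date as a unix timestamp, 0 if unknown
  `compatible` VARCHAR(255) NOT NULL,  -- compatibility string from the detail page
  `ctime` INT NOT NULL                 -- crawl time as a unix timestamp
) DEFAULT CHARSET=utf8
"""

conn = MySQLdb.connect(host="host", user="user", passwd="password", db="dbname", charset='utf8')
cur = conn.cursor()
for tab in ('cg_jp_free', 'cg_jp_paid'):
    cur.execute(CREATE_SQL % tab)
conn.commit()
conn.close()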
This is my first attempt at Python, so the code is a bit rough; apologies for the sins committed here......
Python version: 2.7.5. Test environments: Linux and Windows.
Criticism from the experts is very welcome — take me along for the ride!