[Python + MySQL] Multithreaded Stock Data Crawler


Stock data crawling

Purpose

Pull down the historical data and, combined with some classic data algorithms, see whether any mathematical patterns can be found.

Preparation

A MySQL database connection is required. I run it in Termux on an Android phone, which makes it convenient to move the code onto the phone later and, eventually, hook it up to WeChat via itchat (not written yet).
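
Before running the crawler it is worth confirming that pymysql can actually reach the database from Termux. This is a minimal sketch, not part of the original script, assuming the same credentials and `stock` database the crawler below uses; adjust host, user and password to your own instance.

import pymysql

# Connectivity check (illustration only): connect with the crawler's default
# credentials, create the `stock` database if it is missing, and print the
# server version to confirm MariaDB/MySQL is reachable.
connect = pymysql.connect(host='127.0.0.1', port=3306,
                          user='root', password='1@Qwertyuiop', charset='utf8')
with connect.cursor() as cursor:
    cursor.execute('CREATE DATABASE IF NOT EXISTS stock CHARACTER SET utf8;')
    cursor.execute('SELECT VERSION();')
    print(cursor.fetchone())
connect.close()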

Straight to the code

It simply downloads all of the data.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Author: CK
# Date: 2020-03-13
import datetime
import json
import random
import sys
import threading
import time
from queue import Queue
import os
import pymysql
import requests
from warnings import filterwarnings
from tqdm import tqdm
from stock_fomula import StockFomula
from data_simulator import Simulator

file_path = os.path.abspath(sys.argv[0])
abs_path = file_path[: file_path.rfind('/')]


class GetData:
    def __init__(self):
        """Initialize the MySQL connection"""
        self.connect = pymysql.connect(host='127.0.0.1',
                                       # host='192.168.3.42',
                                       port=3306,
                                       user='root',
                                       password='1@Qwertyuiop',
                                       database='stock',
                                       charset='utf8')
        self.raw = ''
        # Extend the write timeout
        self.connect._write_timeout = 10000
        # Thread lock
        self.lock = threading.Lock()

    @staticmethod
    def get_headers():
        """Pick a random User-Agent
        :return:
        """
        upper_path = abs_path[: abs_path.rfind('/')]
        with open(os.path.join(upper_path, "sources/headers.csv")) as ua:
            user_agent_list = ua.readlines()
        return user_agent_list

    def get_raw(self, url):
        """Fetch the raw data
        :param url:
        :return:
        """
        headers = {
            'Host': 'd.10jqka',
            'Referer': '.html',
            'User-Agent': random.choice(self.get_headers()).strip()
        }
        flag = 0
        while flag < 5:
            try:
                r = requests.get(url=url, headers=headers, timeout=60)
                flag = 5
                r.close()
                return r.text
            except Exception as ex:
                print('Read timed out, retrying, attempt: %s' % flag) if flag > 0 else None
                flag += 1
                time.sleep(10)

    def data_to_json(self, url):
        """Convert the raw response into JSON
        :return:
        """
        raw_data = self.get_raw(url)
        if raw_data:
            try:
                start_index = raw_data.find('{')
                end_index = raw_data.find('}')
                stock_id = raw_data[: start_index - 1].split('_')[-3]
                json_str = raw_data[start_index: end_index + 1]
                return stock_id, json_str
            except IndexError:
                return None

    @staticmethod
    def date_transfer(price, dates, year, priceFactor):
        """Join the price and date lists into records that actually mean something
        :param price:
        :param dates:
        :param year:
        :param priceFactor:
        :return:
        """
        result = []
        for i in range(len(dates)):
            date = str(year) + dates[i]
            opening = float(price[i * 4]) + float(price[i * 4 + 1])
            high = float(price[i * 4]) + float(price[i * 4 + 2])
            low = float(price[i * 4])
            closing = float(price[i * 4]) + float(price[i * 4 + 3])
            result.append(date + ',' + str(opening / priceFactor) + ',' + str(high / priceFactor) + ','
                          + str(low / priceFactor) + ',' + str(closing / priceFactor))
        return result

    def data_to_mysql(self, url, days=0):
        """Roughly two parts: data processing + data writing
        :param url:
        :param days: number of most recent days, used to update recent data
        :return:
        """
        # Data processing
        json_data = self.data_to_json(url)
        all_data = []
        if json_data:
            sf = StockFomula()
            stock_id = json_data[0]
            json_raw = json.loads(json_data[1])
            name = json_raw['name']
            sortYear = json_raw['sortYear']
            priceFactor = json_raw['priceFactor']
            price = json_raw['price'].split(',')
            dates = json_raw['dates'].split(',')
            for i in sortYear:
                year = i[0]
                num = i[1]
                front_price = price[: num * 4]
                price = price[num * 4:]
                front_dates = dates[: num]
                dates = dates[num:]
                lis = GetData.date_transfer(front_price, front_dates, year, priceFactor)
                all_data.extend(lis)
            # Compute the indicator values
            macd = sf.macd(all_data)
            kdj = sf.kdj(all_data)
            boll = sf.boll(all_data)
            ma = sf.ma(all_data)
            data = [all_data, macd, kdj, boll, ma]
            self.write_to_mysql(data, stock_id, name, days)
            return all_data
        else:
            # print('no data')
            return -1

    def write_to_mysql(self, data, stock_id, name, days):
        """Write the data into MySQL
        :param data:
        :param stock_id:
        :param name:
        :param days: see the comment on data_to_mysql
        :return:
        """
        # IF NOT EXISTS did not work here, so try/except is used instead
        cursor = self.connect.cursor()
        try:
            stock_data_create = '''CREATE TABLE stock_data (
                id int,
                stock_id varchar(8),
                date varchar(15),
                opening DECIMAL(10, 2),
                high DECIMAL(10, 2),
                low DECIMAL(10, 2),
                closing DECIMAL(10, 2),
                dif DECIMAL(10, 3),
                dea DECIMAL(10, 3),
                macd_bar DECIMAL(10, 3),
                k DECIMAL(10, 3),
                d DECIMAL(10, 3),
                j DECIMAL(10, 3),
                up DECIMAL(10, 3),
                mb DECIMAL(10, 3),
                dn DECIMAL(10, 3),
                ma_5 DECIMAL(10, 3),
                ma_6 DECIMAL(10, 3),
                ma_7 DECIMAL(10, 3),
                ma_8 DECIMAL(10, 3),
                ma_9 DECIMAL(10, 3),
                ma_10 DECIMAL(10, 3),
                ma_11 DECIMAL(10, 3),
                ma_12 DECIMAL(10, 3),
                ma_13 DECIMAL(10, 3),
                ma_14 DECIMAL(10, 3),
                ma_15 DECIMAL(10, 3),
                ma_16 DECIMAL(10, 3),
                ma_17 DECIMAL(10, 3),
                ma_18 DECIMAL(10, 3),
                ma_19 DECIMAL(10, 3),
                ma_20 DECIMAL(10, 3),
                ma_30 DECIMAL(10, 3),
                ma_60 DECIMAL(10, 3),
                ma_120 DECIMAL(10, 3),
                PRIMARY KEY (stock_id, date));'''
            cursor.execute(stock_data_create)
        except:
            pass
        try:
            stock_name_create = 'CREATE TABLE IF NOT EXISTS stock_name' \
                                '(stock_id VARCHAR(10) PRIMARY KEY , name VARCHAR(15)) CHARSET "utf8";'
            cursor.execute(stock_name_create)
        except:
            pass
        # days = 0 means iterate over everything
        if days == 0:
            days = len(data[0])
        now = datetime.datetime.now()
        now_date = now.strftime('%Y%m%d')
        # print(data[3])
        # for i in tqdm(range(len(data[0]) - days, len(data[0]))):
        for i in range(len(data[0]) - days, len(data[0])):
            try:
                date_price = data[0][i].split(',')
            except IndexError:
                return
            id = i
            date = date_price[0]
            # Do not update data carrying today's date
            if date == now_date:
                continue
            opening = float(date_price[1])
            high = float(date_price[2])
            low = float(date_price[3])
            closing = float(date_price[4])
            macd_data = data[1]
            kdj_data = data[2]
            boll_data = data[3]
            ma_data = data[4]
            dif = macd_data[1][i]
            dea = macd_data[2][i]
            macd_bar = macd_data[3][i]
            if not kdj_data:
                return
            k = kdj_data[1][i]
            d = kdj_data[2][i]
            j = kdj_data[3][i]
            up = boll_data[1][i]
            mb = boll_data[2][i]
            dn = boll_data[3][i]
            ma_5 = ma_data[0][i]
            ma_6 = ma_data[1][i]
            ma_7 = ma_data[2][i]
            ma_8 = ma_data[3][i]
            ma_9 = ma_data[4][i]
            ma_10 = ma_data[5][i]
            ma_11 = ma_data[6][i]
            ma_12 = ma_data[7][i]
            ma_13 = ma_data[8][i]
            ma_14 = ma_data[9][i]
            ma_15 = ma_data[10][i]
            ma_16 = ma_data[11][i]
            ma_17 = ma_data[12][i]
            ma_18 = ma_data[13][i]
            ma_19 = ma_data[14][i]
            ma_20 = ma_data[15][i]
            ma_30 = ma_data[16][i]
            ma_60 = ma_data[17][i]
            ma_120 = ma_data[18][i]
            try:
                self.connect.ping(reconnect=True)
                stock_data_add = 'REPLACE INTO stock_data (id, stock_id, date, opening, high, low, closing, dif, dea, ' \
                                 'macd_bar, k, d, j, up, mb, dn, ma_5, ma_6, ma_7, ma_8, ma_9, ma_10, ma_11, ma_12, ma_13, ' \
                                 'ma_14, ma_15, ma_16, ma_17, ma_18, ma_19, ma_20, ma_30, ma_60, ma_120) VALUES (' \
                                 '%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, ' \
                                 '%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
                cursor.execute(stock_data_add,
                               [id, stock_id, date, opening, high, low, closing, dif, dea, macd_bar, k, d, j, up, mb,
                                dn, ma_5, ma_6, ma_7, ma_8, ma_9, ma_10, ma_11, ma_12, ma_13, ma_14, ma_15, ma_16,
                                ma_17, ma_18, ma_19, ma_20, ma_30, ma_60, ma_120])
                self.connect.commit()
            except Exception as ex:
                print(ex)
                # self.connect.ping(reconnect=True)
                self.connect.rollback()
        # Insert the name, skipping duplicates
        try:
            self.connect.ping(reconnect=True)
            stock_name_add = 'INSERT IGNORE INTO stock_name(stock_id, name) VALUES (%s, %s);'
            cursor.execute(stock_name_add, [stock_id, name])
            self.connect.commit()
        except Exception as ex:
            print(ex)
            # self.connect.ping(reconnect=True)
            self.connect.rollback()
        cursor.close()

    def downloader(self, queue, days=0):
        """Unified download API with multithreading support
        :param queue: queue used for communication between threads
        :return:
        """
        if not queue.empty():
            stock_id = queue.get()
            url = '' + stock_id + '/01/all.js'
            # Acquire the thread lock
            self.lock.acquire()
            feedback = self.data_to_mysql(url, days)
            # Retry on failure
            count = 1
            while feedback == -1 and count <= 50:
                # if count == 1:
                #     print("%s entering retry" % stock_id)
                feedback = self.data_to_mysql(url, days)
                count += 1
            if feedback == -1 and count == 51:
                print("\033[01;32m%s retries exhausted\033[0m" % stock_id)
                open('fail.txt', 'a').write(stock_id + '\n')
            # print("\033[01;31m%s retry succeeded\033[0m" % stock_id) if feedback != -1 and count != 1 else None
            # Release the thread lock
            self.lock.release()


def main(thread_num, days=0):
    """
    :param thread_num: number of threads
    :param days: number of days to sync; 0 means sync everything
    :return:
    """
    s = Simulator()
    last_2rd_day = s.get_cur_date(timedelta=2)
    last_day = s.get_cur_date(timedelta=1)
    # Build the queue of stock codes
    flag = 0  # used to throttle how often download progress is printed
    with open(os.path.join(abs_path, 'all_stock_code.txt')) as asc:
        all_stock_code = asc.readlines()
    # all_stock_code = ['1A0001']
    size = len(all_stock_code)
    cur_time = datetime.datetime.now()
    print('Number of stocks: %d' % size, 'Start time: %s' % cur_time.strftime('%Y%m%d %H:%M:%S'), sep='\n')
    stock_code_queue = Queue()
    for i in all_stock_code:
        stock_code_queue.put(i.strip())
    gd = GetData()
    while not stock_code_queue.empty():
        flag += 1
        if flag % 14 == 0:
            print("Download progress %.2f%%" % ((size - stock_code_queue.qsize()) / size * 100))
        threads = []
        for i in range(thread_num):
            t = threading.Thread(target=gd.downloader, args=(stock_code_queue, days,))
            threads.append(t)
            t.start()
        for thread in threads:
            thread.join()
        time.sleep(random.random() * 2 + 1)
    cur_time = datetime.datetime.now()
    print('End time: %s' % cur_time.strftime('%Y%m%d %H:%M:%S'))


if __name__ == '__main__':
    # Suppress Warning messages
    filterwarnings('ignore', category=pymysql.Warning)
    try:
        gd = GetData()
        one_queue = Queue()
        one_queue.put(sys.argv[1])
        gd.downloader(one_queue, 3)
        print('Stock %s updated' % sys.argv[1])
    except IndexError:
        main(10, 5)
    # gd = GetData()
    # url = '' + '600016' + '/01/all.js'
    # print(gd.get_raw(url))
    # Data for the current day can be obtained via the .js endpoint
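
For reference, the all.js payload packs four numbers per trading day into `price`: a base value plus offsets for the open, high and close (the low is the base itself), all scaled down by `priceFactor`, while `sortYear` lists how many trading days belong to each year. A small worked example with made-up numbers, mirroring what `date_transfer` does:

# Worked example (hypothetical numbers) of how date_transfer unpacks `price`:
# each trading day takes four entries (base, open offset, high offset, close offset),
# the low equals the base, and everything is divided by priceFactor.
price = ['1215', '5', '12', '8']      # one hypothetical trading day
dates = ['0313']                      # MMDD; the year comes from sortYear
year, priceFactor = 2020, 100

base = float(price[0])
opening = (base + float(price[1])) / priceFactor   # 12.20
high = (base + float(price[2])) / priceFactor      # 12.27
low = base / priceFactor                           # 12.15
closing = (base + float(price[3])) / priceFactor   # 12.23
print(str(year) + dates[0], opening, high, low, closing)
# -> 20200313 12.2 12.27 12.15 12.23

When the script is launched with a stock code as its first argument it refreshes only the last three days for that single stock; with no argument it falls back to main(10, 5), i.e. ten worker threads syncing the last five days for every code listed in all_stock_code.txt.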

The headers.csv referenced in the code is shown below (one User-Agent per line; a short usage sketch follows the list).

Mozilla/5.0 (Windows NT 10.0; WOW64)
Mozilla/5.0 (Windows NT 6.3; WOW64)
Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11
Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko
Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36
Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)
Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1
Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3
Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12
Opera/9.27 (Windows NT 5.2; U; zh-cn)
Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0
Opera/8.0 (Macintosh; PPC Mac OS X; U; en)
Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6
Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)
Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)
Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1
Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)
Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER
Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11
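
get_headers expects this file under a sources/ directory one level above the script, and get_raw picks one line at random per request. A minimal sketch of that lookup (the relative path here is an assumption for illustration):

import os
import random

# Illustration only: read headers.csv (one User-Agent per line) and pick a random
# entry, stripping the trailing newline before it goes into the request headers.
with open(os.path.join('..', 'sources', 'headers.csv')) as ua:
    user_agent = random.choice(ua.readlines()).strip()
print(user_agent)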
