Planning a sensible database wrapper
import logging
import pymysql
from pymysql.cursors import DictCursor
from DBUtils.PooledDB import PooledDB  # DBUtils 1.x; in DBUtils 2.x this becomes `from dbutils.pooled_db import PooledDB`
logger = logging.getLogger()
class PyMysqlPoolBase(object):
"""
建立 mysql 连接池的 python 封装
与直接建立 mysql 连接的不同之处在于连接在完成操作之后不会立即提交关闭
可复用以提高效率
"""
_pool = None
def __init__(self,
host,
port,
user,
password,
db=None, init=True):
self.db_host = host
self.db_port = int(port)
self.user = user
self.password = str(password)
self.db = db
self.connection = None
self.cursor = None
        if init:
            self.init()

    def init(self):
        self._getConn()

    def _getConn(self):
        """
        @summary: take a connection from the shared pool, creating the pool on first use
        @return: a pooled pymysql connection
        """
        if PyMysqlPoolBase._pool is None:
            PyMysqlPoolBase._pool = PooledDB(
                creator=pymysql,  # module used to create the underlying connections
                mincached=1,      # idle connections opened when the pool is created; 0 means none
                maxcached=20,     # max idle connections kept in the pool; 0/None means no limit
                host=self.db_host,
                port=self.db_port,
                user=self.user,
                passwd=self.password,
                db=self.db,
                use_unicode=True,
                charset="utf8",
                cursorclass=DictCursor)
        self.connection = PyMysqlPoolBase._pool.connection()
def _exec_sql(self, sql, param=None):
with self.connection.cursor() as cursor:
if param is None:
count = cursor.execute(sql)
else:
count = cursor.execute(sql, param)
return count
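    # e.g. self._exec_sql("UPDATE demo SET name = %s WHERE id = %s;", ("foo", 1))
    # (the demo table and columns are made-up placeholders)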
def insert(self, sql, params=None):
"""
@summary: 更新数据表记录
@param sql: SQL 格式及条件,使用 (%s,%s)
@param params: 要更新的值: tuple/list
@return: count 受影响的行数
"""
return self._exec_sql(sql, params)
def select_all(self, sql, params=None):
with self.connection.cursor() as cursor:
cursor.execute(sql, params)
results = cursor.fetchall()
return results
def select_many(self, sql, params=None, size=1):
with self.connection.cursor() as cursor:
cursor.execute(sql, params)
results = cursor.fetchmany(size)
return results
def select_one(self, sql, params=None):
with self.connection.cursor() as cursor:
cursor.execute(sql, params)
result = cursor.fetchone()
return result
def insert_many(self, sql, values):
"""
@summary: 向数据表插入多条记录
@param sql:要插入的 SQL 格式
@param values:要插入的记录数据tuple(tuple)/list[list]
@return: count 受影响的行数
"""
with self.connection.cursor() as cursor:
count = cursor.executemany(sql, values)
return count
def update(self, sql, param=None):
"""
@summary: 更新数据表记录
@param sql: SQL 格式及条件,使用(%s,%s)
@param param: 要更新的 值 tuple/list
@return: count 受影响的行数
"""
return self._exec_sql(sql, param)
def delete(self, sql, param=None):
"""
@summary: 删除数据表记录
@param sql: SQL 格式及条件,使用(%s,%s)
@param param: 要删除的条件 值 tuple/list
@return: count 受影响的行数
"""
return self._exec_sql(sql, param)
def begin(self):
"""
@summary: 开启事务
"""
self.connection.autocommit(0)
def end(self, option='commit'):
"""
@summary: 结束事务
"""
if option == 'commit':
self.connection.commit()
else:
self.connection.rollback()
def dispose(self, isEnd=1):
"""
@summary: 释放连接池资源
"""
if isEnd == 1:
self.end('commit')
else:
self.end('rollback')
self.connection.close()
The benefit of this wrapper is that a connection is not committed and torn down after every single operation; it goes back to the pool for reuse, which is noticeably more efficient.
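A minimal usage sketch of the wrapper (the connection parameters and the demo table are placeholders, not part of the project):
pool = PyMysqlPoolBase(host="127.0.0.1", port=3306, user="root", password="***", db="test")
pool.begin()  # switch off autocommit for an explicit transaction
count = pool.insert("INSERT INTO demo (name) VALUES (%s);", ("margin",))
pool.dispose()  # commit and hand the connection back to the pool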
Handling configuration
Write the project's configuration as key-value pairs in a .conf file:
[test]
TEST_MYSQL_HOST = 127.0.0.1
TEST_MYSQL_PORT = 3306
TEST_MYSQL_USER = root
TEST_MYSQL_PASSWORD = ruiyang
TEST_MYSQL_DB = test_furuiyang
# ...
Read the configuration in the configs module with configparser; note the cf.get('test', ...) calls below require the [test] section header shown above. Environment variables take precedence over the file:
import configparser
import os
env = os.environ
cf = configparser.ConfigParser()
thisdir = os.path.dirname(__file__)
cf.read(os.path.join(thisdir, '.conf'))
TEST_MYSQL_HOST = env.get("TEST_MYSQL_HOST", cf.get('test', 'TEST_MYSQL_HOST'))
TEST_MYSQL_PORT = int(env.get("TEST_MYSQL_PORT", cf.get('test', 'TEST_MYSQL_PORT')))
TEST_MYSQL_USER = env.get("TEST_MYSQL_USER", cf.get('test', 'TEST_MYSQL_USER'))
TEST_MYSQL_PASSWORD = env.get("TEST_MYSQL_PASSWORD", cf.get('test', 'TEST_MYSQL_PASSWORD'))
TEST_MYSQL_DB = env.get("TEST_MYSQL_DB", cf.get('test', 'TEST_MYSQL_DB'))
# ...
At the bottom of configs.py, verify that the configuration values are set correctly:
if __name__ == "__main__":
import sys
mod = sys.modules[__name__]
attrs = dir(mod)
attrs = [attr for attr in attrs if not attr.startswith("__") and attr.isupper()]
for attr in attrs:
print(attr, ":", getattr(mod, attr))
Also add the .conf file to .gitignore so the credentials are never committed.
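The ignore rule itself is a single line in .gitignore:
.conf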
Designing the spider base class
The base class encapsulates the functionality shared by all the spiders.
import base64
import hashlib
import hmac
import json
import logging
import os
import sys
import time
import traceback
import requests
import urllib.parse
cur_path = os.path.split(os.path.realpath(__file__))[0]
file_path = os.path.abspath(os.path.join(cur_path, ".."))
sys.path.insert(0, file_path)
from ExchangeMargin.configs import (SPIDER_MYSQL_HOST, SPIDER_MYSQL_PORT, SPIDER_MYSQL_USER, SPIDER_MYSQL_PASSWORD,
SPIDER_MYSQL_DB, PRODUCT_MYSQL_HOST, PRODUCT_MYSQL_PORT, PRODUCT_MYSQL_USER,
PRODUCT_MYSQL_PASSWORD, PRODUCT_MYSQL_DB, JUY_HOST, JUY_PORT, JUY_USER, JUY_PASSWD,
JUY_DB, DC_HOST, DC_PORT, DC_USER, DC_PASSWD, DC_DB, SECRET, TOKEN, TEST_MYSQL_HOST,
TEST_MYSQL_PORT, TEST_MYSQL_USER, TEST_MYSQL_PASSWORD, TEST_MYSQL_DB, LOCAL)
from ExchangeMargin.sql_pool import PyMysqlPoolBase
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
class MarginBase(object):
    # spider (raw crawl) database
spider_cfg = {
"host": SPIDER_MYSQL_HOST,
"port": SPIDER_MYSQL_PORT,
"user": SPIDER_MYSQL_USER,
"password": SPIDER_MYSQL_PASSWORD,
"db": SPIDER_MYSQL_DB,
}
    # production database
product_cfg = {
"host": PRODUCT_MYSQL_HOST,
"port": PRODUCT_MYSQL_PORT,
"user": PRODUCT_MYSQL_USER,
"password": PRODUCT_MYSQL_PASSWORD,
"db": PRODUCT_MYSQL_DB,
}
    # JuYuan database
juyuan_cfg = {
"host": JUY_HOST,
"port": JUY_PORT,
"user": JUY_USER,
"password": JUY_PASSWD,
"db": JUY_DB,
}
    # data-center database
dc_cfg = {
"host": DC_HOST,
"port": DC_PORT,
"user": DC_USER,
"password": DC_PASSWD,
"db": DC_DB,
}
    # test database
test_cfg = {
"host": TEST_MYSQL_HOST,
"port": TEST_MYSQL_PORT,
"user": TEST_MYSQL_USER,
"password": TEST_MYSQL_PASSWORD,
"db": TEST_MYSQL_DB,
}
def __init__(self):
self.target_table_name = 'stk_mttargetsecurities'
self.juyuan_table_name = 'MT_TargetSecurities'
self.is_local = LOCAL
        self.dc_client = None
        self.target_client = None
        self.juyuan_client = None
        self.test_client = None
        self.spider_client = None
def _init_pool(self, cfg: dict):
"""
eg.
conf = {
"host": LOCAL_MYSQL_HOST,
"port": LOCAL_MYSQL_PORT,
"user": LOCAL_MYSQL_USER,
"password": LOCAL_MYSQL_PASSWORD,
"db": LOCAL_MYSQL_DB,
}
:param cfg:
:return:
"""
pool = PyMysqlPoolBase(**cfg)
return pool
def _dc_init(self):
if not self.dc_client:
self.dc_client = self._init_pool(self.dc_cfg)
def _target_init(self):
if not self.target_client:
self.target_client = self._init_pool(self.product_cfg)
def _juyuan_init(self):
if not self.juyuan_client:
self.juyuan_client = self._init_pool(self.juyuan_cfg)
    def _test_init(self):
        if not self.test_client:
            self.test_client = self._init_pool(self.test_cfg)

    def _spider_init(self):
        if not self.spider_client:
            self.spider_client = self._init_pool(self.spider_cfg)

    def _process_content(self, content):
        # helper used by the spiders below; a minimal whitespace cleanup
        return " ".join(content.split())
def __del__(self):
        for sql_client in (self.dc_client, self.target_client,
                           self.juyuan_client, self.test_client, self.spider_client):
if sql_client:
sql_client.dispose()
def contract_sql(self, datas, table: str, update_fields: list):
if not isinstance(datas, list):
datas = [datas, ]
to_insert = datas[0]
ks = []
vs = []
for k in to_insert:
ks.append(k)
vs.append(to_insert.get(k))
fields_str = "(" + ",".join(ks) + ")"
values_str = "(" + "%s," * (len(vs) - 1) + "%s" + ")"
base_sql = '''INSERT INTO `{}` '''.format(table) + fields_str + ''' values ''' + values_str
params = []
for data in datas:
vs = []
for k in ks:
vs.append(data.get(k))
params.append(vs)
if update_fields:
on_update_sql = ''' ON DUPLICATE KEY UPDATE '''
for update_field in update_fields:
on_update_sql += '{}=values({}),'.format(update_field, update_field)
on_update_sql = on_update_sql.rstrip(",")
sql = base_sql + on_update_sql + """;"""
else:
sql = base_sql + ";"
return sql, params
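    # A sketch of what contract_sql() produces; the table and values are made up:
    #   contract_sql({"SecuCode": "600000", "SecuAbbr": "PFYH"}, "demo", ["SecuAbbr"])
    # returns the SQL
    #   INSERT INTO `demo` (SecuCode,SecuAbbr) values (%s,%s) ON DUPLICATE KEY UPDATE SecuAbbr=values(SecuAbbr);
    # together with params [["600000", "PFYH"]].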
    def _batch_save(self, sql_pool, to_inserts, table, update_fields):
        count = None
        try:
            sql, values = self.contract_sql(to_inserts, table, update_fields)
            count = sql_pool.insert_many(sql, values)
        except Exception:
            traceback.print_exc()
            logger.warning("batch insert failed")
        else:
            logger.info("batch inserted {} rows".format(count))
            sql_pool.end()
        return count
    def _save(self, sql_pool, to_insert, table, update_fields):
        count = None
        try:
            insert_sql, values = self.contract_sql(to_insert, table, update_fields)
            value = values[0]
            count = sql_pool.insert(insert_sql, value)
        except Exception:
            traceback.print_exc()
            logger.warning("insert failed")
        else:
            # with ON DUPLICATE KEY UPDATE, MySQL reports 1 affected row for a new
            # insert, 2 for an update of an existing row, and 0 for no change
            if count == 1:
                logger.info("inserted new row {}".format(to_insert))
            elif count == 2:
                logger.info("refreshed existing row {}".format(to_insert))
            else:
                logger.info("row already up to date {}".format(to_insert))
            sql_pool.end()
        return count
    def get_inner_code(self, secu_code):
        ret = self.inner_code_map.get(secu_code)
        if not ret:
            logger.warning("no inner code found for {}".format(secu_code))
            raise ValueError("no inner code for {}".format(secu_code))
        return ret
@property
def inner_code_map(self):
"""
获取聚源内部编码映射表
https://dd.gildata.com/#/tableShow/27/column///
https://dd.gildata.com/#/tableShow/718/column///
"""
self._juyuan_init()
# 8 是开放式基金
sql = 'SELECT SecuCode,InnerCode from SecuMain WHERE SecuCategory in (1, 2, 8) and SecuMarket in (83, 90) and ListedSector in (1, 2, 6, 7);'
ret = self.juyuan_client.select_all(sql)
info = {}
for r in ret:
key = r.get("SecuCode")
value = r.get('InnerCode')
info[key] = value
return info
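    # Note: inner_code_map is a property, so each get_inner_code() call re-runs the
    # query above. If that becomes a bottleneck, caching the dict once per instance
    # (e.g. in a hypothetical self._inner_code_cache attribute) is a small change.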
def ding(self, msg):
def get_url():
timestamp = str(round(time.time() * 1000))
secret_enc = SECRET.encode('utf-8')
string_to_sign = '{}\n{}'.format(timestamp, SECRET)
string_to_sign_enc = string_to_sign.encode('utf-8')
hmac_code = hmac.new(secret_enc, string_to_sign_enc, digestmod=hashlib.sha256).digest()
sign = urllib.parse.quote_plus(base64.b64encode(hmac_code))
            url = 'https://oapi.dingtalk.com/robot/send?access_token={}&timestamp={}&sign={}'.format(
                TOKEN, timestamp, sign)
return url
url = get_url()
header = {
"Content-Type": "application/json",
"Charset": "UTF-8"
}
message = {
"msgtype": "text",
"text": {
"content": "{}@15626046299".format(msg)
},
"at": {
"atMobiles": [
"15626046299",
],
"isAtAll": False
}
}
message_json = json.dumps(message)
resp = requests.post(url=url, data=message_json, headers=header)
        if resp.status_code == 200:
            logger.info("DingTalk message sent: {}".format(msg))
        else:
            logger.warning("failed to send the DingTalk message")
Crawling the exchanges' margin-trading announcements
When a security is moved off the margin-trading list, the exchange publishes an announcement (it does not announce additions). I save these announcements as one basis for cross-checking the data.
import datetime
import json
import os
import random
import sys
import time
from urllib.parse import urljoin
import requests
from lxml import html
cur_path = os.path.split(os.path.realpath(__file__))[0]
file_path = os.path.abspath(os.path.join(cur_path, ".."))
sys.path.insert(0, file_path)
from ExchangeMargin.base import MarginBase, logger
class MarginBroadcast(MarginBase):
"""爬取上交所和深交所的融资融券公告"""
def __init__(self):
        super(MarginBroadcast, self).__init__()
        self.fields = ['title', 'link', 'time', 'content', 'keyword']
# sh
self.sh_url = 'http://www.sse.com.cn/disclosure/magin/announcement/s_index.htm'
self.sh_base_url = 'http://www.sse.com.cn/disclosure/magin/announcement/s_index_{}.htm'
self.dt_format = "%Y-%m-%d"
self.error_urls = []
# sz
self.sz_url = 'http://www.szse.cn/api/search/content?random={}'.format(random.random())
self.error_pages = []
self.headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Content-Length': '85',
'Content-Type': 'application/x-www-form-urlencoded',
'Host': 'www.szse.cn',
'Origin': 'http://www.szse.cn',
'Pragma': 'no-cache',
'Referer': 'http://www.szse.cn/disclosure/margin/business/index.html',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
'X-Request-Type': 'ajax',
'X-Requested-With': 'XMLHttpRequest',
}
self.announcement_table = 'margin_announcement'
    def _create_table(self):
        """Create the announcement spider table"""
        self._spider_init()
        sql = '''
            CREATE TABLE IF NOT EXISTS `{}` (
              `id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT 'ID',
              `market` int(11) DEFAULT NULL COMMENT 'securities market',
              `title` varchar(200) DEFAULT NULL COMMENT 'announcement title',
              `link` varchar(200) DEFAULT NULL COMMENT 'announcement link',
              `time` datetime NOT NULL COMMENT 'publication time',
              `content` text CHARACTER SET utf8 COLLATE utf8_bin COMMENT 'announcement content',
              `keyword` text CHARACTER SET utf8 COLLATE utf8_bin COMMENT 'announcement keywords',
              `CREATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP,
              `UPDATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
              PRIMARY KEY (`id`),
              UNIQUE KEY `un2` (`market`, `link`) USING BTREE
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin COMMENT='exchange margin-trading announcements';
        '''.format(self.announcement_table)
        # insert() simply executes the statement, so it works for DDL as well
        self.spider_client.insert(sql)
        logger.info("announcement spider table created")
def _make_sz_params(self, page_num):
'''
keyword:
time: 0
range: title
channelCode[]: business_news
currentPage: 1
pageSize: 20
'''
datas = {
"keyword": '',
'time': '',
'range': 'title',
'channelCode[]': 'business_news',
'currentPage': page_num,
'pageSize': 20,
}
return datas
def parse_json_content(self, url):
"""eg. http://www.szse.cn/disclosure/notice/general/t20200430_576647.json """
resp = requests.get(url)
if resp.status_code == 200:
ret = resp.text
py_data = json.loads(ret)
content = py_data.get("data", {}).get("content", '')
if content:
doc = html.fromstring(content)
content = self._process_content(doc.text_content())
return content
return ''
def trans_dt(self, dt: int):
"""eg. 1588176000000 """
if not isinstance(dt, int):
dt = int(dt)
tl = time.localtime(dt/1000)
ret = time.strftime("%Y-%m-%d %H:%M:%S", tl)
return ret
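    # e.g. trans_dt(1588176000000) -> '2020-04-30 00:00:00' on a machine in UTC+8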
def sz_start(self):
self._spider_init()
for page in range(1, 8):
logger.info("page is {}".format(page))
datas = self._make_sz_params(page)
resp = requests.post(self.sz_url, headers=self.headers, data=datas)
# print(resp)
if resp.status_code == 200:
ret = resp.text
py_ret = json.loads(ret)
announcements = py_ret.get("data")
items = []
for a in announcements:
# print(a)
item = dict()
                    item['market'] = 90  # SZSE
item['title'] = a.get("doctitle")
item['link'] = a.get("docpuburl")
item['time'] = self.trans_dt(a.get('docpubtime'))
# eg. http://www.szse.cn/disclosure/notice/general/t20200430_576647.json
content_json_url = urljoin("http://www.szse.cn", a.get("docpubjsonurl"))
content = self.parse_json_content(content_json_url)
item['content'] = content
items.append(item)
print(items)
                ret = self._batch_save(self.spider_client, items, self.announcement_table, self.fields)
else:
self.error_pages.append(page)
def sh_start(self):
for page in range(1, 23):
if page == 1:
url = self.sh_url
else:
url = self.sh_base_url.format(page)
self.post_sh(url)
def post_sh(self, url):
self._spider_init()
resp = requests.post(url)
if resp.status_code == 200:
body = resp.text
try:
body = body.encode("ISO-8859-1").decode("utf-8")
            except (UnicodeEncodeError, UnicodeDecodeError):
self.error_urls.append(url)
doc = html.fromstring(body)
broadcasts = doc.xpath(".//div[@class='sse_list_1 js_createPage']/dl/dd")
items = []
for b in broadcasts:
item = dict()
                item['market'] = 83  # SSE
show_dt_str = b.xpath("./span")[0].text_content()
show_dt = datetime.datetime.strptime(show_dt_str, self.dt_format)
item['time'] = show_dt
title = b.xpath("./a")[0].text_content()
item['title'] = title
href = b.xpath("./a/@href")[0]
href = urljoin("http://www.sse.com.cn/", href)
item['link'] = href
if href.endswith(".pdf") or href.endswith(".doc"):
item['content'] = ''
item['keyword'] = ''
else:
ret = self.parse_sh_detail(href)
content = ret.get("content")
keyword = ret.get("keyword")
item['content'] = content
item['keyword'] = keyword
items.append(item)
print(items)
            ret = self._batch_save(self.spider_client, items, self.announcement_table, self.fields)
def parse_sh_detail(self, url):
"""
eg. http://www.sse.com.cn/disclosure/magin/announcement/ssereport/c/c_20200430_5085195.shtml
:param url:
:return:
"""
        resp = requests.get(url)
        if resp.status_code != 200:
            self.error_urls.append(url)
            return {"content": '', "keyword": ''}
        body = resp.text
        try:
            body = body.encode("ISO-8859-1").decode("utf-8")
        except (UnicodeEncodeError, UnicodeDecodeError):
            self.error_urls.append(url)
        doc = html.fromstring(body)
        try:
            # TODO: strip noise from the content
            content = doc.xpath("//div[@class='allZoom']")[0].text_content()
        except IndexError:
            content = ''
        if not content:
            try:
                content = doc.xpath("//div[@class='article-infor']")[0].text_content()
            except IndexError:
                content = ''
        if content:
            content = self._process_content(content)
        # extract the article's keywords
        try:
            key_words = doc.xpath("//span[@id='keywords']")[0].text_content().split()
            words = []
            for word in key_words:
                word = word.strip(",")
                words.append(word)
            words_str = ','.join(words)
        except IndexError:
            words_str = ''
        return {"content": content, "keyword": words_str}
def start(self):
self._create_table()
self.sh_start()
self.sz_start()
logger.info(self.error_urls)
logger.info(self.error_pages)
def task():
MarginBroadcast().start()
if __name__ == "__main__":
task()
Broadcasting announcement updates
Broadcast the last few days' announcements to DingTalk:
import datetime
import os
import re
import sys
cur_path = os.path.split(os.path.realpath(__file__))[0]
file_path = os.path.abspath(os.path.join(cur_path, ".."))
sys.path.insert(0, file_path)
from ExchangeMargin.base import MarginBase
class BoardCast(MarginBase):
"""公告播报"""
def __init__(self):
super(BoardCast, self).__init__()
        # only show announcements published after this cut-off
self.offset = 10
self.early_day = datetime.datetime.combine(datetime.datetime.today(), datetime.time.min) - datetime.timedelta(days=self.offset)
self.table = 'margin_announcement'
def show_info(self, market):
"""选择爬虫库的原始资讯数据"""
self._spider_init()
sql = """
select time, content from {} where market = {} and time >= '{}';
""".format(self.table, market, self.early_day)
board_info = self.spider_client.select_all(sql)
return board_info
def show_sql_info(self, secu_code):
"""查看 datacenter 数据库的情况"""
self._dc_init()
        format_str = "JuYuan inner code: {} listed: {}, removed: {} currently on the list: {}\n"
inner_code = self.get_inner_code(secu_code)
sql = """
select * from {} where InnerCode = {};
""".format(self.target_table_name, inner_code)
ret = self.dc_client.select_all(sql)
        # render the rows as readable text
        msg1 = 'financing: '
        msg2 = 'securities lending: '
for r in ret:
if r.get("TargetCategory") == 10:
msg1 += format_str.format(r.get("InnerCode"), r.get("InDate"), r.get("OutDate"), r.get("TargetFlag"))
else:
msg2 += format_str.format(r.get("InnerCode"), r.get("InDate"), r.get("OutDate"), r.get("TargetFlag"))
return msg1 + msg2
def start(self):
# sh
msg = "*** 公告播报[最近{}天] *** \n".format(self.offset) + '*'*20 + '上交所' + "*"*20 + '\n'
infos = self.show_info(83)
for info in infos:
_time = info.get("time")
content = info.get("content")
msg += "发布时间: {} 公告内容: {}\n".format(_time, content)
secu_codes = re.findall(r"\d{6}", content)
msg += '公告中包含的证券代码有 {}\n'.format(secu_codes)
if secu_codes:
for secu_code in secu_codes:
msg += "{}: \n".format(secu_code)
sql_info = self.show_sql_info(secu_code)
msg += sql_info
msg += '\n'
msg += "*"*20 + '\n'
# sz
        msg += '*'*20 + 'SZSE' + "*"*20 + '\n'
infos = self.show_info(90)
for info in infos:
_time = info.get("time")
content = info.get("content")
msg += "发布时间: {} 公告内容: {}\n".format(_time, content)
secu_codes = re.findall(r"\d{6}", content)
msg += '公告中包含的证券代码有 {}\n'.format(secu_codes)
if secu_codes:
for secu_code in secu_codes:
msg += "{}: \n".format(secu_code)
sql_info = self.show_sql_info(secu_code)
msg += sql_info
msg += '\n'
msg += "*"*20 + '\n'
print(msg)
self.ding(msg)
def board_task():
BoardCast().start()
if __name__ == "__main__":
board_task()
The SSE list-page spider
# coding=utf8
import datetime
import os
import re
import sys
import demjson
import requests
from lxml import html
cur_path = os.path.split(os.path.realpath(__file__))[0]
file_path = os.path.abspath(os.path.join(cur_path, ".."))
sys.path.insert(0, file_path)
from ExchangeMargin.configs import LOCAL
from ExchangeMargin.base import MarginBase, logger
class SHMarginSpider(MarginBase):
"""爬取当日的上交所融资融券列表
网址: http://www.sse.com.cn/services/tradingservice/margin/info/againstmargin/
更新时间: 当日即可更新当日数据
更新内容: 目前[当日的]融资买入以及融券卖出列表
爬虫表: margin_sh_list_spider
更新时间: 每天的 3、9、15 点更新一次
说明: 清单数据不可回溯历史。爬虫表时间是从部署上线时间 2020-05-11 开始的。
"""
def __init__(self):
super(SHMarginSpider, self).__init__()
self.url = 'http://www.sse.com.cn/services/tradingservice/margin/info/againstmargin/'
self.spider_table_name = 'margin_sh_list_spider'
    def _create_table(self):
        """Create the SSE list spider table"""
        sql = '''
            CREATE TABLE IF NOT EXISTS `{}` (
              `id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT 'ID',
              `SecuMarket` int(11) DEFAULT NULL COMMENT 'securities market',
              `InnerCode` int(11) NOT NULL COMMENT 'inner code',
              `SecuCode` varchar(10) DEFAULT NULL COMMENT 'security code',
              `SecuAbbr` varchar(200) DEFAULT NULL COMMENT 'security abbreviation',
              `SerialNumber` int(10) DEFAULT NULL COMMENT 'serial number on the website list',
              `ListDate` datetime NOT NULL COMMENT 'date listed',
              `TargetCategory` int(11) NOT NULL COMMENT 'target category',
              `CREATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP,
              `UPDATETIMEJZ` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
              PRIMARY KEY (`id`),
              UNIQUE KEY `un2` (`SecuMarket`, `TargetCategory`, `ListDate`, `InnerCode`) USING BTREE
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin COMMENT='margin-trading target securities list';
        '''.format(self.spider_table_name)
        self.spider_client.insert(sql)
        self.spider_client.end()
def start(self):
"""
<li><a href="#tableData_961" data-toggle="tab">融资买入标的证券一览表
</a></li><li><a href="#tableData_962" data-toggle="tab">融券卖出标的证券一览表
</a></li><li><a href="#tableData_960" data-toggle="tab">融资融券可充抵保证金证券一览表
"""
        msg = 'local test:\n' if LOCAL else "remote:\n"
self._spider_init()
self._create_table()
resp = requests.get(self.url)
if resp.status_code == 200:
page = resp.text.encode("ISO-8859-1").decode("utf-8")
doc = html.fromstring(page)
fields = ['SecuMarket', 'InnerCode', 'SecuCode', 'SecuAbbr', 'SerialNumber', 'ListDate', 'TargetCategory']
spider = self._init_pool(self.spider_cfg)
# 962
datas = doc.xpath("//div[@class='table-responsive sse_table_T01 tdclickable']/table[@class='table search_']/script[@type='text/javascript']")[0].text
table_datas = re.findall('''tableData\['tableData_962'\] = (.*);''', datas, re.DOTALL)[0]
py_datas = demjson.decode(table_datas)
show_date = py_datas.get("staticDate")
show_dt = datetime.datetime.combine(datetime.datetime.strptime(show_date, "%Y-%m-%d %H:%M:%S"), datetime.time.min)
list_datas = py_datas.get("list")
lst_datas = []
for data in list_datas:
if data:
lst_datas.append(data)
items = []
for data in lst_datas:
item = dict()
item['SerialNumber'] = int(data[0]) # str
item['SecuAbbr'] = data[2]
item['ListDate'] = show_dt
                # target category: 10 = financing-buy target, 20 = short-sell target
item['TargetCategory'] = 20
item['SecuMarket'] = 83
secu_code = data[1].strip()
item['SecuCode'] = secu_code
inner_code = self.get_inner_code(secu_code)
item['InnerCode'] = inner_code
items.append(item)
self._batch_save(self.spider_client, items, self.spider_table_name, fields)
msg += "{} 上交所的融券卖出标的爬虫入库成功\n".format(show_dt)
# 961
datas = doc.xpath("//table[@class='table search_bdzqkc search3T']/script[@type='text/javascript']")[0].text
show_dt = datetime.datetime.strptime(re.findall("var showdate = '(.*)'", datas)[0], "%Y%m%d")
table_datas = re.findall('''tableData\['tableData_961'\] = (.*);''', datas, re.DOTALL)[0]
py_datas = demjson.decode(table_datas)
list_datas = py_datas.get("list")
lst_datas = []
for data in list_datas:
if data:
lst_datas.append(data)
            # No.   SecuCode    SecuAbbr
            # e.g. [['1', '510050 ', '50ETF '], ['2', '510180 ', '180ETF '], ...
items = []
for data in lst_datas:
item = dict()
item['SerialNumber'] = int(data[0])
item['SecuAbbr'] = data[2]
item['ListDate'] = show_dt
                # target category: 10 = financing-buy target, 20 = short-sell target
item['TargetCategory'] = 10
item['SecuMarket'] = 83
secu_code = data[1].strip()
item['SecuCode'] = secu_code
inner_code = self.get_inner_code(secu_code)
item['InnerCode'] = inner_code
items.append(item)
self._batch_save(self.spider_client, items, self.spider_table_name, fields)
msg += "{} 上交所的融资买入标的爬虫入库成功\n".format(show_dt)
self.ding(msg)
        else:
            raise Exception("page request failed")
def sh_list_task():
SHMarginSpider().start()
if __name__ == '__main__':
sh_list_task()
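To match the schedule in the docstring (03:00, 09:00 and 15:00 daily), a crontab entry along these lines works; the script path here is hypothetical:
0 3,9,15 * * * python /path/to/ExchangeMargin/sh_margin_spider.py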
Deriving changes from list diffs
import datetime
import os
import sys
cur_path = os.path.split(os.path.realpath(__file__))[0]
file_path = os.path.abspath(os.path.join(cur_path, ".."))
sys.path.insert(0, file_path)
from ExchangeMargin.configs import LOCAL
from ExchangeMargin.base import MarginBase, logger
class ShSync(MarginBase):
"""上交融资融券标的正式表记录生成
思路: list spider 的时间是连续的, 在某一天将昨天和前天的数据进行 diff, 得到 to_add 和 to_delete。
核对中发现: 将某只证券移出会出公告, 但是将其加入的时候不会。 因无法回溯历史列表,使用 detail 列表的 diff 替代。 存疑。
运行时间: 在每天的 12 点运行
"""
def __init__(self):
super(ShSync, self).__init__()
self.spider_table_name = 'margin_sh_list_spider'
self.fields = ["SecuMarket", "InnerCode", "InDate", "OutDate", "TargetCategory", "TargetFlag", "ChangeReasonDesc"]
def get_spider_latest_list(self, market, category):
"""获取爬虫库中最新的清单"""
self._spider_init()
sql = '''select InnerCode from {} where ListDate = (select max(ListDate) from {} \
where SecuMarket = {} and TargetCategory = {}) and SecuMarket = {} and TargetCategory = {};
'''.format(self.spider_table_name, self.spider_table_name, market, category, market, category)
ret = self.spider_client.select_all(sql)
ret = [r.get("InnerCode") for r in ret]
return ret
def get_spider_dt_list(self, dt, category):
"""获取爬虫库中具体某一天的清单"""
self._spider_init()
sql_dt = '''select max(ListDate) as mx from {} where ListDate <= '{}' and SecuMarket =83 and TargetCategory = {};
'''.format(self.spider_table_name, dt, category)
dt_ = self.spider_client.select_one(sql_dt).get("mx")
logger.info("距离 {} 最近的之前的一天是{}".format(dt, dt_))
if dt_:
sql = '''select InnerCode from {} where ListDate = '{}' and SecuMarket = 83 and TargetCategory = {};
'''.format(self.spider_table_name, dt_, category)
ret = self.spider_client.select_all(sql)
ret = [r.get("InnerCode") for r in ret]
return ret
else:
return []
    def _update(self, inner_code, dt, type, to_add):
        """
        Wrapper around the database writes.
        :param inner_code: JuYuan inner code
        :param dt: time of the change
        :param type: 10 = financing, 20 = securities lending
        :param to_add: 1 = added to the list, 0 = removed
        :return:
        """
        self._target_init()
        if to_add:  # the security was added to the list
            if type == 10:  # financing
                item = {
                    "SecuMarket": 83,  # SSE
                    "InnerCode": inner_code,  # JuYuan inner code
                    "InDate": dt,  # time it was added
                    'TargetCategory': 10,  # financing
                    'TargetFlag': 1,
                    'ChangeReasonDesc': None,
                    'UpdateTime': datetime.datetime.now(),
                }
            else:
                item = {
                    "SecuMarket": 83,
                    "InnerCode": inner_code,  # JuYuan inner code
                    "InDate": dt,
                    'TargetCategory': 20,  # securities lending
                    'TargetFlag': 1,
                    'ChangeReasonDesc': None,
                    'UpdateTime': datetime.datetime.now(),
                }
            count = self._save(self.target_client, item, self.target_table_name, self.fields)
        else:  # the security was removed from the list
            if type == 10:  # financing
                base_sql = '''update {} set OutDate = '{}', TargetFlag = 0 where SecuMarket = 83 and InnerCode = {} \
and TargetCategory = 10 and TargetFlag = 1; '''
            else:  # securities lending
                base_sql = '''update {} set OutDate = '{}', TargetFlag = 0 where SecuMarket = 83 and InnerCode = {} \
and TargetCategory = 20 and TargetFlag = 1; '''
            sql = base_sql.format(self.target_table_name, dt, inner_code)
            count = self.target_client.update(sql)
        self.target_client.end()
    def start(self):
        msg = ''
        local_str = "local test: " if LOCAL else "remote: "
        msg += local_str
        msg += 'SSE data generation:\n'
        _today = datetime.datetime.combine(datetime.datetime.today(), datetime.time.min)
        _yester_day = _today - datetime.timedelta(days=1)
        _before_yester_day = _today - datetime.timedelta(days=2)
        print(_yester_day, "***", _before_yester_day)
        msg += 'dates: {} vs {}\n'.format(_yester_day, _before_yester_day)
        _type_str_map = {
            10: "financing",
            20: "securities lending",
        }
        for _type in (10, 20):
            logger.info(_type)
            # yesterday's list
            _yester_day_list = self.get_spider_dt_list(_yester_day, _type)
            # the day before yesterday's list
            _before_yester_day_list = self.get_spider_dt_list(_before_yester_day, _type)
            print(len(_yester_day_list), len(_before_yester_day_list))
            # only diff when both lists are non-empty
            if _yester_day_list and _before_yester_day_list:
                to_add = set(_yester_day_list) - set(_before_yester_day_list)
                to_delete = set(_before_yester_day_list) - set(_yester_day_list)
                logger.info("to add: {}, to delete: {}".format(to_add, to_delete))
                msg += "{}: to add: {}, to delete: {}\n".format(_type_str_map.get(_type), to_add, to_delete)
                if to_add:
                    for one in to_add:
                        # inner code, change time, financing/lending type, add/remove flag
                        self._update(one, _yester_day, _type, 1)
                if to_delete:
                    for one in to_delete:
                        self._update(one, _yester_day, _type, 0)
        msg += 'consistency check:\n'
        # the list implied by the stored state (product_dt_datas is not shown here)
        dc_list_10 = set(self.product_dt_datas(83, 10))
        # the actual crawled list
        spider_list_10 = set(self.get_spider_latest_list(83, 10))
        msg += "financing consistency check: dc_list - latest_list_spider >> {}, latest_list_spider - dc_list >> {}\n".format(
            dc_list_10 - spider_list_10, spider_list_10 - dc_list_10)
        dc_list_20 = set(self.product_dt_datas(83, 20))
        spider_list_20 = set(self.get_spider_latest_list(83, 20))
        msg += "securities-lending consistency check: dc_list - latest_list_spider >> {}, latest_list_spider - dc_list >> {}\n".format(
            dc_list_20 - spider_list_20, spider_list_20 - dc_list_20)
print(msg)
self.ding(msg)
def diff_task():
ShSync().start()
if __name__ == "__main__":
diff_task()
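The diff itself is plain set arithmetic. A tiny illustration with made-up inner codes:
yesterday = {1001, 1002, 1003}      # yesterday's crawled list
day_before = {1001, 1003, 1004}     # the day before's crawled list
to_add = yesterday - day_before     # {1002}: newly listed, insert a row with InDate
to_delete = day_before - yesterday  # {1004}: removed, set OutDate and TargetFlag = 0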