我正在参加「掘金·启航计划」
近段时间又迷上了双色球,即使已经知道这是一个随机过程。曾经一个朋友说过:一注彩票付出两块钱,带来的是对生活的一份期待。当然研究彩票不是目的而是手段,主要还是借此学习一下可视化库pyecharts。
官方文档 pyecharts - A Python Echarts Plotting Library built with love.
图表Demo - Document (pyecharts.org)
数据抓取
数据来源:
datachart.500.com/ssq/history…
F12发现数据请求接口网址是:
datachart.500.com/ssq/history…
返回的是html文本,结果如下,我们需要解析的数据位于<tbody id="tdata">中
这里通过xpath来提取(也可以用BeautifulSoup)。首先找到id为tdata的tbody标签,然后选取属性为t_tr1的tr子节点,再遍历所有子节点,找到对应的期数+6个红球+1个蓝球+开奖日期这9列。不知道是不是编码的原因,提取出来的文本类型为lxml.etree._ElementUnicodeResult,需要将它转换为字符串 。
详细代码:
def fetch_data(start: "int | None" = None, end: "int | str" = '', limit: "int | None" = None) -> List[list]:
    """Download the draw-history page from 500.com and parse it into rows.

    Args:
        start: First draw period to request. Left out of the query when
            ``None`` (previously the literal text "None" was sent).
        end: Last draw period to request; sent as an empty value by default.
        limit: When truthy, ``limit=<limit>&sort=0`` is requested instead and
            ``start``/``end`` are ignored (presumably the most recent draws;
            confirm against the site).

    Returns:
        One list of strings per ``<tr class="t_tr1">`` row inside
        ``<tbody id="tdata">``: the text of the first eight cells followed by
        the text of the last cell.

    Raises:
        requests.RequestException: On connection errors, timeouts or a
            non-2xx HTTP status.
    """
    base_url = "http://datachart.500.com/ssq/history/newinc/history.php"
    if limit:
        params = {"limit": limit, "sort": 0}
    else:
        # requests omits parameters whose value is None, so an unset ``start``
        # is simply not sent.
        params = {"start": start, "end": end}
    # A timeout keeps a stalled upstream server from blocking the caller forever.
    res = requests.get(base_url, params=params, timeout=10)
    res.raise_for_status()
    html = etree.HTML(res.content.decode("utf-8"))
    rows = html.xpath("//tbody[@id='tdata']/tr[@class='t_tr1']")
    # Skip every cell from position 9 up to the one before last.
    # An alternative filter is "./td[not(text()='\xa0')]/text()", where \xa0 is
    # the non-breaking space (&nbsp;).
    cell_xpath = "./td[not(position()>=9 and position()<=last()-1)]/text()"
    # xpath() yields lxml.etree._ElementUnicodeResult objects; store plain str.
    return [[str(text) for text in row.xpath(cell_xpath)] for row in rows]
数据处理
接下来将上一步返回的数据列表转换为Dataframe,为了方便后续分析,日期列设为索引,数据类型也要做相应的转换,并且新构建了两个特征:奇偶次数和连号个数。
详细代码:
@classmethod
def data_processing(cls, data: List[list]) -> pd.DataFrame:
    """Turn scraped rows into a date-indexed DataFrame with feature columns.

    Each row must hold the period, one value per entry of ``cls.cols`` and
    the draw date, in that order. Rows are sorted by date, the date column is
    parsed as ``%Y-%m-%d`` and becomes the index, the ``cls.cols`` columns are
    cast to ``int``, and the result is passed through ``cls.build_feature``.
    """
    header = ['period', *cls.cols, 'date']
    frame = pd.DataFrame(data, columns=header)
    frame = frame.sort_values('date')
    frame['date'] = pd.to_datetime(frame['date'], format='%Y-%m-%d')
    frame = frame.set_index('date')
    # The scraper delivers every cell as text; the ball columns must be numeric.
    frame = frame.astype(dict.fromkeys(cls.cols, int))
    return cls.build_feature(frame)
@classmethod
def build_feature(cls, data: pd.DataFrame) -> pd.DataFrame:
    """Add the ``odd`` and ``continuous`` feature columns to ``data``.

    Both features are computed per row over the ``cls.cols`` columns, in
    column order:

    * ``odd``: how many of the numbers are odd.
    * ``continuous``: length of the longest run in which each number is
      exactly one greater than the number before it (1 when there is no
      such pair, 0 when ``cls.cols`` is empty).

    Args:
        data: Frame holding integer ``cls.cols`` columns. It is modified in
            place.

    Returns:
        The same ``data`` object with the two columns added.
    """

    def count_odd(numbers) -> int:
        return sum(1 for n in numbers if n & 1)

    def longest_run(numbers) -> int:
        if not numbers:
            return 0
        best = run = 1
        for prev, cur in zip(numbers, numbers[1:]):
            run = run + 1 if cur - prev == 1 else 1
            best = max(best, run)
        return best

    balls = data[cls.cols]
    # Rows arrive as label-indexed Series; convert them to plain lists so the
    # helpers work by position instead of relying on the deprecated integer
    # fallback of Series.__getitem__.
    data['odd'] = balls.apply(lambda row: count_odd(row.tolist()), axis=1)
    data['continuous'] = balls.apply(lambda row: longest_run(row.tolist()), axis=1)
    return data
绘制图表
此前前端vue项目用过echarts库,所以对其中的配置参数还有点印象,官方文档也很详细,包含各种图表的示例demo。
关于我第一次使用pyecharts遇到的坑:
- 若使用jupyter环境,必须声明环境类型,否则图表无法显示。
# Jupyter Notebook environment: the notebook type has to be declared,
# otherwise charts are not displayed.
from pyecharts.globals import ThemeType, CurrentConfig, NotebookType
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_NOTEBOOK
- pyecharts不支持pandas中的数据类型,可以通过tolist()方法转换
- Page组件中无法嵌入Tab组件,会报错找不到chart_id
- 以百分比形式为Tab组件内嵌的Line折线图设置图表宽度,无法正常显示,而px单位可以(不知道是不是bug)
- 图表渲染方式:
  - 生成html文件:render()
  - jupyter notebook环境:render_notebook()
  - Flask或Django直接渲染:render_embed()
绘图相关代码:
由于创建图表基本上都涉及全局配置和初始化配置项,自定义了两个方法方便调用
import pyecharts.options as opts
from pyecharts.globals import ThemeType
def update_global_opts(chart, title_opts=None, xaxis_opts=None, yaxis_opts=None, legend_opts=None, x_rotate=None):
    """Assemble the keyword arguments for a chart's ``set_global_opts`` call.

    Args:
        chart: Accepted for call-site symmetry; not used by this function.
        title_opts: Optional dict of ``opts.TitleOpts`` keyword arguments.
        xaxis_opts: Optional dict of extra ``opts.AxisOpts`` keyword arguments
            for the x axis.
        yaxis_opts: Optional dict of extra ``opts.AxisOpts`` keyword arguments
            for the y axis.
        legend_opts: Optional dict of ``opts.LegendOpts`` keyword arguments.
        x_rotate: Rotation passed to the x-axis label options.

    Returns:
        A dict that always contains ``xaxis_opts``, ``yaxis_opts``,
        ``tooltip_opts`` and ``toolbox_opts``, plus ``title_opts`` and
        ``legend_opts`` when the matching argument is truthy.
    """
    # Both axes always show split lines; caller-supplied extras are added on top.
    x_axis = opts.AxisOpts(
        splitline_opts=opts.SplitLineOpts(is_show=True),
        axislabel_opts=opts.LabelOpts(rotate=x_rotate),
        **(xaxis_opts or {})
    )
    y_axis = opts.AxisOpts(
        splitline_opts=opts.SplitLineOpts(is_show=True),
        **(yaxis_opts or {})
    )
    toolbox_features = {
        "saveAsImage": {},
        "dataZoom": {"yAxisIndex": "none"},
        "restore": {},
        "magicType": {"show": True, "type": ["line", "bar"]},
        "dataView": {},
    }
    settings = {
        'xaxis_opts': x_axis,
        'yaxis_opts': y_axis,
        'tooltip_opts': opts.TooltipOpts(trigger="axis", axis_pointer_type="cross"),
        'toolbox_opts': opts.ToolboxOpts(is_show=True, orient='horizontal', feature=toolbox_features),
    }
    if title_opts:
        settings['title_opts'] = opts.TitleOpts(**title_opts)
    if legend_opts:
        settings['legend_opts'] = opts.LegendOpts(**legend_opts)
    return settings
def update_init_opts(width="900px", theme=None, bg_color=None):
    """Create the ``opts.InitOpts`` shared by the charts in this file.

    Args:
        width: Chart width, passed through unchanged.
        theme: Chart theme; ``ThemeType.WESTEROS`` is used when falsy.
        bg_color: Background colour; only passed on when truthy.

    Returns:
        The configured ``opts.InitOpts`` instance.
    """
    settings = {'width': width, 'theme': theme or ThemeType.WESTEROS}
    if bg_color:
        settings['bg_color'] = bg_color
    return opts.InitOpts(**settings)
具体绘图代码:
def draw_trend(self) -> Page:
    """Build a page with one frequency-bar plus trend-line grid per column.

    For every column in ``self.cols`` a horizontal bar chart of how often
    each number occurs in ``self.data`` is placed next to a line chart of the
    drawn number over time, and the pair is added to the page as one grid.

    Returns:
        The ``Page`` holding one ``Grid`` per column.
    """
    frame = self.data
    page = Page(layout=Page.DraggablePageLayout, page_title='实时彩票分析平台')
    first_day = frame.index[0].strftime("%Y-%m-%d")
    last_day = frame.index[-1].strftime("%Y-%m-%d")
    for name in self.cols:
        freq = frame[name].value_counts().sort_index()

        bar = Bar(init_opts=update_init_opts())
        bar.add_yaxis('出现次数', freq.values.tolist())
        bar.add_xaxis(freq.index.tolist())
        bar.reversal_axis()

        line = Line(init_opts=update_init_opts())
        line.add_xaxis(frame.index.strftime("%m-%d").tolist())
        line.add_yaxis('号码', frame[name].values.tolist(), linestyle_opts=opts.LineStyleOpts(width=2))

        # 'pos_left': '2%' places the title 2% in from the left edge.
        bar_opts = update_global_opts(
            bar,
            yaxis_opts={'type_': 'category'},
            title_opts={'title': f'【{first_day}至{last_day}】{name}次数', 'pos_left': '2%'},
            legend_opts={'pos_left': '25%', 'is_show': False},
        )
        line_opts = update_global_opts(
            line,
            x_rotate=45,
            title_opts={'title': f'{name}号码走势', 'pos_left': '40%'},
            legend_opts={'pos_right': '25%', 'is_show': False},
        )
        line.set_global_opts(**line_opts)
        bar.set_global_opts(**bar_opts)

        # The bar chart takes the left part of the grid, the line chart the rest.
        grid = Grid(init_opts=update_init_opts(width='100%'))
        grid.add(bar, grid_opts=opts.GridOpts(pos_right="75%"))
        grid.add(line, grid_opts=opts.GridOpts(pos_left="35%"))
        page.add(grid)
    return page
def draw_features(self) -> Line:
    """Plot the ``odd`` and ``continuous`` columns of ``self.data`` as lines.

    The x axis carries the index formatted as ``%m-%d``; both series share
    one full-width chart whose x-axis labels are rotated by 30 degrees.

    Returns:
        The configured ``Line`` chart.
    """
    data = self.data
    # The unused start/end dates were dropped: they did nothing except raise
    # IndexError when ``data`` is empty.
    line = (
        Line(init_opts=update_init_opts(width='100%'))
        .add_xaxis(data.index.strftime("%m-%d").tolist())
        .add_yaxis('奇数个数', data['odd'].tolist())
        .add_yaxis('连号个数', data['continuous'].tolist())
    )
    line.set_global_opts(**update_global_opts(line, x_rotate=30))
    return line
结合Flask
步骤:
- 创建Flask项目
- 找到pyecharts库安装位置,拷贝路径pyecharts.render.templates中的文件至项目templates目录中
- 设置环境变量CurrentConfig.GLOBAL_ENV = Environment(loader=FileSystemLoader("./templates"))
- 运行
按照官网文档中from jinja2 import Markup会报错(新版Jinja2已移除Markup),可改为from jinja2.utils import markupsafe,更规范的写法是from markupsafe import Markup
Flask app.py文件:
import datetime
import random
from typing import List
import requests
from flask import Flask, request, render_template
from jinja2 import Environment, FileSystemLoader
from lxml import etree
import platform
from scipy.stats import truncnorm
from pyecharts.globals import CurrentConfig
CurrentConfig.GLOBAL_ENV = Environment(loader=FileSystemLoader("./templates"))
from flask_bootstrap import Bootstrap4
from lottery_predict import Lottery
# Static files are served from the same folder as the templates.
app = Flask(__name__, static_folder="templates")
bootstrap = Bootstrap4()
bootstrap.init_app(app)
# Base URL handed to the front end: localhost on Windows, the fixed server
# address on every other platform.
HOST = "http://127.0.0.1:5000/" if platform.system() == 'Windows' else "http://47.98.97.198:5000/"
@app.route('/')
def index():
    """Render the landing page.

    Query parameters:
        limit: Passed to the template as received; 50 when missing or empty.
        col: Passed to the template as received; ``'red1'`` when missing or
            empty.
        menu_id: ``1`` selects the ``features`` URL; anything else, including
            a non-numeric value, selects the ``trend`` URL.

    The template receives the chosen URL under the name ``menu_id``.
    """
    cur_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
    limit = request.args.get('limit') or 50
    col = request.args.get('col') or 'red1'
    # type=int yields None for a non-numeric menu_id instead of raising
    # ValueError, which would have surfaced as an HTTP 500.
    menu_id = request.args.get('menu_id', type=int)
    endpoint = "features" if menu_id == 1 else "trend"
    return render_template("./html/index.html", cur_time=cur_time, limit=limit, menu_id=HOST + endpoint, col=col)
@app.route('/trend')
def trend():
    """Return the serialized options of the trend chart.

    Query parameters:
        limit: Number of draws to fetch; 50 when missing, empty or not an
            integer.
        col: Column name forwarded to ``Lottery.draw_trend``; ``None`` when
            missing.
    """
    # type=int with a default replaces int(limit), which raised ValueError
    # (HTTP 500) on a non-numeric value.
    limit = request.args.get('limit', default=50, type=int)
    col = request.args.get('col')
    data = fetch_data(limit=limit)
    # NOTE(review): Lottery comes from lottery_predict, which is not visible
    # here. Confirm that draw_trend accepts a column name (possibly None) and
    # returns a chart exposing dump_options_with_quotes().
    chart = Lottery(data).draw_trend(col)
    # Server-side rendering alternative: markupsafe.Markup(chart.render_embed())
    return chart.dump_options_with_quotes()
@app.route('/features')
def draw_charts():
    """Return the serialized options of the feature chart.

    Query parameters:
        limit: Number of draws to fetch; 30 when missing, empty or not an
            integer.
    """
    # type=int with a default replaces int(limit), which raised ValueError
    # (HTTP 500) on a non-numeric value.
    limit = request.args.get('limit', default=30, type=int)
    data = fetch_data(limit=limit)
    chart = Lottery(data).draw_features()
    return chart.dump_options_with_quotes()
def fetch_data(start: "int | None" = None, end: "int | str" = '', limit: "int | None" = None) -> List[list]:
    """Download the draw-history page from 500.com and parse it into rows.

    Args:
        start: First draw period to request. Left out of the query when
            ``None`` (previously the literal text "None" was sent).
        end: Last draw period to request; sent as an empty value by default.
        limit: When truthy, ``limit=<limit>&sort=0`` is requested instead and
            ``start``/``end`` are ignored (presumably the most recent draws;
            confirm against the site).

    Returns:
        One list of strings per ``<tr class="t_tr1">`` row inside
        ``<tbody id="tdata">``: the text of the first eight cells followed by
        the text of the last cell.

    Raises:
        requests.RequestException: On connection errors, timeouts or a
            non-2xx HTTP status.
    """
    base_url = "http://datachart.500.com/ssq/history/newinc/history.php"
    if limit:
        params = {"limit": limit, "sort": 0}
    else:
        # requests omits parameters whose value is None, so an unset ``start``
        # is simply not sent.
        params = {"start": start, "end": end}
    # A timeout keeps a stalled upstream server from blocking a Flask worker.
    res = requests.get(base_url, params=params, timeout=10)
    res.raise_for_status()
    html = etree.HTML(res.content.decode("utf-8"))
    rows = html.xpath("//tbody[@id='tdata']/tr[@class='t_tr1']")
    # Skip every cell from position 9 up to the one before last.
    # An alternative filter is "./td[not(text()='\xa0')]/text()", where \xa0 is
    # the non-breaking space (&nbsp;).
    cell_xpath = "./td[not(position()>=9 and position()<=last()-1)]/text()"
    # xpath() yields lxml.etree._ElementUnicodeResult objects; store plain str.
    return [[str(text) for text in row.xpath(cell_xpath)] for row in rows]
@app.route('/latest', methods=['GET'])
@app.context_processor
def get_latest():
    """Expose the newest draw as ``latest`` (the numbers) and ``date``.

    Registered both as the ``/latest`` route and as a template context
    processor, so it runs for every rendered template as well.

    Returns:
        ``{"latest": [...], "date": ...}`` built from the single row that
        ``fetch_data(limit=1)`` returns: ``latest`` is the row without its
        first and last entries, ``date`` is the last entry. When no row could
        be scraped, ``latest`` is an empty list and ``date`` is ``None``.
    """
    rows = fetch_data(limit=1)
    # An empty scrape result must not raise IndexError here: as a context
    # processor this function runs on every template render.
    if not rows or not rows[0]:
        return {"latest": [], "date": None}
    newest = rows[0]
    return {"latest": newest[1:-1], "date": newest[-1]}
@app.route('/predict', methods=['GET'], endpoint='predict')
def auto_predict():
    """Generate one random pick from the last 100 fetched draws.

    For every column whose name contains ``'red'``, a value is drawn from a
    normal distribution truncated to [1, 33], centred on that column's mode
    with a standard deviation of 1, and rounded. A uniformly random integer
    from 1 to 16 is appended as the final number.

    Returns:
        ``{"predict_res": [...]}`` with the red picks followed by the last
        number.
    """
    history = Lottery.data_processing(fetch_data(limit=100))
    lower, upper, spread = 1, 33, 1
    picks = []
    # NOTE(review): each column is sampled independently, so the same red
    # number can appear more than once. Confirm that this is acceptable.
    for name in history.columns:
        if 'red' not in name:
            continue
        centre = history[name].mode()[0]
        # truncnorm takes its bounds in units of standard deviations from loc.
        dist = truncnorm((lower - centre) / spread, (upper - centre) / spread, loc=centre, scale=spread)
        picks.append(round(dist.rvs(size=1)[0]))
    picks.append(random.randint(1, 16))
    return {"predict_res": picks}
if __name__ == '__main__':
    import os

    # The Werkzeug debugger lets anyone who can reach it run arbitrary code,
    # and this server listens on every interface. Debug mode is therefore
    # opt-in: start with FLASK_DEBUG=1 for local development.
    app.run(host='0.0.0.0', port=5000, debug=os.environ.get('FLASK_DEBUG') == '1')
基于Bootstrap效果:
总结
至此接触过matplotlib、plotly、plotly express、dash、seaborn、pyecharts,特点如下:
| matplotlib | plotly | plotly express | dash | seaborn | pyecharts |
|---|---|---|---|---|---|
| 配置灵活 文档完善 交互性差 代码量多 | 代码量较多 文档不太友好 | plotly进一步封装,支持动态交互,文档不太友好 | 同样是plotly生态一员,支持web框架整合 | 代码量少,图表比matplotlib好看,也不支持交互 | 代码量较多 文档完善 样式丰富 |
seaborn
plotly express