Pyecharts结合Flask案例: 双色球数据分析

1,238 阅读5分钟

我正在参加「掘金·启航计划」

近段时间又迷上了双色球,即使已经知道这是一个随机过程。曾经一个朋友说过:一注彩票付出两块钱,带来的是对生活的一份期待。当然研究彩票不是目的而是手段,主要还是借此学习一下可视化库pyecharts。

官方文档 pyecharts - A Python Echarts Plotting Library built with love.
图表Demo - Document (pyecharts.org)

数据抓取

数据来源:
datachart.500.com/ssq/history…
F12发现数据请求接口网址是:
datachart.500.com/ssq/history…

返回的是html文本,结果如下,我们需要解析的数据位于<tbody id="tdata">

image.png

这里通过xpath来提取(也可以用BeautifulSoup)。首先找到id为tdata的tbody标签,然后选取class属性为t_tr1的tr子节点,再遍历所有子节点,找到对应的期数+6个红球+1个蓝球+开奖日期这9列。不知道是不是编码的原因,提取出来的文本类型为lxml.etree._ElementUnicodeResult,需要将它转换为字符串。

详细代码:

def fetch_data(start: int = None, end: int = '', limit: int = None) -> List[list]:
    """Download the draw-history page and parse it into rows of strings.

    Args:
        start: First period of the requested range. Left out of the query
            string when ``None``.
        end: Last period of the requested range; sent as given (the default
            is an empty value).
        limit: When truthy, request ``limit`` draws with ``sort=0`` instead
            of a ``start``/``end`` range.

    Returns:
        One list of ``str`` per ``<tr class="t_tr1">`` row found inside
        ``<tbody id="tdata">``: the text of the first eight cells plus the
        last cell (expected to be period, six red balls, blue ball and draw
        date).

    Raises:
        requests.RequestException: On timeout, connection failure or a
            non-2xx response.
    """
    base_url = "http://datachart.500.com/ssq/history/newinc/history.php"
    if limit:
        params = {"limit": limit, "sort": 0}
    else:
        # requests omits keys whose value is None, so an unset ``start`` is
        # no longer sent to the server as the literal text "None".
        params = {"start": start, "end": end}

    # Without a timeout a stalled upstream would block the caller forever.
    res = requests.get(base_url, params=params, timeout=10)
    # Fail loudly instead of parsing an error page as if it were data.
    res.raise_for_status()
    html = etree.HTML(res.content.decode("utf-8"))

    rows = html.xpath("//tbody[@id='tdata']/tr[@class='t_tr1']")
    # Keep cells 1-8 and the final cell, skipping everything in between.
    # An alternative is "./td[not(text()='\xa0')]/text()", where \xa0 is the
    # non-breaking space (&nbsp;).
    cell_query = "./td[not(position()>=9 and position()<=last()-1)]/text()"
    # xpath yields lxml.etree._ElementUnicodeResult objects; cast to plain str.
    return [[str(text) for text in row.xpath(cell_query)] for row in rows]

数据处理

接下来将上一步返回的数据列表转换为Dataframe,为了方便后续分析,日期列设为索引,数据类型也要做相应的转换,并且新构建了两个特征:奇偶次数和连号个数。
详细代码:

@classmethod
def data_processing(cls, data: List[list]) -> pd.DataFrame:
    """Turn the scraped rows into a date-indexed frame with feature columns.

    Args:
        data: Rows of strings laid out as period, one value per entry of
            ``cls.cols``, then the draw date in ``YYYY-MM-DD`` form.

    Returns:
        Whatever ``cls.build_feature`` returns for the frame sorted by date,
        indexed by the parsed date, with the ``cls.cols`` columns cast to int.
    """
    header = ['period'] + cls.cols + ['date']
    frame = pd.DataFrame(data, columns=header)
    # Dates are still strings here; the sort is done before parsing them.
    frame = frame.sort_values('date')
    frame['date'] = pd.to_datetime(frame['date'], format='%Y-%m-%d')
    frame = frame.set_index('date')
    # Ball numbers arrive as text; the period column is left untouched.
    int_dtypes = dict.fromkeys(cls.cols, int)
    return cls.build_feature(frame.astype(int_dtypes))

@classmethod
def build_feature(cls, data: pd.DataFrame) -> pd.DataFrame:
    """Add the ``odd`` and ``continuous`` feature columns to *data*.

    ``odd`` is the number of odd values among the ``cls.cols`` columns of a
    row. ``continuous`` is the length of the longest stretch of neighbouring
    ``cls.cols`` values in which each value is exactly 1 larger than the one
    before it (1 when no such pair exists).

    Args:
        data: Frame containing integer columns named by ``cls.cols``. It is
            modified in place.

    Returns:
        The same ``data`` object with the two columns added.
    """
    def longest_run(values) -> int:
        """Return the longest stretch of values that step up by exactly 1."""
        best = current = 1
        for previous, value in zip(values, values[1:]):
            current = current + 1 if value - previous == 1 else 1
            best = max(best, current)
        return best

    numbers = data[cls.cols]
    # Vectorised parity count; assigned positionally so the index of
    # ``data`` plays no part in the assignment.
    data['odd'] = (numbers % 2 == 1).sum(axis=1).to_numpy()
    # Iterate over plain lists rather than row Series: positional ``row[0]``
    # on a label-indexed Series is deprecated in pandas 2.x.
    data['continuous'] = [longest_run(row) for row in numbers.values.tolist()]

    return data

绘制图表

此前前端vue项目用过echarts库,所以对其中的配置参数还有点印象,官方文档也很详细,包含各种图表的示例demo。

关于我第一次使用pyecharts遇到的坑:

  1. 若使用jupyter环境,必须声明环境类型,否则图表无法显示。
# Jupyter Notebook environment: the notebook type must be declared before any
# chart is created. NotebookType has to be imported as well, and a bare
# ``from ... import`` list must not end with a trailing comma.
from pyecharts.globals import CurrentConfig, NotebookType, ThemeType
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_NOTEBOOK
  2. pyecharts不支持pandas中的数据类型,可以通过tolist()方法转换

  3. Page组件中无法嵌入Tab组件,会报错找不到chart_id

  4. 以百分比形式为Tab组件内嵌的Line折线图设置图表宽度,无法正常显示,而px单位可以(不知道是不是bug)

  5. 图表渲染方式:
    生成html文件:render()
    jupyter notebook环境:render_notebook()
    Flask或Django直接渲染:render_embed()

绘图相关代码:

由于创建图表基本上都涉及全局配置和初始化配置项,自定义了两个方法方便调用

import pyecharts.options as opts
from pyecharts.globals import ThemeType


def update_global_opts(chart, title_opts=None, xaxis_opts=None, yaxis_opts=None, legend_opts=None, x_rotate=None):
    """Build the keyword arguments for a chart's ``set_global_opts`` call.

    Args:
        chart: Not used by this function; kept so existing callers that pass
            the chart keep working.
        title_opts: Optional dict of keyword arguments for ``opts.TitleOpts``.
        xaxis_opts: Optional dict of extra keyword arguments for the x-axis
            ``opts.AxisOpts``. Keys given here override the defaults.
        yaxis_opts: Same as ``xaxis_opts`` but for the y-axis.
        legend_opts: Optional dict of keyword arguments for
            ``opts.LegendOpts``.
        x_rotate: Rotation passed to the x-axis label options.

    Returns:
        A dict always containing ``xaxis_opts``, ``yaxis_opts``,
        ``tooltip_opts`` and ``toolbox_opts``, plus ``title_opts`` and
        ``legend_opts`` when the matching argument is truthy.
    """
    # Defaults first, caller overrides last. Merging into one dict means a
    # caller may also replace splitline_opts / axislabel_opts, which used to
    # fail with a duplicate-keyword TypeError.
    x_kwargs = {
        'splitline_opts': opts.SplitLineOpts(is_show=True),
        'axislabel_opts': opts.LabelOpts(rotate=x_rotate),
        **(xaxis_opts or {}),
    }
    y_kwargs = {
        'splitline_opts': opts.SplitLineOpts(is_show=True),
        **(yaxis_opts or {}),
    }

    dic = dict(
        xaxis_opts=opts.AxisOpts(**x_kwargs),
        yaxis_opts=opts.AxisOpts(**y_kwargs),
        tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="cross"),
        toolbox_opts=opts.ToolboxOpts(
            is_show=True,
            orient='horizontal',
            feature={
                "saveAsImage": {},
                "dataZoom": {"yAxisIndex": "none"},
                "restore": {},
                "magicType": {"show": True, "type": ["line", "bar"]},
                "dataView": {},
            },
        ),
    )

    # Title and legend are only configured on request.
    if title_opts:
        dic['title_opts'] = opts.TitleOpts(**title_opts)
    if legend_opts:
        dic['legend_opts'] = opts.LegendOpts(**legend_opts)

    return dic


def update_init_opts(width="900px", theme=None, bg_color=None):
    """Create the ``opts.InitOpts`` shared by the charts.

    Args:
        width: Chart width passed straight to ``InitOpts``.
        theme: Theme to use; ``ThemeType.WESTEROS`` when falsy.
        bg_color: Background colour; only passed on when truthy.

    Returns:
        The configured ``opts.InitOpts`` instance.
    """
    kwargs = {
        'width': width,
        'theme': theme if theme else ThemeType.WESTEROS,
    }
    if bg_color:
        kwargs['bg_color'] = bg_color
    return opts.InitOpts(**kwargs)

具体绘图代码:

def draw_trend(self, col=None) -> Page:
    """Draw, per number column, a frequency bar chart next to a trend line.

    Args:
        col: Name of a single column to draw. When falsy (the default) every
            column in ``self.cols`` is drawn, as before. The parameter exists
            because the ``/trend`` route calls ``draw_trend(col)``.

    Returns:
        A draggable-layout ``Page`` holding one ``Grid`` (bar + line) per
        drawn column.
    """
    data = self.data
    page = Page(layout=Page.DraggablePageLayout, page_title='实时彩票分析平台')
    start, end = data.index[0].strftime("%Y-%m-%d"), data.index[-1].strftime("%Y-%m-%d")
    columns = [col] if col else self.cols

    for name in columns:
        # How often each number appeared in this column, ordered by number.
        count = data[name].value_counts().sort_index()
        bar = (
            Bar(init_opts=update_init_opts())
            # Category axis first, then the series, as in the pyecharts examples.
            .add_xaxis(count.index.tolist())
            .add_yaxis('出现次数', count.values.tolist())
            .reversal_axis()
        )

        line = (
            Line(init_opts=update_init_opts())
            .add_xaxis(data.index.strftime("%m-%d").tolist())
            .add_yaxis('号码', data[name].values.tolist(), linestyle_opts=opts.LineStyleOpts(width=2))
        )

        # 'pos_left': '2%' keeps the title 2% away from the LEFT edge.
        # The title used to read "【{start}{end}{col}次数": the bracket was
        # never closed and the two dates ran together.
        global_opts_bar = update_global_opts(
            bar,
            yaxis_opts={'type_': 'category'},
            title_opts={'title': f'【{start}~{end}】{name}次数', 'pos_left': '2%'},
            legend_opts={'pos_left': '25%', 'is_show': False},
        )
        global_opts_line = update_global_opts(
            line,
            x_rotate=45,
            title_opts={'title': f'{name}号码走势', 'pos_left': '40%'},
            legend_opts={'pos_right': '25%', 'is_show': False},
        )

        line.set_global_opts(**global_opts_line)
        bar.set_global_opts(**global_opts_bar)

        # Bar occupies the left quarter of the grid, the line the right part.
        grid = (
            Grid(init_opts=update_init_opts(width='100%'))
            .add(bar, grid_opts=opts.GridOpts(pos_right="75%"))
            .add(line, grid_opts=opts.GridOpts(pos_left="35%"))
        )

        page.add(grid)
    return page

def draw_features(self) -> Line:
    """Plot the ``odd`` and ``continuous`` feature columns as two line series.

    Returns:
        A full-width ``Line`` chart whose x-axis is the draw date formatted
        as ``MM-DD``.
    """
    data = self.data
    # The former ``start``/``end`` locals were computed but never used.
    line = (
        Line(init_opts=update_init_opts(width='100%'))
        .add_xaxis(data.index.strftime("%m-%d").tolist())
        # pyecharts needs plain Python lists, hence tolist().
        .add_yaxis('奇数个数', data['odd'].values.tolist())
        .add_yaxis('连号个数', data['continuous'].values.tolist())
    )

    line.set_global_opts(**update_global_opts(line, x_rotate=30))

    return line

结合Flask

步骤:

  • 创建Flask项目
  • 找到pyecharts库安装位置,拷贝路径pyecharts.render.templates中的文件至项目templates目录中
  • 设置环境变量
    CurrentConfig.GLOBAL_ENV = Environment(loader=FileSystemLoader("./templates"))
    
  • 运行

按照官网文档中from jinja2 import Markup会报错(Jinja2 3.1起已移除jinja2.Markup),改为from jinja2.utils import markupsafe,使用时写作markupsafe.Markup(...)

Flask app.py文件:

import datetime
import random
from typing import List
import requests
from flask import Flask, request, render_template
from jinja2 import Environment, FileSystemLoader
from lxml import etree
import platform
from scipy.stats import truncnorm

from pyecharts.globals import CurrentConfig
CurrentConfig.GLOBAL_ENV = Environment(loader=FileSystemLoader("./templates"))

from flask_bootstrap import Bootstrap4
from lottery_predict import Lottery


# Flask application; static files are served from the templates directory.
app = Flask(__name__, static_folder="templates")
bootstrap = Bootstrap4()
bootstrap.init_app(app)

# Base URL the index page uses for the chart endpoints: the local address on
# Windows, the hard-coded server address everywhere else.
HOST = "http://127.0.0.1:5000/" if platform.system() == 'Windows' else "http://47.98.97.198:5000/"


@app.route('/')
def index():
    """Render the landing page.

    Query parameters:
        limit: Passed through to the template; 50 when missing or empty.
        col: Passed through to the template; ``'red1'`` when missing or empty.
        menu_id: ``1`` selects the ``features`` endpoint, anything else
            (including a non-numeric value) selects ``trend``.

    Returns:
        The rendered ``./html/index.html`` template. Its ``menu_id`` variable
        receives the full URL of the selected chart endpoint.
    """
    cur_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
    limit = request.args.get('limit') or 50
    col = request.args.get('col') or 'red1'
    # type=int yields None for non-numeric input instead of raising
    # ValueError, which previously turned a bad query string into a 500.
    menu_id = request.args.get('menu_id', type=int)
    endpoint = "features" if menu_id == 1 else "trend"

    return render_template("./html/index.html", cur_time=cur_time, limit=limit, menu_id=HOST + endpoint, col=col)


@app.route('/trend')
def trend():
    """Return the chart options of the trend chart.

    Query parameters:
        limit: Number of draws to fetch; 50 when missing, empty or not an
            integer.
        col: Column name forwarded to ``Lottery.draw_trend``.

    Returns:
        The string produced by the chart's ``dump_options_with_quotes()``.
    """
    # default/type make a malformed ``limit`` fall back to 50 instead of
    # raising ValueError inside int().
    limit = request.args.get('limit', default=50, type=int)
    col = request.args.get('col')

    data = fetch_data(limit=limit)
    chart = Lottery(data).draw_trend(col)
    # To embed rendered HTML instead of returning the options:
    # return markupsafe.Markup(chart.render_embed())
    return chart.dump_options_with_quotes()


@app.route('/features')
def draw_charts():
    """Return the chart options of the feature (odd / consecutive) chart.

    Query parameters:
        limit: Number of draws to fetch; 30 when missing, empty or not an
            integer.

    Returns:
        The string produced by the chart's ``dump_options_with_quotes()``.
    """
    # default/type make a malformed ``limit`` fall back to 30 instead of
    # raising ValueError inside int().
    limit = request.args.get('limit', default=30, type=int)
    data = fetch_data(limit=limit)
    chart = Lottery(data).draw_features()
    return chart.dump_options_with_quotes()


def fetch_data(start: int = None, end: int = '', limit: int = None) -> List[list]:
    """Download the draw-history page and parse it into rows of strings.

    Args:
        start: First period of the requested range. Left out of the query
            string when ``None``.
        end: Last period of the requested range; sent as given (the default
            is an empty value).
        limit: When truthy, request ``limit`` draws with ``sort=0`` instead
            of a ``start``/``end`` range.

    Returns:
        One list of ``str`` per ``<tr class="t_tr1">`` row found inside
        ``<tbody id="tdata">``: the text of the first eight cells plus the
        last cell (expected to be period, six red balls, blue ball and draw
        date).

    Raises:
        requests.RequestException: On timeout, connection failure or a
            non-2xx response.
    """
    base_url = "http://datachart.500.com/ssq/history/newinc/history.php"
    if limit:
        params = {"limit": limit, "sort": 0}
    else:
        # requests omits keys whose value is None, so an unset ``start`` is
        # no longer sent to the server as the literal text "None".
        params = {"start": start, "end": end}

    # Without a timeout a stalled upstream would block the request worker.
    res = requests.get(base_url, params=params, timeout=10)
    # Fail loudly instead of parsing an error page as if it were data.
    res.raise_for_status()
    html = etree.HTML(res.content.decode("utf-8"))

    rows = html.xpath("//tbody[@id='tdata']/tr[@class='t_tr1']")
    # Keep cells 1-8 and the final cell, skipping everything in between.
    # An alternative is "./td[not(text()='\xa0')]/text()", where \xa0 is the
    # non-breaking space (&nbsp;).
    cell_query = "./td[not(position()>=9 and position()<=last()-1)]/text()"
    # xpath yields lxml.etree._ElementUnicodeResult objects; cast to plain str.
    return [[str(text) for text in row.xpath(cell_query)] for row in rows]


@app.route('/latest', methods=['GET'])
@app.context_processor
def get_latest():
    """Expose the most recently fetched draw to templates and at ``/latest``.

    Registered both as a route and as a template context processor, so it
    runs (and fetches one row) on every template render.

    Returns:
        ``{"latest": [...], "date": ...}`` where ``latest`` is the fetched
        row without its first and last entries and ``date`` is its last
        entry. Both are empty when no row could be parsed.
    """
    rows = fetch_data(limit=1)
    if not rows or not rows[0]:
        # An IndexError here would break every page, because context
        # processors run for each render; fall back to empty values.
        return {"latest": [], "date": ""}
    newest = rows[0]
    return {"latest": newest[1:-1], "date": newest[-1]}


@app.route('/predict', methods=['GET'], endpoint='predict')
def auto_predict():
    """Generate one random pick based on the last 100 fetched draws.

    For every column whose name contains ``'red'`` a number is sampled from
    a normal distribution (standard deviation 1) centred on that column's
    mode and truncated to 1..33, then rounded. The blue number is a uniform
    random integer in 1..16.

    Returns:
        ``{"predict_res": [...]}`` with the red numbers followed by the blue
        number, all plain ``int``.
    """
    df = Lottery.data_processing(fetch_data(limit=100))
    red_cols = [name for name in df.columns if 'red' in name]
    low, high, sd = 1, 33, 1
    reds = []
    for name in red_cols:
        center = df[name].mode()[0]
        # truncnorm takes its bounds in standard deviations relative to loc;
        # loc is the mean and scale the standard deviation.
        dist = truncnorm((low - center) / sd, (high - center) / sd, loc=center, scale=sd)
        number = int(round(dist.rvs()))
        # Red numbers of one pick must be distinct, so redraw on a collision
        # with a number chosen for an earlier column.
        while number in reds:
            number = int(round(dist.rvs()))
        reds.append(number)
    blue = random.randint(1, 16)
    return {"predict_res": reds + [blue]}


if __name__ == '__main__':
    import os

    # The server listens on every interface. Debug mode enables the Werkzeug
    # interactive debugger, which lets anyone who can reach the port run
    # arbitrary code, so it is now opt-in: set FLASK_DEBUG=1 locally.
    app.run(host='0.0.0.0', port=5000, debug=os.environ.get('FLASK_DEBUG') == '1')

基于Bootstrap效果:


image.png ---

总结

至此接触过matplotlib、plotly、plotly express、dash、seaborn、pyecharts,特点如下:

| 库 | 特点 |
| --- | --- |
| matplotlib | 配置灵活、文档完善;交互性差、代码量多 |
| plotly | 代码量较多,文档不太友好 |
| plotly express | plotly进一步封装,支持动态交互,文档不太友好 |
| dash | 同样是plotly生态一员,支持web框架整合 |
| seaborn | 代码量少,图表比matplotlib好看,也不支持交互 |
| pyecharts | 代码量较多、文档完善、样式丰富 |

seaborn

奇偶性分析.png

plotly express

newplot (2).png