华为AI Gallery社区《基于机器学习的欧洲杯赛事预测》技术分析系列——(完)预测

66 阅读6分钟

如果您只是单纯地进行数据分析,利用赔率、亚盘、凯利、必发等数据找出与赛果的关系,那么数据通过免费渠道获得即可。不要先消耗大量的时间和精力去做爬虫,等到数据齐全再去做数据分析,因为数据分析也不一定会给您带来任何启示。

本期介绍如何对赛事进行赛果预测。

赛果预测

本小节根据第2节对本届欧洲杯赛事进行赛果预测。

  • 将第1节2015-01-01至2021-05-31的所有数据作为训练集,训练模型。

  • 处理数据:将需要预测的数据按照第1节的数据处理步骤进行处理,得到预测集。

  • 得到预测结果。 #3.1 将2015-01-01至2021-05-31的所有数据作为训练集,训练模型 inputs = { "dataframe":missing_value_impute.get_outputs()['output_port_1'] #@input {"label":"dataframe","type":"DataFrame"} } params = { "inputs": inputs, "column_name": "date", #@param {"label":"column_name","type":"string","required":"true","helpTip":""} "condition_map_str": "BETWEEN:2015-01-01,2021-05-31" #@param {"label":"condition_map_str","type":"string","required":"true","helpTip":""} } dataset_filter_all_data = MLSDatasetFilter(**params) dataset_filter_all_data.run() #@output {"label":"dataframe","name":"dataset_filter_all_data.get_outputs()['output_port_1']","type":"DataFrame"}

# Keep only the label and feature columns of the filtered training data.
inputs = {
    "dataframe": dataset_filter_all_data.get_outputs()['output_port_1']  #@input {"label":"dataframe","type":"DataFrame"}
}
params = {
    "inputs": inputs,
    # One comma-separated string; adjacent literals are concatenated so the
    # value contains no line breaks.
    "selected_cols_str": (
        "month, season, home_team, away_team, tournament, neutral, win_result,"
        "num_5,diff_num_5,win_num_5,lose_num_5,"
        "num_3,diff_num_3,win_num_3,lose_num_3,"
        "num_1,diff_num_1,win_num_1,lose_num_1,"
        "num_team_5,diff_num_team_5,win_num_team_5,lose_num_team_5,"
        "num_team_3,diff_num_team_3,win_num_team_3,lose_num_team_3,"
        "num_team_1,diff_num_team_1,win_num_team_1,lose_num_team_1,"
        "num_year_15,diff_num_year_15,win_num_year_15,lose_num_year_15,"
        "num_year_7,diff_num_year_7,win_num_year_7,lose_num_year_7,"
        "num_year_3,diff_num_year_3,win_num_year_3,lose_num_year_3,"
        "num_year_2,diff_num_year_2,win_num_year_2,lose_num_year_2,"
        "num_year_1,diff_num_year_1,win_num_year_1,lose_num_year_1,"
        "away_num,away_win_num,away_lose_num,away_win_rate,"
        "home_num,home_win_num,home_lose_num,home_win_rate"
    )
}
select_columns_all_data = MLSSelectColumns(**params)
select_columns_all_data.run()
#@output {"label":"dataframe","name":"select_columns_all_data.get_outputs()['output_port_1']","type":"DataFrame"}

# Train a logistic-regression classifier on the selected columns,
# with `win_result` as the label column.
inputs = {
    "dataframe": select_columns_all_data.get_outputs()['output_port_1']  #@input {"label":"dataframe","type":"DataFrame"}
}
params = {
    "inputs": inputs,
    "b_output_action": True,
    "b_use_default_encoder": True,  #@param {"label": "b_use_default_encoder", "type": "boolean", "required": "true", "helpTip": ""}
    "input_features_str": "",  #@param {"label": "input_features_str", "type": "string", "required": "false", "helpTip": ""}
    "outer_pipeline_stages": None,
    "label_col": "win_result",  #@param {"label": "label_col", "type": "string", "required": "true", "helpTip": "target label column"}
    "classifier_label_index_col": "label_index",  #@param {"label": "classifier_label_index_col", "type": "string", "required": "true", "helpTip": ""}
    "classifier_feature_vector_col": "model_features",  #@param {"label": "classifier_feature_vector_col", "type": "string", "required": "true", "helpTip": ""}
    "prediction_col": "prediction",  #@param {"label": "prediction_col", "type": "string", "required": "true", "helpTip": ""}
    "prediction_index_col": "prediction_index",  #@param {"label": "prediction_index_col", "type": "string", "required": "true", "helpTip": ""}
    "max_iter": 100,  #@param {"label": "max_iter", "type": "integer", "required": "true", "range": "(0,2147483647]", "helpTip": ""}
    "reg_param": 0,  #@param {"label": "reg_param", "type": "number", "required": "true", "range": "[0,none)", "helpTip": ""}
    "elastic_net_param": 0,  #@param {"label": "elastic_net_param", "type": "number", "required": "true", "range": "[0,none)", "helpTip": ""}
    "tol": 0.000001,  #@param {"label": "tol", "type": "number", "required": "true", "range": "(0,none)", "helpTip": ""}
    "fit_intercept": True,  #@param {"label": "fit_intercept", "type": "boolean", "required": "true", "helpTip": ""}
    "standardization": True,  #@param {"label": "standardization", "type": "boolean", "required": "true", "helpTip": ""}
    "aggregation_depth": 2,  #@param {"label": "aggregation_depth", "type": "integer", "required": "true", "range": "(0,2147483647]", "helpTip": ""}
    "family": "auto",  #@param {"label": "family", "type": "enum", "required": "true", "options":"auto,binomial,multinomial", "helpTip": ""}
    "lower_bounds_on_coefficients": None,
    "upper_bounds_on_coefficients": None,
    "lower_bounds_on_intercepts": None,
    "upper_bounds_on_intercepts": None
}
lr_classifier_model = MLSLogisticRegressionClassifier(**params)
lr_classifier_model.run()
#@output {"label":"pipeline_model","name":"lr_classifier_model.get_outputs()['output_port_1']","type":"PipelineModel"}

#3.2 处理预测数据集

初始化预测数据test_data

以2021-06-17土耳其VS威尔士为例,构建测试数据

date,home_team,away_team,tournament,neutral

# The fixture to predict. Every value is a string, including `neutral`.
test_data = {'date':'2021-06-17','home_team':'Turkey','away_team':'Wales','tournament':'UEFA Euro','neutral':'True'}

# Names of the numeric feature columns that init_test_data resets to 0.0.
numeric_features = [
    'num_5','diff_num_5','win_num_5','lose_num_5',
    'num_3','diff_num_3','win_num_3','lose_num_3',
    'num_1','diff_num_1','win_num_1','lose_num_1',
    'num_team_5','diff_num_team_5','win_num_team_5',
    'num_team_3', 'diff_num_team_3','win_num_team_3',
    'num_team_1','diff_num_team_1','win_num_team_1',
    'lose_num_team_5','lose_num_team_3','lose_num_team_1',
    'num_year_15','diff_num_year_15','win_num_year_15',
    'num_year_7','diff_num_year_7','win_num_year_7',
    'num_year_3','diff_num_year_3','win_num_year_3',
    'num_year_2','diff_num_year_2','win_num_year_2',
    'num_year_1','diff_num_year_1','win_num_year_1',
    'lose_num_year_15','lose_num_year_7','lose_num_year_3','lose_num_year_2','lose_num_year_1',
    'away_num','away_win_num','away_lose_num','away_win_rate',
    'home_num','home_win_num','home_lose_num','home_win_rate',
]


def init_test_data(test_data):
    """Set every key listed in ``numeric_features`` to 0.0 in ``test_data``.

    The dict is modified in place and nothing is returned. Keys that are
    not in ``numeric_features`` are left untouched.
    """
    for col in numeric_features:
        test_data[col] = 0.0


init_test_data(test_data)

生成测试数据

import numpy as np import pandas as pd

读取数据

# Load raw_data.csv into a DataFrame; it is later passed to process_test_data.
df = pd.read_csv("raw_data.csv")

生成测试数据

def _fill_window(test_data, matches, count, tag):
    """Store num/diff/win/lose aggregates of the first ``count`` rows under ``tag``."""
    window = matches.iloc[:count]
    test_data['num_' + tag] = count
    test_data['diff_num_' + tag] = window['diff'].mean()
    test_data['win_num_' + tag] = window['win_result'].sum()
    test_data['lose_num_' + tag] = test_data['num_' + tag] - test_data['win_num_' + tag]


def process_test_data(test_data, df):
    """Fill ``test_data`` in place with the features of one fixture.

    Args:
        test_data: dict with at least the string keys ``'date'``
            (``YYYY-MM-DD``), ``'home_team'`` and ``'away_team'``. Feature
            keys are written into it; a feature group with no matching
            rows is left untouched.
        df: DataFrame with the columns ``date``, ``home_team``,
            ``away_team``, ``home_score``, ``away_score`` and
            ``tournament``. NOTE: ``date`` is converted to datetime and
            the columns ``diff`` and ``win_result`` are added on the
            caller's object.

    Returns:
        None. The result is the mutated ``test_data``.

    Feature groups (rows with ``tournament == 'Friendly'`` are ignored):
        * ``*_5/_3/_1``: the home team's most recent N home matches.
        * ``*_team_5/_3/_1``: the most recent N home matches against this
          away team.
        * ``*_year_15/_7/_3/_2/_1``: the leading run of most recent home
          matches whose calendar year is at most N years before the
          fixture year.
        * ``home_*`` / ``away_*``: totals over all home matches of the
          home team and all away matches of the away team.

    ``win_result`` is 1 only when the home side scored more, so draws are
    counted as home losses and as away wins.
    """
    df['date'] = pd.to_datetime(df['date'])
    df['diff'] = df['home_score'] - df['away_score']
    df['win_result'] = df['diff'].apply(lambda x: 1 if x > 0 else 0)
    df = df[df['tournament'] != 'Friendly']

    home_team = test_data['home_team']
    away_team = test_data['away_team']
    home_team_year = int(test_data['date'].split('-')[0])

    # Most recent match first, so "the first k rows" means "the last k matches".
    home_team_df = (
        df[df['home_team'] == home_team]
        .sort_values(['date'], ascending=False)
        .reset_index()
    )
    home_away_team_df = (
        df[(df['home_team'] == home_team) & (df['away_team'] == away_team)]
        .sort_values(['date'], ascending=False)
        .reset_index()
    )
    away_team_df = df[df['away_team'] == away_team]

    home_count = home_team_df.shape[0]
    if home_count > 0:
        for suffix in ['5', '3', '1']:
            # Never report more matches than actually exist.
            _fill_window(test_data, home_team_df, min(int(suffix), home_count), suffix)

    head_to_head_count = home_away_team_df.shape[0]
    if head_to_head_count > 0:
        for suffix in ['5', '3', '1']:
            _fill_window(
                test_data,
                home_away_team_df,
                min(int(suffix), head_to_head_count),
                'team_' + suffix,
            )

    if home_count > 0:
        years_back = home_team_year - home_team_df['date'].dt.year
        for suffix in ['15', '7', '3', '2', '1']:
            within = (years_back <= int(suffix)).to_numpy()
            # Length of the leading run of in-window rows; stops at the end
            # of the frame instead of indexing one row past it.
            count = home_count if within.all() else int(within.argmin())
            if count > 0:
                _fill_window(test_data, home_team_df, count, 'year_' + suffix)
        test_data['home_num'] = home_count
        test_data['home_win_num'] = home_team_df['win_result'].sum()
        test_data['home_lose_num'] = test_data['home_num'] - test_data['home_win_num']
        test_data['home_win_rate'] = test_data['home_win_num'] / test_data['home_num']

    if away_team_df.shape[0] > 0:
        test_data['away_num'] = away_team_df.shape[0]
        # A home win (win_result == 1) is a loss for the away side.
        test_data['away_lose_num'] = away_team_df['win_result'].sum()
        test_data['away_win_num'] = test_data['away_num'] - test_data['away_lose_num']
        test_data['away_win_rate'] = test_data['away_win_num'] / test_data['away_num']

    # Both are stored as strings; season is the month divided by 3, truncated.
    test_data['month'] = str(int(test_data['date'].split('-')[1]))
    test_data['season'] = str(int(int(test_data['date'].split('-')[1]) / 3))

保存测试数据

# Compute the features for the fixture (fills test_data in place).
process_test_data(test_data,df)
# Drop the raw date so it is not written to the prediction file.
test_data.pop('date')
# Write the single fixture as a one-row CSV file.
test_df = pd.DataFrame([test_data])
test_df.to_csv('./predict.csv')

# 3.3 Get the prediction result (the original header repeated the number 3.2).
# Read the prediction CSV written in the previous step.
params = {
    "input_file_path": "./predict.csv",  #@param {"label":"input_file_path","type":"string","required":"true","helpTip":""}
    "format": "csv",  #@param {"label":"format","type":"string","required":"false","helpTip":""}
    "has_header": True,  #@param {"label":"has_header","type":"boolean","required":"false","helpTip":""}
    "delimiter": ","  #@param {"label":"delimiter","type":"string","required":"false","helpTip":""}
}
read_predict_data = MLSReadData(**params)
read_predict_data.run()
#@output {"label":"dataframe","name":"read_predict_data.get_outputs()['output_port_1']","type":"DataFrame"}

# Cast the neutral, month and season columns of the prediction data to string.
inputs = {
    "dataframe": read_predict_data.get_outputs()['output_port_1']  #@input {"label":"dataframe","type":"DataFrame"}
}
params = {
    "inputs": inputs,
    "column_type_map_str": "neutral:string,month:string,season:string"  #@param {"label":"column_type_map_str","type":"string","required":"true","helpTip":""}
}
modify_predict_data_type = MLSModifyDataType(**params)
modify_predict_data_type.run()
#@output {"label":"dataframe","name":"modify_predict_data_type.get_outputs()['output_port_1']","type":"DataFrame"}

# Run the trained logistic-regression pipeline model on the typed prediction data.
inputs = {
    "dataframe": modify_predict_data_type.get_outputs()['output_port_1'],  #@input {"label":"dataframe","type":"DataFrame"}
    "pipeline_model": lr_classifier_model.get_outputs()['output_port_1']  #@input {"label":"pipeline_model","type":"PipelineModel"}
}
params = {
    "inputs": inputs
}
model_predict = MLSModelPredict(**params)
model_predict.run()
#@output {"label":"dataframe","name":"model_predict.get_outputs()['output_port_1']","type":"DataFrame"}

# Convert the prediction output to pandas and show the teams with the predicted label.
inputs = {
    "dataframe": model_predict.get_outputs()['output_port_1']  #@input {"type":"DataFrame", "label": "dataframe"}
}
p_df = inputs["dataframe"].toPandas()
p_df[['home_team','away_team','prediction']]
# Displayed output:
#   home_team     away_team     prediction
#   Turkey        Wales         0
#预测值为0,表示模型预测主队Turkey不胜(win_result仅在主队进球数多于客队时为1),即平局或Wales获胜。