如果您只是单纯地进行数据分析,想利用赔率、亚盘、凯利、必发等数据找出它们与赛果的关系,那么通过免费渠道获取数据即可;不要先耗费大量的时间和精力去做爬虫、等到数据齐全再去做数据分析,因为数据分析也不一定会给您带来任何启示。
这期介绍如何对赛事进行赛果预测。
赛果预测
本小节根据第2节对本届欧洲杯赛事进行赛果预测。
-
将第1节2015-01-01至2021-05-31的所有数据作为训练集,训练模型。
-
处理数据,将需要预测的数据按照第1节的数据处理步骤进行处理,得到预测集。
-
得到预测结果。 #3.1 将2015-01-01至2021-05-31的所有数据作为训练集,训练模型 inputs = { "dataframe":missing_value_impute.get_outputs()['output_port_1'] #@input {"label":"dataframe","type":"DataFrame"} } params = { "inputs": inputs, "column_name": "date", #@param {"label":"column_name","type":"string","required":"true","helpTip":""} "condition_map_str": "BETWEEN:2015-01-01,2021-05-31" #@param {"label":"condition_map_str","type":"string","required":"true","helpTip":""} } dataset_filter_all_data = MLSDatasetFilter(**params) dataset_filter_all_data.run() #@output {"label":"dataframe","name":"dataset_filter_all_data.get_outputs()['output_port_1']","type":"DataFrame"}
# Select the label column (win_result) together with the feature columns for training.
inputs = {
    "dataframe": dataset_filter_all_data.get_outputs()['output_port_1'] #@input {"label":"dataframe","type":"DataFrame"}
}
params = {
    "inputs": inputs,
    # A plain "..." literal cannot span several lines, so the column list is
    # written as adjacent literals, which Python concatenates into one string.
    # NOTE(review): the pieces are joined without the original line breaks;
    # confirm that MLSSelectColumns expects a single comma-separated line.
    "selected_cols_str": (
        "month, season, home_team, away_team, tournament, neutral, win_result,num_5,diff_num_5,win_num_5,lose_num_5,num_3,diff_num_3,win_num_3,lose_num_3,num_1,diff_num_1,win_num_1,lose_num_1,"
        "num_team_5,diff_num_team_5,win_num_team_5,lose_num_team_5,num_team_3,diff_num_team_3,win_num_team_3,lose_num_team_3,num_team_1,diff_num_team_1,win_num_team_1,lose_num_team_1,"
        "num_year_15,diff_num_year_15,win_num_year_15,lose_num_year_15,num_year_7,diff_num_year_7,win_num_year_7,lose_num_year_7,num_year_3,diff_num_year_3,win_num_year_3,lose_num_year_3,num_year_2,diff_num_year_2,win_num_year_2,lose_num_year_2,num_year_1,diff_num_year_1,win_num_year_1,lose_num_year_1,"
        "away_num,away_win_num,away_lose_num,away_win_rate,home_num,home_win_num,home_lose_num,home_win_rate"
    )
}
select_columns_all_data = MLSSelectColumns(**params)
select_columns_all_data.run()
#@output {"label":"dataframe","name":"select_columns_all_data.get_outputs()['output_port_1']","type":"DataFrame"}
# Train a logistic-regression classifier on the selected columns, using
# "win_result" as the label column.
inputs = {
    "dataframe": select_columns_all_data.get_outputs()['output_port_1'] #@input {"label":"dataframe","type":"DataFrame"}
}
params = {
    "inputs": inputs,
    "b_output_action": True,
    "b_use_default_encoder": True, #@param {"label": "b_use_default_encoder", "type": "boolean", "required": "true", "helpTip": ""}
    "input_features_str": "", #@param {"label": "input_features_str", "type": "string", "required": "false", "helpTip": ""}
    "outer_pipeline_stages": None,
    "label_col": "win_result", #@param {"label": "label_col", "type": "string", "required": "true", "helpTip": "target label column"}
    "classifier_label_index_col": "label_index", #@param {"label": "classifier_label_index_col", "type": "string", "required": "true", "helpTip": ""}
    "classifier_feature_vector_col": "model_features", #@param {"label": "classifier_feature_vector_col", "type": "string", "required": "true", "helpTip": ""}
    "prediction_col": "prediction", #@param {"label": "prediction_col", "type": "string", "required": "true", "helpTip": ""}
    "prediction_index_col": "prediction_index", #@param {"label": "prediction_index_col", "type": "string", "required": "true", "helpTip": ""}
    "max_iter": 100, #@param {"label": "max_iter", "type": "integer", "required": "true", "range": "(0,2147483647]", "helpTip": ""}
    "reg_param": 0, #@param {"label": "reg_param", "type": "number", "required": "true", "range": "[0,none)", "helpTip": ""}
    "elastic_net_param": 0, #@param {"label": "elastic_net_param", "type": "number", "required": "true", "range": "[0,none)", "helpTip": ""}
    "tol": 0.000001, #@param {"label": "tol", "type": "number", "required": "true", "range": "(0,none)", "helpTip": ""}
    "fit_intercept": True, #@param {"label": "fit_intercept", "type": "boolean", "required": "true", "helpTip": ""}
    "standardization": True, #@param {"label": "standardization", "type": "boolean", "required": "true", "helpTip": ""}
    "aggregation_depth": 2, #@param {"label": "aggregation_depth", "type": "integer", "required": "true", "range": "(0,2147483647]", "helpTip": ""}
    "family": "auto", #@param {"label": "family", "type": "enum", "required": "true", "options":"auto,binomial,multinomial", "helpTip": ""}
    "lower_bounds_on_coefficients": None,
    "upper_bounds_on_coefficients": None,
    "lower_bounds_on_intercepts": None,
    "upper_bounds_on_intercepts": None
}
lr_classifier_model = MLSLogisticRegressionClassifier(**params)
lr_classifier_model.run()
#@output {"label":"pipeline_model","name":"lr_classifier_model.get_outputs()['output_port_1']","type":"PipelineModel"}
#3.2 处理预测数据集
初始化预测数据test_data
以2021-06-17土耳其VS威尔士为例,构建测试数据
date,home_team,away_team,tournament,neutral
# The match to predict; every value, including 'neutral', is kept as a string.
test_data = {
    'date': '2021-06-17',
    'home_team': 'Turkey',
    'away_team': 'Wales',
    'tournament': 'UEFA Euro',
    'neutral': 'True',
}

# Names of the numeric feature columns that are pre-filled with 0.0.
numeric_features = [
    'num_5', 'diff_num_5', 'win_num_5', 'lose_num_5',
    'num_3', 'diff_num_3', 'win_num_3', 'lose_num_3',
    'num_1', 'diff_num_1', 'win_num_1', 'lose_num_1',
    'num_team_5', 'diff_num_team_5', 'win_num_team_5',
    'num_team_3', 'diff_num_team_3', 'win_num_team_3',
    'num_team_1', 'diff_num_team_1', 'win_num_team_1',
    'lose_num_team_5', 'lose_num_team_3', 'lose_num_team_1',
    'num_year_15', 'diff_num_year_15', 'win_num_year_15',
    'num_year_7', 'diff_num_year_7', 'win_num_year_7',
    'num_year_3', 'diff_num_year_3', 'win_num_year_3',
    'num_year_2', 'diff_num_year_2', 'win_num_year_2',
    'num_year_1', 'diff_num_year_1', 'win_num_year_1',
    'lose_num_year_15', 'lose_num_year_7', 'lose_num_year_3',
    'lose_num_year_2', 'lose_num_year_1',
    'away_num', 'away_win_num', 'away_lose_num', 'away_win_rate',
    'home_num', 'home_win_num', 'home_lose_num', 'home_win_rate',
]


def init_test_data(test_data):
    """Set every column listed in ``numeric_features`` to 0.0 in ``test_data``.

    The dict is modified in place; keys that are not numeric features are
    left untouched. Nothing is returned.
    """
    for col in numeric_features:
        test_data[col] = 0.0


init_test_data(test_data)
生成测试数据
import numpy as np
import pandas as pd
读取数据
# Load the raw match table from the working directory; it is passed to
# process_test_data() further down.
df = pd.read_csv("raw_data.csv")
生成测试数据
def _store_form(test_data, prefix, suffix, rows):
    """Write match count, mean goal difference, home wins and non-wins of ``rows``."""
    played = len(rows)
    wins = int(rows['win_result'].sum())
    test_data[prefix + suffix] = played
    test_data['diff_' + prefix + suffix] = float(rows['diff'].mean())
    test_data['win_' + prefix + suffix] = wins
    # Everything that is not a home win (draws included) counts as "lose".
    test_data['lose_' + prefix + suffix] = played - wins


def process_test_data(test_data, df):
    """Fill ``test_data`` with the historical features of one upcoming match.

    Args:
        test_data: dict with at least 'date' ('YYYY-MM-DD' string),
            'home_team' and 'away_team'. It is updated in place. Feature keys
            for which no history exists are left untouched, so they should be
            pre-initialised by the caller.
        df: DataFrame with the columns 'date', 'home_team', 'away_team',
            'home_score', 'away_score' and 'tournament'. It is not modified;
            all work happens on a filtered copy.

    Returns:
        None.

    Written features (all computed on non-'Friendly' matches only):
        * num_N / diff_num_N / win_num_N / lose_num_N for N in 5, 3, 1:
          the home team's most recent N home matches.
        * num_team_N / ...: the same, restricted to home matches against the
          given away team.
        * num_year_N / ... for N in 15, 7, 3, 2, 1: home matches whose calendar
          year is at most N years before the year of the match.
        * home_num, home_win_num, home_lose_num, home_win_rate: all home
          matches of the home team.
        * away_num, away_win_num, away_lose_num, away_win_rate: all away
          matches of the away team.
        * month and season (month // 3), both stored as strings.

    NOTE(review): the original listing had lost its indentation; the home_*
    totals are assumed to belong inside the "home team has matches" guard,
    which also avoids dividing by zero.
    """
    matches = df[df['tournament'] != 'Friendly'].copy()
    matches['date'] = pd.to_datetime(matches['date'])
    matches['diff'] = matches['home_score'] - matches['away_score']
    # 1 when the home side won, 0 for a draw or an away win.
    matches['win_result'] = (matches['diff'] > 0).astype(int)

    date_parts = test_data['date'].split('-')
    match_year = int(date_parts[0])
    match_month = int(date_parts[1])

    is_home = matches['home_team'] == test_data['home_team']
    is_away = matches['away_team'] == test_data['away_team']
    # Newest match first, so head(n) yields the n most recent matches.
    home_rows = matches[is_home].sort_values('date', ascending=False)
    head_to_head = matches[is_home & is_away].sort_values('date', ascending=False)
    away_rows = matches[is_away]

    if not home_rows.empty:
        for suffix in ('5', '3', '1'):
            # head() never returns more rows than exist, so the stored count
            # always matches the rows that were actually aggregated.
            _store_form(test_data, 'num_', suffix, home_rows.head(int(suffix)))

    if not head_to_head.empty:
        for suffix in ('5', '3', '1'):
            _store_form(test_data, 'num_team_', suffix,
                        head_to_head.head(int(suffix)))

    if not home_rows.empty:
        years_ago = match_year - home_rows['date'].dt.year
        for suffix in ('15', '7', '3', '2', '1'):
            # Rows are newest-first, so the rows inside the window form a
            # prefix and counting them never indexes past the last row.
            in_window = int((years_ago <= int(suffix)).sum())
            if in_window:
                _store_form(test_data, 'num_year_', suffix,
                            home_rows.head(in_window))

        home_num = len(home_rows)
        home_wins = int(home_rows['win_result'].sum())
        test_data['home_num'] = home_num
        test_data['home_win_num'] = home_wins
        test_data['home_lose_num'] = home_num - home_wins
        test_data['home_win_rate'] = home_wins / home_num

    if not away_rows.empty:
        away_num = len(away_rows)
        # win_result marks a home win, which is a loss for the away side.
        away_losses = int(away_rows['win_result'].sum())
        test_data['away_num'] = away_num
        test_data['away_lose_num'] = away_losses
        test_data['away_win_num'] = away_num - away_losses
        test_data['away_win_rate'] = (away_num - away_losses) / away_num

    test_data['month'] = str(match_month)
    test_data['season'] = str(match_month // 3)
保存测试数据
# Compute the features for the example match and save them as a one-row CSV
# that the prediction cells read back.
process_test_data(test_data, df)
# 'date' is not in the selected training columns, so it is dropped here.
test_data.pop('date')
test_df = pd.DataFrame([test_data])
test_df.to_csv('./predict.csv')
# 3.3 Get the prediction result
# Read the one-row prediction set written to ./predict.csv.
params = {
    "input_file_path": "./predict.csv", #@param {"label":"input_file_path","type":"string","required":"true","helpTip":""}
    "format": "csv", #@param {"label":"format","type":"string","required":"false","helpTip":""}
    "has_header": True, #@param {"label":"has_header","type":"boolean","required":"false","helpTip":""}
    "delimiter": "," #@param {"label":"delimiter","type":"string","required":"false","helpTip":""}
}
read_predict_data = MLSReadData(**params)
read_predict_data.run()
#@output {"label":"dataframe","name":"read_predict_data.get_outputs()['output_port_1']","type":"DataFrame"}
# Cast neutral, month and season to string in the prediction set.
# NOTE(review): presumably this mirrors the column types of the training data; confirm.
inputs = {
    "dataframe": read_predict_data.get_outputs()['output_port_1'] #@input {"label":"dataframe","type":"DataFrame"}
}
params = {
    "inputs": inputs,
    "column_type_map_str": "neutral:string,month:string,season:string" #@param {"label":"column_type_map_str","type":"string","required":"true","helpTip":""}
}
modify_predict_data_type = MLSModifyDataType(**params)
modify_predict_data_type.run()
#@output {"label":"dataframe","name":"modify_predict_data_type.get_outputs()['output_port_1']","type":"DataFrame"}
# Apply the trained logistic-regression pipeline model to the prediction set.
inputs = {
    "dataframe": modify_predict_data_type.get_outputs()['output_port_1'], #@input {"label":"dataframe","type":"DataFrame"}
    "pipeline_model": lr_classifier_model.get_outputs()['output_port_1'] #@input {"label":"pipeline_model","type":"PipelineModel"}
}
params = {
    "inputs": inputs
}
model_predict = MLSModelPredict(**params)
model_predict.run()
#@output {"label":"dataframe","name":"model_predict.get_outputs()['output_port_1']","type":"DataFrame"}
# Take the output of the prediction step, convert it with toPandas() and show
# the two teams next to the predicted label.
inputs = {
"dataframe": model_predict.get_outputs()['output_port_1'] #@input {"type":"DataFrame", "label": "dataframe"}
}
p_df = inputs["dataframe"].toPandas()
# Bare expression: relies on the notebook displaying the last value of a cell.
p_df[['home_team','away_team','prediction']]
home_team away_team prediction
Turkey Wales 0
#该预测结果为0:按照win_result的定义(主队获胜为1,否则为0),即主队Turkey未获胜(Wales获胜或双方打平)。