ML - Learning



Partitioning the day into time segments

def hour_tag(s):
    """
    Map an hour of day to a time-segment tag:
    tag 1 : 1 <= hour <= 8
    tag 2 : 9 <= hour <= 12
    tag 3 : 13 <= hour <= 16
    tag 4 : 17 <= hour <= 20
    tag 5 : 21 <= hour <= 24 (hour 0 also falls here)
    """
    if 1 <= s <= 8:
        return 1
    elif 9 <= s <= 12:
        return 2
    elif 13 <= s <= 16:
        return 3
    elif 17 <= s <= 20:
        return 4
    elif s >= 21 or s <= 0:
        return 5
        
import pandas as pd

data['hour_tag'] = data['hour'].apply(hour_tag)  # tag each row with its time segment
data['weekday'] = pd.to_datetime(data['day']).apply(lambda x: x.weekday()) + 1  # day of week (1 = Monday, 7 = Sunday)
data['is_weekend'] = data['weekday'].apply(lambda x: 1 if x >= 6 else 0)  # 1 if Saturday or Sunday
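
A quick sanity check on a toy frame (the dates and hours here are hypothetical, just to illustrate the derived columns):

demo = pd.DataFrame({'day': ['2019-06-01', '2019-06-03'], 'hour': [9, 22]})
demo['hour_tag'] = demo['hour'].apply(hour_tag)
demo['weekday'] = pd.to_datetime(demo['day']).apply(lambda x: x.weekday()) + 1
demo['is_weekend'] = demo['weekday'].apply(lambda x: 1 if x >= 6 else 0)
print(demo)
# 2019-06-01 is a Saturday -> weekday 6, is_weekend 1; hour 9  -> hour_tag 2
# 2019-06-03 is a Monday   -> weekday 1, is_weekend 0; hour 22 -> hour_tag 5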

Grouped computation over the data

pandas groupby
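
The original note only names groupby without showing it; a minimal sketch, assuming hypothetical columns user_id and is_click on data:

# Hypothetical aggregation: per-user click counts and rates within each time segment.
# The column names user_id / is_click are assumptions for illustration.
agg = (data.groupby(['user_id', 'hour_tag'])['is_click']
           .agg(['count', 'mean'])
           .rename(columns={'count': 'click_cnt', 'mean': 'click_rate'})
           .reset_index())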

sklearn train_test_split: randomly split the training and test sets:

from sklearn.model_selection import train_test_split

  • train_data: the sample features to be split
  • train_target: the sample labels to be split
  • test_size: the test-set proportion; if an integer, the absolute number of test samples
  • random_state: the random seed.

Random seed: essentially an ID for a particular random sequence, used to reproduce the same random numbers when an experiment must be repeated. For example, always passing 1 with the other parameters unchanged yields the same split every time; any fixed integer (including 0) is reproducible, while passing None or omitting it yields a different split on every run.
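
A minimal usage sketch (X, y, and the 0.2 test fraction are illustrative, not from the original note):

from sklearn.model_selection import train_test_split

# Illustrative split: X and y are an assumed feature matrix and label vector.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)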

Splitting the training and test sets by time
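
The snippet below assumes day bounds and hour_tag filters defined upstream; hypothetical values for illustration:

# Hypothetical configuration; setting a *_time_tag to -1 disables the hour_tag filter.
train_start_day, train_end_day, train_time_tag = '2019-06-01', '2019-06-21', 2
test_start_day, test_end_day, test_time_tag = '2019-06-22', '2019-06-28', -1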

# Keep rows inside the training window, optionally restricted to one time segment.
if train_time_tag != -1:
    train = data[(data['day']>=train_start_day)&(data['day']<=train_end_day)&(data['hour_tag']==train_time_tag)]
else:
    train = data[(data['day']>=train_start_day)&(data['day']<=train_end_day)]

# Same logic for the test window.
if test_time_tag != -1:
    test = data[(data['day']>=test_start_day)&(data['day']<=test_end_day)&(data['hour_tag']==test_time_tag)]
else:
    test = data[(data['day']>=test_start_day)&(data['day']<=test_end_day)]

LightGBM model training

params = {'num_leaves': 10,  # strongly affects the final score; larger tends to be better, but too large overfits
          'min_data_in_leaf': 30,
          'objective': 'binary',  # objective function
          'max_depth': -1,
          'learning_rate': 0.03,
          "min_sum_hessian_in_leaf": 6,
          "boosting": "gbdt",
          "feature_fraction": 0.9,  # fraction of features sampled per tree
          "bagging_freq": 1,
          "bagging_fraction": 0.8,
          "bagging_seed": 11,
          "lambda_l1": 1,  # L1 regularization
          'lambda_l2': 2,  # L2 regularization
          "verbosity": -1,
          "nthread": -1,  # number of threads; -1 uses all of them, and more threads run faster
          'metric': {'auc'},  # evaluation metric
          "random_state": 2019,  # random seed, keeps results consistent across runs
          # 'device': 'gpu'  # speeds up training if the GPU build of LightGBM is installed
          }

import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold

num_round = 10000

folds = StratifiedKFold(n_splits=5)

def feature_test(features, train_df, test_df, label):
    oof_train = np.zeros((len(train_df), 1))  # out-of-fold predictions on the training set
    oof_test = np.zeros((len(test_df), 1))    # test predictions averaged across folds
    target = train_df[label]
    feature_importance_df = pd.DataFrame()

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df.values, target.values)):
        print("Fold {}".format(fold_))
        trn_data = lgb.Dataset(train_df.iloc[trn_idx][features], label=target.iloc[trn_idx])
        val_data = lgb.Dataset(train_df.iloc[val_idx][features], label=target.iloc[val_idx])
        clf = lgb.train(params, trn_data, num_round, valid_sets=[trn_data, val_data],
                        verbose_eval=1000, early_stopping_rounds=2500)
        # Out-of-fold prediction for this fold's validation slice.
        oof_train[val_idx] = clf.predict(train_df.iloc[val_idx][features],
                                         num_iteration=clf.best_iteration).reshape(-1, 1)
        # Accumulate the fold's test predictions into the running average.
        oof_test += clf.predict(test_df[features],
                                num_iteration=clf.best_iteration).reshape(-1, 1) / folds.n_splits
        # Log per-fold feature importance.
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = features
        fold_importance_df["importance"] = clf.feature_importance()
        fold_importance_df["fold"] = fold_ + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    return oof_train, oof_test, feature_importance_df

oof_train,oof_test,fea_imp = feature_test(feature_importances_columns,train_input,test_input,'if_retain_ratio')
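
A sketch of how the returned artifacts might be inspected afterwards (roc_auc_score comes from sklearn; everything else is from the call above):

from sklearn.metrics import roc_auc_score

# Out-of-fold AUC on the training labels, and mean importance across folds.
print('OOF AUC:', roc_auc_score(train_input['if_retain_ratio'], oof_train.ravel()))
print(fea_imp.groupby('feature')['importance'].mean().sort_values(ascending=False).head(20))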