Splitting the day into time periods
def hour_tag(s):
    """
    Tag an hour-of-day with a time-period label:
    period 1: 1 <= hour <= 8
    period 2: 9 <= hour <= 12
    period 3: 13 <= hour <= 16
    period 4: 17 <= hour <= 20
    period 5: 21 <= hour <= 23, or hour == 0
    """
    if 1 <= s <= 8:
        return 1
    elif 9 <= s <= 12:
        return 2
    elif 13 <= s <= 16:
        return 3
    elif 17 <= s <= 20:
        return 4
    elif s >= 21 or s <= 0:
        return 5
data['hour_tag'] = data['hour'].apply(hour_tag)  # tag every record with its time period
data['weekday'] = pd.to_datetime(data['day']).apply(lambda x: x.weekday()) + 1  # day of week (Mon=1 ... Sun=7)
data['is_weekend'] = data['weekday'].apply(lambda x: 1 if x >= 6 else 0)  # 1 for Saturday/Sunday, else 0
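As a quick sanity check, here is the same pipeline on a toy frame with the day/hour columns assumed above (the dates are arbitrary):
import pandas as pd

data = pd.DataFrame({
    'day': ['2019-06-01', '2019-06-03'],   # a Saturday and a Monday
    'hour': [7, 22],
})
data['hour_tag'] = data['hour'].apply(hour_tag)
data['weekday'] = pd.to_datetime(data['day']).apply(lambda x: x.weekday()) + 1
data['is_weekend'] = data['weekday'].apply(lambda x: 1 if x >= 6 else 0)
print(data)
#           day  hour  hour_tag  weekday  is_weekend
# 0  2019-06-01     7         1        6           1
# 1  2019-06-03    22         5        1           0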
Grouped computation on the data
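The grouped-aggregation code itself is not included in this excerpt; as a rough sketch of the kind of feature this step produces (the uid and cnt column names are hypothetical, not from the original data):
# Hypothetical grouped feature: mean activity count per (uid, hour_tag) group.
# 'uid' and 'cnt' are illustrative column names, not taken from the original.
agg = (data.groupby(['uid', 'hour_tag'])['cnt']
           .mean()
           .reset_index()
           .rename(columns={'cnt': 'uid_hourtag_cnt_mean'}))
data = data.merge(agg, on=['uid', 'hour_tag'], how='left')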
sklearn's train_test_split randomly splits data into training and test sets:
from sklearn.model_selection import train_test_split
- train_data: the sample features to split
- train_target: the sample labels to split
- test_size: the proportion of samples to hold out for the test set; if an integer, the absolute number of test samples
- random_state: the random seed.
Random seed: effectively an identifier for one particular random sequence, used to reproduce the same split across repeated experiments. Passing the same integer (e.g. 1) with otherwise identical parameters yields the same split every time; note that 0 is also a fixed seed, and only leaving random_state unset (None) produces a different split on each run.
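A minimal sketch of the call itself, with toy arrays standing in for the real features and labels:
import numpy as np
from sklearn.model_selection import train_test_split

X = np.arange(20).reshape(10, 2)   # toy feature matrix, 10 samples x 2 features
y = np.array([0, 1] * 5)           # toy binary labels

# Same seed -> same split on every run; test_size=0.3 holds out 30% of rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
print(len(X_train), len(X_test))   # 7 3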
Splitting the training and test sets by time
# train_start_day / train_end_day / train_time_tag (and the test_* counterparts)
# are assumed to be configured earlier; a time tag of -1 means "use all periods".
if train_time_tag != -1:
    train = data[(data['day'] >= train_start_day) & (data['day'] <= train_end_day) & (data['hour_tag'] == train_time_tag)]
else:
    train = data[(data['day'] >= train_start_day) & (data['day'] <= train_end_day)]
if test_time_tag != -1:
    test = data[(data['day'] >= test_start_day) & (data['day'] <= test_end_day) & (data['hour_tag'] == test_time_tag)]
else:
    test = data[(data['day'] >= test_start_day) & (data['day'] <= test_end_day)]
LightGBM model training
params = {'num_leaves': 10,            # strongly affects the final score; larger often scores better, but too large overfits
          'min_data_in_leaf': 30,
          'objective': 'binary',       # objective function
          'max_depth': -1,
          'learning_rate': 0.03,
          "min_sum_hessian_in_leaf": 6,
          "boosting": "gbdt",
          "feature_fraction": 0.9,     # fraction of features sampled for each tree
          "bagging_freq": 1,
          "bagging_fraction": 0.8,
          "bagging_seed": 11,
          "lambda_l1": 1,              # L1 regularization
          'lambda_l2': 2,              # L2 regularization
          "verbosity": -1,
          "nthread": -1,               # number of threads; -1 uses all available, and more threads run faster
          'metric': {'auc'},           # evaluation metric
          "random_state": 2019,        # random seed, keeps results consistent across runs
          # 'device': 'gpu'            # speeds up training if the GPU build of LightGBM is installed
          }
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold

num_round = 10000
folds = StratifiedKFold(n_splits=5)
def feature_test(features, train_df, test_df, label):
    oof_train = np.zeros((len(train_df), 1))  # out-of-fold predictions on the training set
    oof_test = np.zeros((len(test_df), 1))    # fold-averaged predictions on the test set
    target = train_df[label]
    feature_importance_df = pd.DataFrame()
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df.values, target.values)):
        print("Fold {}".format(fold_))
        trn_data = lgb.Dataset(train_df.iloc[trn_idx][features], label=target.iloc[trn_idx])
        val_data = lgb.Dataset(train_df.iloc[val_idx][features], label=target.iloc[val_idx])
        clf = lgb.train(params, trn_data, num_round, valid_sets=[trn_data, val_data],
                        verbose_eval=1000, early_stopping_rounds=2500)
        oof_train[val_idx] = clf.predict(train_df.iloc[val_idx][features],
                                         num_iteration=clf.best_iteration).reshape(-1, 1)
        oof_test += clf.predict(test_df[features],
                                num_iteration=clf.best_iteration).reshape(-1, 1) / folds.n_splits
        # log per-fold feature importance
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = features
        fold_importance_df["importance"] = clf.feature_importance()
        fold_importance_df["fold"] = fold_ + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    return oof_train, oof_test, feature_importance_df
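One version note: the verbose_eval and early_stopping_rounds keyword arguments used above were removed from lgb.train in LightGBM 4.x; on newer versions the equivalent behavior uses callbacks. A sketch of just the changed call:
clf = lgb.train(
    params, trn_data, num_round,
    valid_sets=[trn_data, val_data],
    callbacks=[
        lgb.log_evaluation(period=1000),          # same effect as verbose_eval=1000
        lgb.early_stopping(stopping_rounds=2500), # same effect as early_stopping_rounds=2500
    ],
)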
oof_train, oof_test, fea_imp = feature_test(feature_importances_columns, train_input, test_input, 'if_retain_ratio')
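A common follow-up (a sketch; it assumes the if_retain_ratio column holds binary labels, matching the 'binary' objective above) is to score the out-of-fold predictions and average the per-fold importances:
from sklearn.metrics import roc_auc_score

# Overall cross-validated score from the stacked out-of-fold predictions.
print("OOF AUC:", roc_auc_score(train_input['if_retain_ratio'], oof_train.ravel()))

# Rank features by mean importance across the five folds.
mean_imp = (fea_imp.groupby('feature')['importance']
                   .mean()
                   .sort_values(ascending=False))
print(mean_imp.head(20))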