Model Fusion Code for the 4th Industrial Big Data Innovation Competition

import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from mlxtend.regressor import StackingCVRegressor
import xgboost as xgb
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

In [2]:

# Load the engineered features and labels, and drop the Id column
df_x_train = pd.read_csv('整理.csv')
df_x_test = pd.read_csv('整理test.csv')
df_y_train = pd.read_csv('label.csv')
df_x_train.drop('Id', axis=1, inplace=True)
df_x_test.drop('Id', axis=1, inplace=True)
df_y_train.drop('Id', axis=1, inplace=True)
# df_y_test.drop('Id', axis=1, inplace=True)

x_train = np.array(df_x_train)
y_train = np.array(df_y_train)
x_test = np.array(df_x_test)
# y_test = np.array(df_y_test)
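
Before building any models it is worth confirming that the arrays line up. The snippet below is an optional sanity check I have added; it assumes the label file has three columns, matching the size_1/size_2/size_3 targets predicted later.

# Optional sanity check (assumption: three target columns in label.csv)
print(x_train.shape, y_train.shape, x_test.shape)
assert x_train.shape[0] == y_train.shape[0]
assert x_train.shape[1] == x_test.shape[1]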

In [3]:

n_folds = 5
# Cross-validated RMSE of a model on the training features against one target column
def rmsle_cv(model, y):
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    rmse = np.sqrt(-cross_val_score(model, x_train, y, scoring="neg_mean_squared_error", cv=kf))
    return rmse


kfolds = KFold(n_splits=n_folds, shuffle=True, random_state=42)
alph = [0.01, 0.001, 0.0001, 0.0002, 0.0004, 0.0008, 0.002, 0.004, 0.008, 1, 2, 4, 6, 8, 10, 12]
alph2 = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
lasso = make_pipeline(RobustScaler(), LassoCV(alphas=alph, cv=kfolds, random_state=1))
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alph2, cv=kfolds))
ENet = make_pipeline(RobustScaler(), ElasticNetCV(alphas=alph, l1_ratio=.9, cv=kfolds, random_state=3))
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
GBoost = GradientBoostingRegressor(n_estimators=300, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10,
                                   loss='huber', random_state=5)
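
The rmsle_cv helper defined above can be used to compare the base learners before stacking. A minimal sketch, assuming each of the three target columns is scored separately (here only the first column):

# Hypothetical check: 5-fold CV RMSE of each base learner on the first target column
for name, model in [('lasso', lasso), ('ridge', ridge), ('ENet', ENet),
                    ('KRR', KRR), ('GBoost', GBoost)]:
    score = rmsle_cv(model, y_train[:, 0])
    print('{}: {:.4f} (+/- {:.4f})'.format(name, score.mean(), score.std()))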

In [4]:

model_xgb = xgb.XGBRegressor(max_depth=10,
                             learning_rate=0.05,
                             n_estimators=340,
                             subsample=0.6,
                             colsample_bytree=0.6,
                             min_child_weight=3,
                             reg_lambda=2,
                             seed=1000)

model_lgb = lgb.LGBMRegressor(boosting_type='gbdt',  # boosting type
                              objective='regression',  # objective (loss) function
                              #     metric='l2',  # evaluation metric
                              num_leaves=31,  # number of leaves per tree
                              learning_rate=0.1,  # learning rate
                              feature_fraction=0.9,  # fraction of features sampled per tree
                              bagging_fraction=0.8,  # fraction of samples used for bagging
                              bagging_freq=5,  # perform bagging every k iterations
                              verbose=1)  # <0: fatal only, =0: errors (warnings), >0: info

stacked_averaged_models = StackingCVRegressor(regressors=(ENet, GBoost, KRR, model_xgb),
                                              meta_regressor=model_lgb,
                                              use_features_in_secondary=True)

In [5]:

# RMSE between the true and predicted values
def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))
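
rmsle is not called in the cells below, but it can be used to validate the stacked ensemble on a hold-out split before predicting on the test set. A minimal sketch I have added, assuming an 80/20 split on the first target column:

# Hypothetical hold-out validation of the stacked model on target size_1
from sklearn.model_selection import train_test_split

x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train[:, 0], test_size=0.2, random_state=42)
stacked_averaged_models.fit(x_tr, y_tr)
print('hold-out RMSE:', rmsle(y_val, stacked_averaged_models.predict(x_val)))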

In [ ]:

# Predict target size_1
stacked_averaged_models.fit(x_train, y_train[:, 0])
stacked_pred1 = stacked_averaged_models.predict(x_test)

# Predict target size_2
stacked_averaged_models.fit(x_train, y_train[:, 1])
stacked_pred2 = stacked_averaged_models.predict(x_test)

# Predict target size_3
stacked_averaged_models.fit(x_train, y_train[:, 2])
stacked_pred3 = stacked_averaged_models.predict(x_test)

# Assemble the three predicted columns into a single array
y_pred = np.zeros((x_test.shape[0], 3))
y_pred[:, 0] = stacked_pred1
y_pred[:, 1] = stacked_pred2
y_pred[:, 2] = stacked_pred3

df = pd.DataFrame(y_pred)
df.to_csv('new/模型融合.csv')
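
The DataFrame above is written with default integer column headers and the row index as an extra column. If the submission format expects named columns without an index, a variant such as the following could be used; the column names and the output filename are assumptions for illustration, not taken from any official template.

# Hypothetical variant: named columns, no index column (names and path are assumed)
df = pd.DataFrame(y_pred, columns=['size_1', 'size_2', 'size_3'])
df.to_csv('new/模型融合_named.csv', index=False)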