# 本文已参与「新人创作礼」活动,一起开启掘金创作之路
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from mlxtend.regressor import StackingCVRegressor
import xgboost as xgb
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
# In [2]:
# Load the pre-cleaned train/test feature tables and the training labels,
# drop the Id column from each, and convert them to NumPy arrays.
df_x_train = pd.read_csv('整理.csv')
df_x_test = pd.read_csv('整理test.csv')
df_y_train = pd.read_csv('label.csv')

for frame in (df_x_train, df_x_test, df_y_train):
    frame.drop('Id', axis=1, inplace=True)
# df_y_test.drop('Id', axis=1, inplace=True)

x_train = np.array(df_x_train)
y_train = np.array(df_y_train)
x_test = np.array(df_x_test)
# y_test = np.array(df_y_test)
# In [3]:
n_folds = 5

# 给特征排序,删除无用特征
def rmsle_cv(model):
    """Return the per-fold cross-validated RMSE of *model* on the training set.

    BUG FIX: the original called ``KFold(...).get_n_splits(x_train)``, which
    returns the plain integer 5 and silently discards the ``shuffle=True`` /
    ``random_state=42`` settings — ``cross_val_score`` then used an
    *unshuffled* 5-fold split.  Pass the KFold object itself as ``cv``.

    Parameters
    ----------
    model : estimator
        Any scikit-learn compatible regressor.

    Returns
    -------
    numpy.ndarray
        One RMSE value per fold (scoring is negated MSE, so negate and root).
    """
    kf = KFold(n_folds, shuffle=True, random_state=42)
    rmse = np.sqrt(-cross_val_score(model, x_train, y_train,
                                    scoring="neg_mean_squared_error", cv=kf))
    return rmse
# Shared CV splitter and the regularization-strength grids searched by the
# *CV linear models below.
kfolds = KFold(n_splits=n_folds, shuffle=True, random_state=42)
alph = [0.01, 0.001, 0.0001, 0.0002, 0.0004, 0.0008, 0.002, 0.004, 0.008,
        1, 2, 4, 6, 8, 10, 12]
alph2 = list(range(1, 15))  # integer alphas 1..14 for RidgeCV

# Linear models are wrapped in RobustScaler to damp the effect of outliers.
lasso = make_pipeline(RobustScaler(),
                      LassoCV(alphas=alph, cv=kfolds, random_state=1))
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alph2, cv=kfolds))
ENet = make_pipeline(RobustScaler(),
                     ElasticNetCV(alphas=alph, l1_ratio=.9, cv=kfolds,
                                  random_state=3))

# Kernel ridge regression with a degree-2 polynomial kernel.
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

# Gradient boosting with Huber loss for robustness to outliers.
GBoost = GradientBoostingRegressor(loss='huber', random_state=5,
                                   n_estimators=300, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10)
# In [4]:
# XGBoost base learner.
model_xgb = xgb.XGBRegressor(
    max_depth=10,
    learning_rate=0.05,
    n_estimators=340,
    subsample=0.6,
    colsample_bytree=0.6,
    min_child_weight=3,
    reg_lambda=2,
    seed=1000,
)

# LightGBM regressor; serves as the meta-learner of the stack below.
model_lgb = lgb.LGBMRegressor(
    boosting_type='gbdt',    # boosting type
    objective='regression',  # objective function
    # metric='l2',           # evaluation metric
    num_leaves=31,           # maximum leaves per tree
    learning_rate=0.1,       # learning rate
    feature_fraction=0.9,    # fraction of features sampled per tree
    bagging_fraction=0.8,    # fraction of rows sampled for bagging
    bagging_freq=5,          # perform bagging every k iterations
    verbose=1,               # <0 fatal only, =0 errors/warnings, >0 info
)

# Stacked ensemble: the base regressors' out-of-fold predictions feed the
# LightGBM meta-regressor, which also sees the original features
# (use_features_in_secondary=True).
stacked_averaged_models = StackingCVRegressor(
    regressors=(ENet, GBoost, KRR, model_xgb),
    meta_regressor=model_lgb,
    use_features_in_secondary=True,
)
# In [5]:
def rmsle(y, y_pred):
    """Root of the mean squared error between targets and predictions.

    NOTE(review): despite the name, no log transform is applied here, so
    this is plain RMSE unless the targets are already log-scaled upstream.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)
# In [ ]:
# Fit the stacked ensemble once per target column (size_1..size_3) and write
# the combined predictions to CSV.
#
# FIX: the original hard-coded the output shape as (3953, 3) and repeated the
# fit/predict pair three times; derive both dimensions from the data so the
# script works for any test-set size or number of target columns.
n_targets = y_train.shape[1]
y_pred = np.zeros((x_test.shape[0], n_targets))
for col in range(n_targets):
    stacked_averaged_models.fit(x_train, y_train[:, col])
    y_pred[:, col] = stacked_averaged_models.predict(x_test)

df = pd.DataFrame(y_pred)
df.to_csv('new/模型融合.csv')