[机器学习]网格搜索+交叉验证

2,030 阅读3分钟

图片.png

参考网址

blog.csdn.net/weixin_4627…

1/分类: 网格搜索+交叉验证

import xgboost as xgb  

from sklearn.datasets import load_iris  

from sklearn.model_selection import GridSearchCV, cross_val_score  
from sklearn.model_selection import train_test_split  

from sklearn.metrics import accuracy_score

# Load the iris dataset (150 samples, 4 numeric features, 3 classes)
iris = load_iris()
x, y = iris.data, iris.target

# Hold out 30% of the samples for testing; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42)

# Base XGBoost classifier; eval_metric is set explicitly so xgboost does
# not warn about the default multiclass metric.
xgb_clf = xgb.XGBClassifier(use_label_encoder=False,
                            eval_metric='mlogloss')

# Hyper-parameter grid: 3*3*3*2*3 = 162 candidate combinations.
param_grid_dict = {
    'max_depth': [3, 5, 7],
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
}

# Exhaustive grid search with 5-fold cross-validation on all CPU cores,
# scored by accuracy.
grid_search = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid_dict,
    cv=5,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1,
)

# Fit every candidate; GridSearchCV refits the best one on the full
# training set afterwards.
grid_search.fit(X_train, y_train)

# Report the winning combination, its mean CV accuracy, and the
# refitted estimator.
print(grid_search.best_params_)
print(grid_search.best_score_)
print(grid_search.best_estimator_)

图片.png

2/回归: 网格搜索+交叉验证

from xgboost.sklearn import XGBRegressor

from sklearn.datasets import load_iris, load_boston
from sklearn.datasets import load_breast_cancer

from sklearn.model_selection import KFold, train_test_split, GridSearchCV

from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

# Load the regression data.
# NOTE(review): load_boston was removed in scikit-learn 1.2 — on newer
# versions replace this with fetch_california_housing or another
# regression dataset; confirm the installed sklearn version.
x, y = load_boston(return_X_y=True)

# Hold out 30% of the samples for testing
X_train, X_test, y_train, y_test = train_test_split(x, y,
                                                    test_size=0.3,
                                                    random_state=1)

# Define the regression model
model = XGBRegressor(objective='reg:squarederror',
                     n_estimators=300,
                     max_depth=6,
                     subsample=0.6,
                     colsample_bytree=0.8,
                     learning_rate=0.1,
                     random_state=0)

# Fit on the training split
model.fit(X_train, y_train)

# R^2 on the held-out set.
# Bug fix: the original bare expression produced no output when the
# snippet is run as a script (it only echoes in a notebook).
print(model.score(X_test, y_test))

# Root-mean-squared error on the held-out set (same bug fix: print it)
pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, pred))
print(rmse)

# Regression cross-validation.
# Bug fix: the original loop referenced an undefined uppercase `X`;
# the boston features/targets above are bound to lowercase `x` and `y`.
rng = np.random.RandomState(123)
kf = KFold(n_splits=3, shuffle=True, random_state=rng)
print("在3折数据上的交叉验证")

for train_index, test_index in kf.split(x):
    # Re-train a fresh model on each fold's training portion
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror',
                                 n_estimators=300,
                                 max_depth=6,
                                 subsample=0.6,
                                 colsample_bytree=0.8,
                                 learning_rate=0.1,
                                 random_state=0).fit(x[train_index], y[train_index])

    # Evaluate on the fold's held-out portion
    predictions = xgb_model.predict(x[test_index])
    actuals = y[test_index]
    print("均方根误差:")
    print(np.sqrt(mean_squared_error(actuals, predictions)))
    print('拟合优度')
    print(xgb_model.score(x[test_index], y[test_index]))

# Grid search for the best regression hyper-parameters, scored by R^2.
model = xgb.XGBRegressor(objective='reg:squarederror',
                         subsample=0.6,
                         colsample_bytree=0.8,
                         random_state=0, nthread=8)

# 4*3*3 = 36 candidate combinations
param_dict = {
    'max_depth': [5, 6, 7, 8],
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.05, 0.1, 0.2],
}

# 10-fold CV; the best combination is refitted on the whole training set
clf = GridSearchCV(model, param_dict, cv=10, verbose=1, scoring='r2')
clf.fit(X_train, y_train)
print(clf.best_score_)
print(clf.best_params_)

自定义目标函数和损失函数

# Binary-classification data for the custom objective / metric demo
from sklearn.datasets import load_breast_cancer
X, y = load_breast_cancer(return_X_y=True)
# Bug fix: the original bare `X.shape, y.shape` expression produced no
# output when run as a script.
print(X.shape, y.shape)

# 20% held out for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Wrap both splits in DMatrix, xgboost's native data container
xgb_train = xgb.DMatrix(X_train, y_train)
xgb_test = xgb.DMatrix(X_test, y_test)

# Booster parameters: tree booster, depth-5 trees, learning rate 0.1
params = {'booster': 'gbtree', 'max_depth': 5, 'eta': 0.1}
num_round = 50
# Evaluation sets reported each boosting round
watchlist = [(xgb_train, 'train'), (xgb_test, 'test')]

def logregobj(preds, dtrain):
    """Custom binary-logistic objective for xgb.train.

    Maps the raw margin scores to probabilities with the sigmoid, then
    returns the first and second derivatives of the log-loss with
    respect to the margins, as xgboost's obj callback requires.
    """
    labels = dtrain.get_label()
    prob = 1.0 / (1.0 + np.exp(-preds))  # sigmoid of raw margins
    grad = prob - labels                 # dL/dmargin
    hess = prob * (1.0 - prob)           # d2L/dmargin^2
    return grad, hess
def evalerror(preds, dtrain):
    """Custom eval metric: fraction of misclassified samples.

    Raw margin scores are thresholded at zero (positive margin means
    class 1) and compared against the true labels; returns the
    ('error', value) pair xgboost's feval callback expects.
    """
    labels = dtrain.get_label()
    predicted_pos = preds > 0.0
    n_wrong = float(sum(labels != predicted_pos))
    return 'error', n_wrong / len(labels)

# Train with the custom objective and custom metric; the watchlist makes
# xgboost print both sets' metrics every round.
model = xgb.train(params,
                  xgb_train,
                  num_round,
                  watchlist,
                  obj=logregobj,
                  feval=evalerror)

# 10-fold CV for 300 rounds; records the built-in 'auc' metric plus the
# custom 'error' metric contributed by evalerror.
result = xgb.cv(params=params, dtrain=xgb_train, nfold=10, metrics='auc',
                num_boost_round=300, as_pandas=True, seed=123,
                obj=logregobj, feval=evalerror)

# Bug fix: the bare `result.head()` expression printed nothing as a script
print(result.head())


# Plot CV errors per boosting round
import matplotlib.pyplot as plt
plt.plot(range(1, 301), result['train-error-mean'], 'k', label='Training Error')
plt.plot(range(1, 301), result['test-error-mean'], 'b', label='Test Error')
plt.xlabel('Number of Trees')
# Bug fix: the curves plotted are error rates (from evalerror), not AUC
plt.ylabel('Error')
plt.axhline(0, linestyle='--', color='k', linewidth=1)
plt.legend()
plt.title('CV Errors for XGBoost')
plt.show()

特征的重要程度

<1>xgb包自带的画图用法

xgb.plot_importance(clf,height=0.5,importance_type='gain',max_num_features=10)

图片.png

<2>sklearn库用法

# Bug fix: GridSearchCV has no feature_importances_ attribute (that bare
# expression raised AttributeError) — read them off the refitted best
# estimator instead.
importances = clf.best_estimator_.feature_importances_
cancer = load_breast_cancer()

# NOTE(review): `clf` above was fitted on the boston data (13 features)
# while breast-cancer has 30 feature names — confirm the estimator was
# refitted on the cancer data before pairing names with importances.
sorted_index = importances.argsort()
plt.figure(figsize=(10, 5))
# Horizontal bars, least important at the bottom
plt.barh(range(len(cancer.feature_names)), importances[sorted_index])
plt.yticks(np.arange(len(cancer.feature_names)), cancer.feature_names[sorted_index])
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('GradientBoosting')
#plt.savefig('梯度提升特征排序.png')
plt.tight_layout()

图片.png

特征的筛选

根据变量的重要程度, 小于阈值的变量就直接扔掉
from sklearn.feature_selection import SelectFromModel

# Drop features whose importance falls below the threshold.
# Bug fix: SelectFromModel(prefit=True) needs a fitted model exposing
# feature_importances_; GridSearchCV itself does not — pass the refitted
# best estimator.
selection = SelectFromModel(clf.best_estimator_, threshold=0.05, prefit=True)
select_X_train = selection.transform(X_train)
print(select_X_train.shape)

# threshold=0.05: features with importance below 0.05 are discarded
# (the surviving column count matches the importance plot above)

# Apply the same selection to the held-out split.
# Bug fix: the original referenced an undefined `X_val`; the held-out
# split created earlier is `X_test`.
select_X_test = selection.transform(X_test)
print(select_X_test.shape)