
参考网址（原文链接已截断, 完整地址见 CSDN 博客）:
blog.csdn.net/weixin_4627…
1/分类: 网格搜索+交叉验证
# 1. Classification: grid search + cross-validation on the iris dataset.
import xgboost as xgb
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load iris and hold out 30% of the rows for testing.
iris = load_iris()
x = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42)

# Base classifier; mlogloss is the multi-class evaluation metric.
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# Hyper-parameter search space for the grid search.
param_grid_dict = {
    'max_depth': [3, 5, 7],
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
}

# Exhaustive search over the grid with 5-fold CV, using all CPU cores.
grid_search = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid_dict,
    cv=5,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)
print(grid_search.best_estimator_)

2/回归: 网格搜索+交叉验证
# 2. Regression: baseline XGBoost model + RMSE on a held-out split.
from xgboost.sklearn import XGBRegressor
# fix: load_boston was removed in scikit-learn 1.2; load_diabetes is a
# bundled regression dataset and needs no download.
from sklearn.datasets import load_iris, load_diabetes
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

x, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=1)

model = XGBRegressor(objective='reg:squarederror',
                     n_estimators=300,
                     max_depth=6,
                     subsample=0.6,
                     colsample_bytree=0.8,
                     learning_rate=0.1,
                     random_state=0)
model.fit(X_train, y_train)
# fix: the original bare expressions were no-ops in a script; print them.
print(model.score(X_test, y_test))  # R^2 on the held-out set

pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, pred))
print(rmse)
# 3-fold cross-validation, fitting a fresh regressor on each fold.
rng = np.random.RandomState(123)
kf = KFold(n_splits=3, shuffle=True, random_state=rng)
print("在3折数据上的交叉验证")
# fix: the data array defined above is lowercase `x`; the original iterated
# over an undefined uppercase `X` (NameError). Loop indentation restored.
for train_index, test_index in kf.split(x):
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror',
                                 n_estimators=300,
                                 max_depth=6,
                                 subsample=0.6,
                                 colsample_bytree=0.8,
                                 learning_rate=0.1,
                                 random_state=0).fit(x[train_index], y[train_index])
    predictions = xgb_model.predict(x[test_index])
    actuals = y[test_index]
    print("均方根误差:")
    print(np.sqrt(mean_squared_error(actuals, predictions)))
    print('拟合优度')  # R^2 on this fold's held-out part
    print(xgb_model.score(x[test_index], y[test_index]))
# Grid search over the regressor's main hyper-parameters (10-fold CV, R^2).
model = xgb.XGBRegressor(objective='reg:squarederror',
                         subsample=0.6,
                         colsample_bytree=0.8,
                         random_state=0,
                         n_jobs=8)  # fix: `nthread` is a deprecated alias of n_jobs
param_dict = {'max_depth': [5, 6, 7, 8],
              'n_estimators': [100, 200, 300],
              'learning_rate': [0.05, 0.1, 0.2]}
clf = GridSearchCV(model, param_dict, cv=10, verbose=1, scoring='r2')
clf.fit(X_train, y_train)
print(clf.best_score_)
print(clf.best_params_)
自定义目标函数和评估函数 (logregobj 提供损失的梯度/海森, evalerror 提供评估指标)
# Binary-classification data for the custom objective / metric demo.
from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True)
X.shape, y.shape  # notebook-style echo; has no effect when run as a script
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=8)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Wrap both splits in DMatrix, xgboost's native data container.
xgb_train = xgb.DMatrix(X_train, y_train)
xgb_test = xgb.DMatrix(X_test, y_test)

# Booster parameters, boosting rounds, and the evaluation sets watched
# during training.
params = {'booster': 'gbtree', 'max_depth': 5, 'eta': 0.1}
num_round = 50
watchlist = [(xgb_train, 'train'), (xgb_test, 'test')]
def logregobj(preds, dtrain):
    """Custom logistic-loss objective.

    Receives raw margin scores and the training DMatrix; returns the
    per-sample gradient and hessian of the logistic loss w.r.t. the scores.
    """
    labels = dtrain.get_label()
    prob = 1.0 / (1.0 + np.exp(-preds))  # sigmoid of the raw margin
    grad = prob - labels
    hess = prob * (1.0 - prob)
    return grad, hess
def evalerror(preds, dtrain):
    """Custom evaluation metric.

    Thresholds the raw scores at 0 and returns ('error', misclassification
    rate) for xgboost's feval protocol.
    """
    labels = dtrain.get_label()
    wrong = labels != (preds > 0.0)
    return 'error', float(sum(wrong)) / len(labels)
# Train a booster with the custom objective/metric, logging both splits.
model = xgb.train(params,
                  xgb_train,
                  num_round,
                  watchlist,
                  obj=logregobj,
                  feval=evalerror)

# 10-fold CV with the same custom objective; because metrics='auc' is also
# requested, the result frame carries both *-auc-mean and *-error-mean columns.
result = xgb.cv(params=params, dtrain=xgb_train, nfold=10, metrics='auc',
                num_boost_round=300, as_pandas=True, seed=123,
                obj=logregobj, feval=evalerror)
print(result.head())  # fix: bare `result.head()` was a no-op in a script

import matplotlib.pyplot as plt

# Plot the custom error metric per boosting round (300 rounds requested).
plt.plot(range(1, 301), result['train-error-mean'], 'k', label='Training Error')
plt.plot(range(1, 301), result['test-error-mean'], 'b', label='Test Error')
plt.xlabel('Number of Trees')
plt.ylabel('Error')  # fix: these curves are the error metric, not AUC
plt.axhline(0, linestyle='--', color='k', linewidth=1)
plt.legend()
plt.title('CV Errors for XGBoost')
plt.show()
特征的重要程度
<1>xgb包自带的画图用法
# <1> xgboost's built-in importance plot.
# fix: `clf` is a GridSearchCV and has no booster/feature_importances_;
# the fitted model lives in clf.best_estimator_.
best_model = clf.best_estimator_
xgb.plot_importance(best_model, height=0.5, importance_type='gain',
                    max_num_features=10)

# <2> sklearn-style feature_importances_ array, plotted as a bar chart.
importances = best_model.feature_importances_
cancer = load_breast_cancer()
# NOTE(review): `clf` was fitted on the regression data earlier in the file,
# so cancer.feature_names may not match the model's feature count — confirm
# which dataset was intended. Truncating keeps the plot from crashing.
feature_names = cancer.feature_names[:len(importances)]
sorted_index = importances.argsort()
plt.figure(figsize=(10, 5))
plt.barh(range(len(sorted_index)), importances[sorted_index])
plt.yticks(np.arange(len(sorted_index)), feature_names[sorted_index])
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('GradientBoosting')
plt.tight_layout()

特征的筛选
根据变量的重要程度, 小于阈值的变量就直接扔掉
# Drop every feature whose importance falls below the threshold.
from sklearn.feature_selection import SelectFromModel

# fix: pass the fitted estimator (clf.best_estimator_), not the GridSearchCV
# wrapper, when using prefit=True.
selection = SelectFromModel(clf.best_estimator_, threshold=0.05, prefit=True)
select_X_train = selection.transform(X_train)
print(select_X_train.shape)
# fix: the original referenced an undefined X_val; the held-out split
# defined above is X_test.
select_X_test = selection.transform(X_test)
print(select_X_test.shape)