本文已参与「新人创作礼」活动,一起开启掘金创作之路
import pdpbox
In [2]:
# Record the installed pdpbox version for reproducibility (Out: '0.2.1').
pdpbox.__version__
Out[2]:
'0.2.1'
In [3]:
import matplotlib
In [4]:
# Record the installed matplotlib version for reproducibility (Out: '3.1.1').
matplotlib.__version__
Out[4]:
'3.1.1'
In [5]:
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from mlxtend.regressor import StackingCVRegressor
import xgboost as xgb
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import zipfile
from sklearn.model_selection import KFold
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
In [6]:
# Load the combined training/validation set from a local Excel file
# ('训练验证集.xlsx' = "training/validation set"; path is relative to the notebook).
df=pd.read_excel('训练验证集.xlsx')
In [7]:
# Split the frame: every column except the last two is a feature;
# the second-to-last column is the regression target (kept as a 1-column frame).
x_train = df.iloc[:, :-2]
y_train = df.iloc[:, -2:-1]
In [8]:
# Keep the feature names before converting to raw arrays — the SHAP plots
# later need them for axis labels.
base_features = x_train.columns.values.tolist()

# Convert frames to plain numpy matrices for fitting.
# (The original cell had a bare `base_features` expression here; mid-cell it
# displayed nothing, so the dead statement is removed.)
x_train = np.array(x_train)
y_train = np.array(y_train)

rf = RandomForestRegressor(
    n_estimators=151,
    min_samples_split=6,
    max_features=0.32757967156069284,  # fraction of features tried per split (float => proportion)
    max_depth=15,
    random_state=2,  # fixed seed for reproducible trees
)
In [11]:
# Fit the forest. np.ravel flattens the (n, 1) column-vector target to 1-D,
# which is the shape sklearn expects for y (avoids DataConversionWarning).
rf.fit(x_train, np.ravel(y_train))
Out[11]:
RandomForestRegressor(max_depth=15, max_features=0.32757967156069284,
min_samples_split=6, n_estimators=151, random_state=2)
In [12]:
# Sanity check: number of fitted trees should equal n_estimators (151).
len(rf.estimators_)
Out[12]:
151
In [13]:
# Pull out one individual tree (index 7) from the ensemble for inspection.
estimator=rf.estimators_[7]
estimator
Out[13]:
DecisionTreeRegressor(max_depth=15, max_features=0.32757967156069284,
min_samples_split=6, random_state=2081981515)
SHAP 值计算
In [14]:
import shap
shap.initjs() # load the JS required for SHAP's interactive visualizations in a notebook
In [15]:
# Build a TreeExplainer over the fitted random forest.
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(x_train) # pass the feature matrix X; yields one SHAP value per sample per feature
In [16]:
# One row of SHAP values per training sample (Out: 1612).
len(shap_values)
Out[16]:
1612
In [17]:
# Raw SHAP value matrix, shape (n_samples, n_features).
shap_values
Out[17]:
array([[-3.96753355e-03, -6.51443856e-04, 1.32161205e-02, ..., 5.74927788e-06, -3.03458209e-04, -1.96459520e-03],
[-7.41014457e-02, -1.96309485e-03, -6.40134377e-02, ..., -4.14909325e-04, -1.11277680e-03, 1.13786111e-02],
[ 1.52521667e-03, -8.22765108e-03, -5.97435647e-03, ..., 2.44378338e-04, -1.87738210e-03, 7.86619630e-03],
...,
[ 1.26195934e-02, -7.90231905e-03, 1.92048901e-02, ..., 2.99581980e-04, 3.06260106e-03, -7.44143871e-03],
[ 1.82118786e-02, 9.21700844e-03, -3.05057511e-02, ..., 6.59092515e-04, 2.49748872e-03, 1.06991509e-02],
[-5.78209781e-03, -1.51761964e-02, 3.70238744e-02, ..., 4.06063173e-04, 3.03302050e-03, 6.24285396e-03]])
可视化预测的解释
In [21]:
# Restore the DataFrame versions of x_train/y_train (they were converted to
# numpy arrays before fitting) — decision_plot reads feature names from the frame.
y_train=df.iloc[:,-2:-1]
x_train=df.iloc[:,:-2]
# Configure matplotlib so Chinese glyphs and minus signs render correctly
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus'] = False
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Decision plot over the training samples
shap.decision_plot(explainer.expected_value, shap_values, x_train)
# Cluster similar decision paths together to expose the typical paths
shap.decision_plot(explainer.expected_value, shap_values, x_train,feature_order="hclust")
# Log-odds scaling transform
# NOTE(review): link="logit" interprets values as log-odds, which is meant for
# classifiers; this model is a regressor, so the transformed axis may be
# misleading here — confirm this plot is intended.
shap.decision_plot(explainer.expected_value, shap_values, x_train,link="logit")
# Plot a single sample (row 0)
shap.decision_plot(explainer.expected_value, shap_values[0,:], x_train.iloc[0,:])