模型可解释性分析:SHAP 决策图(随机森林)

1,965 阅读1分钟

本文已参与「新人创作礼」活动,一起开启掘金创作之路

import pdpbox

In [2]:

# Show the installed pdpbox version.
pdpbox.__version__

Out[2]:

'0.2.1'

In [3]:

import matplotlib

In [4]:

# Show the installed matplotlib version.
matplotlib.__version__

Out[4]:

'3.1.1'

In [5]:

import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from mlxtend.regressor import StackingCVRegressor
import xgboost as xgb
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import zipfile
from sklearn.model_selection import KFold
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor

In [6]:

# Load the dataset from the Excel workbook into a DataFrame.
df = pd.read_excel('训练验证集.xlsx')

In [7]:

# Features: every column except the last two.
x_train = df.iloc[:, :-2]

# Target: the second-to-last column. Selecting it by scalar position (-2)
# yields a 1-D Series; a slice (-2:-1) would yield an (n, 1) frame, and
# scikit-learn's fit() then emits a DataConversionWarning about a
# column-vector y (silenced in this notebook by the global warnings filter).
# NOTE(review): the last column is used neither as a feature nor as the
# target -- confirm that this is intended.
y_train = df.iloc[:, -2]

In [8]:

# Keep the feature (column) names as a plain list before the frame is
# converted to an array.
base_features = x_train.columns.values.tolist()
base_features

# Convert features and target to NumPy arrays.
x_train, y_train = np.array(x_train), np.array(y_train)

# Random forest regressor with fixed hyper-parameters; random_state pins the
# seed so repeated runs build the same forest.
rf = RandomForestRegressor(
    n_estimators=151,
    max_depth=15,
    min_samples_split=6,
    max_features=0.32757967156069284,  # float
    random_state=2,
)

In [11]:

# Fit the forest on the training arrays.
rf.fit(x_train, y_train)

Out[11]:

RandomForestRegressor(max_depth=15, max_features=0.32757967156069284,
                      min_samples_split=6, n_estimators=151, random_state=2)

In [12]:

# Number of fitted trees held by the forest.
len(rf.estimators_)

Out[12]:

151

In [13]:

# Pull one individual tree (index 7) out of the fitted forest and display it.
estimator = rf.estimators_[7]
estimator

Out[13]:

DecisionTreeRegressor(max_depth=15, max_features=0.32757967156069284,
                      min_samples_split=6, random_state=2081981515)

SHAP 值计算

In [14]:

import shap

# In a notebook environment, load the JavaScript code used for visualization.
shap.initjs()

In [15]:

# Build a tree explainer around the fitted forest.
explainer = shap.TreeExplainer(rf)

# Pass in the feature matrix X and compute its SHAP values.
shap_values = explainer.shap_values(x_train)

In [16]:

# Length of the first axis of shap_values (presumably one row per sample -- confirm).
len(shap_values)

Out[16]:

1612

In [17]:

# Display the raw SHAP value array.
shap_values

Out[17]:

array([[-3.96753355e-03, -6.51443856e-04,  1.32161205e-02, ...,         5.74927788e-06, -3.03458209e-04, -1.96459520e-03],
       [-7.41014457e-02, -1.96309485e-03, -6.40134377e-02, ...,        -4.14909325e-04, -1.11277680e-03,  1.13786111e-02],
       [ 1.52521667e-03, -8.22765108e-03, -5.97435647e-03, ...,         2.44378338e-04, -1.87738210e-03,  7.86619630e-03],
       ...,
       [ 1.26195934e-02, -7.90231905e-03,  1.92048901e-02, ...,         2.99581980e-04,  3.06260106e-03, -7.44143871e-03],
       [ 1.82118786e-02,  9.21700844e-03, -3.05057511e-02, ...,         6.59092515e-04,  2.49748872e-03,  1.06991509e-02],
       [-5.78209781e-03, -1.51761964e-02,  3.70238744e-02, ...,         4.06063173e-04,  3.03302050e-03,  6.24285396e-03]])

可视化预测的解释

In [21]:

y_train=df.iloc[:,-2:-1]
x_train=df.iloc[:,:-2]

# 解决中文和负号显示问题
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus'] = False
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

shap.decision_plot(explainer.expected_value, shap_values, x_train)

image.png

# View typical decision paths by using the "hclust" feature ordering.
shap.decision_plot(
    explainer.expected_value,
    shap_values,
    x_train,
    feature_order="hclust",
)

image.png

# Log-odds (logit) scaling transform of the plot.
# NOTE(review): rf is a RandomForestRegressor; a logit link is normally meant
# for models whose output is in log-odds -- confirm this plot is meaningful here.
shap.decision_plot(
    explainer.expected_value,
    shap_values,
    x_train,
    link="logit",
)

image.png

# Plot the decision path of a single sample (the first row).
shap.decision_plot(
    explainer.expected_value,
    shap_values[0],
    x_train.iloc[0],
)

image.png