模型可解释分析-PDP和ICE图(随机森林)

2,267 阅读2分钟

本文已参与「新人创作礼」活动,一起开启掘金创作之路

import pdpbox

In [2]:

pdpbox.__version__

Out[2]:

'0.2.1'

In [3]:

import matplotlib

In [4]:

matplotlib.__version__

Out[4]:

'3.1.1'

In [5]:

import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from mlxtend.regressor import StackingCVRegressor
import xgboost as xgb
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import zipfile
from sklearn.model_selection import KFold
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor

In [6]:

import sklearn
sklearn.__version__

Out[6]:

'1.0.2'

In [7]:

# Load the combined training/validation sheet into a DataFrame.
df = pd.read_excel(io='训练验证集.xlsx')

In [8]:

# Features: every column except the last two.
x_train = df.iloc[:, :-2]
# Target: the second-to-last column, kept as a single-column DataFrame.
y_train = df.iloc[:, -2:-1]

In [9]:

# Remember the feature column names before the frames are turned into arrays.
base_features = x_train.columns.to_numpy().tolist()
base_features  # no-op here: only the last expression of a cell is displayed

# Convert features and target to NumPy arrays.
x_train, y_train = np.array(x_train), np.array(y_train)

# Random forest with fixed hyper-parameters; max_features is given as a
# fraction of the columns, and random_state pins the seed.
rf = RandomForestRegressor(
    n_estimators=151,
    max_depth=15,
    min_samples_split=6,
    max_features=0.32757967156069284,
    random_state=2,
)

In [12]:

# y_train is an (n, 1) column vector (it was sliced with iloc[:, -2:-1]);
# scikit-learn expects a 1-D target for a single output and otherwise emits a
# DataConversionWarning, so flatten it before fitting.
rf.fit(x_train, np.ravel(y_train))

Out[12]:

RandomForestRegressor(max_depth=15, max_features=0.32757967156069284,
                      min_samples_split=6, n_estimators=151, random_state=2)

In [13]:

# Number of fitted trees held by the forest.
len(rf.estimators_)

Out[13]:

151

In [14]:

# Pick the tree at index 7 of the fitted forest and display it.
estimator=rf.estimators_[7]
estimator

Out[14]:

DecisionTreeRegressor(max_depth=15, max_features=0.32757967156069284,
                      min_samples_split=6, random_state=2081981515)

先验和后验分布图(target_plot 展示真实目标值随特征的分布,actual_plot 展示模型预测值随特征的分布)

In [14]:

from pdpbox import info_plots,get_dataset,pdp

In [20]:

# Fix rendering of Chinese text and of the minus sign in matplotlib figures:
# use the SimHei font for sans-serif text and disable the Unicode minus glyph.
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# IPython magics: draw figures inline and request the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [23]:

# Distribution of the model's predictions across buckets of one feature
# (i.e. taking the model into account).
# pdpbox only accepts pandas DataFrames, while x_train may already have been
# converted to an ndarray, so rebuild a frame with the original column names.
x_train_df = pd.DataFrame(x_train, columns=base_features)
fig, axes, summary_df = info_plots.actual_plot(
    model=rf,
    X=x_train_df,
    feature='挤出机1区(℃)实际',
    feature_name='挤出机1区温度',
    predict_kwds={},
)

image.png

# Distribution of the raw target across buckets of one feature: a view of the
# original data only, with no model involved.
fig, axes, summary_df = info_plots.target_plot(
    df=df,
    target="PFE过滤效率",
    feature='挤出机1区(℃)实际',
    feature_name='挤出机1区温度',
)

image.png

PDP图

In [47]:

# Column to analyse and the label shown on the plot.
feature = '挤出机1区(℃)实际'
feature_name = '挤出机1区温度'

# pdpbox only accepts pandas DataFrames, while x_train may already have been
# converted to an ndarray, so rebuild a frame with the original column names.
x_train_df = pd.DataFrame(x_train, columns=base_features)

# Compute the partial dependence of the model output on the single feature.
pdp_dist = pdp.pdp_isolate(
    model=rf,
    dataset=x_train_df,
    model_features=base_features,
    feature=feature,
)

# Draw the PDP curve.
fig, axes = pdp.pdp_plot(pdp_dist, feature_name)

image.png

ICE图

In [48]:

# Column to analyse and the label shown on the plot.
feature = '挤出机1区(℃)实际'
feature_name = '挤出机1区温度'

# pdpbox only accepts pandas DataFrames, while x_train may already have been
# converted to an ndarray, so rebuild a frame with the original column names.
x_train_df = pd.DataFrame(x_train, columns=base_features)

pdp_dist = pdp.pdp_isolate(
    model=rf,
    dataset=x_train_df,
    model_features=base_features,
    feature=feature,
)

# ICE view: centred curves, individual lines for 80% of the samples, and the
# distribution of the data points along the feature axis.
fig, axes = pdp.pdp_plot(
    pdp_dist,
    feature_name,
    center=True,
    plot_lines=True,
    frac_to_plot=0.8,
    plot_pts_dist=True,
)

image.png

循环遍历,给每一列都画一个pdp图

In [49]:

# pdpbox only accepts pandas DataFrames, while x_train may already have been
# converted to an ndarray; build the frame once, outside the loop.
x_train_df = pd.DataFrame(x_train, columns=base_features)

# Draw one PDP per feature column.
for each in base_features:
    feature = each
    pdp_dist = pdp.pdp_isolate(
        model=rf,
        dataset=x_train_df,
        model_features=base_features,
        feature=feature,
    )

    # Label each plot with its own column name; the previous code reused the
    # stale global `feature_name`, so every plot carried the same label.
    fig, axes = pdp.pdp_plot(pdp_dist, feature)

    plt.show()

绘制二维pdp图

In [50]:

from pdpbox import pdp, get_dataset

In [51]:

# 解决中文和负号显示问题
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus'] = False
%matplotlib inline
%config InlineBackend.figure_format = 'retina'


inter1 = pdp.pdp_interact(model=rf,
                          dataset=x_train,
                          model_features=base_features,
                          features=['挤出机1区(℃)实际', '挤出机2区(℃)实际'],
                          num_grid_points=[10, 10],
                          percentile_ranges=[(5, 95), (5, 95)])
fig, axes = pdp.pdp_interact_plot(pdp_interact_out=inter1,
                                  feature_names=['挤出机1区(℃)实际', '挤出机2区(℃)实际'],
                                  plot_type='contour',
                                  x_quantile=True,
                                  plot_pdp=True)

image.png

# Evenly spaced axes: build the ticks from the actual feature values instead
# of quantiles. The previous call passed x_quantile=True, which made this plot
# identical to the quantile-based contour plot before it.
fig, axes = pdp.pdp_interact_plot(pdp_interact_out=inter1,
                                  feature_names=['挤出机1区(℃)实际', '挤出机2区(℃)实际'],
                                  plot_type='contour',
                                  x_quantile=False,
                                  plot_pdp=True)

image.png

# Same interaction result rendered in matrix (grid) form.
fig, axes = pdp.pdp_interact_plot(
    pdp_interact_out=inter1,
    feature_names=['挤出机1区(℃)实际', '挤出机2区(℃)实际'],
    plot_type='grid',
    plot_pdp=True,
    x_quantile=True,
)

image.png

sklearn实现pdp图

In [15]:

from sklearn.inspection import PartialDependenceDisplay

In [17]:

# One-way partial dependence for columns 0 and 1, plus the two-way plot for
# the pair (0, 1).
features = [0, 1, (0, 1)]
PartialDependenceDisplay.from_estimator(estimator=rf, X=x_train, features=features)

image.png

from sklearn.inspection import partial_dependence

# Ask for the averaged partial dependence explicitly and read the result by
# key. Tuple-unpacking the return value only works with the deprecated
# kind='legacy' default of scikit-learn < 1.1; on newer versions it would
# unpack the key names of the returned Bunch instead of the arrays.
pd_result = partial_dependence(rf, x_train, [0], kind='average')
# The grid is stored under 'values' in older releases and 'grid_values' in
# newer ones.
grid_key = 'grid_values' if 'grid_values' in pd_result else 'values'
# NOTE: `pdp` rebinds the name of the pdpbox module imported earlier; re-import
# it before running any pdpbox cell again.
pdp, axes = pd_result['average'], pd_result[grid_key]
pdp

Out[19]:

array([[95.68515357, 95.68515357, 95.68426954, 95.68426954, 95.69376765,
        95.69412572, 95.6890373 , 95.68920405, 95.68907742, 95.68907742,
        95.6888087 , 95.6888087 , 95.68626793, 95.68549023, 95.68327186,
        95.68480083, 95.69438092, 95.69939998, 95.71252517, 95.72603375,
        95.73258157, 95.73683229, 95.75477367, 95.76273241, 95.76474213,
        95.75677723, 95.76798666, 95.76834687, 95.76636083, 95.76337077,
        95.75180081, 95.7483819 , 95.74601886, 95.74455192, 95.74505961,
        95.74571943, 95.74577111, 95.74523812, 95.74440591, 95.74436503,
        95.7450667 , 95.7450667 , 95.74510521, 95.74529334]])

In [20]:

axes

Out[20]:

[array([-5.43428604e+00, -5.20774039e+00, -4.98119475e+00, -4.75464911e+00,
        -4.52810347e+00, -4.30155783e+00, -4.07501219e+00, -3.84846655e+00,
        -3.62192091e+00, -3.39537527e+00, -2.94228399e+00, -2.71573834e+00,
        -2.48919270e+00, -2.26264706e+00, -2.03610142e+00, -1.80955578e+00,
        -1.58301014e+00, -1.35646450e+00, -1.12991886e+00, -9.03373218e-01,
        -6.76827577e-01, -4.50281936e-01, -2.23736295e-01,  2.80934574e-03,
         2.29354987e-01,  4.55900627e-01,  6.82446268e-01,  9.08991909e-01,
         1.13553755e+00,  1.36208319e+00,  1.58862883e+00,  1.81517447e+00,
         2.04172011e+00,  2.26826575e+00,  2.49481140e+00,  2.72135704e+00,
         2.94790268e+00,  3.17444832e+00,  3.40099396e+00,  3.62753960e+00,
         3.85408524e+00,  4.08063088e+00,  4.30717652e+00,  4.53372216e+00])]

sklearn实现ICE图

In [21]:

# ICE curves (one line per sample) for columns 0 and 1.
features = [0, 1]
PartialDependenceDisplay.from_estimator(
    estimator=rf, X=x_train, features=features, kind='individual'
)

image.png

# ICE curves together with the averaged PDP curve for columns 0 and 1.
features = [0, 1]
PartialDependenceDisplay.from_estimator(
    estimator=rf, X=x_train, features=features, kind='both'
)

image.png