本文已参与「新人创作礼」活动,一起开启掘金创作之路
import pdpbox
In [2]:
pdpbox.__version__
Out[2]:
'0.2.1'
In [3]:
import matplotlib
In [4]:
matplotlib.__version__
Out[4]:
'3.1.1'
In [5]:
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from mlxtend.regressor import StackingCVRegressor
import xgboost as xgb
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import zipfile
from sklearn.model_selection import KFold
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
In [6]:
import sklearn
sklearn.__version__
Out[6]:
'1.0.2'
In [7]:
# Load the combined training/validation sheet into a DataFrame.
df = pd.read_excel(io='训练验证集.xlsx')
In [8]:
# Column layout: every column except the last two is a feature; the
# second-to-last column is the target (kept 2-D as a one-column frame).
feature_cols = slice(None, -2)
target_col = slice(-2, -1)
x_train = df.iloc[:, feature_cols]
y_train = df.iloc[:, target_col]
In [9]:
# Remember the feature column names before the frames are turned into
# bare arrays (the names are needed later for the pdpbox plots).
base_features = x_train.columns.tolist()
base_features
# Convert features and target to NumPy arrays.
x_train, y_train = np.array(x_train), np.array(y_train)
# Random-forest hyperparameters; a float max_features is a fraction of
# the feature count rather than an absolute number.
rf_params = {
    'n_estimators': 151,
    'min_samples_split': 6,
    'max_features': 0.32757967156069284,
    'max_depth': 15,
    'random_state': 2,
}
rf = RandomForestRegressor(**rf_params)
In [12]:
# y_train is an (n, 1) column vector; flatten it so the regressor gets the
# 1-D target it expects instead of relying on an implicit conversion (whose
# warning is hidden by the global warnings filter).
rf.fit(x_train, np.ravel(y_train))
Out[12]:
RandomForestRegressor(max_depth=15, max_features=0.32757967156069284,
min_samples_split=6, n_estimators=151, random_state=2)
In [13]:
# Number of fitted trees in the forest.
n_trees = len(rf.estimators_)
n_trees
Out[13]:
151
In [14]:
# Pull a single fitted tree out of the forest for inspection.
tree_index = 7
estimator = rf.estimators_[tree_index]
estimator
Out[14]:
DecisionTreeRegressor(max_depth=15, max_features=0.32757967156069284,
min_samples_split=6, random_state=2081981515)
先验和后验分布图
In [14]:
from pdpbox import info_plots,get_dataset,pdp
In [20]:
# 解决中文和负号显示问题
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus'] = False
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
In [23]:
# pdpbox only accepts pandas DataFrames and looks the feature up by column
# name, but x_train was converted to a bare NumPy array earlier. Rebuild a
# labelled frame (this is a no-op reselection if x_train is still a DataFrame).
x_frame = pd.DataFrame(x_train, columns=base_features)

# Model-aware view: how rf's predictions are distributed across buckets
# of the feature.
fig, axes, summary_df = info_plots.actual_plot(
    model=rf,
    X=x_frame,
    feature='挤出机1区(℃)实际',
    feature_name='挤出机1区温度',
    predict_kwds={})

# Model-free view: the raw target plotted against the same feature,
# straight from the original data.
fig, axes, summary_df = info_plots.target_plot(
    df=df,
    feature='挤出机1区(℃)实际',
    feature_name='挤出机1区温度',
    target="PFE过滤效率")
PDP图
In [47]:
# Partial dependence of the forest's prediction on one feature.
feature = '挤出机1区(℃)实际'
feature_name = '挤出机1区温度'
# pdpbox requires a pandas DataFrame whose columns match model_features;
# x_train is a bare NumPy array at this point, so wrap it with the saved
# column names (harmless if x_train is already a DataFrame).
pdp_frame = pd.DataFrame(x_train, columns=base_features)
pdp_dist = pdp.pdp_isolate(
    model=rf,
    dataset=pdp_frame,
    model_features=base_features,
    feature=feature)
fig, axes = pdp.pdp_plot(pdp_dist, feature_name)
ICE图
In [48]:
# ICE plot: individual per-sample curves drawn together with the averaged PDP.
feature = '挤出机1区(℃)实际'
feature_name = '挤出机1区温度'
# pdpbox requires a pandas DataFrame whose columns match model_features;
# x_train is a bare NumPy array at this point, so wrap it with the saved
# column names (harmless if x_train is already a DataFrame).
pdp_frame = pd.DataFrame(x_train, columns=base_features)
pdp_dist = pdp.pdp_isolate(
    model=rf,
    dataset=pdp_frame,
    model_features=base_features,
    feature=feature)
# center=True anchors every curve at the first grid point; plot_lines with
# frac_to_plot=0.8 draws 80% of the individual lines; plot_pts_dist adds the
# data-point distribution under the plot.
fig, axes = pdp.pdp_plot(
    pdp_dist,
    feature_name,
    center=True,
    plot_lines=True,
    frac_to_plot=0.8,
    plot_pts_dist=True)
循环遍历,给每一列都画一个pdp图
In [49]:
# Draw one PDP figure per model feature.
# pdpbox requires a pandas DataFrame whose columns match model_features;
# x_train is a bare NumPy array at this point, so wrap it once, outside the
# loop (harmless if x_train is already a DataFrame).
pdp_frame = pd.DataFrame(x_train, columns=base_features)
for each in base_features:
    feature = each
    pdp_dist = pdp.pdp_isolate(
        model=rf,
        dataset=pdp_frame,
        model_features=base_features,
        feature=feature)
    # Title each figure with its own column name; reusing the feature_name
    # left over from an earlier cell would label every plot as the same
    # feature.
    fig, axes = pdp.pdp_plot(pdp_dist, each)
    plt.show()
绘制二维pdp图
In [50]:
from pdpbox import pdp, get_dataset
In [51]:
# 解决中文和负号显示问题
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus'] = False
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Two-feature partial dependence over a 10 x 10 grid, restricted to the
# 5th-95th percentile range of each feature.
# pdpbox requires a pandas DataFrame whose columns match model_features;
# x_train is a bare NumPy array at this point, so wrap it with the saved
# column names (harmless if x_train is already a DataFrame).
interact_frame = pd.DataFrame(x_train, columns=base_features)
inter1 = pdp.pdp_interact(
    model=rf,
    dataset=interact_frame,
    model_features=base_features,
    features=['挤出机1区(℃)实际', '挤出机2区(℃)实际'],
    num_grid_points=[10, 10],
    percentile_ranges=[(5, 95), (5, 95)])
# Contour view of the two-feature interaction, with the single-feature
# PDPs drawn alongside (plot_pdp=True).
zone_pair = ['挤出机1区(℃)实际', '挤出机2区(℃)实际']
fig, axes = pdp.pdp_interact_plot(
    pdp_interact_out=inter1,
    feature_names=zone_pair,
    plot_type='contour',
    x_quantile=True,
    plot_pdp=True,
)
# Contour view with the axes split uniformly.
# NOTE(review): these arguments are identical to the preceding contour call,
# so this draws the same figure again — one of the two calls presumably
# meant to use a different x_quantile setting; confirm the intent.
contour_options = {
    'plot_type': 'contour',
    'x_quantile': True,
    'plot_pdp': True,
}
fig, axes = pdp.pdp_interact_plot(
    pdp_interact_out=inter1,
    feature_names=['挤出机1区(℃)实际', '挤出机2区(℃)实际'],
    **contour_options)
# Same interaction rendered as a grid (matrix) instead of contours.
grid_feature_names = ['挤出机1区(℃)实际', '挤出机2区(℃)实际']
fig, axes = pdp.pdp_interact_plot(
    pdp_interact_out=inter1,
    feature_names=grid_feature_names,
    plot_type='grid',
    x_quantile=True,
    plot_pdp=True,
)
sklearn实现pdp图
In [15]:
from sklearn.inspection import PartialDependenceDisplay
In [17]:
# One-way PDPs for columns 0 and 1, plus the two-way PDP for the pair.
features = [0, 1, (0, 1)]
PartialDependenceDisplay.from_estimator(
    estimator=rf,
    X=x_train,
    features=features,
)
from sklearn.inspection import partial_dependence
# Raw partial-dependence numbers for feature column 0.
# The result is bound to pdp_values rather than pdp: rebinding pdp would
# clobber the pdpbox `pdp` module imported earlier and break every pdpbox
# cell that is re-run afterwards.
# NOTE(review): the tuple unpacking relies on the return format of the
# sklearn version shown in this notebook (1.0.2); newer releases may return
# a Bunch instead — confirm before upgrading.
pdp_values, axes = partial_dependence(rf, x_train, [0])
pdp_values
Out[19]:
array([[95.68515357, 95.68515357, 95.68426954, 95.68426954, 95.69376765,
95.69412572, 95.6890373 , 95.68920405, 95.68907742, 95.68907742,
95.6888087 , 95.6888087 , 95.68626793, 95.68549023, 95.68327186,
95.68480083, 95.69438092, 95.69939998, 95.71252517, 95.72603375,
95.73258157, 95.73683229, 95.75477367, 95.76273241, 95.76474213,
95.75677723, 95.76798666, 95.76834687, 95.76636083, 95.76337077,
95.75180081, 95.7483819 , 95.74601886, 95.74455192, 95.74505961,
95.74571943, 95.74577111, 95.74523812, 95.74440591, 95.74436503,
95.7450667 , 95.7450667 , 95.74510521, 95.74529334]])
In [20]:
# Second value returned by the partial_dependence call above (presumably the
# grid of feature values the averages were evaluated on — confirm against
# the sklearn documentation).
axes
Out[20]:
[array([-5.43428604e+00, -5.20774039e+00, -4.98119475e+00, -4.75464911e+00,
-4.52810347e+00, -4.30155783e+00, -4.07501219e+00, -3.84846655e+00,
-3.62192091e+00, -3.39537527e+00, -2.94228399e+00, -2.71573834e+00,
-2.48919270e+00, -2.26264706e+00, -2.03610142e+00, -1.80955578e+00,
-1.58301014e+00, -1.35646450e+00, -1.12991886e+00, -9.03373218e-01,
-6.76827577e-01, -4.50281936e-01, -2.23736295e-01, 2.80934574e-03,
2.29354987e-01, 4.55900627e-01, 6.82446268e-01, 9.08991909e-01,
1.13553755e+00, 1.36208319e+00, 1.58862883e+00, 1.81517447e+00,
2.04172011e+00, 2.26826575e+00, 2.49481140e+00, 2.72135704e+00,
2.94790268e+00, 3.17444832e+00, 3.40099396e+00, 3.62753960e+00,
3.85408524e+00, 4.08063088e+00, 4.30717652e+00, 4.53372216e+00])]
sklearn实现ICE图
In [21]:
# ICE curves for feature columns 0 and 1.
features = [0, 1]
# kind='individual' draws only the per-sample curves.
PartialDependenceDisplay.from_estimator(
    estimator=rf,
    X=x_train,
    features=features,
    kind='individual',
)
# kind='both' overlays the averaged PDP on the per-sample curves.
PartialDependenceDisplay.from_estimator(
    estimator=rf,
    X=x_train,
    features=features,
    kind='both',
)