# 特征工程-使用随机森林进行缺失值填补

·  阅读 105

「这是我参与2022首次更文挑战的第2天，活动详情查看：2022首次更文挑战」。

## 二、缺失值填补

1. 删除有缺省值的数据
2. 使用数据中该特征的均值填充缺失值
3. 使用数据中该特征的中位数填充缺失值
4. 使用数据中该特征的众数填充缺失值
5. 使用机器学习模型对缺失值进行填充

## 三、数据预处理

### 3.1、处理思路

| name  | sex    | age | target |
| ----- | ------ | --- | ------ |
| zack  | male   | 20  | 1      |
| rudy  | male   | 30  | 1      |
| alice | female | 20  | 0      |
| atom  | male   | 31  | 0      |
| alex  | female | 32  | 1      |
| kerry | female |     | 0      |
| king  |        | 20  | 1      |
| nyx   | male   | 20  | 1      |
| petty | female |     | 0      |

| name  | gender | age | city    | target |
| ----- | ------ | --- | ------- | ------ |
| zack  | male   | 21  | city_01 | 1      |
| alice | female | 22  | city_02 | 0      |

| name  | gender=male | gender=female | age | city=city_01 | city=city_02 | city=city_03 | target |
| ----- | ----------- | ------------- | --- | ------------ | ------------ | ------------ | ------ |
| zack  | 1           | 0             | 21  | 1            | 0            | 0            | 1      |
| alice | 0           | 1             | 22  | 0            | 1            | 0            | 0      |

### 3.2、代码实现

``````import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer

# 创建DictVectorizer
dv = DictVectorizer(sparse=False)
# 读取数据
# 删除name列
df = df.drop(['name'], axis=1)
# 裁剪出特征值
X = df.iloc[:, 0:-1]

# 遍历特征值的列
for colum in X.iteritems():
# 对非数值型列进行处理（多类别数据）
if colum[1].dtype == np.object_:
# 拆分出列名和数据
feature_name, data = colum

# ①、将该列转换成字典
colum = data.map(lambda x: {feature_name: x})
colum = dv.fit_transform(colum)
# 多分类特征名转换后的特征名，如gender->[gender=male, gender=female]
features = dv.get_feature_names_out()
# 将新创建的列添加进去
X[features] = colum
# 删除当前列
X = X.drop([feature_name], axis=1)
# ②、如果原先值是空，则吧所以新添加的列设置为nan
if list(features).__contains__(feature_name):
features = list(features)
features.remove(feature_name)
features = np.array(features)

# 对于特征值是null的数据，转换后的各个特征也应为null
# 如：gender为null,那gender=male为null，gender=female为null

### 3.3、代码解析

#### （1）问题①

``````from sklearn.feature_extraction import DictVectorizer
# 待处理字典列表
data = [
{"gender": "male"},
{"gender": "female"},
{"gender": "unknow"},
{"gender": "male"},
{"gender": "male"}
]
dv = DictVectorizer(sparse=False)
# 转换数据
data = dv.fit_transform(data)
print(dv.get_feature_names_out())
print(data)

```
['gender=female' 'gender=male' 'gender=unknow']
[[0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]]
```

```python
colum = data.map(lambda x: {feature_name: x})
```

#### （2）问题②

``````from sklearn.feature_extraction import DictVectorizer
data = [
{"gender": "male"},
{"gender": "female"},
{"gender": "unknow"},
{"gender": "male"},
{"gender": None}
]
dv = DictVectorizer(sparse=False)
data = dv.fit_transform(data)
print(dv.get_feature_names_out())
print(data)

```
['gender' 'gender=female' 'gender=male' 'gender=unknow']
[[ 0.  0.  1.  0.]
 [ 0.  1.  0.  0.]
 [ 0.  0.  0.  1.]
 [ 0.  0.  1.  0.]
 [nan  0.  0.  0.]]
```

## 四、使用随机森林填补缺失值

| height | weight | age |
| ------ | ------ | --- |
| 181    | 70     | 20  |
| 178    |        | 18  |
| 160    | 50     |     |
| 170    | 60     | 19  |

### 4.2、代码实现

``````y = df.iloc[:, [-1]]

# 按照当前列缺失值的数量进行升序排列
sortindex = np.argsort(X.isnull().sum(axis=0)).values #axis=0按列进行加和
for i in sortindex:
# 将当前列作为目标值
feature_i = X.iloc[:, i]

# 将其余列作为特征值（包括目标值）
tmp_df = pd.concat([X.iloc[:, X.columns != i], y], axis=1)
# 使用众数填充其余列缺失值
imp_mf = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
tmp_df_mf = imp_mf.fit_transform(tmp_df)

# 将feature_i中非空的样本作为训练数据
y_notnull = feature_i[feature_i.notnull()]
y_null = feature_i[feature_i.isnull()]
X_notnull = tmp_df_mf[y_notnull.index, :]
X_null = tmp_df_mf[y_null.index, :]

# 如果没有缺失值则填充下一列
if y_null.shape[0] == 0:
continue

# 建立随机森林回归树进行训练
rfc = RandomForestRegressor(n_estimators=100)
rfc = rfc.fit(X_notnull, y_notnull)

# 对缺失值进行预测
y_predict = rfc.predict(X_null)

# 填充缺失值
X.loc[X.iloc[:, i].isnull(), X.columns[i]] = y_predict

``````import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.impute import SimpleImputer

dv = DictVectorizer(sparse=False)
name = df['name']
df = df.drop(['name'], axis=1)
X = df.iloc[:, 0:-1]

# 遍历数据的列
for colum in X.iteritems():
# 对非数值型列进行处理
if colum[1].dtype == np.object_:
# 拆分出列名和数据
feature_name, data = colum

# 将该列转换成字典
colum = data.map(lambda x: {feature_name: x})
colum = dv.fit_transform(colum)
features = dv.get_feature_names_out()

# 将新创建的列添加进去
X[features] = colum

# 删除当前列
X = X.drop([feature_name], axis=1)

# 如果原先值是空，则吧所以新添加的列设置为nan
if list(features).__contains__(feature_name):
features = list(features)
features.remove(feature_name)
features = np.array(features)

y = df.iloc[:, [-1]]

# 按照当前列缺失值的数量进行升序排列
sortindex = np.argsort(X.isnull().sum(axis=0)).values
for i in sortindex:
# 将当前列作为目标值
feature_i = X.iloc[:, i]

# 将其余列作为特征值（包括目标值）
tmp_df = pd.concat([X.iloc[:, X.columns != i], y], axis=1)
# 使用众数填充其余列缺失值
imp_mf = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
tmp_df_mf = imp_mf.fit_transform(tmp_df)

# 将feature_i中非空的样本作为训练数据
y_notnull = feature_i[feature_i.notnull()]
y_null = feature_i[feature_i.isnull()]
X_notnull = tmp_df_mf[y_notnull.index, :]
X_null = tmp_df_mf[y_null.index, :]

# 如果没有缺失值则下一列
if y_null.shape[0] == 0:
continue

# 建立随机森林回归树进行训练
rfc = RandomForestRegressor(n_estimators=100)
rfc = rfc.fit(X_notnull, y_notnull)

# 对缺失值进行预测
y_predict = rfc.predict(X_null)

# 填充缺失值
X.loc[X.iloc[:, i].isnull(), X.columns[i]] = y_predict