前言
数据分析这一方法已经在很多领域都获得了良好的应用反馈,它源于人们在海量数据中用科学的方式方法去探索事物发展的规律,了解事物运动的状态,以便于在未来做出更为合理的决策,本文旨在以二手车的交易价格为预测目标进行探索。
1.1 Jupyter设置、导包及数据集加载
导入相关模块。
# Imports, one per line, grouped stdlib / third-party (PEP 8).
# BUG FIX: `from typing import types` is invalid (typing exports no name
# `types`) and raised ImportError at startup — it was unused, so it is removed.
import warnings

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_profiling
import seaborn as sns
import sklearn
from sklearn.exceptions import ConvergenceWarning
拦截警告
# Silence all warnings; name sklearn's ConvergenceWarning explicitly as well
# (the blanket filter already covers it, the second call documents intent).
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=ConvergenceWarning)
防止中文乱码,设置seaborn中文字体。
# Configure a Chinese-capable font so labels and the minus sign render correctly,
# for both matplotlib and seaborn.
mpl.rcParams.update({
    'font.sans-serif': [u'simHei'],
    'axes.unicode_minus': False,
})
sns.set(font='SimHei')
设置jupyter显示行数
# Jupyter display options.
# BUG FIX: 'display.expand_frame_repr' and 'expand_frame_repr' are the SAME
# option (it was set twice); the shorthands 'max_rows'/'max_columns' are
# ambiguous in modern pandas (they also match styler.render.*) and raise
# OptionError — use the full option names, once each.
pd.options.display.min_rows = None          # None: follow max_rows before truncating
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', 30)
加载数据。
# Both files are space-separated CSVs (`delimiter` is an alias of `sep`).
df_train = pd.read_csv('used_car_train_20200313.csv', delimiter=' ', encoding='utf-8')
df_test = pd.read_csv('used_car_testB_20200421.csv', delimiter=' ', encoding='utf-8')
1.2 探索性分析
1.2.1 预览数据集
-预览数据集
df_train.head().append(df_train.tail())
————————————————————————————————————————————————————————
SaleID name regDate model brand bodyType fuelType gearbox power kilometer notRepairedDamage regionCode seller offerType creatDate ... v_0 v_1 v_2 v_3 v_4 v_5 v_6 v_7 v_8 v_9 v_10 v_11 v_12 v_13 v_14
0 0 736 20040402 30.0 6 1.0 0.0 0.0 60 12.5 0.0 1046 0 0 20160404 ... 43.357796 3.966344 0.050257 2.159744 1.143786 0.235676 0.101988 0.129549 0.022816 0.097462 -2.881803 2.804097 -2.420821 0.795292 0.914762
1 1 2262 20030301 40.0 1 2.0 0.0 0.0 0 15.0 - 4366 0 0 20160309 ... 45.305273 5.236112 0.137925 1.380657 -1.422165 0.264777 0.121004 0.135731 0.026597 0.020582 -4.900482 2.096338 -1.030483 -1.722674 0.245522
2 2 14874 20040403 115.0 15 1.0 0.0 0.0 163 12.5 0.0 2806 0 0 20160402 ... 45.978359 4.823792 1.319524 -0.998467 -0.996911 0.251410 0.114912 0.165147 0.062173 0.027075 -4.846749 1.803559 1.565330 -0.832687 -0.229963
3 3 71865 19960908 109.0 10 0.0 0.0 1.0 193 15.0 0.0 434 0 0 20160312 ... 45.687478 4.492574 -0.050616 0.883600 -2.228079 0.274293 0.110300 0.121964 0.033395 0.000000 -4.509599 1.285940 -0.501868 -2.438353 -0.478699
4 4 111080 20120103 110.0 5 1.0 0.0 0.0 68 5.0 0.0 6977 0 0 20160313 ... 44.383511 2.031433 0.572169 -1.571239 2.246088 0.228036 0.073205 0.091880 0.078819 0.121534 -1.896240 0.910783 0.931110 2.834518 1.923482
149995 149995 163978 20000607 121.0 10 4.0 0.0 1.0 163 15.0 0.0 4576 0 0 20160327 ... 45.316543 -3.139095 -1.269707 -0.736609 -1.505820 0.280264 0.000310 0.048441 0.071158 0.019174 1.988114 -2.983973 0.589167 -1.304370 -0.302592
149996 149996 184535 20091102 116.0 11 0.0 0.0 0.0 125 10.0 0.0 2826 0 0 20160312 ... 45.972058 -3.143764 -0.023523 -2.366699 0.698012 0.253217 0.000777 0.084079 0.099681 0.079371 1.839166 -2.774615 2.553994 0.924196 -0.272160
149997 149997 147587 20101003 60.0 11 1.0 1.0 0.0 90 6.0 0.0 3302 0 0 20160328 ... 44.733481 -3.105721 0.595454 -2.279091 1.423661 0.233353 0.000705 0.118872 0.100118 0.097914 2.439812 -1.630677 2.290197 1.891922 0.414931
149998 149998 45907 20060312 34.0 10 3.0 1.0 0.0 156 15.0 0.0 1877 0 0 20160401 ... 45.658634 -3.204785 -0.441680 -1.179812 0.620680 0.256369 0.000252 0.081479 0.083558 0.081498 2.075380 -2.633719 1.414937 0.431981 -1.659014
149999 149999 177672 19990204 19.0 28 6.0 0.0 1.0 193 12.5 0.0
- 预览相关统计量
df_train.describe()
————————————————————————————————————————————————————————
- 预览数据类型
df_train.info()
————————————————————————————————————————————————————————
- 预览训练集、测试集维度
df_train.shape,df_test.shape
————————————————————————————————————————————————————————
((150000, 31), (50000, 30))
- 缺失值数量与分布
# Per-column missing-value counts (displayed by the notebook).
df_train.isnull().sum()
# Percentage of missing entries per column.
missing_pct = df_train.isnull().sum() * 100 / len(df_train)
missing = pd.DataFrame({'name': df_train.columns, 'missing_pct': missing_pct})
# Top-5 columns with the most missing data.
missing.sort_values('missing_pct', ascending=False).head()
————————————————————————————————————————————————————————
name missing_pct
fuelType fuelType 5.786667
gearbox gearbox 3.987333
bodyType bodyType 3.004000
model model 0.000667
SaleID SaleID 0.000000
1.2.2 二手车价格的数量与分布
- price在0~1000的样本数量达到120000
- price在1000~2000的样本数量不足20000
- price在2000以上的样本数量相对更少
# Histogram of the raw sale price.
fig = plt.figure(figsize=(15, 8))
df_train['price'].plot(kind='hist', color='pink', alpha=0.75)
plt.title('价格分布', fontsize=23)
1.2.3 品牌与价格
- brand为24的二手车价格最高,达到35000以上
- 绝大多数二手车价格在15000以下
# Mean price per brand.
plt.figure()
mean_price_by_brand = df_train[['brand', 'price']].groupby('brand')['price'].mean()
mean_price_by_brand.plot(kind='bar', color='r')
plt.show()
1.2.4 车身类型与价格
- 车身类型为5,6的二手车价格超过10000
- 车身类型为1的二手车价格较低,不足4000
# Mean price per body type.
plt.figure()
mean_price_by_body = df_train[['bodyType', 'price']].groupby('bodyType')['price'].mean()
mean_price_by_body.plot(kind='bar', color='g')
plt.show()
1.2.5 变速箱、燃油类型与价格
- 手动挡二手车价格普遍较低
- 自动挡二手车价格整体高于手动挡
# Price distribution per fuel type, split by gearbox (0 = manual red, 1 = auto green).
fig = plt.figure(figsize=(25, 10))
gear_palette = {0: "r", 1: "g"}
sns.violinplot(data=df_train, x='fuelType', y='price', hue='gearbox',
               split=True, alpha=0.9, palette=gear_palette)
plt.title('燃油类型-价格分布', fontsize=25)
plt.xlabel('fuelType', fontsize=22)
plt.ylabel('price', fontsize=22)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.legend(fontsize=20)
1.3 数据清洗
- 将测试集添加到训练集中
# Separate the target, then stack train (minus target) on top of test so that
# cleaning / feature engineering is applied to both consistently.
# BUG FIX: DataFrame.append was removed in pandas 2.0 — use pd.concat.
targets = df_train['price']
combined = pd.concat([df_train.drop('price', axis=1), df_test])
1.3.1 缺失值处理
- 存在缺失值的特征
combined.isnull().sum().sort_values(ascending=False)
————————————————————————————————————————————————————————
fuelType 11604
gearbox 7949
bodyType 6010
model 1
v_14 0
- 用 0 填充缺失值(代码实际填充的是 0,而非众数)
# Fill the missing category codes with 0.
for col in ('fuelType', 'gearbox', 'bodyType', 'model'):
    combined[col] = combined[col].fillna(0)
# Confirm nothing is missing any more.
combined.isnull().sum().sort_values(ascending=False)
————————————————————————————————————————————————————————
v_14 0
v_13 0
name 0
regDate 0
model 0
..
v_9 0
1.3.2 异常值处理
- 将power处理为[0,600]范围内
combined['power'] =combined['power'].map(lambda x : x if x<600 else 600)
- 转换price数值使得price更符合正态分布
# Compare the raw price distribution / box plot with its log1p transform.
# BUG FIX: the original called plt.figure() twice in a row, leaving a stray
# empty figure behind — a single sized figure is enough.
plt.figure(figsize=(10, 3))
plt.subplot(1, 3, 1)
sns.distplot(targets)            # NOTE: distplot is deprecated in newer seaborn (histplot)
plt.subplot(1, 3, 2)
targets.plot.box()
plt.subplot(1, 3, 3)
sns.distplot(np.log1p(targets))  # log1p brings the heavy right tail toward normal
plt.xlabel('tr_price')
plt.show()
# Apply the transform: the models are trained on log1p(price)
# (inverted later with np.expm1 before submission).
targets = np.log1p(targets)
- 将notRepairedDamage数据类型改为float,‘-’字符设置为-1
combined['notRepairedDamage'] =combined['notRepairedDamage'].astype(str).map(lambda x : -1 if x=='-' else x).astype(float)
1.4 特征工程
1.4.1处理seller
- 20万样本中只有一个样本的值为1,因此这个特征对于模型泛化没有价值。
# seller is 0 for all but one of the 200k rows — no generalization value.
combined['seller'].value_counts()
# BUG FIX: drop() without inplace=True (or reassignment) was a no-op — the
# later shape of (200000, 26) shows the column was meant to be removed.
combined.drop('seller', axis=1, inplace=True)
————————————————————————————————————————————————————————
0 199999
1 1
Name: seller, dtype: int64
1.4.2 处理offerType
- offerType 在全部样本中只有 0 这一个取值,对预测没有价值,因此删除
df_train['offerType'].value_counts()
1.4.3 处理SaleID、name
- 每一次的交易仅有唯一随机生成的SaleID,name也是交易时产生的,因此对price的预测来看并无价值
# SaleID is a per-transaction random id and name is transaction-generated —
# neither helps predict price, so both identifier columns go.
combined.drop(columns=['name', 'SaleID'], inplace=True)
combined.shape
————————————————————————————————————————————————————————
(200000, 26)
1.4.4 处理regDate、creatDate
- 转换成时间格式后,计算二手车的使用时间
- 转成时间格式,提取时间
def date_modify(date):
    """Convert an int/str date like ``20160404`` into a ``datetime``.

    Some raw dates carry an invalid month of ``00`` (e.g. ``20160004``);
    those are clamped to January. A ``00`` day is clamped to the 1st for
    the same reason (robustness fix — the original raised ValueError there).

    BUG FIX: ``datetime`` was never imported in the original and every call
    raised NameError.
    """
    from datetime import datetime  # local import: not guaranteed at module level
    text = str(date)
    year = int(text[:4])
    month = int(text[4:6])
    day = int(text[6:8])
    if month < 1:
        month = 1
    if day < 1:  # symmetric guard for a zero day
        day = 1
    return datetime(year, month, day)
# Parse both raw yyyymmdd columns into datetime objects.
for date_col in ('regDate', 'creatDate'):
    combined[date_col] = combined[date_col].apply(date_modify)
- 计算使用时间
# Car age in years: listing date minus registration date, rounded to 0.1 y.
usage_days = (combined['creatDate'] - combined['regDate']).dt.days
combined['sur_years'] = (usage_days / 365).round(1)
# The raw dates are no longer needed once the age feature exists.
combined.drop(columns=['regDate', 'creatDate'], inplace=True)
1.4.5 处理model、power、kilometer、sur_years
待处理的特征:
- regDate 汽车注册日期,如20160101(2016年01月01日) int64
- model 车型编码 float64
- brand 汽车品牌 int64
- bodyType 车身类型 0,1,2,3,4,5,6,7 float64
- fuelType 燃油类型 0,1,2,3,4,5,6 float64
- gearbox 变速箱 0,1 float64
- power 发动机功率 [0,600] int64
- kilometer 汽车已行驶公里 float64
- notRepairedDamage 汽车有尚未修复的损坏 0,1 float64
- regionCode 地区编码 int64
- creatDate 汽车上线时间
数据分箱处理model、power、kilometer:
# Cut points for ordinal binning: value < a -> 0, [a,b) -> 1, [b,c) -> 2, >= c -> 3.
model_list =[50,100,150]
kilometer_list= [4,8,12.5]
power_list =[100,200,300]
def modify_df(feature, feature_list, df=None):
    """Bin a numeric column in place into ordinal codes 0-3.

    ``feature_list`` holds three ascending cut points ``[a, b, c]``:
    value < a -> 0, [a, b) -> 1, [b, c) -> 2, >= c -> 3.

    Generalization (backward compatible): ``df`` defaults to the
    module-level ``combined`` frame, matching the original two-argument
    calls, but an explicit frame can now be passed instead.

    Returns the mutated frame.
    """
    global combined
    if df is None:
        df = combined
    a, b, c = feature_list
    # The masks are evaluated sequentially, exactly as in the original.
    df.loc[df[feature] < a, feature] = 0
    df.loc[(df[feature] >= a) & (df[feature] < b), feature] = 1
    df.loc[(df[feature] >= b) & (df[feature] < c), feature] = 2
    df.loc[df[feature] >= c, feature] = 3
    return df
# Apply the ordinal binning to the three numeric features.
for feat, cuts in (('model', model_list),
                   ('kilometer', kilometer_list),
                   ('power', power_list)):
    combined = modify_df(feat, cuts)
1.4.6 处理bodyType、fuelType、gearbox、notRepairedDamage
- 独热编码处理bodyType、fuelType、gearbox、notRepairedDamage
# One-hot encode bodyType / fuelType / gearbox / notRepairedDamage.
def dummies_coder(df=None):
    """Replace the low-cardinality categorical columns with dummy columns.

    Generalization (backward compatible): ``df`` defaults to the
    module-level ``combined`` frame — calling ``dummies_coder()`` with no
    argument behaves exactly like the original, including reassigning the
    global ``combined``. Returns the encoded frame.
    """
    global combined
    use_global = df is None
    if use_global:
        df = combined
    for name in ['bodyType', 'fuelType', 'gearbox', 'notRepairedDamage']:
        dummies = pd.get_dummies(df[name], prefix=name)
        df = pd.concat([df, dummies], axis=1)
        df.drop(name, axis=1, inplace=True)
    if use_global:
        combined = df  # preserve the original's side effect on the global
    return df
1.4.7 处理v系列特征
- 利用‘v’开头的特点提取v系列特征
# The anonymous embedding features are exactly the columns named v_0 .. v_14.
v_features = [col for col in combined.columns if col.startswith('v')]
v_features
————————————————————————————————————————————————————————
['v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14']
- 特征相关性分析,相关系数大于等于0.8视为高度相关
- 与v_1相关性较高的是v_6,v_10
- 与v_2相关性较高的是v_5,v_7,v_11
- 与v_3相关性较高的是v_8,v_12
- 与v_4相关性较高的是v_9,v_13
# Pairwise correlation of the v-series features, annotated on the heatmap.
v_corr = combined[v_features].corr()
sns.heatmap(v_corr, annot=True)
plt.show()
- 删除高度相关的特征。
combined.drop(['v_6','v_10','v_5','v_7','v_11','v_8','v_12','v_9','v_13'],axis=1,inplace=True),
1.5 模型训练
- 导入相关模块并划分数据集
# Model-training imports.
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import MinMaxScaler, Normalizer

# BUG FIX: the original called train_test_split on `train` BEFORE defining
# it — the train/test partition of `combined` must come first.
train = combined[:150000]   # first 150k rows are the labelled training set
test = combined[150000:]    # remaining 50k rows are the submission set
targets = targets.ravel()   # Series -> flat ndarray for sklearn
x_train, x_val, y_train, y_val = train_test_split(train, targets,
                                                  test_size=0.2,
                                                  random_state=42)
1.5.1 利用基础模型及评分
-创建模型,使用MinMaxScaler后mae低于Normalizer
# Baseline models (MinMaxScaler gave a lower MAE than Normalizer in testing).
extree = ExtraTreesRegressor()
# BUG FIX: the original `xgb = xgb.XGBRegressor()` shadowed the xgboost
# module, so every later `xgb.XGBRegressor(...)` call raised AttributeError —
# bind the instance to a distinct name.
xgb_reg = xgb.XGBRegressor()
lgbm = lgb.sklearn.LGBMRegressor()
models = [extree, lgbm, xgb_reg]
mms = MinMaxScaler()
norm = Normalizer()  # kept only for the Normalizer comparison noted above
# Fit the scaler on training data only; validation data is transform-only
# to avoid leaking validation statistics into the scaling.
x_train = mms.fit_transform(x_train)
x_val = mms.transform(x_val)
- 模型训练与评估
for model in models:
model=model.fit(x_train,y_train)
predict_train =model.predict(x_train)
predict_val=model.predict(x_val)
print(model)
print('val mean_absolute_error-score :',metrics.mean_absolute_error(y_val,predict_val))
print('**********************************')————————————————————————————————————————————————————————
ExtraTreesRegressor()
val mean_absolute_error-score : 0.14903028851909225
**********************************
LGBMRegressor()
val mean_absolute_error-score : 0.1624232817793703
**********************************
XGBRegressor()
val mean_absolute_error-score : 0.18783559941510433
**********************************
1.5.2 调节超参数
-xgboost调参
# xgboost hyper-parameter search.
# BUG FIX: the grid search was built on `model_lgb` — a name not even defined
# until the next section — instead of `model_xgb`. Also 'reg:linear' is a
# deprecated alias of 'reg:squarederror' (same objective).
model_xgb = xgb.XGBRegressor(objective='reg:squarederror', random_state=2021)
xgb_dic = dict(learning_rate=[0.05, 0.1],
               n_estimators=[50, 120, 300],
               min_child_weight=[1, 2])  # == list(range(1, 3))
grid_search = GridSearchCV(model_xgb, cv=5, param_grid=xgb_dic,
                           scoring='neg_mean_absolute_error')
grid_search.fit(x_train, y_train)
print(f'最好的参数是:{grid_search.best_params_}')
print(f'最好的分数是:{-(grid_search.best_score_)}')  # negate: sklearn maximizes the negative MAE
————————————————————————————————————————————————————————
最好的参数是:{'learning_rate': 0.1, 'min_child_weight': 1, 'n_estimators': 300}
最好的分数是:0.15341777583319216
- lightgbm调参
# lightgbm hyper-parameter search.
model_lgb = lgb.sklearn.LGBMRegressor(random_state=2021)
lgb_dic = dict(learning_rate=[0.01, 0.1, 0.5], n_estimators=[20, 120, 300],
               num_leaves=[10, 30, 50], max_depth=[-1, 4, 10])
grid_search = GridSearchCV(model_lgb, cv=5, param_grid=lgb_dic,
                           scoring='neg_mean_absolute_error')
grid_search.fit(x_train, y_train)
print(f'最好的参数是:{grid_search.best_params_}')
print(f'最好的分数是:{-grid_search.best_score_}')
# Second, manual refinement around the grid-search optimum.
# BUG FIX: the original referenced bare `LGBMRegressor` and bare
# `mean_absolute_error` — neither name was imported, so both raised NameError.
lgb_final = lgb.sklearn.LGBMRegressor(learning_rate=0.095, n_estimators=1000,
                                      num_leaves=1000, max_depth=40)
lgb_final = lgb_final.fit(x_train, y_train)
pre_lgb = lgb_final.predict(x_val)
print(lgb_final)
print('val mean_absolute_error-score :', metrics.mean_absolute_error(y_val, pre_lgb))
________________________________________________________
最好的参数是:{'learning_rate': 0.1, 'max_depth': -1, 'n_estimators': 300, 'num_leaves': 50}
最好的分数是:0.15050557602691242
LGBMRegressor(learning_rate=0.095, max_depth=40, n_estimators=1000,
num_leaves=1000)
val mean_absolute_error-score : 0.14219576061384298
- ExtraTree调参
# ExtraTrees hyper-parameter search.
# BUG FIX: the original passed `model_lgb` to GridSearchCV with
# lightgbm-only parameters (learning_rate / num_leaves), so ExtraTrees was
# never actually tuned — the printed "best params" are lightgbm's. Search
# `model_extree` with parameters ExtraTreesRegressor actually accepts.
model_extree = ExtraTreesRegressor(random_state=2021)
extree_dic = dict(n_estimators=[10, 20, 120],
                  max_depth=[None, 10])  # None = unlimited depth (sklearn's analogue of -1)
grid_search = GridSearchCV(model_extree, cv=5, param_grid=extree_dic,
                           scoring='neg_mean_absolute_error')
grid_search.fit(x_train, y_train)
print(f'最好的参数是:{grid_search.best_params_}')
print(f'最好的分数是:{-grid_search.best_score_}')
______________________________________________________
最好的参数是:{'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 120, 'num_leaves': 50}
最好的分数是:0.15625675201570516
- 模型验证
# Validate the three tuned models on the hold-out split.
# NOTE(review): `model_box` was never defined in the visible code — rebuild
# it here from the tuned configurations shown in the printed output above.
# Fresh module imports avoid the `xgb`-shadowing problem elsewhere in the file.
import xgboost
import lightgbm
model_box = [
    xgboost.XGBRegressor(n_estimators=300, random_state=2021),
    lightgbm.LGBMRegressor(learning_rate=0.095, max_depth=40,
                           n_estimators=1000, num_leaves=1000,
                           random_state=2021),
    ExtraTreesRegressor(max_depth=10, n_estimators=120, random_state=2021),
]
for model in model_box:
    model_fit = model.fit(x_train, y_train)
    val_pred = model_fit.predict(x_val)
    # BUG FIX: the original rebound the name `mean_absolute_error`, shadowing
    # sklearn's function of the same name — use a distinct local name.
    mae = metrics.mean_absolute_error(y_val, val_pred)
    print(model)
    print(f'mean_absolute_error:{mae}')
    print('*' * 50)
————————————————————————————————————————————————————————————————————
XGBRegressor(n_estimators=300, random_state=2021)
mean_absolute_error:0.16986349426659875
**************************************************
LGBMRegressor(learning_rate=0.095, max_depth=40, n_estimators=1000,
num_leaves=1000,random_state=2021)
val mean_absolute_error-score : 0.14219576061384298
**************************************************
ExtraTreesRegressor(max_depth=10, n_estimators=120, random_state=2021)
mean_absolute_error:0.1932620690315051
**************************************************
1.5.3 模型预测与输出结果
- 加权融合
# Equal-weight blend of the three tuned models on the submission set.
# NOTE(review): `xgb_final` and `extree_final` were never defined in the
# visible code — reconstruct them from the tuned settings reported above.
import xgboost
xgb_final = xgboost.XGBRegressor(n_estimators=300, random_state=2021)
extree_final = ExtraTreesRegressor(max_depth=10, n_estimators=120, random_state=2021)
# BUG FIX: the models were trained on MinMax-scaled features, but the raw
# `test` frame was fed to predict() — the test set must go through the same
# fitted scaler first.
x_test = mms.transform(test)
test_xgb = xgb_final.fit(x_train, y_train).predict(x_test)
test_lgb = lgb_final.fit(x_train, y_train).predict(x_test)
test_extree = extree_final.fit(x_train, y_train).predict(x_test)
test_mix = 1/3*test_xgb + 1/3*test_lgb + 1/3*test_extree
test_mixed = np.expm1(test_mix)  # invert the log1p target transform
- 输出结果
# Assemble the submission: SaleIDs from the sample file, blended prices.
abc = pd.read_csv('used_car_sample_submit.csv', encoding='gbk')
df_predictions = pd.DataFrame({'SaleID': abc['SaleID'],
                               'price': test_mixed})
df_predictions[['SaleID', 'price']].to_csv('417submit.csv', index=False)