# 二手车价格预测 | 构建AI模型并部署Web应用 ⛵

·  阅读 4075

## 💡 数据分析处理&特征工程

🏆 实战数据集下载（百度网盘）：在公众号『ShowMeAI研究中心』回复『实战』，或者点击这里获取本文 [11]《构建AI模型并部署Web应用，预测二手车价格》对应的『CarPrice 二手车价格预测数据集』。

ShowMeAI官方GitHub：github.com/ShowMeAI-Hu…

### ① 数据探索

``````import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
%matplotlib inline

# Pairwise correlations between the numeric columns. The table still holds
# text columns at this point (e.g. CarName), and recent pandas versions raise
# instead of silently skipping them, so restrict the computation explicitly.
df.corr(numeric_only=True)

# Large canvas so that every annotated cell of the matrix stays readable.
sns.set(rc={"figure.figsize": (20, 20)})
# Correlate numeric columns only: text columns cannot be converted to floats
# and make DataFrame.corr raise on recent pandas versions.
sns.heatmap(df.corr(numeric_only=True), annot=True)

# One regression plot against the price for every column that is not text.
non_text_columns = [name for name in df.columns if df[name].dtypes != 'object']
for name in non_text_columns:
    sns.lmplot(data=df, x=name, y='price')

# Remove the identifier column, then the features listed in `to_drop`.
# The labels are passed directly: indexing the frame first (df[to_drop]) only
# works because iterating a DataFrame happens to yield its column names.
to_drop = ['peakrpm', 'compressionratio', 'stroke', 'symboling']
df.drop(columns=['car_ID'], inplace=True)
df.drop(columns=to_drop, inplace=True)

### ② 特征工程

# Keep only the first whitespace-separated token of CarName.
df['CarName'] = df['CarName'].map(lambda full_name: full_name.split()[0])

# Lower-case the value, then fold the listed misspellings and abbreviations
# into a single spelling.
brand_fixes = {
    'vw': 'volkswagen',
    'vokswagen': 'volkswagen',
    'toyouta': 'toyota',
    'maxda': 'mazda',
    'porcshce': 'porsche',
}
lowered_names = df['CarName'].str.lower()
df['CarName'] = lowered_names.replace(brand_fixes)

# Wide figure, then the number of rows per CarName value.
figure_options = {'figure.figsize': (30, 10)}
sns.set(rc=figure_options)
sns.countplot(x='CarName', data=df)

### ③ 特征编码&数据变换

• 类别型特征

• 数值型特征

# Partition the column names by dtype: object columns are treated as
# categorical, every other column as numerical.
categorical = [name for name in df.columns if df[name].dtypes == 'object']
numerical = [name for name in df.columns if df[name].dtypes != 'object']

``````# 独热向量编码
# Indicator columns for the categorical features (no level is dropped).
x1 = pd.get_dummies(df[categorical], drop_first=False)
# Numerical features are taken over unchanged.
x2 = df[numerical]
# Feature matrix: numerical columns first, indicator columns after them.
# The target column `price` is removed from the features.
X = pd.concat([x2, x1], axis=1).drop(columns='price')

# Distribution of the raw price with a kernel density estimate on top.
sns.histplot(df, x="price", kde=True)

``````#修复偏态分布
# Store the natural log of the price next to the raw price, then plot the
# distribution of the transformed values.
log_price = np.log(df["price"])
df["price_log"] = log_price
sns.histplot(df, x="price_log", kde=True)

## 💡 机器学习建模

### ① 数据集切分&数据变换

# Split the data
from sklearn.model_selection import train_test_split

# The models are trained on the log of the price.
y = df['price_log']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.333, random_state=1)

# Feature engineering - scaling
from sklearn.preprocessing import StandardScaler

# X is a DataFrame, so the split parts are DataFrames too and NumPy-style
# `[:, :k]` indexing is not available. The numerical features are selected by
# name instead. `x2` still contains the target column, which is not part of X.
numeric_features = [name for name in x2.columns if name != 'price']

# astype returns new float frames, so the assignments below never write into
# a view of X and no column changes dtype halfway through.
X_train = X_train.astype(float)
X_test = X_test.astype(float)

# Fit the scaler on the training part only and reuse it for the test part.
sc = StandardScaler()
X_train[numeric_features] = sc.fit_transform(X_train[numeric_features])
X_test[numeric_features] = sc.transform(X_test[numeric_features])

### ② 建模&调优

• Lasso regression
• Ridge regression
• 随机森林回归器
• XGBoost回归器

``````#回归模型
from sklearn.linear_model import Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

### ③ 建模 pipeline

• 使用随机超参数训练评估每个模型。
• 使用网格搜索调优每个模型的超参数。
• 用找到的最佳参数重新训练评估模型。

``````from sklearn.model_selection import GridSearchCV

def metrics(model):
    """Fit *model* on the training split and report its test-split scores.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    The target is the log of the price, so R2 is computed on the log scale
    while RMSE and MAE are computed after mapping both the targets and the
    predictions back with ``np.exp``.

    Args:
        model: An estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        dict: ``{'r2': ..., 'rmse': ..., 'mae': ...}`` with the rounded values
        that are also printed.
    """
    # Imported here so the function does not rely on a file-level import.
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

    model.fit(X_train, y_train)
    log_pred = model.predict(X_test)

    # R squared
    r2 = round(r2_score(y_test, log_pred), 4)
    print('R2_Score: ', r2)

    actual_price = np.exp(y_test)
    predicted_price = np.exp(log_pred)

    # RMSE: take the root explicitly instead of passing `squared=False`,
    # which newer scikit-learn releases no longer accept.
    rmse = round(float(np.sqrt(mean_squared_error(actual_price, predicted_price))), 2)
    print("RMSE: ", rmse)

    # MAE
    mae = round(float(mean_absolute_error(actual_price, predicted_price)), 2)
    print("MAE: ", mae)

    return {'r2': r2, 'rmse': rmse, 'mae': mae}

``````# 候选模型
# Candidate models with their initial (untuned) hyper-parameters.
# XGBRegressor only receives arguments it understands: `bootstrap`,
# `max_features` and `min_sample_split` are random-forest settings.
models = {
    'rfr': RandomForestRegressor(bootstrap=False, max_depth=15, max_features='sqrt',
                                 min_samples_split=2, n_estimators=100),
    'lasso': Lasso(alpha=0.005, fit_intercept=True),
    'ridge': Ridge(alpha=10, fit_intercept=True),
    'xgb': xgb.XGBRegressor(max_depth=2, n_estimators=100),
}

tree_counts = [10, 100, 1000, 2000, 4000, 6000]
tree_depths = [2, 4, 8, 12, 15]
linear_grid = {
    "alpha": [0.005, 0.05, 0.1, 1, 10, 100, 290, 500],
    "fit_intercept": [True, False],
}

# Per model: the estimator class to search over and its own parameter grid.
search_spaces = {
    'rfr': (RandomForestRegressor, {
        "n_estimators": tree_counts,
        # 1.0 (use every feature) replaces "auto", which current scikit-learn
        # no longer accepts for random forests.
        "max_features": [1.0, "sqrt", "log2"],
        "max_depth": tree_depths,
        "min_samples_split": [2, 4, 8],
        "bootstrap": [True, False],
    }),
    'xgb': (xgb.XGBRegressor, {
        "n_estimators": tree_counts,
        "max_depth": tree_depths,
    }),
    'lasso': (Lasso, linear_grid),
    'ridge': (Ridge, linear_grid),
}

# Same procedure for every model: score the untuned estimator, then run a
# grid search on the training split and report the best configuration.
for mod in models:
    print('Untuned metrics for: ', mod)
    metrics(models[mod])
    print('\n')
    print('Starting grid search for: ', mod)
    estimator_cls, params = search_spaces[mod]
    grid = GridSearchCV(estimator_cls(), params, verbose=5, cv=2)
    grid.fit(X_train, y_train)
    print("Best score: ", grid.best_score_)
    print("Best: params", grid.best_params_)

# Model that is shipped with the web application.
lasso_reg = Lasso(alpha=0.005, fit_intercept=True)
# Fit before serialising: an estimator that was only constructed has no
# coefficients and cannot predict after it is loaded again.
lasso_reg.fit(X_train, y_train)
# The context manager closes the file even if pickling fails.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(lasso_reg, model_file)

## 💡 web应用开发

• 用户在网页表单中输入数据
• 处理数据（特征编码&变换）
• 数据处理以匹配模型输入格式
• 预测价格并呈现给用户

### ① 基本开发

``````# df的列
# Column layout of the raw table once the saved index column and the target
# are removed.
# NOTE(review): `df` is not created in this snippet - presumably it is read
# from a saved CSV beforehand; confirm.
df.drop(columns=['Unnamed: 0', 'price'], inplace=True)
cols = df.columns

# Column layout of the one-hot encoded table.
# NOTE(review): `dummy` is not created in this snippet either; confirm where
# it is loaded.
dummy.drop(columns='Unnamed: 0', inplace=True)
cols_to_use = dummy.columns

``````# 构建应用中的候选值

# 车品牌首字母大写
# Distinct car brands, shown with a capitalised first letter.
cars = df['CarName'].unique().tolist()
carNameCap = [brand.capitalize() for brand in cars]

# Distinct fuel types, capitalised the same way.
fuel = df['fueltype'].unique().tolist()
fuelCap = [kind.capitalize() for kind in fuel]

# Distinct car bodies, engine types and fuel systems, used as they are.
carb, engtype, fuelsys = (
    df[field].unique().tolist()
    for field in ('carbody', 'enginetype', 'fuelsystem')
)

OK，我们会针对上面这些模型预估需要用到的类别型字段，开发下拉功能并添加候选项。

``````# 数据变换处理以匹配模型
def transform(data):
    """Encode one car like the training table and predict its price.

    Reads the module-level ``cols``, ``cols_to_use`` and ``dummy``.

    Args:
        data: Feature values of a single car, in the order of ``cols``.

    Returns:
        str: A message containing the predicted price rounded to 2 decimals.
    """
    # Load the model saved as 'model.pkl'.
    # NOTE: pickle runs code while loading; only open model files you created.
    with open('model.pkl', 'rb') as model_file:
        model = pickle.load(model_file)

    # One-row DataFrame for the new car.
    new_df = pd.DataFrame([data], columns=cols)

    # Separate categorical (object dtype) from numerical features.
    cat = [name for name in new_df.columns if new_df[name].dtypes == 'object']
    num = [name for name in new_df.columns if new_df[name].dtypes != 'object']

    x1_new = pd.get_dummies(new_df[cat], drop_first=False)
    x2_new = new_df[num]
    X_new = pd.concat([x2_new, x1_new], axis=1)

    # Align with the encoded column layout; indicator columns that this car
    # does not produce are filled with 0.
    final_df = X_new.reindex(columns=cols_to_use, fill_value=0).astype(float)

    # Fit the scaler on the reference rows, not on the single input row:
    # standardising one row on its own turns every scaled value into 0.
    # NOTE(review): assumes `dummy` holds the encoded training features with
    # the numerical columns under their original names; persisting the scaler
    # fitted during training would be the exact equivalent - confirm.
    sc = StandardScaler()
    sc.fit(dummy[num].astype(float))
    final_df[num] = sc.transform(final_df[num])

    # The model predicts the log of the price; exp maps it back.
    output = model.predict(final_df.values)
    return "The price of the car " + str(round(np.exp(output)[0], 2)) + "$"

``````# 类别型
# Categorical input: drop-down offering the capitalised brand names.
car = gr.Dropdown(label = "Car brand", choices=carNameCap)
# Numerical input: slider bounded by a minimum and a maximum value.
curbweight = gr.Slider(label = "Weight of the car (in pounds)", minimum = 500, maximum = 6000)

### ② 部署

``````export GRADIO_SERVER_NAME=0.0.0.0

``````numpy
pandas
scikit-learn
gradio
argparse
gunicorn
rq

``````import gradio as gr
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler

# Lookup tables that translate the values offered in the form into the codes
# placed in the model input row.

# Aspiration label -> code.
asp = {
'Standard':'std',
'Turbo':'turbo'
}

# Drive wheel label -> code.
drivew = {
'Rear wheel drive': 'rwd',
'Front wheel drive': 'fwd',
'4 wheel drive': '4wd'
}

# Cylinder count -> spelled-out number.
cylnum = {
2: 'two',
3: 'three',
4: 'four',
5: 'five',
6: 'six',
8: 'eight',
12: 'twelve'
}

# Column names of the raw table once the saved index column and the target
# are removed.
# NOTE(review): `df` is not created in this listing - presumably it is read
# from a saved CSV beforehand; confirm.
df.drop(columns=['Unnamed: 0', 'price'], inplace=True)
cols = df.columns

# Column names after one-hot encoding.
# NOTE(review): `dummy` is not created in this listing either; confirm where
# it is loaded.
dummy.drop(columns='Unnamed: 0', inplace=True)
cols_to_use = dummy.columns

# Car brand names, shown with a capitalised first letter.
cars = df['CarName'].unique().tolist()
carNameCap = [brand.capitalize() for brand in cars]

# Fuel types, capitalised the same way.
fuel = df['fueltype'].unique().tolist()
fuelCap = [kind.capitalize() for kind in fuel]

# Car body, engine type and fuel system values, used as they are.
carb, engtype, fuelsys = (
    df[field].unique().tolist()
    for field in ('carbody', 'enginetype', 'fuelsystem')
)

#Function to model data to fit the model
def transform(data):
    """Encode one car like the training table and predict its price.

    Reads the module-level ``cols``, ``cols_to_use`` and ``dummy``.

    Args:
        data: Feature values of a single car, in the order of ``cols``.

    Returns:
        str: A message containing the predicted price rounded to 2 decimals.
    """
    # Load the model saved as 'model.pkl'.
    # NOTE: pickle runs code while loading; only open model files you created.
    with open('model.pkl', 'rb') as model_file:
        lasso_reg = pickle.load(model_file)

    # One-row DataFrame for the new car.
    new_df = pd.DataFrame([data], columns=cols)

    # Separate categorical (object dtype) from numerical features.
    cat = [name for name in new_df.columns if new_df[name].dtypes == 'object']
    num = [name for name in new_df.columns if new_df[name].dtypes != 'object']

    # Build the row in the format the model expects.
    x1_new = pd.get_dummies(new_df[cat], drop_first=False)
    x2_new = new_df[num]
    X_new = pd.concat([x2_new, x1_new], axis=1)

    # Align with the encoded column layout; indicator columns that this car
    # does not produce are filled with 0.
    final_df = X_new.reindex(columns=cols_to_use, fill_value=0).astype(float)

    # Fit the scaler on the reference rows and apply it to the new row only.
    # The new row is kept separate from `dummy`, so the prediction below is
    # made for the submitted car and not for a row of the reference table.
    # NOTE(review): assumes `dummy` holds the encoded training features with
    # the numerical columns under their original names; persisting the scaler
    # fitted during training would be the exact equivalent - confirm.
    sc = StandardScaler()
    sc.fit(dummy[num].astype(float))
    final_df[num] = sc.transform(final_df[num])

    # The model predicts the log of the price; exp maps it back.
    output = lasso_reg.predict(final_df.values)
    return "The price of the car " + str(round(np.exp(output)[0], 2)) + "$"

# 预估价格的主函数
def predict_price(car, fueltype, aspiration, doornumber, carbody, drivewheel, enginelocation, wheelbase, carlength, carwidth,
                  carheight, curbweight, enginetype, cylindernumber, enginesize, fuelsystem, boreratio, horsepower, citympg, highwaympg):
    """Turn the submitted form values into one input row and return the prediction text.

    Capitalised labels are lower-cased; aspiration, drive wheel and cylinder
    number are translated through the ``asp``, ``drivew`` and ``cylnum``
    lookup tables. Every other value is passed on unchanged. The row is then
    handed to ``transform``.
    """
    brand = car.lower()
    fuel_code = fueltype.lower()
    aspiration_code = asp[aspiration]
    door_code = doornumber.lower()
    drive_code = drivew[drivewheel]
    location_code = enginelocation.lower()
    cylinder_code = cylnum[cylindernumber]

    row = [
        brand, fuel_code, aspiration_code, door_code, carbody, drive_code, location_code,
        wheelbase, carlength, carwidth, carheight, curbweight,
        enginetype, cylinder_code, enginesize, fuelsystem,
        boreratio, horsepower, citympg, highwaympg,
    ]
    return transform(row)

# Form components, one per model feature. Categorical features use
# drop-downs or radio buttons, numerical features use bounded sliders.
car = gr.Dropdown(label="Car brand", choices=carNameCap)

fueltype = gr.Radio(label="Fuel Type", choices=fuelCap)

aspiration = gr.Radio(label="Aspiration type", choices=["Standard", "Turbo"])

doornumber = gr.Radio(label="Number of doors", choices=["Two", "Four"])

carbody = gr.Dropdown(label="Car body type", choices=carb)

drivewheel = gr.Radio(label="Drive wheel", choices=['Rear wheel drive', 'Front wheel drive', '4 wheel drive'])

enginelocation = gr.Radio(label="Engine location", choices=['Front', 'Rear'])

wheelbase = gr.Slider(label="Distance between the wheels on the side of the car (in inches)", minimum=50, maximum=300)

carlength = gr.Slider(label="Length of the car (in inches)", minimum=50, maximum=300)

carwidth = gr.Slider(label="Width of the car (in inches)", minimum=50, maximum=300)

carheight = gr.Slider(label="Height of the car (in inches)", minimum=50, maximum=300)

curbweight = gr.Slider(label="Weight of the car (in pounds)", minimum=500, maximum=6000)

enginetype = gr.Dropdown(label="Engine type", choices=engtype)

cylindernumber = gr.Radio(label="Cylinder number", choices=[2, 3, 4, 5, 6, 8, 12])

enginesize = gr.Slider(label="Engine size (swept volume of all the pistons inside the cylinders)", minimum=50, maximum=500)

# The label previously ended in an unfinished "(link to ressource: " fragment.
fuelsystem = gr.Dropdown(label="Fuel system", choices=fuelsys)

boreratio = gr.Slider(label="Bore ratio (ratio between cylinder bore diameter and piston stroke)", minimum=1, maximum=6)

horsepower = gr.Slider(label="Horse power of the car", minimum=25, maximum=400)

# The two mpg features are fuel economy in miles per gallon, not a distance
# in km, so the labels state that unit.
citympg = gr.Slider(label="Fuel economy in city (in miles per gallon)", minimum=0, maximum=100)

highwaympg = gr.Slider(label="Fuel economy on highway (in miles per gallon)", minimum=0, maximum=100)

# Text box that displays the string returned by the prediction function.
Output = gr.Textbox()

# Components in the exact order of predict_price's parameters.
input_components = [
    car, fueltype, aspiration, doornumber, carbody, drivewheel, enginelocation,
    wheelbase, carlength, carwidth, carheight, curbweight,
    enginetype, cylindernumber, enginesize, fuelsystem,
    boreratio, horsepower, citympg, highwaympg,
]

# Wire the form to the prediction function and start the web server.
app = gr.Interface(
    fn=predict_price,
    inputs=input_components,
    outputs=Output,
    title="Predict the price of a car based on its specs",
)

app.launch()