Python 多项式回归训练示例

794 阅读3分钟

酶活性预测实战任务(非线性曲线预测):1、基于T-R-train.csv数据,建立线性回归模型,计算其在T-R-test.csv数据上的r2分数,并可视化模型预测结果;2、加入多项式特征(2次、5次),分别建立回归模型;3、计算多项式回归模型在测试数据上的r2分数,判断哪个模型预测更准确;4、可视化多项式回归模型的预测结果,判断哪个模型预测更准确。

# Load the training data from the CSV file.
import pandas as pd
import numpy as np
data_train = pd.read_csv('T-R-train.csv')
data_train.head()  # preview the first rows of the training set
T rate
0 45.376344 2.334559
1 52.186380 2.775735
2 61.863799 2.930147
3 73.154122 2.488971
4 78.888889 1.981618
5 82.473118 1.518382
6 43.046595 2.080882
# Split the training frame into the input column 'T' and the target column 'rate'.
X_train, y_train = data_train['T'], data_train['rate']
#训练数据可视化
%matplotlib inline
# Scatter-plot the raw training data: X_train on the x axis, y_train on the y axis.
from matplotlib import pyplot as plt
fig1 = plt.figure(figsize=(5,5))
plt.scatter(X_train,y_train)
plt.title('raw data')
plt.xlabel('temperature')
plt.ylabel('rate')
plt.show()

gqe0t1.png

# Reshape the inputs into a 2-D column array: reshape(-1,1) yields as many
# rows as needed and exactly one column. (Per the original note, fitting
# raises an error without this conversion.)
X_train = np.array(X_train).reshape(-1,1)
# First attempt: fit a plain linear regression model.
from sklearn.linear_model import  LinearRegression
lr1 = LinearRegression()  # create the linear model lr1
lr1.fit(X_train,y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
# Load the test set and split it into the input column 'T' and the target column 'rate'.
data_test = pd.read_csv('T-R-test.csv')
X_test, y_test = data_test['T'], data_test['rate']
T rate
0 45.376344 2.334559
1 52.186380 2.775735
2 61.863799 2.930147
3 73.154122 2.488971
4 78.888889 1.981618
5 82.473118 1.518382
6 43.046595 2.080882
X_test = np.array(X_test).reshape(-1 ,1)  # 2-D column array, same layout as X_train
# Predict with the linear model on both the training and the test inputs.
y_train_predict = lr1.predict(X_train)
y_test_predict = lr1.predict(X_test)
# R2 (coefficient of determination, goodness of fit): the closer to 1, the
# better the model. A poor model scores near 0, and the score can even be
# negative.
from sklearn.metrics import r2_score
r2_train = r2_score(y_train,y_train_predict)
r2_test = r2_score(y_test,y_test_predict)
print('training r2:',r2_train)
print('test r2:',r2_test)
# The scores make it clear that the linear model fits this data poorly.
training r2: 0.016665703886981964
test r2: -0.7583363437351314
# Visualise the linear fit -- the plot shows that a linear model is not
# suitable for this data.
# Evaluate the model on 300 evenly spaced inputs between 40 and 90.
X_range = np.linspace(40,90,300).reshape(-1,1)
y_range_predict = lr1.predict(X_range)
fig2 = plt.figure(figsize=(5,5))
plt.plot(X_range,y_range_predict)  # fitted line
plt.scatter(X_train,y_train)  # training points

plt.title('prediction data')
plt.xlabel('temperature')
plt.ylabel('rate')
plt.show()

gqes1K.png

# Second attempt: polynomial regression, built by expanding the inputs into
# polynomial features and fitting a linear model on them afterwards.
from sklearn.preprocessing import PolynomialFeatures

# One feature generator per degree (2 and 5). Each is fitted on the training
# inputs only and then reused as-is to expand the test inputs.
poly2 = PolynomialFeatures(degree=2).fit(X_train)
poly5 = PolynomialFeatures(degree=5).fit(X_train)

X_2_train, X_2_test = poly2.transform(X_train), poly2.transform(X_test)
X_5_train, X_5_test = poly5.transform(X_train), poly5.transform(X_test)
print(X_5_train.shape)
(18, 6)
# Fit one linear regression per feature expansion (degree 2 and degree 5).
lr2 = LinearRegression().fit(X_2_train, y_train)
lr5 = LinearRegression().fit(X_5_train, y_train)

# Predictions of each model on the training and the test inputs.
y_2_train_predict, y_2_test_predict = lr2.predict(X_2_train), lr2.predict(X_2_test)
y_5_train_predict, y_5_test_predict = lr5.predict(X_5_train), lr5.predict(X_5_test)

# R2 scores of each model on both splits.
r2_2_train = r2_score(y_train, y_2_train_predict)
r2_2_test = r2_score(y_test, y_2_test_predict)
r2_5_train = r2_score(y_train, y_5_train_predict)
r2_5_test = r2_score(y_test, y_5_test_predict)

for label, score in (
    ('training r2_2:', r2_2_train),
    ('test r2_2:', r2_2_test),
    ('training r2_5:', r2_5_train),
    ('test r2_5:', r2_5_test),
):
    print(label, score)
# Both models fit the training data well, but the degree-5 model overfits:
# its predictions on the test data are poor.
training r2_2: 0.9700515400689422
test r2_2: 0.9963954556468684
training r2_5: 0.9978527267187657
test r2_5: 0.5437837627381457
# Evaluate both polynomial models on 300 evenly spaced inputs between 40 and 90.
# X_2_range / X_5_range hold the expanded polynomial features, not the raw grid.
X_2_range = poly2.transform(np.linspace(40, 90, 300).reshape(-1, 1))
y_2_range_predict = lr2.predict(X_2_range)

X_5_range = poly5.transform(np.linspace(40, 90, 300).reshape(-1, 1))
y_5_range_predict = lr5.predict(X_5_range)
# Plot the degree-2 model against the training and test points.
fig3 = plt.figure(figsize=(5,5))
# Curve of the fitted degree-2 model. X_range is the raw 40-90 grid created
# earlier for the linear-model plot (same linspace as used for X_2_range).
plt.plot(X_range,y_2_range_predict)
plt.scatter(X_train,y_train)  # training points
plt.scatter(X_test,y_test)  # test points

plt.title('polynomial prediction result (2)')
plt.xlabel('temperature')
plt.ylabel('rate')
plt.show()

gqegne.png

# Plot the degree-5 model against the training and test points.
fig4 = plt.figure(figsize=(5,5))
# Curve of the fitted degree-5 model. X_range is the raw 40-90 grid created
# earlier for the linear-model plot (same linspace as used for X_5_range).
plt.plot(X_range,y_5_range_predict)
plt.scatter(X_train,y_train)  # training points
plt.scatter(X_test,y_test)  # test points

plt.title('polynomial prediction result (5)')
plt.xlabel('temperature')
plt.ylabel('rate')
plt.show()

gqefAA.png

酶活性预测实战总结: 1、通过建立二阶多项式回归模型,对酶活性实现了较好的预测,无论针对训练数据还是测试数据都得到了较高的r2分数; 2、线性回归模型出现欠拟合(训练数据和测试数据的r2分数都很低),五阶多项式回归模型出现过拟合:对于训练数据r2分数高(预测准确),但对于测试数据r2分数低(预测不准确); 3、无论是通过r2分数,还是通过可视化模型结果,都可以发现二阶多项式回归模型效果最好。