# python模型——线性回归分析
#
# 204 阅读 · 1分钟
#导入数据包
import pandas as pd
import numpy as np
from collections import OrderedDict
# Raw observations: hours of study and the exam score obtained.
examDict = {
    '学习时间': [0.5, 0.75, 1, 1.25, 1.5, 1.75, 1.75, 2, 2.25, 2.5,
             2.75, 3, 3.25, 3.5, 4, 4.25, 4.5, 4.75, 5, 5.5],
    '分数': [10, 22, 13, 43, 20, 22, 33, 50, 62, 48,
           55, 75, 62, 73, 81, 76, 64, 82, 90, 93],
}
# Keep the column order explicit, then build the DataFrame from it.
examOrderDict = OrderedDict(examDict)
examDf = pd.DataFrame(data=examOrderDict)

# Preview the first rows, then the last rows.
print(examDf.head(), examDf.tail(), sep='\n')

# Feature (study time) and label (score) columns.
exam_X = examDf['学习时间']
exam_y = examDf['分数']
# Plotting library used for the scatter plots in this script.
import matplotlib.pyplot as plt

# Scatter plot of the raw data (currently disabled):
# plt.scatter(exam_X,exam_y,color='b',label='exam data')
# plt.xlabel('Hours')
# plt.ylabel('Score')
# plt.show()

# Correlation coefficients between the DataFrame columns.
corr_matrix = examDf.corr()
print(corr_matrix)

# Build the training and test sets with train_test_split.
from sklearn.model_selection import train_test_split

# 80% of the samples are used for training. No random_state is passed, so the
# split (and every figure/score derived from it) differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(exam_X, exam_y, train_size=0.8)

# Report the sizes of the feature splits.
print('原始数据特征:', exam_X.shape,
      '训练数据特征:', X_train.shape,
      '测试数据特征:', X_test.shape)
# Report the sizes of the label splits. These are labels (标签), not features,
# so the captions say so instead of repeating the feature captions.
print('原始数据标签:', exam_y.shape,
      '训练数据标签:', y_train.shape,
      '测试数据标签:', y_test.shape)

# Plot the training samples (blue) and test samples (red).
plt.scatter(X_train, y_train, color='b', label='train data')
plt.scatter(X_test, y_test, color='r', label='test data')
plt.legend(loc=0)
plt.xlabel('Hours')
plt.ylabel('Score')
plt.show()
# Build the linear regression model.
from sklearn.linear_model import LinearRegression
model = LinearRegression()

# model.fit() expects a 2-D feature array (one column per feature).
# Convert to a NumPy array before adding the axis: indexing a pandas Series
# with [:, np.newaxis] is rejected by current pandas, and a Series has no
# reshape() method either.
X_train = np.asarray(X_train).reshape(-1, 1)
X_test = np.asarray(X_test).reshape(-1, 1)

# Train the model.
model.fit(X_train, y_train)

# a: intercept, b: regression coefficient(s) of the fitted line.
a = model.intercept_
b = model.coef_
# Print both values; the intercept must be passed right after its caption.
print('最佳拟合线:截距a=', a, '回归系数b=', b)

# Model predictions for the training features.
y_train_pred = model.predict(X_train)

# Figure 1: the fitted line through the training predictions.
plt.plot(X_train, y_train_pred, color='k', linewidth=3, label='best line')
plt.xlabel('Hours')
plt.ylabel('Score')
plt.legend(loc=0)
plt.show()

# Goodness of fit on the test set: the larger the R^2, the better.
test_r2 = model.score(X_test, y_test)
print(test_r2)

# Figure 2: training samples, training predictions and test samples together.
scatter_layers = (
    (X_train, y_train, {'color': 'b', 'label': 'train data'}),
    (X_train, y_train_pred, {'color': 'k', 'linewidths': 3, 'label': 'best line'}),
    (X_test, y_test, {'color': 'r', 'label': 'test data'}),
)
for xs, ys, opts in scatter_layers:
    plt.scatter(xs, ys, **opts)
plt.xlabel('hours')
plt.ylabel('score')
# loc selects the legend position; 'best' lets matplotlib choose.
plt.legend(loc='best')
plt.show()