Univariate Linear Regression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
A=np.eye(5)  # warm-up: create a 5x5 identity matrix
A.shape
(5, 5)
path="C:\\Users\\ha'er\\Desktop\\cpp_work\\ex1-linear regression\\ex1data1.txt" #数据
data=pd.read_table(path,header=None,sep=',',names=['Population','Profit'])
data.head()
| | Population | Profit |
|---|---|---|
| 0 | 6.1101 | 17.5920 |
| 1 | 5.5277 | 9.1302 |
| 2 | 8.5186 | 13.6620 |
| 3 | 7.0032 | 11.8540 |
| 4 | 5.8598 | 6.8233 |
data.describe()
| | Population | Profit |
|---|---|---|
| count | 97.000000 | 97.000000 |
| mean | 8.159800 | 5.839135 |
| std | 3.869884 | 5.510262 |
| min | 5.026900 | -2.680700 |
| 25% | 5.707700 | 1.986900 |
| 50% | 6.589400 | 4.562300 |
| 75% | 8.578100 | 7.046700 |
| max | 22.203000 | 24.147000 |
data.plot(kind='scatter',x='Population',y='Profit')  # scatter plot of the raw data
plt.show()
# computeCost computes the cost function J(theta) for linear regression with parameters theta
def computeCost(X,Y,Theta):
    inner=np.power(((X*Theta.T)-Y),2)  # squared errors (X*Theta.T are the predictions)
    return np.sum(inner)/(2*len(X))    # J = sum of squared errors / (2m)
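As a quick sanity check (a small example added here, not part of the original exercise), the vectorized cost can be compared against a hand computation:

# two samples with theta = [0, 1]: predictions are [2, 3], targets are [1, 5],
# errors are [1, -2], so J = (1^2 + (-2)^2) / (2*2) = 1.25
X_check = np.matrix([[1.0, 2.0], [1.0, 3.0]])
Y_check = np.matrix([[1.0], [5.0]])
computeCost(X_check, Y_check, np.matrix([0.0, 1.0]))  # 1.25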
# x0 = 1 for every sample: insert a column of ones as the first column of X
data.insert(0,'Ones',1)
data.head()
| | Ones | Population | Profit |
|---|---|---|---|
| 0 | 1 | 6.1101 | 17.5920 |
| 1 | 1 | 5.5277 | 9.1302 |
| 2 | 1 | 8.5186 | 13.6620 |
| 3 | 1 | 7.0032 | 11.8540 |
| 4 | 1 | 5.8598 | 6.8233 |
# col is the number of columns of data; X is every column except the last, Y is the last column
col=data.shape[1]
X=data.iloc[:,0:col-1]
Y=data.iloc[:,col-1:col]
# inspect DataFrame X
X.head()
| | Ones | Population |
|---|---|---|
| 0 | 1 | 6.1101 |
| 1 | 1 | 5.5277 |
| 2 | 1 | 8.5186 |
| 3 | 1 | 7.0032 |
| 4 | 1 | 5.8598 |
# inspect DataFrame Y
Y.head()
| | Profit |
|---|---|
| 0 | 17.5920 |
| 1 | 9.1302 |
| 2 | 13.6620 |
| 3 | 11.8540 |
| 4 | 6.8233 |
# convert X and Y from DataFrame to matrices and create the parameter vector theta
X=np.matrix(X.values)
Y=np.matrix(Y.values)
Theta=np.matrix(np.array([0,0]))
X.shape,Y.shape,Theta.shape
((97, 2), (97, 1), (1, 2))
computeCost(X, Y, Theta)
32.072733877455676
# Batch gradient descent (matrix X, vector Y, parameter vector Theta,
# learning rate alpha, iteration count iter_cnt)
def GradientDescent(X,Y,Theta,alpha,iter_cnt):
    Theta_temp=np.matrix(np.zeros(Theta.shape))
    cost=np.zeros(iter_cnt)
    for i in range(iter_cnt):
        error=(X*Theta.T)-Y                    # residuals for the current Theta
        for j in range(Theta.shape[1]):        # update each parameter in turn
            temp=np.multiply(error,X[:,j])
            Theta_temp[0,j]=Theta[0,j]-np.sum(temp)*(alpha/len(X))
        Theta=Theta_temp
        cost[i]=computeCost(X, Y, Theta)       # record the cost at every iteration
    return Theta,cost
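The inner loop over j can also be folded into a single matrix operation; here is a fully vectorized sketch (added for illustration, equivalent for these shapes):

def GradientDescentVec(X, Y, Theta, alpha, iter_cnt):
    cost = np.zeros(iter_cnt)
    for i in range(iter_cnt):
        error = (X * Theta.T) - Y                         # (m, 1) residuals
        Theta = Theta - (alpha / len(X)) * (error.T * X)  # update all parameters at once
        cost[i] = computeCost(X, Y, Theta)
    return Theta, cost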
# learning rate alpha = 0.01, iteration count iter_cnt = 1000
alpha = 0.01
iter_cnt = 1000
# run gradient descent
g, cost = GradientDescent(X, Y, Theta, alpha, iter_cnt)
# inspect g and cost
g,cost
(matrix([[-3.24140214, 1.1272942 ]]),
array([6.73719046, 5.93159357, 5.90115471, 5.89522859, 5.89009494,
...
4.51652227, 4.51637981, 4.51623786, 4.51609643, 4.5159555 ]))
# cost (error) of the trained model evaluated at the fitted parameters
computeCost(X, Y, g)
4.515955503078914
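With the fitted parameters, a point prediction is simply theta0 + theta1 * x. For example (the input 7.0 is an illustrative value in the same units as the Population column):

predict = g[0, 0] + g[0, 1] * 7.0  # predicted profit at Population = 7.0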
# plot the fitted line together with the data to see the fit visually
x=np.linspace(data.Population.min(),data.Population.max(),200)
f=g[0,0]+(g[0,1]*x)  # f holds the fitted line's y values
# subplots() returns a (figure, axes) tuple; fig is useful for changing figure-level
# attributes or for saving the figure to an image file later
fig,ax=plt.subplots(figsize=(8,6))
ax.plot(x,f,'orange',label="Prediction")
ax.scatter(data.Population,data.Profit,label="Training Data")
ax.legend(loc=2)  # legend in the upper-left corner (loc=1 is upper-right)
ax.set_xlabel("Population")
ax.set_ylabel("Profit")
ax.set_title('Predicted Profit vs. Population Size')
plt.show()
# Since the gradient descent function also records the cost at every iteration, plot the cost curve.
# Note that the cost always decreases: this is a convex optimization problem.
fig,ax=plt.subplots(figsize=(8,6))
ax.plot(np.arange(iter_cnt),cost,'red')
ax.set_xlabel('Iterations')
ax.set_ylabel('Cost')
ax.set_title('Error vs. Training Epoch')
plt.show()
Multivariate Linear Regression
# path="C:\\Users\\ha'er\\Desktop\\cpp_work\\ex1-linear regression\\ex1data2.txt"  # absolute path
path="ex1data2.txt"  # relative path
data2=pd.read_table(path,header=None,sep=",",names=["Size","Bedrooms","Price"])
data2.head()
| | Size | Bedrooms | Price |
|---|---|---|---|
| 0 | 2104 | 3 | 399900 |
| 1 | 1600 | 3 | 329900 |
| 2 | 2400 | 3 | 369000 |
| 3 | 1416 | 2 | 232000 |
| 4 | 3000 | 4 | 539900 |
# feature normalization (z-score): (x - mu) / sigma, where mu is the mean and sigma the standard deviation
data2=(data2-data2.mean())/data2.std()
data2.head()
| | Size | Bedrooms | Price |
|---|---|---|---|
| 0 | 0.130010 | -0.223675 | 0.475747 |
| 1 | -0.504190 | -0.223675 | -0.084074 |
| 2 | 0.502476 | -0.223675 | 0.228626 |
| 3 | -0.735723 | -1.537767 | -0.867025 |
| 4 | 1.257476 | 1.090417 | 1.595389 |
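One practical caveat, noted here as an aside: to make predictions on new samples later, the same mean and standard deviation must be reused, so in practice they would be saved before overwriting data2. A minimal sketch of that pattern (mu, sigma, and new_sample are names introduced here):

# mu, sigma = data2.mean(), data2.std()        # compute BEFORE normalizing
# data2 = (data2 - mu) / sigma
# new_sample_norm = (new_sample - mu) / sigma  # scale new inputs identically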
# repeat the preprocessing from part 1 and run linear regression on the new data set
data2.insert(0,"Ones",1)  # add x0 = 1
cols=data2.shape[1]
X2=data2.iloc[:,0:cols-1]
Y2=data2.iloc[:,cols-1:cols]
X2=np.matrix(X2.values)
Y2=np.matrix(Y2.values)
Theta2=np.matrix(np.array([0,0,0]))
g2,cost2=GradientDescent(X2, Y2, Theta2, alpha, iter_cnt)
computeCost(X2, Y2, g2)  # cost is in normalized units, since Price was normalized as well
0.1307033696077189
fig,ax=plt.subplots(figsize=(8,6))
ax.plot(np.arange(iter_cnt),cost2,'red')
ax.set_xlabel("Iterations")
ax.set_ylabel("Cost")
ax.set_title("Error vs. Training Epoch")
plt.show()
Linear Regression with scikit-learn
from sklearn import linear_model
model=linear_model.LinearRegression()
model.fit(X,Y)
LinearRegression()
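The fitted parameters can be read back from the model through the standard scikit-learn attributes, and should be close to the normal-equation solution below:

model.intercept_, model.coef_  # note: X still contains the Ones column, so coef_ has a (near-zero) entry for it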
# plot the fitted line together with the data to see the fit visually
x=X[:,1].A1  # .A1 already flattens the matrix column to a 1-D array
f=model.predict(X).flatten()  # f is the predicted y vector, flattened to a 1-D array
# subplots() returns a (figure, axes) tuple; fig is useful for changing figure-level
# attributes or for saving the figure to an image file later
fig,ax=plt.subplots(figsize=(8,6))
ax.plot(x,f,'orange',label="Prediction")
ax.scatter(data.Population,data.Profit,label="Training Data")
ax.legend(loc=2)  # legend in the upper-left corner (loc=1 is upper-right)
ax.set_xlabel("Population")
ax.set_ylabel("Profit")
ax.set_title('Predicted Profit vs. Population Size')
plt.show()
Normal Equation
Suppose the training-set feature matrix is X (including the x0 = 1 column) and the training-set targets form the vector y. The normal equation solves for the parameter vector in closed form: θ = (XᵀX)⁻¹Xᵀy. Comparing gradient descent with the normal equation: gradient descent requires choosing a learning rate α and running many iterations, but it still works well when the number of features n is large, and it applies to many kinds of models. The normal equation needs no learning rate and computes the solution in one step, but inverting XᵀX is expensive when n is large (roughly acceptable while n is below 10,000), and it applies only to linear regression, not to models such as logistic regression.
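The closed form follows from setting the gradient of the cost function to zero (a one-line derivation):

\nabla_\theta J(\theta) = \frac{1}{m} X^T (X\theta - y) = 0 \;\Rightarrow\; X^T X \theta = X^T y \;\Rightarrow\; \theta = (X^T X)^{-1} X^T y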
def normalEqn(X,Y):
    Theta=np.linalg.inv(X.T@X)@X.T@Y  # theta = (X^T X)^(-1) X^T y
    return Theta
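If XᵀX is singular (for example, with redundant features), np.linalg.inv will fail; the pseudo-inverse is a more robust alternative (a sketch added here):

def normalEqnPinv(X, Y):
    return np.linalg.pinv(X.T @ X) @ X.T @ Y  # pinv handles a singular X.T@X gracefully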
Normal_Theta=normalEqn(X, Y)
Normal_Theta
matrix([[-3.89578088],
[ 1.19303364]])
For comparison, gradient descent gave g = matrix([[-3.24140214, 1.1272942]]); the difference is mainly because gradient descent had not fully converged after 1000 iterations.
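As a quick check (an illustrative run, not part of the original notebook), increasing the iteration count drives gradient descent toward the closed-form solution:

g_more, _ = GradientDescent(X, Y, np.matrix(np.array([0,0])), alpha, 10000)
g_more  # expected to be much closer to Normal_Theta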