# Comparison of a custom linear regression with sklearn's LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression as _LinearRegression
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
class LinearRegression:
    """Linear regression trained with full-batch gradient descent.

    Parameters
    ----------
    alpha : float
        Learning rate applied to every gradient-descent update.
    epoch : int
        Number of gradient-descent iterations performed by ``fit``.
    fit_bias : bool
        When true, a column of ones is prepended to the inputs so that
        ``w[0]`` plays the role of the intercept.

    Attributes
    ----------
    w : numpy.ndarray
        Weight vector; only available after ``fit`` or ``load_model``.
    cost_record : list
        Half mean squared error measured before each update of the most
        recent ``fit`` call.
    """

    def __init__(self, alpha=0.1, epoch=5000, fit_bias=True):
        self.alpha = alpha
        self.epoch = epoch
        self.cost_record = []
        self.fit_bias = fit_bias

    def _design_matrix(self, X):
        """Return ``X`` with a leading column of ones when ``fit_bias`` is set."""
        X = np.asarray(X)
        if self.fit_bias:
            X = np.column_stack((np.ones(X.shape[0]), X))
        return X

    def predict(self, X_test):
        """Return the predictions ``X_test @ w`` (bias column added if enabled)."""
        return np.dot(self._design_matrix(X_test), self.w)

    def fit(self, X_train, y_train):
        """Fit the weights by gradient descent and save them to the model file.

        Weights start at one, ``epoch`` updates are applied, and the cost seen
        before each update is stored in ``cost_record``.
        """
        X_train = self._design_matrix(X_train)
        # Flatten the targets: a column-shaped (m, 1) y would otherwise
        # broadcast against the (m,) predictions into an (m, m) residual.
        y_train = np.asarray(y_train, dtype=float).ravel()
        m, n = X_train.shape
        self.w = np.ones(n)
        # Start a fresh history so repeated fits do not pile up old costs.
        self.cost_record = []
        for _ in range(self.epoch):
            residual = np.dot(X_train, self.w) - y_train
            self.cost_record.append(np.dot(residual, residual) / (2 * m))
            self.w -= self.alpha / m * np.dot(residual, X_train)
        # Training always persists the result to the default model file.
        self.save_model()

    def plot_cost(self):
        """Plot the recorded cost against the iteration index."""
        # One x value per recorded cost keeps both axes the same length.
        plt.plot(np.arange(len(self.cost_record)), self.cost_record)
        plt.xlabel("epoch")
        plt.ylabel("cost")
        plt.show()

    def save_model(self, path="model.txt"):
        """Write the weight vector to ``path`` as plain text."""
        np.savetxt(path, self.w)

    def load_model(self, path="model.txt"):
        """Load the weight vector from ``path``."""
        # np.loadtxt yields a 0-d array for a single value; keep ``w`` 1-D so
        # ``predict`` returns the same shape as with freshly fitted weights.
        self.w = np.atleast_1d(np.loadtxt(path))
# Compare the gradient-descent model above with sklearn's LinearRegression
# on the Boston housing data downloaded from CMU StatLib.
data_url = "http://lib.stat.cmu.edu/datasets/boston"
# Raw string: "\s" is a regex whitespace class, not a Python string escape.
# The first 22 lines of the file are skipped (presumably a descriptive
# preamble; confirm against the file).
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
# Each record spans two consecutive rows: every column of the even row plus
# the first two columns of the odd row are features, and the third column of
# the odd row is the target.
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]
X = data
y = target
# NOTE(review): the scaler is fitted on all rows before the split, so the
# held-out rows influence the scaling; consider fitting it on X_train only.
X = MinMaxScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2020)

our_model = LinearRegression()
sklearn_model = _LinearRegression()
our_model.fit(X_train, y_train)
sklearn_model.fit(X_train, y_train)
our_model_pred = our_model.predict(X_test)
sklearn_model_pred = sklearn_model.predict(X_test)

# First ten test rows: true value next to both models' predictions.
housing_price = pd.DataFrame([y_test, our_model_pred, sklearn_model_pred]).T.head(10)
housing_price.columns = ['true_value', 'our_model_pred', 'sklearn_model_pred']

# Per-feature weights of both models; w[0] is our model's bias term and is
# left out so the rows line up with sklearn's coef_.
feature_weight = pd.DataFrame([our_model.w[1:], sklearn_model.coef_]).T
feature_weight.columns = ['our_model', 'sklearn_model']
feature_weight = feature_weight.sort_values('our_model', ascending=False)
print(feature_weight)