利用线性回归预测波士顿房价(2)

416 阅读1分钟

自定义线性回归和sklearn线性回归对比:

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression as _LinearRegression
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


class LinearRegression:
    """Linear regression trained by full-batch gradient descent.

    The cost minimised is the halved mean squared error,
    ``sum((X @ w - y) ** 2) / (2 * m)``. After ``fit`` (or ``load_model``)
    the weights are available as ``self.w``; when ``fit_bias`` is true,
    ``w[0]`` is the intercept and ``w[1:]`` are the feature weights.
    """

    def __init__(self, alpha=0.1, epoch=5000, fit_bias=True):
        """Store the hyper-parameters; no training happens here.

        Args:
            alpha: Learning rate applied to every gradient step.
            epoch: Number of gradient-descent iterations run by ``fit``.
            fit_bias: If true, a column of ones is prepended to the features
                so that an intercept is learned as ``w[0]``.
        """
        self.alpha = alpha
        self.epoch = epoch
        # Cost measured before each update of the most recent ``fit`` call.
        self.cost_record = []
        self.fit_bias = fit_bias

    def _design_matrix(self, X):
        """Return ``X`` as a float array, with a leading ones column if ``fit_bias``."""
        X = np.asarray(X, dtype=float)
        if self.fit_bias:
            X = np.column_stack((np.ones(X.shape[0]), X))
        return X

    def predict(self, X_test):
        """Return the predictions ``X_test @ w`` as a 1-D array.

        Args:
            X_test: Array-like of shape (samples, features), laid out like
                the data passed to ``fit``.

        Raises:
            AttributeError: If neither ``fit`` nor ``load_model`` has set
                ``self.w`` yet.
        """
        return np.dot(self._design_matrix(X_test), self.w)

    def fit(self, X_train, y_train):
        """Learn the weights by gradient descent and persist them.

        Weights start at one. The cost history is reset on every call so
        that ``cost_record`` always has exactly ``epoch`` entries. When
        training finishes the weights are written to disk via
        ``save_model``.

        Args:
            X_train: Array-like of shape (samples, features).
            y_train: Array-like with one target per sample; it is flattened
                to 1-D so a column vector cannot broadcast into an (m, m)
                residual.
        """
        X_train = self._design_matrix(X_train)
        y_train = np.asarray(y_train, dtype=float).ravel()
        m, n = X_train.shape
        self.w = np.ones(n)
        # Start a fresh history; otherwise refitting would append to the
        # previous run and the curve would no longer match ``epoch``.
        self.cost_record = []
        for _ in range(self.epoch):
            residual = np.dot(X_train, self.w) - y_train
            self.cost_record.append(np.dot(residual, residual) / (2 * m))
            # Gradient of the cost with respect to w is X^T residual / m.
            self.w -= self.alpha / m * np.dot(residual, X_train)
        self.save_model()

    def plot_cost(self):
        """Plot the recorded cost against the iteration index and show it."""
        # The x axis is derived from the history length so both sequences
        # passed to ``plt.plot`` always have the same size.
        plt.plot(np.arange(len(self.cost_record)), self.cost_record)
        plt.xlabel("epoch")
        plt.ylabel("cost")
        plt.show()

    def save_model(self):
        """Write the current weights to ``model.txt`` in the working directory."""
        np.savetxt("model.txt", self.w)

    def load_model(self):
        """Load weights from ``model.txt`` in the working directory."""
        # ``np.loadtxt`` yields a 0-d array for a single stored value; keep
        # the weights 1-D so ``predict`` returns one value per sample.
        self.w = np.atleast_1d(np.loadtxt("model.txt"))


# Download the Boston housing data. After skipping the 22-line preamble, the
# even-numbered rows are joined with the first two columns of the following
# odd-numbered rows to form the feature matrix; the third column of the
# odd-numbered rows is the target.
data_url = "http://lib.stat.cmu.edu/datasets/boston"
# Raw string: "\s" is not a valid escape sequence in a normal string literal.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]
X = data
y = target

# Split first, then fit the scaler on the training rows only, so that the
# min/max of the held-out rows cannot leak into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2020)
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Train the hand-written model and the scikit-learn reference on the same data.
our_model = LinearRegression()
sklearn_model = _LinearRegression()

our_model.fit(X_train, y_train)
sklearn_model.fit(X_train, y_train)

our_model_pred = our_model.predict(X_test)
sklearn_model_pred = sklearn_model.predict(X_test)

# First ten test targets next to both models' predictions. Not printed by
# default; kept for interactive inspection.
housing_price = pd.DataFrame({
    'true_value': y_test,
    'our_model_pred': our_model_pred,
    'sklearn_model_pred': sklearn_model_pred,
}).head(10)

# Per-feature weights of both models; w[0] is our model's intercept and is
# skipped so the rows line up with sklearn's coef_.
feature_weight = pd.DataFrame({
    'our_model': our_model.w[1:],
    'sklearn_model': sklearn_model.coef_,
})
feature_weight = feature_weight.sort_values('our_model', ascending=False)
print(feature_weight)