训练

一、配置

首先，让我们导入一些常用模块，确保 MatplotLib 内联绘制图形并准备一个函数来保存图形。我们还检查是否安装了 Python 3.5 或更高版本（尽管 Python 2.x 可能工作，但它已被弃用，因此我们强烈建议您使用 Python 3），以及 Scikit-Learn ≥0.20。

# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "training_linear_models"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

二、线性回归

1. 正规方程

$\theta = (X^TX)^{-1}X^Ty$

import numpy as np
# 随机生成 100 个样本
X = 2 * np.random.rand(100, 1)
# 生成 100 个标签
y = 4 + 3 * X + np.random.randn(100, 1)

plt.plot(X, y, "b.")
plt.xlabel("$x_1$", fontsize=18)
plt.ylabel("$y$", rotation=0, fontsize=18)
plt.axis([0, 2, 0, 15])
save_fig("generated_data_plot")
plt.show()

Saving figure generated_data_plot

# 为了训练偏置项，我们需要在第一列添加一个全一列
X_b = np.c_[np.ones((100, 1)), X]  
# 直接使用正规方程公式计算最优参数：theta0,theta1
theta_best = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y)

theta_best

array([[4.21509616],
       [2.77011339]])

# 拟合一个新样本 x0 = 0, x1 = 2
X_new = np.array([[0], [2]])
# 拟合一个新样本 x0 = 1, x1 = 0,x3 = 2
X_new_b = np.c_[np.ones((2, 1)), X_new]  # add x0 = 1 to each instance
# 使用最优参数进行预测
y_predict = X_new_b.dot(theta_best)
y_predict

array([[4.21509616],
       [9.75532293]])

plt.plot(X_new, y_predict, "r-")
plt.plot(X, y, "b.")
plt.axis([0, 2, 0, 15])
plt.show()

书中的图形实际上对应于以下代码，带有图例和轴标签:

plt.plot(X_new, y_predict, "r-", linewidth=2, label="Predictions")
plt.plot(X, y, "b.")
plt.xlabel("$x_1$", fontsize=18)
plt.ylabel("$y$", rotation=0, fontsize=18)
plt.legend(loc="upper left", fontsize=14)
plt.axis([0, 2, 0, 15])
save_fig("linear_model_predictions_plot")
plt.show()

Saving figure linear_model_predictions_plot

# 从线性模块导入线性回归模型
from sklearn.linear_model import LinearRegression
# 实例化
lin_reg = LinearRegression()
# 训练
lin_reg.fit(X, y)
# 显示模型的训练完毕的偏置项和权重
lin_reg.intercept_, lin_reg.coef_

(array([4.21509616]), array([[2.77011339]]))

# 预测
lin_reg.predict(X_new)

array([[4.21509616],
       [9.75532293]])

LinearRegression线性回归类是基于scipy.linalg.lstsq()(又称最小二乘)函数，我们可以直接调用

# 奇异值分解
theta_best_svd, residuals, rank, s = np.linalg.lstsq(X_b, y, rcond=1e-6)
theta_best_svd

array([[4.21509616],
       [2.77011339]])

该函数计算 $\mathbf{X}^+\mathbf{y}$ ，其中 $\mathbf{X}^{+}$ 是 $\mathbf{X}$ 的_pseudoinverse_（特别是 Moore-Penrose 逆）。您可以使用 np.linalg.pinv() 直接计算伪逆：

np.linalg.pinv(X_b).dot(y)

array([[4.21509616],
       [2.77011339]])

三、梯度下降

1. 批量下降

# 学习率
eta = 0.1 
# 迭代次数，也就是遍历整个数据集的次数
n_iterations = 1000
# 训练集大小
m = 100

# 随机生成参数
theta = np.random.randn(2,1)
# 迭代
for iteration in range(n_iterations):
    # 使用向量化一次性计算所有参数的向量
    gradients = 2/m * X_b.T.dot(X_b.dot(theta) - y)
    # 更新参数
    theta = theta - eta * gradients

# 可以看出，BG得到的参数和正规方程计算所得的参数相近
theta

array([[4.21509616],
       [2.77011339]])

X_new_b.dot(theta)

array([[4.21509616],
       [9.75532293]])

theta_path_bgd = []

def plot_gradient_descent(theta, eta, theta_path=None):
    m = len(X_b)
    plt.plot(X, y, "b.")
    n_iterations = 1000
    for iteration in range(n_iterations):
        if iteration < 10:
            y_predict = X_new_b.dot(theta)
            style = "b-" if iteration > 0 else "r--"
            plt.plot(X_new, y_predict, style)
        gradients = 2/m * X_b.T.dot(X_b.dot(theta) - y)
        theta = theta - eta * gradients
        if theta_path is not None:
            theta_path.append(theta)
    plt.xlabel("$x_1$", fontsize=18)
    plt.axis([0, 2, 0, 15])
    plt.title(r"$\eta = {}$".format(eta), fontsize=16)

np.random.seed(42)
theta = np.random.randn(2,1)  # random initialization

plt.figure(figsize=(10,4))
plt.subplot(131); plot_gradient_descent(theta, eta=0.02)
plt.ylabel("$y$", rotation=0, fontsize=18)
plt.subplot(132); plot_gradient_descent(theta, eta=0.1, theta_path=theta_path_bgd)
plt.subplot(133); plot_gradient_descent(theta, eta=0.5)

save_fig("gradient_descent_plot")
plt.show()

Saving figure gradient_descent_plot

2. 随机梯度下降

theta_path_sgd = []
m = len(X_b)
np.random.seed(42)

n_epochs = 50
# 设置学习率衰减参数
t0, t1 = 5, 50 
# 学习率衰减参数
def learning_schedule(t):
    return t0 / (t + t1)

theta = np.random.randn(2,1)  # random initialization
# 总共将100个训练样本遍历 n_epochs次
for epoch in range(n_epochs):
    # 遍历 100 个样本
    for i in range(m):
        if epoch == 0 and i < 20:                    # not shown in the book
            y_predict = X_new_b.dot(theta)           # not shown
            style = "b-" if i > 0 else "r--"         # not shown
            plt.plot(X_new, y_predict, style)        # not shown
        # 随机挑选一个样本用于计算所有参数的梯度并进行更新 
        random_index = np.random.randint(m)
        xi = X_b[random_index:random_index+1]
        yi = y[random_index:random_index+1]
        gradients = 2 * xi.T.dot(xi.dot(theta) - yi)
        # 更新学习率
        eta = learning_schedule(epoch * m + i)
        theta = theta - eta * gradients
        theta_path_sgd.append(theta)                 # not shown

plt.plot(X, y, "b.")                                 # not shown
plt.xlabel("$x_1$", fontsize=18)                     # not shown
plt.ylabel("$y$", rotation=0, fontsize=18)           # not shown
plt.axis([0, 2, 0, 15])                              # not shown
save_fig("sgd_plot")                                 # not shown
plt.show()                                           # not shown

Saving figure sgd_plot

# 查看训练结果
theta

array([[4.21076011],
       [2.74856079]])

# 直接导入SGD回归器
from sklearn.linear_model import SGDRegressor
# 传入最大迭代次数，损失值阈值，正则化，学习率
sgd_reg = SGDRegressor(max_iter=1000, tol=1e-3, penalty=None, eta0=0.1, random_state=42)
# 训练时会自动在X前加入一列全1，将y展平后作为标签
sgd_reg.fit(X, y.ravel())

# 得到偏置项和权重
sgd_reg.intercept_, sgd_reg.coef_

(array([4.24365286]), array([2.8250878]))

3. 小批量梯度下降

这是最后一种梯度下降方法，是SGD和GD的中和版，就是将原始训练集均分为多个子集，每次使用一个子集更新参数，可以使用向量化来更新参数，同时也比GD收敛更快，比SGD更加平稳；但是也容易陷入局部最优

theta_path_mgd = []
# 定义迭代次数
n_iterations = 50
# 子集大小
minibatch_size = 20

np.random.seed(42)
theta = np.random.randn(2,1)  # random initialization

t0, t1 = 200, 1000
def learning_schedule(t):
    return t0 / (t + t1)

t = 0
for epoch in range(n_iterations):
    # 打乱数据集
    shuffled_indices = np.random.permutation(m)
    X_b_shuffled = X_b[shuffled_indices]
    y_shuffled = y[shuffled_indices]
    # 起点，中点，步长
    for i in range(0, m, minibatch_size):
        # 迭代学习率
        t += 1
        # 分离子集
        xi = X_b_shuffled[i:i+minibatch_size]
        yi = y_shuffled[i:i+minibatch_size]
        # 使用向量化同时计算所有参数的梯度
        gradients = 2/minibatch_size * xi.T.dot(xi.dot(theta) - yi)
        # 学习率迭代
        eta = learning_schedule(t)
        # 更新参数
        theta = theta - eta * gradients
        # 将更新的参数存入数组
        theta_path_mgd.append(theta)

theta

array([[4.25214635],
       [2.7896408 ]])

theta_path_bgd = np.array(theta_path_bgd)
theta_path_sgd = np.array(theta_path_sgd)
theta_path_mgd = np.array(theta_path_mgd)

plt.figure(figsize=(7,4))
plt.plot(theta_path_sgd[:, 0], theta_path_sgd[:, 1], "r-s", linewidth=1, label="Stochastic")
plt.plot(theta_path_mgd[:, 0], theta_path_mgd[:, 1], "g-+", linewidth=2, label="Mini-batch")
plt.plot(theta_path_bgd[:, 0], theta_path_bgd[:, 1], "b-o", linewidth=3, label="Batch")
plt.legend(loc="upper left", fontsize=16)
plt.xlabel(r"$\theta_0$", fontsize=20)
plt.ylabel(r"$\theta_1$   ", fontsize=20, rotation=0)
plt.axis([2.5, 4.5, 2.3, 3.9])
save_fig("gradient_descent_paths_plot")
plt.show()

Saving figure gradient_descent_paths_plot

四、多项式回归

import numpy as np
import numpy.random as rnd

np.random.seed(42)

m = 100
X = 6 * np.random.rand(m, 1) - 3
y = 0.5 * X**2 + X + 2 + np.random.randn(m, 1)

对应方程： $y = \frac{1}{2}x^2 + x + 2$

plt.plot(X, y, "b.")
plt.xlabel("$x_1$", fontsize=18)
plt.ylabel("$y$", rotation=0, fontsize=18)
plt.axis([-3, 3, 0, 10])
save_fig("quadratic_data_plot")
plt.show()

Saving figure quadratic_data_plot

# 从预处理模块中导入多项式模块
from sklearn.preprocessing import PolynomialFeatures
# 基于单个特征X创建2阶特征：x^2
poly_features = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly_features.fit_transform(X)
X[0]

array([-0.75275929])

PolynomialFeatures在扩充特征时，若原始特征只有一个，那么就会生成阶数 <= degree 的特征多项式值，若不只一个，其还会生成各特征之间的多项式特征：degree = n_features = 3 ,则会生成a^3,a^2b,abc,...

# 看一下多项式特征的第一个样本的扩充后版本
X_poly[0]

array([-0.75275929,  0.56664654])

# 创建线性回归模型
lin_reg = LinearRegression()
# 利用多项式特征进行训练
lin_reg.fit(X_poly, y)
# 查看模型的bias和权重(两个，和扩充后的多项式特征列数保持一致)
lin_reg.intercept_, lin_reg.coef_

(array([1.78134581]), array([[0.93366893, 0.56456263]]))

X_new=np.linspace(-3, 3, 100).reshape(100, 1)
X_new_poly = poly_features.transform(X_new)
y_new = lin_reg.predict(X_new_poly)
plt.plot(X, y, "b.")
plt.plot(X_new, y_new, "r-", linewidth=2, label="Predictions")
plt.xlabel("$x_1$", fontsize=18)
plt.ylabel("$y$", rotation=0, fontsize=18)
plt.legend(loc="upper left", fontsize=14)
plt.axis([-3, 3, 0, 10])
save_fig("quadratic_predictions_plot")
plt.show()

Saving figure quadratic_predictions_plot

# 导入缩放器
from sklearn.preprocessing import StandardScaler
# 导入流水线
from sklearn.pipeline import Pipeline

for style, width, degree in (("g-", 1, 300), ("b--", 2, 2), ("r-+", 2, 1)):
    polybig_features = PolynomialFeatures(degree=degree, include_bias=False)
    std_scaler = StandardScaler()
    lin_reg = LinearRegression()
    polynomial_regression = Pipeline([
            ("poly_features", polybig_features),
            ("std_scaler", std_scaler),
            ("lin_reg", lin_reg),
        ])
    polynomial_regression.fit(X, y)
    y_newbig = polynomial_regression.predict(X_new)
    plt.plot(X_new, y_newbig, style, label=str(degree), linewidth=width)

plt.plot(X, y, "b.", linewidth=3)
plt.legend(loc="upper left")
plt.xlabel("$x_1$", fontsize=18)
plt.ylabel("$y$", rotation=0, fontsize=18)
plt.axis([-3, 3, 0, 10])
save_fig("high_degree_polynomials_plot")
plt.show()

Saving figure high_degree_polynomials_plot

五、学习曲线

# 导入均方误差
from sklearn.metrics import mean_squared_error
# 导入分离训练/测试集
from sklearn.model_selection import train_test_split

# 绘制学习曲线
def plot_learning_curves(model, X, y):
    # 分离出测试，验证集
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10)
    # 用于装载每次训练和验证的误差
    train_errors, val_errors = [], []
    for m in range(1, len(X_train) + 1):
        # 每次增大投入训练样本数
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.predict(X_train[:m])
        # 使用的验证集一致
        y_val_predict = model.predict(X_val)
        # 计算投入训练的样本和对应标签的均方误差加入列表
        train_errors.append(mean_squared_error(y_train[:m], y_train_predict))
        # 计算验证误差并加入列表
        val_errors.append(mean_squared_error(y_val, y_val_predict))

    plt.plot(np.sqrt(train_errors), "r-+", linewidth=2, label="train")
    plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="val")
    plt.legend(loc="upper right", fontsize=14)   # not shown in the book
    plt.xlabel("Training set size", fontsize=14) # not shown
    plt.ylabel("RMSE", fontsize=14)              # not shown

lin_reg = LinearRegression()
plot_learning_curves(lin_reg, X, y)
# 设置x轴和y轴的显示区间
plt.axis([0, 80, 0, 3])                         # not shown in the book
save_fig("underfitting_learning_curves_plot")   # not shown
plt.show()                                      # not shown

Saving figure underfitting_learning_curves_plot

可以看出，曲线的验证误差和测试误差都比较高，证明模型欠拟合。

from sklearn.pipeline import Pipeline
# 封装多项式回归的预处理操作
polynomial_regression = Pipeline([
        # 将原始特征升阶为degree阶的特征
        ("poly_features", PolynomialFeatures(degree=10, include_bias=False)),
        # 线性回归，返回一个回归器
        ("lin_reg", LinearRegression()),
    ])

plot_learning_curves(polynomial_regression, X, y)
plt.axis([0, 80, 0, 3])           # not shown
save_fig("learning_curves_plot")  # not shown
plt.show()                        # not shown

Saving figure learning_curves_plot

可以看出，模型的训练误差较小，而验证误差较大，证明存在过拟合。

六、正则化线性模型

1. 岭回归

np.random.seed(42)
m = 20
X = 3 * np.random.rand(m, 1)
y = 1 + 0.5 * X + np.random.randn(m, 1) / 1.5
X_new = np.linspace(0, 3, 100).reshape(100, 1)

# 从线性模型中导入岭回归
from sklearn.linear_model import Ridge
# 传入正则化系数，越大则惩罚力度越高
ridge_reg = Ridge(alpha=1, solver="cholesky", random_state=42)
ridge_reg.fit(X, y)
ridge_reg.predict([[1.5]])

array([[1.55071465]])

ridge_reg = Ridge(alpha=1, solver="sag", random_state=42)
ridge_reg.fit(X, y)
ridge_reg.predict([[1.5]])

array([[1.5507201]])

from sklearn.linear_model import Ridge

def plot_model(model_class, polynomial, alphas, **model_kargs):
    for alpha, style in zip(alphas, ("b-", "g--", "r:")):
        model = model_class(alpha, **model_kargs) if alpha > 0 else LinearRegression()
        if polynomial:
            model = Pipeline([
                    ("poly_features", PolynomialFeatures(degree=10, include_bias=False)),
                    ("std_scaler", StandardScaler()),
                    ("regul_reg", model),
                ])
        model.fit(X, y)
        y_new_regul = model.predict(X_new)
        lw = 2 if alpha > 0 else 1
        plt.plot(X_new, y_new_regul, style, linewidth=lw, label=r"$\alpha = {}$".format(alpha))
    plt.plot(X, y, "b.", linewidth=3)
    plt.legend(loc="upper left", fontsize=15)
    plt.xlabel("$x_1$", fontsize=18)
    plt.axis([0, 3, 0, 4])

plt.figure(figsize=(8,4))
plt.subplot(121)
# 线性回归的三种正则化力度
plot_model(Ridge, polynomial=False, alphas=(0, 10, 100), random_state=42)
plt.ylabel("$y$", rotation=0, fontsize=18)
plt.subplot(122)
# 多项式回归的三种正则化力度
plot_model(Ridge, polynomial=True, alphas=(0, 10**-5, 1), random_state=42)

save_fig("ridge_regression_plot")
plt.show()

Saving figure ridge_regression_plot

# 使用l2范数，和之前的岭回归正则化项一致
sgd_reg = SGDRegressor(penalty="l2", max_iter=1000, tol=1e-3, random_state=42)
sgd_reg.fit(X, y.ravel())
sgd_reg.predict([[1.5]])

array([1.47012588])

2. Lasso回归

# 从线性模型中导入lasso回归
from sklearn.linear_model import Lasso

plt.figure(figsize=(8,4))
plt.subplot(121)
# 绘制线性回归的三种正则化力度
plot_model(Lasso, polynomial=False, alphas=(0, 0.1, 1), random_state=42)
plt.ylabel("$y$", rotation=0, fontsize=18)
plt.subplot(122)
# 绘制多项式回归的三种正则化力度
plot_model(Lasso, polynomial=True, alphas=(0, 10**-7, 1), random_state=42)

save_fig("lasso_regression_plot")
plt.show()

/home/zhoumingyao/anaconda/yes/envs/tf_gpu/lib/python3.10/site-packages/sklearn/linear_model/_coordinate_descent.py:648: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.803e+00, tolerance: 9.295e-04
  model = cd_fast.enet_coordinate_descent(


Saving figure lasso_regression_plot

from sklearn.linear_model import Lasso
lasso_reg = Lasso(alpha=0.1)
lasso_reg.fit(X, y)
lasso_reg.predict([[1.5]])

array([1.53788174])

3. 弹性网络

# 从线性模型中导入弹性网络
from sklearn.linear_model import ElasticNet
# 传入正则化力度，混合Lasso正则化项和岭回归正则化项的比例
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42)
elastic_net.fit(X, y)
elastic_net.predict([[1.5]])

array([1.54333232])

4. 早停止

np.random.seed(42)
m = 100
X = 6 * np.random.rand(m, 1) - 3
y = 2 + X + 0.5 * X**2 + np.random.randn(m, 1)
# 使用数据集的前50个样本随机抽样分离为训练集(25)和验证集(25)
X_train, X_val, y_train, y_val = train_test_split(X[:50], y[:50].ravel(), test_size=0.5, random_state=10)

可以看出，多项式回归在验证时也需要对原始数据进行扩充。

from copy import deepcopy
# 创造扩充数据集和缩放特征吃尺寸的流水线
poly_scaler = Pipeline([
        # 基于x创造额外的90阶特征
        ("poly_features", PolynomialFeatures(degree=90, include_bias=False)),
        # 简单缩放
        ("std_scaler", StandardScaler())
    ])
# 对训练集和验证集进行缩放
X_train_poly_scaled = poly_scaler.fit_transform(X_train)
X_val_poly_scaled = poly_scaler.transform(X_val)
# 随机梯度下降，为了实现早终止，我们需要将损失阈值设置为负无穷
sgd_reg = SGDRegressor(max_iter=1, tol=-np.infty, warm_start=True,
                       penalty=None, learning_rate="constant", eta0=0.0005, random_state=42)
# 用于记录最小的验证误差
minimum_val_error = float("inf")
best_epoch = None
best_model = None
# 将训练集遍历1000次
for epoch in range(1000):
    # 训练，验证时只需要对训练，验证集进行缩放，标签是不需要缩放的(归一化也是)
    sgd_reg.fit(X_train_poly_scaled, y_train)  # continues where it left off
    y_val_predict = sgd_reg.predict(X_val_poly_scaled)
    val_error = mean_squared_error(y_val, y_val_predict)
    # 迭代最小验证误差
    if val_error < minimum_val_error:
        minimum_val_error = val_error
        best_epoch = epoch
        best_model = deepcopy(sgd_reg)

虽然名为早终止，但是我们还是将数据集便利了1000次，只是挑选其中验证误差最小的模型作为结果返回而已。

sgd_reg = SGDRegressor(max_iter=1, tol=-np.infty, warm_start=True,
                       penalty=None, learning_rate="constant", eta0=0.0005, random_state=42)

n_epochs = 500
train_errors, val_errors = [], []
for epoch in range(n_epochs):
    sgd_reg.fit(X_train_poly_scaled, y_train)
    y_train_predict = sgd_reg.predict(X_train_poly_scaled)
    y_val_predict = sgd_reg.predict(X_val_poly_scaled)
    train_errors.append(mean_squared_error(y_train, y_train_predict))
    val_errors.append(mean_squared_error(y_val, y_val_predict))

best_epoch = np.argmin(val_errors)
best_val_rmse = np.sqrt(val_errors[best_epoch])

plt.annotate('Best model',
             xy=(best_epoch, best_val_rmse),
             xytext=(best_epoch, best_val_rmse + 1),
             ha="center",
             arrowprops=dict(facecolor='black', shrink=0.05),
             fontsize=16,
            )

best_val_rmse -= 0.03  # just to make the graph look better
plt.plot([0, n_epochs], [best_val_rmse, best_val_rmse], "k:", linewidth=2)
plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="Validation set")
plt.plot(np.sqrt(train_errors), "r--", linewidth=2, label="Training set")
plt.legend(loc="upper right", fontsize=14)
plt.xlabel("Epoch", fontsize=14)
plt.ylabel("RMSE", fontsize=14)
save_fig("early_stopping_plot")
plt.show()

Saving figure early_stopping_plot

# 查看最佳迭代次数和最佳模型
best_epoch, best_model

(239,
 SGDRegressor(eta0=0.0005, learning_rate='constant', max_iter=1, penalty=None,
              random_state=42, tol=-inf, warm_start=True))

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

t1a, t1b, t2a, t2b = -1, 3, -1.5, 1.5

t1s = np.linspace(t1a, t1b, 500)
t2s = np.linspace(t2a, t2b, 500)
t1, t2 = np.meshgrid(t1s, t2s)
T = np.c_[t1.ravel(), t2.ravel()]
Xr = np.array([[1, 1], [1, -1], [1, 0.5]])
yr = 2 * Xr[:, :1] + 0.5 * Xr[:, 1:]
# 
J = (1/len(Xr) * np.sum((T.dot(Xr.T) - yr.T)**2, axis=1)).reshape(t1.shape)

N1 = np.linalg.norm(T, ord=1, axis=1).reshape(t1.shape)
N2 = np.linalg.norm(T, ord=2, axis=1).reshape(t1.shape)

t_min_idx = np.unravel_index(np.argmin(J), J.shape)
t1_min, t2_min = t1[t_min_idx], t2[t_min_idx]

t_init = np.array([[0.25], [-1]])

def bgd_path(theta, X, y, l1, l2, core = 1, eta = 0.05, n_iterations = 200):
    path = [theta]
    for iteration in range(n_iterations):
        gradients = core * 2/len(X) * X.T.dot(X.dot(theta) - y) + l1 * np.sign(theta) + l2 * theta
        theta = theta - eta * gradients
        path.append(theta)
    return np.array(path)

fig, axes = plt.subplots(2, 2, sharex=True, sharey=True, figsize=(10.1, 8))
for i, N, l1, l2, title in ((0, N1, 2., 0, "Lasso"), (1, N2, 0,  2., "Ridge")):
    JR = J + l1 * N1 + l2 * 0.5 * N2**2
    
    tr_min_idx = np.unravel_index(np.argmin(JR), JR.shape)
    t1r_min, t2r_min = t1[tr_min_idx], t2[tr_min_idx]

    levelsJ=(np.exp(np.linspace(0, 1, 20)) - 1) * (np.max(J) - np.min(J)) + np.min(J)
    levelsJR=(np.exp(np.linspace(0, 1, 20)) - 1) * (np.max(JR) - np.min(JR)) + np.min(JR)
    levelsN=np.linspace(0, np.max(N), 10)
    
    path_J = bgd_path(t_init, Xr, yr, l1=0, l2=0)
    path_JR = bgd_path(t_init, Xr, yr, l1, l2)
    path_N = bgd_path(np.array([[2.0], [0.5]]), Xr, yr, np.sign(l1)/3, np.sign(l2), core=0)

    ax = axes[i, 0]
    ax.grid(True)
    ax.axhline(y=0, color='k')
    ax.axvline(x=0, color='k')
    ax.contourf(t1, t2, N / 2., levels=levelsN)
    ax.plot(path_N[:, 0], path_N[:, 1], "y--")
    ax.plot(0, 0, "ys")
    ax.plot(t1_min, t2_min, "ys")
    ax.set_title(r"$\ell_{}$ penalty".format(i + 1), fontsize=16)
    ax.axis([t1a, t1b, t2a, t2b])
    if i == 1:
        ax.set_xlabel(r"$\theta_1$", fontsize=16)
    ax.set_ylabel(r"$\theta_2$", fontsize=16, rotation=0)

    ax = axes[i, 1]
    ax.grid(True)
    ax.axhline(y=0, color='k')
    ax.axvline(x=0, color='k')
    ax.contourf(t1, t2, JR, levels=levelsJR, alpha=0.9)
    ax.plot(path_JR[:, 0], path_JR[:, 1], "w-o")
    ax.plot(path_N[:, 0], path_N[:, 1], "y--")
    ax.plot(0, 0, "ys")
    ax.plot(t1_min, t2_min, "ys")
    ax.plot(t1r_min, t2r_min, "rs")
    ax.set_title(title, fontsize=16)
    ax.axis([t1a, t1b, t2a, t2b])
    if i == 1:
        ax.set_xlabel(r"$\theta_1$", fontsize=16)

save_fig("lasso_vs_ridge_plot")
plt.show()

Saving figure lasso_vs_ridge_plot

七、逻辑回归

1. 决策边界

t = np.linspace(-10, 10, 100)
sig = 1 / (1 + np.exp(-t))
plt.figure(figsize=(9, 3))
plt.plot([-10, 10], [0, 0], "k-")
plt.plot([-10, 10], [0.5, 0.5], "k:")
plt.plot([-10, 10], [1, 1], "k:")
plt.plot([0, 0], [-1.1, 1.1], "k-")
plt.plot(t, sig, "b-", linewidth=2, label=r"$\sigma(t) = \frac{1}{1 + e^{-t}}$")
plt.xlabel("t")
plt.legend(loc="upper left", fontsize=20)
plt.axis([-10, 10, -0.1, 1.1])
save_fig("logistic_function_plot")
plt.show()

Saving figure logistic_function_plot

from sklearn import datasets
# 导入鸢尾花数据集
iris = datasets.load_iris()
list(iris.keys())

['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module']

print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

    ============== ==== ==== ======= ===== ====================
                    Min  Max   Mean    SD   Class Correlation
    ============== ==== ==== ======= ===== ====================
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)
    ============== ==== ==== ======= ===== ====================

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :Date: July, 1988

The famous Iris database, first used by Sir R.A. Fisher. The dataset is taken
from Fisher's paper. Note that it's the same as in R, but not as in the UCI
Machine Learning Repository, which has two wrong data points.

This is perhaps the best known database to be found in the
pattern recognition literature.  Fisher's paper is a classic in the field and
is referenced frequently to this day.  (See Duda & Hart, for example.)  The
data set contains 3 classes of 50 instances each, where each class refers to a
type of iris plant.  One class is linearly separable from the other 2; the
latter are NOT linearly separable from each other.

.. topic:: References

   - Fisher, R.A. "The use of multiple measurements in taxonomic problems"
     Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to
     Mathematical Statistics" (John Wiley, NY, 1950).
   - Duda, R.O., & Hart, P.E. (1973) Pattern Classification and Scene Analysis.
     (Q327.D83) John Wiley & Sons.  ISBN 0-471-22361-1.  See page 218.
   - Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System
     Structure and Classification Rule for Recognition in Partially Exposed
     Environments".  IEEE Transactions on Pattern Analysis and Machine
     Intelligence, Vol. PAMI-2, No. 1, 67-71.
   - Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule".  IEEE Transactions
     on Information Theory, May 1972, 431-433.
   - See also: 1988 MLC Proceedings, 54-64.  Cheeseman et al"s AUTOCLASS II
     conceptual clustering system finds 3 classes in the data.
   - Many, many more ...

# 分离出数据集的第4列特征：花瓣宽度
X = iris["data"][:, 3:]  # petal width
# 重构标签，我们需要判别是否为弗吉尼亚鸢尾花
y = (iris["target"] == 2).astype(np.int)  # 1 if Iris virginica, else 0

/tmp/ipykernel_3625908/3869195518.py:4: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y = (iris["target"] == 2).astype(np.int)  # 1 if Iris virginica, else 0

from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(solver="lbfgs", random_state=42)
log_reg.fit(X, y)

X_new = np.linspace(0, 3, 1000).reshape(-1, 1)
# 获得是否为弗吉尼亚鸢尾花的概率
y_proba = log_reg.predict_proba(X_new)

plt.plot(X_new, y_proba[:, 1], "g-", linewidth=2, label="Iris virginica")
plt.plot(X_new, y_proba[:, 0], "b--", linewidth=2, label="Not Iris virginica")

[<matplotlib.lines.Line2D at 0x7fe6fb866bf0>]

X_new = np.linspace(0, 3, 1000).reshape(-1, 1)
y_proba = log_reg.predict_proba(X_new)
decision_boundary = X_new[y_proba[:, 1] >= 0.5][0]

plt.figure(figsize=(8, 3))
plt.plot(X[y==0], y[y==0], "bs")
plt.plot(X[y==1], y[y==1], "g^")
plt.plot([decision_boundary, decision_boundary], [-1, 2], "k:", linewidth=2)
plt.plot(X_new, y_proba[:, 1], "g-", linewidth=2, label="Iris virginica")
plt.plot(X_new, y_proba[:, 0], "b--", linewidth=2, label="Not Iris virginica")
plt.text(decision_boundary+0.02, 0.15, "Decision  boundary", fontsize=14, color="k", ha="center")
plt.arrow(decision_boundary, 0.08, -0.3, 0, head_width=0.05, head_length=0.1, fc='b', ec='b')
plt.arrow(decision_boundary, 0.92, 0.3, 0, head_width=0.05, head_length=0.1, fc='g', ec='g')
plt.xlabel("Petal width (cm)", fontsize=14)
plt.ylabel("Probability", fontsize=14)
plt.legend(loc="center left", fontsize=14)
plt.axis([0, 3, -0.02, 1.02])
save_fig("logistic_regression_plot")
plt.show()

Saving figure logistic_regression_plot


/home/zhoumingyao/anaconda/yes/envs/tf_gpu/lib/python3.10/site-packages/matplotlib/patches.py:1479: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.
  self.verts = np.dot(coords, M) + [

decision_boundary

array([1.66066066])

log_reg.predict([[1.7], [1.5]])

array([1, 0])

2. Softmax回归

# 导入逻辑回归
from sklearn.linear_model import LogisticRegression
# 分离出花瓣长度和宽度
X = iris["data"][:, (2, 3)]  # petal length, petal width
y = (iris["target"] == 2).astype(np.int)
# 最小二乘
log_reg = LogisticRegression(solver="lbfgs", C=10**10, random_state=42)
log_reg.fit(X, y)

x0, x1 = np.meshgrid(
        np.linspace(2.9, 7, 500).reshape(-1, 1),
        np.linspace(0.8, 2.7, 200).reshape(-1, 1),
    )
X_new = np.c_[x0.ravel(), x1.ravel()]

y_proba = log_reg.predict_proba(X_new)

plt.figure(figsize=(10, 4))
plt.plot(X[y==0, 0], X[y==0, 1], "bs")
plt.plot(X[y==1, 0], X[y==1, 1], "g^")

zz = y_proba[:, 1].reshape(x0.shape)
contour = plt.contour(x0, x1, zz, cmap=plt.cm.brg)


left_right = np.array([2.9, 7])
boundary = -(log_reg.coef_[0][0] * left_right + log_reg.intercept_[0]) / log_reg.coef_[0][1]

plt.clabel(contour, inline=1, fontsize=12)
plt.plot(left_right, boundary, "k--", linewidth=3)
plt.text(3.5, 1.5, "Not Iris virginica", fontsize=14, color="b", ha="center")
plt.text(6.5, 2.3, "Iris virginica", fontsize=14, color="g", ha="center")
plt.xlabel("Petal length", fontsize=14)
plt.ylabel("Petal width", fontsize=14)
plt.axis([2.9, 7, 0.8, 2.7])
save_fig("logistic_regression_contour_plot")
plt.show()

/tmp/ipykernel_3625908/1887530942.py:5: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y = (iris["target"] == 2).astype(np.int)


Saving figure logistic_regression_contour_plot

X = iris["data"][:, (2, 3)]  # petal length, petal width
y = iris["target"]

softmax_reg = LogisticRegression(multi_class="multinomial",solver="lbfgs", C=10, random_state=42)
softmax_reg.fit(X, y)

x0, x1 = np.meshgrid(
        np.linspace(0, 8, 500).reshape(-1, 1),
        np.linspace(0, 3.5, 200).reshape(-1, 1),
    )
X_new = np.c_[x0.ravel(), x1.ravel()]


y_proba = softmax_reg.predict_proba(X_new)
y_predict = softmax_reg.predict(X_new)

zz1 = y_proba[:, 1].reshape(x0.shape)
zz = y_predict.reshape(x0.shape)

plt.figure(figsize=(10, 4))
plt.plot(X[y==2, 0], X[y==2, 1], "g^", label="Iris virginica")
plt.plot(X[y==1, 0], X[y==1, 1], "bs", label="Iris versicolor")
plt.plot(X[y==0, 0], X[y==0, 1], "yo", label="Iris setosa")

from matplotlib.colors import ListedColormap
custom_cmap = ListedColormap(['#fafab0','#9898ff','#a0faa0'])

plt.contourf(x0, x1, zz, cmap=custom_cmap)
contour = plt.contour(x0, x1, zz1, cmap=plt.cm.brg)
plt.clabel(contour, inline=1, fontsize=12)
plt.xlabel("Petal length", fontsize=14)
plt.ylabel("Petal width", fontsize=14)
plt.legend(loc="center left", fontsize=14)
plt.axis([0, 7, 0, 3.5])
save_fig("softmax_regression_contour_plot")
plt.show()

Saving figure softmax_regression_contour_plot

softmax_reg.predict([[5, 2]])

array([2])

softmax_reg.predict_proba([[5, 2]])

array([[6.38014896e-07, 5.74929995e-02, 9.42506362e-01]])

八、课后练习

1. 题目

在Softmax回归中使用带早停止的梯度下降。

X = iris["data"][:, (2, 3)]  # petal length, petal width
y = iris["target"]

X.shape

(150, 2)

为了训练\theta_0我们需要在X的第一列加上全一。

# 注意内置函数c_的调用方式
X = np.c_[np.ones((X.shape[0],1)),X]
X[:5]

array([[1. , 1.4, 0.2],
       [1. , 1.4, 0.2],
       [1. , 1.3, 0.2],
       [1. , 1.5, 0.2],
       [1. , 1.4, 0.2]])

# 设定随机种子便于复现
np.random.seed(2042)

手动分离验证集，测试集，训练集

val_ratio = 0.2
test_ratio = 0.2

# 需要强制转换，便于整数分离
val_size = int(len(X)*val_ratio)
test_size = int(len(X)*test_ratio)
train_size = len(X) - val_size - test_size

# 打乱数据集
rnd_indices = np.random.permutation(len(X))

# 分离数据集
X_train = X[rnd_indices[:train_size]]
y_train = y[rnd_indices[:train_size]]
X_valid = X[rnd_indices[train_size:-test_size]]
y_valid = y[rnd_indices[train_size:-test_size]]
X_test = X[rnd_indices[-test_size:]]
y_test = y[rnd_indices[-test_size:]]

目标是目前的类指数(0,1或2) ，但我们需要目标类概率训练Softmax回归模型。每个实例的目标类概率对于所有类都等于0.0，除了目标类的概率为1.0(换句话说，给定实例的类概率向量是One-hot向量)。让我们编写一个小函数，将类索引的向量转换为一个矩阵，其中包含每个实例的one-hot向量:

def to_one_hot(y):
    # 创建向量长度
    n_classes = y.max() + 1
    # 获取标签(样本)个数
    m = len(y)
    # 构建One_hot矩阵
    Y_one_hot = np.zeros((m, n_classes))
    Y_one_hot[np.arange(m), y] = 1
    return Y_one_hot

y_train[:10]

array([1, 0, 0, 1, 0, 2, 1, 0, 0, 0])

to_one_hot(y_train[:10])

array([[0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

# 对所有需要用到的标签全部转化为One-hot向量
Y_train_one_hot = to_one_hot(y_train)
Y_valid_one_hot = to_one_hot(y_valid)
Y_test_one_hot = to_one_hot(y_test)

我们需要用到下面的Softmax函数： $\sigma\left(\mathbf{s}(\mathbf{x})\right)_k = \dfrac{\exp\left(s_k(\mathbf{x})\right)}{\sum\limits_{j=1}^{K}{\exp\left(s_j(\mathbf{x})\right)}}$

def softmax(logits):
    # 先对输入向量求指数
    exps = np.exp(logits)
    # 对所有分量求和
    exp_sums = np.sum(exps, axis=1, keepdims=True)
    # 返回向量
    return exps / exp_sums

# 扩充之后的特征列数
n_inputs = X_train.shape[1] # == 3 (2 features plus the bias term)
# 分类数
n_outputs = len(np.unique(y_train))   # == 3 (3 iris classes)

现在最难的部分来了：训练！从理论上讲，这很简单：只需将数学方程式转换为 Python 代码即可。但在实践中，这可能非常棘手：特别是，很容易混淆术语或索引的顺序。您甚至可以得到看起来可以正常工作但实际上计算不正确的代码。如果不确定，您应该记下方程式中每个项的形状，并确保代码中的相应项紧密匹配。它还可以帮助独立评估每个术语并将其打印出来。好消息是您不必每天都这样做，因为所有这些都由 Scikit-Learn 很好地实现，但它将帮助您了解幕后发生的事情。

所以我们需要的方程是成本函数： $J(\mathbf{\Theta}) =- \dfrac{1}{m}\sum\limits_{i=1}^{m}\sum\limits_{k=1}^{K}{y_k^{(i)}\log\left(\hat{p}_k^{(i)}\right)}$

下面是梯度的等式： $\nabla_{\mathbf{\theta}^{(k)}} \, J(\mathbf{\Theta}) =\dfrac{1}{m} \sum\limits_{i=1}^{m}{ \left (\hat{p}^{(i)}_k - y_k^{(i)} \right ) \mathbf{x}^{(i)}}$

请注意，如果 $\hat{p}_k^{(i)} = 0$ ， $\log\left(\hat{p}_k^{(i)}\right)$ 可能不可计算。所以我们将在 $\log\left(\hat{p}_k^{(i)}\right)$ 中添加一个很小的值 $\epsilon$ 以避免获得 nan 值。

# 学习率
eta = 0.01
# 迭代次数
n_iterations = 5001
# 获取训练样本个数
m = len(X_train)
# 为了避免计算无穷，我们需要加上的小偏移量
epsilon = 1e-7
# 随机初始化，类似FC
Theta = np.random.randn(n_inputs, n_outputs)

for iteration in range(n_iterations):
    # 计算线性部分
    logits = X_train.dot(Theta)
    # 通过softmax函数得到每个样本所属类别的概率向量
    Y_proba = softmax(logits)
    # 
    if iteration % 500 == 0:
        # 计算对数损失
        loss = -np.mean(np.sum(Y_train_one_hot * np.log(Y_proba + epsilon), axis=1))
        print(iteration, loss)
    # 获取预测误差
    error = Y_proba - Y_train_one_hot
    # 计算梯度
    gradients = 1/m * X_train.T.dot(error)
    # 更新参数
    Theta = Theta - eta * gradients

0 1.7566514272963718
500 0.6499122631441034
1000 0.5635970396781447
1500 0.5083336196322823
2000 0.46932504144195186
2500 0.4398469322872781
3000 0.4164631955737996
3500 0.3972424890678693
4000 0.38101408194917813
4500 0.3670243091579063
5000 0.35476389757347665

查看训练所得参数

Theta

array([[ 3.09718355, -1.34108953, -3.21412704],
       [-1.56676206, -0.083574  , -0.55030641],
       [-1.26594203, -0.82304282,  1.81463983]])

上面每一列参数都代表着一个线性模型。

查看预测精度

logits = X_valid.dot(Theta)
Y_proba = softmax(logits)
y_predict = np.argmax(Y_proba, axis=1)

accuracy_score = np.mean(y_predict == y_valid)
accuracy_score

0.8666666666666667

嗯，这个模型看起来不错。为了练习，让我们添加一点 $\ell_2$ 正则化。下面的训练代码与上面的类似，但是现在损失有一个额外的 $\ell_2$ 惩罚，并且梯度有适当的附加项（注意我们没有正则化 Theta 的第一个元素，因为它对应到偏置项）。另外，让我们尝试增加学习率 eta。

eta = 0.1
n_iterations = 5001
m = len(X_train)
epsilon = 1e-7
alpha = 0.1  # regularization hyperparameter

Theta = np.random.randn(n_inputs, n_outputs)

for iteration in range(n_iterations):
    logits = X_train.dot(Theta)
    Y_proba = softmax(logits)
    if iteration % 500 == 0:
        # 交叉熵损失
        xentropy_loss = -np.mean(np.sum(Y_train_one_hot * np.log(Y_proba + epsilon), axis=1))
        # 正则化损失
        l2_loss = 1/2 * np.sum(np.square(Theta[1:]))
        loss = xentropy_loss + alpha * l2_loss
        print(iteration, loss)
    error = Y_proba - Y_train_one_hot
    gradients = 1/m * X_train.T.dot(error) + np.r_[np.zeros([1, n_outputs]), alpha * Theta[1:]]
    Theta = Theta - eta * gradients

0 3.5042205555294186
500 0.5305939452296691
1000 0.4940063325671582
1500 0.48359691213952016
2000 0.4797382344449844
2500 0.47816481468935207
3000 0.4774924122336185
3500 0.47719737797330136
4000 0.47706584562218807
4500 0.47700661503537
5000 0.4769797695891438

由于额外的 $\ell_2$ 惩罚，损失似乎比之前更大，但也许这个模型会表现得更好？让我们来了解一下：

logits = X_valid.dot(Theta)
Y_proba = softmax(logits)
y_predict = np.argmax(Y_proba, axis=1)

accuracy_score = np.mean(y_predict == y_valid)
accuracy_score

0.9333333333333333

酷，完美的准确性！我们可能只是幸运地使用了这个验证集，但仍然令人愉快。

现在让我们添加提前停止。为此，我们只需要在每次迭代时测量验证集的损失，并在错误开始增长时停止。

eta = 0.1 
n_iterations = 5001
m = len(X_train)
epsilon = 1e-7
alpha = 0.1  # regularization hyperparameter
best_loss = np.infty

Theta = np.random.randn(n_inputs, n_outputs)

for iteration in range(n_iterations):
    logits = X_train.dot(Theta)
    Y_proba = softmax(logits)
    error = Y_proba - Y_train_one_hot
    gradients = 1/m * X_train.T.dot(error) + np.r_[np.zeros([1, n_outputs]), alpha * Theta[1:]]
    Theta = Theta - eta * gradients

    logits = X_valid.dot(Theta)
    Y_proba = softmax(logits)
    xentropy_loss = -np.mean(np.sum(Y_valid_one_hot * np.log(Y_proba + epsilon), axis=1))
    l2_loss = 1/2 * np.sum(np.square(Theta[1:]))
    loss = xentropy_loss + alpha * l2_loss
    if iteration % 500 == 0:
        print(iteration, loss)
    if loss < best_loss:
        best_loss = loss
    else:
        print(iteration - 1, best_loss)
        print(iteration, loss, "early stopping!")
        break

0 3.9205514576185587
500 0.5972692961242838
1000 0.5798097236791675
1500 0.5730485589413589
2000 0.5700477927720872
2500 0.5686095581482349
3000 0.5678757735233323
3500 0.5674801609122399
4000 0.5672561669147707
4500 0.5671238645011465
5000 0.5670429310742888

logits = X_valid.dot(Theta)
Y_proba = softmax(logits)
y_predict = np.argmax(Y_proba, axis=1)

accuracy_score = np.mean(y_predict == y_valid)
accuracy_score

0.9333333333333333

x0, x1 = np.meshgrid(
        np.linspace(0, 8, 500).reshape(-1, 1),
        np.linspace(0, 3.5, 200).reshape(-1, 1),
    )
X_new = np.c_[x0.ravel(), x1.ravel()]
X_new_with_bias = np.c_[np.ones([len(X_new), 1]), X_new]

logits = X_new_with_bias.dot(Theta)
Y_proba = softmax(logits)
y_predict = np.argmax(Y_proba, axis=1)

zz1 = Y_proba[:, 1].reshape(x0.shape)
zz = y_predict.reshape(x0.shape)

plt.figure(figsize=(10, 4))
plt.plot(X[y==2, 0], X[y==2, 1], "g^", label="Iris virginica")
plt.plot(X[y==1, 0], X[y==1, 1], "bs", label="Iris versicolor")
plt.plot(X[y==0, 0], X[y==0, 1], "yo", label="Iris setosa")

from matplotlib.colors import ListedColormap
custom_cmap = ListedColormap(['#fafab0','#9898ff','#a0faa0'])

plt.contourf(x0, x1, zz, cmap=custom_cmap)
contour = plt.contour(x0, x1, zz1, cmap=plt.cm.brg)
plt.clabel(contour, inline=1, fontsize=12)
plt.xlabel("Petal length", fontsize=14)
plt.ylabel("Petal width", fontsize=14)
plt.legend(loc="upper left", fontsize=14)
plt.axis([0, 7, 0, 3.5])
plt.show()

第四章 - 训练

训练