机器学习之参数优化训练代价增加了些，不再是0了。也就是说我们减轻过拟合

欢迎关注公众号:sumsmile /专注图像处理的移动开发老兵

基于偏差和方差的优化

模型训练完成后，怎么直观地观察模型的好坏呢？比较训练集的代价（误差）和交叉验证集的代价，可以大致判断模型是欠拟合（高偏差）还是过拟合（高方差）

方差的评价

观察训练集的损失图和交叉样本集的损失图，判断优化方向

过拟合-增加正则	欠拟合-需增加多项式拟合

减轻过拟合	lambda太大造成欠拟合

代码说明

这个案例中，输入是水位，输出是流速。即研究水库中水位是如何影响水的流速的。

加载数据

import numpy as np
import scipy.io as sio
import scipy.optimize as opt
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def load_data(path='ex5data1.mat'):
    """Load the ex5 water-level / flow data set from a MATLAB ``.mat`` file.

    The file stores each array as a 2-D column (the original note records
    ``d['X']`` as shape ``(12, 1)``). Every array is flattened to 1-D because
    pandas has trouble building a DataFrame column from a 2-D ndarray.

    Args:
        path: location of the ``.mat`` file. Defaults to ``'ex5data1.mat'``
            relative to the current working directory.

    Returns:
        A list ``[X, y, Xval, yval, Xtest, ytest]`` of 1-D ndarrays. A real
        list (rather than a lazy ``map`` object) is returned so the result
        can be indexed, measured with ``len`` and iterated more than once;
        tuple-unpacking by callers works exactly as before.
    """
    d = sio.loadmat(path)
    keys = ('X', 'y', 'Xval', 'yval', 'Xtest', 'ytest')
    return [np.ravel(d[key]) for key in keys]

# Load the six flattened arrays and put the training pair into a DataFrame
# so seaborn can plot it by column name.
X, y, Xval, yval, Xtest, ytest = load_data()
df = pd.DataFrame({'water_level':X, 'flow':y})

# Scatter plot of the training set only (fit_reg=False: no regression line).
sns.set(context="notebook", style="darkgrid", palette=sns.color_palette("RdBu", 2))
sns.lmplot(x = 'water_level', y = 'flow', data=df, fit_reg=False, height=7)
plt.show()

# Prepend a column of ones (the intercept term) to each feature vector,
# turning every 1-D array of length k into a (k, 2) design matrix.
X, Xval, Xtest = [np.insert(x.reshape(x.shape[0], 1), 0, np.ones(x.shape[0]), axis=1) for x in (X, Xval, Xtest)]

复习下代价函数

代价函数

def cost(theta, X, y):
    """Return the unregularized squared-error cost of a linear model.

    Computes ``sum((X @ theta - y) ** 2) / (2 * m)`` where ``m`` is the
    number of rows of ``X``.

    Args:
        theta: parameter vector, one entry per column of ``X``.
        X: design matrix with ``m`` rows (records) and ``n`` columns (features).
        y: target vector of length ``m``.

    Returns:
        The scalar cost.
    """
    residuals = X @ theta - y
    n_samples = X.shape[0]

    # For a 1-D array ``.T`` is a no-op, so this is the inner product of the
    # residual vector with itself, i.e. the sum of squared residuals.
    return (residuals.T @ residuals) / (2 * n_samples)

# Sanity check: evaluate the unregularized cost at an all-ones parameter
# vector (one entry per column of X).
theta = np.ones(X.shape[1])
cost(theta, X, y)
303.9515255535976

复习下梯度

通过求偏导来计算梯度

def gradient(theta, X, y):
    """Return the gradient of the unregularized squared-error cost.

    Computes ``X.T @ (X @ theta - y) / m`` where ``m`` is the number of rows
    of ``X``.

    Args:
        theta: parameter vector, one entry per column of ``X``.
        X: design matrix with ``m`` rows.
        y: target vector of length ``m``.

    Returns:
        An array with one partial derivative per entry of ``theta``.
    """
    n_samples = X.shape[0]
    residuals = X @ theta - y

    return (X.T @ residuals) / n_samples

# Evaluate the unregularized gradient at the current theta.
gradient(theta, X, y)
array([-15.30301567, 598.16741084])
# 这里的theta向量是两个参数，即一维的线性拟合

正则化梯度

def regularized_gradient(theta, X, y, l=1):
    """Return the gradient of the L2-regularized squared-error cost.

    Adds ``(l / m) * theta`` to the plain gradient, with the first entry
    (the intercept) excluded from the penalty.

    Args:
        theta: parameter vector; ``theta[0]`` is treated as the intercept.
        X: design matrix with ``m`` rows.
        y: target vector of length ``m``.
        l: regularization strength (lambda).

    Returns:
        An array with the same shape as ``theta``.
    """
    n_samples = X.shape[0]

    # Work on a copy so the caller's theta is untouched, and zero the
    # intercept slot because the intercept is not regularized.
    shrinkage = theta.copy()
    shrinkage[0] = 0
    shrinkage = (l / n_samples) * shrinkage

    return gradient(theta, X, y) + shrinkage
# Evaluate the regularized gradient at the current theta with the default l=1.
regularized_gradient(theta, X, y)
array([-15.30301567, 598.25074417])
# l = 1时，求第一次的梯度差别不大

拟合数据

正则项 ${\lambda = 0}$

def linear_regression_np(X, y, l=1):
    """Fit regularized linear regression with scipy's TNC optimizer.

    Args:
        X: feature matrix, (m, n+1), already including the intercept
            column x0 = 1.
        y: target vector, (m,).
        l: lambda constant for regularization.

    Returns:
        The result object produced by ``scipy.optimize.minimize``; the
        fitted parameters are available as its ``x`` entry.
    """
    # Start the search from an all-ones parameter vector.
    initial_theta = np.ones(X.shape[1])

    return opt.minimize(
        regularized_cost,
        initial_theta,
        args=(X, y, l),
        method='TNC',
        jac=regularized_gradient,
        options={'disp': True},
    )

def regularized_cost(theta, X, y, l=1):
    """Return the squared-error cost plus an L2 penalty on the parameters.

    The penalty is ``(l / (2 * m)) * sum(theta[1:] ** 2)``; the first entry
    of ``theta`` (the intercept) is not penalized.

    Args:
        theta: parameter vector; ``theta[0]`` is treated as the intercept.
        X: design matrix with ``m`` rows.
        y: target vector of length ``m``.
        l: regularization strength (lambda).

    Returns:
        The scalar regularized cost.
    """
    n_samples = X.shape[0]
    squared_weights = np.power(theta[1:], 2)
    penalty = (l / (2 * n_samples)) * np.sum(squared_weights)

    return cost(theta, X, y) + penalty

# Parameter vector has one entry per column of X (intercept + slope), so it
# must be sized by X.shape[1], not by the number of samples X.shape[0].
theta = np.ones(X.shape[1])

# Fit without regularization (l=0) and pull the fitted parameters out of the
# optimizer result.
final_theta = linear_regression_np(X, y, l=0).get('x')

b = final_theta[0] # intercept
m = final_theta[1] # slope

# Column 0 of X is the constant intercept term, so column 1 holds the
# actual water-level feature used on the x axis.
plt.scatter(X[:,1], y, label="Training data")
plt.plot(X[:, 1], X[:, 1]*m + b, label="Prediction")
plt.legend(loc=2)
plt.show()

线性拟合

线性拟合

1. 使用训练集的子集来拟合模型 2. 在计算训练代价和交叉验证代价时，不使用正则化 3. 记住使用相同的训练集子集来计算训练代价

# Accumulators for the learning curve; they must exist before the loop
# appends to them.
training_cost, cv_cost = [], []

m = X.shape[0]
for i in range(1, m+1):
    # Fit on the first i training examples only, without regularization.
    res = linear_regression_np(X[:i, :], y[:i], l=0)

    # Training cost is measured on the same subset that was used for the
    # fit; cross-validation cost is always measured on the full CV set.
    tc = regularized_cost(res.x, X[:i, :], y[:i], l=0)
    cv = regularized_cost(res.x, Xval, yval, l=0)

    training_cost.append(tc)
    cv_cost.append(cv)

# x axis: number of training examples used for the fit.
plt.plot(np.arange(1, m+1), training_cost, label='training cost')
plt.plot(np.arange(1, m+1), cv_cost, label='cv cost')
plt.legend(loc=1)
plt.show()

拟合不太好，欠拟合

多项式拟合

注意：pandas 的 `as_matrix()` 在新版本里已经去掉了，用 `.values` 替代；

def prepare_poly_data(*args, power):
    """Turn raw 1-D feature arrays into polynomial design matrices.

    Each input goes through three steps: polynomial expansion up to
    ``power``, per-column normalization, and insertion of a leading column
    of ones for the intercept term.

    Args:
        *args: any number of feature arrays (e.g. X, Xval, Xtest).
        power: highest polynomial degree to generate (keyword-only).

    Returns:
        A list of 2-D ndarrays, in the same order as ``args``.
    """
    prepared = []

    for x in args:
        # NOTE: every array is normalized with its own column statistics,
        # because normalize_feature is applied to each input separately.
        expanded = poly_features(x, power=power)
        scaled = normalize_feature(expanded).values

        intercept = np.ones(scaled.shape[0])
        prepared.append(np.insert(scaled, 0, intercept, axis=1))

    return prepared

def poly_features(x, power, as_ndarray=False):
    """Expand ``x`` into its powers 1 through ``power``.

    Args:
        x: array of feature values.
        power: highest exponent to generate.
        as_ndarray: when true, return a plain ndarray instead of a DataFrame.

    Returns:
        A DataFrame with columns ``f1`` ... ``f<power>`` where column ``fk``
        holds ``x ** k``, or the underlying ndarray if ``as_ndarray`` is set.
    """
    columns = {}
    for degree in range(1, power + 1):
        columns['f{}'.format(degree)] = np.power(x, degree)

    frame = pd.DataFrame(columns)
    if as_ndarray:
        return frame.values
    return frame

# Reload the raw 1-D arrays: X, Xval and Xtest were overwritten earlier with
# intercept-augmented matrices.
X, y, Xval, yval, Xtest, ytest = load_data()
# Preview the polynomial expansion of the training feature up to power 3.
poly_features(X, power=3)

扩展多项式特征

准备多项式回归数据

扩展特征到 8阶,或者你需要的阶数
使用 归一化 来处理 $x^n$（高次幂特征的数值范围差异很大）
don't forget intercept term

def normalize_feature(df):
    """Standardize every column of ``df`` to zero mean and unit deviation.

    Each column has its own mean subtracted and is divided by its own
    standard deviation as computed by pandas' ``std()``.

    Args:
        df: DataFrame of numeric feature columns.

    Returns:
        A DataFrame of the same shape holding the standardized values.
    """
    def _standardize(column):
        return (column - column.mean()) / column.std()

    return df.apply(_standardize)

# Build the degree-8 polynomial design matrices (expanded, normalized, with
# an intercept column) for the training, validation and test features.
X_poly, Xval_poly, Xtest_poly= prepare_poly_data(X, Xval, Xtest, power=8)
# Peek at the first three rows of the training matrix.
X_poly[:3, :]

画出学习曲线

首先，我们没有使用正则化，所以 $\lambda=0$。`plt.legend(loc=1)` 中的 `loc` 用来设置图例（training cost / cv cost）的位置。

def plot_learning_curve(X, y, Xval, yval, l=0):
    """Plot training and cross-validation cost against training-set size.

    For every size ``i`` from 1 to the number of rows of ``X``, a model is
    fitted on the first ``i`` training examples and both costs are recorded.
    The curves are drawn on the current matplotlib figure; ``plt.show()`` is
    left to the caller.

    Args:
        X: training design matrix.
        y: training targets.
        Xval: cross-validation design matrix.
        yval: cross-validation targets.
        l: regularization strength used when fitting.
    """
    n_samples = X.shape[0]
    train_errors, val_errors = [], []

    for size in range(1, n_samples + 1):
        X_subset, y_subset = X[:size, :], y[:size]

        # Regularization influences the fit only ...
        fitted_theta = linear_regression_np(X_subset, y_subset, l=l).x

        # ... while the reported costs are the plain, unregularized ones.
        train_errors.append(cost(fitted_theta, X_subset, y_subset))
        val_errors.append(cost(fitted_theta, Xval, yval))

    subset_sizes = np.arange(1, n_samples + 1)
    plt.plot(subset_sizes, train_errors, label='training cost')
    plt.plot(subset_sizes, val_errors, label='cv cost')
    plt.legend(loc=1)

# Learning curve for the degree-8 polynomial model with no regularization.
plot_learning_curve(X_poly, y, Xval_poly, yval, l=0)
plt.show()

训练的代价太低，不真实，这是过拟合了，即泛化的能力很差

try $\lambda=1$

# Same learning curve, now fitted with regularization strength l=1.
plot_learning_curve(X_poly, y, Xval_poly, yval, l=1)
plt.show()

训练代价增加了些，不再是0了。也就是说我们减轻过拟合

try $\lambda=100$

# Same learning curve, now fitted with a very large regularization
# strength, l=100.
plot_learning_curve(X_poly, y, Xval_poly, yval, l=100)
plt.show()

过度正则化，欠拟合

找到最佳的 $\lambda$

# Regularization strengths to compare.
l_candidate = [0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10]
training_cost, cv_cost = [], []

for l in l_candidate:
    # Fit on the full polynomial training set with this lambda.
    res = linear_regression_np(X_poly, y, l)
    
    # Costs are reported without the regularization penalty.
    tc = cost(res.x, X_poly, y)
    cv = cost(res.x, Xval_poly, yval)
    
    training_cost.append(tc)
    cv_cost.append(cv)

# Plot both costs as a function of lambda.
plt.plot(l_candidate, training_cost, label='training')
plt.plot(l_candidate, cv_cost, label='cross validation')
plt.legend(loc=2)

plt.xlabel('lambda')

plt.ylabel('cost')
plt.show()

# best cv I got from all those candidates
# (the candidate whose cross-validation cost is the smallest)
l_candidate[np.argmin(cv_cost)]
1

# Refit on the training set for every candidate lambda and report the
# unregularized cost on the held-out test set.
for l in l_candidate:
    theta = linear_regression_np(X_poly, y, l).x
    print('test cost(l={}) = {}'.format(l, cost(theta, Xtest_poly, ytest)))

test cost(l=0) = 10.804375286491785
test cost(l=0.001) = 10.911365745177878
test cost(l=0.003) = 11.265060784108712
test cost(l=0.01) = 10.879143763702967
test cost(l=0.03) = 10.022378551698187
test cost(l=0.1) = 8.631776100446476
test cost(l=0.3) = 7.3365081011786275
test cost(l=1) = 7.466282452677015
test cost(l=3) = 11.643940740451052
test cost(l=10) = 27.715080273166386

调参后，从测试代价来看 $\lambda = 0.3$ 是最优选择，这个时候测试代价最小（约 7.34）；而按交叉验证代价选出的是 $\lambda = 1$，其测试代价（约 7.47）与之非常接近

欢迎关注公众号:sumsmile /专注图像处理的移动开发老兵