03 Backpropagation



Efficiently computing the gradients needed for gradient descent in a deep neural network

Notation

Suppose the output layer of the model is a vector $(a^l_1, a^l_2, \dots, a^l_n)$, where $l$ is the layer index and $a$ denotes the value after the activation function $\sigma$. The corresponding pre-activation values are $(z^l_1, z^l_2, \dots, z^l_n)$. The weights feeding into this layer are written $[w_{ij}]$, where $i$ indexes the $i$-th neuron of layer $l-1$ and $j$ indexes the $j$-th neuron of layer $l$.

Consider a single sample first; its label then has the same form as the output, $(y_1, \dots, y_n)$, and the loss function is written $L(y, w)$.
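To make the shapes concrete, here is a minimal sketch of one layer under this notation (the layer sizes are arbitrary examples, and the variable names are just for illustration; as a matrix, the weights are stored with one row per neuron of layer $l$, the same convention as the code further down):

import numpy as np

sigma = lambda z: 1.0 / (1.0 + np.exp(-z))   # activation function sigma

a_prev = np.random.randn(4, 1)   # a^{l-1}: outputs of layer l-1 (4 neurons, column vector)
w_l = np.random.randn(3, 4)      # weights into layer l: row j holds the weights of neuron j
z_l = w_l @ a_prev               # z^l: pre-activation values of layer l (3 neurons)
a_l = sigma(z_l)                 # a^l = sigma(z^l): post-activation outputs of layer l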

Introducing $\delta$

Try taking the gradient with respect to a single weight of layer $l$ directly; this gives

$\frac{\partial L}{\partial w^l_{ij}} = \sum_k \frac{\partial L}{\partial z^l_k}\,\frac{\partial z^l_k}{\partial w^l_{ij}}$. Notice that $w^l_{ij}$ only affects the $j$-th neuron of layer $l$, so almost every factor $\frac{\partial z^l_k}{\partial w^l_{ij}}$ in the sum is zero; the only surviving term is $\frac{\partial z^l_j}{\partial w^l_{ij}} = a^{l-1}_i$, which follows directly from $z^l_j = \sum_k w^l_{kj} a^{l-1}_k$. So the part worth focusing on is the first factor, which we name $\delta^l_j = \frac{\partial L}{\partial z^l_j}$.
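A quick numerical illustration of $\frac{\partial z^l_j}{\partial w^l_{ij}} = a^{l-1}_i$ (a minimal sketch; the sizes, the indices and the finite-difference step are arbitrary choices): perturbing a single weight $w_{ij}$ only moves $z^l_j$, and it moves by roughly $a^{l-1}_i$ per unit of perturbation.

import numpy as np

rng = np.random.default_rng(0)
a_prev = rng.standard_normal((4, 1))   # a^{l-1}
w = rng.standard_normal((3, 4))        # weights into layer l: row j = weights of neuron j
i, j, eps = 2, 1, 1e-6

z_before = w @ a_prev
w[j, i] += eps                          # perturb the single weight w_{ij}
z_after = w @ a_prev

# only component j changes, and (z_after - z_before) / eps is approximately a_prev[i]
print((z_after - z_before) / eps, a_prev[i])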

Computing all the $\delta$ by recursion

First, compute $\delta^l$ for the output layer.

For the $j$-th neuron of the output layer, $\frac{\partial L}{\partial z^l_j} = \frac{\partial L}{\partial a^l_j}\,\frac{\partial a^l_j}{\partial z^l_j} = \frac{\partial L}{\partial a^l_j}\,\sigma'(z^l_j)$.

For the whole output layer, $\delta^l = \frac{\partial L}{\partial z^l} = \frac{\partial L}{\partial a^l} \odot \sigma'(z^l)$, where $\odot$ denotes element-wise multiplication.
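As a concrete special case (a minimal sketch, assuming a sigmoid output layer and MSE loss $L = \frac{1}{2}\lVert a^l - y\rVert^2$, so $\frac{\partial L}{\partial a^l} = a^l - y$ and $\sigma'(z) = a(1-a)$; the helper name is just for illustration):

def output_layer_delta(a, y):
    # delta^l = (a^l - y) * sigma'(z^l) = (a - y) * a * (1 - a), element-wise
    return (a - y) * a * (1 - a)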

Next, express $\delta^{l-1}$ in terms of $\delta^l$. The idea is simple: to get the previous layer's $\delta$, we push the gradient one more layer back, and the chain rule turns this into a recursion.

$\frac{\partial L}{\partial z^{l-1}_j} = \sum_k \frac{\partial L}{\partial z^l_k}\,\frac{\partial z^l_k}{\partial z^{l-1}_j}$. Intuitively, $L$ is affected by every neuron of layer $l$, and the $j$-th neuron one layer earlier influences all of those neurons through the weights of layer $l$.

$\frac{\partial L}{\partial z^{l-1}_j} = \sum_k \frac{\partial L}{\partial z^l_k}\,\frac{\partial z^l_k}{\partial z^{l-1}_j} = \sum_k \delta^l_k\, w_{jk}\, \sigma'(z^{l-1}_j) = [w_{j1}, w_{j2}, \dots]\,\delta^l\, \sigma'(z^{l-1}_j)$

Stacking over all of $z^{l-1}$: $\frac{\partial L}{\partial z^{l-1}} = \begin{bmatrix} w_{11} & \dots & \dots \\ \dots & \dots & \dots \\ w_{j1} & \dots & \dots \\ \dots & \dots & \dots \end{bmatrix}\delta^l \odot \sigma'(z^{l-1}) = (w^l)^T\delta^l \odot \sigma'(z^{l-1})$
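In code, one step of this recursion could look like the following sketch (again assuming sigmoid activations, so $\sigma'(z^{l-1}) = a^{l-1}(1 - a^{l-1})$; the names `previous_delta`, `w_l`, `delta_l`, `a_prev` are just for illustration, and `w_l` is stored with one row per neuron of layer $l$):

def previous_delta(w_l, delta_l, a_prev):
    # delta^{l-1} = (w^l)^T delta^l, then element-wise * sigma'(z^{l-1}) = a^{l-1} * (1 - a^{l-1})
    return (w_l.T @ delta_l) * a_prev * (1 - a_prev)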

In this way, every layer's $\delta$ can be computed by propagating backwards; combined with the previous layer's outputs, this gives the gradient of every layer.

Going back to $\frac{\partial L}{\partial w^l_{ij}} = \frac{\partial L}{\partial z^l_j}\,\frac{\partial z^l_j}{\partial w^l_{ij}} = \delta^l_j\, a^{l-1}_i$,

so for the full weight matrix $[w_{ji}]$, $\frac{\partial L}{\partial w^l} = \delta^l (a^{l-1})^T$.
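The weight gradient is just an outer product (a minimal sketch; `delta_l` is an $(n_l, 1)$ column vector, `a_prev` an $(n_{l-1}, 1)$ column vector, and the helper name is just for illustration):

def weight_gradient(delta_l, a_prev):
    # dL/dw^l = delta^l (a^{l-1})^T, an outer product of shape (n_l, n_{l-1})
    return delta_l @ a_prev.T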

Code

import numpy as np


def gen_data_linear(n_entries=100):
    w = np.array([1, 2, 3, 4, 5, 6]).reshape(6, 1)
    b = 1
    X = (np.random.randn(w.shape[0] * n_entries) * 5).reshape(n_entries, w.shape[0])
    y = np.matmul(X, w) + b + np.random.randn(n_entries).reshape(n_entries, 1)
    return X, y

def gen_class_data(n_entries=100, n_classes=2):
    X, y = gen_data_linear(n_entries)
    y_logits = np.zeros(shape=(n_entries, n_classes))
    max_y, min_y = np.max(y), np.min(y)
    seg = (max_y - min_y) / n_classes
    for i in range(n_entries):
        c = max(0, min(int((y[i][0] - min_y) / seg), n_classes - 1))
        y_logits[i][c] = 1
    return X, y_logits

def gen_multidim_linear_data(n_entries):
    w1 = np.array([1, 2, 3, 4, 5, 6]).reshape(6, 1)
    w2 = np.array([1.3, 2.3, 4.2, 2, 3.3, 2.1]).reshape(6, 1)
    w3 = np.array([4.2, 1.9, 2.2, 2.3, 1.2, 0.8]).reshape(6, 1)
    X = (np.random.randn(w1.shape[0] * n_entries) * 5).reshape(n_entries, w1.shape[0])
    y1 = np.matmul(X, w1) + np.random.randn(n_entries).reshape(n_entries, 1)
    y2 = np.matmul(X, w2) + np.random.randn(n_entries).reshape(n_entries, 1)
    y3 = np.matmul(X, w3) + np.random.randn(n_entries).reshape(n_entries, 1)
    return X, np.concatenate((y1, y2, y3), axis=1)

def sigmoid(z):
    # sigma(z) = 1 / (1 + e^(-z)), applied element-wise; z is a column vector or scalar
    return 1. / (1. + np.exp(-z))


def softmax(z):
    # softmax(z) = e^{z_j} / sum_i e^{z_i}, z: [dim x 1]
    return (np.exp(z) / np.sum(np.exp(z))).reshape(z.shape[0], 1)

class Layer:
    def __init__(self, n_input, n_output, is_last_layer=False):
        self.n_input = n_input
        self.n_output = n_output
        self.weights = np.random.randn(n_input * n_output).reshape(n_output, n_input)
        self.activate = sigmoid
        self.z = None # pre-activation: z = W @ input
        self.a = None # after activate
        self.last_input = None
        self.gradient = None
        self.is_last_layer = is_last_layer
        self.delta = None

    def forward(self, input):
        self.last_input = input
        self.z = np.matmul(self.weights, input)
        self.a = self.activate(self.z)
        return self.a
    
    def backward(self, label=None, later_weight_delta=None):
        if self.is_last_layer:
            # output layer, assuming MSE loss and sigmoid activation:
            # delta^L = (a - y) * sigma'(z), with sigma'(z) = a * (1 - a)
            self.delta = (self.a - label) * self.a * (1 - self.a)
        else:
            # hidden layer: delta^l = (w^{l+1})^T delta^{l+1}, element-wise * sigma'(z^l)
            self.delta = later_weight_delta * self.a * (1 - self.a)
        # dL/dW^l = delta^l (a^{l-1})^T
        self.gradient = np.matmul(self.delta, self.last_input.T)
        # hand (W^l)^T delta^l back to the previous layer
        return np.matmul(self.weights.T, self.delta)

    def gradient_descent(self, learning_rate):
        self.weights -= learning_rate * self.gradient


class NN:
    def __init__(self, structure):
        self.n_input = structure[0]
        self.n_output = structure[-1]
        self.layers = []
        for i in range(1, len(structure) - 1):
            self.layers.append(Layer(structure[i - 1], structure[i], False))
        self.layers.append(Layer(structure[-2], structure[-1], True))
    
    def forward(self, input):
        tmp = input
        for layer in self.layers:
            tmp = layer.forward(tmp)
        return tmp
    
    def backward(self, label):
        later_weight_delta = self.layers[-1].backward(label=label)
        for i in range(len(self.layers) - 2, -1, -1):
            later_weight_delta = self.layers[i].backward(later_weight_delta=later_weight_delta)

    def gradient_descent(self, learning_rate):
        for layer in self.layers:
            layer.gradient_descent(learning_rate)
    
    def fit(self, data, labels, learning_rate, n_iterations):
        k = 0
        n_samples = data.shape[0]
        # plain stochastic gradient descent: one sample per iteration, cycling through the data
        for i in range(n_iterations):
            self.forward(data[k].reshape(self.n_input, 1))
            self.backward(labels[k].reshape(self.n_output, 1))
            self.gradient_descent(learning_rate)
            k = (k + 1) % n_samples

    def predict(self, X):
        n_samples = X.shape[0]
        prediction = np.zeros(shape=(n_samples, self.n_output))
        for i in range(n_samples):
            prediction[i] = self.forward(X[i].reshape(self.n_input, 1)).reshape(self.n_output)
        return prediction

X, y = gen_multidim_linear_data(10000)
# min-max normalize each target column into [0, 1] so it matches the sigmoid output range
y[:, 0] = (y[:, 0] - np.min(y[:, 0])) / (np.max(y[:, 0]) - np.min(y[:, 0]))
y[:, 1] = (y[:, 1] - np.min(y[:, 1])) / (np.max(y[:, 1]) - np.min(y[:, 1]))
y[:, 2] = (y[:, 2] - np.min(y[:, 2])) / (np.max(y[:, 2]) - np.min(y[:, 2]))
# print(X, y)

train_size = 8000
X_train = X[0: train_size]
y_train = y[0: train_size]
X_test = X[train_size:]
y_test = y[train_size:]

# print(X_train)
# print(y_train)

learning_rate = 0.01
n_epoch = 10
n_iterations = n_epoch * train_size

nn = NN([6, 7, 5, 3])
nn.fit(X_train, y_train, learning_rate, n_iterations)
y_predict = nn.predict(X_test)
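
The original script stops after predict; as a quick sanity check one could, for example, report the mean squared error on the held-out split:

test_mse = np.mean((y_predict - y_test) ** 2)
print(f"test MSE: {test_mse:.4f}")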