反向传播
高效求解深度神经网络中的梯度下降
notations
假设模型的输出层是一个向量 $a^L$,$L$ 表示层数,$a^l$ 表示第 $l$ 层经过激活函数后的输出,经过激活函数前的值设为 $z^l$(即 $a^l = \sigma(z^l)$),再往前的参数表示为 $w^l_{jk}$,连接第 $l-1$ 层的第 $k$ 个神经元与第 $l$ 层的第 $j$ 个神经元
先考虑单一数据,那么它的 label $y$ 和输出 $a^L$ 形式是一样的,loss function 表示为 $C = \frac{1}{2}\sum_j (y_j - a^L_j)^2$
$\delta$ 的引入
尝试直接对第 $l$ 层某一个参数 $w^l_{jk}$ 求梯度,可以得到
$\frac{\partial C}{\partial w^l_{jk}} = \sum_i \frac{\partial C}{\partial z^l_i}\,\frac{\partial z^l_i}{\partial w^l_{jk}}$,进一步发现 $w^l_{jk}$ 只会影响第 $l$ 层的第 $j$ 个神经元,所以求和里面最后一项几乎都是 0,只有 $\frac{\partial z^l_j}{\partial w^l_{jk}} = a^{l-1}_k$ 非零,这个很容易得到(因为 $z^l_j = \sum_k w^l_{jk}\,a^{l-1}_k$),所以重点关注前面的部分,即定义 $\delta^l_j \equiv \frac{\partial C}{\partial z^l_j}$
递推思路求出所有 $\delta^l$
首先求输出层的 $\delta^L$
对于输出层的第 $j$ 个神经元,可以求 $\delta^L_j = \frac{\partial C}{\partial a^L_j}\,\sigma'(z^L_j) = (a^L_j - y_j)\,\sigma'(z^L_j)$
对整个输出层,可以得到 $\delta^L = (a^L - y)\odot\sigma'(z^L)$,其中 $\odot$ 表示 element-wise 相乘
接下来尝试用 $\delta^{l+1}$ 表示 $\delta^l$,这个思路也很简单:为了产生前一层的 $\delta$,我们利用链式求导法则再往前隔一层求梯度,这样就能实现递推:$\delta^l_j = \sum_i \frac{\partial C}{\partial z^{l+1}_i}\,\frac{\partial z^{l+1}_i}{\partial z^l_j} = \sum_i w^{l+1}_{ij}\,\delta^{l+1}_i\,\sigma'(z^l_j)$
直观解释这个式子:$C$ 受第 $l+1$ 层所有神经元的影响,而再往前一层(第 $l$ 层)的第 $j$ 个神经元通过第 $l+1$ 层的参数 $w^{l+1}_{ij}$ 影响了第 $l+1$ 层的所有神经元
对所有的 $j$,写成矩阵形式为 $\delta^l = \left((w^{l+1})^T \delta^{l+1}\right)\odot\sigma'(z^l)$
这样可以通过反向传播计算出每一层的 $\delta^l$,再结合前一层的输出 $a^{l-1}$,就能求解每一层的梯度了
再回到 $\frac{\partial C}{\partial w^l_{jk}} = \delta^l_j\,a^{l-1}_k$
所以对整个第 $l$ 层,$\frac{\partial C}{\partial w^l} = \delta^l\,(a^{l-1})^T$
代码
from types import new_class
import numpy as np
def gen_data_linear(n_entries=100):
    """Generate noisy linear-regression data: y = X w + b + eps.

    Returns (X, y) with X of shape (n_entries, 6) and y of shape (n_entries, 1).
    """
    true_w = np.array([1, 2, 3, 4, 5, 6]).reshape(6, 1)
    true_b = 1
    n_features = true_w.shape[0]
    # Features are drawn first, then noise — the RNG call order is fixed
    # so seeded runs stay reproducible.
    X = (np.random.randn(n_features * n_entries) * 5).reshape(n_entries, n_features)
    noise = np.random.randn(n_entries).reshape(n_entries, 1)
    return X, np.matmul(X, true_w) + true_b + noise
def gen_class_data(n_entries=100, n_classes=2):
    """Turn the linear-regression targets into one-hot class labels.

    The scalar targets from gen_data_linear are bucketed into n_classes
    equal-width intervals between their min and max; each row of the
    returned label matrix is the one-hot encoding of its bucket.
    """
    X, y = gen_data_linear(n_entries)
    labels = np.zeros(shape=(n_entries, n_classes))
    hi, lo = np.max(y), np.min(y)
    width = (hi - lo) / n_classes
    for row in range(n_entries):
        bucket = int((y[row][0] - lo) / width)
        # Clamp so the maximum target falls into the last class.
        bucket = max(0, min(bucket, n_classes - 1))
        labels[row][bucket] = 1
    return X, labels
def gen_multidim_linear_data(n_entries):
    """Generate data for 3-output linear regression.

    Each target column k is X @ w_k plus unit-variance Gaussian noise.
    Returns (X, Y) with X of shape (n_entries, 6) and Y of shape (n_entries, 3).
    """
    weight_cols = [
        np.array([1, 2, 3, 4, 5, 6]).reshape(6, 1),
        np.array([1.3, 2.3, 4.2, 2, 3.3, 2.1]).reshape(6, 1),
        np.array([4.2, 1.9, 2.2, 2.3, 1.2, 0.8]).reshape(6, 1),
    ]
    n_features = weight_cols[0].shape[0]
    # RNG call order: the feature matrix first, then one noise column per
    # target, so seeded runs stay reproducible.
    X = (np.random.randn(n_features * n_entries) * 5).reshape(n_entries, n_features)
    targets = [
        np.matmul(X, w) + np.random.randn(n_entries).reshape(n_entries, 1)
        for w in weight_cols
    ]
    return X, np.concatenate(targets, axis=1)
def sigmoid(z):
    """Element-wise logistic sigmoid: sigma(z) = 1 / (1 + e^(-z)).

    Accepts a scalar or an ndarray of any shape and returns a result of
    the same shape via NumPy broadcasting.

    Fixes two defects of the previous body: ``z.reshape(n)`` discarded its
    result (reshape is not in-place, so it was a no-op), and ``z.shape[0]``
    crashed on scalar input even though the contract allows scalars.
    """
    return 1. / (1. + np.exp(-z))
def softmax(z):
    """Numerically stable softmax over a vector.

    softmax(z)_j = e^{z_j} / sum_i e^{z_i}; z has shape (dim, 1) or
    (dim,), and the result is always returned with shape (dim, 1).

    Subtracting max(z) before exponentiating leaves the result
    mathematically unchanged (the factor e^{-max} cancels) but prevents
    overflow/NaN for large inputs, which the previous body suffered from.
    """
    shifted = np.exp(z - np.max(z))
    return (shifted / np.sum(shifted)).reshape(z.shape[0], 1)
class Layer:
    """One fully-connected layer (no bias term) with sigmoid activation.

    Caches everything backward() needs from the most recent forward()
    call: the input, the pre-activation z = W @ input, and the
    activation a = sigmoid(z).
    """
    def __init__(self, n_input, n_output, is_last_layer=False):
        self.n_input = n_input
        self.n_output = n_output
        # Weight matrix W has shape (n_output, n_input).
        self.weights = np.random.randn(n_input * n_output).reshape(n_output, n_input)
        self.activate = sigmoid
        self.is_last_layer = is_last_layer
        # Per-sample state, filled in by forward()/backward().
        self.z = None           # pre-activation: W @ input
        self.a = None           # post-activation: sigmoid(z)
        self.last_input = None  # cached input (previous layer's activation)
        self.gradient = None    # dC/dW for the latest sample
        self.delta = None       # dC/dz for the latest sample
    def forward(self, input):
        """Run the layer on a column vector and cache the intermediates."""
        self.last_input = input
        self.z = np.matmul(self.weights, input)
        self.a = self.activate(self.z)
        return self.a
    def backward(self, label=None, later_weight_delta=None):
        """Compute delta and the weight gradient for the latest sample.

        Output layer (MSE loss): delta = (a - y) * sigma'(z).
        Hidden layer: delta = (W_{l+1}^T delta_{l+1}) * sigma'(z), where
        the caller passes W_{l+1}^T delta_{l+1} as later_weight_delta.
        Returns W^T delta for the previous layer's backward pass.
        """
        sigma_prime = self.a * (1 - self.a)  # sigmoid'(z), expressed via a
        if self.is_last_layer:
            self.delta = (self.a - label) * sigma_prime
        else:
            self.delta = later_weight_delta * sigma_prime
        self.gradient = np.matmul(self.delta, self.last_input.T)
        return np.matmul(self.weights.T, self.delta)
    def gradient_descent(self, learning_rate):
        """Take one SGD step using the cached gradient."""
        self.weights -= learning_rate * self.gradient
class NN:
    """A plain multi-layer perceptron trained with per-sample SGD.

    `structure` lists layer widths, e.g. [6, 7, 5, 3] builds a network
    with a 6-dim input, hidden layers of 7 and 5 units, and 3 outputs.
    """
    def __init__(self, structure):
        self.n_input = structure[0]
        self.n_output = structure[-1]
        n_layers = len(structure) - 1
        # The final layer is flagged so its backward() uses the MSE delta.
        self.layers = [
            Layer(structure[i], structure[i + 1], i == n_layers - 1)
            for i in range(n_layers)
        ]
    def forward(self, input):
        """Feed an (n_input, 1) column vector through every layer."""
        activation = input
        for layer in self.layers:
            activation = layer.forward(activation)
        return activation
    def backward(self, label):
        """Backpropagate deltas from the output layer to the first layer."""
        carried = self.layers[-1].backward(label=label)
        for layer in reversed(self.layers[:-1]):
            carried = layer.backward(later_weight_delta=carried)
    def gradient_descent(self, learning_rate):
        """Apply one SGD step to every layer's cached gradient."""
        for layer in self.layers:
            layer.gradient_descent(learning_rate)
    def fit(self, data, labels, learning_rate, n_iterations):
        """Cycle through the samples, one SGD update per iteration."""
        n_samples = data.shape[0]
        cursor = 0
        for _ in range(n_iterations):
            self.forward(data[cursor].reshape(self.n_input, 1))
            self.backward(labels[cursor].reshape(self.n_output, 1))
            self.gradient_descent(learning_rate)
            cursor = (cursor + 1) % n_samples
    def predict(self, X):
        """Forward every row of X; returns an (n_samples, n_output) array."""
        n_samples = X.shape[0]
        out = np.zeros(shape=(n_samples, self.n_output))
        for row in range(n_samples):
            out[row] = self.forward(X[row].reshape(self.n_input, 1)).reshape(self.n_output)
        return out
# --- Demo: fit the MLP on min-max-normalized multi-output linear data ---
X, y = gen_multidim_linear_data(10000)

# Min-max normalize every target column into [0, 1] so the sigmoid
# output layer can actually reach the target values.
for col in range(y.shape[1]):
    col_min, col_max = np.min(y[:, col]), np.max(y[:, col])
    y[:, col] = (y[:, col] - col_min) / (col_max - col_min)

train_size = 8000
X_train, y_train = X[:train_size], y[:train_size]
X_test, y_test = X[train_size:], y[train_size:]

learning_rate = 0.01
n_epoch = 10
n_iterations = n_epoch * train_size

nn = NN([6, 7, 5, 3])
nn.fit(X_train, y_train, learning_rate, n_iterations)
y_predict = nn.predict(X_test)