从头用 numpy 来写一个识别 MNIST 手写数据的神经网络

·  阅读 938

下载数据集

# Download the MNIST dataset (70,000 28x28 grayscale digit images) from OpenML.
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784')

# X: flattened pixel data (70000, 784); y: digit labels (returned as strings).
X, y = mnist["data"],mnist["target"]

# Record the sklearn version this notebook was run with.
import sklearn
sklearn.__version__ #'0.23.1'

# First, normalize pixel values from [0, 255] to [0, 1].
X = X/255

import numpy as np
# Labels come back from fetch_openml as strings; convert to floats.
y = y.astype(np.float32)
# Binarize the labels: 1 for the digit "0", 0 for everything else.
# This turns the task into a binary "is it a zero?" classifier.
y_new = np.zeros(y.shape,dtype=np.float32)
y_new[np.where(y==0.0)[0]] = 1
y = y_new

# m is the training-set size, set to 60000
m = 60000
# m_test is the number of remaining samples, used for testing
m_test = X.shape[0] - m

# Transpose so that each COLUMN is one sample: X_* is (784, n), y_* is (1, n).
X_train, X_test = X[:m].T,X[m:].T
y_train, y_test = y[:m].reshape(1,m), y[m:].reshape(1,m_test)

X_train.shape

y_test.shape

对数据集进行重新排序

# Shuffle the training set with a fixed seed for reproducibility.
np.random.seed(138)
shuffle_index = np.random.permutation(m)
X_train, y_train = X_train[:,shuffle_index],y_train[:,shuffle_index]

# X_train.shape

# NOTE: "%matplotlib inline" is a Jupyter notebook magic, not plain Python;
# it only works inside a notebook cell.
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

# Display one training sample as a 28x28 image to sanity-check the data.
i = 3
plt.imshow(X_train[:,i].reshape(28,28),cmap=matplotlib.cm.binary)
plt.axis("off")
plt.show()

前向传播

$\hat{y} = \sigma(w^Tx + b)$

定义 sigmoid 函数

$\sigma(z) = \frac{1}{1 + e^{-z}}$
def sigmoid(z):
    """Element-wise logistic sigmoid: 1 / (1 + exp(-z)).

    Accepts a scalar or a numpy array; returns values in (0, 1).
    (Original body had lost its indentation — restored here.)
    """
    s = 1 / (1 + np.exp(-z))
    return s

损失函数

$L(y,\hat{y}) = -y\log(\hat{y}) - (1-y)\log(1-\hat{y})$

$L(y,\hat{y}) = - \frac{1}{m} \sum_{i=1}^m \left( y^{(i)}\log(\hat{y}^{(i)}) + (1-y^{(i)})\log(1-\hat{y}^{(i)})\right)$
def compute_loss(Y, Y_hat):
    """Mean binary cross-entropy loss.

    Y, Y_hat: arrays of shape (1, m) — true labels in {0, 1} and predicted
    probabilities. Returns the scalar average loss over the m samples.

    Predictions are clipped away from exactly 0/1 so np.log never produces
    -inf/NaN (the original could return NaN for saturated predictions).
    (Original body had lost its indentation — restored here.)
    """
    m = Y.shape[1]
    Y_hat = np.clip(Y_hat, 1e-12, 1 - 1e-12)
    L = -(1./m) * (np.sum(np.multiply(np.log(Y_hat), Y))
                   + np.sum(np.multiply(np.log(1 - Y_hat), (1 - Y))))
    return L

反向传播

$z = w^Tx + b\\ \hat{y} = \sigma(z)\\ L(y,\hat{y}) = -y\log(\hat{y}) - (1-y)\log(1-\hat{y})$

$\frac{\partial L}{\partial w_j} = \frac{\partial L}{\partial \hat{y}}\frac{\partial \hat{y}}{\partial z}\frac{\partial z}{\partial w_j}$

$\frac{\partial L}{\partial \hat{y}} = \frac{\partial }{\partial \hat{y}}\left(-y\log(\hat{y}) - (1-y)\log(1-\hat{y})\right)\\ = -y \frac{\partial }{\partial \hat{y}} log(\hat{y}) - (1-y) \frac{\partial }{\partial \hat{y}}\ \log(1-\hat{y})\\ = \frac{-y}{\hat{y}} + \frac{1-y}{1-\hat{y}}\\ =\frac{\hat{y} - y}{\hat{y}(1 - \hat{y})}$
$\frac{\partial}{\partial z} \sigma(z) = \frac{\partial}{\partial z} \left( \frac{1}{1 + e^{-z}} \right)\\ = - \frac{1}{(1 + e^{-z})^2} \frac{\partial}{\partial z} (1 + e^{-z})\\ = \frac{e^{-z}}{(1 + e^{-z})^2}\\ = \frac{1}{(1 + e^{-z})}\frac{e^{-z}}{(1 + e^{-z})}\\ = \sigma(z)\frac{e^{-z}}{(1 + e^{-z})}\\ = \sigma(z) \left( 1 - \frac{1}{1 + e^{-z}} \right)\\ = \sigma(z)(1 - \sigma(z))\\ = \hat{y}(1 - \hat{y})$
$\frac{\partial}{\partial w_j}(w^Tx + b) = \frac{\partial}{\partial w_j}(w_0x_0 + \cdots + w_nx_n + b) = x_j$

$\frac{\partial L}{\partial w_j} = x_j(\hat{y} - y),\qquad \frac{\partial L}{\partial b} = (\hat{y} - y)$
$\frac{\partial L}{\partial b} = \frac{1}{m} \sum_{i=1}^m (\hat{y}^{(i)} - y^{(i)})$

# Hyperparameters for logistic-regression (no hidden layer) training.
learning_rate = 1
# Mini-batch size
batch_size = 256
epochs = 2000

n_feature = X_train.shape[0]
# Initialize parameters: small random weights, zero bias.
W = np.random.randn(n_feature,1)*0.01
b = np.zeros((1,1))

# (Original loop body had lost its indentation — restored here.)
from tqdm import trange
for i in (t:=trange(epochs)):
    # Sample a random mini-batch (with replacement).
    samp = np.random.randint(0,X_train.shape[1],size=(batch_size))
    X_samp = X_train[:,samp]
    Y_samp = y_train[:,samp].astype(np.float32)

    # Forward pass: W (784,1), W.T (1,784), X_samp (784,bs) -> Z (1,bs).
    Z = np.matmul(W.T,X_samp) + b
    A = sigmoid(Z)

    # Mean binary cross-entropy for this batch.
    cost = compute_loss(Y_samp,A)

    # Gradients (for sigmoid + cross-entropy, dL/dZ simplifies to A - Y).
    dW = (1/batch_size) * np.matmul(X_samp,(A-Y_samp).T)
    # keepdims=True keeps db as (1,1); without it np.sum collapses to (1,).
    db = (1/batch_size) * np.sum(A-Y_samp,axis=1,keepdims=True)

    # Gradient-descent update.
    W = W - learning_rate * dW
    b = b - learning_rate * db
    if i % 100 == 0:
        t.set_description(f"iter {i}, cost: {cost}")
print(f"final cost:{cost}")

      0%|                                                                                         | 0/2000 [00:00<?, ?it/s]

(1, 256) (1, 256)
final cost:0.7304988760416786

# Quick demo: np.sum with axis=1, keepdims=True returns shape (1,1)
# instead of (1,) — the reason keepdims is used for the bias gradient.
# BUG FIX: the original named these `a` and `b`, which clobbered the
# trained bias `b` when the file is run top-to-bottom as a script;
# fresh throwaway names avoid that.
u = np.random.randn(1,10)
v = np.random.randn(1,10)
print(u - v)
np.sum(u - v, axis=1, keepdims=True)

[[-1.36047055 -2.50987615  0.78183155 -1.36933903  0.6333624   1.41838218
4.41512606  0.12380963 -0.96661973 -0.67886679]]

array([[0.48733957]])

from sklearn.metrics import classification_report, confusion_matrix

# Forward pass over the full test set with the trained parameters.
Z = np.matmul(W.T, X_test) + b
A = sigmoid(Z)

# Threshold probabilities at 0.5 and compare with the binary labels.
predictions = (A>.5)[0,:]
labels = (y_test == 1)[0,:]

print(confusion_matrix(predictions,labels))

[[8958   26]
[  62  954]]

print(classification_report(predictions,labels))

                  precision    recall  f1-score   support

False       0.99      1.00      1.00      8984
True       0.97      0.94      0.96      1016

accuracy                           0.99     10000
macro avg       0.98      0.97      0.98     10000
weighted avg       0.99      0.99      0.99     10000

# Inspect the first test sample (a 784-vector of pixel intensities).
X_test[:,0]
# print(y_test[:,0])

# Count the negative-class labels in the test set.
print(y_test[y_test == 0].shape)
# Z = np.matmul(W.T, X_test) + b
# A = sigmoid(Z)

# Unrelated demo of matmul shapes: (2,3) @ (3,5) -> (2,5), random each run.
# BUG FIX: the original reused `a`/`b` here, which clobbered the trained
# bias `b` when the file runs top-to-bottom; use throwaway names instead.
m1 = np.random.randn(2,3)
m2 = np.random.randn(3,5)
np.matmul(m1,m2)


array([[-1.12541033,  0.67226206,  2.69369124,  0.31000053,  0.16015114],
[-0.52122664, -0.13428921,  2.76755925, -0.06326955, -1.24097515]])

添加隐藏层

$\frac{\partial L}{\partial w} = \frac{\partial L}{\partial z}\frac{\partial z}{\partial w}\\ \frac{\partial L}{\partial b} = \frac{\partial L}{\partial z}\frac{\partial z}{\partial b}\\$

$\frac{\partial L}{\partial z} = \frac{\partial L}{\partial a}\frac{\partial a}{\partial z}$
# Hyperparameters for the one-hidden-layer network.
learning_rate = 1
# Mini-batch size
batch_size = 1024
epochs = 2000

input_feature = X_train.shape[0]
hidden_feature = 64
# Initialize parameters: small random weights, zero biases.
W1 = np.random.randn(hidden_feature,input_feature)*0.01
b1 = np.zeros((hidden_feature,1))
W2 = np.random.randn(1,hidden_feature)*0.01
b2 = np.zeros((1,1))

# (Original loop body had lost its indentation — restored here.)
from tqdm import trange
for i in (t:=trange(epochs)):
    # Sample a random mini-batch (with replacement).
    samp = np.random.randint(0,X_train.shape[1],size=(batch_size))
    X_samp = X_train[:,samp]
    Y_samp = y_train[:,samp].astype(np.float32)

    # Forward pass.
    # W1 (64,784) @ X_samp (784,bs) + b1 (64,1) -> Z1 (64,bs)
    Z1 = np.matmul(W1,X_samp) + b1
    A1 = sigmoid(Z1)
    # W2 (1,64) @ A1 (64,bs) + b2 (1,1) -> Z2 (1,bs)
    Z2 = np.matmul(W2,A1) + b2
    A2 = sigmoid(Z2)
    cost = compute_loss(Y_samp,A2)

    # Backward pass.
    # Output layer: for sigmoid + cross-entropy, dL/dZ2 = A2 - Y.
    dZ2 = A2 - Y_samp
    dW2 = (1./batch_size) * np.matmul(dZ2,A1.T)
    db2 = (1./batch_size) * np.sum(dZ2,axis=1,keepdims=True)

    # Hidden layer: propagate the error back through W2, then through the
    # sigmoid derivative sigma'(Z1) = sigma(Z1) * (1 - sigma(Z1)).
    dA1 = np.matmul(W2.T, dZ2)
    dZ1 = dA1 * sigmoid(Z1) * (1 - sigmoid(Z1))
    dW1 = (1./batch_size) * np.matmul(dZ1,X_samp.T)
    db1 = (1./batch_size) * np.sum(dZ1,axis=1,keepdims=True)

    # Gradient-descent update.
    # BUG FIX: the original subtracted learning_rate * <parameter itself>
    # (e.g. `W2 = W2 - learning_rate * W2`) instead of the gradients, so
    # the network never learned — the cost stayed pinned at
    # log(2) ~ 0.6931 and the classifier predicted a single class.
    W2 = W2 - learning_rate * dW2
    b2 = b2 - learning_rate * db2
    W1 = W1 - learning_rate * dW1
    b1 = b1 - learning_rate * db1

    if i % 100 == 0:
        t.set_description(f"iter {i}, cost: {cost}")
print(f"final cost:{cost}")

    iter 1900, cost: 0.6931471805599453: 100%|████████████████████████████████████████| 2000/2000 [00:16<00:00, 120.24it/s]

final cost:0.6931471805599452

# Evaluate the two-layer network on the test set (same forward pass as training).
Z1 = np.matmul(W1,X_test) + b1
A1 = sigmoid(Z1)
Z2 = np.matmul(W2,A1) + b2
A2 = sigmoid(Z2)

# Threshold output probabilities at 0.5 and compare with the binary labels.
predictions = (A2 >.5)[0,:]
labels = (y_test == 1)[0,:]

print(confusion_matrix(predictions,labels))
print(classification_report(predictions,labels))

    [[9020  980]
[   0    0]]
precision    recall  f1-score   support

False       1.00      0.90      0.95     10000
True       0.00      0.00      0.00         0

accuracy                           0.90     10000
macro avg       0.50      0.45      0.47     10000
weighted avg       1.00      0.90      0.95     10000