The goal of hw2 is to build a simple neural-network library and then use it to implement an MLPResNet.
Directory layout:
.
├── apps
│   ├── mlp_resnet.py
│   └── __pycache__
│       └── mlp_resnet.cpython-38.pyc
├── data
│   ├── t10k-images-idx3-ubyte.gz
│   ├── t10k-labels-idx1-ubyte.gz
│   ├── train-images-idx3-ubyte.gz
│   └── train-labels-idx1-ubyte.gz
├── figures
│   ├── mlp_resnet.png
│   └── residualblock.png
├── hw2.ipynb
├── python
│   └── needle
│       ├── autograd.py
│       ├── data.py
│       ├── __init__.py
│       ├── init.py
│       ├── nn.py
│       ├── ops.py
│       ├── optim.py
│       └── __pycache__
│           ├── autograd.cpython-38.pyc
│           ├── data.cpython-38.pyc
│           ├── __init__.cpython-38.pyc
│           ├── init.cpython-38.pyc
│           ├── nn.cpython-38.pyc
│           ├── ops.cpython-38.pyc
│           └── optim.cpython-38.pyc
└── tests
    ├── __pycache__
    │   ├── test_data.cpython-38-pytest-6.1.1.pyc
    │   ├── test_data.cpython-38-pytest-7.1.2.pyc
    │   ├── test_nn_and_optim.cpython-38-pytest-6.1.1.pyc
    │   └── test_nn_and_optim.cpython-38-pytest-7.1.2.pyc
    ├── test_data.py
    └── test_nn_and_optim.py

9 directories, 29 files
Compared with hw1 (autograd.py and ops.py), the library gains init.py, data.py, nn.py, and optim.py, which correspond to the tasks to complete in this assignment.
Question 0
Copy the autograd.py and ops.py implementations from hw1 into hw2. Don't just overwrite the files wholesale, though: the hw2 versions contain new code that has to be kept.
Question 1: Initialization functions
Implement several different weight-initialization methods in init.py (note: init.py, not __init__.py):
- Xavier uniform
- Xavier normal
- Kaiming uniform
- Kaiming normal
Each function just evaluates the corresponding formula and draws values with the rand / randn helpers; the formulas are summarized in the sketch below.
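For reference, assuming the standard definitions (gain $=\sqrt{2}$ for ReLU in the Kaiming variants), the formulas implemented here are:

$$
\begin{aligned}
\text{Xavier uniform:}\quad & W \sim \mathcal{U}(-a, a), && a = \text{gain}\cdot\sqrt{\tfrac{6}{\text{fan\_in}+\text{fan\_out}}} \\
\text{Xavier normal:}\quad & W \sim \mathcal{N}(0, \sigma^2), && \sigma = \text{gain}\cdot\sqrt{\tfrac{2}{\text{fan\_in}+\text{fan\_out}}} \\
\text{Kaiming uniform:}\quad & W \sim \mathcal{U}(-b, b), && b = \sqrt{2}\cdot\sqrt{\tfrac{3}{\text{fan\_in}}} \\
\text{Kaiming normal:}\quad & W \sim \mathcal{N}(0, \sigma^2), && \sigma = \tfrac{\sqrt{2}}{\sqrt{\text{fan\_in}}}
\end{aligned}
$$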
Xavier uniform
def xavier_uniform(fan_in, fan_out, gain=1.0, **kwargs):
    a = gain * math.sqrt(6 / (fan_in + fan_out))
    return a * rand(fan_in, fan_out, low=-1, high=1)  # shape and value range
Xavier normal
def xavier_normal(fan_in, fan_out, gain=1.0, **kwargs):
    std = gain * math.sqrt(2 / (fan_in + fan_out))
    return randn(fan_in, fan_out, mean=0, std=std)
    # return randn(fan_in, fan_out) * std
    # either form works
Kaiming uniform
def kaiming_uniform(fan_in, fan_out, nonlinearity="relu", **kwargs):
    assert nonlinearity == "relu", "Only relu supported currently"
    gain = math.sqrt(2)
    bound = gain * math.sqrt(3 / fan_in)
    return rand(fan_in, fan_out, low=-1, high=1) * bound
Kaiming normal
def kaiming_normal(fan_in, fan_out, nonlinearity="relu", **kwargs):
    assert nonlinearity == "relu", "Only relu supported currently"
    gain = math.sqrt(2)
    std = gain / math.sqrt(fan_in)
    return randn(fan_in, fan_out) * std
Question 2: Common neural-network modules
- Linear
- ReLU
- Sequential
- LogSumExp
- SoftmaxLoss
- LayerNorm1d
- Flatten
- BatchNorm1d
- Dropout
- Residual
Linear
First, fill in __pow__ in autograd.py (it is needed for the **2 and **0.5 used by the norm layers below):
def __pow__(self, other):
    ### BEGIN YOUR SOLUTION
    if isinstance(other, Tensor):
        raise NotImplementedError()
    else:
        return needle.ops.PowerScalar(other)(self)
    ### END YOUR SOLUTION
# weight and bias can each be written in two ways, and the matrix multiply in three
class Linear(Module):
    def __init__(self, in_features, out_features, bias=True, device=None, dtype="float32"):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        ### BEGIN YOUR SOLUTION
        # self.weight = Parameter(init.kaiming_uniform(in_features, out_features, requires_grad=True))
        w = init.kaiming_uniform(in_features, out_features, requires_grad=True)
        self.weight = Parameter(w, device=device, dtype=dtype)
        self.use_bias = bias
        if self.use_bias:
            # self.bias = Parameter(init.kaiming_uniform(out_features, 1, requires_grad=True)).reshape((1, out_features))
            b = ops.reshape(init.kaiming_uniform(out_features, 1, requires_grad=True), (1, out_features))
            # reshape is also available as a Tensor method
            self.bias = Parameter(b, device=device, dtype=dtype)
        ### END YOUR SOLUTION

    def forward(self, X: Tensor) -> Tensor:
        ### BEGIN YOUR SOLUTION
        X_mul_weight = X @ self.weight
        # X_mul_weight = ops.matmul(X, self.weight)
        # X_mul_weight = X.matmul(self.weight)
        if self.use_bias:
            return X_mul_weight + self.bias.broadcast_to(X_mul_weight.shape)
        return X_mul_weight  # without bias, return the matmul result directly
        ### END YOUR SOLUTION
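A minimal sanity check of Linear (a sketch only; it assumes ./python is on sys.path so that needle imports, as the tests do):

import numpy as np
import needle as ndl
import needle.nn as nn

x = ndl.Tensor(np.random.randn(3, 5).astype("float32"))
layer = nn.Linear(5, 2)
out = layer(x)
print(out.shape)  # (3, 2): weight is (5, 2) and the (1, 2) bias broadcasts over the batch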
ReLU
class ReLU(Module):
    def forward(self, x: Tensor) -> Tensor:
        ### BEGIN YOUR SOLUTION
        return ops.relu(x)
        ### END YOUR SOLUTION
Sequential
class Sequential(Module):
    def __init__(self, *modules):
        super().__init__()
        self.modules = modules

    def forward(self, x: Tensor) -> Tensor:
        ### BEGIN YOUR SOLUTION
        for module in self.modules:
            x = module(x)
        return x
        ### END YOUR SOLUTION
LogSumExp
Implemented in ops.py.
Subtract the max, exponentiate and sum, take the log, then add the max back.
class LogSumExp(TensorOp):
    def __init__(self, axes: Optional[tuple] = None):
        self.axes = axes

    def compute(self, Z):
        ### BEGIN YOUR SOLUTION
        maxz = array_api.max(Z, axis=self.axes, keepdims=1)
        ret = array_api.log(array_api.exp(Z - maxz).sum(axis=self.axes, keepdims=1)) + maxz
        if self.axes:
            out_shape = [size for i, size in enumerate(Z.shape) if i not in self.axes]
        else:
            out_shape = ()
        ret.resize(tuple(out_shape))
        return ret
        ### END YOUR SOLUTION

    def gradient(self, out_grad, node):
        ### BEGIN YOUR SOLUTION
        Z = node.inputs[0]
        if self.axes:
            shape = [1] * len(Z.shape)
            s = set(self.axes)
            j = 0
            for i in range(len(shape)):
                if i not in s:
                    shape[i] = node.shape[j]
                    j += 1
            node_new = node.reshape(shape)
            grad_new = out_grad.reshape(shape)
        else:
            node_new = node
            grad_new = out_grad
        return grad_new * exp(Z - node_new)
        ### END YOUR SOLUTION


def logsumexp(a, axes=None):
    return LogSumExp(axes=axes)(a)
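A sketch of the math behind this (standard, not assignment-specific): with $m = \max_i z_i$,

$$
\operatorname{LSE}(z) = \log\sum_i e^{z_i} = \log\sum_i e^{z_i - m} + m,
\qquad
\frac{\partial\operatorname{LSE}(z)}{\partial z_j} = \frac{e^{z_j}}{\sum_i e^{z_i}} = e^{\,z_j - \operatorname{LSE}(z)},
$$

which is why gradient returns out_grad * exp(Z - node) once node (the forward output) has been reshaped so it broadcasts back against Z.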
SoftmaxLoss
# use logsumexp for the log-sum term
# use one_hot to pick out the logits corresponding to y
class SoftmaxLoss(Module):
    def forward(self, logits: Tensor, y: Tensor):
        ### BEGIN YOUR SOLUTION
        exp_sum = ops.logsumexp(logits, axes=(1,)).sum()
        z_y_sum = (logits * init.one_hot(logits.shape[1], y)).sum()
        return (exp_sum - z_y_sum) / logits.shape[0]
        ### END YOUR SOLUTION
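What this computes is the average softmax cross-entropy over the batch of $N$ samples:

$$
\ell(Z, y) = \frac{1}{N}\sum_{i=1}^{N}\Bigl(\operatorname{LSE}(z_i) - z_{i,\,y_i}\Bigr)
$$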
LayerNorm1d
class LayerNorm1d(Module):
    def __init__(self, dim, eps=1e-5, device=None, dtype="float32"):
        super().__init__()
        self.dim = dim
        self.eps = eps
        ### BEGIN YOUR SOLUTION
        self.weight = Parameter(init.ones(self.dim, requires_grad=True))
        self.bias = Parameter(init.zeros(self.dim, requires_grad=True))
        ### END YOUR SOLUTION

    def forward(self, x: Tensor) -> Tensor:
        ### BEGIN YOUR SOLUTION
        batch_size = x.shape[0]
        feature_size = x.shape[1]
        # NOTE: the reshape is needed because (4,) can broadcast to (2, 4) but not to (4, 2)
        mean = x.sum(axes=(1,)).reshape((batch_size, 1)) / feature_size
        x_minus_mean = x - mean.broadcast_to(x.shape)
        x_std = ((x_minus_mean ** 2).sum(axes=(1,)).reshape((batch_size, 1)) / feature_size + self.eps) ** 0.5
        normed = x_minus_mean / x_std.broadcast_to(x.shape)
        return self.weight.broadcast_to(x.shape) * normed + self.bias.broadcast_to(x.shape)
        ### END YOUR SOLUTION
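In formula form, per sample $x$ (one row of the input), with learnable weight $w$ and bias $b$:

$$
y = w \odot \frac{x - \mathbb{E}[x]}{\sqrt{\operatorname{Var}[x] + \epsilon}} + b
$$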
Flatten
class Flatten(Module):
    def forward(self, X):
        ### BEGIN YOUR SOLUTION
        return X.reshape((X.shape[0], -1))
        ### END YOUR SOLUTION
BatchNorm1d
class BatchNorm1d(Module):
    def __init__(self, dim, eps=1e-5, momentum=0.1, device=None, dtype="float32"):
        super().__init__()
        self.dim = dim
        self.eps = eps
        self.momentum = momentum
        ### BEGIN YOUR SOLUTION
        self.weight = Parameter(init.ones(self.dim, requires_grad=True))
        self.bias = Parameter(init.zeros(self.dim, requires_grad=True))
        self.running_mean = init.zeros(self.dim)
        self.running_var = init.ones(self.dim)
        ### END YOUR SOLUTION

    def forward(self, x: Tensor) -> Tensor:
        ### BEGIN YOUR SOLUTION
        batch_size = x.shape[0]
        mean = x.sum((0,)) / batch_size
        x_minus_mean = x - mean.broadcast_to(x.shape)
        var = (x_minus_mean ** 2).sum((0,)) / batch_size
        if self.training:
            self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * mean.data
            self.running_var = (1 - self.momentum) * self.running_var + self.momentum * var.data
            x_std = ((var + self.eps) ** 0.5).broadcast_to(x.shape)
            x_normed = x_minus_mean / x_std
            return x_normed * self.weight.broadcast_to(x.shape) + self.bias.broadcast_to(x.shape)
        else:
            x_normed = (x - self.running_mean) / (self.running_var + self.eps) ** 0.5
            return x_normed * self.weight.broadcast_to(x.shape) + self.bias.broadcast_to(x.shape)
        ### END YOUR SOLUTION
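The two branches written out: in training mode the batch statistics normalize the input and also update the running statistics; in eval mode the running statistics are used directly.

$$
\hat{\mu} \leftarrow (1-m)\,\hat{\mu} + m\,\mu_{\text{batch}},\qquad
\hat{\sigma}^2 \leftarrow (1-m)\,\hat{\sigma}^2 + m\,\sigma^2_{\text{batch}},\qquad
y_{\text{eval}} = w \odot \frac{x - \hat{\mu}}{\sqrt{\hat{\sigma}^2 + \epsilon}} + b
$$

where $m$ is the momentum (0.1 by default).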
Dropout
class Dropout(Module):
    def __init__(self, p=0.5):
        super().__init__()
        self.p = p

    def forward(self, x: Tensor) -> Tensor:
        ### BEGIN YOUR SOLUTION
        mask = init.randb(*x.shape, p=1 - self.p)
        if self.training:
            x_mask = x * mask
            return x_mask / (1 - self.p)
        else:
            return x
        ### END YOUR SOLUTION
Residual
class Residual(Module):
    def __init__(self, fn: Module):
        super().__init__()
        self.fn = fn

    def forward(self, x: Tensor) -> Tensor:
        ### BEGIN YOUR SOLUTION
        return x + self.fn(x)
        ### END YOUR SOLUTION
Question 3: Optimizers
Both optimizers go in optim.py.
SGD
The code looked correct at first but still failed the test.
The culprit was the bias in Linear (nn.py):
self.bias = Parameter(init.kaiming_uniform(out_features, 1, requires_grad=True).reshape((1, out_features)))
# with this version the SGD test passes
# self.bias = Parameter(init.kaiming_uniform(out_features, 1, requires_grad=True)).reshape((1, out_features))
# with this version the SGD test fails
Only the position of the reshape differs, but it matters: reshape is an op and returns a plain Tensor, so Parameter(...).reshape(...) leaves self.bias as a Tensor instead of a Parameter. Since model.parameters() only collects Parameter instances, the bias is never handed to the optimizer and never gets updated, which is why the second version fails.
def step(self):
    ### BEGIN YOUR SOLUTION
    for i, param in enumerate(self.params):
        if i not in self.u:
            self.u[i] = 0
        # if param.grad is None:
        #     continue
        grad_data = ndl.Tensor(param.grad.numpy(), dtype='float32').data + self.weight_decay * param.data
        self.u[i] = self.momentum * self.u[i] + (1 - self.momentum) * grad_data
        param.data = param.data - self.u[i] * self.lr
    ### END YOUR SOLUTION
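Written as update equations, with the weight decay folded into the gradient (note the $(1-\beta)$ factor on the gradient, which is this course's convention and differs from PyTorch's default SGD):

$$
g_t = \nabla_\theta f(\theta_{t-1}) + \lambda\,\theta_{t-1},\qquad
u_t = \beta\,u_{t-1} + (1-\beta)\,g_t,\qquad
\theta_t = \theta_{t-1} - \alpha\,u_t
$$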
Adam
def step(self):
    ### BEGIN YOUR SOLUTION
    self.t += 1
    for i, param in enumerate(self.params):
        if i not in self.m:
            self.m[i] = ndl.init.zeros(*param.shape)
            self.v[i] = ndl.init.zeros(*param.shape)
        if param.grad is None:
            continue
        # param.grad is populated on each parameter when loss.backward() runs
        grad_data = ndl.Tensor(param.grad.numpy(), dtype='float32').data + self.weight_decay * param.data
        self.m[i] = self.beta1 * self.m[i] + (1 - self.beta1) * grad_data
        self.v[i] = self.beta2 * self.v[i] + (1 - self.beta2) * (grad_data ** 2)
        u_hat = (self.m[i]) / (1 - self.beta1 ** self.t)
        v_hat = (self.v[i]) / (1 - self.beta2 ** self.t)
        param.data = param.data - self.lr * u_hat / (v_hat ** 0.5 + self.eps)
    ### END YOUR SOLUTION
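The corresponding Adam update with bias correction (here $g_t$ again includes the weight-decay term):

$$
m_t = \beta_1 m_{t-1} + (1-\beta_1)\,g_t,\qquad
v_t = \beta_2 v_{t-1} + (1-\beta_2)\,g_t^2,
$$
$$
\hat{m}_t = \frac{m_t}{1-\beta_1^{\,t}},\qquad
\hat{v}_t = \frac{v_t}{1-\beta_2^{\,t}},\qquad
\theta_t = \theta_{t-1} - \frac{\alpha\,\hat{m}_t}{\sqrt{\hat{v}_t} + \epsilon}
$$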
Question 4: Dataset and DataLoader
First, copy parse_mnist into data.py.
Transformations
RandomFlipHorizontal
Flips the image left-right (along the width axis) with probability p.
class RandomFlipHorizontal(Transform):
    def __init__(self, p=0.5):
        self.p = p

    def __call__(self, img):
        flip_img = np.random.rand() < self.p
        ### BEGIN YOUR SOLUTION
        if flip_img:
            # img = np.flip(img, axis=1)
            img = img[:, ::-1, :]
        return img
RandomCrop
class RandomCrop(Transform):
    def __init__(self, padding=3):
        self.padding = padding

    def __call__(self, img):
        # shift_x / shift_y come from the provided skeleton: random offsets in [-padding, padding]
        shift_x, shift_y = np.random.randint(low=-self.padding, high=self.padding + 1, size=(2,))
        result = np.zeros_like(img)
        H, W = img.shape[0], img.shape[1]
        if abs(shift_x) >= H or abs(shift_y) >= W:
            return result
        st_1, ed_1 = max(0, -shift_x), min(H - shift_x, H)  # keep the start >= 0 and the end <= H
        st_2, ed_2 = max(0, -shift_y), min(W - shift_y, W)  # keep the start >= 0 and the end <= W
        img_st_1, img_ed_1 = max(0, shift_x), min(H + shift_x, H)  # same constraints on the source slice
        img_st_2, img_ed_2 = max(0, shift_y), min(W + shift_y, W)
        result[st_1:ed_1, st_2:ed_2, :] = img[img_st_1:img_ed_1, img_st_2:img_ed_2, :]
        return result
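An equivalent way to think about the slicing above (a sketch only, not the submitted solution; random_crop_via_pad is a hypothetical helper): zero-pad the image by padding pixels on each side, then take an H×W window offset by the shifts.

import numpy as np

def random_crop_via_pad(img, shift_x, shift_y, padding=3):
    # hypothetical helper: pad with zeros, then slice out an H x W window
    H, W = img.shape[0], img.shape[1]
    padded = np.pad(img, ((padding, padding), (padding, padding), (0, 0)))
    return padded[padding + shift_x : padding + shift_x + H,
                  padding + shift_y : padding + shift_y + W, :]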
Dataset
MNISTDataset
class MNISTDataset(Dataset):
    def __init__(
        self,
        image_filename: str,
        label_filename: str,
        transforms: Optional[List] = None,
    ):
        ### BEGIN YOUR SOLUTION
        super().__init__(transforms)
        self.images, self.labels = parse_mnist(image_filename, label_filename)
        ### END YOUR SOLUTION

    def __getitem__(self, index) -> object:
        ### BEGIN YOUR SOLUTION
        X, Y = self.images[index], self.labels[index]
        if self.transforms:
            X_in = X.reshape((28, 28, -1))
            X_out = self.apply_transforms(X_in)
            X_ret = X_out.reshape(-1, 28 * 28)
            return X_ret, Y
        else:
            return X, Y
        ### END YOUR SOLUTION

    def __len__(self) -> int:
        ### BEGIN YOUR SOLUTION
        return self.labels.shape[0]
        ### END YOUR SOLUTION
DataLoader
class DataLoader:
    r"""
    Data loader. Combines a dataset and a sampler, and provides an iterable over
    the given dataset.
    Args:
        dataset (Dataset): dataset from which to load the data.
        batch_size (int, optional): how many samples per batch to load
            (default: ``1``).
        shuffle (bool, optional): set to ``True`` to have the data reshuffled
            at every epoch (default: ``False``).
    """
    dataset: Dataset
    batch_size: Optional[int]

    def __init__(
        self,
        dataset: Dataset,
        batch_size: Optional[int] = 1,
        shuffle: bool = False,
    ):
        self.dataset = dataset
        self.shuffle = shuffle
        self.batch_size = batch_size
        if not self.shuffle:
            self.ordering = np.array_split(np.arange(len(dataset)),
                                           range(batch_size, len(dataset), batch_size))
        # added: when shuffle=True, shuffle the indices before splitting into batches
        else:
            indices = np.arange(len(dataset))
            np.random.shuffle(indices)
            self.ordering = np.array_split(indices,
                                           range(batch_size, len(dataset), batch_size))

    def __iter__(self):
        ### BEGIN YOUR SOLUTION
        self.start = 0
        ### END YOUR SOLUTION
        return self

    def __next__(self):
        ### BEGIN YOUR SOLUTION
        if self.start == len(self.ordering):
            raise StopIteration
        a = self.start
        self.start += 1
        samples = [Tensor(x) for x in self.dataset[self.ordering[a]]]
        return tuple(samples)
        ### END YOUR SOLUTION
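A quick usage sketch (file paths assumed to match the data/ directory above):

import needle as ndl

dataset = ndl.data.MNISTDataset("data/train-images-idx3-ubyte.gz",
                                "data/train-labels-idx1-ubyte.gz")
loader = ndl.data.DataLoader(dataset, batch_size=100, shuffle=True)
for X, y in loader:
    # X and y are needle Tensors; X holds batch_size flattened 28*28 images
    print(X.shape, y.shape)
    break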
Question 5: MLPResNet
The last test does not pass for me, though the numbers are close; other people seem to be unable to pass it either.
ResidualBlock
def ResidualBlock(dim, hidden_dim, norm=nn.BatchNorm1d, drop_prob=0.1):
    ### BEGIN YOUR SOLUTION
    modules = nn.Sequential(
        nn.Linear(dim, hidden_dim),
        norm(hidden_dim),
        nn.ReLU(),
        nn.Dropout(drop_prob),
        nn.Linear(hidden_dim, dim),
        norm(dim)
    )
    return nn.Sequential(
        nn.Residual(modules),
        # NOTE ReLU after Residual
        nn.ReLU()
    )
    ### END YOUR SOLUTION
MLPResNet
def MLPResNet(dim, hidden_dim=100, num_blocks=3, num_classes=10, norm=nn.BatchNorm1d, drop_prob=0.1):
    ### BEGIN YOUR SOLUTION
    modules = [
        nn.Linear(dim, hidden_dim),
        nn.ReLU()
    ]
    for i in range(num_blocks):
        modules.append(ResidualBlock(hidden_dim, hidden_dim // 2, norm, drop_prob))
    # NOTE: the line below would reuse the same block object num_blocks times
    # modules += [ResidualBlock(hidden_dim, hidden_dim//2, norm, drop_prob)] * num_blocks
    modules.append(nn.Linear(hidden_dim, num_classes))
    return nn.Sequential(*modules)
    ### END YOUR SOLUTION
epoch
def epoch(dataloader, model, opt=None):
    np.random.seed(4)
    ### BEGIN YOUR SOLUTION
    loss_func = nn.SoftmaxLoss()
    correct, loss_sum, n_step, n_samples = 0., 0., 0, 0
    if opt:
        model.train()
    else:
        model.eval()
    for X, y in dataloader:
        if opt:
            opt.reset_grad()
        pred = model(X)
        loss = loss_func(pred, y)
        correct += (pred.numpy().argmax(axis=1) == y.numpy()).sum()
        if opt:
            loss.backward()
            opt.step()
        loss_sum += loss.numpy()
        n_step += 1
        n_samples += X.shape[0]
    # NOTE: (1 - mean of per-batch accuracy) is not accurate enough; average over all samples instead
    return (1 - correct / n_samples), loss_sum / n_step
    ### END YOUR SOLUTION
train_mnist
def train_mnist(batch_size=100, epochs=10, optimizer=ndl.optim.Adam,
                lr=0.001, weight_decay=0.001, hidden_dim=100, data_dir="data"):
    np.random.seed(4)
    ### BEGIN YOUR SOLUTION
    train_data = ndl.data.MNISTDataset(
        data_dir + '/train-images-idx3-ubyte.gz',
        data_dir + '/train-labels-idx1-ubyte.gz'
    )
    test_data = ndl.data.MNISTDataset(
        data_dir + '/t10k-images-idx3-ubyte.gz',
        data_dir + '/t10k-labels-idx1-ubyte.gz',
    )
    train_loader = ndl.data.DataLoader(train_data, batch_size)
    test_loader = ndl.data.DataLoader(test_data, batch_size)
    model = MLPResNet(28 * 28, hidden_dim)
    opt = optimizer(model.parameters(), lr=lr, weight_decay=weight_decay)
    for _ in range(epochs):
        train_acc, train_loss = epoch(train_loader, model, opt)
    test_acc, test_loss = epoch(test_loader, model)
    return (train_acc, train_loss, test_acc, test_loss)
    ### END YOUR SOLUTION
Summary
This assignment covers five parts: parameter initialization, common neural-network modules, optimizers, datasets and data loading, and finally using the first four to build an MLPResNet.
All the test commands, collected in one place:
python3 -m pytest -v -k "test_init"
python3 -m pytest -v -k "test_nn_linear"
python3 -m pytest -v -k "test_nn_relu"
python3 -m pytest -v -k "test_nn_sequential"
python3 -m pytest -v -k "test_op_logsumexp"
python3 -m pytest -v -k "test_nn_softmax_loss"
python3 -m pytest -v -k "test_nn_layernorm"
python3 -m pytest -v -k "test_nn_flatten"
python3 -m pytest -v -k "test_nn_batchnorm"
python3 -m pytest -v -k "test_nn_dropout"
python3 -m pytest -v -k "test_nn_residual"
python3 -m pytest -v -k "test_optim_sgd"
python3 -m pytest -v -k "test_optim_adam"
python3 -m pytest -v -k "flip_horizontal"
python3 -m pytest -v -k "random_crop"
python3 -m pytest -v -k "test_mnist_dataset"
python3 -m pytest -v -k "test_dataloader"
python3 -m pytest -v -k "test_mlp"