hw4的目标是利用前三个作业,实现卷积和卷积神经网络并在CIFAR-10上完成分类器,然后实现LSTM在Penn Treebank数据集上进行单词级预测语言建模。
.
├── apps
│ ├── models.py
│ └── simple_training.py
├── CMakeLists.txt
├── hw4.ipynb
├── Makefile
├── python
│ └── needle
│ ├── autograd.py
│ ├── backend_ndarray
│ │ ├── __init__.py
│ │ ├── ndarray_backend_numpy.py
│ │ └── ndarray.py
│ ├── backend_numpy.py
│ ├── backend_selection.py
│ ├── data.py
│ ├── __init__.py
│ ├── init.py
│ ├── nn.py
│ ├── ops.py
│ └── optim.py
├── ResNet9.png
├── src
│ ├── ndarray_backend_cpu.cc
│ └── ndarray_backend_cuda.cu
└── tests
├── __pycache__
│ ├── test_cifar_ptb_data.cpython-38-pytest-6.1.1.pyc
│ ├── test_cifar_ptb_data.cpython-38-pytest-7.1.2.pyc
│ ├── test_conv.cpython-38-pytest-6.1.1.pyc
│ ├── test_conv.cpython-38-pytest-7.1.2.pyc
│ ├── test_nd_backend.cpython-38-pytest-6.1.1.pyc
│ ├── test_nd_backend.cpython-38-pytest-7.1.2.pyc
│ ├── test_sequence_models.cpython-38-pytest-6.1.1.pyc
│ └── test_sequence_models.cpython-38-pytest-7.1.2.pyc
├── test_cifar_ptb_data.py
├── test_conv.py
├── test_nd_backend.py
└── test_sequence_models.py
7 directories, 32 files
Part 1: ND Backend
什么都不做 报错,无法导入库
hw3src文件夹复制 make 4/118 将之前hw2中ops.py实现的函数复制到hw4的ops.py中,并添加实现新功能Tanh、Stack、Split。48/118
hw1 autograd.py 实现函数赋值 52/118
hw3 ndarray.py实现的函数进行复制 70/118
ops 两处错误修正 76/118
sum、max、swapaxes 我们写的n维数组库没有这些函数 broadcast_to 需要compact
Part 2: CIFAR-10 dataset
实现CIFAR-10图像分类数据集。 首先下载CIFAR-10数据集到data目录下面。
class CIFAR10Dataset(Dataset):
    """CIFAR-10 image classification dataset.

    Loads the python-pickle batches from `base_folder` (data_batch_1..5 when
    train=True, test_batch otherwise), scales pixels to [0, 1], and serves
    images reshaped to (3, 32, 32).
    """

    def __init__(
        self,
        base_folder: str,
        train: bool,
        p: Optional[float] = 0.5,  # NOTE(review): `p` is never used — presumably a transform probability; confirm against caller
        transforms: Optional[List] = None
    ):
        super().__init__(transforms)
        images = []
        labels = []
        # Train and test batches share the same pickle layout:
        # b'data' -> (N, 3072) uint8, b'labels' -> list[int].
        if train:
            batch_files = ['data_batch_%d' % i for i in range(1, 6)]
        else:
            batch_files = ['test_batch']
        for name in batch_files:
            with open(os.path.join(base_folder, name), 'rb') as fo:
                batch = pickle.load(fo, encoding='bytes')  # renamed from `dict` (shadowed builtin)
            images.append(batch[b'data'].astype(np.float32))
            labels.append(batch[b'labels'])
        self.X = np.concatenate(images, axis=0) / 255.0  # normalize pixels to [0, 1]
        self.y = np.concatenate(labels, axis=0)

    def __getitem__(self, index) -> object:
        """Return the (image, label) pair at `index`; image has shape (3, 32, 32)."""
        X, y = self.X[index], self.y[index]
        if self.transforms:
            # Transforms operate on HWC layout.
            # NOTE(review): a plain reshape (not a transpose) does not convert
            # CHW pixel order to HWC — confirm this matches the test suite.
            X_in = X.reshape((-1, 32, 32, 3))
            X_out = self.apply_transforms(X_in)
            return X_out.reshape(-1, 3, 32, 32), y
        return np.squeeze(X.reshape((-1, 3, 32, 32))), y

    def __len__(self) -> int:
        """Number of examples in the dataset."""
        return self.y.shape[0]
十二个测试通过十个,还有两个test_train_cifar10失败,等ResNet实现了再回头测。
本地测试
python3 -m pytest -l -v -k "cifar10"
Part 3: Convolutional neural network
任务3实现卷积神经网络
ndarray.py
在ndarray.py实现flip和pad函数
def flip(self, axes):
    """
    Flip this ndarray along the specified axes (all axes when `axes` is None).

    Implemented as a zero-copy view: negate the stride of every flipped axis
    and move the offset to the last element along each of them, then compact
    so the returned array owns contiguous memory.
    """
    if axes is None:
        axes = range(len(self.strides))
    strides = list(self.strides)
    start = 0
    for ax in axes:
        start += (self.shape[ax] - 1) * self.strides[ax]
        strides[ax] = -self.strides[ax]
    flipped = NDArray.make(
        shape=self.shape,
        strides=tuple(strides),
        device=self.device,
        handle=self._handle,
        offset=start,
    )
    return flipped.compact()
def pad(self, axes):
    """
    Zero-pad this ndarray. `axes` lists a (left, right) padding amount for
    _every_ axis, e.g. ((0, 0), (1, 1), (0, 0)) pads the middle axis with a
    single zero on each side.
    """
    padded_shape = tuple(
        dim + left + right for dim, (left, right) in zip(self.shape, axes)
    )
    out = NDArray.make(padded_shape, device=self.device)
    out.fill(0)
    # Copy the original data into the interior region.
    region = tuple(
        slice(left, left + dim) for dim, (left, _) in zip(self.shape, axes)
    )
    out[region] = self
    return out
本地测试
python3 -m pytest -l -v -k "pad_forward"
ops.py
Flip
flip测试通不过,
cc和cuda文件有问题
部分位置的uint32_t需要换成int32_t
本地测试python3 -m pytest -l -v -k "flip"
Dilate UnDilate
class Dilate(TensorOp):
    """Insert `dilation` zeros between elements along each axis in `axes`."""

    def __init__(self, axes: tuple, dilation: int):
        self.axes = axes
        self.dilation = dilation

    def compute(self, a):
        # Any out-of-range axis: return the input unchanged.
        if any(ax >= len(a.shape) for ax in self.axes):
            return a
        out_shape = list(a.shape)
        for ax in self.axes:
            out_shape[ax] += self.dilation * out_shape[ax]
        out = init.zeros(*out_shape, device=a.device)
        step = self.dilation + 1
        # Scatter the input into every step-th position of the dilated axes.
        sel = tuple(
            slice(0, out_shape[ax], step if ax in self.axes else 1)
            for ax in range(len(a.shape))
        )
        out.cached_data[sel] = a
        return out.cached_data

    def gradient(self, out_grad, node):
        # Undilation drops exactly the zero positions inserted by compute.
        return undilate(out_grad, self.axes, self.dilation)
def dilate(a, axes, dilation):
    """Functional wrapper: dilate tensor `a` along `axes` by `dilation` zeros."""
    return Dilate(axes, dilation)(a)
class UnDilate(TensorOp):
    """Inverse of Dilate: keep every (dilation+1)-th element along `axes`."""

    def __init__(self, axes: tuple, dilation: int):
        self.axes = axes
        self.dilation = dilation

    def compute(self, a):
        step = self.dilation + 1
        sel = tuple(
            slice(0, a.shape[ax], step) if ax in self.axes else slice(0, a.shape[ax])
            for ax in range(len(a.shape))
        )
        return a[sel]

    def gradient(self, out_grad, node):
        # Re-inserting the zeros is exactly the adjoint of dropping them.
        return dilate(out_grad, self.axes, self.dilation)
本地测试 python3 -m pytest -l -v -k "dilate"
Conv
class Conv(TensorOp):
    """2D convolution of an NHWC input with a (K, K, C_in, C_out) kernel,
    implemented as im2col (stride-trick view) followed by a single matmul."""

    def __init__(self, stride: Optional[int] = 1, padding: Optional[int] = 0):
        self.stride = stride
        self.padding = padding

    def compute(self, A, B):
        # Zero-pad both sides of the height and width axes.
        A = A.pad(((0,0), (self.padding, self.padding), (self.padding, self.padding), (0, 0)))
        N, H, W, C_in = A.shape
        K, _, _, C_out = B.shape
        Ns, Hs, Ws, Cs = A.strides
        # im2col: expose each KxK receptive field as two extra axes by reusing
        # the H/W strides, compact, then flatten so conv becomes one matmul.
        A = A.as_strided((N, H-K+1, W-K+1, K, K, C_in), (Ns, Hs, Ws, Hs, Ws, Cs)).compact()
        A = A.reshape((N*(H-K+1)*(W-K+1), K*K*C_in))
        out = A@(B.reshape((K*K*C_in, C_out)))
        # Stride is applied by subsampling the full stride-1 result.
        return out.reshape((N, H-K+1, W-K+1, C_out))[:, ::self.stride, ::self.stride, :]

    def gradient(self, out_grad, node):
        X, weight = node.inputs[0], node.inputs[1]
        '''
        Shapes (p = self.padding, stride-1 geometry):
        out_grad: (N, H-K+1+2p, W-K+1+2p, C_out)
        X:        (N, H, W, C_in)
        weight:   (K, K, C_in, C_out)
        '''
        _, H, W, _ = X.shape
        K = weight.shape[0]
        # dX = "full" convolution of out_grad with the spatially flipped,
        # in/out-channel-swapped kernel.
        W_flip = flip(weight, (0,1)).transpose((2,3))
        if self.stride > 1:
            # Undo the striding: re-insert zeros so out_grad matches stride-1 geometry.
            out_grad = dilate(out_grad, (1,2), self.stride-1)
        dX = conv(out_grad, W_flip, padding=K - 1)
        # dX = Tensor(dX.cached_data[:, K-1:H+K-1, K-1:W+K-1, :])
        # NOTE: crop starting at self.padding (not K-1) to undo the forward
        # padding, and keep device/dtype when rewrapping the raw data.
        dX = Tensor(dX.cached_data[:, self.padding:H + self.padding, self.padding:W + self.padding, :],
                    device=dX.device, dtype=dX.dtype)
        '''
        dW is itself a convolution:
        X permuted to        (C_in, H, W, N)
        out_grad permuted to (H-K+1+2p, W-K+1+2p, N, C_out)
        result               (C_in, K, K, C_out) -> transposed back to (K, K, C_in, C_out)
        '''
        X = X.transpose((0, 3))
        out_grad = out_grad.transpose((0, 2)).transpose((0, 1))
        dW = conv(X, out_grad, padding=self.padding)
        dW = dW.transpose((0, 2)).transpose((0, 1))
        return dX, dW
ndarray.py中的reshape重写,conv反向传播测试才能通过
def reshape(self, new_shape):
    """
    Reshape to `new_shape`. The array must be compacted first: a strided
    (non-contiguous) view cannot generally be reinterpreted with compact
    strides — this is why conv's backward pass failed before the rewrite.
    """
    # return self.as_strided(new_shape, NDArray.compact_strides(new_shape))
    return self.compact().as_strided(new_shape, NDArray.compact_strides(new_shape))
本地测试
python3 -m pytest -l -v -k "op\_conv and forward"
python3 -m pytest -l -v -k "op\_conv and backward"
nn.py
先将init.py里的四个初始化重写,函数参数有变化,不能直接粘贴复制之前的。
def kaiming_uniform(fan_in, fan_out, shape=None, nonlinearity="relu", **kwargs):
    """Kaiming-uniform init: U(-bound, bound) with bound = gain * sqrt(3 / fan_in).

    `shape` overrides the default (fan_in, fan_out) output shape, e.g. a
    (K, K, C_in, C_out) conv kernel. Only the relu gain is supported.
    """
    assert nonlinearity == "relu", "Only relu supported currently"
    if shape is None:
        # Fixed: previously rand(*None) raised TypeError when shape was omitted.
        shape = (fan_in, fan_out)
    gain = math.sqrt(2)
    bound = gain * math.sqrt(3 / fan_in)
    return bound * (2 * rand(*shape, **kwargs) - 1)
本地测试
python3 -m pytest -l -v -k "kaiming_uniform"
再实现卷积网络(上面实现的是卷积算子)
class Conv(Module):
    """
    Multi-channel 2D convolutional layer.

    IMPORTANT: Accepts inputs in NCHW format, outputs also in NCHW format.
    Only supports padding=same and square kernels; no grouped convolution
    or dilation.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, bias=True, device=None, dtype="float32"):
        super().__init__()
        # Tuple arguments are accepted but only square kernels/strides are supported.
        if isinstance(kernel_size, tuple):
            kernel_size = kernel_size[0]
        if isinstance(stride, tuple):
            stride = stride[0]
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        # Weight is stored (K, K, C_in, C_out) to match ops.conv's kernel layout.
        shape = (kernel_size, kernel_size, in_channels, out_channels)
        self.weight = Parameter(init.kaiming_uniform(
            self.in_channels * kernel_size * kernel_size,
            self.out_channels * kernel_size * kernel_size,
            shape=shape,
            device=device,
            dtype=dtype,
            requires_grad=True))
        if bias:
            # PyTorch-style bias bound: 1 / sqrt(fan_in).
            bound = 1 / (in_channels * kernel_size ** 2) ** 0.5
            self.bias = Parameter(init.rand(
                int(self.out_channels),
                low=-bound,
                high=bound,
                device=device,
                dtype=dtype,
                requires_grad=True
            ))
        else:
            self.bias = None

    def forward(self, x: Tensor) -> Tensor:
        """Convolve NCHW input `x`; returns an NCHW output tensor."""
        N, C, H, W = x.shape
        # ops.conv expects NHWC.
        x = x.transpose((1, 2)).transpose((2, 3))
        out = ops.conv(x, self.weight, stride=self.stride, padding=self.kernel_size // 2)
        # Fixed: compare against None instead of relying on Tensor truthiness.
        if self.bias is not None:
            out += self.bias.reshape((1, 1, 1, self.out_channels)).broadcast_to(out.shape)
        return out.transpose((2, 3)).transpose((1, 2))
本地测试
python3 -m pytest -l -v -k "nn_conv_forward"
python3 -m pytest -l -v -k "nn_conv_backward"
models.py
将hw2中nn.py实现的基础网络结构复制到hw4的nn.py中,models.py实现ResNet9需要用到多种基础结构。 因为重写了init方法,所以一些网络中参数初始化部分需要重写。 Linear、BatchNorm、layerNorm、dropout
batchnorm等需要注意先reshape再broadcast
class Tanh(Module):
    """Elementwise hyperbolic tangent activation."""

    def forward(self, x: Tensor) -> Tensor:
        result = ops.tanh(x)
        return result
class Sigmoid(Module):
    """Elementwise logistic sigmoid: 1 / (1 + exp(-x))."""

    def __init__(self):
        super().__init__()

    def forward(self, x: Tensor) -> Tensor:
        denom = 1 + ops.exp(-x)
        return denom ** -1
本地测试
python3 -m pytest -l -v -k "resnet9"
simple_training.py
实现 epoch_general_cifar10、train_cifar10、evaluate_cifar10 三个函数
dataloader重写,加上device、dtype以及shuffle判断
不能正常运行,ops和data文件有问题 'needle.backend_ndarray.ndarray_backend_cpu.Array' object has no attribute 'array' 主要是dtype、device问题
copy sgd之前代码
成功运行
Part 4: Recurrent neural network
注意变量名必须和提示一致,因为测试时候会用到
class RNNCell(Module):
    def __init__(self, input_size, hidden_size, bias=True, nonlinearity='tanh', device=None, dtype="float32"):
        """
        Applies an RNN cell with tanh or ReLU nonlinearity.

        Parameters:
        input_size: The number of expected features in the input X
        hidden_size: The number of features in the hidden state h
        bias: If False, then the layer does not use bias weights
        nonlinearity: The non-linearity to use. Can be either 'tanh' or 'relu'.

        Variables:
        W_ih: The learnable input-hidden weights of shape (input_size, hidden_size).
        W_hh: The learnable hidden-hidden weights of shape (hidden_size, hidden_size).
        bias_ih: The learnable input-hidden bias of shape (hidden_size,).
        bias_hh: The learnable hidden-hidden bias of shape (hidden_size,).

        Weights and biases are initialized from U(-sqrt(k), sqrt(k)) where k = 1/hidden_size
        """
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias
        bound = 1 / hidden_size ** 0.5  # sqrt(k) with k = 1/hidden_size
        self.W_ih = Parameter(init.rand(
            input_size, hidden_size, low=-bound, high=bound,
            device=device, dtype=dtype, requires_grad=True
        ))
        self.W_hh = Parameter(init.rand(
            hidden_size, hidden_size, low=-bound, high=bound,
            device=device, dtype=dtype, requires_grad=True
        ))
        if bias:
            self.bias_ih = Parameter(init.rand(
                hidden_size, low=-bound, high=bound,
                device=device, dtype=dtype, requires_grad=True
            ))
            self.bias_hh = Parameter(init.rand(
                hidden_size, low=-bound, high=bound,
                device=device, dtype=dtype, requires_grad=True
            ))
        else:
            # Define the attributes even without bias so they always exist.
            self.bias_ih = None
            self.bias_hh = None
        self.nonlinearity = Tanh() if nonlinearity == 'tanh' else ReLU()

    def forward(self, X, h=None):
        """
        Inputs:
        X of shape (bs, input_size): Tensor containing input features
        h of shape (bs, hidden_size): Tensor containing the initial hidden state
        for each element in the batch. Defaults to zero if not provided.

        Outputs:
        h' of shape (bs, hidden_size): Tensor containing the next hidden state
        for each element in the batch.
        """
        bs = X.shape[0]
        out = X @ self.W_ih
        if h is not None:
            out += h @ self.W_hh
        # Fixed: bias_hh must be added even when h defaults to zeros —
        # h=None zeroes the h@W_hh term but not the hidden-hidden bias.
        if self.bias:
            out += self.bias_ih.reshape((1, self.hidden_size)).broadcast_to((bs, self.hidden_size))
            out += self.bias_hh.reshape((1, self.hidden_size)).broadcast_to((bs, self.hidden_size))
        return self.nonlinearity(out)
class RNN(Module):
    def __init__(self, input_size, hidden_size, num_layers=1, bias=True, nonlinearity='tanh', device=None, dtype="float32"):
        """
        Applies a multi-layer RNN with tanh or ReLU non-linearity to an input sequence.

        Parameters:
        input_size - The number of expected features in the input x
        hidden_size - The number of features in the hidden state h
        num_layers - Number of recurrent layers.
        nonlinearity - The non-linearity to use. Can be either 'tanh' or 'relu'.
        bias - If False, then the layer does not use bias weights.

        Variables:
        rnn_cells[k].W_ih / W_hh / bias_ih / bias_hh: per-layer parameters;
        layer 0 maps input_size -> hidden_size, deeper layers hidden -> hidden.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.device = device
        self.dtype = dtype
        self.num_layers = num_layers
        cells = [RNNCell(input_size, hidden_size, bias, nonlinearity, device, dtype)]
        cells += [
            RNNCell(hidden_size, hidden_size, bias, nonlinearity, device, dtype)
            for _ in range(num_layers - 1)
        ]
        self.rnn_cells = cells

    def forward(self, X, h0=None):
        """
        Inputs:
        X of shape (seq_len, bs, input_size) containing the input sequence.
        h0 of shape (num_layers, bs, hidden_size): initial hidden states,
        defaulting to zeros if not provided.

        Outputs:
        output of shape (seq_len, bs, hidden_size): last layer's h_t for each t.
        h_n of shape (num_layers, bs, hidden_size): final per-layer hidden states.
        """
        steps = ops.split(X, 0)
        states = ops.split(h0, 0) if h0 is not None else [None] * self.num_layers
        outputs = []
        for inp in steps:
            new_states = []
            # Each layer's output feeds the next layer at the same timestep.
            for layer, cell in enumerate(self.rnn_cells):
                inp = cell(inp, states[layer])
                new_states.append(inp)
            outputs.append(inp)
            states = new_states
        return ops.stack(outputs, 0), ops.stack(states, 0)
Part 5: Long short-term memory network
class LSTMCell(Module):
    def __init__(self, input_size, hidden_size, bias=True, device=None, dtype="float32"):
        """
        A long short-term memory (LSTM) cell.

        Parameters:
        input_size - The number of expected features in the input X
        hidden_size - The number of features in the hidden state h
        bias - If False, then the layer does not use bias weights

        Variables:
        W_ih - The learnable input-hidden weights, of shape (input_size, 4*hidden_size).
        W_hh - The learnable hidden-hidden weights, of shape (hidden_size, 4*hidden_size).
        bias_ih - The learnable input-hidden bias, of shape (4*hidden_size,).
        bias_hh - The learnable hidden-hidden bias, of shape (4*hidden_size,).

        Weights and biases are initialized from U(-sqrt(k), sqrt(k)) where k = 1/hidden_size
        """
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias
        il, hl = input_size, hidden_size
        # The four gates (i, f, g, o) are packed along the last axis (4*hidden_size).
        self.W_ih = Parameter(
            init.rand(il, 4*hl, low=-1/hl**0.5, high=1/hl**0.5, device=device, dtype=dtype, requires_grad=True)
        )
        self.W_hh = Parameter(
            init.rand(hl, 4*hl, low=-1/hl**0.5, high=1/hl**0.5, device=device, dtype=dtype, requires_grad=True)
        )
        # NOTE(review): when bias=False, bias_ih/bias_hh are never defined;
        # forward() guards on self.bias so they are never read in that case.
        if bias:
            self.bias_ih = Parameter(
                init.rand(4*hl, low=-1/hl**0.5, high=1/hl**0.5, device=device, dtype=dtype, requires_grad=True)
            )
            self.bias_hh = Parameter(
                init.rand(4*hl, low=-1/hl**0.5, high=1/hl**0.5, device=device, dtype=dtype, requires_grad=True)
            )
        self.tanh = Tanh()
        self.sigmoid = Sigmoid()

    def forward(self, X, h=None):
        """
        Inputs: X, h
        X of shape (batch, input_size): Tensor containing input features
        h, tuple of (h0, c0), with
        h0 of shape (bs, hidden_size): Tensor containing the initial hidden state
        for each element in the batch. Defaults to zero if not provided.
        c0 of shape (bs, hidden_size): Tensor containing the initial cell state
        for each element in the batch. Defaults to zero if not provided.

        Outputs: (h', c')
        h' of shape (bs, hidden_size): Tensor containing the next hidden state for each
        element in the batch.
        c' of shape (bs, hidden_size): Tensor containing the next cell state for each
        element in the batch.
        """
        bs = X.shape[0]
        h0, c0 = (None, None) if h is None else h
        hl = self.hidden_size
        # Pre-activations of all four gates at once: X@W_ih (+ h0@W_hh) (+ both biases).
        out = X @ self.W_ih
        if h0 is not None:
            out += h0 @ self.W_hh
        if self.bias:
            out += self.bias_ih.reshape((1, 4*hl)).broadcast_to((bs, 4*hl))
            out += self.bias_hh.reshape((1, 4*hl)).broadcast_to((bs, 4*hl))
        # Unpack the gate axis column-by-column and restack each hidden_size-wide
        # group — presumably because the tensor API lacks axis-1 range slicing;
        # TODO confirm. Gate order matches PyTorch: i, f, g, o.
        out_list = ops.split(out, 1)
        i = ops.stack(tuple([out_list[i] for i in range(0, hl)]), 1)
        f = ops.stack(tuple([out_list[i] for i in range(hl, 2*hl)]), 1)
        g = ops.stack(tuple([out_list[i] for i in range(2*hl, 3*hl)]), 1)
        o = ops.stack(tuple([out_list[i] for i in range(3*hl, 4*hl)]), 1)
        g = self.tanh(g)
        i, f, o = self.sigmoid(i), self.sigmoid(f), self.sigmoid(o)
        # c' = f*c0 + i*g (the f*c0 term vanishes when c0 defaults to zero);
        # h' = o * tanh(c').
        c1 = i*g if c0 is None else f*c0+i*g
        h1 = o*self.tanh(c1)
        return (h1, c1)
class LSTM(Module):
    def __init__(self, input_size, hidden_size, num_layers=1, bias=True, device=None, dtype="float32"):
        super().__init__()
        """
        Applies a multi-layer long short-term memory (LSTM) RNN to an input sequence.

        Parameters:
        input_size - The number of expected features in the input x
        hidden_size - The number of features in the hidden state h
        num_layers - Number of recurrent layers.
        bias - If False, then the layer does not use bias weights.

        Variables:
        lstm_cells[k].W_ih: The learnable input-hidden weights of the k-th layer,
        of shape (input_size, 4*hidden_size) for k=0. Otherwise the shape is
        (hidden_size, 4*hidden_size).
        lstm_cells[k].W_hh: The learnable hidden-hidden weights of the k-th layer,
        of shape (hidden_size, 4*hidden_size).
        lstm_cells[k].bias_ih: The learnable input-hidden bias of the k-th layer,
        of shape (4*hidden_size,).
        lstm_cells[k].bias_hh: The learnable hidden-hidden bias of the k-th layer,
        of shape (4*hidden_size,).
        """
        self.num_layers = num_layers
        # Layer 0 maps input_size -> hidden_size; deeper layers are hidden -> hidden.
        lstm_cells = [LSTMCell(input_size, hidden_size, bias, device, dtype)]
        for i in range(num_layers-1):
            lstm_cells.append(LSTMCell(hidden_size, hidden_size, bias, device, dtype))
        # NOTE(review): stored as a plain list — assumes the Module base class
        # discovers parameters inside lists; confirm against its implementation.
        self.lstm_cells = lstm_cells

    def forward(self, X, h=None):
        """
        Inputs: X, h
        X of shape (seq_len, bs, input_size) containing the features of the input sequence.
        h, tuple of (h0, c0) with
        h0 of shape (num_layers, bs, hidden_size) containing the initial
        hidden state for each element in the batch. Defaults to zeros if not provided.
        c0 of shape (num_layers, bs, hidden_size) containing the initial
        cell state for each element in the batch. Defaults to zeros if not provided.

        Outputs: (output, (h_n, c_n))
        output of shape (seq_len, bs, hidden_size) containing the output features
        (h_t) from the last layer of the LSTM, for each t.
        h_n of shape (num_layers, bs, hidden_size): final per-layer hidden states.
        c_n of shape (num_layers, bs, hidden_size): final per-layer cell states.
        """
        # Unbind the sequence axis; per-layer states start as None (treated as
        # zeros by LSTMCell) unless h supplies them.
        Xs = ops.split(X, 0)
        h0, c0 = (None, None) if h is None else h
        hs = [None] * self.num_layers if h0 is None else ops.split(h0, 0)
        cs = [None] * self.num_layers if c0 is None else ops.split(c0, 0)
        out = []
        for t, x in enumerate(Xs):
            hiddens = []
            cells = []
            # Layer l's hidden output is layer l+1's input at the same timestep;
            # the last layer's output is recorded as the sequence output.
            for l, model in enumerate(self.lstm_cells):
                x, c_out = model(x, (hs[l], cs[l]))
                hiddens.append(x)
                cells.append(c_out)
            out.append(x)
            hs = hiddens
            cs = cells
        out = ops.stack(out, 0)
        hs = ops.stack(hs, 0)
        cs = ops.stack(cs, 0)
        return out, (hs, cs)
Part 6: Penn Treebank dataset
由于dataloader等代码有改动,python3 -m pytest -l -v -k "cifar10"之前能通过的测试现在也无法通过了,需要加上device和dtype,写清参数。
test_cifar_ptb_data.py
BATCH_SIZES = [1, 15]

@pytest.mark.parametrize("batch_size", BATCH_SIZES)
@pytest.mark.parametrize("train", TRAIN)
@pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"])
def test_cifar10_loader(batch_size, train, device, dtype="float32"):
    # Checks that the rewritten DataLoader (with device/dtype params) yields
    # needle Tensors backed by the custom NDArray.
    # NOTE(review): the `train` parameter is ignored — the dataset is always
    # built with train=True; confirm whether that is intentional.
    cifar10_train_dataset = ndl.data.CIFAR10Dataset("data/cifar-10-batches-py", train=True)
    train_loader = ndl.data.DataLoader(cifar10_train_dataset, batch_size=batch_size, device = device, dtype=dtype)
    for (X, y) in train_loader:
        break
    assert isinstance(X.cached_data, nd.NDArray)
    assert isinstance(X, ndl.Tensor)
    assert isinstance(y, ndl.Tensor)
    assert X.dtype == 'float32'
目前ResNet9已经实现,之前任务2里面没有pass也可以pass,但也需要对dataloader增加参数
test_conv.py的test_train_cifar10
# Build the CIFAR-10 DataLoader with explicit device/dtype so batches land on
# the right backend (required after the DataLoader rewrite).
dataloader = ndl.data.DataLoader(\
    dataset=dataset,
    batch_size=128,
    shuffle=False,
    device=device,
    dtype="float32"
)
实现ptb数据集读写
python3 -m pytest -l -v -k "ptb"不知道为什么也会对cifar10进行测试
python3 -m pytest -l -v -k "ptb_dataset"只测试ptb数据集
class Dictionary(object):
    """Bidirectional word <-> index mapping, built incrementally."""

    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        """Insert `word` if unseen and return its (stable) integer id."""
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        """Number of distinct words seen so far."""
        return len(self.idx2word)
class Corpus(object):
    """
    Creates corpus from train, and test txt files.
    """

    def __init__(self, base_dir, max_lines=None):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(base_dir, 'train.txt'), max_lines)
        self.test = self.tokenize(os.path.join(base_dir, 'test.txt'), max_lines)

    def tokenize(self, path, max_lines=None):
        """
        Tokenize the file at `path` into a flat list of word ids, appending an
        '<eos>' id after every line. At most `max_lines` lines are consumed
        (None reads the whole file).
        """
        # Fixed: encoding was misspelled as 'utf--8', which raises LookupError.
        with open(path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        ids = []
        for lineno, line in enumerate(lines):
            if lineno == max_lines:
                break
            for word in line.strip().split(' '):
                ids.append(self.dictionary.add_word(word))
            ids.append(self.dictionary.add_word('<eos>'))
        return ids
### END YOUR SOLUTION
def batchify(data, batch_size, device, dtype):
    """
    Arrange a flat id sequence into a (nbatch, batch_size) array, trimming
    the tail that does not fit evenly. `device` is accepted for interface
    compatibility but unused here; tensors are materialized later in get_batch.
    """
    arr = np.array(data, dtype=dtype)
    nbatch = len(data) // batch_size
    trimmed = arr[:batch_size * nbatch]
    return trimmed.reshape((batch_size, nbatch)).T
def get_batch(batches, i, bptt, device=None, dtype=None):
    """
    Slice a (data, target) pair starting at row `i` of the batchified array.
    data covers rows [i, i+bptt); target is the same window shifted forward
    by one row and flattened. The window shrinks near the end of the array.
    """
    seq_len = min(bptt, len(batches) - 1 - i)
    window = batches[i:i + seq_len]
    shifted = batches[i + 1:i + 1 + seq_len]
    return (Tensor(window, device=device, dtype=dtype),
            Tensor(shifted.reshape(-1), device=device, dtype=dtype))
Part 7: Training a word-level language model
本部分需要:在 python/needle/nn.py 中实现 Embedding;在 apps/models.py 中实现 LanguageModel;在 apps/simple_training.py 中实现 epoch_general_ptb、train_ptb 和 evaluate_ptb。
nn.py
class Embedding(Module):
    def __init__(self, num_embeddings, embedding_dim, device=None, dtype="float32"):
        """
        Maps one-hot word vectors from a dictionary of fixed size to embeddings.

        Parameters:
        num_embeddings (int) - Size of the dictionary
        embedding_dim (int) - The size of each embedding vector

        Variables:
        weight - The learnable weights of shape (num_embeddings, embedding_dim)
        initialized from N(0, 1).
        """
        super().__init__()
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        self.weight = Parameter(init.randn(num_embeddings, embedding_dim, device=device, dtype=dtype))

    def forward(self, x: Tensor) -> Tensor:
        """
        Maps word indices to one-hot vectors, and projects to embedding vectors.

        Input: x of shape (seq_len, bs)
        Output: tensor of shape (seq_len, bs, embedding_dim)
        """
        # Fixed: the one-hot activation was stashed on `self.one_hot`, keeping
        # the whole batch alive for the module's lifetime — now a plain local.
        one_hot = init.one_hot(self.num_embeddings, x, device=x.device, dtype=x.dtype)
        seq_len, bs, n = one_hot.shape
        # Flatten to 2D so the lookup is a single matmul against the table.
        flat = one_hot.reshape((seq_len * bs, n))
        out = flat @ self.weight
        return out.reshape((seq_len, bs, self.embedding_dim))
models.py
class LanguageModel(nn.Module):
    def __init__(self, embedding_size, output_size, hidden_size, num_layers=1,
                 seq_model='rnn', device=None, dtype="float32"):
        """
        Word-level language model: Embedding -> (RNN | LSTM) -> Linear.

        Parameters:
        output_size: Size of dictionary
        embedding_size: Size of embeddings
        hidden_size: The number of features in the hidden state of LSTM or RNN
        seq_model: 'rnn' or 'lstm', whether to use RNN or LSTM
        num_layers: Number of layers in RNN or LSTM
        """
        super(LanguageModel, self).__init__()
        self.embedding_size = embedding_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, embedding_size, device=device, dtype=dtype)
        seq_classes = {'rnn': nn.RNN, 'lstm': nn.LSTM}
        if seq_model not in seq_classes:
            raise NotImplementedError
        self.model = seq_classes[seq_model](embedding_size, hidden_size, num_layers,
                                            device=device, dtype=dtype)
        self.linear = nn.Linear(hidden_size, output_size, device=device, dtype=dtype)

    def forward(self, x, h=None):
        """
        Given sequence (and the previous hidden state if given), returns
        probabilities of next word along with the sequence model's last
        hidden state.

        Inputs:
        x of shape (seq_len, bs)
        h of shape (num_layers, bs, hidden_size) if using RNN,
        else a tuple of (h0, c0), each of that shape.

        Returns (out, h): out of shape (seq_len*bs, output_size), h as above.
        """
        seq_len, bs = x.shape
        embedded = self.embedding(x)
        features, next_h = self.model(embedded, h)
        logits = self.linear(features.reshape((seq_len * bs, self.hidden_size)))
        return logits, next_h
simple_training.py
### PTB training ###
def epoch_general_ptb(data, model, seq_len=40, loss_fn=nn.SoftmaxLoss(), opt=None,
                      clip=None, device=None, dtype="float32"):
    """
    Run one epoch over batchified PTB `data` (shape (nbatch, batch_size)).
    Trains with truncated BPTT of length `seq_len` when `opt` is given,
    otherwise evaluates.

    Returns (avg_acc, avg_loss), weighted by the number of target tokens.

    NOTE(review): `clip` (gradient clipping) is accepted but never applied —
    confirm whether clipping should be added to the optimizer step.
    NOTE: loss_fn defaults to a shared module instance; kept for
    interface compatibility.
    """
    np.random.seed(4)
    correct, loss_sum, n_samples = 0.0, 0.0, 0
    if opt:
        model.train()
    else:
        model.eval()
    h = None
    for i in range(0, data.shape[0] - 1, seq_len):
        X, y = ndl.data.get_batch(data, i, seq_len, device=device, dtype=dtype)
        # Fixed: removed leftover debug print of the batch shape.
        if opt:
            opt.reset_grad()
        pred, h = model(X, h)
        # Detach the hidden state so the autograd graph does not grow across
        # windows (truncated backpropagation through time).
        if isinstance(h, tuple):
            h = (h[0].detach(), h[1].detach())
        else:
            h = h.detach()
        loss = loss_fn(pred, y)
        correct += (pred.numpy().argmax(axis=1) == y.numpy()).sum()
        if opt:
            loss.backward()
            opt.step()
        loss_sum += loss.numpy() * y.shape[0]
        n_samples += y.shape[0]
    return correct / n_samples, loss_sum / n_samples
def train_ptb(model, data, seq_len=40, n_epochs=1, optimizer=ndl.optim.SGD,
              lr=4.0, weight_decay=0.0, loss_fn=nn.SoftmaxLoss, clip=None,
              device=None, dtype="float32"):
    """
    Train `model` on batchified PTB data for `n_epochs` epochs and return the
    last epoch's (accuracy, loss). `loss_fn` is a class, instantiated per call.
    """
    np.random.seed(4)
    opt = optimizer(model.parameters(), lr=lr, weight_decay=weight_decay)
    for _ in range(n_epochs):
        train_acc, train_loss = epoch_general_ptb(
            data, model, seq_len=seq_len, loss_fn=loss_fn(), opt=opt,
            clip=clip, device=device, dtype=dtype)
        print(train_acc, " ", train_loss)
    return train_acc, train_loss
def evaluate_ptb(model, data, seq_len=40, loss_fn=nn.SoftmaxLoss,
                 device=None, dtype="float32"):
    """
    Evaluate `model` over batchified PTB data (no optimizer, so the epoch
    helper runs in eval mode) and return (accuracy, loss).
    """
    np.random.seed(4)
    avg_acc, avg_loss = epoch_general_ptb(data, model, seq_len=seq_len,
                                          loss_fn=loss_fn(), device=device, dtype=dtype)
    print(avg_acc, " ", avg_loss)
    return avg_acc, avg_loss
device=device,dtype=dtype 形参、实参都不能省
optim.py中SGD函数需要改动,要么加device,要么直接用parm.grad
两个本地测试都不能完全过去,第一个会检查有没有无梯度的参数,因为存在无梯度的参数,所以过不去。 第二个是算出来结果和给定值不一样,过不去。 但似乎不影响最后训练。
python3 -m pytest -l -v -k "language_model_implementation"
python3 -m pytest -l -v -k "language_model_training" 错
总结
作业4很难,主要难在和之前作业变化有点大,改了一个又要改其他的。一定要注意现在要考虑 device 和 dtype,函数实现时一定要显式写清楚设备和数据类型参数。
完成了一遍,但也是照着别人写的,关于如何调用cpu和gpu等还不是很懂。准备过段时间再复习一遍。