hw4实验指南——实现CNN和LSTM

472 阅读13分钟

hw4的目标是利用前三个作业,实现卷积和卷积神经网络并在CIFAR-10上完成分类器,然后实现LSTM在Penn Treebank数据集上进行单词级预测语言建模。

所有代码地址

.
├── apps
│   ├── models.py
│   └── simple_training.py
├── CMakeLists.txt
├── hw4.ipynb
├── Makefile
├── python
│   └── needle
│       ├── autograd.py
│       ├── backend_ndarray
│       │   ├── __init__.py
│       │   ├── ndarray_backend_numpy.py
│       │   └── ndarray.py
│       ├── backend_numpy.py
│       ├── backend_selection.py
│       ├── data.py
│       ├── __init__.py
│       ├── init.py
│       ├── nn.py
│       ├── ops.py
│       └── optim.py
├── ResNet9.png
├── src
│   ├── ndarray_backend_cpu.cc
│   └── ndarray_backend_cuda.cu
└── tests
    ├── __pycache__
    │   ├── test_cifar_ptb_data.cpython-38-pytest-6.1.1.pyc
    │   ├── test_cifar_ptb_data.cpython-38-pytest-7.1.2.pyc
    │   ├── test_conv.cpython-38-pytest-6.1.1.pyc
    │   ├── test_conv.cpython-38-pytest-7.1.2.pyc
    │   ├── test_nd_backend.cpython-38-pytest-6.1.1.pyc
    │   ├── test_nd_backend.cpython-38-pytest-7.1.2.pyc
    │   ├── test_sequence_models.cpython-38-pytest-6.1.1.pyc
    │   └── test_sequence_models.cpython-38-pytest-7.1.2.pyc
    ├── test_cifar_ptb_data.py
    ├── test_conv.py
    ├── test_nd_backend.py
    └── test_sequence_models.py

7 directories, 32 files

Part 1: ND Backend

什么都不做直接运行测试会报错:无法导入库。

把 hw3 的 src 文件夹复制过来并 make,通过 4/118;将之前hw2中ops.py实现的函数复制到hw4的ops.py中,并添加实现新功能Tanh、Stack、Split,通过 48/118。

hw1 autograd.py 实现函数赋值 52/118

hw3 ndarray.py实现的函数进行复制 70/118

ops 两处错误修正 76/118

sum、max、swapaxes 我们写的n维数组库没有这些函数 broadcast_to 需要compact

Part 2: CIFAR-10 dataset

实现CIFAR-10图像分类数据集。 首先下载CIFAR-10数据集到data目录下面。

class CIFAR10Dataset(Dataset):
    """CIFAR-10 image classification dataset loaded from the python pickle batches."""

    def __init__(
        self,
        base_folder: str,
        train: bool,
        p: Optional[float] = 0.5,
        transforms: Optional[List] = None
    ):
        """
        Parameters:
        base_folder: directory containing data_batch_1..5 and test_batch
        train: load the five training batches if True, else the test batch
        p: unused here; kept for interface compatibility (transform probability)
        transforms: optional list of transforms applied in __getitem__
        """
        super().__init__(transforms)
        X = []
        y = []
        if train:
            for i in range(1, 6):
                with open(os.path.join(base_folder, 'data_batch_%d' % i), 'rb') as fo:
                    # renamed from `dict` to avoid shadowing the builtin
                    batch = pickle.load(fo, encoding='bytes')
                    X.append(batch[b'data'].astype(np.float32))
                    y.append(batch[b'labels'])
        else:
            with open(os.path.join(base_folder, 'test_batch'), 'rb') as fo:
                batch = pickle.load(fo, encoding='bytes')
                X.append(batch[b'data'].astype(np.float32))
                y.append(batch[b'labels'])
        # scale pixel values into [0, 1]
        self.X = np.concatenate(X, axis=0) / 255.0
        self.y = np.concatenate(y, axis=0)

    def __getitem__(self, index) -> object:
        """
        Returns the image, label at given index
        Image should be of shape (3, 32, 32)
        """
        X, y = self.X[index], self.y[index]
        if self.transforms:
            # transforms operate on HWC layout; convert back to CHW afterwards
            X_in = X.reshape((-1, 32, 32, 3))
            X_out = self.apply_transforms(X_in)
            X_ret = X_out.reshape(-1, 3, 32, 32)
            return X_ret, y
        else:
            # squeeze removes the leading batch dim for a single-index access
            return np.squeeze(X.reshape((-1, 3, 32, 32))), y

    def __len__(self) -> int:
        """Total number of examples loaded."""
        return self.y.shape[0]

十二个测试通过十个,还有两个test_train_cifar10失败,等ResNet实现了再回头测。

本地测试 python3 -m pytest -l -v -k "cifar10"

Part 3: Convolutional neural network

任务3实现卷积神经网络

ndarray.py

在ndarray.py实现flip和pad函数

    def flip(self, axes):
        """
        Flip this ndarray along the specified axes (all axes when None).
        Implemented by negating the strides of the flipped axes and moving
        the offset to the last element along each such axis; the resulting
        view is compacted before being returned.
        """
        if axes is None:
            axes = range(len(self.strides))
        strides = list(self.strides)
        start = 0
        for ax in axes:
            # point at the last element of this axis and walk backwards
            start += (self.shape[ax] - 1) * self.strides[ax]
            strides[ax] = -strides[ax]
        view = NDArray.make(
            shape=self.shape,
            strides=tuple(strides),
            device=self.device,
            handle=self._handle,
            offset=start,
        )
        return view.compact()


    def pad(self, axes):
        """
        Zero-pad this ndarray. `axes` gives a (left, right) padding amount
        for every axis, e.g. ((0, 0), (1, 1), (0, 0)) pads only the middle
        axis with one zero on each side.
        """
        padded_shape = tuple(
            dim + left + right for dim, (left, right) in zip(self.shape, axes)
        )
        out = NDArray.make(padded_shape, device=self.device)
        out.fill(0)
        # copy the original data into the interior region
        region = tuple(
            slice(left, left + dim) for dim, (left, _) in zip(self.shape, axes)
        )
        out[region] = self
        return out

本地测试 python3 -m pytest -l -v -k "pad_forward"

ops.py

Flip

flip测试通不过,

cc和cuda文件有问题

部分位置的uint32_t需要换成int32_t

本地测试python3 -m pytest -l -v -k "flip"

Dilate UnDilate

class Dilate(TensorOp):
    """Insert `dilation` zeros between adjacent entries along each axis in `axes`."""

    def __init__(self, axes: tuple, dilation: int):
        self.axes = axes
        self.dilation = dilation

    def compute(self, a):
        # Any out-of-range axis means "no dilation": return the input untouched.
        if any(ax >= len(a.shape) for ax in self.axes):
            return a
        step = self.dilation + 1
        out_shape = [
            dim * step if ax in self.axes else dim
            for ax, dim in enumerate(a.shape)
        ]
        out = init.zeros(*out_shape, device=a.device)
        # Scatter the input into every `step`-th position of the dilated axes.
        sel = tuple(
            slice(0, out_shape[ax], step if ax in self.axes else 1)
            for ax in range(len(a.shape))
        )
        out.cached_data[sel] = a
        return out.cached_data

    def gradient(self, out_grad, node):
        # Dropping the inserted zeros is exactly the adjoint of dilation.
        return undilate(out_grad, self.axes, self.dilation)

def dilate(a, axes, dilation):
    """Functional wrapper around the Dilate op."""
    op = Dilate(axes, dilation)
    return op(a)

class UnDilate(TensorOp):
    """Inverse of Dilate: keep every (dilation+1)-th entry along each axis in `axes`."""

    def __init__(self, axes: tuple, dilation: int):
        self.axes = axes
        self.dilation = dilation

    def compute(self, a):
        step = self.dilation + 1
        sel = tuple(
            slice(0, a.shape[ax], step) if ax in self.axes else slice(0, a.shape[ax])
            for ax in range(len(a.shape))
        )
        return a[sel]

    def gradient(self, out_grad, node):
        # Re-inserting the zeros undoes the subsampling.
        return dilate(out_grad, self.axes, self.dilation)

本地测试 python3 -m pytest -l -v -k "dilate"

Conv

class Conv(TensorOp):
    """im2col-based 2D convolution for NHWC inputs and (K, K, C_in, C_out) kernels."""

    def __init__(self, stride: Optional[int] = 1, padding: Optional[int] = 0):
        self.stride = stride
        self.padding = padding

    def compute(self, A, B):
        ### BEGIN YOUR SOLUTION
        # Zero-pad height and width on both sides; batch/channel axes untouched.
        A = A.pad(((0,0), (self.padding, self.padding), (self.padding, self.padding), (0, 0)))
        N, H, W, C_in = A.shape
        K, _, _, C_out = B.shape
        Ns, Hs, Ws, Cs = A.strides
        # im2col: expose every KxK receptive field as a row, then one big matmul.
        A = A.as_strided((N, H-K+1, W-K+1, K, K, C_in), (Ns, Hs, Ws, Hs, Ws, Cs)).compact()
        A = A.reshape((N*(H-K+1)*(W-K+1), K*K*C_in))
        out = A@(B.reshape((K*K*C_in, C_out)))
        # Stride is applied by subsampling the full stride-1 output.
        return out.reshape((N, H-K+1, W-K+1, C_out))[:, ::self.stride, ::self.stride, :]
        ### END YOUR SOLUTION

    def gradient(self, out_grad, node):
        ### BEGIN YOUR SOLUTION
        X, weight = node.inputs[0], node.inputs[1]
        '''
        out_grad N, H-K+1+2*p, W-K+1+2*p, C_out
        X N, H, W, C_in
        W K, K, C_in, C_out
        '''
        _, H, W, _ = X.shape
        K = weight.shape[0]
        # dX is a full convolution of out_grad with the spatially flipped kernel,
        # with input/output channel axes swapped.
        W_flip = flip(weight, (0,1)).transpose((2,3))
        if self.stride > 1:
            # Undo the output subsampling by re-inserting zeros between rows/cols.
            out_grad = dilate(out_grad, (1,2), self.stride-1)
        dX = conv(out_grad, W_flip, padding=K - 1)
        # NOTE: slicing begins at self.padding (not K-1) to remove the padded border.
        # NOTE: keep device/dtype when rewrapping the raw data as a Tensor.
        dX = Tensor(dX.cached_data[:, self.padding:H + self.padding, self.padding:W + self.padding, :],
                    device=dX.device, dtype=dX.dtype)
        '''
        X, out_grad
        C_in, H, W, N
        H-K+1+2*p, W-K+1+2*p, N, C_out

        C_in, K, K, C_out
        '''
        # dW is itself a convolution of the permuted input with the permuted
        # output gradient; the transposes swap batch and channel axes.
        X = X.transpose((0, 3))
        out_grad = out_grad.transpose((0, 2)).transpose((0, 1))
        dW = conv(X, out_grad, padding=self.padding)
        dW = dW.transpose((0, 2)).transpose((0, 1))
        return dX, dW
        ### END YOUR SOLUTION

ndarray.py中的reshape重写,conv反向传播测试才能通过

    def reshape(self, new_shape):
        """
        Return an NDArray with the given shape. The array is compacted first
        so that row-major (compact) strides are valid for the new shape —
        reshaping a non-compact view directly would reinterpret the wrong bytes.
        """
        compacted = self.compact()
        return compacted.as_strided(new_shape, NDArray.compact_strides(new_shape))

本地测试

python3 -m pytest -l -v -k "op_conv and forward"
python3 -m pytest -l -v -k "op_conv and backward"

nn.py

先将init.py里的四个初始化重写,函数参数有变化,不能直接粘贴复制之前的。

def kaiming_uniform(fan_in, fan_out, shape=None, nonlinearity="relu", **kwargs):
    """
    Kaiming (He) uniform initialization: samples U(-bound, bound) with
    bound = gain * sqrt(3 / fan_in) and gain = sqrt(2) for ReLU.

    shape defaults to (fan_in, fan_out) when not given, so the function also
    works with the earlier hw2-style call sites that omit `shape`.
    """
    assert nonlinearity == "relu", "Only relu supported currently"
    if shape is None:
        # Original raised TypeError on rand(*None); fall back to a 2-D weight.
        shape = (fan_in, fan_out)
    gain = math.sqrt(2)
    bound = gain * math.sqrt(3 / fan_in)
    # rand yields U(0, 1); rescale to U(-bound, bound)
    return bound * (2 * rand(*shape, **kwargs) - 1)

本地测试 python3 -m pytest -l -v -k "kaiming_uniform"

再实现卷积网络(上面实现的是卷积算子)

class Conv(Module):
    """
    Multi-channel 2D convolutional layer
    IMPORTANT: Accepts inputs in NCHW format, outputs also in NCHW format
    Only supports padding=same
    No grouped convolution or dilation
    Only supports square kernels
    """
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, bias=True, device=None, dtype="float32"):
        super().__init__()
        if isinstance(kernel_size, tuple):
            kernel_size = kernel_size[0]
        if isinstance(stride, tuple):
            stride = stride[0]
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride

        ### BEGIN YOUR SOLUTION
        # Kernel stored as (K, K, C_in, C_out) to match the NHWC conv op.
        shape = (kernel_size, kernel_size, in_channels, out_channels)
        self.weight = Parameter(init.kaiming_uniform(
                                self.in_channels*kernel_size*kernel_size,
                                self.out_channels*kernel_size*kernel_size,
                                shape = shape,
                                device = device,
                                dtype = dtype,
                                requires_grad = True))
        if bias:
            # Bias from U(-1/sqrt(fan_in), 1/sqrt(fan_in)) with fan_in = C_in*K*K.
            self.bias = Parameter(init.rand(
                                    int(self.out_channels),
                                    low = -1/(in_channels*kernel_size**2)**0.5,
                                    high = 1/(in_channels*kernel_size**2)**0.5,
                                    device = device,
                                    dtype = dtype,
                                    requires_grad = True
                                    ))
        else:
            self.bias = None
        ### END YOUR SOLUTION

    def forward(self, x: Tensor) -> Tensor:
        """NCHW in -> NCHW out; padding=K//2 gives 'same' spatial size at stride 1."""
        ### BEGIN YOUR SOLUTION
        N, C, H, W = x.shape
        # NCHW -> NHWC for the underlying conv op.
        x = x.transpose((1, 2)).transpose((2,3))
        out = ops.conv(x, self.weight, stride=self.stride, padding = self.kernel_size//2)
        # Explicit None check: truthiness of a Parameter tensor is not well-defined,
        # whereas `if self.bias:` silently depends on it.
        if self.bias is not None:
            out += self.bias.reshape((1,1,1,self.out_channels)).broadcast_to(out.shape)
        return out.transpose((2, 3)).transpose((1,2))
        ### END YOUR SOLUTION

本地测试

python3 -m pytest -l -v -k "nn_conv_forward" 
python3 -m pytest -l -v -k "nn_conv_backward"

models.py

将hw2中nn.py实现的基础网络结构复制到hw4的nn.py中,models.py实现ResNet9需要用到多种基础结构。 因为重写了init方法,所以一些网络中参数初始化部分需要重写。 Linear、BatchNorm、layerNorm、dropout

batchnorm等需要注意先reshape再broadcast

class Tanh(Module):
    """Element-wise hyperbolic tangent activation."""

    def forward(self, x: Tensor) -> Tensor:
        out = ops.tanh(x)
        return out


class Sigmoid(Module):
    """Element-wise logistic sigmoid: 1 / (1 + exp(-x))."""

    def __init__(self):
        super().__init__()

    def forward(self, x: Tensor) -> Tensor:
        # computed as (1 + exp(-x)) ** -1 using existing ops
        denom = 1 + ops.exp(-x)
        return denom ** -1

本地测试

python3 -m pytest -l -v -k "resnet9"

simple_training.py

  • epoch_general_cifar10,
  • train_cifar10
  • evaluate_cifar10

dataloader重写,加上device、dtype以及shuffle判断

不能正常运行,ops和data文件有问题 'needle.backend_ndarray.ndarray_backend_cpu.Array' object has no attribute 'array' 主要是dtype、device问题

copy sgd之前代码

成功运行

Part 4: Recurrent neural network

注意变量名必须和提示一致,因为测试时候会用到

class RNNCell(Module):
    def __init__(self, input_size, hidden_size, bias=True, nonlinearity='tanh', device=None, dtype="float32"):
        """
        Applies an RNN cell with tanh or ReLU nonlinearity.

        Parameters:
        input_size: The number of expected features in the input X
        hidden_size: The number of features in the hidden state h
        bias: If False, then the layer does not use bias weights
        nonlinearity: The non-linearity to use. Can be either 'tanh' or 'relu'.

        Variables:
        W_ih: The learnable input-hidden weights of shape (input_size, hidden_size).
        W_hh: The learnable hidden-hidden weights of shape (hidden_size, hidden_size).
        bias_ih: The learnable input-hidden bias of shape (hidden_size,).
        bias_hh: The learnable hidden-hidden bias of shape (hidden_size,).

        Weights and biases are initialized from U(-sqrt(k), sqrt(k)) where k = 1/hidden_size
        """
        super().__init__()
        ### BEGIN YOUR SOLUTION
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias

        # All parameters drawn from U(-sqrt(k), sqrt(k)), k = 1/hidden_size.
        self.W_ih = Parameter(init.rand(
            input_size, hidden_size, low = -1/hidden_size**0.5, high = 1/hidden_size**0.5,
            device = device, dtype = dtype, requires_grad = True
        ))

        self.W_hh = Parameter(init.rand(
            hidden_size, hidden_size, low = -1/hidden_size**0.5, high = 1/hidden_size**0.5,
            device = device, dtype = dtype, requires_grad = True
        ))

        if bias:
            self.bias_ih = Parameter((init.rand(
                hidden_size, low = -1/hidden_size**0.5, high = 1/hidden_size**0.5,
                device = device, dtype = dtype, requires_grad = True
            )))
            self.bias_hh = Parameter((init.rand(
                hidden_size, low = -1/hidden_size**0.5, high = 1/hidden_size**0.5,
                device = device, dtype = dtype, requires_grad = True
            )))
        # NOTE(review): any value other than 'tanh' falls back to ReLU;
        # 'relu' is the only other documented option.
        self.nonlinearity = Tanh() if nonlinearity == 'tanh' else ReLU()
        ### END YOUR SOLUTION

    def forward(self, X, h=None):
        """
        Inputs:
        X of shape (bs, input_size): Tensor containing input features
        h of shape (bs, hidden_size): Tensor containing the initial hidden state
            for each element in the batch. Defaults to zero if not provided.

        Outputs:
        h' of shape (bs, hidden_size): Tensor containing the next hidden state
            for each element in the batch.
        """
        ### BEGIN YOUR SOLUTION
        bs = X.shape[0]
        # h' = nonlinearity(X @ W_ih + b_ih + h @ W_hh + b_hh).
        # A missing h is the zero state, so its matmul term is simply skipped.
        out = X @ self.W_ih
        if self.bias:
            out += self.bias_ih.reshape((1, self.hidden_size)).broadcast_to((bs, self.hidden_size))
        if h is not None:
            out += h @ self.W_hh
        if self.bias:
            out += self.bias_hh.reshape((1, self.hidden_size)).broadcast_to((bs, self.hidden_size))
        return self.nonlinearity(out)
        ### END YOUR SOLUTION


class RNN(Module):
    def __init__(self, input_size, hidden_size, num_layers=1, bias=True, nonlinearity='tanh', device=None, dtype="float32"):
        """
        Applies a multi-layer RNN with tanh or ReLU non-linearity to an input sequence.

        Parameters:
        input_size - The number of expected features in the input x
        hidden_size - The number of features in the hidden state h
        num_layers - Number of recurrent layers.
        nonlinearity - The non-linearity to use. Can be either 'tanh' or 'relu'.
        bias - If False, then the layer does not use bias weights.

        Variables:
        rnn_cells[k].W_ih: The learnable input-hidden weights of the k-th layer,
            of shape (input_size, hidden_size) for k=0. Otherwise the shape is
            (hidden_size, hidden_size).
        rnn_cells[k].W_hh: The learnable hidden-hidden weights of the k-th layer,
            of shape (hidden_size, hidden_size).
        rnn_cells[k].bias_ih: The learnable input-hidden bias of the k-th layer,
            of shape (hidden_size,).
        rnn_cells[k].bias_hh: The learnable hidden-hidden bias of the k-th layer,
            of shape (hidden_size,).
        """
        super().__init__()
        ### BEGIN YOUR SOLUTION
        self.hidden_size = hidden_size
        self.device = device
        self.dtype = dtype
        self.num_layers = num_layers
        # First cell maps input_size -> hidden_size, the rest are hidden -> hidden.
        # NOTE(review): cells are kept in a plain list; assumes Module's parameter
        # discovery traverses lists — confirm against needle's _unpack_params.
        rnn_cells = [RNNCell(input_size, hidden_size, bias, nonlinearity, device, dtype)]
        for i in range(num_layers - 1):
            rnn_cells.append(RNNCell(hidden_size, hidden_size, bias, nonlinearity, device, dtype))
        self.rnn_cells = rnn_cells
        ### END YOUR SOLUTION

    def forward(self, X, h0=None):
        """
        Inputs:
        X of shape (seq_len, bs, input_size) containing the features of the input sequence.
        h_0 of shape (num_layers, bs, hidden_size) containing the initial
            hidden state for each element in the batch. Defaults to zeros if not provided.

        Outputs
        output of shape (seq_len, bs, hidden_size) containing the output features
            (h_t) from the last layer of the RNN, for each t.
        h_n of shape (num_layers, bs, hidden_size) containing the final hidden state for each element in the batch.
        """
        ### BEGIN YOUR SOLUTION
        # Split the sequence into per-timestep tensors and the initial state
        # into per-layer tensors (None means zero state for every layer).
        Xs = ops.split(X, 0)
        hs = ops.split(h0, 0) if h0 is not None else [None] * self.num_layers
        out = []
        for t, x in enumerate(Xs):
            hiddens = []
            # x cascades through the layer stack; each layer's output is both
            # the next layer's input and that layer's new hidden state.
            for l, model in enumerate(self.rnn_cells):
                x = model(x, hs[l])
                hiddens.append(x)
            out.append(x)
            hs = hiddens
        out = ops.stack(out, 0)
        hs = ops.stack(hs, 0)
        return out, hs
        ### END YOUR SOLUTION

Part 5: Long short-term memory network

class LSTMCell(Module):
    def __init__(self, input_size, hidden_size, bias=True, device=None, dtype="float32"):
        """
        A long short-term memory (LSTM) cell.

        Parameters:
        input_size - The number of expected features in the input X
        hidden_size - The number of features in the hidden state h
        bias - If False, then the layer does not use bias weights

        Variables:
        W_ih - The learnable input-hidden weights, of shape (input_size, 4*hidden_size).
        W_hh - The learnable hidden-hidden weights, of shape (hidden_size, 4*hidden_size).
        bias_ih - The learnable input-hidden bias, of shape (4*hidden_size,).
        bias_hh - The learnable hidden-hidden bias, of shape (4*hidden_size,).

        Weights and biases are initialized from U(-sqrt(k), sqrt(k)) where k = 1/hidden_size
        """
        super().__init__()
        ### BEGIN YOUR SOLUTION
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias

        # All four gates share one fused weight matrix of width 4*hidden_size.
        il, hl = input_size, hidden_size
        self.W_ih = Parameter(
            init.rand(il, 4*hl, low=-1/hl**0.5, high=1/hl**0.5, device=device, dtype=dtype, requires_grad=True)
        )
        self.W_hh = Parameter(
            init.rand(hl, 4*hl, low=-1/hl**0.5, high=1/hl**0.5, device=device, dtype=dtype, requires_grad=True)
        )
        if bias:
            self.bias_ih = Parameter(
                init.rand(4*hl, low=-1/hl**0.5, high=1/hl**0.5, device=device, dtype=dtype, requires_grad=True)
            )
            self.bias_hh = Parameter(
                init.rand(4*hl, low=-1/hl**0.5, high=1/hl**0.5, device=device, dtype=dtype, requires_grad=True)
            )
        self.tanh = Tanh()
        self.sigmoid = Sigmoid()
        ### END YOUR SOLUTION


    def forward(self, X, h=None):
        """
        Inputs: X, h
        X of shape (batch, input_size): Tensor containing input features
        h, tuple of (h0, c0), with
            h0 of shape (bs, hidden_size): Tensor containing the initial hidden state
                for each element in the batch. Defaults to zero if not provided.
            c0 of shape (bs, hidden_size): Tensor containing the initial cell state
                for each element in the batch. Defaults to zero if not provided.

        Outputs: (h', c')
        h' of shape (bs, hidden_size): Tensor containing the next hidden state for each
            element in the batch.
        c' of shape (bs, hidden_size): Tensor containing the next cell state for each
            element in the batch.
        """
        ### BEGIN YOUR SOLUTION
        bs = X.shape[0]
        h0, c0 = (None, None) if h is None else h
        hl = self.hidden_size
        # Fused pre-activation for all four gates: X W_ih + h W_hh + biases.
        out = X @ self.W_ih
        if h0 is not None:
            out += h0 @ self.W_hh
        if self.bias:
            out += self.bias_ih.reshape((1, 4*hl)).broadcast_to((bs, 4*hl))
            out += self.bias_hh.reshape((1, 4*hl)).broadcast_to((bs, 4*hl))
        # Gate order along the 4*hl axis: input (i), forget (f), cell (g), output (o).
        # NOTE(review): this splits into 4*hl single columns and restacks them —
        # correct but O(hidden_size) ops per gate; fine for the homework scale.
        out_list = ops.split(out, 1)
        i = ops.stack(tuple([out_list[i] for i in range(0, hl)]), 1)
        f = ops.stack(tuple([out_list[i] for i in range(hl, 2*hl)]), 1)
        g = ops.stack(tuple([out_list[i] for i in range(2*hl, 3*hl)]), 1)
        o = ops.stack(tuple([out_list[i] for i in range(3*hl, 4*hl)]), 1)
        
        g = self.tanh(g)
        i, f, o = self.sigmoid(i), self.sigmoid(f), self.sigmoid(o)

        # c' = f*c0 + i*g (f*c0 term vanishes when the cell state defaults to zero).
        c1 = i*g if c0 is None else f*c0+i*g
        h1 = o*self.tanh(c1)
        return (h1, c1)
        ### END YOUR SOLUTION


class LSTM(Module):
    def __init__(self, input_size, hidden_size, num_layers=1, bias=True, device=None, dtype="float32"):
        super().__init__()
        """
        Applies a multi-layer long short-term memory (LSTM) RNN to an input sequence.

        Parameters:
        input_size - The number of expected features in the input x
        hidden_size - The number of features in the hidden state h
        num_layers - Number of recurrent layers.
        bias - If False, then the layer does not use bias weights.

        Variables:
        lstm_cells[k].W_ih: The learnable input-hidden weights of the k-th layer,
            of shape (input_size, 4*hidden_size) for k=0. Otherwise the shape is
            (hidden_size, 4*hidden_size).
        lstm_cells[k].W_hh: The learnable hidden-hidden weights of the k-th layer,
            of shape (hidden_size, 4*hidden_size).
        lstm_cells[k].bias_ih: The learnable input-hidden bias of the k-th layer,
            of shape (4*hidden_size,).
        lstm_cells[k].bias_hh: The learnable hidden-hidden bias of the k-th layer,
            of shape (4*hidden_size,).
        """
        ### BEGIN YOUR SOLUTION
        self.num_layers = num_layers
        # First cell maps input_size -> hidden_size, the rest are hidden -> hidden.
        lstm_cells = [LSTMCell(input_size, hidden_size, bias, device, dtype)]
        for i in range(num_layers-1):
            lstm_cells.append(LSTMCell(hidden_size, hidden_size, bias, device, dtype))
        self.lstm_cells = lstm_cells
        ### END YOUR SOLUTION

    def forward(self, X, h=None):
        """
        Inputs: X, h
        X of shape (seq_len, bs, input_size) containing the features of the input sequence.
        h, tuple of (h0, c0) with
            h_0 of shape (num_layers, bs, hidden_size) containing the initial
                hidden state for each element in the batch. Defaults to zeros if not provided.
            c0 of shape (num_layers, bs, hidden_size) containing the initial
                hidden cell state for each element in the batch. Defaults to zeros if not provided.

        Outputs: (output, (h_n, c_n))
        output of shape (seq_len, bs, hidden_size) containing the output features
            (h_t) from the last layer of the LSTM, for each t.
        tuple of (h_n, c_n) with
            h_n of shape (num_layers, bs, hidden_size) containing the final hidden state for each element in the batch.
            c_n of shape (num_layers, bs, hidden_size) containing the final hidden cell state for each element in the batch.
        """
        ### BEGIN YOUR SOLUTION
        # Per-timestep inputs and per-layer initial states (None == zero state).
        Xs = ops.split(X, 0)
        h0, c0 = (None, None) if h is None else h
        hs = [None] * self.num_layers if h0 is None else ops.split(h0, 0)
        cs = [None] * self.num_layers if c0 is None else ops.split(c0, 0)
        out = []
        for t, x in enumerate(Xs):
            hiddens = []
            cells = []
            # x cascades through the stack; each layer also carries a cell state.
            for l, model in enumerate(self.lstm_cells):
                x, c_out = model(x, (hs[l], cs[l]))
                hiddens.append(x)
                cells.append(c_out)
            out.append(x)
            hs = hiddens
            cs = cells
        out = ops.stack(out, 0)
        hs = ops.stack(hs, 0)
        cs = ops.stack(cs, 0)
        return out, (hs, cs)
        ### END YOUR SOLUTION

Part 6: Penn Treebank dataset

由于dataloader等代码有改动,python3 -m pytest -l -v -k "cifar10"之前能通过的测试现在也无法通过了,需要加上device和dtype,写清参数。

test_cifar_ptb_data.py

BATCH_SIZES = [1, 15]
@pytest.mark.parametrize("batch_size", BATCH_SIZES)
@pytest.mark.parametrize("train", TRAIN)
@pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"])
def test_cifar10_loader(batch_size, train, device, dtype="float32"):
    # Verifies that DataLoader yields needle Tensors backed by NDArray storage.
    # NOTE(review): the `train` parameter is not used below — the loader is
    # always built from the training split; confirm against the upstream test.
    cifar10_train_dataset = ndl.data.CIFAR10Dataset("data/cifar-10-batches-py", train=True)
    train_loader = ndl.data.DataLoader(cifar10_train_dataset, batch_size=batch_size, device = device, dtype=dtype)
    for (X, y) in train_loader:
        break
    assert isinstance(X.cached_data, nd.NDArray)
    assert isinstance(X, ndl.Tensor)
    assert isinstance(y, ndl.Tensor)
    assert X.dtype == 'float32'

目前ResNet9已经实现,之前任务2里面没有pass也可以pass,但也需要对dataloader增加参数

test_conv.py的test_train_cifar10

# DataLoader must now be constructed with explicit device and dtype so that
# batches land on the right backend.
dataloader = ndl.data.DataLoader(\
         dataset=dataset,
         batch_size=128,
         shuffle=False,
         device=device,
         dtype="float32"
         )

实现ptb数据集读写

python3 -m pytest -l -v -k "ptb" 也会对cifar10进行测试,因为 -k 匹配的关键字包含测试所在的模块名,test_cifar_ptb_data.py 里的 cifar10 测试也因此带上了 "ptb" 关键字

python3 -m pytest -l -v -k "ptb_dataset"只测试ptb数据集

class Dictionary(object):
    """Bidirectional word <-> integer-id mapping built incrementally."""

    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        """Insert `word` if unseen and return its integer id."""
        idx = self.word2idx.get(word)
        if idx is None:
            # ids are assigned densely in insertion order
            idx = len(self.idx2word)
            self.idx2word.append(word)
            self.word2idx[word] = idx
        return idx

    def __len__(self):
        """Number of distinct words added so far."""
        return len(self.idx2word)

class Corpus(object):
    """
    Creates corpus from train, and test txt files.
    """
    def __init__(self, base_dir, max_lines=None):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(base_dir, 'train.txt'), max_lines)
        self.test = self.tokenize(os.path.join(base_dir, 'test.txt'), max_lines)

    def tokenize(self, path, max_lines=None):
        """
        Read `path`, add every word (plus a trailing '<eos>' per line) to the
        dictionary, and return the flat list of word ids. Stops after
        `max_lines` lines when given (None means read everything).
        """
        ids = []
        # Fixed encoding typo: was 'utf--8', which only worked because Python
        # normalizes codec names. Also stream the file instead of readlines().
        with open(path, 'r', encoding='utf-8') as f:
            for lineno, line in enumerate(f):
                if lineno == max_lines:
                    break
                for word in line.strip().split(' '):
                    ids.append(self.dictionary.add_word(word))
                ids.append(self.dictionary.add_word('<eos>'))
        return ids

def batchify(data, batch_size, device, dtype):
    """
    Arrange a flat id sequence into a (nbatch, batch_size) array, dropping
    the tail that does not fill a whole column. `device` is accepted for
    interface compatibility; the result is a plain numpy array of `dtype`.
    """
    arr = np.array(data, dtype=dtype)
    nbatch = len(arr) // batch_size
    trimmed = arr[: batch_size * nbatch]
    # reshape to (batch_size, nbatch) then transpose so time runs down axis 0
    return trimmed.reshape((batch_size, nbatch)).T

def get_batch(batches, i, bptt, device=None, dtype=None):
    """
    Slice a batchified (seq_len, bs) array into an input chunk of up to `bptt`
    rows starting at row i, plus the flattened next-step targets, both wrapped
    as Tensors on the requested device/dtype.
    """
    # shrink the window near the end of the data
    seq = min(bptt, len(batches) - 1 - i)
    X = Tensor(batches[i:i + seq], device=device, dtype=dtype)
    y = Tensor(batches[i + 1:i + 1 + seq].reshape(-1), device=device, dtype=dtype)
    return X, y

Part 7: Training a word-level language model

  1. python/needle/nn.py implement Embedding
  2. apps/models.py implement LanguageModel
  3. apps/simple_training.py implement epoch_general_ptbtrain_ptb, and evaluate_ptb

nn.py

class Embedding(Module):
    def __init__(self, num_embeddings, embedding_dim, device=None, dtype="float32"):
        super().__init__()
        """
        Maps one-hot word vectors from a dictionary of fixed size to embeddings.

        Parameters:
        num_embeddings (int) - Size of the dictionary
        embedding_dim (int) - The size of each embedding vector

        Variables:
        weight - The learnable weights of shape (num_embeddings, embedding_dim)
            initialized from N(0, 1).
        """
        ### BEGIN YOUR SOLUTION
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        self.weight = Parameter(init.randn(num_embeddings, embedding_dim, device=device, dtype=dtype))
        ### END YOUR SOLUTION

    def forward(self, x: Tensor) -> Tensor:
        """
        Maps word indices to one-hot vectors, and projects to embedding vectors

        Input:
        x of shape (seq_len, bs)

        Output:
        output of shape (seq_len, bs, embedding_dim)
        """
        ### BEGIN YOUR SOLUTION
        # The one-hot tensor is a temporary; the original also stored it in
        # self.one_hot, which kept a (seq_len, bs, vocab)-sized tensor alive
        # on the module after every forward for no benefit.
        one_hot = init.one_hot(self.num_embeddings, x, device=x.device, dtype=x.dtype)
        seq_len, bs, num_emb = one_hot.shape
        flat = one_hot.reshape((seq_len * bs, num_emb))
        out = flat @ self.weight
        return out.reshape((seq_len, bs, self.embedding_dim))
        ### END YOUR SOLUTION

models.py


class LanguageModel(nn.Module):
    def __init__(self, embedding_size, output_size, hidden_size, num_layers=1,
                 seq_model='rnn', device=None, dtype="float32"):
        """
        Consists of an embedding layer, a sequence model (either RNN or LSTM), and a
        linear layer.
        Parameters:
        output_size: Size of dictionary
        embedding_size: Size of embeddings
        hidden_size: The number of features in the hidden state of LSTM or RNN
        seq_model: 'rnn' or 'lstm', whether to use RNN or LSTM
        num_layers: Number of layers in RNN or LSTM
        """
        super(LanguageModel, self).__init__()
        self.embedding_size = embedding_size
        self.output_size = output_size
        self.hidden_size = hidden_size

        # embedding -> sequence model -> vocabulary projection
        self.embedding = nn.Embedding(output_size, embedding_size, device=device, dtype=dtype)
        if seq_model == 'lstm':
            self.model = nn.LSTM(embedding_size, hidden_size, num_layers, device=device, dtype=dtype)
        elif seq_model == 'rnn':
            self.model = nn.RNN(embedding_size, hidden_size, num_layers, device=device, dtype=dtype)
        else:
            raise NotImplementedError
        self.linear = nn.Linear(hidden_size, output_size, device=device, dtype=dtype)

    def forward(self, x, h=None):
        """
        Given sequence (and the previous hidden state if given), returns probabilities of next word
        (along with the last hidden state from the sequence model).
        Inputs:
        x of shape (seq_len, bs)
        h of shape (num_layers, bs, hidden_size) if using RNN,
            else h is tuple of (h0, c0), each of shape (num_layers, bs, hidden_size)
        Returns (out, h)
        out of shape (seq_len*bs, output_size)
        h of shape (num_layers, bs, hidden_size) if using RNN,
            else h is tuple of (h0, c0), each of shape (num_layers, bs, hidden_size)
        """
        seq_len, bs = x.shape
        embedded = self.embedding(x)
        features, hidden = self.model(embedded, h)
        # fold time and batch together so the linear layer sees 2-D input
        flat = features.reshape((seq_len * bs, self.hidden_size))
        logits = self.linear(flat)
        return logits, hidden

simple_training.py

### PTB training ###
def epoch_general_ptb(data, model, seq_len=40, loss_fn=nn.SoftmaxLoss(), opt=None,
        clip=None, device=None, dtype="float32"):
    """
    Run one epoch over batchified PTB `data` (shape (nbatch, batch_size)).
    Trains when `opt` is given, otherwise evaluates.
    Returns (average accuracy, average loss) per target token.

    NOTE(review): `clip` (gradient clipping) is accepted but never applied;
    removed the leftover per-step debug print from the original.
    """
    np.random.seed(4)
    correct, loss_sum, n_samples = 0.0, 0.0, 0
    if opt:
        model.train()
    else:
        model.eval()
    h = None
    for i in range(0, data.shape[0] - 1, seq_len):
        X, y = ndl.data.get_batch(data, i, seq_len, device=device, dtype=dtype)
        if opt:
            opt.reset_grad()
        pred, h = model(X, h)
        # Truncated BPTT: detach the hidden state so the autograd graph does
        # not grow across chunks (LSTM returns an (h, c) tuple).
        if isinstance(h, tuple):
            h = (h[0].detach(), h[1].detach())
        else:
            h = h.detach()
        loss = loss_fn(pred, y)

        correct += (pred.numpy().argmax(axis=1) == y.numpy()).sum()
        if opt:
            loss.backward()
            opt.step()
        # accumulate loss weighted by the number of targets in this chunk
        loss_sum += loss.numpy() * y.shape[0]
        n_samples += y.shape[0]
    return correct / n_samples, loss_sum / n_samples


def train_ptb(model, data, seq_len=40, n_epochs=1, optimizer=ndl.optim.SGD,
          lr=4.0, weight_decay=0.0, loss_fn=nn.SoftmaxLoss, clip=None,
          device=None, dtype="float32"):
    """Train `model` on batchified `data`; returns the last epoch's (acc, loss)."""
    np.random.seed(4)
    opt = optimizer(model.parameters(), lr=lr, weight_decay=weight_decay)
    for epoch in range(n_epochs):
        acc, avg_loss = epoch_general_ptb(
            data, model, seq_len=seq_len, loss_fn=loss_fn(), opt=opt,
            clip=clip, device=device, dtype=dtype)
        print(acc, "  ", avg_loss)
    return acc, avg_loss


def evaluate_ptb(model, data, seq_len=40, loss_fn=nn.SoftmaxLoss,
        device=None, dtype="float32"):
    """Run a single evaluation epoch (no optimizer); returns (acc, loss)."""
    np.random.seed(4)
    acc, avg_loss = epoch_general_ptb(
        data, model, seq_len=seq_len, loss_fn=loss_fn(),
        device=device, dtype=dtype)
    print(acc, "  ", avg_loss)
    return acc, avg_loss

device=device,dtype=dtype 形参、实参都不能省

optim.py中SGD函数需要改动,要么加device,要么直接用parm.grad

两个本地测试都不能完全过去,第一个会检查有没有无梯度的参数,因为存在无梯度的参数,所以过不去。 第二个是算出来结果和给定值不一样,过不去。 但似乎不影响最后训练。

python3 -m pytest -l -v -k "language_model_implementation" 
python3 -m pytest -l -v -k "language_model_training"

总结

作业4很难,主要难在和之前作业变化有点大,改了一个又要改其他的,一定要注意现在要考虑device和dtype,函数实现一定要显式写清楚设备和数据类型参数。

完成了一遍,但也是照着别人写的,关于如何调用cpu和gpu等还不是很懂。准备过段时间再复习一遍。