1. 使用 :numref:`sec_model_construction` 中定义的 `FancyMLP` 模型，访问各个层的参数。

（注：下面的解答以自定义的 `ParallelBlock` 模型代替 `FancyMLP` 来演示如何访问各层参数，访问方法完全相同。）

X = torch.rand(2, 20)

class ParallelBlock(nn.Module):
    """Run several sub-modules on the same input and concatenate their outputs.

    Each positional argument is registered as a child under the string form
    of its position ("0", "1", ...), so parameter names are prefixed with
    that index.
    """

    def __init__(self, *args):
        """Register every module in ``args`` as a child, keyed by position."""
        super().__init__()
        for position, branch in enumerate(args):
            # Writing into _modules is what makes the branch visible to
            # parameters(), named_parameters() and print(net).
            self._modules[str(position)] = branch

    def forward(self, X):
        """Apply every branch to ``X`` and join the results along dim 1."""
        branch_outputs = [branch(X) for branch in self._modules.values()]
        return torch.cat(branch_outputs, dim=1)
        
# Two sub-networks with the same architecture (20 -> 64 -> 32) but separate
# parameter tensors.
net1 = nn.Sequential(nn.Linear(20, 64), nn.ReLU(), nn.Linear(64, 32), nn.ReLU())
net2 = nn.Sequential(nn.Linear(20, 64), nn.ReLU(), nn.Linear(64, 32), nn.ReLU())
# ParallelBlock feeds X to both branches and concatenates along dim=1,
# so the result has 32 + 32 = 64 columns per sample.
net = ParallelBlock(net1, net2)
net(X)

tensor([[0.0000, 0.0000, 0.0115, 0.0000, 0.0000, 0.0000, 0.2275, 0.0632, 0.0039,
         0.0000, 0.1527, 0.0000, 0.0000, 0.2280, 0.0000, 0.1764, 0.0000, 0.0000,
         0.0716, 0.0261, 0.1223, 0.0000, 0.0716, 0.0683, 0.0000, 0.0000, 0.0236,
         0.0000, 0.0559, 0.0000, 0.1414, 0.0404, 0.0000, 0.0000, 0.0000, 0.0267,
         0.0000, 0.1426, 0.0000, 0.0000, 0.0209, 0.0560, 0.0031, 0.0569, 0.0540,
         0.0000, 0.0000, 0.2587, 0.1901, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.3943, 0.0000, 0.0000, 0.0966, 0.0000, 0.1847, 0.1719, 0.0299, 0.0000,
         0.0000],
        [0.0577, 0.0000, 0.0370, 0.0000, 0.0000, 0.0033, 0.3014, 0.0000, 0.1267,
         0.0000, 0.1927, 0.0000, 0.0000, 0.2279, 0.0000, 0.1457, 0.0000, 0.0000,
         0.0779, 0.0288, 0.0533, 0.0000, 0.1213, 0.0000, 0.0000, 0.0000, 0.0224,
         0.0000, 0.0689, 0.0000, 0.0961, 0.1160, 0.0000, 0.0000, 0.0000, 0.0787,
         0.0000, 0.1249, 0.0000, 0.0000, 0.0000, 0.0961, 0.0561, 0.0474, 0.0839,
         0.0000, 0.0000, 0.2352, 0.1076, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.3123, 0.0000, 0.0000, 0.1224, 0.0000, 0.1729, 0.0000, 0.0606, 0.0000,
         0.0000]], grad_fn=<CatBackward0>)

print(net)

ParallelBlock(
  (0): Sequential(
    (0): Linear(in_features=20, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=32, bias=True)
    (3): ReLU()
  )
  (1): Sequential(
    (0): Linear(in_features=20, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=32, bias=True)
    (3): ReLU()
  )
)

# Print a (qualified name, shape) pair for every parameter. Names have the form
# "<branch index>.<layer index>.<weight|bias>"; ReLU layers have no parameters,
# so only layer indices 0 and 2 appear.
print(*[(name, param.shape) for name, param in net.named_parameters()])

('0.0.weight', torch.Size([64, 20])) ('0.0.bias', torch.Size([64])) 
('0.2.weight', torch.Size([32, 64])) ('0.2.bias', torch.Size([32])) 
('1.0.weight', torch.Size([64, 20])) ('1.0.bias', torch.Size([64])) 
('1.2.weight', torch.Size([32, 64])) ('1.2.bias', torch.Size([32]))

2. 查看初始化模块文档以了解不同的初始化方法。

参见 PyTorch 官方文档 `torch.nn.init`：https://pytorch.org/docs/stable/nn.init.html

# calculate_gain returns the recommended scaling factor for a nonlinearity.
# For leaky_relu it is sqrt(2 / (1 + negative_slope ** 2)); with slope 0.2
# that is sqrt(2 / 1.04) ~= 1.3868.
gain = nn.init.calculate_gain('leaky_relu', 0.2)  # leaky_relu with negative_slope=0.2
gain

1.3867504905630728

# uniform_ fills w in place; with the default bounds (a=0, b=1) every entry
# is drawn from U(0, 1).
w = torch.empty(3, 5)
nn.init.uniform_(w)

tensor([[0.1377, 0.3783, 0.1233, 0.9311, 0.0264],
        [0.7690, 0.0774, 0.2472, 0.6879, 0.4036],
        [0.3922, 0.3004, 0.8534, 0.6462, 0.9871]])

# normal_ fills w in place; the defaults (mean=0, std=1) give a standard normal.
w = torch.empty(3, 5)
nn.init.normal_(w)

tensor([[-0.2756, -0.1585,  0.9466,  0.4713, -1.0736],
        [-0.4746,  0.9015, -0.2237,  0.1088, -1.0558],
        [ 0.2747,  0.4570,  0.4468, -2.0215, -0.0246]])

# constant_ sets every entry of w to the given value (0.3 here).
w = torch.empty(3, 5)
nn.init.constant_(w, 0.3)

tensor([[0.3000, 0.3000, 0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000, 0.3000, 0.3000]])

# ones_ sets every entry of w to 1.
w = torch.empty(3, 5)
nn.init.ones_(w)

tensor([[1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.]])

# zeros_ sets every entry of w to 0.
w = torch.empty(3, 5)
nn.init.zeros_(w)

tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]])

# eye_ writes the identity matrix into a 2-D tensor (5x5 here).
w = torch.empty(5, 5)
nn.init.eye_(w)

tensor([[1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1.]])

# dirac_ works on convolution-weight shaped tensors (3, 4 or 5 dims, laid out
# as out_channels x in_channels x kernel...). It zeroes the tensor and sets
# w[d, d, kernel_size // 2] = 1 so that the convolution passes input channel d
# straight through; with kernel size 2 the one lands in column 1.
# The second positional argument is `groups`: with groups=3 the 6 output
# channels are split into 3 groups of 2 and the same pattern repeats per group.
w = torch.empty(1, 2, 2)
w2 = torch.empty(6, 2, 2)
nn.init.dirac_(w), nn.init.dirac_(w2, 3)

(tensor([[[0., 1.],
          [0., 0.]]]),
 tensor([[[0., 1.],
          [0., 0.]],
 
         [[0., 0.],
          [0., 1.]],
 
         [[0., 1.],
          [0., 0.]],
 
         [[0., 0.],
          [0., 1.]],
 
         [[0., 1.],
          [0., 0.]],
 
         [[0., 0.],
          [0., 1.]]]))

# Xavier (Glorot) uniform: entries are drawn from U(-b, b) with
# b = gain * sqrt(6 / (fan_in + fan_out)). For a 3x5 tensor fan_in=5 and
# fan_out=3; with the ReLU gain sqrt(2) this gives b = sqrt(1.5) ~= 1.22.
w = torch.empty(3, 5)
nn.init.xavier_uniform_(w, gain=nn.init.calculate_gain('relu'))

tensor([[ 0.3049, -0.3659,  0.6783, -0.3202,  1.1596],
        [ 0.4212,  0.9982, -0.1972,  0.1825, -0.2864],
        [-0.6435,  0.6522, -0.5300, -0.0236,  0.4392]])

# Xavier (Glorot) normal: entries are drawn from N(0, std^2) with
# std = gain * sqrt(2 / (fan_in + fan_out)). With the default gain of 1 and
# fan_in + fan_out = 8, std = 0.5.
w = torch.empty(3, 5)
nn.init.xavier_normal_(w)

tensor([[-0.4608, -0.0078,  0.0058, -0.0527,  0.5002],
        [ 0.2446, -0.3416, -0.8208,  1.2841,  0.0303],
        [ 0.0835, -0.2286, -0.4943,  0.7856,  0.0292]])

# Kaiming (He) uniform: entries are drawn from U(-b, b) with
# b = gain * sqrt(3 / fan), where mode selects the fan. Here fan_in=5 and the
# ReLU gain is sqrt(2), so b = sqrt(6 / 5) ~= 1.10.
w = torch.empty(3, 5)
nn.init.kaiming_uniform_(w, mode='fan_in', nonlinearity='relu')

tensor([[-0.9845,  0.9773, -0.5689, -0.0049, -1.0222],
        [ 0.6000, -0.9605,  0.2954,  0.5001,  0.1511],
        [-0.5534, -0.2624,  0.7569,  0.5353, -0.3299]])

# Kaiming (He) normal: entries are drawn from N(0, std^2) with
# std = gain / sqrt(fan). With mode='fan_out' the fan is 3 for a 3x5 tensor,
# so std = sqrt(2 / 3) ~= 0.82.
w = torch.empty(3, 5)
nn.init.kaiming_normal_(w, mode='fan_out', nonlinearity='relu')

tensor([[-0.9855, -0.9682, -1.0015, -0.0387, -0.2912],
        [-0.1965,  2.2835, -0.4523, -0.7682,  0.6579],
        [ 0.6841, -0.4058, -1.1320,  1.1610,  1.0600]])

# trunc_normal_ draws from a normal distribution restricted to [a, b].
# With the defaults (mean=0, std=1, a=-2, b=2) every entry lies within [-2, 2].
w = torch.empty(3, 5)
nn.init.trunc_normal_(w)

tensor([[-0.6127,  1.3798,  0.0973,  0.1256,  0.6524],
        [-0.8308, -0.0908, -0.6288, -1.0203, -1.1182],
        [ 1.1715,  0.0958,  0.5504,  0.7930,  1.7137]])

# orthogonal_ fills w with a (semi-)orthogonal matrix. A 3x5 tensor has fewer
# rows than columns, so it is the rows that are orthonormal (each row in the
# printed result has unit norm).
w = torch.empty(3, 5)
nn.init.orthogonal_(w)

tensor([[ 0.3103, -0.7428,  0.1702, -0.2517, -0.5096],
        [-0.2807, -0.2295, -0.4385, -0.7304,  0.3779],
        [-0.2167, -0.0697, -0.7876,  0.3395, -0.4610]])

# sparse_ zeroes ceil(sparsity * rows) entries in every column and draws the
# rest from N(0, std^2) with the default std=0.01. Here ceil(0.1 * 3) = 1, so
# each column of the result contains exactly one zero.
w = torch.empty(3, 5)
nn.init.sparse_(w, sparsity=0.1)

tensor([[ 0.0000,  0.0046,  0.0173,  0.0000, -0.0054],
        [-0.0016,  0.0000,  0.0000,  0.0071,  0.0000],
        [ 0.0096, -0.0012,  0.0009, -0.0083, -0.0039]])

3. 构建包含共享参数层的多层感知机并对其进行训练。在训练过程中，观察模型各层的参数和梯度。

from d2l import torch as d2l

# Layer sizes: 784 input features (28 x 28 images), 10 classes, 256 hidden units.
num_inputs, num_outputs, num_hiddens = 784, 10, 256
dropout1, dropout2, dropout3 = 0.5, 0.3, 0.1

# One Linear instance placed at two positions (indices 4 and 7) of the
# Sequential: both positions read and update the same weight and bias tensors,
# and the gradients from both uses accumulate into the same .grad.
shared = nn.Linear(num_hiddens, num_hiddens)
net = nn.Sequential(nn.Flatten(),
                    nn.Linear(num_inputs, num_hiddens), nn.ReLU(), nn.Dropout(dropout1),
                    shared, nn.ReLU(), nn.Dropout(dropout2),
                    shared, nn.ReLU(), nn.Dropout(dropout3),
                    nn.Linear(num_hiddens, num_outputs))

num_epochs, lr, batch_size = 30, 0.5, 256
# reduction='none' keeps one loss value per sample.
# NOTE(review): d2l.train_ch3 is expected to reduce these before calling
# backward -- confirm against the installed d2l version.
loss = nn.CrossEntropyLoss(reduction='none')
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

trainer = torch.optim.SGD(net.parameters(), lr=lr)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, trainer)

# named_parameters() de-duplicates shared tensors by default, so the shared
# layer is listed once, under its first index (4); index 7 does not appear.
print(*[(name, param.shape) for name, param in net.named_parameters()])

('1.weight', torch.Size([256, 784])) ('1.bias', torch.Size([256])) 
('4.weight', torch.Size([256, 256])) ('4.bias', torch.Size([256])) 
('10.weight', torch.Size([10, 256])) ('10.bias', torch.Size([10]))

4. 为什么共享参数是个好主意？

共享参数是深度学习中一个重要的概念，它指的是在模型的不同部分使用相同的权重矩阵或参数。这种做法有多个优点：

减少参数量：共享参数可以显著减少模型中需要存储和学习的参数总数，从而降低内存需求。需要注意的是，共享层在每次被调用时仍要完整地参与计算，因此前向计算量本身未必减少。这一点对于大型模型尤其重要，因为它们可能包含数十亿甚至数万亿的参数。
提高泛化能力：共享参数可以迫使模型学习到更泛化的特征表示，因为相同的参数需要适应不同的输入数据。这有助于模型避免过拟合，并提高其在未见过的数据上的表现。
提高效率：由于参数共享减少了模型的规模，它还可以提高训练和推理的效率。较小的模型通常训练得更快，并且需要的计算资源更少。
正则化效果：在某种程度上，参数共享可以看作是一种正则化技术，因为它限制了模型可以学习的特征数量，从而减少了过拟合的风险。
简化模型结构：在某些情况下，参数共享可以简化模型的结构，使其更易于理解和解释。
迁移学习：共享参数是迁移学习的基础，其中在一个任务上训练的模型可以应用于另一个相关任务，而无需从头开始训练。
减少存储需求：较少的参数意味着模型占用的存储空间更少，这对于资源受限的环境（如移动设备）尤其重要。
经济性：在大规模部署机器学习模型时，减少参数数量可以降低硬件成本和运营成本。
模型压缩：参数共享是模型压缩技术的一部分，可以减少模型的大小，使其更适合在资源受限的平台上部署。
解决数据稀疏问题：在数据稀疏的情况下，共享参数可以帮助模型利用跨任务的共有信息，从而提高学习效率。

然而，共享参数也有其局限性，它可能不适用于所有类型的任务或模型。例如，在需要捕捉高度特定或复杂特征的任务中，不共享参数的模型可能会表现得更好。此外，参数共享可能会限制模型的容量，使其难以捕捉任务的所有细节。因此，是否共享参数以及如何共享，需要根据具体的应用场景和任务需求来决定。

参数管理｜深度学习计算｜动手学深度学习

1. 使用 :numref:`sec_model_construction` 中定义的 `FancyMLP` 模型，访问各个层的参数。

2. 查看初始化模块文档以了解不同的初始化方法。

3. 构建包含共享参数层的多层感知机并对其进行训练。在训练过程中，观察模型各层的参数和梯度。

4. 为什么共享参数是个好主意？

1. 使用 :numref:`sec_model_construction` 中定义的 `FancyMLP` 模型，访问各个层的参数。