PyTorch深度教程(六):内置模块与实战应用

阅读时长:约 19 分钟

前置要求:完成前五篇教程
核心目标:掌握PyTorch内置模块的原理与实战应用


第一部分:nn.Module核心模块

1.1 nn.Linear - 全连接层

1.1.1 数学原理与源码解析

"""
线性层数学公式:
y = xW^T + b

其中:
- x: 输入 (batch_size, in_features)
- W: 权重矩阵 (out_features, in_features)
- b: 偏置向量 (out_features,)
- y: 输出 (batch_size, out_features)
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt

class LinearLayerDeepDive:
    """Deep dive into the fully-connected (linear) layer."""

    def understand_linear_internals(self):
        """Inspect nn.Linear's parameters, then rebuild the layer by hand."""

        # Built-in layer computing y = x @ W.T + b
        fc = nn.Linear(in_features=10, out_features=5)

        # weight is (out_features, in_features); bias is (out_features,)
        print(f"权重形状: {fc.weight.shape}")  # (5, 10)
        print(f"偏置形状: {fc.bias.shape}")     # (5,)

        class MyLinear(nn.Module):
            """Hand-rolled equivalent of nn.Linear."""

            def __init__(self, in_features, out_features, bias=True):
                super().__init__()
                self.in_features = in_features
                self.out_features = out_features

                # Kaiming-style init: std scaled by sqrt(2 / fan_in)
                scale = (2.0 / in_features) ** 0.5
                self.weight = nn.Parameter(
                    torch.randn(out_features, in_features) * scale
                )

                if bias:
                    self.bias = nn.Parameter(torch.zeros(out_features))
                else:
                    self.register_parameter('bias', None)

            def forward(self, x):
                # y = x W^T (+ b)
                out = x @ self.weight.t()
                return out if self.bias is None else out + self.bias

        # Smoke-test the hand-rolled layer
        my_fc = MyLinear(10, 5)
        batch = torch.randn(32, 10)
        result = my_fc(batch)
        print(f"输出形状: {result.shape}")  # (32, 5)

    def linear_layer_applications(self):
        """Practical architectures built from linear layers."""

        class MLP(nn.Module):
            """Plain multi-layer perceptron: [Linear -> ReLU -> Dropout]* -> Linear."""

            def __init__(self, input_dim, hidden_dims, output_dim, dropout=0.5):
                super().__init__()

                modules = []
                width_in = input_dim

                # One (Linear, ReLU, Dropout) triple per hidden width
                for width_out in hidden_dims:
                    modules += [
                        nn.Linear(width_in, width_out),
                        nn.ReLU(),
                        nn.Dropout(dropout),
                    ]
                    width_in = width_out

                # Final projection to the output dimension
                modules.append(nn.Linear(width_in, output_dim))

                self.network = nn.Sequential(*modules)

            def forward(self, x):
                return self.network(x)

        class ResidualMLP(nn.Module):
            """Two-layer MLP block with a residual connection and LayerNorm."""

            def __init__(self, dim, hidden_dim):
                super().__init__()
                self.fc1 = nn.Linear(dim, hidden_dim)
                self.fc2 = nn.Linear(hidden_dim, dim)
                self.norm = nn.LayerNorm(dim)

            def forward(self, x):
                shortcut = x
                hidden = F.relu(self.fc1(x))
                projected = self.fc2(hidden)
                # Post-norm residual: normalize the sum, not the branch
                return self.norm(projected + shortcut)

        class MixtureOfExperts(nn.Module):
            """Soft MoE: a gate network blends the outputs of several expert layers."""

            def __init__(self, input_dim, hidden_dim, num_experts=4):
                super().__init__()
                self.num_experts = num_experts

                # Independent expert networks
                self.experts = nn.ModuleList(
                    nn.Linear(input_dim, hidden_dim) for _ in range(num_experts)
                )

                # Gating network producing one logit per expert
                self.gate = nn.Linear(input_dim, num_experts)

            def forward(self, x):
                # (batch, num_experts) mixing weights
                mixing = F.softmax(self.gate(x), dim=-1)

                # (batch, num_experts, hidden_dim) per-expert outputs
                stacked = torch.stack([expert(x) for expert in self.experts], dim=1)

                # Weighted sum over the expert axis
                return torch.einsum('be,beh->bh', mixing, stacked)

### 1.2 nn.Conv2d - 卷积层

#### 1.2.1 卷积运算深度解析

```python
class ConvolutionalLayerMastery:
    """Complete guide to convolutional layers."""

    def conv2d_mathematics(self):
        """
        Convolution formula:

        out[b,c_out,h,w] = Σ_{c_in} Σ_{kh} Σ_{kw}
                           input[b,c_in,h*stride+kh,w*stride+kw] *
                           weight[c_out,c_in,kh,kw] +
                           bias[c_out]

        Output size:
        H_out = floor((H_in + 2*padding - kernel_size) / stride) + 1
        W_out = floor((W_in + 2*padding - kernel_size) / stride) + 1
        """

        # Build a standard convolution layer
        conv = nn.Conv2d(
            in_channels=3,
            out_channels=64,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=True
        )

        # Input layout: (batch, channels, height, width)
        x = torch.randn(32, 3, 224, 224)
        y = conv(x)
        print(f"输出形状: {y.shape}")  # (32, 64, 224, 224)

        # Manual convolution (for teaching only — O(N*C*H*W) Python loops)
        def manual_conv2d(input, weight, bias, stride=1, padding=0):
            """Naive 2D convolution via explicit loops.

            input: (batch, in_ch, H, W); weight: (out_ch, in_ch, kh, kw);
            bias: (out_ch,) or None.
            """
            batch, in_ch, in_h, in_w = input.shape
            out_ch, _, kh, kw = weight.shape

            # Zero-pad all four spatial borders
            if padding > 0:
                input = F.pad(input, (padding,)*4)

            # Output size from the ORIGINAL in_h/in_w plus 2*padding
            # (consistent with the padded input used below)
            out_h = (in_h + 2*padding - kh) // stride + 1
            out_w = (in_w + 2*padding - kw) // stride + 1

            # Accumulator for the result
            output = torch.zeros(batch, out_ch, out_h, out_w)

            # One multiply-accumulate per output element
            for b in range(batch):
                for oc in range(out_ch):
                    for h in range(out_h):
                        for w in range(out_w):
                            h_start = h * stride
                            w_start = w * stride

                            # Receptive field spanning every input channel
                            receptive_field = input[
                                b, :,
                                h_start:h_start+kh,
                                w_start:w_start+kw
                            ]

                            # Sum over (in_ch, kh, kw), then add bias
                            output[b, oc, h, w] = torch.sum(
                                receptive_field * weight[oc]
                            ) + (bias[oc] if bias is not None else 0)

            return output

    def advanced_convolution_patterns(self):
        """Advanced convolution variants (classes are defined for illustration)."""

        # 1. Depthwise separable convolution
        class DepthwiseSeparableConv(nn.Module):
            """
            Factorized as:
            1. Depthwise conv: each channel convolved independently
            2. Pointwise conv: 1x1 conv mixes channels

            Parameter count: in*k^2 + in*out (vs standard conv: in*out*k^2)
            """
            def __init__(self, in_channels, out_channels, kernel_size=3):
                super().__init__()

                # Depthwise: groups=in_channels gives one filter per channel
                self.depthwise = nn.Conv2d(
                    in_channels, in_channels, kernel_size,
                    padding=kernel_size//2, groups=in_channels
                )

                # Pointwise: 1x1 convolution across channels
                self.pointwise = nn.Conv2d(in_channels, out_channels, 1)

            def forward(self, x):
                x = self.depthwise(x)
                x = self.pointwise(x)
                return x

        # 2. Deformable convolution (conceptual sketch — offsets are unused)
        class DeformableConv2d(nn.Module):
            """
            Learns spatial offsets for the sampling grid,
            allowing an irregular receptive field.
            """
            def __init__(self, in_channels, out_channels, kernel_size=3):
                super().__init__()
                self.kernel_size = kernel_size

                # Standard convolution
                self.conv = nn.Conv2d(
                    in_channels, out_channels, kernel_size,
                    padding=kernel_size//2
                )

                # Offset-prediction branch
                self.offset_conv = nn.Conv2d(
                    in_channels,
                    2 * kernel_size * kernel_size,  # x and y offset per kernel tap
                    kernel_size,
                    padding=kernel_size//2
                )

            def forward(self, x):
                # Predict offsets (computed but not applied below)
                offset = self.offset_conv(x)

                # A real deformable conv needs a custom kernel
                # (e.g. torchvision.ops.deform_conv2d); concept only:
                # output = deformable_conv2d(x, offset, self.conv.weight)

                # Simplified stand-in: plain convolution
                output = self.conv(x)
                return output

        # 3. Octave convolution
        class OctaveConv(nn.Module):
            """
            Processes features at two spatial frequencies:
            high frequency = fine detail, low frequency = global context.
            Input is a (x_high, x_low) pair; x_low is presumably at half
            the spatial resolution of x_high — verify against the caller.
            """
            def __init__(self, in_channels, out_channels, kernel_size=3, alpha=0.5):
                super().__init__()
                self.alpha = alpha

                # Channel split between the high/low branches (alpha = low fraction)
                in_high = int(in_channels * (1 - alpha))
                in_low = in_channels - in_high
                out_high = int(out_channels * (1 - alpha))
                out_low = out_channels - out_high

                # high -> high path
                self.high_to_high = nn.Conv2d(
                    in_high, out_high, kernel_size, padding=kernel_size//2
                )

                # high -> low path (stride 2 downsamples to low resolution)
                self.high_to_low = nn.Conv2d(
                    in_high, out_low, kernel_size,
                    stride=2, padding=kernel_size//2
                )

                # low -> high path (upsampled in forward)
                self.low_to_high = nn.Conv2d(
                    in_low, out_high, kernel_size, padding=kernel_size//2
                )

                # low -> low path
                self.low_to_low = nn.Conv2d(
                    in_low, out_low, kernel_size, padding=kernel_size//2
                )

            def forward(self, x):
                # Unpack the (high, low) frequency pair
                x_high, x_low = x

                # Compute the four inter-frequency paths
                high_to_high = self.high_to_high(x_high)
                high_to_low = self.high_to_low(x_high)

                # Upsample the low->high path to match x_high's spatial size
                low_to_high = F.interpolate(
                    self.low_to_high(x_low),
                    size=x_high.shape[2:],
                    mode='nearest'
                )
                low_to_low = self.low_to_low(x_low)

                # Merge paths per output frequency
                out_high = high_to_high + low_to_high
                out_low = high_to_low + low_to_low

                return out_high, out_low

```

### 1.3 nn.BatchNorm2d - 批归一化

#### 1.3.1 BatchNorm原理与实现

```python
class BatchNormalizationMastery:
    """Complete guide to batch normalization."""

    def batchnorm_mathematics(self):
        """
        BatchNorm math:

        Training:
        1. mu_B   = (1/m) Σ x_i                  # batch mean
        2. var_B  = (1/m) Σ (x_i - mu_B)^2       # (biased) batch variance
        3. x_hat  = (x_i - mu_B) / sqrt(var_B + eps)
        4. y_i    = gamma * x_hat + beta

        Inference:
        uses running statistics (exponential moving average).
        Note: nn.BatchNorm2d normalizes with the biased variance but tracks
        the UNBIASED variance in running_var — mirrored below.
        """

        # Reference implementation
        bn = nn.BatchNorm2d(num_features=64, momentum=0.1, eps=1e-5)

        x = torch.randn(32, 64, 56, 56)
        y = bn(x)

        class MyBatchNorm2d(nn.Module):
            """Hand-rolled BatchNorm2d matching nn.BatchNorm2d semantics."""

            def __init__(self, num_features, eps=1e-5, momentum=0.1):
                super().__init__()
                self.num_features = num_features
                self.eps = eps
                self.momentum = momentum

                # Learnable affine parameters
                self.gamma = nn.Parameter(torch.ones(num_features))
                self.beta = nn.Parameter(torch.zeros(num_features))

                # Running statistics (buffers: saved with the model, not trained)
                self.register_buffer('running_mean', torch.zeros(num_features))
                self.register_buffer('running_var', torch.ones(num_features))
                self.register_buffer('num_batches_tracked', torch.tensor(0, dtype=torch.long))

            def forward(self, x):
                # x: (N, C, H, W)
                if self.training:
                    # Batch statistics over the (N, H, W) axes
                    mean = x.mean(dim=(0, 2, 3))
                    # Biased variance is what normalization uses...
                    var = x.var(dim=(0, 2, 3), unbiased=False)

                    with torch.no_grad():
                        # ...but, like nn.BatchNorm2d, the running estimate
                        # tracks the UNBIASED (Bessel-corrected) variance.
                        # The original updated running_var with the biased
                        # value, so eval output diverged from nn.BatchNorm2d.
                        n = x.numel() / x.size(1)  # samples per channel
                        unbiased_var = var * n / max(n - 1, 1)
                        self.running_mean = (1 - self.momentum) * self.running_mean + \
                                            self.momentum * mean
                        self.running_var = (1 - self.momentum) * self.running_var + \
                                           self.momentum * unbiased_var
                        self.num_batches_tracked += 1

                    # Normalize with the current batch statistics
                    x_norm = (x - mean[None, :, None, None]) / \
                             torch.sqrt(var[None, :, None, None] + self.eps)
                else:
                    # Normalize with the running statistics
                    x_norm = (x - self.running_mean[None, :, None, None]) / \
                             torch.sqrt(self.running_var[None, :, None, None] + self.eps)

                # Affine scale and shift
                output = self.gamma[None, :, None, None] * x_norm + \
                         self.beta[None, :, None, None]

                return output

    def batchnorm_variants(self):
        """Normalization variants related to BatchNorm."""

        # 1. LayerNorm (the Transformer workhorse)
        class LayerNorm(nn.Module):
            """
            Normalizes over each sample's features independently of the batch —
            suitable for small batches and sequence data.
            """
            def __init__(self, normalized_shape, eps=1e-5):
                super().__init__()
                self.normalized_shape = normalized_shape
                self.eps = eps

                self.gamma = nn.Parameter(torch.ones(normalized_shape))
                self.beta = nn.Parameter(torch.zeros(normalized_shape))

            def forward(self, x):
                # x: (..., normalized_shape)
                mean = x.mean(dim=-1, keepdim=True)
                var = x.var(dim=-1, unbiased=False, keepdim=True)

                x_norm = (x - mean) / torch.sqrt(var + self.eps)
                output = self.gamma * x_norm + self.beta

                return output

        # 2. GroupNorm (between BatchNorm and LayerNorm)
        class GroupNorm(nn.Module):
            """
            Splits channels into groups and normalizes within each group;
            independent of batch size.
            """
            def __init__(self, num_groups, num_channels, eps=1e-5):
                super().__init__()
                assert num_channels % num_groups == 0

                self.num_groups = num_groups
                self.num_channels = num_channels
                self.eps = eps

                self.gamma = nn.Parameter(torch.ones(num_channels))
                self.beta = nn.Parameter(torch.zeros(num_channels))

            def forward(self, x):
                # x: (N, C, H, W)
                N, C, H, W = x.shape
                G = self.num_groups

                # Reshape to (N, G, C//G, H, W)
                x = x.view(N, G, C // G, H, W)

                # Normalize within each group
                mean = x.mean(dim=(2, 3, 4), keepdim=True)
                var = x.var(dim=(2, 3, 4), unbiased=False, keepdim=True)

                x_norm = (x - mean) / torch.sqrt(var + self.eps)

                # Restore the original layout
                x_norm = x_norm.view(N, C, H, W)

                # Affine scale and shift
                output = self.gamma[None, :, None, None] * x_norm + \
                         self.beta[None, :, None, None]

                return output

        # 3. InstanceNorm (common in style transfer)
        class InstanceNorm(nn.Module):
            """
            Normalizes each channel of each sample independently.
            """
            def __init__(self, num_features, eps=1e-5):
                super().__init__()
                self.num_features = num_features
                self.eps = eps

                self.gamma = nn.Parameter(torch.ones(num_features))
                self.beta = nn.Parameter(torch.zeros(num_features))

            def forward(self, x):
                # x: (N, C, H, W); statistics over the spatial axes only
                mean = x.mean(dim=(2, 3), keepdim=True)
                var = x.var(dim=(2, 3), unbiased=False, keepdim=True)

                x_norm = (x - mean) / torch.sqrt(var + self.eps)
                output = self.gamma[None, :, None, None] * x_norm + \
                         self.beta[None, :, None, None]

                return output

```

### 1.4 nn.Dropout - 正则化

```python
class DropoutMastery:
    """Complete guide to dropout."""

    def dropout_mathematics(self):
        """
        Dropout math:

        Training:
        mask ~ Bernoulli(1-p)
        output = input * mask / (1-p)  # inverted scaling keeps E[output] = input

        Inference:
        output = input  # dropout disabled
        """

        # Standard dropout
        dropout = nn.Dropout(p=0.5)

        x = torch.randn(32, 128)
        y_train = dropout(x)  # training mode: random zeroing + rescaling

        dropout.eval()
        y_test = dropout(x)   # inference mode: identity

        class MyDropout(nn.Module):
            """Hand-rolled inverted dropout with the p=0 / p=1 edges handled."""

            def __init__(self, p=0.5):
                super().__init__()
                # Validate up front: p outside [0, 1] is meaningless, and
                # p == 1 would otherwise divide by zero in forward().
                if not 0.0 <= p <= 1.0:
                    raise ValueError(f"dropout probability must be in [0, 1], got {p}")
                self.p = p

            def forward(self, x):
                if not self.training or self.p == 0.0:
                    return x
                if self.p == 1.0:
                    # Everything dropped (matches nn.Dropout(p=1)).
                    return torch.zeros_like(x)

                # Keep-mask, then scale by 1/(1-p) to preserve the expectation
                mask = (torch.rand_like(x) > self.p).float()
                return x * mask / (1 - self.p)

        # Smoke test
        my_dropout = MyDropout(0.5)
        y = my_dropout(x)

    def dropout_variants(self):
        """Dropout variants."""

        # 1. DropConnect: drop weights rather than activations
        class DropConnect(nn.Module):
            """Wraps a linear layer and drops individual weights (stronger regularizer)."""

            def __init__(self, linear, p=0.5):
                super().__init__()
                self.linear = linear
                self.p = p

            def forward(self, x):
                if not self.training or self.p == 0.0:
                    return self.linear(x)
                if self.p >= 1.0:
                    # All weights dropped: only the bias term remains
                    # (also avoids dividing by zero below).
                    return F.linear(x, torch.zeros_like(self.linear.weight),
                                    self.linear.bias)

                # Apply an inverted-dropout mask to the weight matrix
                weight = self.linear.weight
                mask = (torch.rand_like(weight) > self.p).float()
                dropped_weight = weight * mask / (1 - self.p)

                return F.linear(x, dropped_weight, self.linear.bias)

        # 2. Spatial dropout (for CNNs): drop whole feature maps
        class SpatialDropout2d(nn.Module):
            """Zeroes entire channels, preserving spatial correlation within maps."""

            def __init__(self, p=0.5):
                super().__init__()
                self.p = p

            def forward(self, x):
                # x: (N, C, H, W)
                if not self.training or self.p == 0.0:
                    return x
                if self.p >= 1.0:
                    # Every channel dropped; avoid division by zero.
                    return torch.zeros_like(x)

                # Per-channel mask, broadcast over H and W
                N, C, H, W = x.shape
                mask = (torch.rand(N, C, 1, 1, device=x.device) > self.p).float()

                return x * mask / (1 - self.p)

        # 3. DropBlock: a more aggressive spatial dropout
        class DropBlock2d(nn.Module):
            """Drops contiguous block_size x block_size regions — better at erasing semantics."""

            def __init__(self, p=0.1, block_size=7):
                super().__init__()
                self.p = p
                self.block_size = block_size

            def forward(self, x):
                if not self.training:
                    return x

                N, C, H, W = x.shape

                # gamma chosen so the expected dropped fraction is ~p
                gamma = self.p * (H * W) / (self.block_size ** 2) / \
                        ((H - self.block_size + 1) * (W - self.block_size + 1))

                # Sample block centres
                mask = torch.rand(N, C, H, W, device=x.device) < gamma

                # Grow centres into blocks via max-pooling, then crop back to
                # (H, W): for an even block_size the padded pool output is
                # (H+1, W+1), which made the original's mask shape mismatch x.
                mask = F.max_pool2d(
                    mask.float(),
                    kernel_size=self.block_size,
                    stride=1,
                    padding=self.block_size // 2
                )[:, :, :H, :W]

                # Invert to a keep-mask
                keep = 1 - mask

                # Renormalize to preserve the activation scale; if everything
                # was dropped, return zeros instead of dividing by zero (NaN).
                denom = keep.mean()
                if denom == 0:
                    return torch.zeros_like(x)

                return x * keep / denom

```

---

## 第二部分:激活函数模块

### 2.1 经典激活函数

```python
class ActivationFunctions:
    """A tour of activation functions."""

    def classic_activations(self):
        """Classic activations evaluated over a sample input range."""

        xs = torch.linspace(-5, 5, 100)

        # Module versions of the four classics
        out_relu = nn.ReLU()(xs)                            # max(0, x)
        out_sigmoid = nn.Sigmoid()(xs)                      # 1 / (1 + e^-x)
        out_tanh = nn.Tanh()(xs)                            # hyperbolic tangent
        out_leaky = nn.LeakyReLU(negative_slope=0.01)(xs)   # max(0.01x, x)

        class ManualActivations:
            """The same activations written from first principles."""

            @staticmethod
            def relu(x):
                return torch.maximum(x, torch.zeros_like(x))

            @staticmethod
            def sigmoid(x):
                return 1 / (1 + torch.exp(-x))

            @staticmethod
            def tanh(x):
                # (e^x - e^-x) / (e^x + e^-x)
                pos = torch.exp(x)
                neg = torch.exp(-x)
                return (pos - neg) / (pos + neg)

            @staticmethod
            def leaky_relu(x, alpha=0.01):
                return torch.where(x > 0, x, alpha * x)

    def modern_activations(self):
        """Modern activation functions."""

        class GELU(nn.Module):
            """GELU(x) = x * Phi(x) with Phi the standard normal CDF,
            via the tanh approximation: 0.5 x (1 + tanh(sqrt(2/pi)(x + 0.044715 x^3)))."""

            def forward(self, x):
                inner = np.sqrt(2 / np.pi) * (x + 0.044715 * torch.pow(x, 3))
                return 0.5 * x * (1 + torch.tanh(inner))

        class Swish(nn.Module):
            """Swish/SiLU: x * sigmoid(x), a self-gated activation."""

            def forward(self, x):
                return x * torch.sigmoid(x)

        class Mish(nn.Module):
            """Mish: x * tanh(softplus(x)) — smooth and non-monotonic."""

            def forward(self, x):
                return x * torch.tanh(F.softplus(x))

        class Hardswish(nn.Module):
            """Piecewise-linear Swish approximation (mobile-friendly):
            x * relu6(x + 3) / 6."""

            def forward(self, x):
                return x * F.relu6(x + 3) / 6

        # PReLU learns a negative slope, here one per channel
        prelu = nn.PReLU(num_parameters=64)

        class ELU(nn.Module):
            """ELU: identity for x > 0, alpha * (e^x - 1) for x <= 0."""

            def __init__(self, alpha=1.0):
                super().__init__()
                self.alpha = alpha

            def forward(self, x):
                negative_branch = self.alpha * (torch.exp(x) - 1)
                return torch.where(x > 0, x, negative_branch)

```

### 2.2 注意力机制

```python
class AttentionMechanisms:
    """Attention mechanisms, built from scratch."""

    def scaled_dot_product_attention(self):
        """Scaled dot-product attention."""

        class ScaledDotProductAttention(nn.Module):
            """
            Attention(Q,K,V) = softmax(QK^T / sqrt(d_k)) V
            """
            def __init__(self, dropout=0.1):
                super().__init__()
                self.dropout = nn.Dropout(dropout)

            def forward(self, q, k, v, mask=None):
                # q, k, v: (batch, heads, seq_len, d_k)
                d_k = q.size(-1)

                # Similarity scores, scaled to keep softmax gradients healthy
                scores = torch.matmul(q, k.transpose(-2, -1)) / np.sqrt(d_k)

                # Masked positions get -inf so softmax zeroes them out
                if mask is not None:
                    scores = scores.masked_fill(mask == 0, float('-inf'))

                attn_weights = F.softmax(scores, dim=-1)
                attn_weights = self.dropout(attn_weights)

                # Weighted sum of the values
                output = torch.matmul(attn_weights, v)

                return output, attn_weights

    def multi_head_attention(self):
        """Multi-head attention.

        NOTE(fix): the original instantiated ScaledDotProductAttention, which
        is defined only inside another method's local scope — building the
        module raised NameError. The attention step is now computed inline.
        """

        class MultiHeadAttention(nn.Module):
            """
            MultiHead(Q,K,V) = Concat(head_1,...,head_h) W^O
            where head_i = Attention(Q W^Q_i, K W^K_i, V W^V_i)
            """
            def __init__(self, d_model, num_heads, dropout=0.1):
                super().__init__()
                assert d_model % num_heads == 0

                self.d_model = d_model
                self.num_heads = num_heads
                self.d_k = d_model // num_heads

                # Per-head projections fused into single d_model x d_model maps
                self.W_q = nn.Linear(d_model, d_model)
                self.W_k = nn.Linear(d_model, d_model)
                self.W_v = nn.Linear(d_model, d_model)
                self.W_o = nn.Linear(d_model, d_model)

                self.attn_dropout = nn.Dropout(dropout)
                self.dropout = nn.Dropout(dropout)

            def forward(self, q, k, v, mask=None):
                batch_size = q.size(0)

                # Project and split into heads: (batch, heads, seq, d_k)
                q = self.W_q(q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
                k = self.W_k(k).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
                v = self.W_v(v).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)

                # Scaled dot-product attention, inline
                scores = torch.matmul(q, k.transpose(-2, -1)) / np.sqrt(self.d_k)
                if mask is not None:
                    scores = scores.masked_fill(mask == 0, float('-inf'))
                attn_weights = self.attn_dropout(F.softmax(scores, dim=-1))
                output = torch.matmul(attn_weights, v)

                # Merge heads back: (batch, seq, d_model)
                output = output.transpose(1, 2).contiguous().view(
                    batch_size, -1, self.d_model
                )

                # Output projection
                output = self.W_o(output)

                return output, attn_weights

    def self_attention_variants(self):
        """Self-attention variants.

        NOTE(fix): the original instantiated MultiHeadAttention from another
        method's local scope (NameError at construction time); a
        self-contained copy is defined here for the variants to use.
        """

        class MultiHeadAttention(nn.Module):
            """Minimal self-contained multi-head attention for the variants below."""

            def __init__(self, d_model, num_heads, dropout=0.1):
                super().__init__()
                assert d_model % num_heads == 0
                self.d_model = d_model
                self.num_heads = num_heads
                self.d_k = d_model // num_heads
                self.W_q = nn.Linear(d_model, d_model)
                self.W_k = nn.Linear(d_model, d_model)
                self.W_v = nn.Linear(d_model, d_model)
                self.W_o = nn.Linear(d_model, d_model)
                self.attn_dropout = nn.Dropout(dropout)

            def forward(self, q, k, v, mask=None):
                batch_size = q.size(0)
                q = self.W_q(q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
                k = self.W_k(k).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
                v = self.W_v(v).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
                scores = torch.matmul(q, k.transpose(-2, -1)) / np.sqrt(self.d_k)
                if mask is not None:
                    scores = scores.masked_fill(mask == 0, float('-inf'))
                attn_weights = self.attn_dropout(F.softmax(scores, dim=-1))
                output = torch.matmul(attn_weights, v)
                output = output.transpose(1, 2).contiguous().view(
                    batch_size, -1, self.d_model
                )
                return self.W_o(output), attn_weights

        # 1. Relative-position attention
        class RelativePositionAttention(nn.Module):
            """Adds (a simplified form of) relative positional information."""

            def __init__(self, d_model, num_heads, max_len=512):
                super().__init__()
                self.mha = MultiHeadAttention(d_model, num_heads)

                # One embedding per relative offset in [-(max_len-1), max_len-1]
                self.relative_pos_embedding = nn.Embedding(
                    2 * max_len - 1, d_model
                )

            def forward(self, x):
                # x: (batch, seq_len, d_model)
                seq_len = x.size(1)

                # Relative offsets, shifted to non-negative embedding indices
                positions = torch.arange(seq_len, device=x.device)
                relative_positions = positions[None, :] - positions[:, None]
                relative_positions = relative_positions + seq_len - 1

                pos_embed = self.relative_pos_embedding(relative_positions)

                # Standard self-attention
                output, _ = self.mha(x, x, x)

                # Inject position info (simplified additive form)
                output = output + pos_embed.mean(0, keepdim=True)

                return output

        # 2. Local (windowed) attention
        class LocalAttention(nn.Module):
            """Attends only within fixed windows to reduce the O(n^2) cost."""

            def __init__(self, d_model, num_heads, window_size=256):
                super().__init__()
                self.window_size = window_size
                self.mha = MultiHeadAttention(d_model, num_heads)

            def forward(self, x):
                # x: (batch, seq_len, d_model)
                batch_size, seq_len, d_model = x.shape

                # Split the sequence into full windows
                num_windows = seq_len // self.window_size
                x_windows = x[:, :num_windows * self.window_size].view(
                    batch_size, num_windows, self.window_size, d_model
                )

                # Attention independently inside each window
                parts = []
                for i in range(num_windows):
                    window = x_windows[:, i]
                    out, _ = self.mha(window, window, window)
                    parts.append(out)

                # A tail shorter than one window gets its own attention pass.
                # This also covers seq_len < window_size, where the original
                # crashed calling torch.cat on an empty list.
                if seq_len % self.window_size != 0:
                    remaining = x[:, num_windows * self.window_size:]
                    remaining_out, _ = self.mha(remaining, remaining, remaining)
                    parts.append(remaining_out)

                return torch.cat(parts, dim=1)

```

---

## 第三部分:损失函数

### 3.1 分类损失

```python
class ClassificationLosses:
    """Loss functions for classification."""

    def cross_entropy_deep_dive(self):
        """Cross-entropy: built-in vs. manual, class weighting, label smoothing."""

        criterion = nn.CrossEntropyLoss()

        # CrossEntropyLoss consumes raw logits (it applies log-softmax itself)
        logits = torch.randn(32, 10)
        targets = torch.randint(0, 10, (32,))

        loss = criterion(logits, targets)

        def manual_cross_entropy(logits, targets):
            """CE = batch mean of -log softmax(logits)[target]."""
            log_probs = F.log_softmax(logits, dim=-1)

            # Pick each sample's true-class log-probability
            picked = log_probs.gather(1, targets.unsqueeze(1)).squeeze(1)

            # Negative log-likelihood, averaged
            return (-picked).mean()

        manual_loss = manual_cross_entropy(logits, targets)
        print(f"PyTorch: {loss.item():.4f}, Manual: {manual_loss.item():.4f}")

        # Per-class weights counteract class imbalance
        class_weights = torch.tensor([1.0, 2.0, 3.0] + [1.0]*7)
        weighted_ce = nn.CrossEntropyLoss(weight=class_weights)

        class LabelSmoothingCrossEntropy(nn.Module):
            """Label smoothing (curbs overconfidence):
            y_smooth = (1-eps) y_true + eps spread over the other classes."""

            def __init__(self, epsilon=0.1):
                super().__init__()
                self.epsilon = epsilon

            def forward(self, logits, targets):
                num_classes = logits.size(-1)
                log_probs = F.log_softmax(logits, dim=-1)

                # Build the smoothed target distribution (no gradient needed)
                with torch.no_grad():
                    smoothed = torch.full_like(
                        log_probs, self.epsilon / (num_classes - 1)
                    )
                    smoothed.scatter_(1, targets.unsqueeze(1), 1.0 - self.epsilon)

                return torch.mean(torch.sum(-smoothed * log_probs, dim=-1))

    def focal_loss(self):
        """Focal loss — handles class imbalance."""

        class FocalLoss(nn.Module):
            """FL(p_t) = -alpha (1 - p_t)^gamma log(p_t):
            down-weights easy examples so training focuses on hard ones."""

            def __init__(self, alpha=0.25, gamma=2.0):
                super().__init__()
                self.alpha = alpha
                self.gamma = gamma

            def forward(self, logits, targets):
                # Per-sample cross-entropy (no reduction yet)
                per_sample_ce = F.cross_entropy(logits, targets, reduction='none')

                # Recover p_t, the predicted probability of the true class
                p_t = torch.exp(-per_sample_ce)

                # Modulating factor vanishes for well-classified samples
                modulating = (1 - p_t) ** self.gamma

                return (self.alpha * modulating * per_sample_ce).mean()

```

### 3.2 回归损失

```python
class RegressionLosses:
    """Loss functions for regression."""

    def common_regression_losses(self):
        """Common regression losses: built-ins plus hand-written equivalents."""

        pred = torch.randn(32, 1)
        target = torch.randn(32, 1)

        # 1. MSE (mean squared error)
        mse_loss = nn.MSELoss()
        mse = mse_loss(pred, target)

        # 2. MAE (mean absolute error) / L1 loss
        mae_loss = nn.L1Loss()
        mae = mae_loss(pred, target)

        # 3. Smooth L1 (a Huber-loss variant with delta = 1)
        smooth_l1 = nn.SmoothL1Loss()
        loss = smooth_l1(pred, target)

        def manual_losses(pred, target):
            """Return (mse, mae, smooth_l1) computed by hand."""
            # MSE
            mse = torch.mean((pred - target) ** 2)

            # MAE
            mae = torch.mean(torch.abs(pred - target))

            # Smooth L1: quadratic near zero, linear in the tails
            diff = torch.abs(pred - target)
            smooth_l1 = torch.where(
                diff < 1,
                0.5 * diff ** 2,
                diff - 0.5
            ).mean()

            return mse, mae, smooth_l1

        # Exercise the manual versions (the original defined but never ran them)
        manual_losses(pred, target)

    def huber_loss(self):
        """Huber loss — robust regression."""

        class HuberLoss(nn.Module):
            """
            Blends L1 and L2: quadratic for small errors, linear for large
            ones, so outliers have bounded influence.

            L_delta(a) = 0.5 * a^2                 if |a| <= delta
                         delta * (|a| - 0.5*delta) otherwise
            """
            def __init__(self, delta=1.0):
                super().__init__()
                self.delta = delta

            def forward(self, pred, target):
                error = torch.abs(pred - target)

                # clamp stays on `error`'s device/dtype; the original built a
                # fresh CPU tensor with torch.tensor(self.delta), which fails
                # for CUDA inputs and can silently upcast dtypes.
                quadratic = torch.clamp(error, max=self.delta)
                linear = error - quadratic

                loss = 0.5 * quadratic ** 2 + self.delta * linear

                return loss.mean()

    def quantile_loss(self):
        """Quantile (pinball) loss — for prediction intervals."""

        class QuantileLoss(nn.Module):
            """
            L_tau(y, y_hat) = max(tau * (y - y_hat), (tau - 1) * (y - y_hat))

            Minimizing it makes y_hat estimate the tau-quantile of y.
            """
            def __init__(self, quantile=0.5):
                super().__init__()
                self.quantile = quantile

            def forward(self, pred, target):
                error = target - pred
                # Asymmetric penalty: under- vs over-prediction weighted by tau
                loss = torch.max(
                    self.quantile * error,
                    (self.quantile - 1) * error
                )
                return loss.mean()

```

### 3.3 对比学习损失

```python
class ContrastiveLearningLosses:
    """Contrastive-learning loss functions.

    Each factory method builds a loss module and returns the class so it
    can be instantiated and reused (consistent with the other builders
    in this tutorial).
    """

    def contrastive_loss(self):
        """Contrastive loss for learning a similarity metric.

        Returns:
            The ContrastiveLoss module class.
        """

        class ContrastiveLoss(nn.Module):
            """Pairwise contrastive loss (Hadsell et al., 2006).

            L = (1-Y) * D² + Y * max(0, margin - D)²

            where D is the Euclidean distance between the two embeddings.
            With this formulation Y = 0 marks a SIMILAR pair (distance is
            minimized) and Y = 1 a DISSIMILAR pair (distance is pushed
            out to the margin). The original comment had these inverted
            relative to the formula.
            """
            def __init__(self, margin=1.0):
                super().__init__()
                self.margin = margin

            def forward(self, output1, output2, label):
                # label: 0 = similar pair, 1 = dissimilar pair
                euclidean_distance = F.pairwise_distance(output1, output2)

                loss_contrastive = torch.mean(
                    (1 - label) * torch.pow(euclidean_distance, 2) +
                    label * torch.pow(torch.clamp(
                        self.margin - euclidean_distance, min=0.0
                    ), 2)
                )

                return loss_contrastive

        return ContrastiveLoss

    def triplet_loss(self):
        """Triplet loss plus an online-mining variant.

        Returns:
            A tuple (TripletLoss, TripletMiningLoss) of module classes.
        """

        class TripletLoss(nn.Module):
            """
            L = max(d(a, p) - d(a, n) + margin, 0)

            Pulls the anchor toward the positive and pushes it away from
            the negative (distances here are Euclidean, not squared).
            """
            def __init__(self, margin=1.0):
                super().__init__()
                self.margin = margin

            def forward(self, anchor, positive, negative):
                pos_dist = F.pairwise_distance(anchor, positive)
                neg_dist = F.pairwise_distance(anchor, negative)

                loss = F.relu(pos_dist - neg_dist + self.margin)

                return loss.mean()

        class TripletMiningLoss(nn.Module):
            """Online triplet mining: for each anchor, pick the hardest
            positive (farthest same-class) and hardest negative (closest
            different-class) within the batch."""
            def __init__(self, margin=1.0, mining='hard'):
                super().__init__()
                self.margin = margin
                self.mining = mining

            def forward(self, embeddings, labels):
                # All pairwise Euclidean distances within the batch
                dist_matrix = torch.cdist(embeddings, embeddings, p=2)

                # Same-label pairs are positives, everything else negative
                mask_anchor_positive = labels.unsqueeze(0) == labels.unsqueeze(1)
                mask_anchor_negative = ~mask_anchor_positive

                if self.mining == 'hard':
                    # Hardest positive: farthest same-class sample
                    anchor_positive_dist = torch.where(
                        mask_anchor_positive,
                        dist_matrix,
                        torch.tensor(0.0, device=dist_matrix.device)
                    )
                    hardest_positive_dist, _ = anchor_positive_dist.max(dim=1)

                    # Hardest negative: closest different-class sample
                    anchor_negative_dist = torch.where(
                        mask_anchor_negative,
                        dist_matrix,
                        torch.tensor(float('inf'), device=dist_matrix.device)
                    )
                    hardest_negative_dist, _ = anchor_negative_dist.min(dim=1)

                    # Standard triplet hinge on the mined pairs
                    loss = F.relu(
                        hardest_positive_dist - hardest_negative_dist + self.margin
                    )
                else:
                    # BUG FIX: previously any other mining mode fell
                    # through to `loss.mean()` with `loss` undefined
                    # (NameError); fail fast with a clear message instead.
                    raise ValueError(f"unknown mining strategy: {self.mining!r}")

                return loss.mean()

        return TripletLoss, TripletMiningLoss

    def ntxent_loss(self):
        """NT-Xent loss (SimCLR).

        Returns:
            The NTXentLoss module class.
        """

        class NTXentLoss(nn.Module):
            """Normalized temperature-scaled cross-entropy loss for
            self-supervised contrastive learning."""
            def __init__(self, temperature=0.5):
                super().__init__()
                self.temperature = temperature

            def forward(self, z_i, z_j):
                """
                z_i, z_j: (batch_size, dim) — two augmented views of the
                same batch of samples.
                """
                batch_size = z_i.size(0)

                # L2-normalize so dot products equal cosine similarity
                z_i = F.normalize(z_i, dim=1)
                z_j = F.normalize(z_j, dim=1)

                # Stack both views: (2B, dim)
                representations = torch.cat([z_i, z_j], dim=0)

                # Full (2B, 2B) cosine-similarity matrix
                similarity_matrix = F.cosine_similarity(
                    representations.unsqueeze(1),
                    representations.unsqueeze(0),
                    dim=2
                )

                # Temperature scaling sharpens/softens the distribution
                similarity_matrix = similarity_matrix / self.temperature

                # The positive for sample i is its other view at i ± B
                labels = torch.cat([
                    torch.arange(batch_size) + batch_size,
                    torch.arange(batch_size)
                ]).to(z_i.device)

                # Mask self-similarity on the diagonal
                mask = torch.eye(2 * batch_size, dtype=torch.bool, device=z_i.device)
                similarity_matrix = similarity_matrix.masked_fill(mask, float('-inf'))

                # Cross-entropy over the remaining similarities
                loss = F.cross_entropy(similarity_matrix, labels)

                return loss

        return NTXentLoss

```

---

## 第四部分:优化器深度解析

### 4.1 基础优化器

```python
class OptimizersDeepDive:
    """Optimizer deep dive: built-in torch.optim usage plus manual
    reference implementations, which are returned so they can be
    instantiated and tested (consistent with the other builders)."""

    def sgd_variants(self):
        """SGD and its variants (vanilla, momentum, Nesterov).

        Returns:
            The manual SGDMomentum optimizer class.
        """

        model = nn.Linear(10, 1)

        # 1. Vanilla SGD
        sgd = torch.optim.SGD(model.parameters(), lr=0.01)

        # 2. SGD with momentum
        sgd_momentum = torch.optim.SGD(
            model.parameters(),
            lr=0.01,
            momentum=0.9
        )

        # 3. SGD with Nesterov momentum
        sgd_nesterov = torch.optim.SGD(
            model.parameters(),
            lr=0.01,
            momentum=0.9,
            nesterov=True
        )

        # Manual implementation of SGD with momentum
        class SGDMomentum:
            """
            v_t = β*v_{t-1} + g_t
            θ_t = θ_{t-1} - α*v_t
            """
            def __init__(self, parameters, lr=0.01, momentum=0.9):
                self.parameters = list(parameters)
                self.lr = lr
                self.momentum = momentum
                # One velocity buffer per parameter (same shape/device)
                self.velocities = [
                    torch.zeros_like(p.data) for p in self.parameters
                ]

            def step(self):
                # no_grad: parameter updates must not be traced by autograd
                with torch.no_grad():
                    for p, v in zip(self.parameters, self.velocities):
                        if p.grad is None:
                            continue

                        # Update velocity in place
                        v.mul_(self.momentum).add_(p.grad)

                        # Update parameter: θ ← θ - lr * v
                        p.add_(v, alpha=-self.lr)

            def zero_grad(self):
                for p in self.parameters:
                    if p.grad is not None:
                        p.grad.zero_()

        return SGDMomentum

    def adam_family(self):
        """Adam-family optimizers.

        Returns:
            The manual AdamOptimizer class.
        """

        model = nn.Linear(10, 1)

        # 1. Adam
        adam = torch.optim.Adam(
            model.parameters(),
            lr=0.001,
            betas=(0.9, 0.999),
            eps=1e-8
        )

        # 2. AdamW (decoupled weight decay)
        adamw = torch.optim.AdamW(
            model.parameters(),
            lr=0.001,
            weight_decay=0.01
        )

        # 3. AdamW with Lookahead needs a third-party package
        #    (see AdvancedOptimizers.lookahead_wrapper for a manual one)

        # Manual implementation of Adam
        class AdamOptimizer:
            """
            m_t = β₁*m_{t-1} + (1-β₁)*g_t
            v_t = β₂*v_{t-1} + (1-β₂)*g_t²
            m̂_t = m_t / (1 - β₁^t)
            v̂_t = v_t / (1 - β₂^t)
            θ_t = θ_{t-1} - α * m̂_t / (√v̂_t + ε)
            """
            def __init__(self, parameters, lr=0.001, betas=(0.9, 0.999), eps=1e-8):
                self.parameters = list(parameters)
                self.lr = lr
                self.beta1, self.beta2 = betas
                self.eps = eps
                self.t = 0  # step count, drives the bias correction

                self.m = [torch.zeros_like(p.data) for p in self.parameters]
                self.v = [torch.zeros_like(p.data) for p in self.parameters]

            def step(self):
                self.t += 1

                with torch.no_grad():
                    for p, m, v in zip(self.parameters, self.m, self.v):
                        if p.grad is None:
                            continue

                        # Update biased first/second moment estimates
                        m.mul_(self.beta1).add_(p.grad, alpha=1 - self.beta1)
                        v.mul_(self.beta2).addcmul_(p.grad, p.grad, value=1 - self.beta2)

                        # Bias correction (moments start at zero)
                        bias_correction1 = 1 - self.beta1 ** self.t
                        bias_correction2 = 1 - self.beta2 ** self.t

                        m_hat = m / bias_correction1
                        v_hat = v / bias_correction2

                        # Parameter update
                        p.add_(
                            m_hat / (torch.sqrt(v_hat) + self.eps),
                            alpha=-self.lr
                        )

            def zero_grad(self):
                for p in self.parameters:
                    if p.grad is not None:
                        p.grad.zero_()

        return AdamOptimizer

```

### 4.2 高级优化器

```python
class AdvancedOptimizers:
    """Advanced optimizers: LAMB and the Lookahead wrapper. Each factory
    method returns the class it builds so it can be instantiated."""

    def lamb_optimizer(self):
        """LAMB — Layer-wise Adaptive Moments optimizer for Batch training.

        Returns:
            The manual LAMB optimizer class.
        """

        class LAMB:
            """
            LAMB = Adam + layer-wise trust-ratio scaling.
            Particularly suited to very large batch sizes.
            """
            def __init__(self, parameters, lr=0.001, betas=(0.9, 0.999),
                         eps=1e-6, weight_decay=0.01):
                self.parameters = list(parameters)
                self.lr = lr
                self.beta1, self.beta2 = betas
                self.eps = eps
                self.weight_decay = weight_decay
                self.t = 0  # step count, drives the bias correction

                self.m = [torch.zeros_like(p.data) for p in self.parameters]
                self.v = [torch.zeros_like(p.data) for p in self.parameters]

            def step(self):
                self.t += 1

                with torch.no_grad():
                    for p, m, v in zip(self.parameters, self.m, self.v):
                        if p.grad is None:
                            continue

                        grad = p.grad.data

                        # L2 regularization folded into the gradient
                        if self.weight_decay != 0:
                            grad = grad.add(p.data, alpha=self.weight_decay)

                        # Adam moment updates with bias correction
                        m.mul_(self.beta1).add_(grad, alpha=1 - self.beta1)
                        v.mul_(self.beta2).addcmul_(grad, grad, value=1 - self.beta2)

                        m_hat = m / (1 - self.beta1 ** self.t)
                        v_hat = v / (1 - self.beta2 ** self.t)

                        adam_step = m_hat / (torch.sqrt(v_hat) + self.eps)

                        # Layer-wise adaptation: scale by ||w|| / ||update||
                        weight_norm = torch.norm(p.data)
                        adam_norm = torch.norm(adam_step)

                        if weight_norm > 0 and adam_norm > 0:
                            # .item(): `alpha` below expects a Python
                            # number, not a 0-dim tensor
                            trust_ratio = (weight_norm / adam_norm).item()
                        else:
                            trust_ratio = 1.0

                        # Trust-ratio-scaled update
                        p.add_(adam_step, alpha=-self.lr * trust_ratio)

            def zero_grad(self):
                # API parity with torch.optim optimizers
                for p in self.parameters:
                    if p.grad is not None:
                        p.grad.zero_()

        return LAMB

    def lookahead_wrapper(self):
        """Lookahead optimizer wrapper.

        Returns:
            The Lookahead wrapper class.
        """

        class Lookahead:
            """Keeps slow weights that are interpolated toward the fast
            (inner-optimizer) weights every k steps."""
            def __init__(self, optimizer, k=5, alpha=0.5):
                self.optimizer = optimizer
                self.k = k
                self.alpha = alpha
                self.step_counter = 0

                # Snapshot of slow weights: one per parameter, flattened
                # across all param groups
                self.slow_weights = [
                    p.clone().detach()
                    for group in optimizer.param_groups
                    for p in group['params']
                ]

            def step(self):
                self.optimizer.step()
                self.step_counter += 1

                if self.step_counter % self.k == 0:
                    # BUG FIX: the original zipped the FLAT slow-weight
                    # list against param_groups (one slow weight per
                    # GROUP), so every fast param in a group was synced
                    # against the same slow tensor. Flatten the fast
                    # params exactly as in __init__ instead.
                    fast_params = [
                        p for group in self.optimizer.param_groups
                        for p in group['params']
                    ]
                    for slow_param, fast_param in zip(
                        self.slow_weights, fast_params
                    ):
                        # slow ← slow + alpha * (fast - slow)
                        slow_param.data.add_(
                            fast_param.data - slow_param.data,
                            alpha=self.alpha
                        )
                        # fast ← slow
                        fast_param.data.copy_(slow_param.data)

            def zero_grad(self):
                # Delegate to the wrapped optimizer
                self.optimizer.zero_grad()

        # Usage example
        model = nn.Linear(10, 1)
        base_optimizer = torch.optim.Adam(model.parameters())
        optimizer = Lookahead(base_optimizer, k=5, alpha=0.5)

        return Lookahead

```

---

## 第五部分:实战案例

### 5.1 图像分类完整流程

```python
class ImageClassificationPipeline:
    """End-to-end image classification: a from-scratch ResNet plus a
    complete CIFAR-10 training/validation loop."""

    def build_resnet_from_scratch(self):
        """Build a ResNet-18 (CIFAR variant: 3x3 stem, no initial max-pool).

        Returns:
            An nn.Module ResNet-18 with 10 output classes.
        """

        class BasicBlock(nn.Module):
            """ResNet basic block: two 3x3 convs with an identity or 1x1 shortcut."""
            # Channel expansion factor (1 for basic blocks, 4 for bottlenecks)
            expansion = 1

            def __init__(self, in_channels, out_channels, stride=1):
                super().__init__()

                self.conv1 = nn.Conv2d(
                    in_channels, out_channels, 3,
                    stride=stride, padding=1, bias=False
                )
                self.bn1 = nn.BatchNorm2d(out_channels)

                self.conv2 = nn.Conv2d(
                    out_channels, out_channels, 3,
                    padding=1, bias=False
                )
                self.bn2 = nn.BatchNorm2d(out_channels)

                # Shortcut connection: identity, or a 1x1 projection when
                # the spatial size or channel count changes
                self.shortcut = nn.Sequential()
                if stride != 1 or in_channels != out_channels:
                    self.shortcut = nn.Sequential(
                        nn.Conv2d(
                            in_channels, out_channels, 1,
                            stride=stride, bias=False
                        ),
                        nn.BatchNorm2d(out_channels)
                    )

            def forward(self, x):
                out = F.relu(self.bn1(self.conv1(x)))
                out = self.bn2(self.conv2(out))
                # Residual addition before the final activation
                out += self.shortcut(x)
                out = F.relu(out)
                return out

        class ResNet(nn.Module):
            """Full ResNet assembled from residual stages."""
            def __init__(self, block, num_blocks, num_classes=10):
                super().__init__()
                self.in_channels = 64

                # Stem (CIFAR style: single 3x3 conv, no max-pool)
                self.conv1 = nn.Conv2d(3, 64, 3, padding=1, bias=False)
                self.bn1 = nn.BatchNorm2d(64)

                # Residual stages; channels double while resolution halves
                self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
                self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
                self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
                self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)

                # Classification head
                self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
                self.fc = nn.Linear(512 * block.expansion, num_classes)

            def _make_layer(self, block, out_channels, num_blocks, stride):
                """Stack num_blocks blocks; only the first one downsamples."""
                strides = [stride] + [1] * (num_blocks - 1)
                layers = []

                for stride in strides:
                    layers.append(block(self.in_channels, out_channels, stride))
                    # Track input channels for the next block in the stage
                    self.in_channels = out_channels * block.expansion

                return nn.Sequential(*layers)

            def forward(self, x):
                out = F.relu(self.bn1(self.conv1(x)))
                out = self.layer1(out)
                out = self.layer2(out)
                out = self.layer3(out)
                out = self.layer4(out)
                out = self.avgpool(out)
                # Flatten (batch, 512, 1, 1) -> (batch, 512)
                out = out.view(out.size(0), -1)
                out = self.fc(out)
                return out

        # ResNet-18: [2, 2, 2, 2] basic blocks per stage
        def ResNet18(num_classes=10):
            return ResNet(BasicBlock, [2, 2, 2, 2], num_classes)

        model = ResNet18(num_classes=10)
        return model

    def complete_training_loop(self):
        """Full CIFAR-10 training: data pipeline, model, SGD + cosine
        schedule, train/validate epochs, and best-checkpoint saving.

        NOTE: downloads CIFAR-10 into ./data and trains for 200 epochs —
        this is a long-running, side-effecting routine.
        """

        import torchvision
        import torchvision.transforms as transforms
        from torch.utils.data import DataLoader

        # 1. Data preparation: augmentation for training, normalization only for test
        transform_train = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            # CIFAR-10 per-channel mean / std
            transforms.Normalize((0.4914, 0.4822, 0.4465),
                               (0.2023, 0.1994, 0.2010))
        ])

        transform_test = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465),
                               (0.2023, 0.1994, 0.2010))
        ])

        # 2. Load the datasets
        trainset = torchvision.datasets.CIFAR10(
            root='./data', train=True, download=True,
            transform=transform_train
        )
        trainloader = DataLoader(
            trainset, batch_size=128,
            shuffle=True, num_workers=2
        )

        testset = torchvision.datasets.CIFAR10(
            root='./data', train=False, download=True,
            transform=transform_test
        )
        testloader = DataLoader(
            testset, batch_size=100,
            shuffle=False, num_workers=2
        )

        # 3. Create the model
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model = self.build_resnet_from_scratch().to(device)

        # 4. Loss function and optimizer
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.SGD(
            model.parameters(),
            lr=0.1,
            momentum=0.9,
            weight_decay=5e-4
        )
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=200
        )

        # 5. Training loop for a single epoch
        def train_epoch(epoch):
            """Train one epoch; returns (mean_loss, accuracy_percent)."""
            model.train()
            train_loss = 0
            correct = 0
            total = 0

            for batch_idx, (inputs, targets) in enumerate(trainloader):
                inputs, targets = inputs.to(device), targets.to(device)

                # Forward pass
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, targets)

                # Backward pass
                loss.backward()
                optimizer.step()

                # Running statistics
                train_loss += loss.item()
                _, predicted = outputs.max(1)
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()

                if batch_idx % 100 == 0:
                    print(f'Epoch: {epoch} [{batch_idx}/{len(trainloader)}] '
                          f'Loss: {train_loss/(batch_idx+1):.3f} '
                          f'Acc: {100.*correct/total:.3f}%')

            return train_loss / len(trainloader), 100. * correct / total

        # 6. Validation loop
        def validate():
            """Evaluate on the test set; returns (mean_loss, accuracy_percent)."""
            model.eval()
            test_loss = 0
            correct = 0
            total = 0

            # No gradients needed during evaluation
            with torch.no_grad():
                for batch_idx, (inputs, targets) in enumerate(testloader):
                    inputs, targets = inputs.to(device), targets.to(device)
                    outputs = model(inputs)
                    loss = criterion(outputs, targets)

                    test_loss += loss.item()
                    _, predicted = outputs.max(1)
                    total += targets.size(0)
                    correct += predicted.eq(targets).sum().item()

            acc = 100. * correct / total
            print(f'Test Loss: {test_loss/len(testloader):.3f} '
                  f'Test Acc: {acc:.3f}%')

            return test_loss / len(testloader), acc

        # 7. Main training loop
        num_epochs = 200
        best_acc = 0

        for epoch in range(num_epochs):
            print(f'\nEpoch: {epoch}')

            train_loss, train_acc = train_epoch(epoch)
            test_loss, test_acc = validate()
            # Step the LR schedule once per epoch
            scheduler.step()

            # Save the best model so far (by test accuracy)
            if test_acc > best_acc:
                print('Saving...')
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'acc': test_acc,
                }, 'best_model.pth')
                best_acc = test_acc

```

### 5.2 文本分类(Transformer)

```python
class TextClassificationTransformer:
    """Transformer-based text classification."""

    def build_transformer_classifier(self):
        """Build a Transformer-encoder text classifier.

        Returns:
            The TextClassifier module class.
        """

        class TransformerEncoder(nn.Module):
            """One encoder layer: self-attention + position-wise FFN,
            each with residual connection and post-LayerNorm.

            NOTE(review): relies on MultiHeadAttention being defined
            earlier in this file.
            """
            def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
                super().__init__()

                # Multi-head self-attention sub-layer
                self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
                self.norm1 = nn.LayerNorm(d_model)
                self.dropout1 = nn.Dropout(dropout)

                # Position-wise feed-forward sub-layer
                self.ffn = nn.Sequential(
                    nn.Linear(d_model, d_ff),
                    nn.ReLU(),
                    nn.Dropout(dropout),
                    nn.Linear(d_ff, d_model)
                )
                self.norm2 = nn.LayerNorm(d_model)
                self.dropout2 = nn.Dropout(dropout)

            def forward(self, x, mask=None):
                # Self-attention with residual + LayerNorm
                attn_output, _ = self.self_attn(x, x, x, mask)
                x = self.norm1(x + self.dropout1(attn_output))

                # Feed-forward with residual + LayerNorm
                ffn_output = self.ffn(x)
                x = self.norm2(x + self.dropout2(ffn_output))

                return x

        class TextClassifier(nn.Module):
            """Full text classifier: embedding + learned positional
            encoding + stacked encoder layers + MLP head."""
            def __init__(self, vocab_size, d_model=512, num_heads=8,
                         num_layers=6, d_ff=2048, max_len=512,
                         num_classes=2, dropout=0.1):
                super().__init__()

                # BUG FIX: forward() scales embeddings by sqrt(self.d_model),
                # but d_model was never stored on the instance, which raised
                # AttributeError at runtime.
                self.d_model = d_model

                # Token embedding + learned positional encoding
                self.embedding = nn.Embedding(vocab_size, d_model)
                self.pos_encoding = nn.Parameter(
                    torch.randn(1, max_len, d_model)
                )
                self.dropout = nn.Dropout(dropout)

                # Stack of Transformer encoder layers
                self.encoder_layers = nn.ModuleList([
                    TransformerEncoder(d_model, num_heads, d_ff, dropout)
                    for _ in range(num_layers)
                ])

                # Classification head
                self.classifier = nn.Sequential(
                    nn.Linear(d_model, d_model),
                    nn.ReLU(),
                    nn.Dropout(dropout),
                    nn.Linear(d_model, num_classes)
                )

            def forward(self, x, mask=None):
                # x: (batch, seq_len) integer token ids
                seq_len = x.size(1)

                # Scaled embedding + positional encoding
                x = self.embedding(x) * np.sqrt(self.d_model)
                x = x + self.pos_encoding[:, :seq_len]
                x = self.dropout(x)

                # Encoder stack
                for layer in self.encoder_layers:
                    x = layer(x, mask)

                # Mean-pool over the sequence (in place of a [CLS] token)
                x = x.mean(dim=1)

                # Classify
                logits = self.classifier(x)

                return logits

        return TextClassifier

```

---

## 总结

本教程详细讲解了PyTorch内置模块的原理与实战应用:

### 核心模块
- ✅ nn.Linear - 全连接层
- ✅ nn.Conv2d - 卷积层
- ✅ nn.BatchNorm2d - 批归一化
- ✅ nn.Dropout - 正则化

### 激活函数
- ✅ 经典激活(ReLU, Sigmoid, Tanh)
- ✅ 现代激活(GELU, Swish, Mish)
- ✅ 注意力机制

### 损失函数
- ✅ 分类损失(CrossEntropy, Focal)
- ✅ 回归损失(MSE, Huber, Quantile)
- ✅ 对比学习损失(Contrastive, Triplet, NT-Xent)

### 优化器
- ✅ 基础优化器(SGD, Adam)
- ✅ 高级优化器(LAMB, Lookahead)

### 实战案例
- ✅ 图像分类(ResNet)
- ✅ 文本分类(Transformer)

掌握这些内置模块,你就能构建任何深度学习模型!