Principle
A feed-forward neural network (FNN) is a network in which every neuron is connected to every neuron in the adjacent layers (the previous and next layers, where they exist), and information flows in one direction only, from input to output. In the Transformer, this block is applied to each token position independently, which is why it is also called the position-wise feed-forward network.
With d_model = 512 and d_ff = 2048 (the sizes used throughout this article), the block looks like this:

Input layer:  [x₁, x₂, x₃, ..., x₅₁₂]  (512 neurons)
        │
        │  weight matrix W₁: (2048 × 512)
        │  bias b₁: (2048,)
        ▼
Hidden layer: [h₁, h₂, h₃, ..., h₂₀₄₈]  (2048 neurons)
        │
        │  ReLU activation
        │  f(x) = max(0, x)
        ▼
              [h₁', h₂', h₃', ..., h₂₀₄₈']  (after ReLU)
        │
        │  weight matrix W₂: (512 × 2048)
        │  bias b₂: (512,)
        ▼
Output layer: [y₁, y₂, y₃, ..., y₅₁₂]  (512 neurons)
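The same computation in a few lines of PyTorch (a minimal sketch with random weights, purely to confirm the shapes in the diagram):

import torch

x = torch.randn(512)                                # input vector (512,)
W1, b1 = torch.randn(2048, 512), torch.randn(2048)  # first layer
W2, b2 = torch.randn(512, 2048), torch.randn(512)   # second layer

h = torch.relu(W1 @ x + b1)   # hidden layer: (2048,)
y = W2 @ h + b2               # output layer: (512,)
print(h.shape, y.shape)       # torch.Size([2048]) torch.Size([512])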
Zooming in to the individual neurons:

Input vector (512-dim)
  │
  ├─ x₁ ──────────────────┐
  ├─ x₂ ────────────────┐ │
  ├─ x₃ ──────────────┐ │ │
  ├─ ...              │ │ │
  └─ x₅₁₂ ────────┐   │ │ │
                  │   │ │ │
                  ▼   ▼ ▼ ▼
       ┌───────────────────────┐
       │ Hidden-layer neurons  │
       │ (2048 of them)        │
       │                       │
       │ h₁ = W₁,₁x₁ + W₁,₂x₂ + ... + W₁,₅₁₂x₅₁₂ + b₁
       │ h₂ = W₂,₁x₁ + W₂,₂x₂ + ... + W₂,₅₁₂x₅₁₂ + b₂
       │ ...
       │ h₂₀₄₈ = W₂₀₄₈,₁x₁ + ... + W₂₀₄₈,₅₁₂x₅₁₂ + b₂₀₄₈
       └───────────────────────┘
       (Wᵢ,ⱼ and bᵢ denote the entries of W₁ and b₁)
                  │
                  │ ReLU
                  │ hᵢ' = max(0, hᵢ)
                  ▼
       ┌───────────────────────┐
       │ Activated neurons     │
       │ (2048 of them)        │
       └───────────────────────┘
  │
  ├─ h₁' ──────────────────┐
  ├─ h₂' ────────────────┐ │
  ├─ h₃' ──────────────┐ │ │
  ├─ ...               │ │ │
  └─ h₂₀₄₈' ────────┐  │ │ │
                    │  │ │ │
                    ▼  ▼ ▼ ▼
       ┌───────────────────────┐
       │ Output-layer neurons  │
       │ (512 of them)         │
       │                       │
       │ y₁ = W₁,₁'h₁' + W₁,₂'h₂' + ... + b₁'
       │ y₂ = W₂,₁'h₁' + W₂,₂'h₂' + ... + b₂'
       │ ...
       │ y₅₁₂ = W₅₁₂,₁'h₁' + ... + b₅₁₂'
       └───────────────────────┘
       (Wᵢ,ⱼ' and bᵢ' denote the entries of W₂ and b₂)
                  │
                  ▼
       Output vector (512-dim)
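These per-neuron formulas are exactly what PyTorch's nn.Linear computes: row i of the weight matrix dotted with the input, plus bias entry i. A minimal sanity check (illustrative code, separate from the implementation below):

import torch
import torch.nn as nn

linear1 = nn.Linear(512, 2048)
x = torch.randn(512)

# h₁ by hand: first weight row · x, plus the first bias entry
h1_manual = linear1.weight[0] @ x + linear1.bias[0]
h1_layer = linear1(x)[0]
print(torch.allclose(h1_manual, h1_layer))  # True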
Implementation
"""
Transformer前馈神经网络(FFN)实现
包含:两个线性层 + ReLU激活函数 + Dropout层
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
class TransformerFFN(nn.Module):
"""
Transformer前馈神经网络
结构:
- 线性变换1 (d_model → d_ff)
- ReLU激活函数
- Dropout层
- 线性变换2 (d_ff → d_model)
"""
def __init__(self, d_model, d_ff, dropout=0.1):
super().__init__()
# 线性变换1: d_model → d_ff
self.linear1 = nn.Linear(d_model, d_ff)
# ReLU激活函数
self.activation = nn.ReLU()
# Dropout层
self.dropout = nn.Dropout(dropout)
# 线性变换2: d_ff → d_model
self.linear2 = nn.Linear(d_ff, d_model)
def forward(self, x):
"""
前向传播
Args:
x: 输入张量 (batch_size, seq_len, d_model)
Returns:
output: 输出张量 (batch_size, seq_len, d_model)
"""
# 线性变换1
x = self.linear1(x)
# ReLU激活
x = self.activation(x)
# Dropout
x = self.dropout(x)
# 线性变换2
x = self.linear2(x)
return x
def demo_ffn():
    """
    Step-by-step walkthrough of the feed-forward network.
    """
    print("\n=== Transformer FFN demo ===\n")

    # Small, readable parameters
    batch_size = 2
    seq_len = 5
    d_model = 512
    d_ff = 2048
    dropout = 0.1

    # Create input data
    input_x = torch.randn(batch_size, seq_len, d_model)
    print("1. Input data:")
    print(f"   Shape: {input_x.shape}")  # (2, 5, 512)
    print("   Sample (first token of the first batch element):")
    print(f"   {input_x[0, 0, :10]}...\n")

    # Create the FFN layer
    ffn = TransformerFFN(d_model, d_ff, dropout)
    print("2. Create the FFN layer:")
    print(f"   d_model: {d_model}")
    print(f"   d_ff: {d_ff}")
    print(f"   dropout: {dropout}\n")

    # Switch to eval mode (disables dropout)
    ffn.eval()

    # Walk through the block one step at a time
    print("3. Linear transform 1 (linear1):")
    x = ffn.linear1(input_x)
    print(f"   Input shape: {input_x.shape}")               # (2, 5, 512)
    print(f"   Weight shape: {ffn.linear1.weight.shape}")   # (2048, 512)
    print(f"   Bias shape: {ffn.linear1.bias.shape}")       # (2048,)
    print(f"   Output shape: {x.shape}")                    # (2, 5, 2048)
    print("   Sample (first token of the first batch element):")
    print(f"   {x[0, 0, :10]}...\n")

    print("4. ReLU activation:")
    x_before_relu = x.clone()
    x = ffn.activation(x)
    print(f"   Input shape: {x_before_relu.shape}")  # (2, 5, 2048)
    print(f"   Output shape: {x.shape}")             # (2, 5, 2048)
    print("   Sample (first token of the first batch element):")
    print(f"   Before ReLU: {x_before_relu[0, 0, :10]}...")
    print(f"   After ReLU: {x[0, 0, :10]}...")
    print("   Note: negative values become 0, positive values pass through unchanged\n")

    print("5. Dropout (eval mode, inactive):")
    x_before_dropout = x.clone()
    x = ffn.dropout(x)
    print(f"   Input shape: {x_before_dropout.shape}")  # (2, 5, 2048)
    print(f"   Output shape: {x.shape}")                # (2, 5, 2048)
    print(f"   Values changed: {not torch.equal(x_before_dropout, x)}")
    print("   Note: dropout is a no-op in eval mode\n")

    print("6. Linear transform 2 (linear2):")
    x_before_linear2 = x.clone()
    x = ffn.linear2(x)
    print(f"   Input shape: {x_before_linear2.shape}")      # (2, 5, 2048)
    print(f"   Weight shape: {ffn.linear2.weight.shape}")   # (512, 2048)
    print(f"   Bias shape: {ffn.linear2.bias.shape}")       # (512,)
    print(f"   Output shape: {x.shape}")                    # (2, 5, 512)
    print("   Sample (first token of the first batch element):")
    print(f"   {x[0, 0, :10]}...\n")

    print("7. Final output:")
    print(f"   Input shape: {input_x.shape}")  # (2, 5, 512)
    print(f"   Output shape: {x.shape}")       # (2, 5, 512)
    print(f"   Shape preserved: {input_x.shape == x.shape}\n")

    # Dropout in training mode
    print("8. Dropout in training mode:")
    ffn.train()
    x_train = ffn.linear1(input_x)
    x_train = ffn.activation(x_train)
    x_before_dropout_train = x_train.clone()
    x_train = ffn.dropout(x_train)
    print(f"   Input shape: {x_before_dropout_train.shape}")  # (2, 5, 2048)
    print(f"   Output shape: {x_train.shape}")                # (2, 5, 2048)
    print(f"   Values changed: {not torch.equal(x_before_dropout_train, x_train)}")
    print("   Note: in training mode, dropout randomly zeroes some activations\n")

    # Parameter counts
    print("9. Parameter counts:")
    params_linear1 = d_ff * d_model + d_ff
    params_linear2 = d_model * d_ff + d_model
    total_params = params_linear1 + params_linear2
    print(f"   linear1 parameters: {params_linear1:,}")  # 1,050,624
    print(f"   linear2 parameters: {params_linear2:,}")  # 1,049,088
    print(f"   Total parameters: {total_params:,}")      # 2,099,712
def demo_ffn_in_transformer():
    """
    Show where the FFN sits inside a Transformer encoder layer.
    """
    print("\n=== The FFN inside a Transformer encoder layer ===\n")

    batch_size = 2
    seq_len = 5
    d_model = 512
    n_heads = 8
    d_ff = 2048

    # Create input
    x = torch.randn(batch_size, seq_len, d_model)
    print("1. Input:")
    print(f"   Shape: {x.shape}")  # (2, 5, 512)

    # Multi-head self-attention
    print("\n2. Multi-head self-attention:")
    mha = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
    attn_output, _ = mha(x, x, x)
    print(f"   Output shape: {attn_output.shape}")  # (2, 5, 512)

    # Residual connection + LayerNorm
    print("\n3. Residual connection + LayerNorm:")
    norm1 = nn.LayerNorm(d_model)
    x = norm1(x + attn_output)
    print(f"   Output shape: {x.shape}")  # (2, 5, 512)

    # FFN
    print("\n4. Feed-forward network (FFN):")
    ffn = TransformerFFN(d_model, d_ff)
    ffn.eval()
    ffn_output = ffn(x)
    print(f"   Input shape: {x.shape}")            # (2, 5, 512)
    print(f"   Output shape: {ffn_output.shape}")  # (2, 5, 512)

    # Residual connection + LayerNorm
    print("\n5. Residual connection + LayerNorm:")
    norm2 = nn.LayerNorm(d_model)
    output = norm2(x + ffn_output)
    print(f"   Output shape: {output.shape}")  # (2, 5, 512)

    print("\n6. Final output:")
    print(f"   Shape: {output.shape}")  # (2, 5, 512)


if __name__ == "__main__":
    # Walk through the FFN step by step
    demo_ffn()
    # Show where the FFN sits in a Transformer encoder layer
    demo_ffn_in_transformer()
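As a quick cross-check of step 9 (a sketch, assuming TransformerFFN from above is in scope), the closed-form counts match PyTorch's own tally:

ffn = TransformerFFN(d_model=512, d_ff=2048)
total = sum(p.numel() for p in ffn.parameters())  # all weights and biases
print(f"{total:,}")  # 2,099,712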
Running the script prints output like the following (the random values will differ from run to run):

=== Transformer FFN demo ===

1. Input data:
   Shape: torch.Size([2, 5, 512])
   Sample (first token of the first batch element):
   tensor([ 0.3350, -0.0768, -0.0509, -0.5749, -0.0755,  0.0681,  0.7161, -1.5507,
            0.1527,  0.7693])...

2. Create the FFN layer:
   d_model: 512
   d_ff: 2048
   dropout: 0.1

3. Linear transform 1 (linear1):
   Input shape: torch.Size([2, 5, 512])
   Weight shape: torch.Size([2048, 512])
   Bias shape: torch.Size([2048])
   Output shape: torch.Size([2, 5, 2048])
   Sample (first token of the first batch element):
   tensor([-0.4664, -0.4144,  0.0065,  0.2996,  0.1163, -0.1496,  0.0700,  0.0834,
           -0.0931, -0.5704], grad_fn=<SliceBackward0>)...

4. ReLU activation:
   Input shape: torch.Size([2, 5, 2048])
   Output shape: torch.Size([2, 5, 2048])
   Sample (first token of the first batch element):
   Before ReLU: tensor([-0.4664, -0.4144,  0.0065,  0.2996,  0.1163, -0.1496,  0.0700,  0.0834,
           -0.0931, -0.5704], grad_fn=<SliceBackward0>)...
   After ReLU: tensor([0.0000, 0.0000, 0.0065, 0.2996, 0.1163, 0.0000, 0.0700, 0.0834, 0.0000,
           0.0000], grad_fn=<SliceBackward0>)...
   Note: negative values become 0, positive values pass through unchanged

5. Dropout (eval mode, inactive):
   Input shape: torch.Size([2, 5, 2048])
   Output shape: torch.Size([2, 5, 2048])
   Values changed: False
   Note: dropout is a no-op in eval mode

6. Linear transform 2 (linear2):
   Input shape: torch.Size([2, 5, 2048])
   Weight shape: torch.Size([512, 2048])
   Bias shape: torch.Size([512])
   Output shape: torch.Size([2, 5, 512])
   Sample (first token of the first batch element):
   tensor([-0.1169,  0.1816, -0.0892, -0.0025, -0.1541,  0.2057, -0.2915, -0.1475,
            0.0740,  0.1474], grad_fn=<SliceBackward0>)...

7. Final output:
   Input shape: torch.Size([2, 5, 512])
   Output shape: torch.Size([2, 5, 512])
   Shape preserved: True

8. Dropout in training mode:
   Input shape: torch.Size([2, 5, 2048])
   Output shape: torch.Size([2, 5, 2048])
   Values changed: True
   Note: in training mode, dropout randomly zeroes some activations

9. Parameter counts:
   linear1 parameters: 1,050,624
   linear2 parameters: 1,049,088
   Total parameters: 2,099,712

=== The FFN inside a Transformer encoder layer ===

1. Input:
   Shape: torch.Size([2, 5, 512])

2. Multi-head self-attention:
   Output shape: torch.Size([2, 5, 512])

3. Residual connection + LayerNorm:
   Output shape: torch.Size([2, 5, 512])

4. Feed-forward network (FFN):
   Input shape: torch.Size([2, 5, 512])
   Output shape: torch.Size([2, 5, 512])

5. Residual connection + LayerNorm:
   Output shape: torch.Size([2, 5, 512])

6. Final output:
   Shape: torch.Size([2, 5, 512])
To summarize, shapes flow through the FFN as follows:

Input (batch_size, seq_len, d_model)
  ↓ linear transform 1
(batch_size, seq_len, d_ff)
  ↓ ReLU activation
(batch_size, seq_len, d_ff)
  ↓ dropout
(batch_size, seq_len, d_ff)
  ↓ linear transform 2
(batch_size, seq_len, d_model)
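In compact form this is the position-wise feed-forward network from the original Transformer paper, FFN(x) = max(0, xW₁ + b₁)W₂ + b₂, with dropout additionally applied between the two linear layers in this implementation.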
Key points
1. Two linear layers: expand the dimension first (512 -> 2048), then project back to the original dimension (2048 -> 512)
2. ReLU: introduces nonlinearity; negative values become 0
3. Dropout: randomly zeroes some activations during training to reduce overfitting; inactive at eval time
4. Parameter count: about 2.1 million parameters, roughly 2× the attention layer (verified in the sketch after this list)
5. Role: provides per-position feature transformation, increasing the model's expressive power
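A quick check of the "roughly 2×" claim in point 4 (a sketch; nn.MultiheadAttention with its default biases stands in for the attention layer):

import torch.nn as nn

mha = nn.MultiheadAttention(embed_dim=512, num_heads=8)
mha_params = sum(p.numel() for p in mha.parameters())
print(f"{mha_params:,}")                # 1,050,624
print(f"{2_099_712 / mha_params:.2f}")  # 2.00  (FFN total from step 9)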