[深度学习]transformer(实战)

以下是一个完整的Transformer模型实现，用于时间序列预测任务。代码包含数据模拟、模型构建、训练和评估全流程：

import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Fix the TensorFlow and NumPy global random seeds so repeated runs are reproducible.
tf.random.set_seed(42)
np.random.seed(42)

# 1. 模拟时间序列数据 - 创建具有季节性和趋势的复杂时间序列
def generate_time_series(n_samples, seq_length, n_features):
    """Simulate a multi-feature time-series regression data set.

    Every sample is a random mixture of three periodic base signals (sine,
    square, sawtooth) plus one trend (linear or quadratic) plus Gaussian
    noise. Feature 0 holds that main signal; every further feature is a
    rolled, rescaled and noisier copy of it.

    The target is a noisy *linear* combination of the last three values of
    the main signal: ``s[-1] + 0.5 * s[-2] - 0.3 * s[-3] + noise``.

    All randomness comes from NumPy's global generator, so seed it with
    ``np.random.seed`` beforehand for reproducible output.

    Args:
        n_samples: Number of independent sequences to generate.
        seq_length: Number of time steps per sequence; must be at least 3
            because the target reads the last three steps.
        n_features: Number of feature channels per time step.

    Returns:
        A tuple ``(X, y)`` where ``X`` has shape
        ``(n_samples, seq_length, n_features)`` and ``y`` has shape
        ``(n_samples, 1)``.

    Raises:
        ValueError: If ``seq_length`` is smaller than 3.
    """
    # Fail early with a clear message instead of an IndexError at signals[-3].
    if seq_length < 3:
        raise ValueError(
            f"seq_length must be at least 3 to build the target, got {seq_length}"
        )

    t = np.linspace(0, 10, seq_length)

    # Periodic building blocks shared by all samples.
    base_signals = [
        0.5 * np.sin(2 * np.pi * t),  # sine wave
        0.3 * np.sign(np.sin(3 * np.pi * t)),  # square wave
        0.2 * (t % 1.0),  # sawtooth wave
    ]

    # Slow-moving components; one of them is added to each sample.
    trends = [
        0.1 * t,  # linear trend
        0.02 * (t - 5) ** 2,  # quadratic trend
    ]

    X = np.zeros((n_samples, seq_length, n_features))
    y = np.zeros((n_samples, 1))  # one scalar target per sample

    for i in range(n_samples):
        # Sum three randomly chosen (possibly repeated) base signals.
        signals = np.zeros(seq_length)
        for _ in range(3):
            signal_idx = np.random.randint(0, len(base_signals))
            weight = np.random.uniform(0.5, 1.5)
            # Drawn from [0, 2*pi) but truncated to an integer and used as a
            # circular shift in samples (0..6), not as a phase in radians.
            phase_shift = np.random.uniform(0, 2 * np.pi)
            signals += weight * np.roll(base_signals[signal_idx], int(phase_shift))

        # Add one randomly weighted trend.
        trend_idx = np.random.randint(0, len(trends))
        trend_weight = np.random.uniform(0.2, 0.8)
        signals += trend_weight * trends[trend_idx]

        # Add observation noise.
        signals += np.random.normal(0, 0.1, seq_length)

        # Feature 0 is the main signal itself.
        if n_features > 0:
            X[i, :, 0] = signals

        # Remaining features are correlated views: circularly delayed,
        # rescaled and perturbed with a smaller amount of noise.
        for j in range(1, n_features):
            delay = np.random.randint(1, 5)
            scale = np.random.uniform(0.7, 1.3)
            X[i, :, j] = scale * np.roll(signals, delay) + np.random.normal(0, 0.05, seq_length)

        # Target: noisy linear combination of the last three main-signal values.
        y[i] = signals[-1] + 0.5 * signals[-2] - 0.3 * signals[-3] + np.random.normal(0, 0.1)

    return X, y

# 2. Transformer模型组件
class TransformerBlock(layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward net.

    Each sub-layer output goes through dropout, is added back to the
    sub-layer input (residual connection), and the sum is layer-normalized.

    Args:
        embed_dim: Width of the feed-forward output layer; also passed as
            ``key_dim`` (per-head size) to the attention layer.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied after attention and after the
            feed-forward network.
        **kwargs: Standard ``keras.layers.Layer`` arguments such as ``name``.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can rebuild the layer.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential([
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim)
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        ``training`` defaults to ``None`` so the layer can be invoked without
        it (for example while a functional model is being built); Keras then
        supplies the value from the calling context when it is known.
        """
        # Self-attention: the sequence serves as both query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)

        # Position-wise feed-forward network with its own residual path.
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config

class PositionalEncoding(layers.Layer):
    """Add a fixed sinusoidal position table to the incoming sequence.

    The table is computed once at construction time with shape
    ``(1, position, d_model)`` and is sliced to the input's sequence length
    on every call.
    """

    def __init__(self, position, d_model):
        super().__init__()
        self.pos_encoding = self.positional_encoding(position, d_model)

    def get_angles(self, position, i, d_model):
        """Return ``position / 10000 ** (2 * (i // 2) / d_model)``."""
        # Channels 2k and 2k + 1 share the same frequency.
        inverse_freq = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
        return position * inverse_freq

    def positional_encoding(self, position, d_model):
        """Build the float32 table of shape ``(1, position, d_model)``."""
        step_index = np.arange(position)[:, np.newaxis]
        channel_index = np.arange(d_model)[np.newaxis, :]
        table = self.get_angles(step_index, channel_index, d_model)

        # Even channels carry the sine, odd channels the cosine.
        table[:, 0::2] = np.sin(table[:, 0::2])
        table[:, 1::2] = np.cos(table[:, 1::2])

        # Leading axis of size 1 lets the table broadcast over the batch.
        return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

    def call(self, inputs):
        """Return ``inputs`` plus the table truncated to the input length."""
        n_steps = tf.shape(inputs)[1]
        encoding_slice = self.pos_encoding[:, :n_steps, :]
        return inputs + encoding_slice

# 3. 构建Transformer模型
def build_transformer_model(input_shape, num_heads, embed_dim, ff_dim, num_blocks, dropout_rate=0.1):
    """Assemble the encoder-only Transformer regressor as a functional model.

    Pipeline: linear projection to ``embed_dim`` -> positional encoding ->
    dropout -> ``num_blocks`` Transformer blocks -> global average pooling
    over the sequence axis -> dropout -> single-unit linear output.

    Args:
        input_shape: ``(seq_length, n_features)`` of one sample.
        num_heads: Attention heads per Transformer block.
        embed_dim: Width the input features are projected to.
        ff_dim: Hidden width of each block's feed-forward network.
        num_blocks: Number of stacked Transformer blocks.
        dropout_rate: Dropout rate used throughout the model.

    Returns:
        An uncompiled ``keras.Model`` mapping a sequence to one scalar.
    """
    sequence_length = input_shape[0]
    model_input = layers.Input(shape=input_shape)

    # Embed the raw features, then inject position information.
    hidden = layers.Dense(embed_dim)(model_input)
    hidden = PositionalEncoding(sequence_length, embed_dim)(hidden)
    hidden = layers.Dropout(dropout_rate)(hidden)

    # Encoder stack.
    for _block_number in range(num_blocks):
        encoder_block = TransformerBlock(embed_dim, num_heads, ff_dim, dropout_rate)
        hidden = encoder_block(hidden)

    # Collapse the time axis and regress a single value.
    pooled = layers.GlobalAveragePooling1D()(hidden)
    pooled = layers.Dropout(dropout_rate)(pooled)
    prediction = layers.Dense(1)(pooled)

    return keras.Model(inputs=model_input, outputs=prediction)

# 4. 主程序
def _plot_evaluation_report(history, y_true, y_pred, metric_values):
    """Draw the 2x2 figure: loss curves, parity plot, residuals, metric bars."""
    plt.figure(figsize=(15, 10))

    # Training vs. validation loss per epoch.
    plt.subplot(2, 2, 1)
    plt.plot(history.history['loss'], label='训练损失')
    plt.plot(history.history['val_loss'], label='验证损失')
    plt.title('模型损失')
    plt.ylabel('MSE')
    plt.xlabel('Epoch')
    plt.legend()
    plt.grid(True)

    # Predicted vs. true values; the dashed diagonal marks a perfect fit.
    plt.subplot(2, 2, 2)
    plt.scatter(y_true, y_pred, alpha=0.5)
    plt.plot([y_true.min(), y_true.max()],
             [y_true.min(), y_true.max()], 'r--')
    plt.title('预测值 vs 真实值')
    plt.xlabel('真实值')
    plt.ylabel('预测值')
    plt.grid(True)

    # Residuals against predictions.
    residuals = y_true - y_pred
    plt.subplot(2, 2, 3)
    plt.scatter(y_pred, residuals, alpha=0.5)
    plt.axhline(y=0, color='r', linestyle='-')
    plt.title('残差图')
    plt.xlabel('预测值')
    plt.ylabel('残差')
    plt.grid(True)

    # Bar chart of the scalar metrics, each bar annotated with its value.
    plt.subplot(2, 2, 4)
    metrics = ['MSE', 'MAE', 'RMSE', 'R²']
    colors = ['blue', 'green', 'orange', 'red']
    bars = plt.bar(metrics, metric_values, color=colors)
    plt.title('模型评估指标')
    plt.ylabel('值')
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2, height,
                 f'{height:.4f}', ha='center', va='bottom', fontsize=10)

    plt.tight_layout()
    plt.show()


def _plot_first_block_attention(model, sample_input):
    """Plot head 0 of the first Transformer block's attention scores."""
    # Locate the block by type; a hard-coded layer index is fragile and
    # previously pointed at the embedding Dropout layer instead.
    first_block = next(
        (layer for layer in model.layers if isinstance(layer, TransformerBlock)),
        None,
    )
    if first_block is None:
        # Model was built with num_blocks=0; there is nothing to visualize.
        return

    # Sub-model that reproduces exactly what the first block receives.
    block_input_model = keras.Model(inputs=model.input, outputs=first_block.input)
    block_input = block_input_model.predict(sample_input)

    # Ask the block's attention layer for its score tensor; the result is
    # indexed below as (batch, head, query position, key position).
    _, attention_scores = first_block.att(
        block_input, block_input, return_attention_scores=True
    )
    first_head = np.asarray(attention_scores)[0, 0]

    plt.figure(figsize=(10, 8))
    plt.imshow(first_head, cmap='viridis')
    plt.title('注意力权重热力图 (第一个头)')
    plt.xlabel('Key序列位置')
    plt.ylabel('Query序列位置')
    plt.colorbar()
    plt.show()


def main():
    """Run the demo end to end: simulate, scale, train, evaluate, visualize."""
    # Simulation settings.
    n_samples = 5000
    seq_length = 50
    n_features = 4

    print("生成模拟数据...")
    X, y = generate_time_series(n_samples, seq_length, n_features)

    print("预处理数据...")
    # Split BEFORE scaling so the scalers never see test-set statistics.
    X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    scaler_X = MinMaxScaler(feature_range=(-1, 1))
    scaler_y = MinMaxScaler(feature_range=(-1, 1))

    # MinMaxScaler works on 2-D data: flatten the (sample, time) axes, scale
    # per feature, then restore the 3-D shape.
    X_train = scaler_X.fit_transform(
        X_train_raw.reshape(-1, n_features)
    ).reshape(X_train_raw.shape)
    X_test = scaler_X.transform(
        X_test_raw.reshape(-1, n_features)
    ).reshape(X_test_raw.shape)

    y_train = scaler_y.fit_transform(y_train_raw.reshape(-1, 1))
    y_test = scaler_y.transform(y_test_raw.reshape(-1, 1))

    print("构建Transformer模型...")
    model = build_transformer_model(
        input_shape=(seq_length, n_features),
        num_heads=4,
        embed_dim=32,
        ff_dim=64,
        num_blocks=2,
        dropout_rate=0.1
    )

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss="mse",
        metrics=["mae"]
    )

    model.summary()

    print("训练模型...")
    history = model.fit(
        X_train, y_train,
        batch_size=32,
        epochs=50,
        validation_split=0.2,
        verbose=1
    )

    print("评估模型...")
    y_pred_scaled = model.predict(X_test)

    # Report metrics in the original (unscaled) target units.
    y_test_inv = scaler_y.inverse_transform(y_test)
    y_pred_inv = scaler_y.inverse_transform(y_pred_scaled)

    mse = mean_squared_error(y_test_inv, y_pred_inv)
    mae = mean_absolute_error(y_test_inv, y_pred_inv)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test_inv, y_pred_inv)

    print("\n评估结果:")
    print(f"MSE: {mse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"R²: {r2:.4f}")

    _plot_evaluation_report(history, y_test_inv, y_pred_inv, [mse, mae, rmse, r2])

    # Attention heat map for one randomly chosen test sequence.
    print("\n可视化注意力权重示例...")
    sample_idx = np.random.randint(0, len(X_test))
    sample_input = X_test[sample_idx][np.newaxis, ...]
    _plot_first_block_attention(model, sample_input)

# Run the full pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

代码详细说明：

1. 数据模拟

生成了包含5000个样本的时间序列，每个序列长度为50，有4个特征
创建了复杂的信号模式：正弦波、方波和锯齿波的随机组合
添加了线性和二次趋势项
多个特征之间存在相关性（延迟和缩放变换）
目标值用来模拟“下一个时间步”的主特征取值：它由主信号最后三个时间步的线性加权组合（权重分别为 1.0、0.5、-0.3）再加上高斯噪声构成

2. Transformer模型组件

TransformerBlock：实现Transformer的核心结构
- 多头注意力机制
- 前馈神经网络
- 层归一化和残差连接
PositionalEncoding：位置编码层，为模型提供序列顺序信息

3. 模型架构

输入层：接受形状为(序列长度, 特征数)的输入
嵌入层：将特征映射到高维空间
位置编码：添加序列位置信息
Transformer块：堆叠2个Transformer块，每个块有4个注意力头和64维前馈网络
全局平均池化：将序列特征压缩为固定长度向量
输出层：预测下一个时间步的值

4. 训练与评估

使用Adam优化器和MSE损失函数
训练50个epoch，批量大小32
评估指标：MSE、MAE、RMSE和R²
可视化：
- 训练/验证损失曲线
- 预测值 vs 真实值散点图
- 残差分析图
- 评估指标柱状图
- 注意力权重热力图（示例）

Transformer模型关键特点：

自注意力机制：
- 模型可以学习序列中不同位置之间的关系
- 能够捕获长期依赖关系，优于传统RNN
位置编码：
- 为模型提供序列位置信息
- 使用正弦/余弦函数生成位置编码
多头注意力：
- 允许模型同时关注不同表示子空间的信息
- 增强模型捕捉不同模式的能力
残差连接和层归一化：
- 促进梯度流动，支持深层网络训练
- 稳定训练过程

扩展建议：

调整模型架构：

# A deeper and wider Transformer: more heads, larger embedding and
# feed-forward widths, and four stacked blocks (dropout_rate keeps its default).
model = build_transformer_model(
    input_shape=(seq_length, n_features),
    num_heads=8,
    embed_dim=64,
    ff_dim=128,
    num_blocks=4
)

添加学习率调度：

# Exponential learning-rate decay configured with a 1e-3 initial rate,
# decay_steps=1000 and decay_rate=0.9, passed to Adam in place of a constant rate.
lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-3,
    decay_steps=1000,
    decay_rate=0.9)
optimizer = keras.optimizers.Adam(learning_rate=lr_schedule)

实现序列到序列预测：
- 修改输出层为layers.Dense(seq_length)预测整个序列
- 添加解码器部分处理多步预测

添加正则化：

# Add L2 weight regularization to a Dense layer via the 'l2' string shortcut.
layers.Dense(64, activation='relu', kernel_regularizer='l2')

实现更复杂的位置编码：

# 可学习的位置编码
class LearnablePositionalEncoding(layers.Layer):
    """Positional encoding whose per-position vectors are trained.

    An embedding table with ``max_len`` rows of width ``embed_dim`` is looked
    up for positions ``0 .. seq_len - 1`` and added to the input.
    """

    def __init__(self, max_len, embed_dim):
        super().__init__()
        self.pos_emb = layers.Embedding(input_dim=max_len, output_dim=embed_dim)

    def call(self, x):
        """Return ``x`` plus the learned embedding of each time-step index."""
        # Sequence length is read from axis 1 of the input at run time.
        n_steps = tf.shape(x)[1]
        position_ids = tf.range(start=0, limit=n_steps, delta=1)
        return x + self.pos_emb(position_ids)

这个实现展示了Transformer在时间序列预测中的应用，您可以根据具体任务调整模型结构和超参数。