Lucidrains 系列项目源码解析(四十七)
.\lucidrains\isab-pytorch\isab_pytorch\__init__.py
# 从 isab_pytorch 模块中导入 ISAB 类
from isab_pytorch.isab_pytorch import ISAB

Induced Set Attention Block (ISAB) - Pytorch
A concise implementation of (Induced) Set Attention Block, from the Set Transformers paper. It proposes to reduce attention from O(n²) to O(mn), where m is the number of inducing points (learned latents).
Update: Interestingly enough, a new paper has used the ISAB block successfully, in the domain of denoising diffusion for efficient generation of images and video.
Install
$ pip install isab-pytorch
Usage
You can either set the number of latents, in which case the latent parameters will be instantiated internally and returned on completion of the cross attention.
import torch
from isab_pytorch import ISAB
attn = ISAB(
dim = 512,
heads = 8,
num_latents = 128,
latent_self_attend = True
)
seq = torch.randn(1, 16384, 512) # (batch, seq, dim)
mask = torch.ones((1, 16384)).bool()
out, latents = attn(seq, mask = mask) # (1, 16384, 512), (1, 128, 512)
Or you can choose not to set the number of latents, and pass in the latents yourself (some persistent latent that propagates down the transformer, as an example)
import torch
from isab_pytorch import ISAB
attn = ISAB(
dim = 512,
heads = 8
)
seq = torch.randn(1, 16384, 512) # (batch, seq, dim)
latents = torch.nn.Parameter(torch.randn(128, 512)) # some memory, passed through multiple ISABs
out, new_latents = attn(seq, latents) # (1, 16384, 512), (1, 128, 512)
Citations
@misc{lee2019set,
title = {Set Transformer: A Framework for Attention-based Permutation-Invariant Neural Networks},
author = {Juho Lee and Yoonho Lee and Jungtaek Kim and Adam R. Kosiorek and Seungjin Choi and Yee Whye Teh},
year = {2019},
eprint = {1810.00825},
archivePrefix = {arXiv},
primaryClass = {cs.LG}
}
@article{Alayrac2022Flamingo,
title = {Flamingo: a Visual Language Model for Few-Shot Learning},
author = {Jean-Baptiste Alayrac et al},
year = {2022}
}
.\lucidrains\isab-pytorch\setup.py
# Packaging script for the isab-pytorch distribution.
from setuptools import setup, find_packages

setup(
    # identity
    name = 'isab-pytorch',
    version = '0.2.3',
    license = 'MIT',

    # what gets shipped
    packages = find_packages(),

    # metadata shown on PyPI
    description = 'Induced Set Attention Block - Pytorch',
    long_description_content_type = 'text/markdown',
    author = 'Phil Wang',
    author_email = 'lucidrains@gmail.com',
    url = 'https://github.com/lucidrains/isab-pytorch',
    keywords = [
        'artificial intelligence',
        'attention mechanism'
    ],

    # runtime dependencies
    install_requires = [
        'torch',
        'einops>=0.3'
    ],
    classifiers = [
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'Topic :: Scientific/Engineering :: Artificial Intelligence',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3.6',
    ],
)
.\lucidrains\iTransformer\iTransformer\attend.py
# 导入所需的库
from functools import partial
import torch
from torch import nn, einsum, Tensor
import torch.nn.functional as F
from collections import namedtuple
from functools import wraps
from packaging import version
from einops import rearrange, repeat
# config controlling which pytorch sdp kernel backends are enabled
EfficientAttentionConfig = namedtuple('EfficientAttentionConfig', ['enable_flash', 'enable_math', 'enable_mem_efficient'])

# helper functions

def exists(val):
    """Return True when `val` is not None."""
    return val is not None

def default(val, d):
    """Return `val` if it is not None, otherwise the fallback `d`."""
    if exists(val):
        return val
    return d

def once(fn):
    """Wrap `fn` so it executes at most once; subsequent calls return None."""
    called = False

    @wraps(fn)
    def inner(x):
        nonlocal called
        if not called:
            called = True
            return fn(x)

    return inner

# print that fires a single time, used for one-shot device notices below
print_once = once(print)
# main class

class Attend(nn.Module):
    """Attention dispatcher: routes to pytorch 2.0 scaled_dot_product_attention
    (flash / mem-efficient kernels) when `flash = True`, otherwise to a plain
    einsum implementation with optional causal masking and dropout.

    Expects q, k, v shaped (batch, heads, seq, dim_head).
    """

    def __init__(
        self,
        *,
        dropout = 0.,
        heads = None,        # accepted but unused in this implementation
        scale = None,        # optional custom scale; defaults to dim_head ** -0.5 in forward
        flash = False,
        causal = False
    ):
        super().__init__()
        self.scale = scale
        self.dropout = dropout
        self.attn_dropout = nn.Dropout(dropout)
        self.causal = causal

        # flash attention requires pytorch >= 2.0 for F.scaled_dot_product_attention

        self.flash = flash
        assert not (flash and version.parse(torch.__version__) < version.parse('2.0.0')), 'in order to use flash attention, you must be using pytorch 2.0 or above'

        # determine efficient attention configs for cuda and cpu.
        # cpu allows all backends; cuda config is chosen by compute capability below.

        self.cpu_config = EfficientAttentionConfig(True, True, True)
        self.cuda_config = None

        # nothing further to configure when cuda is absent or flash is off
        if not torch.cuda.is_available() or not flash:
            return

        device_properties = torch.cuda.get_device_properties(torch.device('cuda'))
        major, minor = device_properties.major, device_properties.minor

        # A100 (sm80) and H100 (sm90) get the flash kernel exclusively;
        # other GPUs fall back to math / mem-efficient kernels
        if (major, minor) == (8, 0):
            print_once('A100 GPU detected, using flash attention if input tensor is on cuda')
            self.cuda_config = EfficientAttentionConfig(True, False, False)
        elif (major, minor) == (9, 0):
            print_once('H100 GPU detected, using flash attention')
            self.cuda_config = EfficientAttentionConfig(True, False, False)
        else:
            print_once('Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda')
            self.cuda_config = EfficientAttentionConfig(False, True, True)

    def flash_attn(
        self,
        q, k, v
    ):
        # NOTE(review): a custom `self.scale` is silently ignored on this path —
        # F.scaled_dot_product_attention applies its own default scaling
        batch, heads, q_len, _, k_len, is_cuda, device = *q.shape, k.shape[-2], q.is_cuda, q.device

        # pick the kernel config for the tensor's device

        config = self.cuda_config if is_cuda else self.cpu_config

        # dispatch to pytorch 2.0 sdpa under the selected backends
        # (torch.backends.cuda.sdp_kernel is deprecated in newer torch — TODO migrate)

        with torch.backends.cuda.sdp_kernel(**config._asdict()):
            out = F.scaled_dot_product_attention(
                q, k, v,
                is_causal = self.causal,
                dropout_p = self.dropout if self.training else 0.
            )

        return out

    def forward(
        self,
        q, k, v
    ):
        """
        einstein notation
        b - batch
        h - heads
        n, i, j - sequence length (base sequence length, source, target)
        d - feature dimension
        """
        n, heads, kv_heads, device, dtype = q.shape[-2], q.shape[1], k.shape[1], q.device, q.dtype

        scale = default(self.scale, q.shape[-1] ** -0.5)

        if self.flash:
            return self.flash_attn(q, k, v)

        # similarity

        sim = einsum(f'b h i d, b h j d -> b h i j', q, k) * scale

        # causal mask: disallow attention to future positions

        if self.causal:
            i, j, dtype = *sim.shape[-2:], sim.dtype
            mask_value = -torch.finfo(sim.dtype).max
            causal_mask = torch.ones((i, j), dtype = torch.bool, device = device).triu(j - i + 1)
            sim = sim.masked_fill(causal_mask, mask_value)

        # attention

        attn = sim.softmax(dim = -1)
        attn = attn.type(dtype)
        attn = self.attn_dropout(attn)

        # aggregate values

        out = einsum(f'b h i j, b h j d -> b h i d', attn, v)

        return out
.\lucidrains\iTransformer\iTransformer\iTransformer.py
import torch
from torch import nn, einsum, Tensor
from torch.nn import Module, ModuleList
import torch.nn.functional as F

from beartype import beartype
from beartype.typing import Optional, Union, Tuple

from einops import rearrange, reduce, repeat, pack, unpack
from einops.layers.torch import Rearrange

from iTransformer.attend import Attend
from iTransformer.revin import RevIN  # restored: RevIN is used below but its import line was dropped
# helper functions

def exists(v):
    """True when `v` is not None."""
    return v is not None

def default(v, d):
    """Return `v` unless it is None, in which case return `d`."""
    if exists(v):
        return v
    return d

def identity(t, *args, **kwargs):
    """No-op: give back `t`, ignoring any extra arguments."""
    return t

def cast_tuple(t):
    """Wrap a non-tuple value into a 1-tuple; leave tuples untouched."""
    if isinstance(t, tuple):
        return t
    return (t,)
# attention

class Attention(Module):
    """Multi-head attention with a SiLU gate applied to the aggregated values."""

    def __init__(
        self,
        dim,
        dim_head = 32,
        heads = 4,
        dropout = 0.,
        flash = True
    ):
        super().__init__()
        self.scale = dim_head ** -0.5  # NOTE(review): unused — Attend derives its own scale
        inner_dim = dim_head * heads

        # fused projection to queries / keys / values, split out per head
        self.to_qkv = nn.Sequential(
            nn.Linear(dim, inner_dim * 3, bias = False),
            Rearrange('b n (qkv h d) -> qkv b h n d', qkv = 3, h = heads)
        )

        # per-head SiLU gates applied to the attention output
        self.to_v_gates = nn.Sequential(
            nn.Linear(dim, inner_dim, bias = False),
            nn.SiLU(),
            Rearrange('b n (h d) -> b h n d', h = heads)
        )

        self.attend = Attend(flash = flash, dropout = dropout)

        # merge heads and project back to the model dimension
        self.to_out = nn.Sequential(
            Rearrange('b h n d -> b n (h d)'),
            nn.Linear(inner_dim, dim, bias = False),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        qkv = self.to_qkv(x)
        q, k, v = qkv

        attended = self.attend(q, k, v)
        gated = attended * self.to_v_gates(x)
        return self.to_out(gated)
# feedforward

class GEGLU(Module):
    """Gated GELU activation: split the last dimension in half, gate the first
    half with GELU of the second half (Shazeer, "GLU Variants Improve Transformer")."""

    def forward(self, x):
        # equivalent to the former einops rearrange-based split, but uses the
        # native Tensor.chunk — same semantics, no einops dependency here
        out, gate = x.chunk(2, dim = -1)
        return out * F.gelu(gate)
# feedforward factory

def FeedForward(dim, mult = 4, dropout = 0.):
    """Build a GEGLU feedforward block, with the hidden width sized per the
    2/3 rule from the GLU-variants paper."""
    hidden = int(dim * mult * 2 / 3)
    layers = [
        nn.Linear(dim, hidden * 2),  # doubled for the GEGLU split
        GEGLU(),
        nn.Dropout(dropout),
        nn.Linear(hidden, dim)
    ]
    return nn.Sequential(*layers)
# main class

class iTransformer(Module):
    """Inverted Transformer for multivariate time-series forecasting.

    Each variate's entire lookback window is projected into one (or more)
    tokens and attention runs across variates, not across time. Forecasts
    are produced by per-horizon linear heads.

    forward(x) returns a single tensor for one horizon, or a dict keyed by
    horizon for several. forward(x, targets) returns the summed MSE loss.
    """

    @beartype
    def __init__(
        self,
        *,
        num_variates: int,
        lookback_len: int,
        depth: int,
        dim: int,
        num_tokens_per_variate = 1,
        pred_length: Union[int, Tuple[int, ...]],
        dim_head = 32,
        heads = 4,
        attn_dropout = 0.,
        ff_mult = 4,
        ff_dropout = 0.,
        num_mem_tokens = 4,
        use_reversible_instance_norm = False,
        reversible_instance_norm_affine = False,
        flash_attn = True
    ):
        super().__init__()
        self.num_variates = num_variates
        self.lookback_len = lookback_len

        # optional learned memory tokens ("registers"), prepended during attention
        self.mem_tokens = nn.Parameter(torch.randn(num_mem_tokens, dim)) if num_mem_tokens > 0 else None

        # one or many forecast horizons
        pred_length = cast_tuple(pred_length)
        self.pred_length = pred_length

        # optional reversible instance normalization (https://openreview.net/forum?id=cGDAkQo1C0p)
        self.reversible_instance_norm = RevIN(num_variates, affine = reversible_instance_norm_affine) if use_reversible_instance_norm else None
        self.num_tokens_per_variate = num_tokens_per_variate

        # post-norm transformer layers: attention + feedforward, each followed by layernorm
        self.layers = ModuleList([])
        for _ in range(depth):
            self.layers.append(ModuleList([
                Attention(dim, dim_head = dim_head, heads = heads, dropout = attn_dropout, flash = flash_attn),
                nn.LayerNorm(dim),
                FeedForward(dim, mult = ff_mult, dropout = ff_dropout),
                nn.LayerNorm(dim)
            ]))

        # projects each variate's lookback window into num_tokens_per_variate tokens
        self.mlp_in = nn.Sequential(
            nn.Linear(lookback_len, dim * num_tokens_per_variate),
            Rearrange('b v (n d) -> b (v n) d', n = num_tokens_per_variate),
            nn.LayerNorm(dim)
        )

        # one linear head per forecast horizon
        self.pred_heads = ModuleList([])
        for one_pred_length in pred_length:
            head = nn.Sequential(
                Rearrange('b (v n) d -> b v (n d)', n = num_tokens_per_variate),
                nn.Linear(dim * num_tokens_per_variate, one_pred_length),
                Rearrange('b v n -> b n v')
            )
            self.pred_heads.append(head)

    @beartype
    def forward(
        self,
        x: Tensor,
        targets: Optional[Union[Tensor, Tuple[Tensor, ...]]] = None
    ):
        """
        einstein notation

        b - batch
        n - time
        v - variate
        t - num tokens per variate
        """
        t = self.num_tokens_per_variate
        has_mem = exists(self.mem_tokens)

        assert x.shape[1:] == (self.lookback_len, self.num_variates)

        # time last, so each variate's lookback window becomes one row
        x = rearrange(x, 'b n v -> b v n')

        if exists(self.reversible_instance_norm):
            x, reverse_fn = self.reversible_instance_norm(x)

        # project each variate's window into token(s)
        x = self.mlp_in(x)

        # prepend memory tokens
        if has_mem:
            m = repeat(self.mem_tokens, 'm d -> b m d', b = x.shape[0])
            x, mem_ps = pack([m, x], 'b * d')

        # attention and feedforward layers (post-norm)
        for attn, attn_post_norm, ff, ff_post_norm in self.layers:
            x = attn(x) + x
            x = attn_post_norm(x)
            x = ff(x) + x
            x = ff_post_norm(x)

        # splice out the memory tokens
        if has_mem:
            _, x = unpack(x, mem_ps, 'b * d')

        # undo the instance normalization on the way out
        if exists(self.reversible_instance_norm):
            x = rearrange(x, 'b (n t) d -> t b n d', t = t)
            x = reverse_fn(x)
            x = rearrange(x, 't b n d -> b (n t) d', t = t)

        # one prediction per requested horizon
        pred_list = [fn(x) for fn in self.pred_heads]

        # when targets are passed in, return the summed mse loss instead
        if exists(targets):
            targets = cast_tuple(targets)
            assert len(targets) == len(pred_list)
            assert self.training

            mse_loss = 0.
            for target, pred in zip(targets, pred_list):
                assert target.shape == pred.shape
                mse_loss = mse_loss + F.mse_loss(target, pred)

            return mse_loss

        # fix: previously `len(pred_list) == 0`, which is unreachable (pred_length
        # is always cast to a non-empty tuple) and would IndexError if ever true.
        # a single horizon returns its tensor directly; several return a dict
        if len(pred_list) == 1:
            return pred_list[0]

        pred_dict = dict(zip(self.pred_length, pred_list))
        return pred_dict
.\lucidrains\iTransformer\iTransformer\iTransformer2D.py
import torch
from torch import nn, einsum, Tensor
from torch.nn import Module, ModuleList
import torch.nn.functional as F

from beartype import beartype
from beartype.typing import Optional, Union, Tuple

from einops import rearrange, reduce, repeat, pack, unpack
from einops.layers.torch import Rearrange

from iTransformer.attend import Attend
from iTransformer.revin import RevIN

from gateloop_transformer import SimpleGateLoopLayer

from rotary_embedding_torch import RotaryEmbedding  # restored: the name is used below but its import line was dropped
# helper functions

def exists(v):
    """True when `v` is not None."""
    return v is not None

def default(v, d):
    """`v` unless it is None, else the fallback `d`."""
    if exists(v):
        return v
    return d

def pack_one(t, pattern):
    """einops `pack` specialized to a single tensor."""
    return pack([t], pattern)

def unpack_one(t, ps, pattern):
    """einops `unpack` returning just the single tensor."""
    return unpack(t, ps, pattern)[0]

def identity(t, *args, **kwargs):
    """No-op: return `t`, ignoring extra arguments."""
    return t

def divisible_by(num, den):
    """True when `num` is an exact multiple of `den`."""
    return (num % den) == 0

def cast_tuple(t):
    """Wrap non-tuples into a 1-tuple; pass tuples through."""
    if isinstance(t, tuple):
        return t
    return (t,)
# attention

class Attention(Module):
    """Multi-head attention with SiLU value gates, optional causal masking and
    optional rotary positional embedding applied to queries / keys."""

    def __init__(
        self,
        dim,
        dim_head = 32,
        heads = 4,
        dropout = 0.,
        causal = False,
        flash = True,
        rotary_emb: Optional[RotaryEmbedding] = None,
    ):
        super().__init__()
        self.scale = dim_head ** -0.5  # NOTE(review): unused — Attend derives its own scale
        inner_dim = dim_head * heads

        self.rotary_emb = rotary_emb

        # fused qkv projection, split per head
        self.to_qkv = nn.Sequential(
            nn.Linear(dim, inner_dim * 3, bias = False),
            Rearrange('b n (qkv h d) -> qkv b h n d', qkv = 3, h = heads)
        )

        # per-head SiLU gates on the attention output
        self.to_v_gates = nn.Sequential(
            nn.Linear(dim, inner_dim, bias = False),
            nn.SiLU(),
            Rearrange('b n (h d) -> b h n d', h = heads)
        )

        self.attend = Attend(flash = flash, dropout = dropout, causal = causal)

        # merge heads and project back to the model dimension
        self.to_out = nn.Sequential(
            Rearrange('b h n d -> b n (h d)'),
            nn.Linear(inner_dim, dim, bias = False),
            nn.Dropout(dropout)
        )

    @beartype
    def forward(self, x):
        qkv = self.to_qkv(x)
        q, k, v = qkv

        # rotate queries and keys when a rotary embedding was supplied
        if exists(self.rotary_emb):
            q = self.rotary_emb.rotate_queries_or_keys(q)
            k = self.rotary_emb.rotate_queries_or_keys(k)

        attended = self.attend(q, k, v)
        gated = attended * self.to_v_gates(x)
        return self.to_out(gated)
# feedforward

class GEGLU(Module):
    """Gated GELU activation: split the last dimension in half, gate the first
    half with GELU of the second half (Shazeer, "GLU Variants Improve Transformer")."""

    def forward(self, x):
        # equivalent to the former einops rearrange-based split, but uses the
        # native Tensor.chunk — same semantics, no einops dependency here
        out, gate = x.chunk(2, dim = -1)
        return out * F.gelu(gate)
# feedforward factory

def FeedForward(dim, mult = 4, dropout = 0.):
    """Build a GEGLU feedforward block, hidden width sized per the 2/3 rule."""
    hidden = int(dim * mult * 2 / 3)
    layers = [
        nn.Linear(dim, hidden * 2),  # doubled for the GEGLU split
        GEGLU(),
        nn.Dropout(dropout),
        nn.Linear(hidden, dim)
    ]
    return nn.Sequential(*layers)
# transformer block

class TransformerBlock(Module):
    """Post-norm transformer block: attention then feedforward, each followed
    by a LayerNorm, with residual connections around both."""

    def __init__(
        self,
        *,
        dim,
        causal = False,
        dim_head = 32,
        heads = 8,
        ff_mult = 4,
        flash_attn = True,
        attn_dropout = 0.,
        ff_dropout = 0.,
        rotary_emb: Optional[RotaryEmbedding] = None,
    ):
        super().__init__()
        self.rotary_emb = rotary_emb

        self.attn = Attention(flash = flash_attn, rotary_emb = rotary_emb, causal = causal, dim = dim, dim_head = dim_head, heads = heads, dropout = attn_dropout)
        self.ff = FeedForward(dim = dim, mult = ff_mult, dropout = ff_dropout)

        self.attn_norm = nn.LayerNorm(dim)
        self.ff_norm = nn.LayerNorm(dim)

    def forward(self, x, rotary_emb: Optional[RotaryEmbedding] = None):
        # NOTE(review): the `rotary_emb` argument here is accepted but never used;
        # rotary embedding is wired in via the constructor instead
        x = self.attn(x) + x
        x = self.attn_norm(x)

        x = self.ff(x) + x
        return self.ff_norm(x)
# main class

class iTransformer2D(Module):
    """2D variant of iTransformer: causal attention over per-variate time
    tokens (with rotary positions and a gateloop block) is interleaved with
    bidirectional attention across variates.

    forward(x) returns a single tensor for one horizon, or a dict keyed by
    horizon for several. forward(x, targets) returns the summed MSE loss.
    """

    @beartype
    def __init__(
        self,
        *,
        num_variates: int,
        lookback_len: int,
        num_time_tokens: int,
        depth: int,
        dim: int,
        pred_length: Union[int, Tuple[int, ...]],
        dim_head = 32,
        heads = 4,
        attn_dropout = 0.,
        ff_mult = 4,
        ff_dropout = 0.,
        num_mem_tokens = 4,
        use_reversible_instance_norm = False,
        reversible_instance_norm_affine = True,
        flash_attn = True
    ):
        super().__init__()
        assert divisible_by(lookback_len, num_time_tokens)
        assert num_time_tokens >= 2

        self.num_variates = num_variates
        self.lookback_len = lookback_len
        self.num_time_tokens = num_time_tokens

        # optional learned memory tokens ("registers")
        self.mem_tokens = nn.Parameter(torch.randn(num_mem_tokens, dim)) if num_mem_tokens > 0 else None

        pred_length = cast_tuple(pred_length)
        self.pred_length = pred_length

        self.reversible_instance_norm = RevIN(num_variates, affine = reversible_instance_norm_affine) if use_reversible_instance_norm else None

        # rotary positions for the causal attention over time tokens
        rotary_emb = RotaryEmbedding(dim_head)

        self.layers = ModuleList([])

        block_kwargs = dict(
            dim = dim,
            dim_head = dim_head,
            heads = heads,
            ff_mult = ff_mult,
            attn_dropout = attn_dropout,
            ff_dropout = ff_dropout,
            flash_attn = flash_attn
        )

        # per depth: gateloop + causal time attention + bidirectional variate attention
        for _ in range(depth):
            self.layers.append(ModuleList([
                SimpleGateLoopLayer(dim = dim),
                TransformerBlock(causal = True, rotary_emb = rotary_emb, **block_kwargs),
                TransformerBlock(causal = False, **block_kwargs)
            ]))

        # pools the whole lookback window into one variate token
        self.to_variate_token = nn.Sequential(
            nn.Linear(lookback_len, dim),
            nn.LayerNorm(dim)
        )

        time_kernel_size = lookback_len // num_time_tokens

        # left-padded strided conv patches each series into exactly
        # num_time_tokens time tokens of patch size time_kernel_size.
        # fix: the stride was missing, so the conv produced
        # lookback_len - time_kernel_size + 1 tokens, making `num_time_tokens`
        # (and both assertions above) meaningless
        self.to_time_tokens = nn.Sequential(
            Rearrange('b v n -> (b v) 1 n'),
            nn.ConstantPad1d((time_kernel_size, 0), value = 0.),
            nn.Conv1d(1, dim, time_kernel_size * 2, stride = time_kernel_size),
            Rearrange('(b v) d t -> b v t d', v = num_variates),
            nn.LayerNorm(dim)
        )

        # one linear head per forecast horizon, applied to the variate token
        self.pred_heads = ModuleList([])
        for one_pred_length in pred_length:
            head = nn.Sequential(
                nn.Linear(dim, one_pred_length),
                Rearrange('b v n -> b n v')
            )
            self.pred_heads.append(head)

    @beartype
    def forward(
        self,
        x: Tensor,
        targets: Optional[Union[Tensor, Tuple[Tensor, ...]]] = None
    ):
        """
        einstein notation

        b - batch
        n - time
        v - variate
        t - number of time tokens
        """
        has_mem = exists(self.mem_tokens)

        assert x.shape[1:] == (self.lookback_len, self.num_variates)

        # time last
        x = rearrange(x, 'b n v -> b v n')

        if exists(self.reversible_instance_norm):
            x, reverse_fn = self.reversible_instance_norm(x)

        # derive per-variate time tokens 't' and the pooled variate token 'v'
        t = self.to_time_tokens(x)
        v = self.to_variate_token(x)

        # combine into a 2d feature map over (variates, time tokens + variate token)
        x, variate_pool_token_ps = pack((t, v), 'b v * d')

        # memory tokens, broadcast across the time-token axis
        if has_mem:
            m = repeat(self.mem_tokens, 'm d -> b m t d', b = x.shape[0], t = x.shape[-2])
            x, mem_ps = pack([m, x], 'b * t d')

        # alternating gateloop / causal-time / variate attention layers
        for gateloop_block, time_attn_block, variate_attn_block in self.layers:
            x, ps = pack_one(x, '* t d')

            # gateloop block, residual
            x = gateloop_block(x) + x

            # causal attention over each variate's time tokens
            x = time_attn_block(x)

            x = unpack_one(x, ps, '* t d')

            x = rearrange(x, 'b v t d -> b t v d')
            x, ps = pack_one(x, '* v d')

            # bidirectional attention across variates (as in the inverted transformer paper)
            x = variate_attn_block(x)

            x = unpack_one(x, ps, '* v d')
            x = rearrange(x, 'b t v d -> b v t d')

        # splice out the memory tokens
        if has_mem:
            _, x = unpack(x, mem_ps, 'b * t d')

        # recover the pooled variate token
        _, v = unpack(x, variate_pool_token_ps, 'b v * d')

        # undo the instance normalization on the way out
        if exists(self.reversible_instance_norm):
            v = reverse_fn(v)

        # one prediction per requested horizon
        pred_list = [fn(v) for fn in self.pred_heads]

        # when targets are passed in, return the summed mse loss instead
        if exists(targets):
            targets = cast_tuple(targets)
            assert len(targets) == len(pred_list)
            assert self.training

            mse_loss = 0.
            for target, pred in zip(targets, pred_list):
                assert target.shape == pred.shape
                mse_loss = mse_loss + F.mse_loss(target, pred)

            return mse_loss

        # fix: previously `len(pred_list) == 0`, unreachable and an IndexError
        # if ever true. a single horizon returns its tensor directly
        if len(pred_list) == 1:
            return pred_list[0]

        pred_dict = dict(zip(self.pred_length, pred_list))
        return pred_dict
.\lucidrains\iTransformer\iTransformer\iTransformerFFT.py
import torch
from torch.fft import fft

from torch import nn, einsum, Tensor
from torch.nn import Module, ModuleList
import torch.nn.functional as F

from beartype import beartype
from beartype.typing import Optional, Union, Tuple

from einops import rearrange, reduce, repeat, pack, unpack
from einops.layers.torch import Rearrange

from iTransformer.attend import Attend
from iTransformer.revin import RevIN  # restored: RevIN is used below but its import line was dropped
# helper functions

def exists(v):
    """True when `v` is not None."""
    return v is not None

def default(v, d):
    """Return `v` unless it is None, in which case return `d`."""
    if exists(v):
        return v
    return d

def identity(t, *args, **kwargs):
    """No-op: give back `t`, ignoring any extra arguments."""
    return t

def cast_tuple(t):
    """Wrap a non-tuple value into a 1-tuple; leave tuples untouched."""
    if isinstance(t, tuple):
        return t
    return (t,)
# attention

class Attention(Module):
    """Multi-head attention with a SiLU gate applied to the aggregated values."""

    def __init__(
        self,
        dim,
        dim_head = 32,
        heads = 4,
        dropout = 0.,
        flash = True
    ):
        super().__init__()
        self.scale = dim_head ** -0.5  # NOTE(review): unused — Attend derives its own scale
        inner_dim = dim_head * heads

        # fused projection to queries / keys / values, split out per head
        self.to_qkv = nn.Sequential(
            nn.Linear(dim, inner_dim * 3, bias = False),
            Rearrange('b n (qkv h d) -> qkv b h n d', qkv = 3, h = heads)
        )

        # per-head SiLU gates applied to the attention output
        self.to_v_gates = nn.Sequential(
            nn.Linear(dim, inner_dim, bias = False),
            nn.SiLU(),
            Rearrange('b n (h d) -> b h n d', h = heads)
        )

        self.attend = Attend(flash = flash, dropout = dropout)

        # merge heads and project back to the model dimension
        self.to_out = nn.Sequential(
            Rearrange('b h n d -> b n (h d)'),
            nn.Linear(inner_dim, dim, bias = False),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        qkv = self.to_qkv(x)
        q, k, v = qkv

        attended = self.attend(q, k, v)
        gated = attended * self.to_v_gates(x)
        return self.to_out(gated)
# feedforward

class GEGLU(Module):
    """Gated GELU activation: split the last dimension in half, gate the first
    half with GELU of the second half (Shazeer, "GLU Variants Improve Transformer")."""

    def forward(self, x):
        # equivalent to the former einops rearrange-based split, but uses the
        # native Tensor.chunk — same semantics, no einops dependency here
        out, gate = x.chunk(2, dim = -1)
        return out * F.gelu(gate)
# feedforward factory

def FeedForward(dim, mult = 4, dropout = 0.):
    """Build a GEGLU feedforward block, hidden width sized per the 2/3 rule."""
    hidden = int(dim * mult * 2 / 3)
    layers = [
        nn.Linear(dim, hidden * 2),  # doubled for the GEGLU split
        GEGLU(),
        nn.Dropout(dropout),
        nn.Linear(hidden, dim)
    ]
    return nn.Sequential(*layers)
# main class

class iTransformerFFT(Module):
    """iTransformer variant that additionally attends over fourier tokens:
    the FFT of each variate's series is projected into tokens of its own,
    attended alongside the variate tokens, and spliced out at the end.

    forward(x) returns a single tensor for one horizon, or a dict keyed by
    horizon for several. forward(x, targets) returns the summed MSE loss.
    """

    @beartype
    def __init__(
        self,
        *,
        num_variates: int,
        lookback_len: int,
        depth: int,
        dim: int,
        num_tokens_per_variate = 1,
        pred_length: Union[int, Tuple[int, ...]],
        dim_head = 32,
        heads = 4,
        attn_dropout = 0.,
        ff_mult = 4,
        ff_dropout = 0.,
        num_mem_tokens = 4,
        use_reversible_instance_norm = False,
        reversible_instance_norm_affine = False,
        flash_attn = True
    ):
        super().__init__()
        self.num_variates = num_variates
        self.lookback_len = lookback_len

        # optional learned memory tokens ("registers")
        self.mem_tokens = nn.Parameter(torch.randn(num_mem_tokens, dim)) if num_mem_tokens > 0 else None

        # one or many forecast horizons
        pred_length = cast_tuple(pred_length)
        self.pred_length = pred_length

        self.reversible_instance_norm = RevIN(num_variates, affine = reversible_instance_norm_affine) if use_reversible_instance_norm else None

        # post-norm transformer layers
        self.layers = ModuleList([])
        for _ in range(depth):
            self.layers.append(ModuleList([
                Attention(dim, dim_head = dim_head, heads = heads, dropout = attn_dropout, flash = flash_attn),
                nn.LayerNorm(dim),
                FeedForward(dim, mult = ff_mult, dropout = ff_dropout),
                nn.LayerNorm(dim)
            ]))

        # projects each variate's time-domain window into token(s)
        self.mlp_in = nn.Sequential(
            nn.Linear(lookback_len, dim * num_tokens_per_variate),
            Rearrange('b v (n d) -> b (v n) d', n = num_tokens_per_variate),
            nn.LayerNorm(dim)
        )

        # projects the complex fft (viewed as real/imag pairs) into fourier token(s)
        self.fft_mlp_in = nn.Sequential(
            Rearrange('b v n c -> b v (n c)'),
            nn.Linear(lookback_len * 2, dim * num_tokens_per_variate),
            Rearrange('b v (n d) -> b (v n) d', n = num_tokens_per_variate),
            nn.LayerNorm(dim)
        )

        # one linear head per forecast horizon
        self.pred_heads = ModuleList([])
        for one_pred_length in pred_length:
            head = nn.Sequential(
                Rearrange('b (v n) d -> b v (n d)', n = num_tokens_per_variate),
                nn.Linear(dim * num_tokens_per_variate, one_pred_length),
                Rearrange('b v n -> b n v')
            )
            self.pred_heads.append(head)

    @beartype
    def forward(
        self,
        x: Tensor,
        targets: Optional[Union[Tensor, Tuple[Tensor, ...]]] = None
    ):
        """
        einstein notation

        b - batch
        n - time
        v - variate
        """
        has_mem = exists(self.mem_tokens)

        assert x.shape[1:] == (self.lookback_len, self.num_variates)

        # time last; variates play the role of the spatial dimension in attention
        x = rearrange(x, 'b n v -> b v n')

        # fft of the series, viewed as real/imag channels.
        # NOTE(review): taken before reversible instance norm is applied — confirm intended
        x_fft = fft(x)
        x_fft = torch.view_as_real(x_fft)

        if exists(self.reversible_instance_norm):
            x, reverse_fn = self.reversible_instance_norm(x)

        # project both domains into variate tokens
        x = self.mlp_in(x)
        x_fft = self.fft_mlp_in(x_fft)

        # fourier tokens sit to the left, to be spliced out later
        x, fft_ps = pack([x_fft, x], 'b * d')

        # prepend memory tokens
        if has_mem:
            m = repeat(self.mem_tokens, 'm d -> b m d', b = x.shape[0])
            x, mem_ps = pack([m, x], 'b * d')

        # attention and feedforward layers (post-norm)
        for attn, attn_post_norm, ff, ff_post_norm in self.layers:
            x = attn(x) + x
            x = attn_post_norm(x)
            x = ff(x) + x
            x = ff_post_norm(x)

        # splice out the memory tokens
        if has_mem:
            _, x = unpack(x, mem_ps, 'b * d')

        # splice out the fourier tokens
        x_fft, x = unpack(x, fft_ps, 'b * d')

        # undo the instance normalization on the way out
        if exists(self.reversible_instance_norm):
            x = reverse_fn(x)

        # one prediction per requested horizon
        pred_list = [fn(x) for fn in self.pred_heads]

        # when targets are passed in, return the summed mse loss instead
        if exists(targets):
            targets = cast_tuple(targets)
            assert len(targets) == len(pred_list)
            assert self.training

            mse_loss = 0.
            for target, pred in zip(targets, pred_list):
                assert target.shape == pred.shape
                mse_loss = mse_loss + F.mse_loss(target, pred)

            return mse_loss

        # fix: previously `len(pred_list) == 0`, which is unreachable (pred_length
        # is always cast to a non-empty tuple) and would IndexError if ever true.
        # a single horizon returns its tensor directly; several return a dict
        if len(pred_list) == 1:
            return pred_list[0]

        pred_dict = dict(zip(self.pred_length, pred_list))
        return pred_dict
.\lucidrains\iTransformer\iTransformer\revin.py
# 导入必要的库
from collections import namedtuple
import torch
from torch import nn, einsum, Tensor
from torch.nn import Module, ModuleList
import torch.nn.functional as F
# named container for the statistics RevIN can return alongside its output
Statistics = namedtuple(
    'Statistics',
    ['mean', 'variance', 'gamma', 'beta']
)
# reversible instance normalization
# proposed in https://openreview.net/forum?id=cGDAkQo1C0p

class RevIN(Module):
    """Reversible instance normalization over the last (time) dimension.

    Normalizes each variate's series to zero mean / unit variance, applies a
    learned (or frozen, when `affine = False`) per-variate rescale, and hands
    back a closure that undoes the whole transform on the model's output.
    Expects input shaped (batch, num_variates, time).
    """

    def __init__(
        self,
        num_variates,
        affine = True,
        eps = 1e-5
    ):
        super().__init__()
        self.eps = eps
        self.num_variates = num_variates

        # affine parameters; frozen (requires_grad = False) when affine is off
        self.gamma = nn.Parameter(torch.ones(num_variates, 1), requires_grad = affine)
        self.beta = nn.Parameter(torch.zeros(num_variates, 1), requires_grad = affine)

    def forward(self, x, return_statistics = False):
        assert x.shape[1] == self.num_variates

        # per-instance, per-variate statistics along the time axis
        mean = torch.mean(x, dim = -1, keepdim = True)
        var = torch.var(x, dim = -1, unbiased = False, keepdim = True)
        var_rsqrt = var.clamp(min = self.eps).rsqrt()

        # normalize, then rescale with the affine parameters
        normalized = (x - mean) * var_rsqrt
        rescaled = normalized * self.gamma + self.beta

        def reverse_fn(scaled_output):
            # keep gamma away from zero (sign-preserving) so the division is stable
            clamped_gamma = torch.sign(self.gamma) * self.gamma.abs().clamp(min = self.eps)
            unscaled = (scaled_output - self.beta) / clamped_gamma
            return unscaled * var.sqrt() + mean

        if not return_statistics:
            return rescaled, reverse_fn

        return rescaled, reverse_fn, Statistics(mean, var, self.gamma, self.beta)
# quick sanity check: normalizing then inverting should reproduce the input

if __name__ == '__main__':
    rev_in = RevIN(512)

    # random input shaped (batch, variates, time)
    x = torch.randn(2, 512, 1024)

    # normalize, also fetching the statistics tuple
    normalized, reverse_fn, statistics = rev_in(x, return_statistics = True)

    # invert the normalization
    out = reverse_fn(normalized)

    # round trip must reconstruct the original input
    assert torch.allclose(x, out)
.\lucidrains\iTransformer\iTransformer\__init__.py
# 从 iTransformer 模块中导入 iTransformer 和 RevIN 类
# 从 iTransformer 模块中导入 iTransformer2D 类
# 从 iTransformer 模块中导入 iTransformerFFT 类
from iTransformer.iTransformer import (
iTransformer,
RevIN
)
from iTransformer.iTransformer2D import iTransformer2D
from iTransformer.iTransformerFFT import iTransformerFFT

iTransformer
Implementation of iTransformer - SOTA Time Series Forecasting using Attention networks, out of Tsinghua / Ant group
All that remains is tabular data (xgboost still champion here) before one can truly declare "Attention is all you need"
In before Apple gets the authors to change the name.
The official implementation has been released here!
Appreciation
-
StabilityAI and 🤗 Huggingface for the generous sponsorship, as well as my other sponsors, for affording me the independence to open source current artificial intelligence techniques.
-
Greg DeVos for sharing experiments he ran on
iTransformer and some of the improvised variants
Install
$ pip install iTransformer
Usage
import torch
from iTransformer import iTransformer
# using solar energy settings
model = iTransformer(
num_variates = 137,
lookback_len = 96, # or the lookback length in the paper
dim = 256, # model dimensions
depth = 6, # depth
heads = 8, # attention heads
dim_head = 64, # head dimension
pred_length = (12, 24, 36, 48), # can be one prediction, or many
num_tokens_per_variate = 1, # experimental setting that projects each variate to more than one token. the idea is that the network can learn to divide up into time tokens for more granular attention across time. thanks to flash attention, you should be able to accommodate long sequence lengths just fine
use_reversible_instance_norm = True # use reversible instance normalization, proposed here https://openreview.net/forum?id=cGDAkQo1C0p . may be redundant given the layernorms within iTransformer (and whatever else attention learns emergently on the first layer, prior to the first layernorm). if i come across some time, i'll gather up all the statistics across variates, project them, and condition the transformer a bit further. that makes more sense
)
time_series = torch.randn(2, 96, 137) # (batch, lookback len, variates)
preds = model(time_series)
# preds -> Dict[int, Tensor[batch, pred_length, variate]]
# -> (12: (2, 12, 137), 24: (2, 24, 137), 36: (2, 36, 137), 48: (2, 48, 137))
For an improvised version that does granular attention across time tokens (as well as the original per-variate tokens), just import iTransformer2D and set the additional num_time_tokens
Update: It works! Thanks goes out to Greg DeVos for running the experiment here!
Update 2: Got an email. Yes you are free to write a paper on this, if the architecture holds up for your problem. I have no skin in the game
import torch
from iTransformer import iTransformer2D
# using solar energy settings
model = iTransformer2D(
num_variates = 137,
num_time_tokens = 16, # number of time tokens (patch size will be (look back length // num_time_tokens))
lookback_len = 96, # the lookback length in the paper
dim = 256, # model dimensions
depth = 6, # depth
heads = 8, # attention heads
dim_head = 64, # head dimension
pred_length = (12, 24, 36, 48), # can be one prediction, or many
use_reversible_instance_norm = True # use reversible instance normalization
)
time_series = torch.randn(2, 96, 137) # (batch, lookback len, variates)
preds = model(time_series)
# preds -> Dict[int, Tensor[batch, pred_length, variate]]
# -> (12: (2, 12, 137), 24: (2, 24, 137), 36: (2, 36, 137), 48: (2, 48, 137))
Experimental
iTransformer with fourier tokens
An iTransformer, but also with fourier tokens (the FFT of the time series is projected into tokens of their own and attended alongside the variate tokens, then spliced out at the end)
import torch
from iTransformer import iTransformerFFT
# using solar energy settings
model = iTransformerFFT(
num_variates = 137,
lookback_len = 96, # or the lookback length in the paper
dim = 256, # model dimensions
depth = 6, # depth
heads = 8, # attention heads
dim_head = 64, # head dimension
pred_length = (12, 24, 36, 48), # can be one prediction, or many
num_tokens_per_variate = 1, # experimental setting that projects each variate to more than one token. the idea is that the network can learn to divide up into time tokens for more granular attention across time. thanks to flash attention, you should be able to accommodate long sequence lengths just fine
use_reversible_instance_norm = True # use reversible instance normalization, proposed here https://openreview.net/forum?id=cGDAkQo1C0p . may be redundant given the layernorms within iTransformer (and whatever else attention learns emergently on the first layer, prior to the first layernorm). if i come across some time, i'll gather up all the statistics across variates, project them, and condition the transformer a bit further. that makes more sense
)
time_series = torch.randn(2, 96, 137) # (batch, lookback len, variates)
preds = model(time_series)
# preds -> Dict[int, Tensor[batch, pred_length, variate]]
# -> (12: (2, 12, 137), 24: (2, 24, 137), 36: (2, 36, 137), 48: (2, 48, 137))
Todo
- beef up the transformer with latest findings
- improvise a 2d version across both variates and time
- improvise a version that includes fft tokens
- improvise a variant that uses adaptive normalization conditioned on statistics across all variates
Citation
@misc{liu2023itransformer,
title = {iTransformer: Inverted Transformers Are Effective for Time Series Forecasting},
author = {Yong Liu and Tengge Hu and Haoran Zhang and Haixu Wu and Shiyu Wang and Lintao Ma and Mingsheng Long},
year = {2023},
eprint = {2310.06625},
archivePrefix = {arXiv},
primaryClass = {cs.LG}
}
@misc{shazeer2020glu,
title = {GLU Variants Improve Transformer},
author = {Noam Shazeer},
year = {2020},
url = {https://arxiv.org/abs/2002.05202}
}
@misc{burtsev2020memory,
title = {Memory Transformer},
author = {Mikhail S. Burtsev and Grigory V. Sapunov},
year = {2020},
eprint = {2006.11527},
archivePrefix = {arXiv},
primaryClass = {cs.CL}
}
@inproceedings{Darcet2023VisionTN,
title = {Vision Transformers Need Registers},
author = {Timoth'ee Darcet and Maxime Oquab and Julien Mairal and Piotr Bojanowski},
year = {2023},
url = {https://api.semanticscholar.org/CorpusID:263134283}
}
@inproceedings{dao2022flashattention,
title = {Flash{A}ttention: Fast and Memory-Efficient Exact Attention with {IO}-Awareness},
author = {Dao, Tri and Fu, Daniel Y. and Ermon, Stefano and Rudra, Atri and R{\'e}, Christopher},
booktitle = {Advances in Neural Information Processing Systems},
year = {2022}
}
@Article{AlphaFold2021,
author = {Jumper, John and Evans, Richard and Pritzel, Alexander and Green, Tim and Figurnov, Michael and Ronneberger, Olaf and Tunyasuvunakool, Kathryn and Bates, Russ and {\v{Z}}{\'\i}dek, Augustin and Potapenko, Anna and Bridgland, Alex and Meyer, Clemens and Kohl, Simon A A and Ballard, Andrew J and Cowie, Andrew and Romera-Paredes, Bernardino and Nikolov, Stanislav and Jain, Rishub and Adler, Jonas and Back, Trevor and Petersen, Stig and Reiman, David and Clancy, Ellen and Zielinski, Michal and Steinegger, Martin and Pacholska, Michalina and Berghammer, Tamas and Bodenstein, Sebastian and Silver, David and Vinyals, Oriol and Senior, Andrew W and Kavukcuoglu, Koray and Kohli, Pushmeet and Hassabis, Demis},
journal = {Nature},
title = {Highly accurate protein structure prediction with {AlphaFold}},
year = {2021},
doi = {10.1038/s41586-021-03819-2},
note = {(Accelerated article preview)},
}
@inproceedings{kim2022reversible,
title = {Reversible Instance Normalization for Accurate Time-Series Forecasting against Distribution Shift},
author = {Taesung Kim and Jinhee Kim and Yunwon Tae and Cheonbok Park and Jang-Ho Choi and Jaegul Choo},
booktitle = {International Conference on Learning Representations},
year = {2022},
url = {https://openreview.net/forum?id=cGDAkQo1C0p}
}
@inproceedings{Katsch2023GateLoopFD,
title = {GateLoop: Fully Data-Controlled Linear Recurrence for Sequence Modeling},
author = {Tobias Katsch},
year = {2023},
url = {https://api.semanticscholar.org/CorpusID:265018962}
}
.\lucidrains\iTransformer\setup.py
# Import setup utilities and package discovery
from setuptools import setup, find_packages

# Package metadata for distribution on PyPI
setup(
    name = 'iTransformer',  # package name
    packages = find_packages(exclude=[]),  # discover all packages
    version = '0.5.5',  # version number
    license='MIT',  # license
    # fix: match the cited paper title — "Inverted Transformers Are Effective ..." (plural)
    description = 'iTransformer - Inverted Transformers Are Effective for Time Series Forecasting',
    author = 'Phil Wang',  # author
    author_email = 'lucidrains@gmail.com',  # author email
    long_description_content_type = 'text/markdown',  # long description content type
    url = 'https://github.com/lucidrains/iTransformer',  # project homepage
    keywords = [  # keyword list
        'artificial intelligence',
        'deep learning',
        'transformers',
        'attention mechanism',
        'time series forecasting'
    ],
    install_requires=[  # runtime dependencies
        'beartype',
        'einops>=0.7.0',
        'gateloop-transformer>=0.2.3',
        'rotary-embedding-torch',
        'torch>=2.1',
    ],
    classifiers=[  # PyPI classifiers
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'Topic :: Scientific/Engineering :: Artificial Intelligence',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3.6',
    ],
)
.\lucidrains\ITTR-pytorch\ITTR_pytorch\ITTR_pytorch.py
import torch
import torch.nn.functional as F
from torch import nn, einsum
from einops import rearrange, reduce, repeat
# helper functions
def exists(val):
    """Report whether `val` was actually provided (i.e. is not None)."""
    return not (val is None)
def default(val, d):
    """Return `val` when it was supplied, otherwise fall back to `d`."""
    return d if val is None else val
def l2norm(t):
    """Unit-normalize `t` along its last dimension using the L2 norm."""
    return F.normalize(t, p = 2, dim = -1)
# helper classes
class Residual(nn.Module):
    """Wrap `fn` in a skip connection: forward returns fn(x) + x."""

    def __init__(self, fn):
        super().__init__()
        self.fn = fn

    def forward(self, x, **kwargs):
        branch = self.fn(x, **kwargs)
        return branch + x
class ChanLayerNorm(nn.Module):
    """Layer normalization over the channel dimension of a (b, c, h, w) map,
    with a learned per-channel scale `g` and shift `b`."""

    def __init__(self, dim, eps = 1e-5):
        super().__init__()
        self.eps = eps  # numerical stabilizer added to the variance
        self.g = nn.Parameter(torch.ones(1, dim, 1, 1))
        self.b = nn.Parameter(torch.zeros(1, dim, 1, 1))

    def forward(self, x):
        mean = x.mean(dim = 1, keepdim = True)
        # biased variance, matching torch.var(..., unbiased = False)
        var = x.var(dim = 1, unbiased = False, keepdim = True)
        normed = (x - mean) * (var + self.eps).rsqrt()
        return normed * self.g + self.b
# main classes
class HPB(nn.Module):
    """ Hybrid Perception Block

    Runs a dual-pruned attention branch and a depthwise-conv branch in
    parallel, fuses them with a 1x1 conv plus a residual, then applies a
    convolutional feedforward.
    """

    def __init__(
        self,
        dim,
        dim_head = 32,
        heads = 8,
        ff_mult = 4,
        attn_height_top_k = 16,
        attn_width_top_k = 16,
        attn_dropout = 0.,
        ff_dropout = 0.
    ):
        super().__init__()

        # attention branch: dual-pruned self-attention
        self.attn = DPSA(
            dim = dim,
            heads = heads,
            dim_head = dim_head,
            height_top_k = attn_height_top_k,
            width_top_k = attn_width_top_k,
            dropout = attn_dropout
        )

        # parallel convolutional branch (depthwise 3x3)
        self.dwconv = nn.Conv2d(dim, dim, 3, padding = 1, groups = dim)
        # 1x1 conv fusing the concatenated branches back to `dim` channels
        self.attn_parallel_combine_out = nn.Conv2d(dim * 2, dim, 1)

        ff_inner_dim = dim * ff_mult

        self.ff = nn.Sequential(
            nn.Conv2d(dim, ff_inner_dim, 1),
            nn.InstanceNorm2d(ff_inner_dim),
            nn.GELU(),
            nn.Dropout(ff_dropout),
            Residual(nn.Sequential(
                nn.Conv2d(ff_inner_dim, ff_inner_dim, 3, padding = 1, groups = ff_inner_dim),
                nn.InstanceNorm2d(ff_inner_dim),
                nn.GELU(),
                nn.Dropout(ff_dropout)
            )),
            nn.Conv2d(ff_inner_dim, dim, 1),
            # fix: the tensor here has `dim` channels (output of the conv above),
            # not ff_inner_dim. Harmless with affine=False, but would crash if
            # affine normalization were ever enabled.
            nn.InstanceNorm2d(dim)
        )

    def forward(self, x):
        attn_branch_out = self.attn(x)
        conv_branch_out = self.dwconv(x)

        # fuse both branches, keep a residual around the fusion
        concatted_branches = torch.cat((attn_branch_out, conv_branch_out), dim = 1)
        attn_out = self.attn_parallel_combine_out(concatted_branches) + x

        return self.ff(attn_out)
class DPSA(nn.Module):
    """ Dual-pruned Self-attention Block

    Computes cosine-similarity attention where keys/values are pruned to the
    top-k rows and top-k columns (ranked by a summed-query probe) before the
    full attention is evaluated, reducing the j dimension of the attention map.
    """

    def __init__(
        self,
        dim,
        height_top_k = 16,
        width_top_k = 16,
        dim_head = 32,
        heads = 8,
        dropout = 0.
    ):
        super().__init__()
        self.heads = heads
        self.dim_head = dim_head
        # NOTE(review): `scale` is defined but never applied below — queries and
        # keys are l2-normalized instead; confirm whether a temperature was intended
        self.scale = dim_head ** -0.5

        inner_dim = heads * dim_head
        self.norm = ChanLayerNorm(dim)

        self.to_qkv = nn.Conv2d(dim, inner_dim * 3, 1, bias = False)

        # number of rows / columns of keys and values kept after pruning
        self.height_top_k = height_top_k
        self.width_top_k = width_top_k

        self.dropout = nn.Dropout(dropout)
        self.to_out = nn.Conv2d(inner_dim, dim, 1)

    def forward(self, x):
        b, c, h, w = x.shape

        x = self.norm(x)

        # project to queries, keys, values
        q, k, v = self.to_qkv(x).chunk(3, dim = 1)

        # fold heads into batch and move features last: (b*heads, height, width, dim_head)
        # fix: the previous pattern '(b h) c x y' left the channel axis where every
        # downstream 'b h w d' reduce/repeat expected height/width, making the
        # gather index shapes inconsistent with k and v
        q, k, v = map(lambda t: rearrange(t, 'b (h d) x y -> (b h) x y d', h = self.heads), (q, k, v))

        # cosine-similarity attention: unit-normalize queries and keys
        q, k = map(l2norm, (q, k))

        # pruning is only needed when top-k is smaller than the actual extent
        need_height_select_and_rank = self.height_top_k < h
        need_width_select_and_rank = self.width_top_k < w

        # select and rank keys / values, probing with the queries summed over
        # both spatial dimensions; keys are reduced along rows and columns
        if need_width_select_and_rank or need_height_select_and_rank:
            q_probe = reduce(q, 'b h w d -> b d', 'sum')

            # rank rows by probe-key affinity, keep the top height_top_k
            if need_height_select_and_rank:
                k_height = reduce(k, 'b h w d -> b h d', 'sum')
                top_h_indices = einsum('b d, b h d -> b h', q_probe, k_height).topk(k = self.height_top_k, dim = -1).indices
                top_h_indices = repeat(top_h_indices, 'b h -> b h w d', d = self.dim_head, w = k.shape[-2])
                k, v = map(lambda t: t.gather(1, top_h_indices), (k, v))  # prune along height first

            # then rank columns and keep the top width_top_k
            if need_width_select_and_rank:
                k_width = reduce(k, 'b h w d -> b w d', 'sum')
                top_w_indices = einsum('b d, b w d -> b w', q_probe, k_width).topk(k = self.width_top_k, dim = -1).indices
                top_w_indices = repeat(top_w_indices, 'b w -> b h w d', d = self.dim_head, h = k.shape[1])
                k, v = map(lambda t: t.gather(2, top_w_indices), (k, v))  # then prune along width

        # flatten spatial dims; queries remain unpruned (full h*w of them)
        q, k, v = map(lambda t: rearrange(t, 'b ... d -> b (...) d'), (q, k, v))

        # cosine similarities
        sim = einsum('b i d, b j d -> b i j', q, k)

        # attention
        attn = sim.softmax(dim = -1)
        attn = self.dropout(attn)

        # aggregate values
        out = einsum('b i j, b j d -> b i d', attn, v)

        # unfold heads and restore the (b, c, h, w) layout
        out = rearrange(out, '(b h) (x y) d -> b (h d) x y', x = h, y = w, h = self.heads)

        return self.to_out(out)
.\lucidrains\ITTR-pytorch\ITTR_pytorch\__init__.py
# 从 ITTR_pytorch.ITTR_pytorch 模块中导入 HPB 和 DPSA 类
from ITTR_pytorch.ITTR_pytorch import HPB, DPSA

ITTR - Pytorch
Implementation of the Hybrid Perception Block (HPB) and Dual-Pruned Self-Attention (DPSA) block from the ITTR paper for Image to Image Translation using Transformers.
Install
$ pip install ITTR-pytorch
Usage
They had 9 blocks of Hybrid Perception Block (HPB) in the paper
import torch
from ITTR_pytorch import HPB
block = HPB(
dim = 512, # dimension
dim_head = 32, # dimension per attention head
heads = 8, # number of attention heads
attn_height_top_k = 16, # number of top indices to select along height, for the attention pruning
attn_width_top_k = 16, # number of top indices to select along width, for the attention pruning
attn_dropout = 0., # attn dropout
ff_mult = 4, # expansion factor of feedforward
ff_dropout = 0. # feedforward dropout
)
fmap = torch.randn(1, 512, 32, 32)
out = block(fmap) # (1, 512, 32, 32)
You can also use the dual-pruned self-attention as so
import torch
from ITTR_pytorch import DPSA
attn = DPSA(
dim = 512, # dimension
dim_head = 32, # dimension per attention head
heads = 8, # number of attention heads
height_top_k = 48, # number of top indices to select along height, for the attention pruning
width_top_k = 48, # number of top indices to select along width, for the attention pruning
dropout = 0. # attn dropout
)
fmap = torch.randn(1, 512, 32, 32)
out = attn(fmap) # (1, 512, 32, 32)
Citations
@inproceedings{Zheng2022ITTRUI,
title = {ITTR: Unpaired Image-to-Image Translation with Transformers},
author = {Wanfeng Zheng and Qiang Li and Guoxin Zhang and Pengfei Wan and Zhongyuan Wang},
year = {2022}
}
.\lucidrains\ITTR-pytorch\setup.py
# Import setup utilities and package discovery
from setuptools import setup, find_packages

# Package metadata for distribution on PyPI
setup(
    name = 'ITTR-pytorch',  # package name
    packages = find_packages(exclude=[]),  # discover all packages
    version = '0.0.4',  # version number
    license='MIT',  # license
    description = 'ITTR - Implementation of the Hybrid Perception Block and Dual-Pruned Self-Attention block',  # short description
    author = 'Phil Wang',  # author
    author_email = 'lucidrains@gmail.com',  # author email
    url = 'https://github.com/lucidrains/ITTR-pytorch',  # project homepage
    keywords = [  # keyword list
        'artificial intelligence',
        'deep learning',
        'transformers',
        'attention mechanism'
    ],
    install_requires=[  # runtime dependencies
        'einops>=0.4',
        'torch>=1.6',
    ],
    classifiers=[  # PyPI classifiers
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'Topic :: Scientific/Engineering :: Artificial Intelligence',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3.6',
    ],
)
.\lucidrains\jax2torch\jax2torch\jax2torch.py
# import required libraries
import torch
from torch.utils import dlpack as torch_dlpack
import jax
from jax import dlpack as jax_dlpack
import jax.numpy as jnp
from jax.tree_util import tree_map
from inspect import signature
from functools import wraps
# convert a JAX array to a PyTorch tensor
def j2t(x_jax):
    """Convert a JAX array into a PyTorch tensor via the DLPack protocol
    (shares the underlying buffer rather than copying)."""
    capsule = jax_dlpack.to_dlpack(x_jax)
    return torch_dlpack.from_dlpack(capsule)
# convert a PyTorch tensor to a JAX array
def t2j(x_torch):
    """Convert a PyTorch tensor into a JAX array via the DLPack protocol.

    The tensor is made contiguous first, since DLPack export requires a
    contiguous buffer.
    """
    contiguous = x_torch.contiguous()
    return jax_dlpack.from_dlpack(torch_dlpack.to_dlpack(contiguous))
# convert every PyTorch tensor inside a pytree to a JAX array
def tree_t2j(x_torch):
    """Map over a pytree, converting each torch.Tensor leaf to a JAX array
    (via DLPack) and passing every other leaf through unchanged."""
    def leaf_to_jax(leaf):
        if not isinstance(leaf, torch.Tensor):
            return leaf
        # inlined tensor -> JAX conversion (contiguity required by DLPack)
        return jax_dlpack.from_dlpack(torch_dlpack.to_dlpack(leaf.contiguous()))
    return tree_map(leaf_to_jax, x_torch)
# convert every JAX array inside a pytree to a PyTorch tensor
def tree_j2t(x_jax):
    """Map over a pytree, converting each jnp.ndarray leaf to a torch tensor
    (via DLPack) and passing every other leaf through unchanged."""
    def leaf_to_torch(leaf):
        if not isinstance(leaf, jnp.ndarray):
            return leaf
        # inlined JAX -> tensor conversion
        return torch_dlpack.from_dlpack(jax_dlpack.to_dlpack(leaf))
    return tree_map(leaf_to_torch, x_jax)
# decorator that turns a JAX function into a PyTorch-callable function
def jax2torch(fn):
    """Decorator: wrap a JAX function `fn` so it can be called with torch
    tensors and participate in torch autograd. Forward runs `fn` under
    jax.vjp; backward pulls incoming torch gradients through the captured
    vjp closure and converts them back to torch tensors."""
    @wraps(fn)
    def inner(*args, **kwargs):
        # a fresh autograd.Function per call; the vjp closure is stashed on ctx
        class JaxFun(torch.autograd.Function):
            @staticmethod
            def forward(ctx, *args):
                # convert all torch tensor args to JAX arrays (non-tensors pass through)
                args = tree_t2j(args)
                # evaluate fn and capture its vjp (backward) closure for backward()
                y_, ctx.fun_vjp = jax.vjp(fn, *args)
                # convert the result pytree back to torch tensors
                return tree_j2t(y_)

            @staticmethod
            def backward(ctx, *grad_args):
                # convert incoming torch gradients to JAX arrays
                grad_args = tree_t2j(grad_args) if len(grad_args) > 1 else t2j(grad_args[0])
                # pull gradients back through the JAX computation
                grads = ctx.fun_vjp(grad_args)
                # leaves that are not JAX arrays get no gradient (None)
                grads = tuple(map(lambda t: t if isinstance(t, jnp.ndarray) else None, grads))
                return tree_j2t(grads)

        # bind kwargs into positional order so JaxFun.apply receives only positionals
        # NOTE(review): keyword arguments are thereby flattened to positional args of
        # fn — confirm this is acceptable for functions with keyword-only parameters
        sig = signature(fn)
        bound = sig.bind(*args, **kwargs)
        bound.apply_defaults()
        return JaxFun.apply(*bound.arguments.values())
    return inner
.\lucidrains\jax2torch\jax2torch\__init__.py
# 从 jax2torch.jax2torch 模块中导入 jax2torch 函数
from jax2torch.jax2torch import jax2torch
jax2torch
Use Jax functions in Pytorch with DLPack, as outlined in a gist by @mattjj. The repository was made for the purposes of making this differentiable alignment work interoperable with Pytorch projects.
Install
$ pip install jax2torch
Memory management
By default, Jax pre-allocates 90% of VRAM, which leaves Pytorch with very little left over. To prevent this behavior, set the XLA_PYTHON_CLIENT_PREALLOCATE environmental variable to false before running any Jax code:
import os
os.environ["XLA_PYTHON_CLIENT_PREALLOCATE"] = "false"
Usage
import jax
import torch
from jax2torch import jax2torch
import os
os.environ["XLA_PYTHON_CLIENT_PREALLOCATE"] = "false"
# Jax function
@jax.jit
def jax_pow(x, y = 2):
return x ** y
# convert to Torch function
torch_pow = jax2torch(jax_pow)
# run it on Torch data!
x = torch.tensor([1., 2., 3.])
y = torch_pow(x, y = 3)
print(y) # tensor([1., 8., 27.])
# And differentiate!
x = torch.tensor([2., 3.], requires_grad = True)
y = torch.sum(torch_pow(x, y = 3))
y.backward()
print(x.grad) # tensor([12., 27.])
.\lucidrains\jax2torch\setup.py
# Import setup utilities and package discovery
from setuptools import setup, find_packages

# Package metadata for distribution on PyPI
setup(
    # package name
    name = 'jax2torch',
    # discover all packages, excluding none
    packages = find_packages(exclude=[]),
    # version number
    version = '0.0.7',
    # license type
    license='MIT',
    # short description
    description = 'Jax 2 Torch',
    # author
    author = 'Phil Wang',
    # author email
    author_email = 'lucidrains@gmail.com',
    # project homepage
    url = 'https://github.com/lucidrains/jax2torch',
    # keyword list
    keywords = [
        'jax',
        'pytorch'
    ],
    # runtime dependencies
    install_requires=[
        'torch>=1.6',
        'jax>=0.2.20'
    ],
    # PyPI classifiers
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'Topic :: Scientific/Engineering :: Artificial Intelligence',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3.6',
    ],
)
Kalman Filtering Attention (wip)
Implementation of the Kalman Filtering Attention proposed in Kalman Filtering Attention for User Behavior Modeling in CTR Prediction
Will use this repository as guidance. Looks like the core of Kalman filtering is just 5 lines of code.
Citations
@inproceedings{NEURIPS2020_68ce199e,
author = {Liu, Hu and LU, Jing and Zhao, Xiwei and Xu, Sulong and Peng, Hao and Liu, Yutong and Zhang, Zehua and Li, Jian and Jin, Junsheng and Bao, Yongjun and Yan, Weipeng},
booktitle = {Advances in Neural Information Processing Systems},
editor = {H. Larochelle and M. Ranzato and R. Hadsell and M.F. Balcan and H. Lin},
pages = {9228--9238},
publisher = {Curran Associates, Inc.},
title = {Kalman Filtering Attention for User Behavior Modeling in CTR Prediction},
url = {https://proceedings.neurips.cc/paper_files/paper/2020/file/68ce199ec2c5517597ce0a4d89620f55-Paper.pdf},
volume = {33},
year = {2020}
}
.\lucidrains\kronecker-attention-pytorch\kronecker_attention_pytorch\kronecker_attention_pytorch.py
import torch
from torch import nn, einsum
from einops import rearrange, repeat
import torch.nn.functional as F
class KroneckerSelfAttention(nn.Module):
    """Self-attention over the Kronecker decomposition of a 2d feature map:
    attends over the concatenated row-means and column-means (length h + w)
    instead of all h*w positions, then reconstructs a 2d map as the outer
    (broadcast) sum of the row part and the column part."""

    def __init__(self, dim, heads, dim_heads = 32):
        super().__init__()
        inner_dim = heads * dim_heads
        self.heads = heads
        # pointwise conv producing queries, keys and values in one projection
        self.to_qkv = nn.Conv1d(dim, inner_dim * 3, 1, bias = False)
        # pointwise conv projecting the attention output back to `dim` channels
        self.to_out = nn.Conv1d(inner_dim, dim, 1)

    def forward(self, x):
        height = x.shape[-2]

        # collapse the map into its row means and column means, concatenated
        # along the length axis: (b, c, h + w)
        pooled = torch.cat((x.mean(dim = -1), x.mean(dim = -2)), dim = -1)

        # project and split into per-head queries, keys, values
        q, k, v = rearrange(self.to_qkv(pooled), 'b (qkv h d) n -> qkv b h d n', h = self.heads, qkv = 3)

        # scaled-dot-product-style attention over the pooled sequence
        # NOTE(review): no 1/sqrt(d) scaling is applied — presumably intentional; confirm
        sim = einsum('bhdi,bhdj->bhij', q, k)
        attn = sim.softmax(dim = -1)
        agg = einsum('bhij,bhdj->bhdi', attn, v)

        # merge heads and project out
        agg = rearrange(agg, 'b h d n -> b (h d) n')
        agg = self.to_out(agg)

        # outer sum: broadcast the first `height` positions (rows, as a column
        # vector) against the remaining positions (columns, as a row vector)
        rows = rearrange(agg[..., :height], 'b c (n 1) -> b c n 1')
        cols = rearrange(agg[..., height:], 'b c (1 n) -> b c 1 n')
        return rows + cols