Lucidrains-系列项目源码解析-五十一-

75 阅读13分钟

Lucidrains 系列项目源码解析(五十一)

.\lucidrains\linear-attention-transformer\setup.py

# Packaging helpers from setuptools.
from setuptools import setup, find_packages

# Package metadata for the linear-attention-transformer distribution.
setup(
  name = 'linear_attention_transformer',  # distribution name
  packages = find_packages(exclude=['examples']),  # include every package except examples
  version = '0.19.1',  # release version
  license='MIT',  # license
  description = 'Linear Attention Transformer',  # short description
  author = 'Phil Wang',  # author
  author_email = 'lucidrains@gmail.com',  # author contact
  url = 'https://github.com/lucidrains/linear-attention-transformer',  # project homepage
  keywords = ['transformers', 'attention', 'artificial intelligence'],  # search keywords
  install_requires=[
      'axial-positional-embedding',  # runtime dependencies
      'einops',
      'linformer>=0.1.0',
      'local-attention',
      'product-key-memory>=0.1.5',
      'torch',
  ],
  classifiers=[
      'Development Status :: 4 - Beta',  # PyPI trove classifiers
      'Intended Audience :: Developers',
      'Topic :: Scientific/Engineering :: Artificial Intelligence',
      'License :: OSI Approved :: MIT License',
      'Programming Language :: Python :: 3.6',
  ],
)

.\lucidrains\linformer\linformer\linformer.py

import math
import torch
from torch import nn
import torch.nn.functional as F

from linformer.reversible import ReversibleSequence, SequentialSequence

# helper functions

# Fall back to `default_val` when `val` is None.
def default(val, default_val):
    if val is None:
        return default_val
    return val

# Uniformly initialize `tensor` in-place in (-1/sqrt(d), 1/sqrt(d)),
# where d is the size of its last dimension. Returns the same tensor.
def init_(tensor):
    last_dim = tensor.shape[-1]
    bound = 1 / math.sqrt(last_dim)
    tensor.uniform_(-bound, bound)
    return tensor

# helper classes

class Residual(nn.Module):
    """Wraps `fn` with a skip connection: forward(x) = x + fn(x)."""
    def __init__(self, fn):
        super().__init__()
        self.fn = fn

    def forward(self, x):
        out = self.fn(x)
        return out + x

class PreNorm(nn.Module):
    """Applies LayerNorm over the last `dim` features before calling `fn`."""
    def __init__(self, dim, fn):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x):
        return self.fn(self.norm(x))

# Tanh approximation of the GELU activation (Hendrycks & Gimpel, 2016).
# FIX: the original return line was missing one closing parenthesis,
# which made this file a SyntaxError.
class GELU_(nn.Module):
    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))

# Prefer the native nn.GELU when this PyTorch version provides it,
# otherwise fall back to the approximation above.
GELU = nn.GELU if hasattr(nn, 'GELU') else GELU_

class FeedForward(nn.Module):
    """Position-wise feed-forward layer, optionally gated (GLU variant).

    dim -> dim * mult -> dim, with dropout applied after the activation.
    When `glu` is True, the first projection produces both the hidden
    activations and a multiplicative gate.
    """

    def __init__(self, dim, mult = 4, dropout = 0., activation = None, glu = False):
        super().__init__()
        act_klass = default(activation, GELU)

        self.glu = glu
        inner_dim = dim * mult
        self.w1 = nn.Linear(dim, inner_dim * 2 if glu else inner_dim)
        self.act = act_klass()
        self.dropout = nn.Dropout(dropout)
        self.w2 = nn.Linear(inner_dim, dim)

    def forward(self, x, **kwargs):
        if self.glu:
            hidden, gates = self.w1(x).chunk(2, dim = -1)
            hidden = self.act(hidden) * gates
        else:
            hidden = self.act(self.w1(x))

        return self.w2(self.dropout(hidden))

# Linformer self-attention: learned projections compress the key/value
# sequence from length seq_len down to k, giving O(n * k) attention
# instead of O(n^2).
class LinformerSelfAttention(nn.Module):
    def __init__(self, dim, seq_len, k = 256, heads = 8, dim_head = None, one_kv_head = False, share_kv = False, dropout = 0.):
        super().__init__()
        assert (dim % heads) == 0, 'dimension must be divisible by the number of heads'

        # maximum sequence length the projection matrices support
        self.seq_len = seq_len
        # compressed key/value sequence length
        self.k = k

        self.heads = heads

        # per-head dimension defaults to dim // heads
        dim_head = default(dim_head, dim // heads)
        self.dim_head = dim_head

        self.to_q = nn.Linear(dim, dim_head * heads, bias = False)

        # with one_kv_head, a single key/value head is shared across all query heads
        kv_dim = dim_head if one_kv_head else (dim_head * heads)
        self.to_k = nn.Linear(dim, kv_dim, bias = False)
        # learned (seq_len x k) matrix that compresses the key sequence
        self.proj_k = nn.Parameter(init_(torch.zeros(seq_len, k)))

        # with share_kv, values reuse both the key projection matrix and key tensor
        self.share_kv = share_kv
        if not share_kv:
            self.to_v = nn.Linear(dim, kv_dim, bias = False)
            self.proj_v = nn.Parameter(init_(torch.zeros(seq_len, k)))

        self.dropout = nn.Dropout(dropout)
        self.to_out = nn.Linear(dim_head * heads, dim)
    # Attention forward pass; `context`, when given, supplies keys/values (cross-attention).
    def forward(self, x, context = None, **kwargs):
        b, n, d, d_h, h, k = *x.shape, self.dim_head, self.heads, self.k

        # key/value length comes from the context when cross-attending
        kv_len = n if context is None else context.shape[1]
        assert kv_len <= self.seq_len, f'the sequence length of the key / values must be {self.seq_len} - {kv_len} given'

        queries = self.to_q(x)

        # compress (b, n, d) -> (b, k, d) along the sequence dimension
        proj_seq_len = lambda args: torch.einsum('bnd,nk->bkd', *args)

        kv_input = x if context is None else context

        keys = self.to_k(kv_input)
        values = self.to_v(kv_input) if not self.share_kv else keys

        kv_projs = (self.proj_k, self.proj_v if not self.share_kv else self.proj_k)

        # shorter inputs are supported by slicing the projection matrices
        if kv_len < self.seq_len:
            kv_projs = map(lambda t: t[:kv_len], kv_projs)

        # project keys and values down the sequence dimension to length k
        keys, values = map(proj_seq_len, zip((keys, values), kv_projs))

        # split query heads: (b, n, h * d_h) -> (b, h, n, d_h)
        queries = queries.reshape(b, n, h, -1).transpose(1, 2)

        # broadcast keys/values across heads (covers the one_kv_head case via expand)
        merge_key_values = lambda t: t.reshape(b, k, -1, d_h).transpose(1, 2).expand(-1, h, -1, -1)
        keys, values = map(merge_key_values, (keys, values))

        # scaled dot-product attention over the compressed length k
        dots = torch.einsum('bhnd,bhkd->bhnk', queries, keys) * (d_h ** -0.5)
        attn = dots.softmax(dim=-1)
        attn = self.dropout(attn)
        out = torch.einsum('bhnk,bhkd->bhnd', attn, values)

        # merge heads back: (b, h, n, d_h) -> (b, n, h * d_h)
        out = out.transpose(1, 2).reshape(b, n, -1)
        return self.to_out(out)
class Linformer(nn.Module):
    """Stack of `depth` pre-norm (attention, feed-forward) layers.

    Set `reversible = True` to run the stack as reversible residual
    layers (Reformer-style), trading compute for activation memory.
    """

    def __init__(self, dim, seq_len, depth, k = 256, heads = 8, dim_head = None, one_kv_head = False, share_kv = False, reversible = False, dropout = 0.):
        super().__init__()

        layers = nn.ModuleList([])
        for _ in range(depth):
            attn = LinformerSelfAttention(dim, seq_len, k = k, heads = heads, dim_head = dim_head, one_kv_head = one_kv_head, share_kv = share_kv, dropout = dropout)
            ff = FeedForward(dim, dropout = dropout)

            layers.append(nn.ModuleList([PreNorm(dim, attn), PreNorm(dim, ff)]))

        # reversible execution is optional; the default is a plain sequential pass
        execute_klass = ReversibleSequence if reversible else SequentialSequence
        self.net = execute_klass(layers)

    def forward(self, x):
        return self.net(x)

class LinformerLM(nn.Module):
    """Token-level language model: embeddings + Linformer trunk + logits head."""

    def __init__(self, num_tokens, dim, seq_len, depth, k = 256, heads = 8, dim_head = None, one_kv_head = False, share_kv = False, reversible = False, dropout = 0.):
        super().__init__()
        self.token_emb = nn.Embedding(num_tokens, dim)
        # learned absolute positional embeddings, one per position up to seq_len
        self.pos_emb = nn.Embedding(seq_len, dim)
        self.linformer = Linformer(dim, seq_len, depth, k = k, heads = heads, dim_head = dim_head,
                one_kv_head = one_kv_head, share_kv = share_kv, reversible = reversible, dropout = dropout)
        self.to_logits = nn.Linear(dim, num_tokens)

    def forward(self, x):
        # sum positional embeddings with token embeddings
        positions = torch.arange(x.shape[1], device = x.device)
        embedded = self.token_emb(x) + self.pos_emb(positions)
        # (batch, seq, dim) -> (batch, seq, num_tokens)
        return self.to_logits(self.linformer(embedded))

.\lucidrains\linformer\linformer\reversible.py

import torch
import torch.nn as nn
from operator import itemgetter
from torch.autograd.function import Function
# FIX: this import was announced by a comment but the line itself was
# missing; Deterministic.record_rng / forward call both helpers.
from torch.utils.checkpoint import get_device_states, set_device_states

# Routes keyword arguments to the (f, g) functions of each layer.
# `router` maps an argument name to a per-layer sequence of (to_f, to_g)
# boolean flags; returns a list of (f_kwargs, g_kwargs) pairs, one per layer.
def route_args(router, args, depth):
    routed_args = [(dict(), dict()) for _ in range(depth)]
    matched_keys = [key for key in args.keys() if key in router]

    for key in matched_keys:
        val = args[key]
        # FIX: the loop variable used to shadow the `depth` parameter; renamed to `ind`
        for ind, ((f_args, g_args), routes) in enumerate(zip(routed_args, router[key])):
            new_f_args, new_g_args = map(lambda route: ({key: val} if route else {}), routes)
            routed_args[ind] = ({**f_args, **new_f_args}, {**g_args, **new_g_args})
    return routed_args

# Randomly drops whole layers with probability `prob` (stochastic depth).
# Always returns at least the first layer so the result is never empty.
def layer_drop(layers, prob):
    keep_mask = torch.empty(len(layers)).uniform_(0, 1) >= prob
    kept = [layer for layer, keep in zip(layers, keep_mask) if keep]
    return kept if len(kept) > 0 else layers[:1]

# Records the CPU (and CUDA) RNG state during the forward pass so the
# exact same random numbers (e.g. dropout masks) can be replayed when the
# reversible backward pass recomputes activations.
class Deterministic(nn.Module):
    def __init__(self, net):
        super().__init__()
        self.net = net
        self.cpu_state = None
        self.cuda_in_fwd = None
        self.gpu_devices = None
        self.gpu_states = None

    def record_rng(self, *args):
        # snapshot the CPU RNG state
        self.cpu_state = torch.get_rng_state()
        # NOTE(review): relies on the private torch.cuda._initialized flag to
        # avoid touching CUDA when it was never used — may break across versions
        if torch.cuda._initialized:
            self.cuda_in_fwd = True
            self.gpu_devices, self.gpu_states = get_device_states(*args)

    def forward(self, *args, record_rng = False, set_rng = False, **kwargs):
        # record_rng: snapshot RNG before running (used on the forward pass)
        if record_rng:
            self.record_rng(*args)

        if not set_rng:
            return self.net(*args, **kwargs)

        # set_rng: replay the recorded RNG state inside a forked-RNG context so
        # recomputation is bit-identical to the original forward
        rng_devices = []
        if self.cuda_in_fwd:
            rng_devices = self.gpu_devices

        with torch.random.fork_rng(devices=rng_devices, enabled=True):
            torch.set_rng_state(self.cpu_state)
            if self.cuda_in_fwd:
                set_device_states(self.gpu_devices, self.gpu_states)
            return self.net(*args, **kwargs)

# Reversible block, inspired by https://github.com/RobinBruegger/RevTorch/blob/master/revtorch/revtorch.py
# once multi-GPU is confirmed working, refactor and send PR back to the source
class ReversibleBlock(nn.Module):
    def __init__(self, f, g):
        super().__init__()
        # wrap f and g so their RNG can be recorded in forward / replayed in backward
        self.f = Deterministic(f)
        self.g = Deterministic(g)

    def forward(self, x, f_args = {}, g_args = {}):
        # split the channel dimension into the two reversible streams
        x1, x2 = torch.chunk(x, 2, dim=2)
        y1, y2 = None, None

        # y1 = x1 + f(x2); y2 = x2 + g(y1) — run without grad, since
        # activations are reconstructed during backward_pass instead of stored
        with torch.no_grad():
            y1 = x1 + self.f(x2, record_rng=self.training, **f_args)
            y2 = x2 + self.g(y1, record_rng=self.training, **g_args)

        return torch.cat([y1, y2], dim=2)

    def backward_pass(self, y, dy, f_args = {}, g_args = {}):
        # Inverts the forward equations (x2 = y2 - g(y1); x1 = y1 - f(x2))
        # while accumulating gradients; statement order and the explicit
        # `del`s keep peak memory down.
        y1, y2 = torch.chunk(y, 2, dim=2)
        del y

        dy1, dy2 = torch.chunk(dy, 2, dim=2)
        del dy

        # re-run g with the recorded RNG to get d(g)/d(y1)
        with torch.enable_grad():
            y1.requires_grad = True
            gy1 = self.g(y1, set_rng=True, **g_args)
            torch.autograd.backward(gy1, dy2)

        with torch.no_grad():
            x2 = y2 - gy1
            del y2, gy1

            dx1 = dy1 + y1.grad
            del dy1
            y1.grad = None

        # re-run f with the recorded RNG to get d(f)/d(x2)
        with torch.enable_grad():
            x2.requires_grad = True
            fx2 = self.f(x2, set_rng=True, **f_args)
            torch.autograd.backward(fx2, dx1, retain_graph=True)

        with torch.no_grad():
            x1 = y1 - fx2
            del y1, fx2

            dx2 = dy2 + x2.grad
            del dy2
            x2.grad = None

            # reassemble reconstructed input and its gradient
            x = torch.cat([x1, x2.detach()], dim=2)
            dx = torch.cat([dx1, dx2], dim=2)

        return x, dx

# Custom autograd Function driving the reversible sequence: forward keeps
# only the final (detached) output; backward reconstructs activations
# block by block via ReversibleBlock.backward_pass.
class _ReversibleFunction(Function):
    @staticmethod
    # Forward: run every block in order, stashing what backward will need.
    def forward(ctx, x, blocks, args):
        # per-block kwargs ({'f_args': ..., 'g_args': ...}) for the backward pass
        ctx.args = args
        for block, kwarg in zip(blocks, args):
            x = block(x, **kwarg)
        # detach: only the final activation is kept, not the autograd graph
        ctx.y = x.detach()
        ctx.blocks = blocks
        return x

    # Backward: walk the blocks in reverse, rebuilding inputs and gradients.
    @staticmethod
    def backward(ctx, dy):
        y = ctx.y
        args = ctx.args
        for block, kwargs in zip(ctx.blocks[::-1], args[::-1]):
            y, dy = block.backward_pass(y, dy, **kwargs)
        # gradient only w.r.t. x; the blocks and args inputs get None
        return dy, None, None
class SequentialSequence(nn.Module):
    """Plain (non-reversible) execution of (f, g) layer pairs with residuals."""

    def __init__(self, layers, args_route = {}, layer_dropout = 0.):
        super().__init__()
        assert all(len(route) == len(layers) for route in args_route.values()), 'each argument route map must have the same depth as the number of sequential layers'
        self.layers = layers
        self.args_route = args_route
        self.layer_dropout = layer_dropout

    def forward(self, x, **kwargs):
        # distribute incoming kwargs to each layer's f / g functions
        args = route_args(self.args_route, kwargs, len(self.layers))
        layers_and_args = list(zip(self.layers, args))

        # stochastic layer dropping only applies at training time
        if self.training and self.layer_dropout > 0:
            layers_and_args = layer_drop(layers_and_args, self.layer_dropout)

        for (f, g), (f_args, g_args) in layers_and_args:
            x = f(x, **f_args) + x
            x = g(x, **g_args) + x
        return x

class ReversibleSequence(nn.Module):
    # Runs (f, g) layer pairs as reversible blocks, cutting activation
    # memory at the cost of recomputation during backward.
    def __init__(self, blocks, args_route = {}, layer_dropout = 0.):
        super().__init__()
        self.args_route = args_route
        self.layer_dropout = layer_dropout
        # wrap each (f, g) pair in a ReversibleBlock
        self.blocks = nn.ModuleList([ReversibleBlock(f=f, g=g) for f, g in blocks])

    def forward(self, x, **kwargs):
        # duplicate the channels: the two halves form the reversible streams
        x = torch.cat([x, x], dim=-1)

        blocks = self.blocks
        # route kwargs to each block's f / g functions
        args = route_args(self.args_route, kwargs, len(blocks))
        args = list(map(lambda x: {'f_args': x[0], 'g_args': x[1]}, args))

        layers_and_args = list(zip(blocks, args))

        if self.training and self.layer_dropout > 0:
            # stochastic depth: drop whole blocks, then re-split pairs
            layers_and_args = layer_drop(layers_and_args, self.layer_dropout)
            blocks, args = map(lambda ind: list(map(itemgetter(ind), layers_and_args)), (0, 1))

        out =  _ReversibleFunction.apply(x, blocks, args)
        # fold the two streams back together by summing the halves
        return torch.stack(out.chunk(2, dim=-1)).sum(dim=0)

.\lucidrains\linformer\linformer\__init__.py

# 从 linformer.linformer 模块中导入 LinformerLM, Linformer, LinformerSelfAttention 类
from linformer.linformer import LinformerLM, Linformer, LinformerSelfAttention

Linformer for Pytorch

An implementation of Linformer in Pytorch. Linformer comes with two deficiencies. (1) It does not work for the auto-regressive case. (2) Assumes a fixed sequence length. However, if benchmarks show it to perform well enough, it will be added to this repository as a self-attention layer to be used in the encoder.

Linformer has been put into production by Facebook!

Install

$ pip install linformer

Usage

Linformer language model

import torch
from linformer import LinformerLM

model = LinformerLM(
    num_tokens = 20000,
    dim = 512,
    seq_len = 4096,
    depth = 12,
    heads = 8,
    dim_head = 128,        # be able to set the dimension of each head in multi-head attention
    k = 256,               # this is the k that the key/values are projected to along the sequence dimension
    one_kv_head = True,    # share one key/value head across all heads
    share_kv = False,      # share the same projection for keys and values
    reversible = True      # make network reversible, like Reformer
)

x = torch.randint(0, 20000, (1, 4096))
model(x) # (1, 4096, 20000)

Linformer

import torch
from linformer import Linformer

model = Linformer(
    dim = 512,
    seq_len = 4096,
    depth = 12,
    heads = 8,
    k = 256,
    one_kv_head = True,
    share_kv = True
)

x = torch.randn(1, 4096, 512)
model(x) # (1, 4096, 512)

Single Self-Attention layer

import torch
from linformer import LinformerSelfAttention

attn = LinformerSelfAttention(
    dim = 512,
    seq_len = 4096,
    heads = 8,
    k = 256,
    one_kv_head = True,
    share_kv = True
)

x = torch.randn(1, 4096, 512)
attn(x) # (1, 4096, 512)

Self-Attention layer above receiving contextual keys. The sequence length is validated on the length of the contextual keys instead of the source sequence.

import torch
from linformer import LinformerSelfAttention

attn = LinformerSelfAttention(
    dim = 512,
    seq_len = 8192,
    heads = 8,
    k = 256,
    one_kv_head = True,
    share_kv = True
)

x = torch.randn(1, 2048, 512)
context = torch.randn(1, 8192, 512)
attn(x, context) # (1, 2048, 512)

Citations

@misc{wang2020linformer,
    title={Linformer: Self-Attention with Linear Complexity},
    author={Sinong Wang and Belinda Z. Li and Madian Khabsa and Han Fang and Hao Ma},
    year={2020},
    eprint={2006.04768},
    archivePrefix={arXiv},
    primaryClass={cs.LG}
}
@inproceedings{kitaev2020reformer,
    title       = {Reformer: The Efficient Transformer},
    author      = {Nikita Kitaev and Lukasz Kaiser and Anselm Levskaya},
    booktitle   = {International Conference on Learning Representations},
    year        = {2020},
    url         = {https://openreview.net/forum?id=rkgNKkHtvB}
}

.\lucidrains\linformer\setup.py

# Packaging helpers from setuptools.
from setuptools import setup, find_packages

# Package metadata for the linformer distribution.
setup(
  name = 'linformer',  # distribution name
  packages = find_packages(),  # include every package
  version = '0.2.3',  # release version
  license='MIT',  # license
  description = 'Linformer implementation in Pytorch',  # short description
  author = 'Phil Wang',  # author
  author_email = 'lucidrains@gmail.com',  # author contact
  url = 'https://github.com/lucidrains/linformer',  # project homepage
  long_description_content_type = 'text/markdown',  # README renders as markdown
  keywords = [
    'attention',
    'artificial intelligence'
  ],
  install_requires=[
    'torch'  # runtime dependency
  ],
  classifiers=[
    'Development Status :: 4 - Beta',  # PyPI trove classifiers
    'Intended Audience :: Developers',
    'Topic :: Scientific/Engineering :: Artificial Intelligence',
    'License :: OSI Approved :: MIT License',
    'Programming Language :: Python :: 3.6',
  ],
)

.\lucidrains\lion-pytorch\lion_pytorch\lion_pytorch.py

# 导入必要的库
from typing import Tuple, Optional, Callable
import torch
from torch.optim.optimizer import Optimizer

def exists(val):
    """Return True unless `val` is None."""
    return val is not None

def update_fn(p, grad, exp_avg, lr, wd, beta1, beta2):
    """Single Lion step, updating `p` and `exp_avg` in-place.

    1. decoupled weight decay:  p <- p * (1 - lr * wd)
    2. signed update:           p <- p - lr * sign(beta1 * exp_avg + (1 - beta1) * grad)
    3. momentum EMA:            exp_avg <- beta2 * exp_avg + (1 - beta2) * grad
    """
    # decoupled weight decay
    p.data.mul_(1 - lr * wd)

    # interpolate momentum with the gradient, then keep only the sign
    update = (exp_avg * beta1).add_(grad, alpha = 1 - beta1).sign_()
    p.add_(update, alpha = -lr)

    # decay the momentum running average toward the current gradient
    exp_avg.mul_(beta2).add_(grad, alpha = 1 - beta2)

# Lion optimizer (EvoLved Sign Momentum), from "Symbolic Discovery of
# Optimization Algorithms" (Chen et al., 2023).
class Lion(Optimizer):
    def __init__(
        self,
        params,
        lr: float = 1e-4,
        betas: Tuple[float, float] = (0.9, 0.99),
        weight_decay: float = 0.0,
        use_triton: bool = False
    ):
        # validate hyperparameters: positive lr, betas within [0, 1]
        assert lr > 0.
        assert all([0. <= beta <= 1. for beta in betas])

        defaults = dict(
            lr=lr,
            betas=betas,
            weight_decay=weight_decay
        )

        super().__init__(params, defaults)

        # default to the pure-PyTorch update
        self.update_fn = update_fn

        # optionally swap in the fused Triton CUDA kernel
        if use_triton:
            from lion_pytorch.triton import update_fn as triton_update_fn
            self.update_fn = triton_update_fn

    # single optimization step; `closure` optionally re-evaluates the loss
    @torch.no_grad()
    def step(
        self,
        closure: Optional[Callable] = None
    ):

        loss = None
        if exists(closure):
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            # only update parameters that received gradients
            for p in filter(lambda p: exists(p.grad), group['params']):
                grad, lr, wd, beta1, beta2, state = p.grad, group['lr'], group['weight_decay'], *group['betas'], self.state[p]

                # lazy state init: exponential moving average of gradients
                if len(state) == 0:
                    state['exp_avg'] = torch.zeros_like(p)

                exp_avg = state['exp_avg']

                # in-place Lion update of p and exp_avg
                self.update_fn(
                    p,
                    grad,
                    exp_avg,
                    lr,
                    wd,
                    beta1,
                    beta2
                )

        return loss

.\lucidrains\lion-pytorch\lion_pytorch\triton.py

import torch
# 导入 torch 库

try:
    import triton
    import triton.language as tl
except ImportError as e:
    print('triton is not installed, please install by running `pip install triton -U --pre`')
    exit()
# 尝试导入 triton 库,如果导入失败则打印错误信息并退出程序

# Autotuning runs each config several times; since the kernel mutates the
# param and exp_avg buffers in-place, hand it clones during tuning so the
# real buffers are not corrupted by repeated trial runs.
def clone_inplace_updated_params(nargs):
    for key in ('p_ptr', 'exp_avg_ptr'):
        nargs[key] = nargs[key].clone()

# triton cuda kernel: fused Lion update (weight decay + signed step +
# momentum EMA) over one BLOCK_SIZE-wide slice per program instance.

@triton.autotune(configs = [
    triton.Config({'BLOCK_SIZE': 128}, num_warps = 4, pre_hook = clone_inplace_updated_params),
    triton.Config({'BLOCK_SIZE': 1024}, num_warps = 8, pre_hook = clone_inplace_updated_params),
], key = ['n_elements'])
@triton.jit
def update_fn_kernel(
    p_ptr,
    grad_ptr,
    exp_avg_ptr,
    lr,
    wd,
    beta1,
    beta2,
    n_elements,
    BLOCK_SIZE: tl.constexpr,
):
    # which slice of the flattened tensors this instance handles
    pid = tl.program_id(axis = 0)

    block_start = pid * BLOCK_SIZE
    offsets = block_start + tl.arange(0, BLOCK_SIZE)

    # guard against reading past the end of the tensors
    mask = offsets < n_elements

    # offsetted pointers

    offset_p_ptr = p_ptr + offsets
    offset_grad_ptr = grad_ptr + offsets
    offset_exp_avg_ptr = exp_avg_ptr + offsets

    # load

    p = tl.load(offset_p_ptr, mask = mask)
    grad = tl.load(offset_grad_ptr, mask = mask)
    exp_avg = tl.load(offset_exp_avg_ptr, mask = mask)

    # stepweight decay

    p = p * (1 - lr * wd)

    # diff between momentum running average and grad
    # (lets beta * exp_avg + (1 - beta) * grad be written as diff * beta + grad)

    diff = exp_avg - grad

    # weight update

    update = diff * beta1 + grad

    # torch.sign emulation: step by -lr * sign(update), leaving zeros untouched

    can_update = update != 0
    update_sign = tl.where(update > 0, -lr, lr)

    p = p + update_sign * can_update

    # decay the momentum running average coefficient

    exp_avg = diff * beta2 + grad

    # store new params and momentum running average coefficient

    tl.store(offset_p_ptr, p, mask = mask)
    tl.store(offset_exp_avg_ptr, exp_avg, mask = mask)

def update_fn(
    p: torch.Tensor,
    grad: torch.Tensor,
    exp_avg: torch.Tensor,
    lr: float,
    wd: float,
    beta1: float,
    beta2: float
):
    """Launch the fused Lion update kernel over every element of `p`.

    All tensors must live on CUDA; `p` and `exp_avg` are updated in-place.
    """
    assert all([t.is_cuda for t in (p, grad, exp_avg)])
    n_elements = p.numel()

    # one kernel instance per BLOCK_SIZE chunk (BLOCK_SIZE picked by autotune)
    grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)    

    update_fn_kernel[grid](
        p,
        grad,
        exp_avg,
        lr,
        wd,
        beta1,
        beta2,
        n_elements
    )

.\lucidrains\lion-pytorch\lion_pytorch\__init__.py

# 从 lion_pytorch 模块中导入 Lion 类
from lion_pytorch.lion_pytorch import Lion

🦁 Lion - Pytorch

🦁 Lion, EvoLved Sign Momentum, new optimizer discovered by Google Brain that is purportedly better than Adam(w), in Pytorch. This is nearly a straight copy from here, with few minor modifications.

It is so simple, we may as well get it accessible and used asap by everyone to train some great models, if it really works 🤞

Instructions

  • Learning rate and weight decay: the authors write in Section 5 - Based on our experience, a suitable learning rate for Lion is typically 3-10x smaller than that for AdamW. Since the effective weight decay is lr * λ, the value of decoupled weight decay λ used for Lion is 3-10x larger than that for AdamW in order to maintain a similar strength. The initial value, peak value, and end value in the learning rate schedule should be changed simultaneously with the same ratio compared to AdamW, evidenced by a researcher.

  • Learning rate schedule: the authors use the same learning rate schedule for Lion as AdamW in the paper. Nevertheless, they observe a larger gain when using a cosine decay schedule to train ViT, compared to a reciprocal square-root schedule.

  • β1 and β2: the authors write in Section 5 - The default values for β1 and β2 in AdamW are set as 0.9 and 0.999, respectively, with an ε of 1e−8, while in Lion, the default values for β1 and β2 are discovered through the program search process and set as 0.9 and 0.99, respectively. Similar to how people reduce β2 to 0.99 or smaller and increase ε to 1e-6 in AdamW to improve stability, using β1=0.95, β2=0.98 in Lion can also be helpful in mitigating instability during training, suggested by the authors. This was corroborated by a researcher.

Updates

  • Update: seems to work for my local enwik8 autoregressive language modeling.

  • Update 2: experiments, seems much worse than Adam if learning rate held constant.

  • Update 3: Dividing the learning rate by 3, seeing better early results than Adam. Maybe Adam has been dethroned, after nearly a decade.

  • Update 4: using the 10x smaller learning rate rule of thumb from the paper resulted in the worst run. So I guess it still takes a bit of tuning.

A summarization of previous updates: as shown in the experiments, Lion with a 3x smaller learning rate beats Adam. It still takes a bit of tuning as a 10x smaller learning rate leads to a worse result.

  • Update 5: so far hearing all positive results for language modeling, when done right. Also heard positive results for significant text-to-image training, although it takes a bit of tuning. The negative results seem to be with problems and architectures outside of what was evaluated in the paper - RL, feedforward networks, weird hybrid architectures with LSTMs + convolutions etc. Negative anecdata also confirms this technique is sensitive to batch size, amount of data / augmentation. Tbd what optimal learning rate schedule is, and whether cooldown affects results. Also interestingly have a positive result at open-clip, which became negative as the model size was scaled up (but may be resolvable).

  • Update 6: open clip issue resolved by the author, by setting a higher initial temperature.

  • Update 7: would only recommend this optimizer in the setting of high batch sizes (64 or above)

Install

$ pip install lion-pytorch

Usage

# toy model

import torch
from torch import nn

model = nn.Linear(10, 1)

# import Lion and instantiate with parameters

from lion_pytorch import Lion

opt = Lion(model.parameters(), lr=1e-4, weight_decay=1e-2)

# forward and backwards

loss = model(torch.randn(10))
loss.backward()

# optimizer step

opt.step()
opt.zero_grad()

To use a fused kernel for updating the parameters, first pip install triton -U --pre, then

opt = Lion(
    model.parameters(),
    lr=1e-4,
    weight_decay=1e-2,
    use_triton=True # set this to True to use cuda kernel w/ Triton lang (Tillet et al)
)

Appreciation

  • Stability.ai for the generous sponsorship to work and open source cutting edge artificial intelligence research

Citations

@misc{https://doi.org/10.48550/arxiv.2302.06675,
    url     = {https://arxiv.org/abs/2302.06675},
    author  = {Chen, Xiangning and Liang, Chen and Huang, Da and Real, Esteban and Wang, Kaiyuan and Liu, Yao and Pham, Hieu and Dong, Xuanyi and Luong, Thang and Hsieh, Cho-Jui and Lu, Yifeng and Le, Quoc V.},
    title   = {Symbolic Discovery of Optimization Algorithms},
    publisher = {arXiv},
    year = {2023}
}
@article{Tillet2019TritonAI,
    title   = {Triton: an intermediate language and compiler for tiled neural network computations},
    author  = {Philippe Tillet and H. Kung and D. Cox},
    journal = {Proceedings of the 3rd ACM SIGPLAN International Workshop on Machine Learning and Programming Languages},
    year    = {2019}
}

.\lucidrains\lion-pytorch\setup.py

# Packaging helpers from setuptools.
from setuptools import setup, find_packages

# Package metadata for the lion-pytorch distribution.
setup(
  name = 'lion-pytorch',  # distribution name
  packages = find_packages(exclude=[]),  # include every package
  version = '0.1.2',  # release version
  license='MIT',  # license
  description = 'Lion Optimizer - Pytorch',  # short description
  author = 'Phil Wang',  # author
  author_email = 'lucidrains@gmail.com',  # author contact
  long_description_content_type = 'text/markdown',  # README renders as markdown
  url = 'https://github.com/lucidrains/lion-pytorch',  # project homepage
  keywords = [
    'artificial intelligence',
    'deep learning',
    'optimizers'
  ],
  install_requires=[
    'torch>=1.6'  # runtime dependency
  ],
  classifiers=[
    'Development Status :: 4 - Beta',  # PyPI trove classifiers
    'Intended Audience :: Developers',
    'Topic :: Scientific/Engineering :: Artificial Intelligence',
    'License :: OSI Approved :: MIT License',
    'Programming Language :: Python :: 3.6',
  ],
)

Liquid Conway Game of Life

Try online: lucidrains.github.io/liquid-conway/

Based on: www.jgallant.com/2d-liquid-simulator-with-cellular-automaton-in-unity/

$ yarn
$ npm start

In a new terminal

$ open index.html

.\lucidrains\liquid-conway\src\app.js

// Imports: RxJS, local helpers, and the stylesheet.
// FIX: the transcription used Python-style '#' comments, which are
// syntax errors in JavaScript; replaced throughout with '//'.
import Rx from 'rxjs';
import helpers from './helpers';
require('./app.sass');

// Helper utilities used below.
const {
  range,
  initArray,
  cacheFn
} = helpers;

// Canvas element and rendering constants.
const c = document.getElementById('canvas');

const FPS = 30;
const WIDTH = 150;
const HEIGHT = 75;
const CELL_SIZE = 7;
const CANVAS_WIDTH = WIDTH * CELL_SIZE;
const CANVAS_HEIGHT = HEIGHT * CELL_SIZE;
const CELL_FILL_STYLE = 'rgb(22, 109, 175)';
const BACKGROUND_COLOR = 'rgba(255, 255, 255, 0.5)';
const NEIGHBOR_COORS_CACHE = {};

// The 8 Moore-neighborhood offsets (every [dx, dy] except [0, 0]).
const DIR = range(-1, 1)
  .reduce((acc, x) => acc.concat(range(-1, 1).map(y => [x, y])), [])
  .filter(([x, y]) => !(x === 0 && y === 0));

// Size the canvas and grab the 2D drawing context.
c.setAttribute('width', CANVAS_WIDTH.toString());
c.setAttribute('height', CANVAS_HEIGHT.toString());
c.style.display = 'block';
const ctx = c.getContext('2d');

// Build an x-by-y grid (array of columns) filled with `init`.
// FIX: replaced invalid Python-style '#' comment with '//'.
function initGrid(x, y, init) {
  return initArray(x, init).map(() => initArray(y, init));
}

// Double-buffered grids: `grid` is displayed, `buffer` receives the next state.
// FIX: replaced invalid Python-style '#' comments with '//'.
let [
  grid,
  buffer
] = [
  initGrid(WIDTH, HEIGHT, 0),
  initGrid(WIDTH, HEIGHT, 0)
];

// Flat list of every [x, y] coordinate in the grid.
const GRID_COORS = grid.reduce((acc, row, x) => {
  acc = acc.concat(row.map((_, y) => [x, y]));
  return acc;
}, []);

// Seed the board with a random initial state.
GRID_COORS.forEach(([x, y]) => {
  grid[x][y] = Math.round(Math.random());
});

// Paint cells while the mouse button is held down: merge a 10ms tick
// (so holding still keeps painting) with mousemove events, map each to a
// grid coordinate, and stop at mouseup.
// FIX: replaced invalid Python-style '#' comment with '//'.
Rx.Observable
  .fromEvent(c, 'mousedown')
  .flatMap((md) => {
    md.preventDefault();
    let ev = md;

    return Rx.Observable.merge(
        Rx.Observable.interval(10).map(() => null),
        Rx.Observable.fromEvent(c, 'mousemove')
      )
      .map((mm) => {
        ev = mm || ev;
        const { left, top } = ev.target.getBoundingClientRect();
        const x = ev.clientX - left;
        const y = ev.clientY - top;
        const [coorX, coorY] = [x, y].map(el => Math.floor(el / CELL_SIZE));
        return [coorX, coorY];
      })
      .takeUntil(Rx.Observable.fromEvent(c, 'mouseup'));
  })
  .throttleTime(10)
  .subscribe(([x, y]) => {
    grid[x][y] = 1;
  });

// True when (x, y) lies inside the grid's bounds.
// FIX: replaced invalid Python-style '#' comment with '//'.
function withinBounds(grid, x, y) {
  return x >= 0 && x < grid.length && y >= 0 && y < grid[0].length;
}

// Collect the in-bounds neighbor coordinates of (x, y) from the DIR offsets.
function getNeighborCoors(grid, x, y) {
  const neighbors = [];
  DIR.forEach(([dx, dy]) => {
    const nx = x + dx;
    const ny = y + dy;
    if (withinBounds(grid, nx, ny)) {
      neighbors.push([nx, ny]);
    }
  });
  return neighbors;
}

// Memoized variant of getNeighborCoors; for a fixed-size grid the neighbor
// set depends only on (x, y), so the cache key ignores the grid argument.
const getCacheNeighborCoors = cacheFn(
  getNeighborCoors,
  NEIGHBOR_COORS_CACHE,
  (_, x, y) => `${x}:${y}`
);

// Count live (=== 1) cells among the cached neighbors of (x, y).
function countNeighborsAlive(grid, x, y) {
  const coors = getCacheNeighborCoors(grid, x, y);

  let alive = 0;
  for (const [nx, ny] of coors) {
    if (grid[nx][ny] === 1) {
      alive += 1;
    }
  }
  return alive;
}

# 计算下一个状态
function computeNextState(curr, neighbors) {
  return ((curr === 1 && neighbors === 2) || neighbors === 3) ? 1 : 0;
}

// Write the next generation of `grid` into `buffer`; `grid` is left untouched.
function nextState(grid, buffer) {
  GRID_COORS.forEach(([x, y]) => {
    const alive = countNeighborsAlive(grid, x, y);
    buffer[x][y] = computeNextState(grid[x][y], alive);
  });
}

// Draw the current generation: a translucent wash over the previous frame
// (leaving faint trails), then one filled square per live cell.
function render(ctx, grid) {
  ctx.fillStyle = BACKGROUND_COLOR;
  ctx.fillRect(0, 0, CANVAS_WIDTH, CANVAS_HEIGHT);

  GRID_COORS.forEach(([x, y]) => {
    if (grid[x][y] !== 1) {
      return;
    }
    ctx.fillStyle = CELL_FILL_STYLE;
    ctx.fillRect(
      (x * CELL_SIZE) + 1,
      (y * CELL_SIZE) + 1,
      CELL_SIZE - 1,
      CELL_SIZE - 1
    );
  });
}

// requestAnimationFrame loop, throttled to FPS
let start;
const throttleDiff = (1000 / FPS);

function step() {
  const now = +new Date();
  start = start || now;
  const diff = now - start;
  start = now;

  render(ctx, grid);

  // schedule the next frame, delaying when we're ahead of the FPS budget
  const callNextFrame = window.requestAnimationFrame.bind(null, step);
  if (diff > throttleDiff) {
    callNextFrame();
  } else {
    setTimeout(callNextFrame, throttleDiff - diff);
  }
}

// start rendering
step();

// advance the simulation every 80ms, double-buffered:
// compute into `buffer`, then swap it with `grid`
setInterval(() => {
  nextState(grid, buffer);
  [buffer, grid] = [grid, buffer];
}, 80);

.\lucidrains\liquid-conway\src\helpers.js

# 从 lodash.clone 模块中导入 clone 函数
import clone from 'lodash.clone';

// Return an array of `num` elements, each an independent clone of `init`.
function initArray(num, init) {
    const result = [];
    for (let i = 0; i < num; i += 1) {
        result.push(clone(init));
    }
    return result;
}

# 生成一个从 low 到 high 的范围数组,步长为 step,默认为 1
function range(low, high, step = 1) {
    const arr = [];
    for (let i = low; i <= high; i += step) {
        arr.push(i);
    }
    return arr;
}

# 缓存函数的结果,使用 cacheObj 存储结果,deriveKeyFn 用于生成缓存的键
function cacheFn(fn, cacheObj, deriveKeyFn) {
    return (...args) => {
        let key;
        if (!deriveKeyFn) {
            key = JSON.stringify(args);
        } else {
            key = deriveKeyFn(...args);
        }

        if (cacheObj[key] !== undefined) {
            return cacheObj[key];
        }

        const ret = fn(...args);
        cacheObj[key] = ret;
        return ret;
    };
}

# 生成一个小于 num 的随机整数
function randInt(num) {
    return Math.floor(Math.random() * (num + 1));
}

// public helper API
export default {
    cacheFn,
    range,
    initArray,
    randInt
};

.\lucidrains\liquid-conway\src\liquid.js

# 导入 Rx 模块
import Rx from 'rxjs';
# 导入 helpers 模块
import helpers from './helpers';

# 导入样式表
require('./app.sass');

// pull the initArray helper out of the helpers module
const { initArray } = helpers;

// canvas element the simulation draws into
const c = document.getElementById('canvas');

// suppress the context menu — right-click is used to place walls
c.oncontextmenu = (e) => {
  e.preventDefault();
};

// simulation constants
const FPS = 30;
const WIDTH = 80;
const HEIGHT = 60;
const CELL_SIZE = 10;
const CANVAS_WIDTH = WIDTH * CELL_SIZE;
const CANVAS_HEIGHT = HEIGHT * CELL_SIZE;

// liquid shading, lightest (low val) to darkest (high val)
const CELL_COLOR_LIGHTEST = 'rgb(0, 204, 255)';
const CELL_COLOR_LIGHT = 'rgb(0, 153, 255)';
const CELL_COLOR = 'rgb(0, 102, 255)';
const CELL_COLOR_DARK = 'rgb(51, 102, 255)';
const CELL_COLOR_DARKEST = 'rgb(51, 51, 204)';

const BACKGROUND_COLOR = 'rgb(255, 255, 255)';

// Build an x-by-y grid of independent clones of `init`.
function initGrid(x, y, init) {
  const columns = initArray(x, init);
  return columns.map(() => initArray(y, init));
}

// each cell tracks its liquid amount (`val`) and pending change (`diff`)
const GRID = initGrid(WIDTH, HEIGHT, { val: 0, diff: 0 });

// flat list of every (x, y) coordinate, precomputed once for iteration
const GRID_COORS = GRID.reduce((acc, row, x) =>
  acc.concat(row.map((_, y) => [x, y]))
, []);

# 检查坐标是否在网格内
function withinBounds(grid, x, y) {
  return x >= 0 && x < grid.length && y >= 0 && y < grid[0].length;
}

// A cell is "empty" when it lies inside the grid and is not a wall.
function isEmptyCell(grid, x, y) {
  if (!withinBounds(grid, x, y)) {
    return false;
  }
  return !grid[x][y].wall;
}

// size the canvas to the grid
c.setAttribute('width', CANVAS_WIDTH.toString());
c.setAttribute('height', CANVAS_HEIGHT.toString());
c.style.display = 'block';

// 2D drawing context
const ctx = c.getContext('2d');

// Pointer input: left-drag / touch pours liquid, right-drag places walls.
Rx.Observable.merge(
    Rx.Observable.fromEvent(c, 'mousedown'),
    Rx.Observable.fromEvent(c, 'touchstart')
  )
  .flatMap((md) => {
    md.preventDefault();
    let ev = md;

    // while the pointer is down, emit on a 10ms ticker and on every move
    return Rx.Observable.merge(
        Rx.Observable.interval(10).map(() => null),
        Rx.Observable.fromEvent(c, 'mousemove'),
        Rx.Observable.fromEvent(c, 'touchmove')
      )
      .map((mm) => {
        ev = mm || ev;
        return { ev, which: md.which };
      })
      .takeUntil(Rx.Observable.merge(
        Rx.Observable.fromEvent(c, 'mouseup'),
        Rx.Observable.fromEvent(c, 'mouseout'),
        Rx.Observable.fromEvent(c, 'touchend')
      ));
  })
  .throttleTime(10)
  .subscribe(({ ev, which }) => {
    const { target, touches, type } = ev;
    const isTouch = type === 'touchmove' || type === 'touchstart';

    // convert the pointer position to grid cell coordinates
    const { left, top } = target.getBoundingClientRect();
    const { clientX, clientY } = isTouch ? touches[0] : ev;

    const x = clientX - left;
    const y = clientY - top;
    const [cx, cy] = [x, y].map(el => Math.floor(el / CELL_SIZE));

    if (!withinBounds(GRID, cx, cy)) {
      return;
    }

    const cell = GRID[cx][cy];

    // left button (which === 1) or touch: pour liquid (clearing any wall);
    // right button (which === 3): place a wall and drain the cell
    if (which === 1 || isTouch) {
      delete cell.wall;
      cell.val += 100;
    } else if (which === 3) {
      cell.wall = true;
      cell.val = 0;
    }
  });

// Advance the fluid simulation one tick. Flow is accumulated into each cell's
// `diff` first and committed in a second pass, so cell updates are
// order-independent. (Fix: the function's closing brace was lost in the
// original source; restored here — logic otherwise unchanged.)
function nextState(grid) {
  const withinGrid = withinBounds.bind(null, grid);

  GRID_COORS.forEach(([x, y]) => {
    const cell = grid[x][y];
    const val = cell.val;

    // walls and negative cells produce no outflow
    if (cell.wall || val < 0) {
      return;
    }

    // gravity: if the cell below has room (< 100), drop everything into it
    if (withinGrid(x, y + 1) && grid[x][y + 1].val < 100) {
      cell.diff -= val;
      grid[x][y + 1].diff += val;
      return;
    }

    let volume = val;

    // sideways flow: neighbors left/right with strictly less liquid
    const flowCoors = [[1, 0], [-1, 0]]
      .filter(([dx, dy]) => {
        const [nx, ny] = [x + dx, y + dy];
        return withinGrid(nx, ny) && val > grid[nx][ny].val;
      });

    const diffs = flowCoors.map(([dx, dy]) => {
      const [nx, ny] = [x + dx, y + dy];
      return val - grid[nx][ny].val;
    });

    const totalDiff = diffs.reduce((acc, diff) => acc + diff, 0);

    const finalDiff = Math.min(volume, totalDiff);

    // split the outflow between lower neighbors, proportional to each gap
    diffs.forEach((diff, i) => {
      const [dx, dy] = flowCoors[i];
      const weightedDiff = Math.floor(finalDiff * (diff / totalDiff)) / 2;

      grid[x][y].diff -= weightedDiff;
      grid[x + dx][y + dy].diff += weightedDiff;
      volume -= weightedDiff;
    });

    if (volume < 0) {
      return;
    }

    // pressure: overfull cells (> 100) push a fraction of the gap upward
    if (withinGrid(x, y - 1) && grid[x][y - 1].val < cell.val && cell.val > 100) {
      const diff = Math.floor((val - grid[x][y - 1].val) / 20);
      grid[x][y - 1].diff += diff;
      cell.diff -= diff;
      volume -= diff;
    }

    // extra downward seep when the cell below still holds less
    if (withinGrid(x, y + 1) && grid[x][y + 1].val < cell.val) {
      const diff = Math.floor((val - grid[x][y + 1].val) / 10);
      grid[x][y + 1].diff += diff;
      cell.diff -= diff;
      volume -= diff;
    }
  });

  // commit pass: apply every accumulated diff, then reset it
  GRID_COORS.forEach(([x, y]) => {
    const cell = grid[x][y];
    cell.val += cell.diff;
    cell.diff = 0;
  });
}
// Render the liquid grid: walls as black squares, liquid as blue squares
// whose height and shade depend on the cell's liquid amount.
function render(context, grid) {
  // clear to the background color
  context.fillStyle = BACKGROUND_COLOR;
  context.fillRect(0, 0, CANVAS_WIDTH, CANVAS_HEIGHT);

  GRID_COORS.forEach(([x, y]) => {
    const cell = grid[x][y];

    if (cell.wall) {
      // walls are drawn as solid black squares
      context.fillStyle = 'black';
      context.fillRect(
        (x * CELL_SIZE) + 1,
        (y * CELL_SIZE) + 1,
        CELL_SIZE,
        CELL_SIZE
      );
    } else {
      const val = cell.val;

      // nothing to draw for empty cells
      if (val <= 0) {
        return;
      }

      let fillStyle = CELL_COLOR;
      let cellHeight = CELL_SIZE - 1;
      let cellY = (y * CELL_SIZE) + 1;

      // NOTE(review): these rely on || short-circuiting — when the neighbor is
      // out of bounds or a wall, isEmptyCell is false and grid[x][y±1] is
      // never dereferenced; keep the operand order intact
      const hasBottomNeighbor = (!isEmptyCell(grid, x, y + 1) || grid[x][y + 1].val > 0);
      const hasNoTopNeighbor = (!isEmptyCell(grid, x, y - 1) || grid[x][y - 1].val <= 0);

      // partially-full surface cells are drawn shorter, anchored to the bottom
      if (val < 100 && hasBottomNeighbor && hasNoTopNeighbor) {
        cellHeight *= parseFloat(val) / 100;
        cellY += (CELL_SIZE - cellHeight);
      }

      // shade by liquid amount: lighter when shallow, darker when overfull
      if (val < 50) {
        fillStyle = CELL_COLOR_LIGHTEST;
      } else if (val < 80) {
        fillStyle = CELL_COLOR_LIGHT;
      } else if (val > 150) {
        fillStyle = CELL_COLOR_DARKEST;
      } else if (val > 120) {
        fillStyle = CELL_COLOR_DARK;
      }

      context.fillStyle = fillStyle;
      context.fillRect(
        (x * CELL_SIZE) + 1,
        cellY,
        CELL_SIZE - 1,
        cellHeight
      );
    }
  });
}

// requestAnimationFrame loop, throttled to FPS
let start;
const throttleDiff = (1000 / FPS);

function step() {
  const now = +new Date();
  start = start || now;
  const diff = now - start;
  start = now;

  // draw the current state
  render(ctx, GRID);

  // schedule the next frame, delaying when we're ahead of the FPS budget
  const callNextFrame = window.requestAnimationFrame.bind(null, step);
  if (diff > throttleDiff) {
    callNextFrame();
  } else {
    setTimeout(callNextFrame, throttleDiff - diff);
  }
}

// start rendering
step();

// advance the simulation in-place every 50ms
setInterval(() => {
  nextState(GRID);
}, 50);

.\lucidrains\liquid-conway\webpack.config.js

// webpack build configuration
const path = require('path');
const ExtractTextPlugin = require('extract-text-webpack-plugin');

// absolute paths for the source and output directories
const src = path.resolve(__dirname, 'src');
const dist = path.resolve(__dirname, 'dist');

const config = {
  // resolve entry points relative to the source directory
  context: src,
  // two bundles: the regular Conway demo and the liquid demo
  entry: {
    regular: './app.js',
    liquid: './liquid.js'
  },
  // emit one file per entry into dist/
  output: {
    path: dist,
    filename: '[name].js'
  },
  // loaders for JS (babel/es2015) and styles (css / sass)
  module: {
    rules: [{
      test: /\.js$/,
      include: src,
      use: [{
        loader: 'babel-loader',
        options: {
          presets: [
            ['es2015', { modules: false }]
          ]
        }
      }]
    }, {
      test: /\.css$/,
      use: ExtractTextPlugin.extract({
        fallback: 'style-loader',
        use: ['css-loader']
      })
    },
    {
      test: /\.*(sass|scss)$/,
      use: ExtractTextPlugin.extract({
        fallback: 'style-loader',
        use: ['css-loader', 'sass-loader']
      })
    }]
  },
  // extract all styles into a single styles.css
  plugins: [
    new ExtractTextPlugin('styles.css')
  ]
};

module.exports = config;

.\lucidrains\llama-qrlhf\llama_qrlhf\llama.py

import torch  # 导入 PyTorch 库
from torch.nn import Module, ModuleList  # 导入 PyTorch 中的 Module 和 ModuleList
from torch import nn, einsum, Tensor  # 导入 PyTorch 中的 nn、einsum 和 Tensor
import torch.nn.functional as F  # 导入 PyTorch 中的 nn.functional,并使用别名 F

from einops import rearrange, reduce  # 导入 einops 库中的 rearrange 和 reduce 函数
from einops.layers.torch import Rearrange  # 从 einops 库中导入 torch 版的 Rearrange 模块

# helpers

def exists(v):
    """True unless `v` is None — falsy values like 0 or '' still "exist"."""
    return v is not None

# norm

class RMSNorm(Module):
    """L2-normalize the last dim, then rescale by sqrt(dim) and a learned
    per-channel gain (`gamma`, initialized to ones)."""

    def __init__(self, dim):
        super().__init__()
        self.scale = dim ** 0.5
        self.gamma = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        normed = F.normalize(x, dim = -1)
        return normed * self.scale * self.gamma

# rotary

class RotaryEmbedding(Module):
    """Rotary position embedding (RoPE) frequency table.

    Precomputes per-pair inverse frequencies; forward returns the angle
    matrix for a sequence, duplicated along the last dim so it lines up
    with the (x1, x2) feature halves used by rotate_half."""

    def __init__(self, dim, theta = 10000):
        super().__init__()
        inv_freq = theta ** -(torch.arange(0, dim, 2).float() / dim)
        self.register_buffer('inv_freq', inv_freq)

    def forward(self, seq_len, device):
        positions = torch.arange(seq_len, device = device).type_as(self.inv_freq)
        # outer product position x frequency via broadcasting
        freqs = positions.unsqueeze(-1) * self.inv_freq
        return torch.cat((freqs, freqs), dim = -1)

def rotate_half(x):
    """Map the (x1, x2) halves of the last dim to (-x2, x1) — the 90° rotation
    used by rotary embeddings."""
    first, second = x.chunk(2, dim = -1)
    return torch.cat((-second, first), dim = -1)

def apply_rotary_pos_emb(pos, t):
    """Rotate the features of `t` by the position-dependent angles in `pos`
    (standard RoPE application: t*cos + rotate_half(t)*sin)."""
    cos_part = t * pos.cos()
    sin_part = rotate_half(t) * pos.sin()
    return cos_part + sin_part

# feedforward

class GEGLU(Module):
    """Gated GELU: split the last dim in half and gate the first half with
    GELU of the second."""

    def forward(self, x):
        value, gate = x.chunk(2, dim = -1)
        return value * F.gelu(gate)

def FeedForward(dim, mult = 4):
    """Pre-normed GEGLU feedforward block.

    Hidden width is 2/3 * mult * dim — the usual parameter-count correction
    for GLU variants, so total parameters match a plain mult*dim MLP."""
    inner_dim = int(dim * mult * 2 / 3)
    return nn.Sequential(
        RMSNorm(dim),
        nn.Linear(dim, inner_dim * 2),  # doubled for the GEGLU value/gate split
        GEGLU(),
        nn.Linear(inner_dim, dim)
    )

# attention

class Attention(Module):
    """Multi-head causal self-attention with pre-RMSNorm and optional rotary
    position embeddings applied to queries and keys."""

    def __init__(
        self,
        dim,
        *,
        dim_head = 64,
        heads = 8
    ):
        super().__init__()
        self.scale = dim_head ** -0.5
        dim_inner = dim_head * heads

        # fused projection: norm, then a single linear producing q, k, v
        # for all heads, split out by the einops rearrange
        self.to_qkv = nn.Sequential(
            RMSNorm(dim),
            nn.Linear(dim, dim_inner * 3, bias = False),
            Rearrange('b n (qkv h d) -> qkv b h n d', h = heads, qkv = 3)
        )

        # merge heads back together and project to model dim
        self.to_out = nn.Sequential(
            Rearrange('b h n d -> b n (h d)'),
            nn.Linear(dim_inner, dim, bias = False)
        )

    def forward(self, x, rotary_emb = None):
        q, k, v = self.to_qkv(x)

        if rotary_emb is not None:
            q = apply_rotary_pos_emb(rotary_emb, q)
            k = apply_rotary_pos_emb(rotary_emb, k)

        # scaled dot-product similarities: (b h i d) x (b h j d) -> (b h i j)
        sim = (q * self.scale) @ k.transpose(-2, -1)

        # causal mask — query i may only attend to keys j <= i
        i, j = sim.shape[-2:]
        causal_mask = torch.triu(
            torch.ones((i, j), device = x.device, dtype = torch.bool),
            diagonal = j - i + 1
        )
        sim = sim.masked_fill(causal_mask, -torch.finfo(sim.dtype).max)

        attn = sim.softmax(dim = -1)

        out = attn @ v
        return self.to_out(out)

# Q head

class DuelingHead(Module):
    """Dueling Q head: Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)).

    Centering the advantages over the action dim makes the value/advantage
    decomposition identifiable."""

    def __init__(
        self,
        *,
        dim,
        num_tokens,
        expansion_factor = 2,
    ):
        super().__init__()
        dim_hidden = int(dim * expansion_factor)

        # shared trunk feeding both streams
        self.stem = nn.Sequential(
            nn.Linear(dim, dim_hidden),
            nn.SiLU()
        )

        # state-value stream V(s)
        self.to_values = nn.Sequential(
            nn.Linear(dim_hidden, 1)
        )

        # per-action advantage stream A(s, a)
        self.to_advantages = nn.Sequential(
            nn.Linear(dim_hidden, num_tokens)
        )

    def forward(self, x):
        hidden = self.stem(x)

        advantages = self.to_advantages(hidden)
        advantages = advantages - advantages.mean(dim = -1, keepdim = True)

        values = self.to_values(hidden)

        return values + advantages

# llama

class Llama(Module):  # decoder-only transformer, LLaMA-style components
    """Autoregressive language model (RMSNorm, rotary embeddings, GEGLU FF)
    with an auxiliary Q-value head on the final embeddings."""
    def __init__(  # all hyperparameters are keyword-only
        self,
        *,
        num_tokens,
        dim,
        depth,
        dim_head=64,
        heads=8,
        ff_mult=4,
        dueling_q_head=False,
        dueling_q_head_expansion_factor=2
    ):
        super().__init__()

        # token ids -> dim-sized embeddings
        self.token_emb = nn.Embedding(num_tokens, dim)
        # rotary positional angles, shared across all attention layers
        self.rotary_emb = RotaryEmbedding(dim_head)

        # transformer trunk: `depth` pairs of (attention, feedforward)
        self.layers = ModuleList([])

        for _ in range(depth):
            self.layers.append(ModuleList([
                Attention(dim = dim, dim_head = dim_head, heads = heads),
                FeedForward(dim = dim, mult = ff_mult)
            ]))

        # final pre-logits normalization
        self.final_norm = RMSNorm(dim)

        # language modeling head
        self.to_logits = nn.Linear(dim, num_tokens)

        # Q head over the same embeddings: dueling variant or a plain linear
        if dueling_q_head:
            self.to_q = DuelingHead(num_tokens = num_tokens, dim = dim, expansion_factor = dueling_q_head_expansion_factor)
        else:
            self.to_q = nn.Linear(dim, num_tokens)

    def forward(
        self,
        x,
        return_q_values = False
    ):
        # x: (batch, seq) of token ids
        seq_len, device = x.shape[-1], x.device

        x = self.token_emb(x)

        # positional angles for this sequence length
        rotary_emb = self.rotary_emb(seq_len, device = device)

        # pre-norm residual blocks (norms live inside attn / ff)
        for attn, ff in self.layers:
            x = attn(x, rotary_emb = rotary_emb) + x
            x = ff(x) + x

        embed = self.final_norm(x)
        logits = self.to_logits(embed)

        # optionally also return per-token Q values from the shared embeddings
        if not return_q_values:
            return logits

        return logits, self.to_q(embed)

.\lucidrains\llama-qrlhf\llama_qrlhf\llama_qrlhf.py

import torch
from torch.nn import Module
from torch.utils.data import Dataset
from torch import nn, einsum, Tensor
import torch.nn.functional as F

from einops import rearrange, repeat

from ema_pytorch import EMA

from beartype import beartype
from beartype.typing import Optional

from torchtyping import TensorType

from accelerate import Accelerator

# helper functions

# helper: a value "exists" unless it is literally None (0/''/[] still count)
def exists(v):
    return v is not None

# tensor helpers

# pick one entry from the last dim of `t` per leading position
def batch_select_indices(t, indices):
    """Select `t[..., indices[...]]`.

    Shapes: t (..., num), indices (...) -> result (...).
    Implemented with plain `Tensor.gather` plus unsqueeze/squeeze instead of
    the original einops rearrange round-trip — same behavior, one fewer
    dependency on this hot path.
    """
    indices = indices.unsqueeze(-1)
    selected = t.gather(-1, indices)
    return selected.squeeze(-1)

# Q functions

# autoregressive Q-learning over token sequences
def autoregressive_q_learn(
    model:          Module,
    ema_model:      Module,
    states:         TensorType['b', 'n', int],     # whole sequence: prompt + generated tokens
    prompt_len:     TensorType['b', int],          # length of the leading prompt per batch row
    next_states:    TensorType['b', int],          # chosen action, which becomes the next state
    rewards:        TensorType['b', 'n', float],   # rewards may arrive at the end or mid-sequence
    eos_id:         Optional[int] = None,          # derive terminal states from the <eos> token id
    discount_gamma: float = 0.998                  # discount; encourages brevity of answers
) -> TensorType[()]:
    """Temporal-difference loss treating token generation as Q-learning.

    The "action" at position t is the token observed at position t + 1
    (`next_states` supplies the action after the final position). Q(s_t, a_t)
    regresses onto max_a Q_target(s_{t+1}, a); at steps with a reward the
    classic Bellman target `r + γ * max Q_target` is used instead
    (paper section 4.1, eq. 1).

    einops notation: b - batch, n - sequence length.

    NOTE(review): the original body referenced several undefined names
    (`actions`, `pack`, `next_state`, `done`, `q_target_all_actions`,
    `losses_with_reward`) and could not run; this is the reconstructed
    intent — confirm against the training loop once it exists.
    """
    seq_len, device = states.shape[-1], states.device

    # greek letter so the bellman equation below reads like the paper
    γ = discount_gamma

    # action taken at position t = token observed at position t + 1,
    # with the sampled next_states appended for the final position
    actions = torch.cat((states[:, 1:], next_states.unsqueeze(-1)), dim = -1)

    # predicted Q for every action, then select the taken actions
    q_pred_all_actions = model(states)
    q_pred = q_pred_all_actions.gather(-1, actions.unsqueeze(-1)).squeeze(-1)

    # the successor sequence fed to the target network is that same shifted sequence
    q_target_input = actions

    # target Q = max over actions from the EMA (target) network
    q_target_all_actions = ema_model(q_target_input)
    q_target = q_target_all_actions.max(dim = -1).values

    # everything at and after the first <eos> counts as terminal
    if eos_id is not None:
        dones = states == eos_id
        dones = dones.cumsum(dim = -1) > 0
        dones = F.pad(dones, (1, -1), value = False)

        not_terminal = (~dones).float()

        # no rewards on terminal steps, and terminal targets bootstrap to 0
        rewards = rewards * not_terminal
        q_target = q_target.masked_fill(dones, 0.)

    # without a reward: Q at time t regresses onto max Q_target at t + 1
    losses_without_rewards = F.mse_loss(q_pred, q_target, reduction = 'none')

    # with a reward: classic Bellman target
    q_target_with_rewards = rewards + γ * q_target
    losses_with_rewards = F.mse_loss(q_pred, q_target_with_rewards, reduction = 'none')

    # pick per position depending on whether a reward was given there
    losses = torch.where(
        rewards > 0.,
        losses_with_rewards,
        losses_without_rewards
    )

    # masked mean — only positions from the last prompt token onward are actions
    is_action_mask = torch.arange(seq_len, device = device) > (prompt_len - 1).unsqueeze(-1)
    losses = losses[is_action_mask]

    return losses.mean()

# conservative regularization: push the Q values of actions NOT taken in the
# dataset down toward the minimum attainable return
def conservative_regularization_loss(
    q_values:           TensorType['b', 'n', 'a', float],
    states_and_actions: TensorType['b', 'n', int],
    action_mask:        TensorType['b', 'n', bool],
    reward_min:         float = 0.
) -> TensorType[()]:
    """Regress the Q values of all non-dataset actions, at the positions
    selected by `action_mask`, onto `reward_min * seq_len` (a conservative
    lower bound on return)."""
    batch, seq_len, num_actions = q_values.shape
    device = q_values.device

    # True exactly where the action equals the dataset token
    # (fix: the original named this `non_dataset_actions`, the inverse of its value)
    is_dataset_action = torch.arange(num_actions, device = device) == states_and_actions.unsqueeze(-1)

    # keep the num_actions - 1 non-dataset Q values per position
    q_values = q_values[~is_dataset_action]
    q_values = q_values.view(batch, seq_len, num_actions - 1)

    # restrict to action (non-prompt) positions
    q_values = q_values[action_mask]

    # explicit same-shape target (same value as before) avoids the mse_loss
    # size-mismatch broadcasting warning against a 0-dim tensor
    target = torch.full_like(q_values, reward_min * seq_len)

    return F.mse_loss(q_values, target)
# 主要类

# trainer container for Q-learning based RLHF
class QRLHF(Module):
    """Pairs a language model with an EMA target copy of it.

    Scaffold for Q-learning RLHF — the training loop itself is not
    implemented yet (`forward` raises)."""

    @beartype
    def __init__(
        self,
        model:   Module,
        dataset: Dataset,
        accelerate_kwargs: dict = dict(),
        ema_kwargs: dict = dict(
            beta = 0.99
        )
    ):
        super().__init__()

        # online network, plus its slowly-tracking EMA target network
        self.lm = model
        self.lm_target = EMA(model, **ema_kwargs)

    def forward(self):
        # NOTE(review): dataset / accelerate_kwargs are accepted but unused so far
        raise NotImplementedError

.\lucidrains\llama-qrlhf\llama_qrlhf\__init__.py

# 从 llama_qrlhf 模块中导入 QRLHF 类
from llama_qrlhf.llama_qrlhf import QRLHF