Lucidrains-系列项目源码解析-七十六-

165 次阅读 · 16 分钟

Lucidrains 系列项目源码解析(七十六)

.\lucidrains\phenaki-pytorch\setup.py

# 导入设置和查找包的函数
from setuptools import setup, find_packages

# Package/installation metadata for the phenaki-pytorch distribution,
# consumed by setuptools at build/install time.
setup(
  name = 'phenaki-pytorch',  # distribution name on PyPI
  packages = find_packages(exclude=[]),  # include every package found in the tree
  version = '0.4.2',  # release version
  license='MIT',  # license identifier
  description = 'Phenaki - Pytorch',  # short summary
  author = 'Phil Wang',  # author
  author_email = 'lucidrains@gmail.com',  # author contact
  long_description_content_type = 'text/markdown',  # README is markdown
  url = 'https://github.com/lucidrains/phenaki-pytorch',  # project homepage
  keywords = [  # PyPI search keywords
    'artificial intelligence',
    'deep learning',
    'transformers',
    'attention mechanisms',
    'text-to-video'
  ],
  install_requires = [  # runtime dependencies
    'accelerate',
    'beartype',
    'einops>=0.7',
    'ema-pytorch>=0.2.2',
    'opencv-python',
    'pillow',
    'numpy',
    'sentencepiece',
    'torch>=1.6',
    'torchtyping',
    'torchvision',
    'transformers>=4.20.1',
    'tqdm',
    'vector-quantize-pytorch>=1.11.8'
  ],
  classifiers=[  # PyPI trove classifiers
    'Development Status :: 4 - Beta',
    'Intended Audience :: Developers',
    'Topic :: Scientific/Engineering :: Artificial Intelligence',
    'License :: OSI Approved :: MIT License',
    'Programming Language :: Python :: 3.6',
  ],
)

.\lucidrains\pi-GAN-pytorch\pi_gan_pytorch\coordconv.py

# 从给定链接中导入所需的库
# https://github.com/mkocabas/CoordConv-pytorch/blob/master/CoordConv.py
import torch
import torch.nn as nn

# CoordConv helper: append normalized coordinate channels (and, optionally,
# a radius channel) to a batch of feature maps.
class AddCoords(nn.Module):

    def __init__(self, with_r=False):
        super().__init__()
        # whether to also append the radial-distance channel
        self.with_r = with_r

    def forward(self, input_tensor):
        """
        Args:
            input_tensor: shape(batch, channel, x_dim, y_dim)
        """
        batch, _, x_dim, y_dim = input_tensor.size()

        # integer index grids, one varying along each spatial axis
        xs = torch.arange(x_dim).repeat(1, y_dim, 1)
        ys = torch.arange(y_dim).repeat(1, x_dim, 1).transpose(1, 2)

        # rescale from [0, dim - 1] into [-1, 1]
        xs = xs.float() / (x_dim - 1) * 2 - 1
        ys = ys.float() / (y_dim - 1) * 2 - 1

        # broadcast over the batch, align axes to (batch, 1, x_dim, y_dim),
        # and match the input's dtype/device
        xs = xs.repeat(batch, 1, 1, 1).transpose(2, 3).type_as(input_tensor)
        ys = ys.repeat(batch, 1, 1, 1).transpose(2, 3).type_as(input_tensor)

        channels = [input_tensor, xs, ys]

        if self.with_r:
            # distance of every position from the (0.5, 0.5) reference point
            channels.append(torch.sqrt((xs - 0.5) ** 2 + (ys - 0.5) ** 2))

        return torch.cat(channels, dim=1)

# Conv2d preceded by AddCoords: the convolution sees the input features plus
# explicit coordinate channels (and a radius channel when with_r is set).
class CoordConv(nn.Module):

    def __init__(self, in_channels, out_channels, with_r=False, **kwargs):
        super().__init__()
        self.addcoords = AddCoords(with_r=with_r)
        # two coordinate channels are appended, plus one radius channel if requested
        extra_channels = 3 if with_r else 2
        self.conv = nn.Conv2d(in_channels + extra_channels, out_channels, **kwargs)

    def forward(self, x):
        # augment with coordinates, then convolve
        return self.conv(self.addcoords(x))

.\lucidrains\pi-GAN-pytorch\pi_gan_pytorch\nerf.py

# 从给定链接中获取的代码,需要从3D输入重构为5D输入(包含光线方向)

import torch
import torch.nn.functional as F
from einops import repeat, rearrange

# Build a 2d grid from two 1d tensors, returned in "xy" ordering: the usual
# torch 'ij' meshgrid with the last two axes swapped.
def meshgrid_xy(tensor1, tensor2):
    grids = torch.meshgrid(tensor1, tensor2)
    return tuple(g.transpose(-1, -2) for g in grids)

# Exclusive cumulative product along the last axis: position i holds the
# product of all *preceding* entries, with 1 at position 0.
def cumprod_exclusive(tensor):
    shifted = torch.roll(torch.cumprod(tensor, dim = -1), 1, -1)
    shifted[..., 0] = 1.
    return shifted

# For every pixel of a height x width image, produce the origin and
# direction of the camera ray through that pixel, in world coordinates.
def get_ray_bundle(height, width, focal_length, tform_cam2world):
    ii, jj = meshgrid_xy(
      torch.arange(width).to(tform_cam2world),
      torch.arange(height).to(tform_cam2world)
    )

    # pinhole model: camera looks down -z, principal point at the image center
    cam_dirs = torch.stack([
        (ii - width * .5) / focal_length,
        -(jj - height * .5) / focal_length,
        -torch.ones_like(ii)
    ], dim=-1)

    # rotate directions into world space; every ray starts at the camera position
    rot = tform_cam2world[:3, :3]
    ray_directions = torch.sum(cam_dirs[..., None, :] * rot, dim=-1)
    ray_origins = tform_cam2world[:3, -1].expand(ray_directions.shape)
    return ray_origins, ray_directions

# Sample num_samples depths in [near_thresh, far_thresh] along every ray and
# return the corresponding 3d query points plus the depths actually used.
def compute_query_points_from_rays(
    ray_origins,
    ray_directions,
    near_thresh,
    far_thresh,
    num_samples,
    randomize = True
):
    depth_values = torch.linspace(near_thresh, far_thresh, num_samples).to(ray_origins)

    if randomize is True:
        # stratified sampling: jitter each depth uniformly within its bin
        noise_shape = list(ray_origins.shape[:-1]) + [num_samples]
        jitter = torch.rand(noise_shape).to(ray_origins)
        depth_values = depth_values + jitter * (far_thresh - near_thresh) / num_samples

    # point = origin + direction * depth, broadcast over the sample axis
    query_points = ray_origins[..., None, :] + ray_directions[..., None, :] * depth_values[..., :, None]
    return query_points, depth_values

# NeRF volume rendering: integrate per-sample color and density along each
# ray into an rgb map, an expected-depth map and an accumulated-weight map.
def render_volume_density(
    radiance_field,
    ray_origins,
    depth_values
):
    # channel 3 holds density (clamped non-negative), channels 0..2 hold color
    sigma_a = F.relu(radiance_field[..., 3])
    rgb = torch.sigmoid(radiance_field[..., :3])

    # spacing between consecutive samples; a huge sentinel closes the last bin
    inf_dist = torch.tensor([1e10], dtype=ray_origins.dtype, device=ray_origins.device)
    dists = torch.cat((
        depth_values[..., 1:] - depth_values[..., :-1],
        inf_dist.expand(depth_values[..., :1].shape)
    ), dim=-1)

    # per-sample opacity, weighted by the transmittance accumulated so far
    alpha = 1. - torch.exp(-sigma_a * dists)
    weights = alpha * cumprod_exclusive(1. - alpha + 1e-10)

    rgb_map = (weights[..., None] * rgb).sum(dim=-2)
    depth_map = (weights * depth_values).sum(dim=-1)
    acc_map = weights.sum(-1)

    return rgb_map, depth_map, acc_map

# Render one image per latent code: shoot a camera ray through every pixel,
# sample the NeRF-style `model` along each ray, and volume-render the result.
def get_image_from_nerf_model(
    model,
    latents,
    height,
    width,
    focal_length = 140,
    tform_cam2world = torch.eye(4),  # NOTE(review): default tensor is built once at import time; it is only read here (`.to` returns a copy), so the shared default is harmless
    near_thresh = 2.,
    far_thresh = 6.,
    depth_samples_per_ray = 32
):
    # match the latents' device and dtype
    tform_cam2world = tform_cam2world.to(latents)

    # one ray per pixel, in world coordinates
    ray_origins, ray_directions = get_ray_bundle(height, width, focal_length,
                                               tform_cam2world)

    # sample positions along every ray (uses the default stratified jitter)
    query_points, depth_values = compute_query_points_from_rays(
      ray_origins, ray_directions, near_thresh, far_thresh, depth_samples_per_ray
    )

    # flatten (h, w, samples, 3) -> (h*w*samples, 3) for the model call
    flattened_query_points = query_points.reshape((-1, 3))

    images = []
    # render each latent independently
    for latent in latents.unbind(0):
        predictions = []
        predictions.append(model(latent, flattened_query_points))

        radiance_field_flattened = torch.cat(predictions, dim=0)

        # back to (h, w, samples, 4) for volume rendering
        unflattened_shape = list(query_points.shape[:-1]) + [4]
        radiance_field = torch.reshape(radiance_field_flattened, unflattened_shape)

        rgb_predicted, _, _ = render_volume_density(radiance_field, ray_origins, depth_values)
        # channels-first layout for the downstream discriminator
        image = rearrange(rgb_predicted, 'h w c -> c h w')
        images.append(image)

    return torch.stack(images)

.\lucidrains\pi-GAN-pytorch\pi_gan_pytorch\pi_gan_pytorch.py

# 导入所需的库
import math
from pathlib import Path
from functools import partial

import torch
from torch import nn, einsum
import torch.nn.functional as F
from torch.autograd import grad as torch_grad

from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from torch.optim.lr_scheduler import LambdaLR

from tqdm import trange
from PIL import Image
import torchvision
from torchvision.utils import save_image
import torchvision.transforms as T

# 导入自定义模块
from pi_gan_pytorch.coordconv import CoordConv
from pi_gan_pytorch.nerf import get_image_from_nerf_model
from einops import rearrange, repeat

# The Trainer and sampling utilities below call .cuda() unconditionally,
# so fail fast at import time when no CUDA device is present.
assert torch.cuda.is_available(), 'You need to have an Nvidia GPU with CUDA installed.'

# 定义一些辅助函数

def exists(val):
    """True when val is not None."""
    return val is not None

def leaky_relu(p = 0.2):
    """Build a LeakyReLU activation module with negative slope p."""
    return nn.LeakyReLU(negative_slope = p)

def to_value(t):
    """Detach a scalar tensor from the graph and return it as a Python number."""
    return t.detach().clone().item()

def get_module_device(module):
    """Device on which the module's (first) parameter lives."""
    first_param = next(module.parameters())
    return first_param.device

# 定义损失函数

def gradient_penalty(images, output, weight = 10):
    """Gradient penalty: pushes the norm of d(output)/d(images) toward 1 and
    returns weight * mean squared deviation from 1."""
    batch = images.shape[0]

    grads, = torch_grad(
        outputs = output,
        inputs = images,
        grad_outputs = torch.ones(output.size(), device = images.device),
        create_graph = True,
        retain_graph = True,
        only_inputs = True
    )

    flat = grads.reshape(batch, -1)
    penalty = (flat.norm(2, dim = 1) - 1).pow(2).mean()
    return weight * penalty

# 定义正弦激活函数

class Sine(nn.Module):
    """Sinusoidal activation sin(w0 * x), as used by SIREN layers."""
    def __init__(self, w0 = 1.):
        super().__init__()
        # frequency multiplier applied before the sine
        self.w0 = w0

    def forward(self, x):
        return (x * self.w0).sin()

# 定义 Siren 层

class Siren(nn.Module):
    """One SIREN layer: y = activation(W x + b [* gamma] [+ beta]).

    Weights use the SIREN initialization scheme; the activation defaults to
    Sine(w0). Optional FiLM conditioning (gamma scale, beta shift) is applied
    before the activation.
    """

    def __init__(self, dim_in, dim_out, w0 = 1., c = 6., is_first = False, use_bias = True, activation = None):
        super().__init__()
        self.dim_in = dim_in
        self.is_first = is_first

        weight = torch.zeros(dim_out, dim_in)
        bias = torch.zeros(dim_out) if use_bias else None
        self.init_(weight, bias, c = c, w0 = w0)

        self.weight = nn.Parameter(weight)
        self.bias = nn.Parameter(bias) if use_bias else None
        self.activation = Sine(w0) if activation is None else activation

    def init_(self, weight, bias, c, w0):
        # SIREN init: uniform in +/- 1/dim for the first layer,
        # +/- sqrt(c/dim)/w0 for all subsequent layers
        bound = (1 / self.dim_in) if self.is_first else (math.sqrt(c / self.dim_in) / w0)
        weight.uniform_(-bound, bound)
        if bias is not None:
            bias.uniform_(-bound, bound)

    def forward(self, x, gamma = None, beta = None):
        out = F.linear(x, self.weight, self.bias)

        # FiLM modulation: optional scale, then optional shift
        if exists(gamma):
            out = out * gamma
        if exists(beta):
            out = out + beta

        return self.activation(out)

# 定义映射网络

class EqualLinear(nn.Module):
    """Linear layer with an equalized learning-rate multiplier: parameters are
    stored at unit scale and multiplied by lr_mul at forward time, so the
    mapping network effectively trains with a reduced learning rate.

    Fix: the original never assigned self.bias when bias=False, so forward
    raised AttributeError (and would have multiplied None by lr_mul); the
    bias is now properly optional.
    """

    def __init__(self, in_dim, out_dim, lr_mul = 0.1, bias = True):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(out_dim, in_dim))
        # keep the attribute defined even without a bias so forward can test it
        self.bias = nn.Parameter(torch.zeros(out_dim)) if bias else None

        self.lr_mul = lr_mul

    def forward(self, input):
        bias = self.bias * self.lr_mul if self.bias is not None else None
        return F.linear(input, self.weight * self.lr_mul, bias = bias)

class MappingNetwork(nn.Module):
    """Maps a latent code (l2-normalized first) through a small MLP and emits
    the FiLM parameters (gamma, beta) consumed by the SIREN generator."""

    def __init__(self, *, dim, dim_out, depth = 3, lr_mul = 0.1):
        super().__init__()

        blocks = []
        for _ in range(depth):
            blocks.append(EqualLinear(dim, dim, lr_mul))
            blocks.append(leaky_relu())
        self.net = nn.Sequential(*blocks)

        # separate heads for the scale and shift parameters
        self.to_gamma = nn.Linear(dim, dim_out)
        self.to_beta = nn.Linear(dim, dim_out)

    def forward(self, x):
        hidden = self.net(F.normalize(x, dim = -1))
        return self.to_gamma(hidden), self.to_beta(hidden)

# 定义 Siren 网络

class SirenNet(nn.Module):
    """Stack of FiLM-conditioned SIREN layers.

    The first layer uses w0_initial (30 by SIREN convention) and the
    first-layer initialization range; every following layer uses w0. The
    final layer may carry a custom activation (None -> Sine).
    """

    def __init__(self, dim_in, dim_hidden, dim_out, num_layers, w0 = 1., w0_initial = 30., use_bias = True, final_activation = None):
        super().__init__()

        hidden_layers = []
        for ind in range(num_layers):
            is_first = ind == 0
            hidden_layers.append(Siren(
                dim_in = dim_in if is_first else dim_hidden,
                dim_out = dim_hidden,
                w0 = w0_initial if is_first else w0,
                use_bias = use_bias,
                is_first = is_first
            ))
        self.layers = nn.ModuleList(hidden_layers)

        self.last_layer = Siren(dim_in = dim_hidden, dim_out = dim_out, w0 = w0, use_bias = use_bias, activation = final_activation)

    def forward(self, x, gamma, beta):
        # every hidden layer receives the same FiLM (gamma, beta) pair
        for layer in self.layers:
            x = layer(x, gamma, beta)
        return self.last_layer(x)
# SIREN-based radiance field generator: maps a latent code plus 3d
# coordinates to per-point (rgb, alpha) via FiLM-conditioned SIREN layers.
class SirenGenerator(nn.Module):
    def __init__(
        self,
        *,
        dim,
        dim_hidden,
        siren_num_layers = 8
    ):
        super().__init__()

        # latent -> FiLM parameters (gamma, beta) shared by all SIREN layers
        self.mapping = MappingNetwork(
            dim = dim,
            dim_out = dim_hidden
        )

        # trunk: 3d coordinate -> hidden feature
        self.siren = SirenNet(
            dim_in = 3,
            dim_hidden = dim_hidden,
            dim_out = dim_hidden,
            num_layers = siren_num_layers
        )

        # density head
        self.to_alpha = nn.Linear(dim_hidden, 1)

        # one extra FiLM-conditioned SIREN layer before the color head
        self.to_rgb_siren = Siren(
            dim_in = dim_hidden,
            dim_out = dim_hidden
        )

        # color head
        self.to_rgb = nn.Linear(dim_hidden, 3)

    def forward(self, latent, coors, batch_size = 8192):
        # FiLM parameters are derived once from the latent
        gamma, beta = self.mapping(latent)

        outs = []
        # process coordinates in chunks of batch_size to bound peak memory
        for coor in coors.split(batch_size):
            # add a leading singleton axis so (gamma, beta) broadcast over points
            gamma_, beta_ = map(lambda t: rearrange(t, 'n -> () n'), (gamma, beta))
            x = self.siren(coor, gamma_, beta_)
            alpha = self.to_alpha(x)

            # NOTE(review): the un-reshaped gamma/beta are used here rather than
            # gamma_/beta_; both broadcast to the same result — confirm intended
            x = self.to_rgb_siren(x, gamma, beta)
            rgb = self.to_rgb(x)
            # per-point output layout is (r, g, b, alpha)
            out = torch.cat((rgb, alpha), dim = -1)
            outs.append(out)

        return torch.cat(outs)

# Wraps SirenGenerator as a NeRF model and renders square images of side
# self.image_size from latent codes.
class Generator(nn.Module):
    def __init__(
        self,
        *,
        image_size,
        dim,
        dim_hidden,
        siren_num_layers
    ):
        super().__init__()
        self.dim = dim
        self.image_size = image_size

        self.nerf_model = SirenGenerator(
            dim = dim,
            dim_hidden = dim_hidden,
            siren_num_layers = siren_num_layers
        )

    def set_image_size(self, image_size):
        # the working resolution grows during progressive training
        self.image_size = image_size

    def forward(self, latents):
        side = self.image_size
        # render one square image per latent code
        return get_image_from_nerf_model(
            self.nerf_model,
            latents,
            side,
            side
        )

# Residual downsampling block: two 3x3 CoordConvs + average pooling on the
# main path, a strided 1x1 CoordConv on the shortcut.
class DiscriminatorBlock(nn.Module):
    def __init__(self, dim, dim_out):
        super().__init__()
        # shortcut: stride-2 1x1 conv matches the pooled resolution
        self.res = CoordConv(dim, dim_out, kernel_size = 1, stride = 2)

        self.net = nn.Sequential(
            CoordConv(dim, dim_out, kernel_size = 3, padding = 1),
            leaky_relu(),
            CoordConv(dim_out, dim_out, kernel_size = 3, padding = 1),
            leaky_relu()
        )

        self.down = nn.AvgPool2d(2)

    def forward(self, x):
        shortcut = self.res(x)
        out = self.down(self.net(x))
        return out + shortcut

# Progressive-growing discriminator built from CoordConv blocks.
class Discriminator(nn.Module):
    """Judges images starting at init_resolution and doubles the active
    resolution on each call to increase_resolution_, blending the new
    from_rgb entry branch in with an alpha that decays from 1 to 0.

    Fix: the original `chans = list(reversed(...)` line was missing its
    closing parenthesis and did not parse.
    """

    def __init__(
        self,
        image_size,
        init_chan = 64,
        max_chan = 400,
        init_resolution = 32,
        add_layer_iters = 10000
    ):
        super().__init__()
        num_resolutions = math.log2(image_size)
        assert num_resolutions.is_integer(), 'image size must be a power of 2'
        assert math.log2(init_resolution).is_integer(), 'initial resolution must be power of 2'

        num_resolutions = int(num_resolutions)
        layers = num_resolutions - 1

        # channel counts double as resolution halves, capped at max_chan
        chans = list(reversed(list(map(lambda t: 2 ** (11 - t), range(layers)))))
        chans = list(map(lambda n: min(max_chan, n), chans))
        chans = [init_chan, *chans]
        final_chan = chans[-1]

        self.from_rgb_layers = nn.ModuleList([])
        self.layers = nn.ModuleList([])
        self.image_size = image_size
        self.resolutions = list(map(lambda t: 2 ** (7 - t), range(layers)))

        for resolution, in_chan, out_chan in zip(self.resolutions, chans[:-1], chans[1:]):

            # a from_rgb entry point exists only for resolutions the
            # discriminator may start judging from (>= init_resolution)
            from_rgb_layer = nn.Sequential(
                CoordConv(3, in_chan, kernel_size = 1),
                leaky_relu()
            ) if resolution >= init_resolution else None

            self.from_rgb_layers.append(from_rgb_layer)

            self.layers.append(DiscriminatorBlock(
                dim = in_chan,
                dim_out = out_chan
            ))

        self.final_conv = CoordConv(final_chan, 1, kernel_size = 2)

        # progressive-growing state kept in buffers so it is checkpointed
        self.add_layer_iters = add_layer_iters
        self.register_buffer('alpha', torch.tensor(0.))
        self.register_buffer('resolution', torch.tensor(init_resolution))
        self.register_buffer('iterations', torch.tensor(0.))

    def increase_resolution_(self):
        """Double the working resolution (no-op at full size) and restart the
        fade-in; alpha becomes alpha + 1 (i.e. 1 once fully faded out)."""
        if self.resolution >= self.image_size:
            return

        self.alpha += self.alpha + (1 - self.alpha)
        self.iterations.fill_(0.)
        self.resolution *= 2

    def update_iter_(self):
        """Advance one training step: decay the fade-in alpha toward 0."""
        self.iterations += 1
        self.alpha -= (1 / self.add_layer_iters)
        self.alpha.clamp_(min = 0.)

    def forward(self, img):
        x = img

        for resolution, from_rgb, layer in zip(self.resolutions, self.from_rgb_layers, self.layers):
            # skip blocks coarser than the current working resolution
            if self.resolution < resolution:
                continue

            # entry point: project rgb into feature space at the top active block
            if self.resolution == resolution:
                x = from_rgb(x)

            # fade-in: blend features with a freshly projected, downsampled rgb
            if bool(resolution == (self.resolution // 2)) and bool(self.alpha > 0):
                x_down = F.interpolate(img, scale_factor = 0.5)
                x = x * (1 - self.alpha) + from_rgb(x_down) * self.alpha

            x = layer(x)

        out = self.final_conv(x)
        return out
# Top-level pi-GAN module tying together the generator and discriminator.
class piGAN(nn.Module):
    def __init__(
        self,
        *,
        image_size,
        dim,
        init_resolution = 32,
        generator_dim_hidden = 256,
        siren_num_layers = 6,
        add_layer_iters = 10000
    ):
        super().__init__()
        # latent dimensionality, read by the training loop when sampling
        self.dim = dim

        # SIREN/NeRF-based generator
        self.G = Generator(
            image_size = image_size,
            dim = dim,
            dim_hidden = generator_dim_hidden,
            siren_num_layers = siren_num_layers
        )

        # progressively grown CoordConv discriminator
        self.D = Discriminator(
            image_size = image_size,
            add_layer_iters = add_layer_iters,
            init_resolution = init_resolution
        )

# 定义数据集相关函数

# Endless iterator: restart the iterable each time it is exhausted.
def cycle(iterable):
    while True:
        yield from iterable

# Upscale images whose largest side is below min_size so the downstream
# Resize/CenterCrop transforms always have enough pixels to work with.
def resize_to_minimum_size(min_size, image):
    if max(*image.size) >= min_size:
        return image
    return torchvision.transforms.functional.resize(image, min_size)

# Folder-of-images dataset: recursively gathers files with the requested
# extensions and yields resized, center-cropped image tensors.
class ImageDataset(Dataset):
    def __init__(
        self,
        folder,
        image_size,
        transparent = False,  # NOTE(review): accepted but never used in this class
        aug_prob = 0.,        # NOTE(review): accepted but never used in this class
        exts = ['jpg', 'jpeg', 'png']
    ):
        super().__init__()
        self.folder = folder
        self.image_size = image_size
        # recursive glob over every requested extension
        self.paths = [p for ext in exts for p in Path(f'{folder}').glob(f'**/*.{ext}')]
        assert len(self.paths) > 0, f'No images were found in {folder} for training'
        self.create_transform(image_size)

    def create_transform(self, image_size):
        """(Re)build the preprocessing pipeline; called again by the Trainer
        whenever the training resolution grows."""
        self.transform = T.Compose([
            # upscale tiny images first so Resize/CenterCrop behave
            T.Lambda(partial(resize_to_minimum_size, image_size)),
            T.Resize(image_size),
            T.CenterCrop(image_size),
            T.ToTensor()
        ])

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, index):
        # NOTE(review): images are not converted to RGB here; a palette or
        # RGBA file would yield a different channel count — confirm inputs
        path = self.paths[index]
        img = Image.open(path)
        return self.transform(img)

# 训练器类

# Draw batch_size random latent codes on the GPU and render them with G.
def sample_generator(G, batch_size):
    noise = torch.randn(batch_size, G.dim).cuda()
    return G(noise)

class Trainer(nn.Module):
    """Training loop for pi-GAN.

    Alternates discriminator and generator updates with gradient
    accumulation, a hinge loss, a gradient penalty applied every 4th step,
    progressive resolution growth (every add_layers_iters steps) and linear
    learning-rate annealing toward the target rates. Requires CUDA.
    """
    def __init__(
        self,
        *,
        gan,
        folder,
        add_layers_iters = 10000,
        batch_size = 8,
        gradient_accumulate_every = 4,
        sample_every = 100,
        log_every = 10,
        num_train_steps = 50000,
        lr_gen = 5e-5,
        lr_discr = 4e-4,
        target_lr_gen = 1e-5,
        target_lr_discr = 1e-4,
        lr_decay_span = 10000
    ):
        super().__init__()
        gan.D.add_layer_iters = add_layers_iters
        self.add_layers_iters = add_layers_iters

        # move the whole GAN onto the GPU
        self.gan = gan.cuda()

        # separate Adam optimizers for discriminator and generator
        self.optim_D = Adam(self.gan.D.parameters(), betas=(0, 0.9), lr = lr_discr)
        self.optim_G = Adam(self.gan.G.parameters(), betas=(0, 0.9), lr = lr_gen)

        # linear interpolation from the initial lr to the target lr over lr_decay_span steps
        D_decay_fn = lambda i: max(1 - i / lr_decay_span, 0) + (target_lr_discr / lr_discr) * min(i / lr_decay_span, 1)
        G_decay_fn = lambda i: max(1 - i / lr_decay_span, 0) + (target_lr_gen / lr_gen) * min(i / lr_decay_span, 1)

        # schedulers applying the decay functions above
        self.sched_D = LambdaLR(self.optim_D, D_decay_fn)
        self.sched_G = LambdaLR(self.optim_G, G_decay_fn)

        self.iterations = 0
        self.batch_size = batch_size
        self.num_train_steps = num_train_steps

        self.log_every = log_every
        self.sample_every = sample_every
        self.gradient_accumulate_every = gradient_accumulate_every

        # dataset starts at the discriminator's initial working resolution
        self.dataset = ImageDataset(folder = folder, image_size = gan.D.resolution.item())
        self.dataloader = cycle(DataLoader(self.dataset, batch_size = batch_size, shuffle = True, drop_last = True))

        self.last_loss_D = 0
        self.last_loss_G = 0
    # One optimization step: train D, then G, then update schedulers/state.
    def step(self):
        # unpack the pieces used repeatedly below
        D, G, batch_size, dim, accumulate_every = self.gan.D, self.gan.G, self.batch_size, self.gan.dim, self.gradient_accumulate_every

        # progressive growing: every add_layers_iters steps, double the
        # working resolution and rebuild the dataset transform to match
        if self.iterations % self.add_layers_iters == 0:
            if self.iterations != 0:
                D.increase_resolution_()

            # propagate the new resolution to generator and dataset
            image_size = D.resolution.item()
            G.set_image_size(image_size)
            self.dataset.create_transform(image_size)

        # lazy regularization: gradient penalty only every 4th step
        apply_gp = self.iterations % 4 == 0

        # --- train discriminator ---
        D.train()
        loss_D = 0

        for _ in range(accumulate_every):
            # next batch of real images
            images = next(self.dataloader)
            # requires_grad so the gradient penalty can differentiate w.r.t. inputs
            images = images.cuda().requires_grad_()
            real_out = D(images)

            # generated images, detached so G gets no gradient here
            fake_imgs = sample_generator(G, batch_size)
            fake_out = D(fake_imgs.clone().detach())

            # hinge loss (convention here: real pushed negative, fake positive)
            divergence = (F.relu(1 + real_out) + F.relu(1 - fake_out)).mean()
            loss = divergence

            if apply_gp:
                gp = gradient_penalty(images, real_out)
                self.last_loss_gp = to_value(gp)
                loss = loss + gp

            (loss / accumulate_every).backward()
            loss_D += to_value(divergence) / accumulate_every

        self.last_loss_D = loss_D

        self.optim_D.step()
        self.optim_D.zero_grad()

        # --- train generator ---
        G.train()
        loss_G = 0

        for _ in range(accumulate_every):
            fake_out = sample_generator(G, batch_size)
            loss = D(fake_out).mean()
            (loss / accumulate_every).backward()
            loss_G += to_value(loss) / accumulate_every

        self.last_loss_G = loss_G

        self.optim_G.step()
        self.optim_G.zero_grad()

        # anneal both learning rates
        self.sched_D.step()
        self.sched_G.step()

        self.iterations += 1
        D.update_iter_()

    # Full training loop with periodic logging and image sampling.
    def forward(self):
        for _ in trange(self.num_train_steps):
            self.step()

            # print losses every log_every steps
            if self.iterations % self.log_every == 0:
                print(f'I: {self.gan.D.resolution.item()} | D: {self.last_loss_D:.2f} | G: {self.last_loss_G:.2f} | GP: {self.last_loss_gp:.2f}')

            # save a 2x2 grid of samples every sample_every steps
            if self.iterations % self.sample_every == 0:
                i = self.iterations // self.sample_every
                imgs = sample_generator(self.gan.G, 4)
                imgs.clamp_(0., 1.)
                save_image(imgs, f'./{i}.png', nrow=2)

.\lucidrains\pi-GAN-pytorch\pi_gan_pytorch\__init__.py

# 从 pi_gan_pytorch.pi_gan_pytorch 模块中导入 Generator, Discriminator, piGAN, Trainer 类
from pi_gan_pytorch.pi_gan_pytorch import Generator, Discriminator, piGAN, Trainer

π-GAN - Pytorch (wip)

Implementation of π-GAN, for 3d-aware image synthesis, in Pytorch.

Project video from authors

Install

$ pip install pi-gan-pytorch

Usage

from pi_gan_pytorch import piGAN, Trainer

gan = piGAN(
    image_size = 128,
    dim = 512
).cuda()

trainer = Trainer(
    gan = gan,
    folder = '/path/to/images'
)

trainer()

Citations

@misc{chan2020pigan,
    title={pi-GAN: Periodic Implicit Generative Adversarial Networks for 3D-Aware Image Synthesis}, 
    author={Eric R. Chan and Marco Monteiro and Petr Kellnhofer and Jiajun Wu and Gordon Wetzstein},
    year={2020},
    eprint={2012.00926},
    archivePrefix={arXiv},
    primaryClass={cs.CV}
}

.\lucidrains\pi-GAN-pytorch\setup.py

# 导入设置安装和查找包的函数
from setuptools import setup, find_packages

# Package/installation metadata for the pi-gan-pytorch distribution,
# consumed by setuptools at build/install time.
setup(
  # distribution name on PyPI
  name = 'pi-gan-pytorch',
  # include every package found in the tree
  packages = find_packages(),
  # release version
  version = '0.0.11',
  # license identifier
  license='MIT',
  # short summary
  description = 'π-GAN - Pytorch',
  # author
  author = 'Phil Wang',
  # author contact
  author_email = 'lucidrains@gmail.com',
  # project homepage
  url = 'https://github.com/lucidrains/pi-gan-pytorch',
  # PyPI search keywords
  keywords = [
    'artificial intelligence',
    'generative adversarial network'
  ],
  # runtime dependencies
  install_requires=[
    'einops>=0.3',
    'pillow',
    'torch>=1.6',
    'torchvision',
    'tqdm'
  ],
  # PyPI trove classifiers
  classifiers=[
    'Development Status :: 4 - Beta',
    'Intended Audience :: Developers',
    'Topic :: Scientific/Engineering :: Artificial Intelligence',
    'License :: OSI Approved :: MIT License',
    'Programming Language :: Python :: 3.6',
  ],
)

.\lucidrains\pixel-level-contrastive-learning\pixel_level_contrastive_learning\pixel_level_contrastive_learning.py

# 导入数学库
import math
# 导入复制库
import copy
# 导入随机库
import random
# 导入wraps和partial函数
from functools import wraps, partial
# 从数学库中导入floor函数
from math import floor

# 导入torch库
import torch
# 从torch中导入nn和einsum模块
from torch import nn, einsum
# 从torch.nn中导入functional模块
import torch.nn.functional as F

# 从kornia库中导入augmentation、filters和color模块
from kornia import augmentation as augs
from kornia import filters, color

# 从einops库中导入rearrange函数
from einops import rearrange

# 辅助函数

# Identity function: pass the argument through unchanged.
def identity(t):
    return t

# Return val unless it is None, in which case fall back to def_val.
def default(val, def_val):
    if val is None:
        return def_val
    return val

# Bernoulli draw: True with probability prob.
def rand_true(prob):
    return prob > random.random()

# Decorator factory: memoize a method's result on the instance attribute
# named cache_key, so the wrapped builder only ever runs once per object.
def singleton(cache_key):
    def inner_fn(fn):
        @wraps(fn)
        def wrapper(self, *args, **kwargs):
            cached = getattr(self, cache_key)
            if cached is None:
                cached = fn(self, *args, **kwargs)
                setattr(self, cache_key, cached)
            return cached
        return wrapper
    return inner_fn

# Device on which the module's (first) parameter lives.
def get_module_device(module):
    first_param = next(module.parameters())
    return first_param.device

# Toggle gradient tracking for every parameter of the given model.
def set_requires_grad(model, val):
    for param in model.parameters():
        param.requires_grad = val

# Pick a random axis-aligned crop window covering a random fraction (within
# ratio_range) of the image; returns (((y0, y1), (x0, x1)), ratio).
def cutout_coordinates(image, ratio_range = (0.6, 0.8)):
    _, _, height, width = image.shape

    lo, hi = ratio_range
    ratio = lo + random.random() * (hi - lo)

    crop_w, crop_h = floor(ratio * width), floor(ratio * height)
    x0 = floor((width - crop_w) * random.random())
    y0 = floor((height - crop_h) * random.random())
    return ((y0, y0 + crop_h), (x0, x0 + crop_w)), ratio

# Crop the given ((y0, y1), (x0, x1)) window out of the image and rescale it
# to output_size (defaults to the image's original spatial size).
def cutout_and_resize(image, coordinates, output_size = None, mode = 'nearest'):
    target_size = default(output_size, image.shape[2:])
    (y0, y1), (x0, x1) = coordinates
    crop = image[:, :, y0:y1, x0:x1]
    return F.interpolate(crop, size = target_size, mode = mode)

# 数据增强工具

# Apply fn to the input with probability p; otherwise pass the input through.
class RandomApply(nn.Module):
    def __init__(self, fn, p):
        super().__init__()
        self.fn = fn
        self.p = p

    def forward(self, x):
        skip = random.random() > self.p
        return x if skip else self.fn(x)

# 指数移动平均

class EMA():
    """Exponential moving average helper: new_avg = beta * old + (1 - beta) * new."""
    def __init__(self, beta):
        super().__init__()
        self.beta = beta

    def update_average(self, old, new):
        # no history yet: adopt the new value outright
        if old is None:
            return new
        return self.beta * old + new * (1 - self.beta)

# Overwrite ma_model's parameters in place with the EMA of themselves and
# the corresponding parameters of current_model.
def update_moving_average(ema_updater, ma_model, current_model):
    for src, tgt in zip(current_model.parameters(), ma_model.parameters()):
        tgt.data = ema_updater.update_average(tgt.data, src.data)

# 损失函数

# BYOL-style regression loss: 2 - 2 * cosine_similarity, ranging over [0, 4].
def loss_fn(x, y):
    x, y = map(lambda t: F.normalize(t, dim = -1, p = 2), (x, y))
    return 2 - 2 * (x * y).sum(dim = -1)

# 类

# Projection head for instance-level embeddings:
# Linear -> BatchNorm1d -> ReLU -> Linear.
class MLP(nn.Module):
    def __init__(self, chan, chan_out = 256, inner_dim = 2048):
        super().__init__()
        stages = [
            nn.Linear(chan, inner_dim),
            nn.BatchNorm1d(inner_dim),
            nn.ReLU(),
            nn.Linear(inner_dim, chan_out)
        ]
        # kept as nn.Sequential so state_dict keys stay unchanged
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        return self.net(x)

# Convolutional projection head (pixel-wise MLP via 1x1 convolutions):
# Conv1x1 -> BatchNorm2d -> ReLU -> Conv1x1.
class ConvMLP(nn.Module):
    def __init__(self, chan, chan_out = 256, inner_dim = 2048):
        super().__init__()
        stages = [
            nn.Conv2d(chan, inner_dim, 1),
            nn.BatchNorm2d(inner_dim),
            nn.ReLU(),
            nn.Conv2d(inner_dim, chan_out, 1)
        ]
        # kept as nn.Sequential so state_dict keys stay unchanged
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        return self.net(x)

# Pixel propagation module: each pixel's output feature is a similarity-
# weighted sum of the transformed features of all pixels, where the weight
# is relu(cosine_similarity) ** gamma.
class PPM(nn.Module):
    def __init__(
        self,
        *,
        chan,
        num_layers = 1,
        gamma = 2):
        super().__init__()
        # sharpening exponent for the similarity weights
        self.gamma = gamma

        # transform network depth: 0 = identity, 1 = single 1x1 conv,
        # 2 = conv -> bn -> relu -> conv
        if num_layers == 0:
            transform = nn.Identity()
        elif num_layers == 1:
            transform = nn.Conv2d(chan, chan, 1)
        elif num_layers == 2:
            transform = nn.Sequential(
                nn.Conv2d(chan, chan, 1),
                nn.BatchNorm2d(chan),
                nn.ReLU(),
                nn.Conv2d(chan, chan, 1)
            )
        else:
            raise ValueError('num_layers must be one of 0, 1, or 2')
        self.transform_net = transform

    def forward(self, x):
        # pairwise cosine similarity between every pair of spatial positions,
        # rectified and sharpened
        xi = x[:, :, :, :, None, None]
        xj = x[:, :, None, None, :, :]
        similarity = F.relu(F.cosine_similarity(xi, xj, dim = 1)) ** self.gamma

        # propagate the transformed features, weighted by similarity
        transformed = self.transform_net(x)
        return einsum('b x y h w, b c h w -> b c x y', similarity, transformed)
# 一个用于基础神经网络的包装类
# 将管理隐藏层输出的拦截并将其传递到投影器和预测器网络中

class NetWrapper(nn.Module):
    """Wrapper around a backbone network.

    Intercepts the outputs of two hidden layers via forward hooks — a spatial
    'pixel' feature map and an 'instance' level feature — and lazily builds a
    projector head for each (ConvMLP for the pixel map, MLP for the flattened
    instance vector).
    """

    def __init__(
        self,
        *,
        net,
        projection_size,
        projection_hidden_size,
        layer_pixel = -2,
        layer_instance = -2
    ):
        super().__init__()
        self.net = net
        # layer to hook for pixel-level features (module name str or child index int)
        self.layer_pixel = layer_pixel
        # layer to hook for instance-level features
        self.layer_instance = layer_instance

        # projectors are created lazily on first forward (via the @singleton helper)
        self.pixel_projector = None
        self.instance_projector = None

        self.projection_size = projection_size
        self.projection_hidden_size = projection_hidden_size

        # most recent hook-captured activations; cleared after each use
        self.hidden_pixel = None
        self.hidden_instance = None
        self.hook_registered = False

    # locate a submodule by name (str) or by child index (int); None if not found
    def _find_layer(self, layer_id):
        if type(layer_id) == str:
            modules = dict([*self.net.named_modules()])
            return modules.get(layer_id, None)
        elif type(layer_id) == int:
            children = [*self.net.children()]
            return children[layer_id]
        return None

    # forward hook: capture the pixel-layer output
    def _hook_pixel(self, _, __, output):
        setattr(self, 'hidden_pixel', output)

    # forward hook: capture the instance-layer output
    def _hook_instance(self, _, __, output):
        setattr(self, 'hidden_instance', output)

    # register both forward hooks (done once, on first use)
    def _register_hook(self):
        pixel_layer = self._find_layer(self.layer_pixel)
        instance_layer = self._find_layer(self.layer_instance)

        assert pixel_layer is not None, f'hidden layer ({self.layer_pixel}) not found'
        assert instance_layer is not None, f'hidden layer ({self.layer_instance}) not found'

        pixel_layer.register_forward_hook(self._hook_pixel)
        instance_layer.register_forward_hook(self._hook_instance)
        self.hook_registered = True

    # lazily build the pixel projector, sized from the captured activation's channels
    @singleton('pixel_projector')
    def _get_pixel_projector(self, hidden):
        _, dim, *_ = hidden.shape
        projector = ConvMLP(dim, self.projection_size, self.projection_hidden_size)
        return projector.to(hidden)

    # lazily build the instance projector, sized from the captured activation
    @singleton('instance_projector')
    def _get_instance_projector(self, hidden):
        _, dim = hidden.shape
        projector = MLP(dim, self.projection_size, self.projection_hidden_size)
        return projector.to(hidden)

    # run the backbone once and return the two hook-captured hidden representations
    def get_representation(self, x):
        if not self.hook_registered:
            self._register_hook()

        _ = self.net(x)
        hidden_pixel = self.hidden_pixel
        hidden_instance = self.hidden_instance
        # clear the captured state so stale activations cannot leak into the next call
        self.hidden_pixel = None
        self.hidden_instance = None
        assert hidden_pixel is not None, f'hidden pixel layer {self.layer_pixel} never emitted an output'
        assert hidden_instance is not None, f'hidden instance layer {self.layer_instance} never emitted an output'
        return hidden_pixel, hidden_instance

    # returns (pixel_projection, instance_projection)
    def forward(self, x):
        pixel_representation, instance_representation = self.get_representation(x)
        instance_representation = instance_representation.flatten(1)

        pixel_projector = self._get_pixel_projector(pixel_representation)
        instance_projector = self._get_instance_projector(instance_representation)

        pixel_projection = pixel_projector(pixel_representation)
        instance_projection = instance_projector(instance_representation)
        return pixel_projection, instance_projection

# 主类

class PixelCL(nn.Module):
    """Pixel-level contrastive learner ('Propagate Yourself').

    Wraps a backbone with an online encoder, a lazily-created EMA target
    encoder and, when `use_pixpro` is set, a Pixel Propagation Module.
    NOTE(review): __init__ calls self.forward, which is not defined in this
    excerpt — confirm the forward method exists later in the file.
    """

    def __init__(
        self,
        net,
        image_size,
        hidden_layer_pixel = -2,
        hidden_layer_instance = -2,
        projection_size = 256,
        projection_hidden_size = 2048,
        augment_fn = None,
        augment_fn2 = None,
        prob_rand_hflip = 0.25,
        moving_average_decay = 0.99,
        ppm_num_layers = 1,
        ppm_gamma = 2,
        distance_thres = 0.7,
        similarity_temperature = 0.3,
        alpha = 1.,
        use_pixpro = True,
        cutout_ratio_range = (0.6, 0.8),
        cutout_interpolate_mode = 'nearest',
        coord_cutout_interpolate_mode = 'bilinear'
    ):
        super().__init__()

        # default augmentation pipeline, used when none is supplied
        # (presumably kornia: `augs` / `filters` are imported outside this excerpt)
        DEFAULT_AUG = nn.Sequential(
            RandomApply(augs.ColorJitter(0.8, 0.8, 0.8, 0.2), p=0.8),
            augs.RandomGrayscale(p=0.2),
            RandomApply(filters.GaussianBlur2d((3, 3), (1.5, 1.5)), p=0.1),
            augs.RandomSolarize(p=0.5),
            augs.Normalize(mean=torch.tensor([0.485, 0.456, 0.406]), std=torch.tensor([0.229, 0.224, 0.225]))
        )

        # two augmentation streams; the second falls back to the first
        self.augment1 = default(augment_fn, DEFAULT_AUG)
        self.augment2 = default(augment_fn2, self.augment1)
        self.prob_rand_hflip = prob_rand_hflip

        # online encoder: backbone wrapped to expose pixel + instance projections
        self.online_encoder = NetWrapper(
            net = net,
            projection_size = projection_size,
            projection_hidden_size = projection_hidden_size,
            layer_pixel = hidden_layer_pixel,
            layer_instance = hidden_layer_instance
        )

        # target encoder is created lazily as an EMA copy of the online encoder
        self.target_encoder = None
        self.target_ema_updater = EMA(moving_average_decay)

        self.distance_thres = distance_thres
        self.similarity_temperature = similarity_temperature
        # weight of the pixel propagation (pixpro) loss vs the pixel contrastive loss
        self.alpha = alpha

        self.use_pixpro = use_pixpro

        # pixel propagation module, only needed for the pixpro objective
        if use_pixpro:
            self.propagate_pixels = PPM(
                chan = projection_size,
                num_layers = ppm_num_layers,
                gamma = ppm_gamma
            )

        self.cutout_ratio_range = cutout_ratio_range
        self.cutout_interpolate_mode = cutout_interpolate_mode
        self.coord_cutout_interpolate_mode = coord_cutout_interpolate_mode

        # instance-level predictor head
        self.online_predictor = MLP(projection_size, projection_size, projection_hidden_size)

        # get the backbone's device and move this wrapper to the same device
        device = get_module_device(net)
        self.to(device)

        # send a mock image tensor to instantiate the lazy singleton parameters
        # NOTE(review): relies on self.forward, not visible in this excerpt
        self.forward(torch.randn(2, 3, image_size, image_size, device=device))

    # lazily create the target encoder as a frozen deep copy of the online encoder
    @singleton('target_encoder')
    def _get_target_encoder(self):
        target_encoder = copy.deepcopy(self.online_encoder)
        set_requires_grad(target_encoder, False)
        return target_encoder

    # drop the target encoder; it will be re-created on next use
    def reset_moving_average(self):
        del self.target_encoder
        self.target_encoder = None

    # EMA-update the target encoder towards the online encoder
    def update_moving_average(self):
        assert self.target_encoder is not None, 'target encoder has not been created yet'
        update_moving_average(self.target_ema_updater, self.target_encoder, self.online_encoder)

.\lucidrains\pixel-level-contrastive-learning\pixel_level_contrastive_learning\__init__.py

# 从 pixel_level_contrastive_learning.pixel_level_contrastive_learning 模块中导入 PPM 和 PixelCL 类
from pixel_level_contrastive_learning.pixel_level_contrastive_learning import PPM, PixelCL

Pixel-level Contrastive Learning

Implementation of Pixel-level Contrastive Learning, proposed in the paper "Propagate Yourself", in Pytorch. In addition to doing contrastive learning on the pixel level, the online network further passes the pixel level representations to a Pixel Propagation Module and enforces a similarity loss to the target network. They beat all previous unsupervised and supervised methods in segmentation tasks.

Install

$ pip install pixel-level-contrastive-learning

Usage

Below is an example of how you would use the framework to self-supervise training of a resnet, taking the output of layer 4 (8 x 8 'pixels').

import torch
from pixel_level_contrastive_learning import PixelCL
from torchvision import models
from tqdm import tqdm

resnet = models.resnet50(pretrained=True)

learner = PixelCL(
    resnet,
    image_size = 256,
    hidden_layer_pixel = 'layer4',  # leads to output of 8x8 feature map for pixel-level learning
    hidden_layer_instance = -2,     # leads to output for instance-level learning
    projection_size = 256,          # size of projection output, 256 was used in the paper
    projection_hidden_size = 2048,  # size of projection hidden dimension, paper used 2048
    moving_average_decay = 0.99,    # exponential moving average decay of target encoder
    ppm_num_layers = 1,             # number of layers for transform function in the pixel propagation module, 1 was optimal
    ppm_gamma = 2,                  # sharpness of the similarity in the pixel propagation module, already at optimal value of 2
    distance_thres = 0.7,           # ideal value is 0.7, as indicated in the paper, which makes the assumption of each feature map's pixel diagonal distance to be 1 (still unclear)
    similarity_temperature = 0.3,   # temperature for the cosine similarity for the pixel contrastive loss
    alpha = 1.,                      # weight of the pixel propagation loss (pixpro) vs pixel CL loss
    use_pixpro = True,               # do pixel pro instead of pixel contrast loss, defaults to pixpro, since it is the best one
    cutout_ratio_range = (0.6, 0.8)  # a random ratio is selected from this range for the random cutout
).cuda()

opt = torch.optim.Adam(learner.parameters(), lr=1e-4)

def sample_batch_images():
    return torch.randn(10, 3, 256, 256).cuda()

for _ in tqdm(range(100000)):
    images = sample_batch_images()
    loss = learner(images) # if the number of positive pixel pairs is zero, the loss equals the instance-level loss

    opt.zero_grad()
    loss.backward()
    print(loss.item())
    opt.step()
    learner.update_moving_average() # update moving average of target encoder

# after much training, save the improved model for testing on downstream task
torch.save(resnet, 'improved-resnet.pt')

You can also return the number of positive pixel pairs on forward, for logging or other purposes

loss, positive_pairs = learner(images, return_positive_pairs = True)

Citations

@misc{xie2020propagate,
    title={Propagate Yourself: Exploring Pixel-Level Consistency for Unsupervised Visual Representation Learning}, 
    author={Zhenda Xie and Yutong Lin and Zheng Zhang and Yue Cao and Stephen Lin and Han Hu},
    year={2020},
    eprint={2011.10043},
    archivePrefix={arXiv},
    primaryClass={cs.CV}
}

.\lucidrains\pixel-level-contrastive-learning\setup.py

# 导入设置安装和查找包的函数
from setuptools import setup, find_packages

# package metadata / distribution configuration
setup(
  # distribution name on PyPI
  name = 'pixel-level-contrastive-learning',
  # include every package found in the source tree
  packages = find_packages(),
  # release version
  version = '0.1.1',
  # license identifier
  license='MIT',
  # short description
  description = 'Pixel-Level Contrastive Learning',
  # author
  author = 'Phil Wang',
  # author contact email
  author_email = 'lucidrains@gmail.com',
  # project homepage
  url = 'https://github.com/lucidrains/pixel-level-contrastive-learning',
  # search keywords
  keywords = ['self-supervised learning', 'artificial intelligence'],
  # runtime dependencies
  install_requires=[
      'einops',
      'torch>=1.6',
      'kornia>=0.4.0'
  ],
  # PyPI trove classifiers
  classifiers=[
      'Development Status :: 4 - Beta',
      'Intended Audience :: Developers',
      'Topic :: Scientific/Engineering :: Artificial Intelligence',
      'License :: OSI Approved :: MIT License',
      'Programming Language :: Python :: 3.6',
  ],
)

.\lucidrains\point-transformer-pytorch\point_transformer_pytorch\multihead_point_transformer_pytorch.py

# 导入 torch 库
import torch
# 从 torch 库中导入 nn, einsum 模块
from torch import nn, einsum
# 从 einops 库中导入 repeat, rearrange 函数
from einops import repeat, rearrange

# helpers

# 判断值是否存在的函数
def exists(val):
    # True when an optional value has actually been provided
    return not (val is None)

# 获取张量的最大值
def max_value(t):
    # largest finite value representable by t's floating-point dtype
    finfo = torch.finfo(t.dtype)
    return finfo.max

# 在指定维度上对批量索引进行选择的函数
# gather entries of `values` along `dim` using a batched `indices` tensor
def batched_index_select(values, indices, dim = 1):
    """Batched gather: select entries of `values` along `dim` per `indices`,
    broadcasting over the trailing feature dimensions of `values`.

    values:  tensor whose dims after `dim` are treated as feature dims
    indices: integer tensor whose leading dims align with `values` up to `dim`
    returns: tensor shaped like `indices` with the feature dims appended
    """
    # feature dims of `values` that trail the gathered dimension
    value_dims = values.shape[(dim + 1):]
    values_shape, indices_shape = map(lambda t: list(t.shape), (values, indices))
    # append singleton dims to indices so it can broadcast over the feature dims
    # (bugfix: the original line was missing a closing parenthesis — SyntaxError)
    indices = indices[(..., *((None,) * len(value_dims)))]
    indices = indices.expand(*((-1,) * len(indices_shape)), *value_dims)
    value_expand_len = len(indices_shape) - (dim + 1)
    # insert singleton dims into values so its rank matches the expanded indices
    values = values[(*((slice(None),) * dim), *((None,) * value_expand_len), ...)]

    value_expand_shape = [-1] * len(values.shape)
    expand_slice = slice(dim, (dim + value_expand_len))
    value_expand_shape[expand_slice] = indices.shape[expand_slice]
    values = values.expand(*value_expand_shape)

    # the gather dim shifts by however many singleton dims were inserted
    dim += value_expand_len
    return values.gather(dim, indices)

# classes

# 多头点变换器层类
# multi-head point transformer layer (vector attention over point clouds)
class MultiheadPointTransformerLayer(nn.Module):
    """Multi-head vector self-attention for point clouds.

    Attention logits come from an MLP over (q - k) plus a relative positional
    embedding of the 3D offsets, rather than from dot products.
    NOTE(review): `rearrange` / `repeat` are einops functions; the einops
    import line does not appear in this excerpt — confirm it exists at the
    top of the file.
    """

    def __init__(
        self,
        *,
        dim,
        heads = 4,
        dim_head = 64,
        pos_mlp_hidden_dim = 64,
        attn_mlp_hidden_mult = 4,
        num_neighbors = None
    ):
        super().__init__()
        self.heads = heads
        inner_dim = dim_head * heads

        # optionally restrict attention to each point's k nearest neighbors
        self.num_neighbors = num_neighbors

        # project features to queries / keys / values (3x inner_dim)
        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)
        # merge heads back down to the model dimension
        self.to_out = nn.Linear(inner_dim, dim)

        # MLP mapping 3D relative coordinates to a positional embedding
        self.pos_mlp = nn.Sequential(
            nn.Linear(3, pos_mlp_hidden_dim),
            nn.ReLU(),
            nn.Linear(pos_mlp_hidden_dim, inner_dim)
        )

        attn_inner_dim = inner_dim * attn_mlp_hidden_mult

        # per-head (grouped 1x1 conv) MLP that turns (q - k) + pos into logits
        self.attn_mlp = nn.Sequential(
            nn.Conv2d(inner_dim, attn_inner_dim, 1, groups = heads),
            nn.ReLU(),
            nn.Conv2d(attn_inner_dim, inner_dim, 1, groups = heads),
        )
    # x: (batch, n, dim) point features; pos: (batch, n, 3) coordinates;
    # mask: (batch, n) boolean, True = point is valid
    def forward(self, x, pos, mask = None):
        n, h, num_neighbors = x.shape[1], self.heads, self.num_neighbors

        # get queries, keys, values
        q, k, v = self.to_qkv(x).chunk(3, dim = -1)

        # split out the heads
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v))

        # relative positional embeddings from pairwise coordinate offsets
        rel_pos = rearrange(pos, 'b i c -> b i 1 c') - rearrange(pos, 'b j c -> b 1 j c')
        rel_pos_emb = self.pos_mlp(rel_pos)

        # split the positional embedding across heads as well
        rel_pos_emb = rearrange(rel_pos_emb, 'b i j (h d) -> b h i j d', h = h)

        # use subtraction of queries to keys. may be a better inductive bias
        # for point clouds than dot product
        qk_rel = rearrange(q, 'b h i d -> b h i 1 d') - rearrange(k, 'b h j d -> b h 1 j d')

        # pairwise validity mask: a pair is valid only if both points are
        if exists(mask):
            mask = rearrange(mask, 'b i -> b i 1') * rearrange(mask, 'b j -> b 1 j')

        # expand values so each query position has its own copy
        v = repeat(v, 'b h j d -> b h i j d', i = n)

        # if num_neighbors is set, restrict each point to its k nearest neighbors
        if exists(num_neighbors) and num_neighbors < n:
            rel_dist = rel_pos.norm(dim = -1)

            if exists(mask):
                # push invalid points to the maximum distance so topk skips them
                mask_value = max_value(rel_dist)
                rel_dist.masked_fill_(~mask, mask_value)

            dist, indices = rel_dist.topk(num_neighbors, largest = False)

            indices_with_heads = repeat(indices, 'b i j -> b h i j', h = h)

            v = batched_index_select(v, indices_with_heads, dim = 3)
            qk_rel = batched_index_select(qk_rel, indices_with_heads, dim = 3)
            rel_pos_emb = batched_index_select(rel_pos_emb, indices_with_heads, dim = 3)

            if exists(mask):
                mask = batched_index_select(mask, indices, dim = 2)

        # add the relative positional embedding to the values
        v = v + rel_pos_emb

        # attention MLP; relative positional embedding is added in first
        attn_mlp_input = qk_rel + rel_pos_emb
        attn_mlp_input = rearrange(attn_mlp_input, 'b h i j d -> b (h d) i j')

        sim = self.attn_mlp(attn_mlp_input)

        # mask out invalid pairs with a large negative value before softmax
        if exists(mask):
            mask_value = -max_value(sim)
            mask = rearrange(mask, 'b i j -> b 1 i j')
            sim.masked_fill_(~mask, mask_value)

        # attention
        # NOTE(review): with sim laid out as 'b (h d) i j', dim = -2 normalizes
        # over the query axis i rather than the neighbor axis j — confirm this
        # against the upstream implementation
        attn = sim.softmax(dim = -2)

        # aggregate values, summing over the neighbor axis j
        v = rearrange(v, 'b h i j d -> b i j (h d)')
        agg = einsum('b d i j, b i j d -> b i d', attn, v)

        # combine heads via the output projection
        return self.to_out(agg)

.\lucidrains\point-transformer-pytorch\point_transformer_pytorch\point_transformer_pytorch.py

import torch
from torch import nn, einsum
from einops import repeat

# 辅助函数

# 检查值是否存在
def exists(val):
    # treat None as 'missing'; anything else counts as present
    missing = val is None
    return not missing

# 获取张量的最大值
def max_value(t):
    # upper bound of t's float dtype, used as a sentinel for masking
    dtype_info = torch.finfo(t.dtype)
    return dtype_info.max

# 在给定维度上对批量索引进行选择
# batched gather of `values` along `dim` using an `indices` tensor
def batched_index_select(values, indices, dim = 1):
    """Select entries of `values` along `dim` according to `indices`,
    broadcasting the feature dims of `values` that follow `dim`.

    Returns a tensor shaped like `indices` with the feature dims appended.
    """
    value_dims = values.shape[(dim + 1):]
    values_shape, indices_shape = map(lambda t: list(t.shape), (values, indices))
    # add trailing singleton dims so indices broadcast over the feature dims
    # (bugfix: original line was missing a closing parenthesis — SyntaxError)
    indices = indices[(..., *((None,) * len(value_dims)))]
    indices = indices.expand(*((-1,) * len(indices_shape)), *value_dims)
    value_expand_len = len(indices_shape) - (dim + 1)
    # pad values with singleton dims so its rank matches the expanded indices
    values = values[(*((slice(None),) * dim), *((None,) * value_expand_len), ...)]

    value_expand_shape = [-1] * len(values.shape)
    expand_slice = slice(dim, (dim + value_expand_len))
    value_expand_shape[expand_slice] = indices.shape[expand_slice]
    values = values.expand(*value_expand_shape)

    # account for the inserted singleton dims when gathering
    dim += value_expand_len
    return values.gather(dim, indices)

# 类

class PointTransformerLayer(nn.Module):
    """Single-head vector self-attention for point clouds.

    Attention logits are produced by an MLP over (q - k) plus a relative
    positional embedding of the 3D offsets, instead of dot products.
    """

    def __init__(
        self,
        *,
        dim,
        pos_mlp_hidden_dim = 64,
        attn_mlp_hidden_mult = 4,
        num_neighbors = None
    ):
        super().__init__()
        # optionally restrict attention to each point's k nearest neighbors
        self.num_neighbors = num_neighbors

        # project input features to queries, keys and values
        self.to_qkv = nn.Linear(dim, dim * 3, bias = False)

        # MLP mapping 3D relative coordinates to a positional embedding
        self.pos_mlp = nn.Sequential(
            nn.Linear(3, pos_mlp_hidden_dim),
            nn.ReLU(),
            nn.Linear(pos_mlp_hidden_dim, dim)
        )

        # MLP that turns (q - k) + positional embedding into attention logits
        self.attn_mlp = nn.Sequential(
            nn.Linear(dim, dim * attn_mlp_hidden_mult),
            nn.ReLU(),
            nn.Linear(dim * attn_mlp_hidden_mult, dim),
        )

    # x: (batch, n, dim) features; pos: (batch, n, 3) coordinates;
    # mask: (batch, n) boolean, True = point is valid
    def forward(self, x, pos, mask = None):
        n, num_neighbors = x.shape[1], self.num_neighbors

        # get queries, keys, values
        q, k, v = self.to_qkv(x).chunk(3, dim = -1)

        # relative positional embeddings from pairwise coordinate offsets
        rel_pos = pos[:, :, None, :] - pos[:, None, :, :]
        rel_pos_emb = self.pos_mlp(rel_pos)

        # use subtraction of queries to keys. i suppose this is a better
        # inductive bias for point clouds than dot product
        qk_rel = q[:, :, None, :] - k[:, None, :, :]

        # pairwise validity mask: a pair is valid only if both points are
        if exists(mask):
            mask = mask[:, :, None] * mask[:, None, :]

        # expand values so each query position has its own copy
        v = repeat(v, 'b j d -> b i j d', i = n)

        # determine k nearest neighbors for each point, if specified
        if exists(num_neighbors) and num_neighbors < n:
            rel_dist = rel_pos.norm(dim = -1)

            if exists(mask):
                # push invalid points to maximum distance so topk skips them
                mask_value = max_value(rel_dist)
                rel_dist.masked_fill_(~mask, mask_value)

            dist, indices = rel_dist.topk(num_neighbors, largest = False)

            v = batched_index_select(v, indices, dim = 2)
            qk_rel = batched_index_select(qk_rel, indices, dim = 2)
            rel_pos_emb = batched_index_select(rel_pos_emb, indices, dim = 2)
            mask = batched_index_select(mask, indices, dim = 2) if exists(mask) else None

        # add relative positional embeddings to value
        v = v + rel_pos_emb

        # use attention mlp, making sure to add relative positional embedding first
        sim = self.attn_mlp(qk_rel + rel_pos_emb)

        # mask out invalid pairs with a large negative value before softmax
        if exists(mask):
            mask_value = -max_value(sim)
            sim.masked_fill_(~mask[..., None], mask_value)

        # attention: softmax over the neighbor axis j (dim -2 of 'b i j d')
        attn = sim.softmax(dim = -2)

        # aggregate: per-channel weighted sum of values over neighbors
        agg = einsum('b i j d, b i j d -> b i d', attn, v)
        return agg

.\lucidrains\point-transformer-pytorch\point_transformer_pytorch\__init__.py

# 从 point_transformer_pytorch 模块中导入 PointTransformerLayer 类
from point_transformer_pytorch.point_transformer_pytorch import PointTransformerLayer
# 从 point_transformer_pytorch 模块中导入 MultiheadPointTransformerLayer 类
from point_transformer_pytorch.multihead_point_transformer_pytorch import MultiheadPointTransformerLayer

Point Transformer - Pytorch

Implementation of the Point Transformer self-attention layer, in Pytorch. The simple circuit above seemed to have allowed their group to outperform all previous methods in point cloud classification and segmentation.

Install

$ pip install point-transformer-pytorch

Usage

import torch
from point_transformer_pytorch import PointTransformerLayer

attn = PointTransformerLayer(
    dim = 128,
    pos_mlp_hidden_dim = 64,
    attn_mlp_hidden_mult = 4
)

feats = torch.randn(1, 16, 128)
pos = torch.randn(1, 16, 3)
mask = torch.ones(1, 16).bool()

attn(feats, pos, mask = mask) # (1, 16, 128)

This type of vector attention is much more expensive than the traditional one. In the paper, they used k-nearest neighbors on the points to exclude attention on faraway points. You can do the same with a single extra setting.

import torch
from point_transformer_pytorch import PointTransformerLayer

attn = PointTransformerLayer(
    dim = 128,
    pos_mlp_hidden_dim = 64,
    attn_mlp_hidden_mult = 4,
    num_neighbors = 16          # only the 16 nearest neighbors would be attended to for each point
)

feats = torch.randn(1, 2048, 128)
pos = torch.randn(1, 2048, 3)
mask = torch.ones(1, 2048).bool()

attn(feats, pos, mask = mask) # (1, 2048, 128)

Citations

@misc{zhao2020point,
    title={Point Transformer}, 
    author={Hengshuang Zhao and Li Jiang and Jiaya Jia and Philip Torr and Vladlen Koltun},
    year={2020},
    eprint={2012.09164},
    archivePrefix={arXiv},
    primaryClass={cs.CV}
}

.\lucidrains\point-transformer-pytorch\setup.py

# 导入设置工具和查找包的函数
from setuptools import setup, find_packages

# package metadata / distribution configuration
setup(
  name = 'point-transformer-pytorch',  # distribution name on PyPI
  packages = find_packages(),  # include every discoverable package
  version = '0.1.5',  # release version
  license='MIT',  # license identifier
  description = 'Point Transformer - Pytorch',  # short description
  author = 'Phil Wang',  # author
  author_email = 'lucidrains@gmail.com',  # author contact email
  url = 'https://github.com/lucidrains/point-transformer-pytorch',  # project homepage
  keywords = [  # search keywords
    'artificial intelligence',
    'transformers',
    'attention mechanism',
    'point clouds'
  ],
  install_requires=[  # runtime dependencies
    'einops>=0.3',
    'torch>=1.6'
  ],
  classifiers=[  # PyPI trove classifiers
    'Development Status :: 4 - Beta',
    'Intended Audience :: Developers',
    'Topic :: Scientific/Engineering :: Artificial Intelligence',
    'License :: OSI Approved :: MIT License',
    'Programming Language :: Python :: 3.6',
  ],
)

.\lucidrains\ponder-transformer\ponder_transformer\ponder_transformer.py

import torch
import torch.nn.functional as F
from torch import nn, einsum

from einops import rearrange, repeat
from einops.layers.torch import Rearrange, Reduce

# 常量

ABS_MAX_STEPS = 100

# 辅助函数

def exists(val):
    # has this optional value been supplied?
    return False if val is None else True

# 类

class PreNorm(nn.Module):
    """Pre-normalization wrapper: LayerNorm the input, then call `fn`."""

    def __init__(self, dim, fn):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        normed = self.norm(x)
        return self.fn(normed, **kwargs)

def FeedForward(dim, mult = 4):
    # Standard transformer MLP: expand by `mult`, GELU, then project back.
    hidden = dim * mult
    return nn.Sequential(
        nn.Linear(dim, hidden),
        nn.GELU(),
        nn.Linear(hidden, dim)
    )

class Attention(nn.Module):
    """Multi-head (optionally causal) scaled dot-product self-attention.

    dim      -- model dimension of input/output
    dim_head -- per-head dimension
    heads    -- number of attention heads
    causal   -- when True, position i may only attend to positions <= i
    """

    def __init__(
        self,
        *,
        dim,
        dim_head = 64,
        heads = 8,
        causal = False
    ):
        super().__init__()
        self.heads = heads
        self.causal = causal
        self.scale = dim_head ** -0.5
        inner_dim = dim_head * heads

        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)
        self.to_out = nn.Linear(inner_dim, dim)

    def forward(self, x, mask = None):
        # mask: (batch, seq) boolean — assumes True = token is valid (standard
        # convention in this codebase; the point-transformer files fill ~mask)
        n, h, device = x.shape[1], self.heads, x.device
        qkv = self.to_qkv(x).chunk(3, dim = -1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), qkv)
        sim = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale

        mask_value = -torch.finfo(sim.dtype).max

        if exists(mask):
            # pairwise validity: both query i and key j must be valid
            mask = rearrange(mask, 'b i -> b () i ()') * rearrange(mask, 'b j -> b () () j')
            # bugfix: fill the *invalid* positions (~mask); the original filled
            # wherever mask was True, erasing every valid pair instead
            sim = sim.masked_fill(~mask, mask_value)

        if self.causal:
            # mask out the strict upper triangle (future positions)
            i, j = sim.shape[-2:]
            causal_mask = torch.ones((i, j), device = device).triu(j - i + 1).bool()
            sim = sim.masked_fill(causal_mask, mask_value)

        attn = sim.softmax(dim = -1)
        out = einsum('b h i j, b h j d -> b h i d', attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
        return self.to_out(out)

# pondering 类和辅助函数

def pad_to(t, padding, dim = -1, value = 0.):
    # Pad tensor `t` along `dim` with the given (left, right) `padding`.
    # F.pad takes pad widths starting from the LAST dim, so prepend a (0, 0)
    # pair for every dimension that trails `dim`.
    if dim > 0:
        dim -= t.ndim
    trailing = -dim - 1
    pad_spec = (0, 0) * trailing + tuple(padding)
    return F.pad(t, pad_spec, value = value)

def safe_cumprod(t, eps = 1e-10, dim = -1):
    # Cumulative product computed in log-space for numerical stability;
    # inputs are first clamped into [eps, 1].
    clamped = torch.clamp(t, min = eps, max = 1.)
    return torch.exp(torch.log(clamped).cumsum(dim = dim))

def exclusive_cumprod(t, dim = -1):
    # Exclusive (shifted) cumulative product: output[k] = prod(t[:k]), so the
    # first entry along `dim` is 1. Computed in log-space with inputs clamped
    # to [1e-10, 1] for stability (safe_cumprod and pad_to inlined here).
    clamped = torch.clamp(t, min = 1e-10, max = 1.)
    inclusive = torch.exp(torch.cumsum(torch.log(clamped), dim = dim))
    d = dim if dim <= 0 else dim - inclusive.ndim
    # shift right by one along `dim`: pad a leading 1, drop the last element
    return F.pad(inclusive, ((0, 0) * (-d - 1)) + (1, -1), value = 1.)

def calc_geometric(l, dim = -1):
    # Geometric-style halting distribution: p[k] = l[k] * prod_{i<k}(1 - l[i]),
    # the probability of halting exactly at step k given per-step halt probs l.
    # (exclusive_cumprod of (1 - l) is inlined below in log-space.)
    survive = torch.clamp(1 - l, min = 1e-10, max = 1.)
    running = torch.exp(torch.cumsum(torch.log(survive), dim = dim))
    d = dim if dim <= 0 else dim - running.ndim
    shifted = F.pad(running, ((0, 0) * (-d - 1)) + (1, -1), value = 1.)
    return shifted * l

# 主类

class Block(nn.Module):
    """One pondering step: pre-norm attention + feed-forward with residuals,
    plus a scalar halting logit (per position when causal, else per sequence)."""

    def __init__(
        self,
        *,
        dim,
        dim_head = 64,
        heads = 8,
        causal = False,
        ff_mult = 4
    ):
        super().__init__()
        self.causal = causal
        self.attn = PreNorm(dim, Attention(dim = dim, dim_head = dim_head, heads = heads, causal = causal))
        self.ff = PreNorm(dim, FeedForward(dim = dim, mult = ff_mult))

        # projects hidden state to a scalar halting logit, squeezing the last dim
        self.to_halt_logits = nn.Sequential(
            nn.Linear(dim, 1),
            Rearrange('... () -> ...')
        )

    def forward(self, x, mask = None):
        # transformer sub-layers with residual connections
        x = self.attn(x, mask = mask) + x
        x = self.ff(x) + x

        if self.causal:
            # causal: halting input at position n is the running mean of x[:n+1],
            # so it only depends on past and current positions
            denom = torch.arange(x.shape[-2], device = x.device)
            denom = rearrange(denom, 'n -> () n ()')
            halt_input = x.cumsum(dim = 1) / (denom + 1)
        else:
            # non-causal: one halting input per sequence (mean over positions)
            halt_input = x.mean(dim = 1)

        halt_logits = self.to_halt_logits(halt_input)

        # returns the updated hidden state and the halting logits
        return x, halt_logits

class PonderTransformer(nn.Module):
    def __init__(
        self,
        *,
        num_tokens,
        dim,
        max_seq_len,
        causal = True,
        dim_head = 64,
        heads = 8,
        ponder_kl_div_loss_weight = 0.01,
        ponder_lambda_p = 0.2,
        ponder_epsilon = 0.05,
        eps = 1e-20
        ):
        # 调用父类的构造函数
        super().__init__()
        # 初始化epsilon值
        self.eps = eps
        # 初始化causal值
        self.causal = causal
        # 初始化序列长度为最大序列长度
        self.seq_len = max_seq_len
        # 创建token嵌入层,将token映射到指定维度
        self.token_emb = nn.Embedding(num_tokens, dim)
        # 创建位置嵌入层,将位置映射到指定维度
        self.pos_emb = nn.Embedding(max_seq_len, dim)

        # 计算最大步数

        # 计算停止概率的阈值
        thres = 1 - ponder_epsilon
        # 计算几何级数停止概率
        halt_probs = calc_geometric(torch.full((ABS_MAX_STEPS,), ponder_lambda_p))
        # 计算停止概率的累积和
        cum_halt_probs = halt_probs.cumsum(dim = 0)
        # 训练最大步数为满足停止概率小于阈值的步数
        self.train_max_steps = (cum_halt_probs < thres).sum().item()

        # 初始化ponder_lambda_p值
        self.ponder_lambda_p = ponder_lambda_p
        # 初始化ponder_kl_div_loss_weight值
        self.ponder_kl_div_loss_weight = ponder_kl_div_loss_weight

        # pondering block

        # 创建Block模块
        self.block = Block(
            dim = dim,
            dim_head = dim_head,
            heads = heads,
            causal = causal
        )

        # 隐藏状态到'Y' - 输出

        # 创建输出层,包括LayerNorm和线性层
        self.to_logits = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_tokens)
        )