Lucidrains Series Project Source Code Analysis (Part 22)
.\lucidrains\ddpm-proteins\ddpm_proteins\utils.py
# import required libraries
import os
from PIL import Image
import seaborn as sn
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
from sidechainnet.utils.sequence import ProteinVocabulary
from einops import rearrange
# generic helper functions
# check whether a value exists
def exists(val):
return val is not None
# return the value if it exists, otherwise the default
def default(val, d):
return val if exists(val) else d
# broadcast-concatenate multiple tensors along a dimension
def broadcat(tensors, dim = -1):
num_tensors = len(tensors)
shape_lens = set(list(map(lambda t: len(t.shape), tensors)))
assert len(shape_lens) == 1, 'tensors must all have the same number of dimensions'
shape_len = list(shape_lens)[0]
dim = (dim + shape_len) if dim < 0 else dim
dims = list(zip(*map(lambda t: list(t.shape), tensors)))
expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim]
assert all([*map(lambda t: len(set(t[1])) <= 2, expandable_dims)]), 'invalid dimensions for broadcastable concatenation'
max_dims = list(map(lambda t: (t[0], max(t[1])), expandable_dims))
expanded_dims = list(map(lambda t: (t[0], (t[1],) * num_tensors), max_dims))
expanded_dims.insert(dim, (dim, dims[dim]))
expandable_shapes = list(zip(*map(lambda t: t[1], expanded_dims)))
tensors = list(map(lambda t: t[0].expand(*t[1]), zip(tensors, expandable_shapes)))
return torch.cat(tensors, dim = dim)
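# usage sketch (an illustration added here, not part of the original file): broadcat
# expands any broadcastable dimensions before concatenating, e.g.
#   a = torch.randn(1, 144, 64, 64)   # e.g. an MSA attention embedding
#   b = torch.randn(4, 1, 64, 64)     # e.g. a per-sample mask
#   broadcat((a, b), dim = 1).shape   # -> (4, 145, 64, 64)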
# singleton MSA transformer, loaded lazily and cached for reuse
msa_instances = None
def get_msa_transformer():
global msa_instances
if not exists(msa_instances):
msa_model, alphabet = torch.hub.load("facebookresearch/esm", "esm_msa1_t12_100M_UR50S")
batch_converter = alphabet.get_batch_converter()
msa_instances = (msa_model, batch_converter) # cache the loaded model and batch converter so later calls reuse them
return msa_instances
# MSA embedding related functions
VOCAB = ProteinVocabulary()
# convert amino acid ids to strings
def ids_to_aa_str(x):
assert isinstance(x, list), 'input must be a list'
id2aa = VOCAB._int2char
is_char = lambda c: isinstance(c, str) and len(c) == 1
out = []
for el in x:
if isinstance(el, list):
out.append(ids_to_aa_str(el))
elif isinstance(el, int):
out.append(id2aa[el])
else:
raise TypeError('type must be either list or character')
if all(map(is_char, out)):
return ''.join(out)
return out
# convert amino acid strings into inputs for the MSA transformer batch converter
def aa_str_to_embed_input(x):
assert isinstance(x, list), 'input must be a list'
out = []
for el in x:
if isinstance(el, list):
out.append(aa_str_to_embed_input(el))
elif isinstance(el, str):
out.append((None, el))
else:
raise TypeError('type must be either list or string')
return out
# average product correction (APC) over the last two dimensions
def apc(x):
a1 = x.sum(-1, keepdims=True)
a2 = x.sum(-2, keepdims=True)
a12 = x.sum((-1, -2), keepdims=True)
avg = a1 * a2
avg.div_(a12)
normalized = x - avg
return normalized
# symmetrize a matrix over its last two dimensions
def symmetrize(x):
return x + x.transpose(-1, -2)
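# note (added illustration): the two helpers above are composed below as apc(symmetrize(attn))
# on the MSA Transformer row attentions, the usual post-processing when turning attention
# maps into contact-like features, e.g.
#   attn = torch.randn(1, 144, 64, 64)
#   out = apc(symmetrize(attn))  # same shape, symmetrized and average-product-corrected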
# pad a square map up to the given size
def pad_image_to(tensor, size, value = 0.):
remainder = size - tensor.shape[-1]
tensor = F.pad(tensor, (0, remainder, 0, remainder), value = value)
return tensor
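# note (added illustration): the padding is applied on the right/bottom of the last two
# dimensions, so a (b, c, n, n) map becomes (b, c, size, size); used below to bring every
# protein in a batch up to a common size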
# fetch the MSA attention embedding for a single sequence, with caching
CACHE_PATH = default(os.getenv('CACHE_PATH'), os.path.expanduser('~/.cache.ddpm-proteins'))
FETCH_FROM_CACHE = not exists(os.getenv('CLEAR_CACHE'))
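# note: setting the CLEAR_CACHE environment variable makes the loader ignore existing cached
# files (they are recomputed and rewritten), while CACHE_PATH overrides the cache directory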
os.makedirs(CACHE_PATH, exist_ok = True)
@torch.no_grad()
def get_msa_attention_embedding(
model,
batch_converter,
aa_str,
id,
fetch_msas_fn = lambda t: [],
cache = True
):
device = next(model.parameters()).device
cache_full_path = os.path.join(CACHE_PATH, f'{id}.pt')
if cache and FETCH_FROM_CACHE and os.path.exists(cache_full_path):
try:
loaded = torch.load(cache_full_path).to(device)
except:
loaded = None
if exists(loaded):
return loaded
msas = default(fetch_msas_fn(aa_str), [])
seq_with_msas = [aa_str, *msas]
embed_inputs = aa_str_to_embed_input(seq_with_msas)
_, _, msa_batch_tokens = batch_converter(embed_inputs)
# run the model on the batched tokens; the attention head weights are needed
results = model(msa_batch_tokens.to(device), need_head_weights = True)
# pull the row attention weights out of the results
attentions = results['row_attentions']
# drop the prepended start-token positions
attentions = attentions[..., 1:, 1:]
# fold layers and heads into a single channel dimension
attentions = rearrange(attentions, 'b l h m n -> b (l h) m n')
# symmetrize the attention maps and apply average product correction
attentions = apc(symmetrize(attentions))
# if caching is enabled, save the result to the cache path
if cache:
print(f'caching to {cache_full_path}')
torch.save(attentions, cache_full_path)
# return the processed attention maps
return attentions
# fetch MSA attention embeddings for a whole batch of sequences
def get_msa_attention_embeddings(
model,
batch_converter,
seqs,
ids,
fetch_msas_fn = lambda t: [],
cache = True
):
# sequence length
n = seqs.shape[1]
# add a singleton alignment dimension to each sequence
seqs = rearrange(seqs, 'b n -> b () n')
# convert the integer sequence ids to amino acid strings
aa_strs = ids_to_aa_str(seqs.cpu().tolist())
# fetch the attention embedding for each sequence
embeds_list = [get_msa_attention_embedding(model, batch_converter, aa, seq_id, cache = cache) for aa, seq_id in zip(aa_strs, ids)]
# pad every embedding to the same size
embeds_list = [pad_image_to(embed, n) for embed in embeds_list]
# concatenate all embeddings along the batch dimension
embeds = torch.cat(embeds_list, dim = 0)
return embeds
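# usage sketch (added illustration, mirroring how train.py calls these helpers):
#   model, batch_converter = get_msa_transformer()
#   embeds = get_msa_attention_embeddings(model, batch_converter, seqs, ids)  # (b, layers * heads, n, n)
# where seqs is a (b, n) tensor of integer-encoded residues and ids are the protein ids used as cache keys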
# endlessly cycle through a dataloader
def cycle(loader, thres = 256):
while True:
for data in loader:
# only yield batches whose sequence length does not exceed the threshold
if data.seqs.shape[1] <= thres:
yield data
# save a heatmap of a matrix to disk
def save_heatmap(tensor, filepath, dpi = 200, return_image = False):
# draw the heatmap
heatmap = sn.heatmap(tensor.cpu().numpy())
# get the underlying matplotlib figure
figure = heatmap.get_figure()
# save the figure to the given filepath
figure.savefig(filepath, dpi = dpi)
# clear the current figure
plt.clf()
# if the image is not needed, stop here
if not return_image:
return
# otherwise return the saved file as a PIL image
return Image.open(filepath)
.\lucidrains\ddpm-proteins\ddpm_proteins\__init__.py
# import the GaussianDiffusion, Unet and Trainer classes from the ddpm_proteins.ddpm_proteins module
from ddpm_proteins.ddpm_proteins import GaussianDiffusion, Unet, Trainer
Denoising Diffusion Probabilistic Model for Proteins
Implementation of Denoising Diffusion Probabilistic Model in Pytorch. It is a new approach to generative modeling that may have the potential to rival GANs. It uses denoising score matching to estimate the gradient of the data distribution, followed by Langevin sampling to sample from the true distribution. This implementation was transcribed from the official Tensorflow version here.
This specific repository will be using a heavily modified version of the U-net for learning on protein structure, with eventual conditioning from the MSA Transformer's attention heads.

** at around 40k iterations **
Install
$ pip install ddpm-proteins
Training
We are using weights & biases for experimental tracking
First you need to login
$ wandb login
Then you will need to cache all the MSA attention embeddings by first running the command below. For some reason, this needs to be done multiple times to cache all the proteins correctly (it does work though). I'll get around to fixing this.
$ python cache.py
Finally, you can begin training by invoking
$ python train.py
If you would like to clear or recompute the cache (ie after changing the fetch MSA function), just run
$ rm -rf ~/.cache.ddpm-proteins
Todo
- condition on mask
- condition on MSA transformers (with caching of tensors in specified directory by protein id)
- all-attention network with uformer arxiv.org/abs/2106.03… (with 1d + 2d conv kernels)
- reach for size 384
- add all improvements from arxiv.org/abs/2105.05… and cascaded-diffusion.github.io/
Usage
import torch
from ddpm_proteins import Unet, GaussianDiffusion
model = Unet(
dim = 64,
dim_mults = (1, 2, 4, 8)
)
diffusion = GaussianDiffusion(
model,
image_size = 128,
timesteps = 1000, # number of steps
loss_type = 'l1' # L1 or L2
)
training_images = torch.randn(8, 3, 128, 128)
loss = diffusion(training_images)
loss.backward()
# after a lot of training
sampled_images = diffusion.sample(batch_size = 4)
sampled_images.shape # (4, 3, 128, 128)
Or, if you simply want to pass in a folder name and the desired image dimensions, you can use the Trainer class to easily train a model.
from ddpm_proteins import Unet, GaussianDiffusion, Trainer
model = Unet(
dim = 64,
dim_mults = (1, 2, 4, 8)
).cuda()
diffusion = GaussianDiffusion(
model,
image_size = 128,
timesteps = 1000, # number of steps
loss_type = 'l1' # L1 or L2
).cuda()
trainer = Trainer(
diffusion,
'path/to/your/images',
train_batch_size = 32,
train_lr = 2e-5,
train_num_steps = 700000, # total training steps
gradient_accumulate_every = 2, # gradient accumulation steps
ema_decay = 0.995, # exponential moving average decay
fp16 = True # turn on mixed precision training with apex
)
trainer.train()
Samples and model checkpoints will be logged to ./results periodically
Citations
@misc{ho2020denoising,
title = {Denoising Diffusion Probabilistic Models},
author = {Jonathan Ho and Ajay Jain and Pieter Abbeel},
year = {2020},
eprint = {2006.11239},
archivePrefix = {arXiv},
primaryClass = {cs.LG}
}
@inproceedings{anonymous2021improved,
title = {Improved Denoising Diffusion Probabilistic Models},
author = {Anonymous},
booktitle = {Submitted to International Conference on Learning Representations},
year = {2021},
url = {https://openreview.net/forum?id=-NEXDKk8gZ},
note = {under review}
}
@article{Rao2021.02.12.430858,
author = {Rao, Roshan and Liu, Jason and Verkuil, Robert and Meier, Joshua and Canny, John F. and Abbeel, Pieter and Sercu, Tom and Rives, Alexander},
title = {MSA Transformer},
year = {2021},
publisher = {Cold Spring Harbor Laboratory},
URL = {https://www.biorxiv.org/content/early/2021/02/13/2021.02.12.430858},
journal = {bioRxiv}
}
.\lucidrains\ddpm-proteins\setup.py
# import setup and find_packages utilities
from setuptools import setup, find_packages
# package metadata
setup(
name = 'ddpm-proteins', # package name
packages = find_packages(), # find and include all packages
version = '0.0.11', # version number
license='MIT', # license
description = 'Denoising Diffusion Probabilistic Models - for Proteins - Pytorch', # description
author = 'Phil Wang', # author
author_email = 'lucidrains@gmail.com', # author email
url = 'https://github.com/lucidrains/ddpm-proteins', # project url
keywords = [ # keywords
'artificial intelligence',
'generative models',
'proteins'
],
install_requires=[ # dependencies
'einops',
'matplotlib',
'numpy',
'pillow',
'proDy',
'scipy',
'sidechainnet',
'seaborn',
'torch',
'torchvision',
'tqdm',
'wandb'
],
classifiers=[ # classifiers
'Development Status :: 4 - Beta',
'Intended Audience :: Developers',
'Topic :: Scientific/Engineering :: Artificial Intelligence',
'License :: OSI Approved :: MIT License',
'Programming Language :: Python :: 3.6',
],
)
.\lucidrains\ddpm-proteins\train.py
import os
import torch
import sidechainnet as scn
from PIL import Image
from random import randrange
import torch
import torch.nn.functional as F
from torch import optim
from ddpm_proteins import Unet, GaussianDiffusion
from ddpm_proteins.utils import save_heatmap, broadcat, get_msa_attention_embeddings, symmetrize, get_msa_transformer, pad_image_to
from einops import rearrange
os.makedirs('./.tmps', exist_ok = True)
# constants
NUM_ITERATIONS = int(2e6)
IMAGE_SIZE = 256
BATCH_SIZE = 4
GRADIENT_ACCUMULATE_EVERY = 8
LEARNING_RATE = 2e-5
SAMPLE_EVERY = 200
SCALE_DISTANCE_BY = 1e2
# experiment tracker
import wandb
wandb.init(project = 'ddpm-proteins')
wandb.run.name = f'proteins of length {IMAGE_SIZE} or less'
wandb.run.save()
# model definitions
model = Unet(
dim = 32,
dim_mults = (1, 2, 4, 8),
channels = 1,
condition_dim = 1 + 144 # mask (1) + attention embedding size (144)
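# note: the 144 attention channels correspond to the 12 layers x 12 heads of row attentions
# produced by the esm_msa1_t12_100M MSA Transformer loaded in get_msa_transformer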
)
diffusion = GaussianDiffusion(
model,
image_size = IMAGE_SIZE,
timesteps = 1000, # number of steps
loss_type = 'l1' # L1 or L2
)
def cycle(loader, thres = 256):
while True:
for data in loader:
if data.seqs.shape[1] <= thres:
yield data
data = scn.load(
casp_version = 12,
thinning = 30,
with_pytorch = 'dataloaders',
batch_size = BATCH_SIZE,
dynamic_batching = False
)
model, batch_converter = get_msa_transformer()
model = model.cuda(1) # place the msa transformer on cuda device 1
opt = optim.Adam(diffusion.parameters(), lr = LEARNING_RATE)
train_dl = cycle(data['train'], thres = IMAGE_SIZE)
valid_dl = cycle(data['test'], thres = IMAGE_SIZE)
diffusion = diffusion.cuda()
upper_triangular_mask = torch.ones(IMAGE_SIZE, IMAGE_SIZE).triu_(1).bool().cuda()
# training loop
for ind in range(NUM_ITERATIONS):
for _ in range(GRADIENT_ACCUMULATE_EVERY):
batch = next(train_dl)
ids, seqs, coords, masks = batch.pids, batch.seqs, batch.crds, batch.msks
seqs = seqs.argmax(dim = -1)
coords = coords.reshape(BATCH_SIZE, -1, 14, 3)
coords = coords[:, :, 1].cuda() # select the alpha carbons
dist = torch.cdist(coords, coords)
data = dist[:, None, :, :]
crossed_mask = (masks[:, None, :, None] * masks[:, None, None, :]).cuda()
data.masked_fill_(~crossed_mask.bool(), 0.)
data = pad_image_to(data, IMAGE_SIZE, value = 0.)
crossed_mask = pad_image_to(crossed_mask, IMAGE_SIZE, value = 0.)
data = (data / SCALE_DISTANCE_BY).clamp(0., 1.)
data = data * upper_triangular_mask[None, None, :, :]
msa_attention_embeds = get_msa_attention_embeddings(model, batch_converter, seqs, ids)
msa_attention_embeds = pad_image_to(msa_attention_embeds, IMAGE_SIZE)
condition_tensor = broadcat((msa_attention_embeds.cuda(0), crossed_mask.float()), dim = 1)
loss = diffusion(data, condition_tensor = condition_tensor)
(loss / GRADIENT_ACCUMULATE_EVERY).backward()
print(loss.item())
wandb.log({'loss': loss.item()})
opt.step()
opt.zero_grad()
# periodically sample from the model
if (ind % SAMPLE_EVERY) == 0:
# load a batch from the validation data
batch = next(valid_dl)
# unpack protein ids, sequences, coordinates and masks from the batch
ids, seqs, coords, masks = batch.pids, batch.seqs, batch.crds, batch.msks
# convert one-hot sequences to integer ids
seqs = seqs.argmax(dim=-1)
# reshape the coordinates so the alpha carbons can be selected
coords = coords.reshape(BATCH_SIZE, -1, 14, 3)
coords = coords[:, :, 1].cuda()
# compute pairwise distances between coordinates
dist = torch.cdist(coords, coords)
data = dist[:, None, :, :]
# build the crossed (outer product) mask
crossed_mask = (masks[:, None, :, None] * masks[:, None, None, :]).cuda()
# zero out entries not covered by the mask
data.masked_fill_(~crossed_mask.bool(), 0.)
# pad the data to the target size, then scale it into range
data = pad_image_to(data, IMAGE_SIZE, value=0.)
valid_data = (data / SCALE_DISTANCE_BY).clamp(0., 1.)
# pad the crossed mask to the target size
crossed_mask = pad_image_to(crossed_mask, IMAGE_SIZE, value=0.)[:1].float()
# compute the MSA attention embeddings
msa_attention_embeds = get_msa_attention_embeddings(model, batch_converter, seqs[:1], ids[:1])
msa_attention_embeds = pad_image_to(msa_attention_embeds, IMAGE_SIZE)
# build the conditioning tensor
condition_tensor = broadcat((msa_attention_embeds.cuda(0), crossed_mask.float()), dim=1)
# sample a distance map from the diffusion model
sampled = diffusion.sample(batch_size=1, condition_tensor=condition_tensor)[0][0]
# clamp the sample to [0, 1] and keep only the upper triangle
sampled = sampled.clamp(0., 1.) * upper_triangular_mask
sampled = symmetrize(sampled)
# save the generated images and related information
img = save_heatmap(sampled, './.tmps/validation.png', dpi=100, return_image=True)
crossed_mask_img = save_heatmap(crossed_mask[0][0], './.tmps/mask.png', dpi=100, return_image=True)
truth_img = save_heatmap(valid_data[0][0], './.tmps/truth.png', dpi=100, return_image=True)
# upload the images to wandb
wandb.log({'sample': wandb.Image(img), 'mask': wandb.Image(crossed_mask_img), 'truth': wandb.Image(truth_img)})
.\lucidrains\deep-daze\deep_daze\cli.py
import sys
# import the sys module
import fire
# import the fire module, used for the command line interface
from deep_daze import Imagine
# import the Imagine class from the deep_daze module
def train(
text=None,
img=None,
learning_rate=1e-5,
num_layers=16,
hidden_size=256,
batch_size=4,
gradient_accumulate_every=4,
epochs=20,
iterations=1050,
save_every=100,
image_width=512,
deeper=False,
overwrite=False,
save_progress=True,
seed=None,
open_folder=True,
save_date_time=False,
start_image_path=None,
start_image_train_iters=50,
theta_initial=None,
theta_hidden=None,
start_image_lr=3e-4,
lower_bound_cutout=0.1,
upper_bound_cutout=1.0,
saturate_bound=False,
create_story=False,
story_start_words=5,
story_words_per_epoch=5,
story_separator=None,
averaging_weight=0.3,
gauss_sampling=False,
gauss_mean=0.6,
gauss_std=0.2,
do_cutout=True,
center_bias=False,
center_focus=2,
jit=True,
save_gif=False,
save_video=False,
model_name="ViT-B/32",
optimizer="AdamP"
):
"""
:param text: (required) A phrase less than 77 tokens which you would like to visualize.
:param img: The path to a jpg or png image which you would like to imagine. Can be combined with text.
:param learning_rate: The learning rate of the neural net.
:param hidden_size: The hidden layer size of the Siren net.
:param num_layers: The number of hidden layers to use in the Siren neural net.
:param batch_size: The number of generated images to pass into Siren before calculating loss. Decreasing this can lower memory and accuracy.
:param gradient_accumulate_every: Calculate a weighted loss of n samples for each iteration. Increasing this can help increase accuracy with lower batch sizes.
:param epochs: The number of epochs to run.
:param iterations: The number of times to calculate and backpropagate loss in a given epoch.
:param save_progress: Whether or not to save images generated before training Siren is complete.
:param save_every: Generate an image every time iterations is a multiple of this number.
:param open_folder: Whether or not to open a folder showing your generated images.
:param overwrite: Whether or not to overwrite existing generated images of the same name.
:param deeper: Uses a Siren neural net with 32 hidden layers.
:param image_width: The desired resolution of the image.
:param seed: A seed to be used for deterministic runs.
:param save_date_time: Save files with a timestamp prepended e.g. `%y%m%d-%H%M%S-my_phrase_here.png`
:param start_image_path: Path to the image you would like to prime the generator with initially
:param start_image_train_iters: Number of iterations for priming, defaults to 50
:param theta_initial: Hyperparameter describing the frequency of the color space. Only applies to the first layer of the network.
:param theta_hidden: Hyperparameter describing the frequency of the color space. Only applies to the hidden layers of the network.
:param start_image_lr: Learning rate for the start image training.
:param upper_bound_cutout: The upper bound for the cutouts used in generation.
:param lower_bound_cutout: The lower bound for the cutouts used in generation.
:param saturate_bound: If True, the LOWER_BOUND_CUTOUT is linearly increased to 0.75 during training.
:param create_story: Creates a story by optimizing each epoch on a new sliding-window of the input words. If this is enabled, much longer texts than 77 tokens can be used. Requires save_progress to visualize the transitions of the story.
:param story_start_words: Only used if create_story is True. How many words to optimize on for the first epoch.
:param story_words_per_epoch: Only used if create_story is True. How many words to add to the optimization goal per epoch after the first one.
:param story_separator: Only used if create_story is True. Defines a separator like '.' that splits the text into groups for each epoch. Separator needs to be in the text otherwise it will be ignored!
:param averaging_weight: How much to weigh the averaged features of the random cutouts over the individual random cutouts. Increasing this value leads to more details being represented at the cost of some global coherence and a parcellation into smaller scenes.
:param gauss_sampling: Whether to use sampling from a Gaussian distribution instead of a uniform distribution.
:param gauss_mean: The mean of the Gaussian sampling distribution.
:param gauss_std: The standard deviation of the Gaussian sampling distribution.
:param do_cutout: Whether to use random cutouts as an augmentation. This basically needs to be turned on unless some new augmentations are added in code eventually.
:param center_bias: Whether to use a Gaussian distribution centered around the center of the image to sample the locations of random cutouts instead of a uniform distribution. Leads to the main generated objects to be more focused in the center.
:param center_focus: How much to focus on the center if using center_bias. std = sampling_range / center_focus. High values lead to a very correct representation in the center but washed out colors and details towards the edges.
:param jit: Whether to use the jit-compiled CLIP model. The jit model is faster, but only compatible with torch version 1.7.1.
:param save_gif: Only used if save_progress is True. Saves a GIF animation of the generation procedure using the saved frames.
:param save_video: Only used if save_progress is True. Saves a MP4 animation of the generation procedure using the saved frames.
"""
# Don't instantiate imagine if the user just wants help.
# if "--help" is among the command line arguments, print usage info and exit
if any("--help" in arg for arg in sys.argv):
print("Type `imagine --help` for usage info.")
sys.exit()
# use a 32-layer siren when the --deeper flag is set
num_layers = 32 if deeper else num_layers
# create the Imagine object with the given parameters
imagine = Imagine(
text=text,
img=img,
lr=learning_rate,
num_layers=num_layers,
batch_size=batch_size,
gradient_accumulate_every=gradient_accumulate_every,
epochs=epochs,
iterations=iterations,
image_width=image_width,
save_every=save_every,
save_progress=save_progress,
seed=seed,
open_folder=open_folder,
save_date_time=save_date_time,
start_image_path=start_image_path,
start_image_train_iters=start_image_train_iters,
theta_initial=theta_initial,
theta_hidden=theta_hidden,
start_image_lr=start_image_lr,
lower_bound_cutout=lower_bound_cutout,
upper_bound_cutout=upper_bound_cutout,
saturate_bound=saturate_bound,
create_story=create_story,
story_start_words=story_start_words,
story_words_per_epoch=story_words_per_epoch,
story_separator=story_separator,
averaging_weight=averaging_weight,
gauss_sampling=gauss_sampling,
gauss_mean=gauss_mean,
gauss_std=gauss_std,
do_cutout=do_cutout,
center_bias=center_bias,
center_focus=center_focus,
jit=jit,
hidden_size=hidden_size,
model_name=model_name,
optimizer=optimizer,
save_gif=save_gif,
save_video=save_video,
)
# startup message
print('Starting up...')
# if not overwriting and a generated image with this name already exists, ask the user whether to overwrite
if not overwrite and imagine.filename.exists():
answer = input('Imagined image already exists, do you want to overwrite? (y/n) ').lower()
if answer not in ('yes', 'y'):
sys.exit()
# run the Imagine object to begin generating the image
imagine()
# main entry point
def main():
# expose the train function as a command line interface via Fire
fire.Fire(train)
.\lucidrains\deep-daze\deep_daze\deep_daze.py
# import required libraries
import os
import subprocess
import sys
import random
from datetime import datetime
from pathlib import Path
import torch
import torch.nn.functional as F
from siren_pytorch import SirenNet, SirenWrapper
from torch import nn
from torch.cuda.amp import GradScaler, autocast
from torch_optimizer import DiffGrad, AdamP
import numpy as np
from PIL import Image
from imageio import imread, mimsave
import torchvision.transforms as T
from tqdm import trange, tqdm
from .clip import load, tokenize
# helper functions
# check whether a value exists
def exists(val):
return val is not None
# return the value if it exists, otherwise the default
def default(val, d):
return val if exists(val) else d
# bilinearly resize an image to (size, size)
def interpolate(image, size):
return F.interpolate(image, (size, size), mode='bilinear', align_corners=False)
# take a random square cutout of the image
def rand_cutout(image, size, center_bias=False, center_focus=2):
# sample a random cutout position
# if center_bias is True, sample positions near the image center
# otherwise sample uniformly over the whole image
# return the cropped image
...
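# a hypothetical sketch of the elided helper above (added for illustration, not the
# repository's exact implementation): pick the top-left corner of a size x size crop,
# optionally biased towards the image center, and slice it out
def rand_cutout_sketch(image, size, center_bias=False, center_focus=2):
    width = image.shape[-1]
    max_offset = width - size
    if center_bias:
        # gaussian around the middle of the valid offset range; larger center_focus = tighter
        mean, std = max_offset / 2, max_offset / (2 * center_focus) + 1e-6
        coords = torch.empty(2).normal_(mean, std).clamp(0, max_offset).long()
    else:
        coords = torch.randint(0, max_offset + 1, (2,))
    x, y = coords.tolist()
    return image[..., x:x + size, y:y + size]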
# build the torchvision transform used to prepare images for CLIP
def create_clip_img_transform(image_width):
# define the CLIP image mean and standard deviation
# compose the image transform
transform = T.Compose([
T.Resize(image_width),
T.CenterCrop((image_width, image_width)),
T.ToTensor(),
T.Normalize(mean=clip_mean, std=clip_std)
])
return transform
# open a folder in the system file browser
def open_folder(path):
# if the path is a file, use its containing directory
# return if the path does not exist or is not a directory
# pick the folder-opening command for the current operating system
# try to open the folder, ignoring any errors
...
# normalize siren output from [-1, 1] into the [0, 1] range
def norm_siren_output(img):
return ((img + 1) * 0.5).clamp(0.0, 1.0)
# create a filename stem from the text and/or image inputs
def create_text_path(context_length, text=None, img=None, encoding=None, separator=None):
# if text is given, derive the name from it (truncating at the separator if provided)
# if an image is given, derive the name from its filename
# otherwise fall back to a default name
return input_name
# DeepDaze module
class DeepDaze(nn.Module):
...
# initializer: model parameters and hyperparameters
def __init__(
self,
clip_perceptor, # CLIP model
clip_norm, # image normalization transform for CLIP
input_res, # input resolution
total_batches, # total number of batches
batch_size, # batch size
num_layers=8, # number of siren layers, defaults to 8
image_width=512, # image width, defaults to 512
loss_coef=100, # loss coefficient, defaults to 100
theta_initial=None, # initial theta (w0) for the first layer, defaults to None
theta_hidden=None, # theta (w0) for the hidden layers, defaults to None
lower_bound_cutout=0.1, # lower bound for cutout sizes, should be smaller than 0.8
upper_bound_cutout=1.0, # upper bound for cutout sizes
saturate_bound=False, # whether to anneal the lower bound upwards during training
gauss_sampling=False, # whether to sample cutout sizes from a gaussian
gauss_mean=0.6, # gaussian mean
gauss_std=0.2, # gaussian standard deviation
do_cutout=True, # whether to use random cutouts
center_bias=False, # whether to bias cutouts towards the image center
center_focus=2, # strength of the center bias
hidden_size=256, # hidden layer size
averaging_weight=0.3, # weight of the averaged-cutout loss term
):
super().__init__()
# store the CLIP perceptor
self.perceptor = clip_perceptor
self.input_resolution = input_res
self.normalize_image = clip_norm
self.loss_coef = loss_coef
self.image_width = image_width
self.batch_size = batch_size
self.total_batches = total_batches
self.num_batches_processed = 0
# siren frequency (w0) hyperparameters
w0 = default(theta_hidden, 30.)
w0_initial = default(theta_initial, 30.)
# build the Siren network
siren = SirenNet(
dim_in=2,
dim_hidden=hidden_size,
num_layers=num_layers,
dim_out=3,
use_bias=True,
w0=w0,
w0_initial=w0_initial
)
# wrap the siren so it renders a full image
self.model = SirenWrapper(
siren,
image_width=image_width,
image_height=image_width
)
self.saturate_bound = saturate_bound
self.saturate_limit = 0.75 # cutouts above this value lead to instability
self.lower_bound_cutout = lower_bound_cutout
self.upper_bound_cutout = upper_bound_cutout
self.gauss_sampling = gauss_sampling
self.gauss_mean = gauss_mean
self.gauss_std = gauss_std
self.do_cutout = do_cutout
self.center_bias = center_bias
self.center_focus = center_focus
self.averaging_weight = averaging_weight
# sample cutout sizes given the lower/upper bounds, image width and gaussian mean
def sample_sizes(self, lower, upper, width, gauss_mean):
if self.gauss_sampling:
# sample sizes from a gaussian distribution
gauss_samples = torch.zeros(self.batch_size).normal_(mean=gauss_mean, std=self.gauss_std)
outside_bounds_mask = (gauss_samples > upper) | (gauss_samples < lower) # resample any values that fall outside [lower, upper]
gauss_samples[outside_bounds_mask] = torch.zeros((len(gauss_samples[outside_bounds_mask]),)).uniform_(lower, upper)
sizes = (gauss_samples * width).int()
else:
lower *= width
upper *= width
sizes = torch.randint(int(lower), int(upper), (self.batch_size,))
return sizes
# forward pass: takes the text embedding, whether to return the loss, and whether this is a dry run
def forward(self, text_embed, return_loss=True, dry_run=False):
# render the image with the siren model
out = self.model()
# normalize the output into [0, 1]
out = norm_siren_output(out)
# if the loss is not needed, return the image directly
if not return_loss:
return out
# determine the cutout size bounds
width = out.shape[-1]
lower_bound = self.lower_bound_cutout
# if saturate_bound is set, anneal the lower bound towards the saturate limit as training progresses
if self.saturate_bound:
progress_fraction = self.num_batches_processed / self.total_batches
lower_bound += (self.saturate_limit - self.lower_bound_cutout) * progress_fraction
# sample cutout sizes between the lower and upper bounds
sizes = self.sample_sizes(lower_bound, self.upper_bound_cutout, width, self.gauss_mean)
# create normalized random cutouts
if self.do_cutout:
image_pieces = [rand_cutout(out, size, center_bias=self.center_bias, center_focus=self.center_focus) for size in sizes]
image_pieces = [interpolate(piece, self.input_resolution) for piece in image_pieces]
else:
image_pieces = [interpolate(out.clone(), self.input_resolution) for _ in sizes]
# normalize the image pieces for CLIP
image_pieces = torch.cat([self.normalize_image(piece) for piece in image_pieces])
# compute the image embeddings
with autocast(enabled=False):
image_embed = self.perceptor.encode_image(image_pieces)
# compute the loss
# loss against the averaged features of all cutouts
avg_image_embed = image_embed.mean(dim=0).unsqueeze(0)
averaged_loss = -self.loss_coef * torch.cosine_similarity(text_embed, avg_image_embed, dim=-1).mean()
# loss against every individual cutout
general_loss = -self.loss_coef * torch.cosine_similarity(text_embed, image_embed, dim=-1).mean()
# blend the two loss terms
loss = averaged_loss * (self.averaging_weight) + general_loss * (1 - self.averaging_weight)
# track the number of batches processed
if not dry_run:
self.num_batches_processed += self.batch_size
return out, loss
# Imagine class, the high level training wrapper, inheriting from nn.Module
class Imagine(nn.Module):
# initializer
def __init__(
self,
*,
text=None,
img=None,
clip_encoding=None,
lr=1e-5,
batch_size=4,
gradient_accumulate_every=4,
save_every=100,
image_width=512,
num_layers=16,
epochs=20,
iterations=1050,
save_progress=True,
seed=None,
open_folder=True,
save_date_time=False,
start_image_path=None,
start_image_train_iters=10,
start_image_lr=3e-4,
theta_initial=None,
theta_hidden=None,
model_name="ViT-B/32",
lower_bound_cutout=0.1, # should be smaller than 0.8
upper_bound_cutout=1.0,
saturate_bound=False,
averaging_weight=0.3,
create_story=False,
story_start_words=5,
story_words_per_epoch=5,
story_separator=None,
gauss_sampling=False,
gauss_mean=0.6,
gauss_std=0.2,
do_cutout=True,
center_bias=False,
center_focus=2,
optimizer="AdamP",
jit=True,
hidden_size=256,
save_gif=False,
save_video=False,
# build the clip encoding from text and/or image
def create_clip_encoding(self, text=None, img=None, encoding=None):
# store the text and image
self.text = text
self.img = img
# if an encoding is given, move it to the device
if encoding is not None:
encoding = encoding.to(self.device)
# if creating a story, build the encoding for the first window
elif self.create_story:
encoding = self.update_story_encoding(epoch=0, iteration=1)
# if both text and image are given, average their encodings
elif text is not None and img is not None:
encoding = (self.create_text_encoding(text) + self.create_img_encoding(img)) / 2
# if only text is given, encode the text
elif text is not None:
encoding = self.create_text_encoding(text)
# if only an image is given, encode the image
elif img is not None:
encoding = self.create_img_encoding(img)
return encoding
# encode text with CLIP
def create_text_encoding(self, text):
# tokenize the text and move it to the device
tokenized_text = tokenize(text).to(self.device)
# encode the text with the CLIP perceptor
with torch.no_grad():
text_encoding = self.perceptor.encode_text(tokenized_text).detach()
return text_encoding
# encode an image with CLIP
def create_img_encoding(self, img):
# if img is a string, open it as an image
if isinstance(img, str):
img = Image.open(img)
# normalize the image and move it to the device
normed_img = self.clip_transform(img).unsqueeze(0).to(self.device)
# encode the image with the CLIP perceptor
with torch.no_grad():
img_encoding = self.perceptor.encode_image(normed_img).detach()
return img_encoding
# set the clip encoding
def set_clip_encoding(self, text=None, img=None, encoding=None):
# build the clip encoding
encoding = self.create_clip_encoding(text=text, img=img, encoding=encoding)
# move the encoding to the device
self.clip_encoding = encoding.to(self.device)
# return the index just past the first word containing the separator
def index_of_first_separator(self) -> int:
for c, word in enumerate(self.all_words):
if self.separator in str(word):
return c + 1
def update_story_encoding(self, epoch, iteration):
# if a separator is given, join the words up to the first separator into a string
if self.separator is not None:
self.words = " ".join(self.all_words[:self.index_of_first_separator()])
# remove the separator from the epoch text
self.words = self.words.replace(self.separator,'')
self.all_words = self.all_words[self.index_of_first_separator():]
else:
if self.words is None:
self.words = " ".join(self.all_words[:self.num_start_words])
self.all_words = self.all_words[self.num_start_words:]
else:
# add words_per_epoch new words
count = 0
while count < self.words_per_epoch and len(self.all_words) > 0:
new_word = self.all_words[0]
self.words = " ".join(self.words.split(" ") + [new_word])
self.all_words = self.all_words[1:]
count += 1
# drop words until the text fits within the context length
while len(self.words) > self.perceptor.context_length:
# remove the first word
self.words = " ".join(self.words.split(" ")[1:])
# build the new encoding
print("Now thinking of: ", '"', self.words, '"')
sequence_number = self.get_img_sequence_number(epoch, iteration)
# record the new words to disk
with open("story_transitions.txt", "a") as f:
f.write(f"{epoch}, {sequence_number}, {self.words}\n")
encoding = self.create_text_encoding(self.words)
return encoding
def image_output_path(self, sequence_number=None):
"""
Returns an underscore separated Path.
If `self.save_date_time` is set, the current timestamp is prepended.
If `save_every` is set, the sequence number, left padded with zeros to 6 digits, is appended.
:rtype: Path
"""
output_path = self.textpath
if sequence_number:
sequence_number_left_padded = str(sequence_number).zfill(6)
output_path = f"{output_path}.{sequence_number_left_padded}"
if self.save_date_time:
current_time = datetime.now().strftime("%y%m%d-%H%M%S_%f")
output_path = f"{current_time}_{output_path}"
return Path(f"{output_path}.jpg")
def train_step(self, epoch, iteration):
total_loss = 0
for _ in range(self.gradient_accumulate_every):
with autocast(enabled=True):
out, loss = self.model(self.clip_encoding)
loss = loss / self.gradient_accumulate_every
total_loss += loss
self.scaler.scale(loss).backward()
out = out.cpu().float().clamp(0., 1.)
self.scaler.step(self.optimizer)
self.scaler.update()
self.optimizer.zero_grad()
if (iteration % self.save_every == 0) and self.save_progress:
self.save_image(epoch, iteration, img=out)
return out, total_loss
def get_img_sequence_number(self, epoch, iteration):
current_total_iterations = epoch * self.iterations + iteration
sequence_number = current_total_iterations // self.save_every
return sequence_number
@torch.no_grad()
def save_image(self, epoch, iteration, img=None):
sequence_number = self.get_img_sequence_number(epoch, iteration)
if img is None:
img = self.model(self.clip_encoding, return_loss=False).cpu().float().clamp(0., 1.)
self.filename = self.image_output_path(sequence_number=sequence_number)
pil_img = T.ToPILImage()(img.squeeze())
pil_img.save(self.filename, quality=95, subsampling=0)
pil_img.save(f"{self.textpath}.jpg", quality=95, subsampling=0)
tqdm.write(f'image updated at "./{str(self.filename)}"')
# generate a GIF / video animation from the saved frames
def generate_gif(self):
# collect the saved frame images
images = []
# iterate over files in the current directory
for file_name in sorted(os.listdir('./')):
# read any file starting with the text prefix (excluding the final image) and append it to the list
if file_name.startswith(self.textpath) and file_name != f'{self.textpath}.jpg':
images.append(imread(os.path.join('./', file_name)))
# if saving a video, write the frames out as an MP4
if self.save_video:
mimsave(f'{self.textpath}.mp4', images)
print(f'Generated image generation animation at ./{self.textpath}.mp4')
# if saving a gif, write the frames out as a GIF
if self.save_gif:
mimsave(f'{self.textpath}.gif', images)
print(f'Generated image generation animation at ./{self.textpath}.gif')
# run the full generation procedure
def forward(self):
# if a start image is given, prime the generator on it first
if exists(self.start_image):
tqdm.write('Preparing with initial image...')
# optimize the model parameters with the DiffGrad optimizer
optim = DiffGrad(self.model.model.parameters(), lr=self.start_image_lr)
# progress bar
pbar = trange(self.start_image_train_iters, desc='iteration')
try:
# train on the initial image
for _ in pbar:
loss = self.model.model(self.start_image)
loss.backward()
pbar.set_description(f'loss: {loss.item():.2f}')
optim.step()
optim.zero_grad()
except KeyboardInterrupt:
print('interrupted by keyboard, gracefully exiting')
return exit()
# free the start image and its optimizer
del self.start_image
del optim
# announce what is being imagined
tqdm.write(f'Imagining "{self.textpath}" from the depths of my weights...')
# do one no-grad dry run to work around potential CLIP / CUDA issues
with torch.no_grad():
self.model(self.clip_encoding, dry_run=True)
# optionally open the output folder
if self.open_folder:
open_folder('./')
self.open_folder = False
try:
# main training loop
for epoch in trange(self.epochs, desc='epochs'):
pbar = trange(self.iterations, desc='iteration')
for i in pbar:
_, loss = self.train_step(epoch, i)
pbar.set_description(f'loss: {loss.item():.2f}')
# if creating a story, advance the clip encoding to the next window
if self.create_story:
self.clip_encoding = self.update_story_encoding(epoch, i)
except KeyboardInterrupt:
print('interrupted by keyboard, gracefully exiting')
return
# save a final image at the end
self.save_image(epoch, i)
# if saving a gif or video (and progress was saved), generate the animation
if (self.save_gif or self.save_video) and self.save_progress:
self.generate_gif()
.\lucidrains\deep-daze\deep_daze\version.py
# the package version string
__version__ = '0.11.1'
.\lucidrains\deep-daze\deep_daze\__init__.py
# import the DeepDaze and Imagine classes from the deep_daze.deep_daze module
from deep_daze.deep_daze import DeepDaze, Imagine
Deep Daze

mist over green hills

shattered plates on the grass

cosmic love and attention

a time traveler in the crowd

life during the plague

meditative peace in a sunlit forest

a man painting a completely red image

a psychedelic experience on LSD
What is this?
Simple command line tool for text to image generation using OpenAI's CLIP and Siren. Credit goes to Ryan Murdock for the discovery of this technique (and for coming up with the great name)!
This will require that you have an Nvidia GPU or AMD GPU
- Recommended: 16GB VRAM
- Minimum Requirements: 4GB VRAM (Using VERY LOW settings, see usage instructions below)
Install
$ pip install deep-daze
Windows Install

Presuming Python is installed:
- Open command prompt and navigate to the directory of your current version of Python
pip install deep-daze
Examples
$ imagine "a house in the forest"
For Windows:

- Open command prompt as administrator
imagine "a house in the forest"
That's it.
If you have enough memory, you can get better quality by adding a --deeper flag
$ imagine "shattered plates on the ground" --deeper
Advanced
In true deep learning fashion, more layers will yield better results. Default is at 16, but can be increased to 32 depending on your resources.
$ imagine "stranger in strange lands" --num-layers 32
Usage
CLI
NAME
imagine
SYNOPSIS
imagine TEXT <flags>
POSITIONAL ARGUMENTS
TEXT
(required) A phrase less than 77 tokens which you would like to visualize.
FLAGS
--img=IMAGE_PATH
Default: None
Path to png/jpg image or PIL image to optimize on
--encoding=ENCODING
Default: None
User-created custom CLIP encoding. If used, replaces any text or image that was used.
--create_story=CREATE_STORY
Default: False
Creates a story by optimizing each epoch on a new sliding-window of the input words. If this is enabled, much longer texts than 77 tokens can be used. Requires save_progress to visualize the transitions of the story.
--story_start_words=STORY_START_WORDS
Default: 5
Only used if create_story is True. How many words to optimize on for the first epoch.
--story_words_per_epoch=STORY_WORDS_PER_EPOCH
Default: 5
Only used if create_story is True. How many words to add to the optimization goal per epoch after the first one.
--story_separator:
Default: None
Only used if create_story is True. Defines a separator like '.' that splits the text into groups for each epoch. Separator needs to be in the text otherwise it will be ignored
--lower_bound_cutout=LOWER_BOUND_CUTOUT
Default: 0.1
Lower bound of the sampling of the size of the random cut-out of the SIREN image per batch. Should be smaller than 0.8.
--upper_bound_cutout=UPPER_BOUND_CUTOUT
Default: 1.0
Upper bound of the sampling of the size of the random cut-out of the SIREN image per batch. Should probably stay at 1.0.
--saturate_bound=SATURATE_BOUND
Default: False
If True, the LOWER_BOUND_CUTOUT is linearly increased to 0.75 during training.
--learning_rate=LEARNING_RATE
Default: 1e-05
The learning rate of the neural net.
--num_layers=NUM_LAYERS
Default: 16
The number of hidden layers to use in the Siren neural net.
--batch_size=BATCH_SIZE
Default: 4
The number of generated images to pass into Siren before calculating loss. Decreasing this can lower memory and accuracy.
--gradient_accumulate_every=GRADIENT_ACCUMULATE_EVERY
Default: 4
Calculate a weighted loss of n samples for each iteration. Increasing this can help increase accuracy with lower batch sizes.
--epochs=EPOCHS
Default: 20
The number of epochs to run.
--iterations=ITERATIONS
Default: 1050
The number of times to calculate and backpropagate loss in a given epoch.
--save_every=SAVE_EVERY
Default: 100
Generate an image every time iterations is a multiple of this number.
--image_width=IMAGE_WIDTH
Default: 512
The desired resolution of the image.
--deeper=DEEPER
Default: False
Uses a Siren neural net with 32 hidden layers.
--overwrite=OVERWRITE
Default: False
Whether or not to overwrite existing generated images of the same name.
--save_progress=SAVE_PROGRESS
Default: False
Whether or not to save images generated before training Siren is complete.
--seed=SEED
Type: Optional[]
Default: None
A seed to be used for deterministic runs.
--open_folder=OPEN_FOLDER
Default: True
Whether or not to open a folder showing your generated images.
--save_date_time=SAVE_DATE_TIME
Default: False
Save files with a timestamp prepended e.g. `%y%m%d-%H%M%S-my_phrase_here`
--start_image_path=START_IMAGE_PATH
Default: None
The generator is trained first on a starting image before being steered towards the textual input
--start_image_train_iters=START_IMAGE_TRAIN_ITERS
Default: 50
The number of steps for the initial training on the starting image
--theta_initial=THETA_INITIAL
Default: 30.0
Hyperparameter describing the frequency of the color space. Only applies to the first layer of the network.
--theta_hidden=THETA_HIDDEN
Default: 30.0
Hyperparameter describing the frequency of the color space. Only applies to the hidden layers of the network.
--save_gif=SAVE_GIF
Default: False
Whether or not to save a GIF animation of the generation procedure. Only works if save_progress is set to True.
Priming
Technique first devised and shared by Mario Klingemann, it allows you to prime the generator network with a starting image, before being steered towards the text.
Simply specify the path to the image you wish to use, and optionally the number of initial training steps.
$ imagine 'a clear night sky filled with stars' --start_image_path ./cloudy-night-sky.jpg
Primed starting image

Then trained with the prompt A pizza with green pepper.

Optimize for the interpretation of an image
We can also feed in an image as an optimization goal, instead of only priming the generator network. Deepdaze will then render its own interpretation of that image:
$ imagine --img samples/Autumn_1875_Frederic_Edwin_Church.jpg
Original image:

The network's interpretation:

Original image:

The network's interpretation:

Optimize for text and image combined
$ imagine "A psychedelic experience." --img samples/hot-dog.jpg
The network's interpretation:

New: Create a story
The regular mode for texts only allows 77 tokens. If you want to visualize a full story/paragraph/song/poem, set create_story to True.
Given the poem “Stopping by Woods On a Snowy Evening” by Robert Frost - "Whose woods these are I think I know. His house is in the village though; He will not see me stopping here To watch his woods fill up with snow. My little horse must think it queer To stop without a farmhouse near Between the woods and frozen lake The darkest evening of the year. He gives his harness bells a shake To ask if there is some mistake. The only other sound’s the sweep Of easy wind and downy flake. The woods are lovely, dark and deep, But I have promises to keep, And miles to go before I sleep, And miles to go before I sleep.".
We get:
user-images.githubusercontent.com/19983153/10…
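In Python, the same story mode can be set up roughly as follows (a hedged sketch using the parameters documented above; substitute your own long text):
from deep_daze import Imagine
story_text = "Whose woods these are I think I know. His house is in the village though; ..."  # the full poem above, or any text longer than 77 tokens
imagine = Imagine(
text = story_text,
create_story = True,
story_start_words = 5,
story_words_per_epoch = 5,
save_progress = True  # required to visualize the transitions of the story
)
imagine()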
Python
Invoke deep_daze.Imagine in Python
from deep_daze import Imagine
imagine = Imagine(
text = 'cosmic love and attention',
num_layers = 24,
)
imagine()
Save progress every fourth iteration
Save images in the format insert_text_here.00001.png, insert_text_here.00002.png, ... up to (total_iterations / save_every)
imagine = Imagine(
text=text,
save_every=4,
save_progress=True
)
Prepend current timestamp on each image.
Creates files with both the timestamp and the sequence number.
e.g. 210129-043928_328751_insert_text_here.00001.png, 210129-043928_512351_insert_text_here.00002.png, ...
imagine = Imagine(
text=text,
save_every=4,
save_progress=True,
save_date_time=True,
)
High GPU memory usage
If you have at least 16 GiB of vram available, you should be able to run these settings with some wiggle room.
imagine = Imagine(
text=text,
num_layers=42,
batch_size=64,
gradient_accumulate_every=1,
)
Average GPU memory usage
imagine = Imagine(
text=text,
num_layers=24,
batch_size=16,
gradient_accumulate_every=2
)
Very low GPU memory usage (less than 4 GiB)
If you are desperate to run this on a card with less than 8 GiB vram, you can lower the image_width.
imagine = Imagine(
text=text,
image_width=256,
num_layers=16,
batch_size=1,
gradient_accumulate_every=16 # Increase gradient_accumulate_every to correct for loss in low batch sizes
)
VRAM and speed benchmarks:
These experiments were conducted with a 2060 Super RTX and a 3700X Ryzen 5. We first mention the parameters (bs = batch size), then the memory usage and in some cases the training iterations per second:
For an image resolution of 512:
- bs 1, num_layers 22: 7.96 GB
- bs 2, num_layers 20: 7.5 GB
- bs 16, num_layers 16: 6.5 GB
For an image resolution of 256:
- bs 8, num_layers 48: 5.3 GB
- bs 16, num_layers 48: 5.46 GB - 2.0 it/s
- bs 32, num_layers 48: 5.92 GB - 1.67 it/s
- bs 8, num_layers 44: 5 GB - 2.39 it/s
- bs 32, num_layers 44, grad_acc 1: 5.62 GB - 4.83 it/s
- bs 96, num_layers 44, grad_acc 1: 7.51 GB - 2.77 it/s
- bs 32, num_layers 66, grad_acc 1: 7.09 GB - 3.7 it/s
@NotNANtoN recommends a batch size of 32 with 44 layers and training 1-8 epochs.
Where is this going?
This is just a teaser. We will be able to generate images, sound, anything at will, with natural language. The holodeck is about to become real in our lifetimes.
Please join replication efforts for DALL-E for Pytorch or Mesh Tensorflow if you are interested in furthering this technology.
Alternatives
Big Sleep - CLIP and the generator from Big GAN
Citations
@misc{unpublished2021clip,
title = {CLIP: Connecting Text and Images},
author = {Alec Radford, Ilya Sutskever, Jong Wook Kim, Gretchen Krueger, Sandhini Agarwal},
year = {2021}
}
@misc{sitzmann2020implicit,
title = {Implicit Neural Representations with Periodic Activation Functions},
author = {Vincent Sitzmann and Julien N. P. Martel and Alexander W. Bergman and David B. Lindell and Gordon Wetzstein},
year = {2020},
eprint = {2006.09661},
archivePrefix = {arXiv},
primaryClass = {cs.CV}
}
.\lucidrains\deep-daze\setup.py
# import the sys module
import sys
# import setup and find_packages from setuptools
from setuptools import setup, find_packages
# add the deep_daze directory to sys.path
sys.path[0:0] = ['deep_daze']
# import the __version__ variable from the version module
from version import __version__
# package metadata and configuration
setup(
# package name
name = 'deep-daze',
# find and include all packages
packages = find_packages(),
# include all data files
include_package_data = True,
# entry point for the command line script
entry_points={
'console_scripts': [
'imagine = deep_daze.cli:main',
],
},
# version number
version = __version__,
# license
license='MIT',
# description
description = 'Deep Daze',
# authors
author = 'Ryan Murdock, Phil Wang',
# author email
author_email = 'lucidrains@gmail.com',
# project url
url = 'https://github.com/lucidrains/deep-daze',
# keywords
keywords = [
'artificial intelligence',
'deep learning',
'transformers',
'implicit neural representations',
'text to image'
],
# dependencies
install_requires=[
'einops>=0.3',
'fire',
'ftfy',
'imageio>=2.9.0',
'siren-pytorch>=0.0.8',
'torch>=1.10',
'torch_optimizer',
'torchvision>=0.8.2',
'tqdm',
'regex'
],
# classifiers
classifiers=[
'Development Status :: 4 - Beta',
'Intended Audience :: Developers',
'Topic :: Scientific/Engineering :: Artificial Intelligence',
'License :: OSI Approved :: MIT License',
'Programming Language :: Python :: 3.6',
],
)
.\lucidrains\deep-linear-network\deep_linear_network\deep_linear_network.py
# import the torch library
import torch
# import the nn module from torch
from torch import nn
# import the reduce function from functools
from functools import reduce
# matrix multiplication helper
def mm(x, y):
return x @ y
# DeepLinear module, a stack of linear weight matrices
class DeepLinear(nn.Module):
# initializer: takes the input dimension dim_in and any number of following dimensions *dims
def __init__(self, dim_in, *dims):
super().__init__()
# build the full list of dimensions, starting with the input dimension
dims = [dim_in, *dims]
# pair up adjacent dimensions to get the shape of each weight matrix
pairs = list(zip(dims[:-1], dims[1:]))
# create a randomly initialized weight parameter for every dimension pair
weights = list(map(lambda d: nn.Parameter(torch.randn(d)), pairs))
# register the weights as an nn.ParameterList
self.weights = nn.ParameterList(weights)
# initialize the cache of the collapsed weight to None
self._cache = None
# forward pass on input x
def forward(self, x):
# in training mode, expire the cache and multiply x through every weight matrix
if self.training:
self._cache = None
return reduce(mm, self.weights, x)
# if a collapsed weight is already cached, reuse it
if self._cache is not None:
return x @ self._cache
# otherwise collapse all weight matrices into one, cache it, and apply it
head, *tail = self.weights
weight = reduce(mm, tail, head)
self._cache = weight
return x @ weight
.\lucidrains\deep-linear-network\deep_linear_network\__init__.py
# import the DeepLinear class from the deep_linear_network.deep_linear_network module
from deep_linear_network.deep_linear_network import DeepLinear

Deep Linear Network - Pytorch
A simple to use deep linear network module. Useful for matrix factorization or for passing an input tensor through a series of square weight matrices, where it was discovered that gradient descent implicitly regularizes the output to low-rank solutions.
LeCun's paper uses this unique property to optimize the latent of an autoencoder to be low-rank.
The module will take care of collapsing the linear weight matrices into one weight matrix, caching it across evaluation calls (but expired on training).
Install
$ pip install deep-linear-network
Usage
Matrix factorization
import torch
from deep_linear_network import DeepLinear
x = torch.randn(1, 1024, 256)
linear = DeepLinear(256, 10, 512) # w1 (256 x 10) @ w2 (10 x 512)
linear(x) # (1, 1024, 512)
Deep Linear Network
import torch
from deep_linear_network import DeepLinear
x = torch.randn(1, 1024, 256)
linear = DeepLinear(256, 256, 256, 256, 128) # w1-w3 (256 x 256) w4 (256 x 128)
linear(x) # (1, 1024, 128)
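A small sketch of the caching behaviour described above (illustrative only): in eval mode the weight matrices are collapsed once and reused on subsequent calls, while switching back to training expires the cache.
import torch
from deep_linear_network import DeepLinear
linear = DeepLinear(256, 64, 256)
x = torch.randn(1, 1024, 256)
linear.train()
linear(x) # multiplies x through every weight matrix, cache is reset
linear.eval()
linear(x) # collapses w1 @ w2 into a single 256 x 256 matrix and caches it
linear(x) # reuses the cached collapsed weight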
Citations
@misc{arora2019implicit,
title={Implicit Regularization in Deep Matrix Factorization},
author={Sanjeev Arora and Nadav Cohen and Wei Hu and Yuping Luo},
year={2019},
eprint={1905.13655},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{jing2020implicit,
title={Implicit Rank-Minimizing Autoencoder},
author={Li Jing and Jure Zbontar and Yann LeCun},
year={2020},
eprint={2010.00679},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
.\lucidrains\deep-linear-network\setup.py
# import the setup and find_packages utilities
from setuptools import setup, find_packages
# package metadata
setup(
# package name
name = 'deep-linear-network',
# find and include all packages
packages = find_packages(),
# version number
version = '0.0.1',
# license
license='MIT',
# description
description = 'Deep Linear Network - Pytorch',
# author
author = 'Phil Wang',
# author email
author_email = 'lucidrains@gmail.com',
# project url
url = 'https://github.com/lucidrains/deep-linear-network',
# keywords
keywords = [
'artificial intelligence',
'attention mechanism',
],
# dependencies
install_requires=[
'torch',
],
# classifiers
classifiers=[
'Development Status :: 4 - Beta',
'Intended Audience :: Developers',
'Topic :: Scientific/Engineering :: Artificial Intelligence',
'License :: OSI Approved :: MIT License',
'Programming Language :: Python :: 3.6',
],
)
.\lucidrains\deformable-attention\deformable_attention\deformable_attention_1d.py
import torch
import torch.nn.functional as F
from torch import nn, einsum
from einops.layers.torch import Rearrange
from einops import rearrange, repeat
# helper functions
# check whether a value exists
def exists(val):
return val is not None
# return the value if it exists, otherwise the default
def default(val, d):
return val if exists(val) else d
# check whether numer is divisible by denom
def divisible_by(numer, denom):
return (numer % denom) == 0
# tensor helpers
# 1d grid sample, implemented by reshaping to 2d so F.grid_sample can be reused
def grid_sample_1d(feats, grid, *args, **kwargs):
grid = rearrange(grid, '... -> ... 1 1')
grid = F.pad(grid, (0, 1), value = 0.)
feats = rearrange(feats, '... -> ... 1')
out = F.grid_sample(feats, grid, **kwargs)
return rearrange(out, '... 1 -> ...')
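# shape sketch (added illustration): feats of shape (b, c, n) and a grid of shape (b, m)
# with values in [-1, 1] are lifted to 4d so the 2d-only F.grid_sample can be reused, e.g.
#   feats = torch.randn(2, 32, 64)
#   grid = torch.zeros(2, 16)
#   grid_sample_1d(feats, grid, mode = 'bilinear', align_corners = False).shape  # (2, 32, 16)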
# normalize a 1d sequence of positions into the range -1 to 1
def normalize_grid(arange, dim = 1, out_dim = -1):
n = arange.shape[-1]
return 2.0 * arange / max(n - 1, 1) - 1.0
# scaling layer
class Scale(nn.Module):
def __init__(self, scale):
super().__init__()
self.scale = scale
def forward(self, x):
return x * self.scale
# continuous position bias, from SwinV2
class CPB(nn.Module):
""" https://arxiv.org/abs/2111.09883v1 """
def __init__(self, dim, *, heads, offset_groups, depth, log_distance = True):
super().__init__()
self.heads = heads
self.offset_groups = offset_groups
self.log_distance = log_distance
self.mlp = nn.ModuleList([])
self.mlp.append(nn.Sequential(
nn.Linear(1, dim),
nn.ReLU()
))
for _ in range(depth - 1):
self.mlp.append(nn.Sequential(
nn.Linear(dim, dim),
nn.ReLU()
))
self.mlp.append(nn.Linear(dim, heads // offset_groups))
def forward(self, grid_q, grid_kv):
device, dtype = grid_q.device, grid_kv.dtype
grid_q = rearrange(grid_q, 'n -> 1 n')
grid_kv = rearrange(grid_kv, 'b n -> b n')
pos = rearrange(grid_q, 'b i -> b i 1 1') - rearrange(grid_kv, 'b j -> b 1 j 1')
if self.log_distance:
pos = torch.sign(pos) * torch.log(pos.abs() + 1) # log of distance is sign(rel_pos) * log(abs(rel_pos) + 1)
bias = pos
for layer in self.mlp:
bias = layer(bias)
bias = rearrange(bias, '(b g) i j o -> b (g o) i j', g = self.offset_groups)
return bias
# main class
class DeformableAttention1D(nn.Module):
def __init__(
self,
*,
dim,
dim_head = 64,
heads = 8,
dropout = 0.,
downsample_factor = 4,
offset_scale = None,
offset_groups = None,
offset_kernel_size = 6,
cpb_log_distance = True,
group_queries = True,
group_key_values = True
):
# call the parent constructor
super().__init__()
# offset scale defaults to the downsample factor
offset_scale = default(offset_scale, downsample_factor)
# the offset kernel size must be at least the downsample factor
assert offset_kernel_size >= downsample_factor, 'offset kernel size must be greater than or equal to the downsample factor'
# (offset kernel size - downsample factor) must be even, so the padding stays symmetric
assert divisible_by(offset_kernel_size - downsample_factor, 2)
# the number of offset groups defaults to the number of heads
offset_groups = default(offset_groups, heads)
# the number of heads must be divisible by the number of offset groups
assert divisible_by(heads, offset_groups)
# inner dimension
inner_dim = dim_head * heads
# attention scaling factor
self.scale = dim_head ** -0.5
self.heads = heads
self.offset_groups = offset_groups
# dimension per offset group
offset_dims = inner_dim // offset_groups
self.downsample_factor = downsample_factor
# network that predicts the offsets
self.to_offsets = nn.Sequential(
nn.Conv1d(offset_dims, offset_dims, offset_kernel_size, groups = offset_dims, stride = downsample_factor, padding = (offset_kernel_size - downsample_factor) // 2),
nn.GELU(),
nn.Conv1d(offset_dims, 1, 1, bias = False),
Rearrange('b 1 n -> b n'),
nn.Tanh(),
Scale(offset_scale)
)
# continuous relative position bias
self.rel_pos_bias = CPB(dim // 4, offset_groups = offset_groups, heads = heads, depth = 2, log_distance = cpb_log_distance)
self.dropout = nn.Dropout(dropout)
# projection to queries
self.to_q = nn.Conv1d(dim, inner_dim, 1, groups = offset_groups if group_queries else 1, bias = False)
# projection to keys
self.to_k = nn.Conv1d(dim, inner_dim, 1, groups = offset_groups if group_key_values else 1, bias = False)
# projection to values
self.to_v = nn.Conv1d(dim, inner_dim, 1, groups = offset_groups if group_key_values else 1, bias = False)
# output projection
self.to_out = nn.Conv1d(inner_dim, dim, 1)
def forward(self, x, return_vgrid = False):
"""
b - batch
h - heads
n - sequence dimension
d - dimension
g - offset groups
"""
heads, b, n, downsample_factor, device = self.heads, x.shape[0], x.shape[-1], self.downsample_factor, x.device
# queries
q = self.to_q(x)
# calculate offsets - offset MLP shared across all groups
group = lambda t: rearrange(t, 'b (g d) n -> (b g) d n', g = self.offset_groups)
grouped_queries = group(q)
offsets = self.to_offsets(grouped_queries)
# calculate grid + offsets
grid = torch.arange(offsets.shape[-1], device = device)
vgrid = grid + offsets
vgrid_scaled = normalize_grid(vgrid)
kv_feats = grid_sample_1d(
group(x),
vgrid_scaled,
mode = 'bilinear', padding_mode = 'zeros', align_corners = False)
kv_feats = rearrange(kv_feats, '(b g) d n -> b (g d) n', b = b)
# derive key / values
k, v = self.to_k(kv_feats), self.to_v(kv_feats)
# scale queries
q = q * self.scale
# split out heads
q, k, v = map(lambda t: rearrange(t, 'b (h d) n -> b h n d', h = heads), (q, k, v))
# query / key similarity
sim = einsum('b h i d, b h j d -> b h i j', q, k)
# relative positional bias
seq_range = torch.arange(n, device = device)
seq_scaled = normalize_grid(seq_range, dim = 0)
rel_pos_bias = self.rel_pos_bias(seq_scaled, vgrid_scaled)
sim = sim + rel_pos_bias
# numerical stability
sim = sim - sim.amax(dim = -1, keepdim = True).detach()
# attention
attn = sim.softmax(dim = -1)
attn = self.dropout(attn)
# aggregate and combine heads
out = einsum('b h i j, b h j d -> b h i d', attn, v)
out = rearrange(out, 'b h n d -> b (h d) n')
out = self.to_out(out)
if return_vgrid:
return out, vgrid
return out
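# usage sketch (added illustration, shapes follow the forward docstring above):
#   attn = DeformableAttention1D(dim = 128, downsample_factor = 4)
#   x = torch.randn(1, 128, 512)   # (batch, channels, sequence)
#   attn(x).shape                  # -> (1, 128, 512)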