.\YOLO-World\yolo_world\models\detectors\__init__.py
from .yolo_world import YOLOWorldDetector
__all__ = ['YOLOWorldDetector']
.\YOLO-World\yolo_world\models\layers\yolo_bricks.py
from typing import List
import torch
import torch.nn as nn
from torch import Tensor
import torch.nn.functional as F
from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule, Linear
from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
from mmengine.model import BaseModule
from mmyolo.registry import MODELS
from mmyolo.models.layers import CSPLayerWithTwoConv
@MODELS.register_module()
class MaxSigmoidAttnBlock(BaseModule):
    """Max Sigmoid attention block.

    Gates image features with text guidance: per head and per spatial
    location, the similarity between the embedded image feature and every
    guide token is reduced with a max, biased, squashed with a sigmoid,
    and used as a multiplicative attention weight on the projected
    image features.

    Args:
        in_channels (int): Input image-feature channels.
        out_channels (int): Output channels; must be divisible by
            ``num_heads``.
        guide_channels (int): Channels of the guide (text) features.
        embed_channels (int): Embedding channels used for the similarity;
            must be divisible by ``num_heads``.
        kernel_size (int): Kernel size of the projection conv.
            Defaults to 3.
        padding (int): Padding of the projection conv. Defaults to 1.
        num_heads (int): Number of attention heads. Defaults to 1.
        use_depthwise (bool): Use a depthwise-separable projection conv.
            Defaults to False.
        with_scale (bool): Learn a per-head scale for the attention
            weights instead of the constant 1.0. Defaults to False.
        conv_cfg (dict, optional): Config for the conv layers.
        norm_cfg (dict): Config for normalization layers. Defaults to BN.
        init_cfg (dict or list, optional): Initialization config.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 guide_channels: int,
                 embed_channels: int,
                 kernel_size: int = 3,
                 padding: int = 1,
                 num_heads: int = 1,
                 use_depthwise: bool = False,
                 with_scale: bool = False,
                 conv_cfg: OptConfigType = None,
                 norm_cfg: ConfigType = dict(type='BN',
                                             momentum=0.03,
                                             eps=0.001),
                 init_cfg: OptMultiConfig = None) -> None:
        super().__init__(init_cfg=init_cfg)
        conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule
        assert (out_channels % num_heads == 0 and
                embed_channels % num_heads == 0), \
            'out_channels and embed_channels should be divisible by num_heads.'
        self.num_heads = num_heads
        self.head_channels = out_channels // num_heads
        # 1x1 conv aligning image features to the embedding dim; skipped
        # entirely when the dims already match.
        self.embed_conv = ConvModule(
            in_channels,
            embed_channels,
            1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=None) if embed_channels != in_channels else None
        # Projects guide (text) features into the shared embedding space.
        self.guide_fc = Linear(guide_channels, embed_channels)
        # Per-head additive bias applied to the attention logits.
        self.bias = nn.Parameter(torch.zeros(num_heads))
        if with_scale:
            self.scale = nn.Parameter(torch.ones(1, num_heads, 1, 1))
        else:
            self.scale = 1.0
        self.project_conv = conv(in_channels,
                                 out_channels,
                                 kernel_size,
                                 stride=1,
                                 padding=padding,
                                 conv_cfg=conv_cfg,
                                 norm_cfg=norm_cfg,
                                 act_cfg=None)

    def forward(self, x: Tensor, guide: Tensor) -> Tensor:
        """Forward process.

        Args:
            x (Tensor): Image features of shape (B, C, H, W).
            guide (Tensor): Guide (text) features, projected by
                ``guide_fc`` and reshaped to
                (B, num_tokens, num_heads, head_channels).

        Returns:
            Tensor: Attended features of shape (B, out_channels, H, W).
        """
        B, _, H, W = x.shape
        guide = self.guide_fc(guide)
        # NOTE(review): both reshapes below use ``head_channels`` derived
        # from out_channels, so they implicitly assume
        # embed_channels == out_channels — verify configs satisfy this.
        guide = guide.reshape(B, -1, self.num_heads, self.head_channels)
        embed = self.embed_conv(x) if self.embed_conv is not None else x
        embed = embed.reshape(B, self.num_heads, self.head_channels, H, W)
        # Per-head similarity of every pixel against every guide token.
        attn_weight = torch.einsum('bmchw,bnmc->bmhwn', embed, guide)
        # Keep only the best-matching token (the "max" in max-sigmoid).
        attn_weight = attn_weight.max(dim=-1)[0]
        attn_weight = attn_weight / (self.head_channels**0.5)
        attn_weight = attn_weight + self.bias[None, :, None, None]
        attn_weight = attn_weight.sigmoid() * self.scale
        x = self.project_conv(x)
        # Apply the per-head sigmoid gate to the projected features.
        x = x.reshape(B, self.num_heads, -1, H, W)
        x = x * attn_weight.unsqueeze(2)
        x = x.reshape(B, -1, H, W)
        return x
@MODELS.register_module()
class MaxSigmoidCSPLayerWithTwoConv(CSPLayerWithTwoConv):
    """Sigmoid-attention based CSP layer with two convolution layers.

    Extends the YOLOv8 ``CSPLayerWithTwoConv`` by appending a
    ``MaxSigmoidAttnBlock`` so the last CSP branch output is additionally
    modulated by guide (text) features before the final fusion conv.

    Args:
        in_channels (int): Input channels.
        out_channels (int): Output channels.
        guide_channels (int): Channels of the guide (text) features.
        embed_channels (int): Embedding channels of the attention block.
        num_heads (int): Attention heads of the attention block.
            Defaults to 1.
        expand_ratio (float): Hidden-channel expand ratio. Defaults to 0.5.
        num_blocks (int): Number of dark blocks. Defaults to 1.
        with_scale (bool): Learnable attention scale. Defaults to False.
        add_identity (bool): Add identity in dark blocks. Defaults to True.
        conv_cfg (dict, optional): Conv config.
        norm_cfg (dict): Norm config. Defaults to BN.
        act_cfg (dict): Activation config. Defaults to SiLU.
        init_cfg (dict or list, optional): Initialization config.
    """

    def __init__(
            self,
            in_channels: int,
            out_channels: int,
            guide_channels: int,
            embed_channels: int,
            num_heads: int = 1,
            expand_ratio: float = 0.5,
            num_blocks: int = 1,
            with_scale: bool = False,
            add_identity: bool = True,
            conv_cfg: OptConfigType = None,
            norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001),
            act_cfg: ConfigType = dict(type='SiLU', inplace=True),
            init_cfg: OptMultiConfig = None) -> None:
        super().__init__(in_channels=in_channels,
                         out_channels=out_channels,
                         expand_ratio=expand_ratio,
                         num_blocks=num_blocks,
                         add_identity=add_identity,
                         conv_cfg=conv_cfg,
                         norm_cfg=norm_cfg,
                         act_cfg=act_cfg,
                         init_cfg=init_cfg)
        # Fuses 2 split halves + num_blocks block outputs + 1 attention
        # output, hence (3 + num_blocks) groups of mid_channels.
        self.final_conv = ConvModule((3 + num_blocks) * self.mid_channels,
                                     out_channels,
                                     1,
                                     conv_cfg=conv_cfg,
                                     norm_cfg=norm_cfg,
                                     act_cfg=act_cfg)
        self.attn_block = MaxSigmoidAttnBlock(self.mid_channels,
                                              self.mid_channels,
                                              guide_channels=guide_channels,
                                              embed_channels=embed_channels,
                                              num_heads=num_heads,
                                              with_scale=with_scale,
                                              conv_cfg=conv_cfg,
                                              norm_cfg=norm_cfg)

    def forward(self, x: Tensor, guide: Tensor) -> Tensor:
        """Forward process.

        Args:
            x (Tensor): Image features.
            guide (Tensor): Guide (text) features for the attention block.

        Returns:
            Tensor: Fused output features.
        """
        x_main = self.main_conv(x)
        # Split into the two CSP branches of mid_channels each.
        x_main = list(x_main.split((self.mid_channels, self.mid_channels), 1))
        # Run dark blocks sequentially, keeping every intermediate output.
        x_main.extend(blocks(x_main[-1]) for blocks in self.blocks)
        # Text-guided attention applied to the last block output.
        x_main.append(self.attn_block(x_main[-1], guide))
        return self.final_conv(torch.cat(x_main, 1))
@MODELS.register_module()
class ImagePoolingAttentionModule(nn.Module):
    """Image-pooling attention that enhances text features with image context.

    Each image feature level is projected to ``embed_channels``, max-pooled
    to a fixed ``pool_size x pool_size`` grid, and the pooled patches from
    all levels serve as keys/values of a multi-head attention queried by
    the text features.  The attended result is added back onto the text
    features, optionally gated by a learnable scalar that starts at 0 so
    the module begins as an identity on the text features.

    Args:
        image_channels (List[int]): Channels of each input image level.
        text_channels (int): Channel dimension of the text features.
        embed_channels (int): Attention embedding dimension; must be
            divisible by ``num_heads``.
        with_scale (bool): Learn a scalar gate (initialized to 0) on the
            attended residual. Defaults to False.
        num_feats (int): Number of image feature levels. Defaults to 3.
        num_heads (int): Number of attention heads. Defaults to 8.
        pool_size (int): Side length of the pooled patch grid.
            Defaults to 3.
    """

    def __init__(self,
                 image_channels: List[int],
                 text_channels: int,
                 embed_channels: int,
                 with_scale: bool = False,
                 num_feats: int = 3,
                 num_heads: int = 8,
                 pool_size: int = 3):
        super().__init__()
        # Consistent with the other attention blocks in this file: the
        # embedding must split evenly across heads, otherwise the q/k/v
        # reshapes in ``forward`` would silently misalign channels.
        assert embed_channels % num_heads == 0, \
            'embed_channels should be divisible by num_heads.'
        self.text_channels = text_channels
        self.embed_channels = embed_channels
        self.num_heads = num_heads
        self.num_feats = num_feats
        self.head_channels = embed_channels // num_heads
        self.pool_size = pool_size
        if with_scale:
            # Zero init: the module contributes nothing at the start of
            # training and learns how much image context to mix in.
            self.scale = nn.Parameter(torch.tensor([0.]), requires_grad=True)
        else:
            self.scale = 1.0
        # One 1x1 projection per image level, mapping to embed_channels.
        self.projections = nn.ModuleList([
            ConvModule(in_channels, embed_channels, 1, act_cfg=None)
            for in_channels in image_channels
        ])
        self.query = nn.Sequential(nn.LayerNorm(text_channels),
                                   Linear(text_channels, embed_channels))
        self.key = nn.Sequential(nn.LayerNorm(embed_channels),
                                 Linear(embed_channels, embed_channels))
        self.value = nn.Sequential(nn.LayerNorm(embed_channels),
                                   Linear(embed_channels, embed_channels))
        self.proj = Linear(embed_channels, text_channels)
        # Fixed-size pooling so every level contributes pool_size**2 patches.
        self.image_pools = nn.ModuleList([
            nn.AdaptiveMaxPool2d((pool_size, pool_size))
            for _ in range(num_feats)
        ])

    def forward(self, text_features, image_features):
        """Attend text features over pooled multi-level image features.

        Args:
            text_features (Tensor): Shape (B, num_tokens, text_channels).
            image_features (List[Tensor]): ``num_feats`` feature maps of
                shape (B, C_i, H_i, W_i).

        Returns:
            Tensor: Enhanced text features, same shape as the input.
        """
        B = image_features[0].shape[0]
        assert len(image_features) == self.num_feats
        num_patches = self.pool_size**2
        # Project + pool each level, then flatten to (B, C, num_patches).
        mlvl_image_features = [
            pool(proj(x)).view(B, -1, num_patches)
            for (x, proj, pool
                 ) in zip(image_features, self.projections, self.image_pools)
        ]
        # Concatenate patches from all levels: (B, num_feats*P*P, C).
        mlvl_image_features = torch.cat(mlvl_image_features,
                                        dim=-1).transpose(1, 2)
        q = self.query(text_features)
        k = self.key(mlvl_image_features)
        v = self.value(mlvl_image_features)
        q = q.reshape(B, -1, self.num_heads, self.head_channels)
        k = k.reshape(B, -1, self.num_heads, self.head_channels)
        v = v.reshape(B, -1, self.num_heads, self.head_channels)
        # Scaled dot-product attention: text queries over image patches.
        attn_weight = torch.einsum('bnmc,bkmc->bmnk', q, k)
        attn_weight = attn_weight / (self.head_channels**0.5)
        attn_weight = F.softmax(attn_weight, dim=-1)
        x = torch.einsum('bmnk,bkmc->bnmc', attn_weight, v)
        x = self.proj(x.reshape(B, -1, self.embed_channels))
        # Gated residual back onto the original text features.
        return x * self.scale + text_features
@MODELS.register_module()
class VanillaSigmoidBlock(BaseModule):
    """Sigmoid attention block.

    A lightweight stand-in for :class:`MaxSigmoidAttnBlock`: it projects
    the input and gates it by its own sigmoid (``y = p * sigmoid(p)``).
    The ``guide`` tensor is accepted only for interface compatibility and
    is never used.

    Args:
        in_channels (int): Input channels.
        out_channels (int): Output channels; must be divisible by
            ``num_heads``.
        guide_channels (int): Unused; kept for interface parity.
        embed_channels (int): Only validated for head divisibility.
        kernel_size (int): Projection conv kernel size. Defaults to 3.
        padding (int): Projection conv padding. Defaults to 1.
        num_heads (int): Head count used for divisibility checks only.
            Defaults to 1.
        use_depthwise (bool): Use a depthwise-separable projection conv.
            Defaults to False.
        with_scale (bool): Unused; kept for interface parity.
        conv_cfg (dict, optional): Conv config.
        norm_cfg (dict): Norm config. Defaults to BN.
        init_cfg (dict or list, optional): Initialization config.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 guide_channels: int,
                 embed_channels: int,
                 kernel_size: int = 3,
                 padding: int = 1,
                 num_heads: int = 1,
                 use_depthwise: bool = False,
                 with_scale: bool = False,
                 conv_cfg: OptConfigType = None,
                 norm_cfg: ConfigType = dict(type='BN',
                                             momentum=0.03,
                                             eps=0.001),
                 init_cfg: OptMultiConfig = None) -> None:
        super().__init__(init_cfg=init_cfg)
        if use_depthwise:
            conv_layer = DepthwiseSeparableConvModule
        else:
            conv_layer = ConvModule
        assert (out_channels % num_heads == 0 and
                embed_channels % num_heads == 0), \
            'out_channels and embed_channels should be divisible by num_heads.'
        self.num_heads = num_heads
        self.head_channels = out_channels // num_heads
        self.project_conv = conv_layer(in_channels,
                                       out_channels,
                                       kernel_size,
                                       stride=1,
                                       padding=padding,
                                       conv_cfg=conv_cfg,
                                       norm_cfg=norm_cfg,
                                       act_cfg=None)

    def forward(self, x: Tensor, guide: Tensor) -> Tensor:
        """Forward process.

        Args:
            x (Tensor): Image features of shape (B, C, H, W).
            guide (Tensor): Ignored.

        Returns:
            Tensor: Self-gated projected features.
        """
        projected = self.project_conv(x)
        return projected * projected.sigmoid()
@MODELS.register_module()
class EfficientCSPLayerWithTwoConv(CSPLayerWithTwoConv):
    """Sigmoid-attention based CSP layer with two convolution layers.

    Efficient variant of :class:`MaxSigmoidCSPLayerWithTwoConv` that uses
    the guide-free :class:`VanillaSigmoidBlock` as its attention block
    (the guide tensor is threaded through but unused by that block).

    Args:
        in_channels (int): Input channels.
        out_channels (int): Output channels.
        guide_channels (int): Channels of the guide (text) features
            (passed through for interface parity).
        embed_channels (int): Embedding channels of the attention block.
        num_heads (int): Attention heads of the attention block.
            Defaults to 1.
        expand_ratio (float): Hidden-channel expand ratio. Defaults to 0.5.
        num_blocks (int): Number of dark blocks. Defaults to 1.
        with_scale (bool): Learnable attention scale. Defaults to False.
        add_identity (bool): Add identity in dark blocks. Defaults to True.
        conv_cfg (dict, optional): Conv config.
        norm_cfg (dict): Norm config. Defaults to BN.
        act_cfg (dict): Activation config. Defaults to SiLU.
        init_cfg (dict or list, optional): Initialization config.
    """

    def __init__(
            self,
            in_channels: int,
            out_channels: int,
            guide_channels: int,
            embed_channels: int,
            num_heads: int = 1,
            expand_ratio: float = 0.5,
            num_blocks: int = 1,
            with_scale: bool = False,
            add_identity: bool = True,
            conv_cfg: OptConfigType = None,
            norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001),
            act_cfg: ConfigType = dict(type='SiLU', inplace=True),
            init_cfg: OptMultiConfig = None) -> None:
        super().__init__(in_channels=in_channels,
                         out_channels=out_channels,
                         expand_ratio=expand_ratio,
                         num_blocks=num_blocks,
                         add_identity=add_identity,
                         conv_cfg=conv_cfg,
                         norm_cfg=norm_cfg,
                         act_cfg=act_cfg,
                         init_cfg=init_cfg)
        # Fuses 2 split halves + num_blocks block outputs + 1 attention
        # output, hence (3 + num_blocks) groups of mid_channels.
        self.final_conv = ConvModule((3 + num_blocks) * self.mid_channels,
                                     out_channels,
                                     1,
                                     conv_cfg=conv_cfg,
                                     norm_cfg=norm_cfg,
                                     act_cfg=act_cfg)
        self.attn_block = VanillaSigmoidBlock(self.mid_channels,
                                              self.mid_channels,
                                              guide_channels=guide_channels,
                                              embed_channels=embed_channels,
                                              num_heads=num_heads,
                                              with_scale=with_scale,
                                              conv_cfg=conv_cfg,
                                              norm_cfg=norm_cfg)

    def forward(self, x: Tensor, guide: Tensor) -> Tensor:
        """Forward process.

        Args:
            x (Tensor): Image features.
            guide (Tensor): Guide features (unused by the vanilla block).

        Returns:
            Tensor: Fused output features.
        """
        x_main = self.main_conv(x)
        # Split into the two CSP branches of mid_channels each.
        x_main = list(x_main.split((self.mid_channels, self.mid_channels), 1))
        # Run dark blocks sequentially, keeping every intermediate output.
        x_main.extend(blocks(x_main[-1]) for blocks in self.blocks)
        x_main.append(self.attn_block(x_main[-1], guide))
        return self.final_conv(torch.cat(x_main, 1))
.\YOLO-World\yolo_world\models\layers\__init__.py
from .yolo_bricks import (
CSPLayerWithTwoConv,
MaxSigmoidAttnBlock,
MaxSigmoidCSPLayerWithTwoConv,
ImagePoolingAttentionModule,
)
__all__ = ['CSPLayerWithTwoConv',
'MaxSigmoidAttnBlock',
'MaxSigmoidCSPLayerWithTwoConv',
'ImagePoolingAttentionModule']
.\YOLO-World\yolo_world\models\losses\dynamic_loss.py
from typing import Optional
import torch
import torch.nn as nn
from torch import Tensor
from mmdet.models.losses.mse_loss import mse_loss
from mmyolo.registry import MODELS
@MODELS.register_module()
class CoVMSELoss(nn.Module):
    """MSE loss on the coefficient of variation of the predictions.

    Computes ``std(pred) / clamp(mean(pred), eps)`` along ``dim`` and
    regresses it toward zero with an MSE loss, encouraging uniform
    predictions along that dimension.

    Args:
        dim (int): Dimension over which the statistics are taken.
            Defaults to 0.
        reduction (str): Reduction mode, 'none' | 'mean' | 'sum'.
            Defaults to 'mean'.
        loss_weight (float): Weight multiplied onto the loss.
            Defaults to 1.0.
        eps (float): Lower bound on the mean to avoid division by zero.
            Defaults to 1e-6.
    """

    def __init__(self,
                 dim: int = 0,
                 reduction: str = 'mean',
                 loss_weight: float = 1.0,
                 eps: float = 1e-6) -> None:
        super().__init__()
        self.dim = dim
        self.reduction = reduction
        self.loss_weight = loss_weight
        self.eps = eps

    def forward(self,
                pred: Tensor,
                weight: Optional[Tensor] = None,
                avg_factor: Optional[int] = None,
                reduction_override: Optional[str] = None) -> Tensor:
        """Forward function of loss.

        Args:
            pred (Tensor): Predictions.
            weight (Tensor, optional): Element-wise loss weights.
            avg_factor (int, optional): Normalization factor.
            reduction_override (str, optional): Overrides the configured
                reduction for this call.

        Returns:
            Tensor: The weighted loss.
        """
        assert reduction_override in (None, 'none', 'mean', 'sum')
        if reduction_override:
            reduction = reduction_override
        else:
            reduction = self.reduction
        # Coefficient of variation; the clamp guards against a ~zero mean.
        clamped_mean = pred.mean(self.dim).clamp(min=self.eps)
        cov = pred.std(self.dim) / clamped_mean
        # Regress the CoV toward zero.
        zeros = torch.zeros_like(cov)
        return self.loss_weight * mse_loss(
            cov, zeros, weight, reduction=reduction, avg_factor=avg_factor)
.\YOLO-World\yolo_world\models\losses\__init__.py
from .dynamic_loss import CoVMSELoss
__all__ = ['CoVMSELoss']
.\YOLO-World\yolo_world\models\necks\yolo_world_pafpn.py
import copy
from typing import List, Union
import torch
import torch.nn as nn
from torch import Tensor
from mmdet.utils import ConfigType, OptMultiConfig
from mmyolo.registry import MODELS
from mmyolo.models.utils import make_divisible, make_round
from mmyolo.models.necks.yolov8_pafpn import YOLOv8PAFPN
@MODELS.register_module()
class YOLOWorldPAFPN(YOLOv8PAFPN):
    """Path Aggregation Network used in YOLO World.

    Following YOLOv8 PAFPN, including text to image fusion: every
    top-down and bottom-up fusion layer additionally consumes the text
    features.

    Args:
        in_channels (List[int]): Input channels per level.
        out_channels (Union[List[int], int]): Output channels per level.
        guide_channels (int): Channels of the guide (text) features.
        embed_channels (List[int]): Per-level embedding channels for the
            fusion blocks.
        num_heads (List[int]): Per-level attention head counts.
        deepen_factor (float): Depth multiplier. Defaults to 1.0.
        widen_factor (float): Width multiplier. Defaults to 1.0.
        num_csp_blocks (int): Base number of CSP blocks. Defaults to 3.
        freeze_all (bool): Freeze all parameters. Defaults to False.
        block_cfg (dict): Config of the fusion block built per layer.
        norm_cfg (dict): Norm config. Defaults to BN.
        act_cfg (dict): Activation config. Defaults to SiLU.
        init_cfg (dict or list, optional): Initialization config.
    """

    def __init__(self,
                 in_channels: List[int],
                 out_channels: Union[List[int], int],
                 guide_channels: int,
                 embed_channels: List[int],
                 num_heads: List[int],
                 deepen_factor: float = 1.0,
                 widen_factor: float = 1.0,
                 num_csp_blocks: int = 3,
                 freeze_all: bool = False,
                 block_cfg: ConfigType = dict(type='CSPLayerWithTwoConv'),
                 norm_cfg: ConfigType = dict(type='BN',
                                             momentum=0.03,
                                             eps=0.001),
                 act_cfg: ConfigType = dict(type='SiLU', inplace=True),
                 init_cfg: OptMultiConfig = None) -> None:
        # These must be set BEFORE super().__init__(), which calls the
        # build_*_layer hooks below during construction.
        self.guide_channels = guide_channels
        self.embed_channels = embed_channels
        self.num_heads = num_heads
        self.block_cfg = block_cfg
        super().__init__(in_channels=in_channels,
                         out_channels=out_channels,
                         deepen_factor=deepen_factor,
                         widen_factor=widen_factor,
                         num_csp_blocks=num_csp_blocks,
                         freeze_all=freeze_all,
                         norm_cfg=norm_cfg,
                         act_cfg=act_cfg,
                         init_cfg=init_cfg)

    def build_top_down_layer(self, idx: int) -> nn.Module:
        """build top down layer.

        Args:
            idx (int): layer idx.

        Returns:
            nn.Module: The top down layer.
        """
        # Deep copy so per-layer updates never leak into self.block_cfg.
        block_cfg = copy.deepcopy(self.block_cfg)
        block_cfg.update(
            dict(in_channels=make_divisible(
                (self.in_channels[idx - 1] + self.in_channels[idx]),
                self.widen_factor),
                 out_channels=make_divisible(self.out_channels[idx - 1],
                                             self.widen_factor),
                 guide_channels=self.guide_channels,
                 embed_channels=make_round(self.embed_channels[idx - 1],
                                           self.widen_factor),
                 num_heads=make_round(self.num_heads[idx - 1],
                                      self.widen_factor),
                 num_blocks=make_round(self.num_csp_blocks,
                                       self.deepen_factor),
                 add_identity=False,
                 norm_cfg=self.norm_cfg,
                 act_cfg=self.act_cfg))
        return MODELS.build(block_cfg)

    def build_bottom_up_layer(self, idx: int) -> nn.Module:
        """build bottom up layer.

        Args:
            idx (int): layer idx.

        Returns:
            nn.Module: The bottom up layer.
        """
        # Deep copy so per-layer updates never leak into self.block_cfg.
        block_cfg = copy.deepcopy(self.block_cfg)
        block_cfg.update(
            dict(in_channels=make_divisible(
                (self.out_channels[idx] + self.out_channels[idx + 1]),
                self.widen_factor),
                 out_channels=make_divisible(self.out_channels[idx + 1],
                                             self.widen_factor),
                 guide_channels=self.guide_channels,
                 embed_channels=make_round(self.embed_channels[idx + 1],
                                           self.widen_factor),
                 num_heads=make_round(self.num_heads[idx + 1],
                                      self.widen_factor),
                 num_blocks=make_round(self.num_csp_blocks,
                                       self.deepen_factor),
                 add_identity=False,
                 norm_cfg=self.norm_cfg,
                 act_cfg=self.act_cfg))
        return MODELS.build(block_cfg)

    def forward(self, img_feats: List[Tensor], txt_feats: Tensor) -> tuple:
        """Forward function.

        including multi-level image features, text features: BxLxD

        Args:
            img_feats (List[Tensor]): Multi-level image features, low to
                high resolution order per ``in_channels``.
            txt_feats (Tensor): Text features of shape (B, L, D).

        Returns:
            tuple: Per-level output features.
        """
        assert len(img_feats) == len(self.in_channels)
        # Channel-reduce each input level.
        reduce_outs = []
        for idx in range(len(self.in_channels)):
            reduce_outs.append(self.reduce_layers[idx](img_feats[idx]))
        # Top-down path: start from the highest (coarsest) level and fuse
        # downward, injecting text features into every fusion block.
        inner_outs = [reduce_outs[-1]]
        for idx in range(len(self.in_channels) - 1, 0, -1):
            feat_high = inner_outs[0]
            feat_low = reduce_outs[idx - 1]
            upsample_feat = self.upsample_layers[len(self.in_channels) - 1 - idx](feat_high)
            if self.upsample_feats_cat_first:
                top_down_layer_inputs = torch.cat([upsample_feat, feat_low], 1)
            else:
                top_down_layer_inputs = torch.cat([feat_low, upsample_feat], 1)
            inner_out = self.top_down_layers[len(self.in_channels) - 1 - idx](top_down_layer_inputs, txt_feats)
            inner_outs.insert(0, inner_out)
        # Bottom-up path: downsample and fuse upward, again with text.
        outs = [inner_outs[0]]
        for idx in range(len(self.in_channels) - 1):
            feat_low = outs[-1]
            feat_high = inner_outs[idx + 1]
            downsample_feat = self.downsample_layers[idx](feat_low)
            out = self.bottom_up_layers[idx](torch.cat([downsample_feat, feat_high], 1), txt_feats)
            outs.append(out)
        # Per-level output convs.
        results = []
        for idx in range(len(self.in_channels)):
            results.append(self.out_layers[idx](outs[idx]))
        return tuple(results)
@MODELS.register_module()
class YOLOWorldDualPAFPN(YOLOWorldPAFPN):
    """Path Aggregation Network used in YOLO World v8.

    On top of :class:`YOLOWorldPAFPN`, inserts an image-pooling text
    enhancer between the top-down and bottom-up paths, so the text
    features used by the bottom-up fusion blocks are conditioned on the
    aggregated image features.

    Args:
        in_channels (List[int]): Input channels per level.
        out_channels (Union[List[int], int]): Output channels per level.
        guide_channels (int): Channels of the guide (text) features.
        embed_channels (List[int]): Per-level embedding channels.
        num_heads (List[int]): Per-level attention head counts.
        deepen_factor (float): Depth multiplier. Defaults to 1.0.
        widen_factor (float): Width multiplier. Defaults to 1.0.
        num_csp_blocks (int): Base number of CSP blocks. Defaults to 3.
        freeze_all (bool): Freeze all parameters. Defaults to False.
        text_enhancder (dict): Config of the text enhancer; its
            ``image_channels``, ``text_channels`` and ``num_feats`` keys
            are filled in automatically.  NOTE: the parameter name keeps
            the historical misspelling for config compatibility.
        block_cfg (dict): Config of the fusion block built per layer.
        norm_cfg (dict): Norm config. Defaults to BN.
        act_cfg (dict): Activation config. Defaults to SiLU.
        init_cfg (dict or list, optional): Initialization config.
    """

    def __init__(self,
                 in_channels: List[int],
                 out_channels: Union[List[int], int],
                 guide_channels: int,
                 embed_channels: List[int],
                 num_heads: List[int],
                 deepen_factor: float = 1.0,
                 widen_factor: float = 1.0,
                 num_csp_blocks: int = 3,
                 freeze_all: bool = False,
                 text_enhancder: ConfigType = dict(
                     type='ImagePoolingAttentionModule',
                     embed_channels=256,
                     num_heads=8,
                     pool_size=3),
                 block_cfg: ConfigType = dict(type='CSPLayerWithTwoConv'),
                 norm_cfg: ConfigType = dict(type='BN',
                                             momentum=0.03,
                                             eps=0.001),
                 act_cfg: ConfigType = dict(type='SiLU', inplace=True),
                 init_cfg: OptMultiConfig = None) -> None:
        super().__init__(in_channels=in_channels,
                         out_channels=out_channels,
                         guide_channels=guide_channels,
                         embed_channels=embed_channels,
                         num_heads=num_heads,
                         deepen_factor=deepen_factor,
                         widen_factor=widen_factor,
                         num_csp_blocks=num_csp_blocks,
                         freeze_all=freeze_all,
                         block_cfg=block_cfg,
                         norm_cfg=norm_cfg,
                         act_cfg=act_cfg,
                         init_cfg=init_cfg)
        # Deep-copy before updating: ``text_enhancder`` defaults to a
        # mutable dict shared across all instances, and updating it in
        # place would leak this instance's channel settings into the
        # shared default (and into the caller's config object).
        text_enhancder = copy.deepcopy(text_enhancder)
        text_enhancder.update(
            dict(
                image_channels=[int(x * widen_factor) for x in out_channels],
                text_channels=guide_channels,
                num_feats=len(out_channels),
            ))
        self.text_enhancer = MODELS.build(text_enhancder)

    def forward(self, img_feats: List[Tensor], txt_feats: Tensor) -> tuple:
        """Forward function.

        Args:
            img_feats (List[Tensor]): Multi-level image features.
            txt_feats (Tensor): Text features of shape (B, L, D).

        Returns:
            tuple: Per-level output features.
        """
        assert len(img_feats) == len(self.in_channels)
        # Channel-reduce each input level.
        reduce_outs = []
        for idx in range(len(self.in_channels)):
            reduce_outs.append(self.reduce_layers[idx](img_feats[idx]))
        # Top-down path with text-guided fusion.
        inner_outs = [reduce_outs[-1]]
        for idx in range(len(self.in_channels) - 1, 0, -1):
            feat_high = inner_outs[0]
            feat_low = reduce_outs[idx - 1]
            upsample_feat = self.upsample_layers[len(self.in_channels) - 1 - idx](feat_high)
            if self.upsample_feats_cat_first:
                top_down_layer_inputs = torch.cat([upsample_feat, feat_low], 1)
            else:
                top_down_layer_inputs = torch.cat([feat_low, upsample_feat], 1)
            inner_out = self.top_down_layers[len(self.in_channels) - 1 - idx](top_down_layer_inputs, txt_feats)
            inner_outs.insert(0, inner_out)
        # Enhance the text features with the aggregated image features
        # before running the bottom-up path.
        txt_feats = self.text_enhancer(txt_feats, inner_outs)
        # Bottom-up path with the enhanced text features.
        outs = [inner_outs[0]]
        for idx in range(len(self.in_channels) - 1):
            feat_low = outs[-1]
            feat_high = inner_outs[idx + 1]
            downsample_feat = self.downsample_layers[idx](feat_low)
            out = self.bottom_up_layers[idx](torch.cat([downsample_feat, feat_high], 1), txt_feats)
            outs.append(out)
        # Per-level output convs.
        results = []
        for idx in range(len(self.in_channels)):
            results.append(self.out_layers[idx](outs[idx]))
        return tuple(results)
.\YOLO-World\yolo_world\models\necks\__init__.py
from .yolo_world_pafpn import YOLOWorldPAFPN, YOLOWorldDualPAFPN
__all__ = ['YOLOWorldPAFPN', 'YOLOWorldDualPAFPN']
.\YOLO-World\yolo_world\models\__init__.py
from .backbones import *
from .layers import *
from .detectors import *
from .losses import *
from .data_preprocessors import *
from .dense_heads import *
from .necks import *
.\YOLO-World\yolo_world\version.py
__version__ = '0.1.0'


def parse_version_info(version_str):
    """Parse a version string into a tuple.

    Args:
        version_str (str): The version string.

    Returns:
        tuple[int | str]: The version info, e.g., "1.3.0" is parsed into
            (1, 3, 0), and "2.0.0rc1" is parsed into (2, 0, 0, 'rc1').
    """
    info = []
    for component in version_str.split('.'):
        if component.isdigit():
            info.append(int(component))
        elif 'rc' in component:
            # Split "NrcM" into the numeric patch and the rc tag.
            patch, _, rc_suffix = component.partition('rc')
            info.append(int(patch))
            info.append(f'rc{rc_suffix}')
    return tuple(info)


# Parsed form of the package version, computed once at import time.
version_info = parse_version_info(__version__)

__all__ = ['__version__', 'version_info', 'parse_version_info']
.\YOLO-World\yolo_world\__init__.py
from .models import *
from .datasets import *
from .engine import *