.\YOLO-World\yolo_world\datasets\transformers\mm_transforms.py
import json
import random
from typing import Tuple
import numpy as np
from mmyolo.registry import TRANSFORMS
@TRANSFORMS.register_module()
class RandomLoadText:
    """Transform that configures random class-text sampling.

    Only the constructor is defined in this chunk; it records the sampling
    parameters and optionally loads per-class captions from a JSON file.

    Args:
        text_path (str, optional): Path to a JSON file of class captions.
        prompt_format (str): Format string applied to each caption.
        num_neg_samples (Tuple[int, int]): Range for negative-sample counts.
        max_num_samples (int): Upper bound on sampled texts.
        padding_to_max (bool): Pad the text list up to ``max_num_samples``.
        padding_value (str): Filler caption used when padding.
    """

    def __init__(self,
                 text_path: str = None,
                 prompt_format: str = '{}',
                 num_neg_samples: Tuple[int, int] = (80, 80),
                 max_num_samples: int = 80,
                 padding_to_max: bool = False,
                 padding_value: str = '') -> None:
        # Load the caption table first when a path is supplied.
        if text_path is not None:
            with open(text_path, 'r') as handle:
                self.class_texts = json.load(handle)
        self.prompt_format = prompt_format
        self.num_neg_samples = num_neg_samples
        self.max_num_samples = max_num_samples
        self.padding_to_max = padding_to_max
        self.padding_value = padding_value
@TRANSFORMS.register_module()
class LoadText:
    """Load fixed class texts into ``results['texts']``.

    Args:
        text_path (str, optional): Path to a JSON file with per-class
            caption lists. Used as a fallback when the incoming ``results``
            dict carries no ``'texts'`` entry.
        prompt_format (str): Format string applied to each caption.
        multi_prompt_flag (str): Separator for multi-prompt captions.
            NOTE(review): stored but not used in ``__call__`` — verify
            whether it is consumed elsewhere.
    """

    def __init__(self,
                 text_path: str = None,
                 prompt_format: str = '{}',
                 multi_prompt_flag: str = '/') -> None:
        self.prompt_format = prompt_format
        self.multi_prompt_flag = multi_prompt_flag
        if text_path is not None:
            with open(text_path, 'r') as f:
                self.class_texts = json.load(f)

    def __call__(self, results: dict) -> dict:
        """Format the first caption of every class and store the list.

        Raises:
            AssertionError: If neither ``results['texts']`` nor a loaded
                ``class_texts`` table is available, or a class has no caption.
        """
        assert 'texts' in results or hasattr(self, 'class_texts'), (
            'No texts found in results.')
        class_texts = results.get(
            'texts',
            getattr(self, 'class_texts', None))
        # Only the first caption per class is used; no index is needed
        # (the original looped with an unused ``enumerate`` counter).
        texts = []
        for cls_caps in class_texts:
            assert len(cls_caps) > 0
            texts.append(self.prompt_format.format(cls_caps[0]))
        results['texts'] = texts
        return results
.\YOLO-World\yolo_world\datasets\transformers\__init__.py
# Public API of the transformers subpackage; the MultiModal* names are
# presumably defined in sibling modules not shown in this chunk — verify.
__all__ = ['RandomLoadText', 'LoadText', 'MultiModalMosaic',
           'MultiModalMosaic9', 'YOLOv5MultiModalMixUp',
           'YOLOXMultiModalMixUp']
.\YOLO-World\yolo_world\datasets\utils.py
from typing import Sequence
import torch
from mmengine.dataset import COLLATE_FUNCTIONS
@COLLATE_FUNCTIONS.register_module()
def yolow_collate(data_batch: Sequence,
                  use_ms_training: bool = False) -> dict:
    """Rewrite collate_fn to get faster training speed.

    Args:
        data_batch (Sequence): Batch of data.
        use_ms_training (bool): Whether to use multi-scale training.
    """
    images = []
    bboxes_labels_list = []
    masks_list = []
    for sample_idx, sample in enumerate(data_batch):
        data_samples = sample['data_samples']
        images.append(sample['inputs'])
        instances = data_samples.gt_instances
        boxes = instances.bboxes.tensor
        labels = instances.labels
        if 'masks' in instances:
            masks_list.append(
                instances.masks.to_tensor(dtype=torch.bool,
                                          device=boxes.device))
        # Prepend the batch index so per-image boxes can be recovered
        # after concatenation: rows are [batch_idx, label, x1, y1, x2, y2].
        idx_col = labels.new_full((len(labels), 1), sample_idx)
        bboxes_labels_list.append(
            torch.cat((idx_col, labels[:, None], boxes), dim=1))

    collated_results = {
        'data_samples': {
            'bboxes_labels': torch.cat(bboxes_labels_list, 0)
        }
    }
    if masks_list:
        collated_results['data_samples']['masks'] = torch.cat(masks_list, 0)

    # Multi-scale training keeps a list of images (possibly varying sizes);
    # otherwise stack into a single batch tensor.
    if use_ms_training:
        collated_results['inputs'] = images
    else:
        collated_results['inputs'] = torch.stack(images, 0)

    first_sample = data_batch[0]['data_samples']
    if hasattr(first_sample, 'texts'):
        collated_results['data_samples']['texts'] = [
            item['data_samples'].texts for item in data_batch
        ]
    if hasattr(first_sample, 'is_detection'):
        collated_results['data_samples']['is_detection'] = torch.tensor(
            [item['data_samples'].is_detection for item in data_batch])
    return collated_results
.\YOLO-World\yolo_world\datasets\yolov5_lvis.py
from mmdet.datasets import LVISV1Dataset
from mmyolo.datasets.yolov5_coco import BatchShapePolicyDataset
from mmyolo.registry import DATASETS
@DATASETS.register_module()
class YOLOv5LVISV1Dataset(BatchShapePolicyDataset, LVISV1Dataset):
    """Dataset for YOLOv5 LVIS Dataset.

    We only add `BatchShapePolicy` function compared with LVISV1Dataset.
    See `mmyolo/datasets/utils.py#BatchShapePolicy` for details
    """
    pass
.\YOLO-World\yolo_world\datasets\yolov5_mixed_grounding.py
import os.path as osp
from typing import List, Union
from mmengine.fileio import get_local_path, join_path
from mmengine.utils import is_abs
from mmdet.datasets.coco import CocoDataset
from mmyolo.registry import DATASETS
from mmyolo.datasets.yolov5_coco import BatchShapePolicyDataset
@DATASETS.register_module()
class YOLOv5MixedGroundingDataset(BatchShapePolicyDataset, CocoDataset):
    """Mixed grounding dataset."""

    # Every annotation is mapped to a single generic 'object' class.
    METAINFO = {
        'classes': ('object',),
        'palette': [(220, 20, 60)]}

    def load_data_list(self) -> List[dict]:
        """Load annotations from an annotation file named as ``self.ann_file``

        Returns:
            List[dict]: A list of annotation.
        """
        with get_local_path(
                self.ann_file, backend_args=self.backend_args) as local_path:
            self.coco = self.COCOAPI(local_path)
        img_ids = self.coco.get_img_ids()
        data_list = []
        total_ann_ids = []
        for img_id in img_ids:
            raw_img_info = self.coco.load_imgs([img_id])[0]
            raw_img_info['img_id'] = img_id
            ann_ids = self.coco.get_ann_ids(img_ids=[img_id])
            raw_ann_info = self.coco.load_anns(ann_ids)
            total_ann_ids.extend(ann_ids)
            parsed_data_info = self.parse_data_info({
                'raw_ann_info':
                raw_ann_info,
                'raw_img_info':
                raw_img_info
            })
            data_list.append(parsed_data_info)
        if self.ANN_ID_UNIQUE:
            assert len(set(total_ann_ids)) == len(
                total_ann_ids
            ), f"Annotation ids in '{self.ann_file}' are not unique!"
        # The COCO API object is only needed while loading; free it.
        del self.coco
        return data_list

    def filter_data(self) -> List[dict]:
        """Filter annotations according to filter_cfg.

        Returns:
            List[dict]: Filtered results.
        """
        if self.test_mode:
            return self.data_list
        if self.filter_cfg is None:
            return self.data_list
        filter_empty_gt = self.filter_cfg.get('filter_empty_gt', False)
        min_size = self.filter_cfg.get('min_size', 0)
        # NOTE(review): this set contains the ids of *all* loaded images, so
        # the ``filter_empty_gt`` test below can never exclude anything —
        # presumably it should only collect images that actually have
        # instances; verify against the upstream CocoDataset implementation.
        ids_with_ann = set(data_info['img_id'] for data_info in self.data_list)
        valid_data_infos = []
        for i, data_info in enumerate(self.data_list):
            img_id = data_info['img_id']
            width = int(data_info['width'])
            height = int(data_info['height'])
            if filter_empty_gt and img_id not in ids_with_ann:
                continue
            if min(width, height) >= min_size:
                valid_data_infos.append(data_info)
        return valid_data_infos

    def _join_prefix(self):
        """Join ``self.data_root`` with ``self.data_prefix`` and
        ``self.ann_file``.
        """
        # Only relative paths are joined with data_root; absolute paths and
        # empty values are left untouched.
        if self.ann_file and not is_abs(self.ann_file) and self.data_root:
            self.ann_file = join_path(self.data_root, self.ann_file)
        for data_key, prefix in self.data_prefix.items():
            if isinstance(prefix, (list, tuple)):
                abs_prefix = []
                for p in prefix:
                    if not is_abs(p) and self.data_root:
                        abs_prefix.append(join_path(self.data_root, p))
                    else:
                        abs_prefix.append(p)
                self.data_prefix[data_key] = abs_prefix
            elif isinstance(prefix, str):
                if not is_abs(prefix) and self.data_root:
                    self.data_prefix[data_key] = join_path(
                        self.data_root, prefix)
                else:
                    self.data_prefix[data_key] = prefix
            else:
                raise TypeError('prefix should be a string, tuple or list,'
                                f'but got {type(prefix)}')
.\YOLO-World\yolo_world\datasets\yolov5_obj365v1.py
from mmdet.datasets import Objects365V1Dataset
from mmyolo.datasets.yolov5_coco import BatchShapePolicyDataset
from mmyolo.registry import DATASETS
@DATASETS.register_module()
class YOLOv5Objects365V1Dataset(BatchShapePolicyDataset, Objects365V1Dataset):
    """Dataset for YOLOv5 Objects365 v1 Dataset.

    We only add `BatchShapePolicy` function compared with Objects365V1Dataset.
    See `mmyolo/datasets/utils.py#BatchShapePolicy` for details
    """
    pass
.\YOLO-World\yolo_world\datasets\yolov5_obj365v2.py
from mmdet.datasets import Objects365V2Dataset
from mmyolo.datasets.yolov5_coco import BatchShapePolicyDataset
from mmyolo.registry import DATASETS
@DATASETS.register_module()
class YOLOv5Objects365V2Dataset(BatchShapePolicyDataset, Objects365V2Dataset):
    """Dataset for YOLOv5 Objects365 v2 Dataset.

    We only add `BatchShapePolicy` function compared with Objects365V2Dataset.
    See `mmyolo/datasets/utils.py#BatchShapePolicy` for details
    """
    pass
.\YOLO-World\yolo_world\datasets\yolov5_v3det.py
import copy
import json
import os.path as osp
from typing import List
from mmengine.fileio import get_local_path
from mmdet.datasets.api_wrappers import COCO
from mmdet.datasets import CocoDataset
from mmyolo.datasets.yolov5_coco import BatchShapePolicyDataset
from mmyolo.registry import DATASETS
# Relative image paths to exclude from the V3Det dataset.
# NOTE(review): this list is not referenced anywhere in this chunk —
# presumably consumed by a loading/filter step elsewhere; verify.
v3det_ignore_list = [
    'a00013820/26_275_28143226914_ff3a247c53_c.jpg',
    'n03815615/12_1489_32968099046_be38fa580e_c.jpg',
    'n04550184/19_1480_2504784164_ffa3db8844_c.jpg',
    'a00008703/2_363_3576131784_dfac6fc6ce_c.jpg',
    'n02814533/28_2216_30224383848_a90697f1b3_c.jpg',
    'n12026476/29_186_15091304754_5c219872f7_c.jpg',
    'n01956764/12_2004_50133201066_72e0d9fea5_c.jpg',
    'n03785016/14_2642_518053131_d07abcb5da_c.jpg',
    'a00011156/33_250_4548479728_9ce5246596_c.jpg',
    'a00009461/19_152_2792869324_db95bebc84_c.jpg',
]
@DATASETS.register_module()
class V3DetDataset(CocoDataset):
    """V3Det dataset for detection (COCO-style annotations)."""

    # Placeholder metainfo; real class names come from the annotation file.
    METAINFO = {'classes': 'classes', 'palette': None}
    COCOAPI = COCO
    # Annotation ids are expected to be globally unique.
    ANN_ID_UNIQUE = True
@DATASETS.register_module()
class YOLOv5V3DetDataset(BatchShapePolicyDataset, V3DetDataset):
    """Dataset for YOLOv5 V3Det Dataset.

    We only add `BatchShapePolicy` function compared with V3DetDataset.
    See `mmyolo/datasets/utils.py#BatchShapePolicy` for details
    """
    pass
.\YOLO-World\yolo_world\datasets\__init__.py
from .mm_dataset import (
MultiModalDataset, MultiModalMixedDataset)
from .yolov5_obj365v1 import YOLOv5Objects365V1Dataset
from .yolov5_obj365v2 import YOLOv5Objects365V2Dataset
from .yolov5_mixed_grounding import YOLOv5MixedGroundingDataset
from .utils import yolow_collate
from .transformers import *
from .yolov5_v3det import YOLOv5V3DetDataset
from .yolov5_lvis import YOLOv5LVISV1Dataset
# Public API of the datasets package; transform names re-exported via
# ``from .transformers import *`` keep their own ``__all__``.
__all__ = [
    'MultiModalDataset', 'YOLOv5Objects365V1Dataset',
    'YOLOv5Objects365V2Dataset', 'YOLOv5MixedGroundingDataset',
    'YOLOv5V3DetDataset', 'yolow_collate',
    'YOLOv5LVISV1Dataset', 'MultiModalMixedDataset',
]
.\YOLO-World\yolo_world\engine\optimizers\yolow_v5_optim_constructor.py
import logging
from typing import List, Optional, Union
import torch
import torch.nn as nn
from torch.nn import GroupNorm, LayerNorm
from mmengine.dist import get_world_size
from mmengine.logging import print_log
from mmengine.optim import OptimWrapper, DefaultOptimWrapperConstructor
from mmengine.utils.dl_utils import mmcv_full_available
from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm, _InstanceNorm
from mmyolo.registry import (OPTIM_WRAPPER_CONSTRUCTORS, OPTIM_WRAPPERS,
OPTIMIZERS)
@OPTIM_WRAPPER_CONSTRUCTORS.register_module()
class YOLOWv5OptimizerConstructor(DefaultOptimWrapperConstructor):
    """YOLO World v5 constructor for optimizers.

    Extends the default constructor with YOLOv5-style weight-decay scaling:
    when the effective total batch size differs from
    ``base_total_batch_size``, ``weight_decay`` is rescaled proportionally.
    """

    def __init__(self,
                 optim_wrapper_cfg: dict,
                 paramwise_cfg: Optional[dict] = None) -> None:
        super().__init__(optim_wrapper_cfg, paramwise_cfg)
        # Reference batch size the configured weight_decay was tuned for.
        self.base_total_batch_size = self.paramwise_cfg.pop(
            'base_total_batch_size', 64)

    def __call__(self, model: nn.Module) -> OptimWrapper:
        # Unwrap DDP/DataParallel-style containers.
        if hasattr(model, 'module'):
            model = model.module
        optim_wrapper_cfg = self.optim_wrapper_cfg.copy()
        optim_wrapper_cfg.setdefault('type', 'OptimWrapper')
        optimizer_cfg = self.optimizer_cfg.copy()
        # Scale weight decay with the effective total batch size
        # (world_size * per-GPU batch * gradient accumulation steps).
        if 'batch_size_per_gpu' in optimizer_cfg:
            batch_size_per_gpu = optimizer_cfg.pop('batch_size_per_gpu')
            total_batch_size = get_world_size() * batch_size_per_gpu
            accumulate = max(
                round(self.base_total_batch_size / total_batch_size), 1)
            scale_factor = total_batch_size * \
                accumulate / self.base_total_batch_size
            if scale_factor != 1:
                weight_decay = optimizer_cfg.get('weight_decay', 0)
                weight_decay *= scale_factor
                optimizer_cfg['weight_decay'] = weight_decay
                print_log(f'Scaled weight_decay to {weight_decay}', 'current')
        if not self.paramwise_cfg:
            # No per-parameter options: hand all parameters to the optimizer.
            optimizer_cfg['params'] = model.parameters()
            optimizer = OPTIMIZERS.build(optimizer_cfg)
        else:
            # Delegate parameter grouping (lr/decay multipliers) to add_params.
            params: List = []
            self.add_params(params, model)
            optimizer_cfg['params'] = params
            optimizer = OPTIMIZERS.build(optimizer_cfg)
        optim_wrapper = OPTIM_WRAPPERS.build(
            optim_wrapper_cfg, default_args=dict(optimizer=optimizer))
        return optim_wrapper
.\YOLO-World\yolo_world\engine\optimizers\__init__.py
from .yolow_v5_optim_constructor import YOLOWv5OptimizerConstructor
# Public API of the optimizers subpackage.
__all__ = ['YOLOWv5OptimizerConstructor']
.\YOLO-World\yolo_world\engine\__init__.py
from .optimizers import *
.\YOLO-World\yolo_world\models\backbones\mm_backbone.py
import itertools
from typing import List, Sequence, Tuple
import torch
from torch import Tensor
from torch.nn.modules.batchnorm import _BatchNorm
from mmengine.model import BaseModule
from mmyolo.registry import MODELS
from mmdet.utils import OptMultiConfig, ConfigType
from transformers import (AutoTokenizer, AutoModel, CLIPTextConfig)
from transformers import CLIPTextModelWithProjection as CLIPTP
@MODELS.register_module()
class HuggingVisionBackbone(BaseModule):
    """Vision backbone wrapping a HuggingFace ``AutoModel``.

    Args:
        model_name (str): HuggingFace model identifier passed to
            ``AutoModel.from_pretrained``.
        out_indices (Sequence[int]): Indices of the hidden states to return.
        norm_eval (bool): Keep BatchNorm layers in eval mode during training.
        frozen_modules (Sequence[str]): Name prefixes of submodules to freeze.
        init_cfg: mmengine initialization config.
    """

    def __init__(self,
                 model_name: str,
                 out_indices: Sequence[int] = (0, 1, 2, 3),
                 norm_eval: bool = True,
                 frozen_modules: Sequence[str] = (),
                 init_cfg: OptMultiConfig = None) -> None:
        super().__init__(init_cfg=init_cfg)
        self.norm_eval = norm_eval
        self.frozen_modules = frozen_modules
        # BUG FIX: ``out_indices`` was previously dropped while ``forward``
        # referenced the non-existent attributes ``self.image_model`` and
        # ``self.image_out_indices``; store and use the real names.
        self.out_indices = out_indices
        self.model = AutoModel.from_pretrained(model_name)
        self._freeze_modules()

    def forward(self, image: Tensor) -> Tuple[Tensor]:
        """Return the hidden states selected by ``out_indices``."""
        encoded_dict = self.model(pixel_values=image,
                                  output_hidden_states=True)
        hidden_states = encoded_dict.hidden_states
        # Prefer spatially reshaped hidden states when the model provides
        # them (e.g. hierarchical ViTs); otherwise use the raw ones.
        img_feats = encoded_dict.get('reshaped_hidden_states', hidden_states)
        img_feats = [img_feats[i] for i in self.out_indices]
        return tuple(img_feats)

    def _freeze_modules(self):
        # Put every submodule whose name matches a configured prefix into
        # eval mode and stop its gradients.
        for name, module in self.model.named_modules():
            for frozen_name in self.frozen_modules:
                if name.startswith(frozen_name):
                    module.eval()
                    for param in module.parameters():
                        param.requires_grad = False
                    break

    def train(self, mode=True):
        """Switch train/eval mode while re-applying freezing and norm_eval."""
        super().train(mode)
        self._freeze_modules()
        if mode and self.norm_eval:
            for m in self.modules():
                if isinstance(m, _BatchNorm):
                    m.eval()
@MODELS.register_module()
class HuggingCLIPLanguageBackbone(BaseModule):
    """CLIP text encoder (with projection) wrapping HuggingFace models.

    Args:
        model_name (str): HuggingFace model identifier.
        frozen_modules (Sequence[str]): Name prefixes of submodules to
            freeze; the sentinel value ``("all",)`` freezes everything.
        dropout (float): Attention dropout applied to the CLIP text config.
        training_use_cache (bool): NOTE(review): stored but never consulted
            in this class — ``forward`` always uses the cache at eval time.
        init_cfg: mmengine initialization config.
    """

    def __init__(self,
                 model_name: str,
                 frozen_modules: Sequence[str] = (),
                 dropout: float = 0.0,
                 training_use_cache: bool = False,
                 init_cfg: OptMultiConfig = None) -> None:
        super().__init__(init_cfg=init_cfg)
        self.frozen_modules = frozen_modules
        self.training_use_cache = training_use_cache
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        clip_config = CLIPTextConfig.from_pretrained(model_name, attention_dropout=dropout)
        self.model = CLIPTP.from_pretrained(model_name, config=clip_config)
        self._freeze_modules()

    def forward_cache(self, text: List[List[str]]) -> Tensor:
        """Encode once and reuse the result for every subsequent call.

        NOTE(review): the cache is keyed on nothing — later calls with
        *different* text still return the first result; confirm callers
        only use this with a fixed vocabulary.
        """
        if not hasattr(self, "cache"):
            self.cache = self.forward_text(text)
        return self.cache

    def forward(self, text: List[List[str]]) -> Tensor:
        # Recompute embeddings during training; reuse cached ones at eval.
        if self.training:
            return self.forward_text(text)
        else:
            return self.forward_cache(text)

    def forward_tokenizer(self, texts):
        # Tokenize once and memoize; not called from forward_text in this
        # chunk (which tokenizes directly) — appears to be unused here.
        if not hasattr(self, 'text'):
            text = list(itertools.chain(*texts))
            text = self.tokenizer(text=text, return_tensors='pt', padding=True)
            self.text = text.to(device=self.model.device)
        return self.text

    def forward_text(self, text: List[List[str]]) -> Tensor:
        """Encode a batch of per-image text lists to normalized embeddings.

        Returns a tensor of shape (batch, num_texts, embed_dim); all images
        in the batch must carry the same number of texts.
        """
        num_per_batch = [len(t) for t in text]
        assert max(num_per_batch) == min(num_per_batch), (
            'number of sequences not equal in batch')
        text = list(itertools.chain(*text))
        text = self.tokenizer(text=text, return_tensors='pt', padding=True)
        text = text.to(device=self.model.device)
        txt_outputs = self.model(**text)
        txt_feats = txt_outputs.text_embeds
        # L2-normalize so downstream similarity scores are cosine-based.
        txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True)
        txt_feats = txt_feats.reshape(-1, num_per_batch[0], txt_feats.shape[-1])
        return txt_feats

    def _freeze_modules(self):
        if len(self.frozen_modules) == 0:
            return
        if self.frozen_modules[0] == "all":
            # Freeze the entire text model.
            self.model.eval()
            for _, module in self.model.named_modules():
                module.eval()
                for param in module.parameters():
                    param.requires_grad = False
            return
        # Otherwise freeze only submodules matching a configured prefix.
        for name, module in self.model.named_modules():
            for frozen_name in self.frozen_modules:
                if name.startswith(frozen_name):
                    module.eval()
                    for param in module.parameters():
                        param.requires_grad = False
                    break

    def train(self, mode=True):
        """Switch train/eval mode while keeping frozen modules frozen."""
        super().train(mode)
        self._freeze_modules()
@MODELS.register_module()
class PseudoLanguageBackbone(BaseModule):
    """Pseudo Language Backbone

    Serves pre-computed text embeddings from disk instead of running a
    language model.

    Args:
        text_embed_path (str): path to the text embedding file
        test_embed_path (str, optional): separate embedding file used at
            eval time; falls back to ``text_embed_path`` when ``None``.
        init_cfg: mmengine initialization config.
    """

    def __init__(self,
                 text_embed_path: str = "",
                 test_embed_path: str = None,
                 init_cfg: OptMultiConfig = None):
        super().__init__(init_cfg)
        # Mapping of text key -> embedding tensor, kept on CPU.
        self.text_embed = torch.load(text_embed_path, map_location='cpu')
        if test_embed_path is None:
            self.test_embed = self.text_embed
        else:
            # CONSISTENCY FIX: load to CPU like ``text_embed`` above, so
            # embeddings saved on a GPU do not fail to load (or pin to an
            # unavailable device) on CPU-only machines.
            self.test_embed = torch.load(test_embed_path, map_location='cpu')
        # Dummy buffer whose only job is to track the module's device.
        self.register_buffer("buff", torch.zeros([
            1,
        ]))

    def forward_cache(self, text: List[List[str]]) -> Tensor:
        """Compute embeddings once and reuse them on subsequent calls."""
        if not hasattr(self, "cache"):
            self.cache = self.forward_text(text)
        return self.cache

    def forward(self, text: List[List[str]]) -> Tensor:
        # Recompute during training; use the (never-invalidated) cache at eval.
        if self.training:
            return self.forward_text(text)
        else:
            return self.forward_cache(text)

    def forward_text(self, text: List[List[str]]) -> Tensor:
        """Look up embeddings for a batch of per-image text lists.

        Returns a tensor of shape (batch, num_texts, embed_dim); all images
        must carry the same number of texts.
        """
        num_per_batch = [len(t) for t in text]
        assert max(num_per_batch) == min(num_per_batch), (
            'number of sequences not equal in batch')
        text = list(itertools.chain(*text))
        if self.training:
            text_embed_dict = self.text_embed
        else:
            text_embed_dict = self.test_embed
        # Keys are the part before '/' — presumably '<name>/<prompt>'
        # formatted entries; verify against the embedding file format.
        text_embeds = torch.stack(
            [text_embed_dict[x.split("/")[0]] for x in text])
        text_embeds = text_embeds.to(
            self.buff.device).requires_grad_(False).float()
        text_embeds = text_embeds.reshape(-1, num_per_batch[0],
                                          text_embeds.shape[-1])
        return text_embeds
@MODELS.register_module()
class MultiModalYOLOBackbone(BaseModule):
    """Joint image + text backbone for YOLO-World.

    Builds an image model and a text model from their configs and runs
    both in ``forward``.

    Args:
        image_model (ConfigType): Config for the image backbone.
        text_model (ConfigType): Config for the text backbone.
        frozen_stages (int): Image-model stages to freeze; -1 freezes none.
        init_cfg: mmengine initialization config.
    """

    def __init__(self,
                 image_model: ConfigType,
                 text_model: ConfigType,
                 frozen_stages: int = -1,
                 init_cfg: OptMultiConfig = None) -> None:
        super().__init__(init_cfg)
        self.image_model = MODELS.build(image_model)
        self.text_model = MODELS.build(text_model)
        self.frozen_stages = frozen_stages
        self._freeze_stages()

    def _freeze_stages(self):
        """Freeze the parameters of the specified stage so that they are no
        longer updated."""
        # Assumes the built image model exposes a ``layers`` attribute of
        # stage names (mmyolo backbone convention) — TODO confirm.
        if self.frozen_stages >= 0:
            for i in range(self.frozen_stages + 1):
                m = getattr(self.image_model, self.image_model.layers[i])
                m.eval()
                for param in m.parameters():
                    param.requires_grad = False

    def train(self, mode: bool = True):
        """Convert the model into training mode while keep normalization layer
        frozen."""
        super().train(mode)
        self._freeze_stages()

    def forward(self, image: Tensor,
                text: List[List[str]]) -> Tuple[Tuple[Tensor], Tensor]:
        """Return (multi-scale image features, text features)."""
        img_feats = self.image_model(image)
        txt_feats = self.text_model(text)
        return img_feats, txt_feats
.\YOLO-World\yolo_world\models\backbones\__init__.py
from .mm_backbone import (
MultiModalYOLOBackbone,
HuggingVisionBackbone,
HuggingCLIPLanguageBackbone,
PseudoLanguageBackbone)
# Public API of the backbones subpackage.
__all__ = [
    'MultiModalYOLOBackbone',
    'HuggingVisionBackbone',
    'HuggingCLIPLanguageBackbone',
    'PseudoLanguageBackbone'
]
.\YOLO-World\yolo_world\models\data_preprocessors\data_preprocessor.py
from typing import Optional, Union
import torch
from mmdet.models.data_preprocessors import DetDataPreprocessor
from mmengine.structures import BaseDataElement
from mmyolo.registry import MODELS
# Type alias for values the preprocessor's casting step may receive.
# NOTE(review): not referenced elsewhere in this chunk — verify usage.
CastData = Union[tuple, dict, BaseDataElement, torch.Tensor, list, bytes, str, None]
@MODELS.register_module()
class YOLOWDetDataPreprocessor(DetDataPreprocessor):
    """Rewrite collate_fn to get faster training speed.

    Note: It must be used together with `mmyolo.datasets.utils.yolow_collate`
    """

    def __init__(self, *args, non_blocking: Optional[bool] = True, **kwargs):
        # Default to non-blocking host-to-device copies.
        super().__init__(*args, non_blocking=non_blocking, **kwargs)

    def forward(self, data: dict, training: bool = False) -> dict:
        """Perform normalization, padding and bgr2rgb conversion based on
        ``DetDataPreprocessor``.

        Args:
            data (dict): Data sampled from dataloader.
            training (bool): Whether to enable training time augmentation.

        Returns:
            dict: Data in the same format as the model input.
        """
        # Inference/eval keeps the standard mmdet preprocessing path.
        if not training:
            return super().forward(data, training)
        data = self.cast_data(data)
        inputs, data_samples = data['inputs'], data['data_samples']
        # Training path expects the dict layout produced by yolow_collate.
        assert isinstance(data['data_samples'], dict)
        # BGR -> RGB channel swap when configured.
        if self._channel_conversion and inputs.shape[1] == 3:
            inputs = inputs[:, [2, 1, 0], ...]
        if self._enable_normalize:
            inputs = (inputs - self.mean) / self.std
        if self.batch_augments is not None:
            for batch_aug in self.batch_augments:
                inputs, data_samples = batch_aug(inputs, data_samples)
        # All images in the batch share one padded input shape.
        img_metas = [{'batch_input_shape': inputs.shape[2:]}] * len(inputs)
        data_samples_output = {
            'bboxes_labels': data_samples['bboxes_labels'],
            'texts': data_samples['texts'],
            'img_metas': img_metas
        }
        # Optional keys are forwarded only when the collate produced them.
        if 'masks' in data_samples:
            data_samples_output['masks'] = data_samples['masks']
        if 'is_detection' in data_samples:
            data_samples_output['is_detection'] = data_samples['is_detection']
        return {'inputs': inputs, 'data_samples': data_samples_output}
.\YOLO-World\yolo_world\models\data_preprocessors\__init__.py
from .data_preprocessor import YOLOWDetDataPreprocessor
# Public API of the data_preprocessors subpackage.
__all__ = ['YOLOWDetDataPreprocessor']
.\YOLO-World\yolo_world\models\dense_heads\yolo_world_head.py
import math
import copy
from typing import List, Optional, Tuple, Union, Sequence
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from mmcv.cnn import ConvModule
from mmengine.config import ConfigDict
from mmengine.model import BaseModule
from torch import Tensor
from mmengine.dist import get_dist_info
from mmengine.structures import InstanceData
from mmdet.structures import SampleList
from mmdet.utils import OptConfigType, InstanceList, OptInstanceList
from mmdet.models.utils import (
multi_apply,
unpack_gt_instances,
filter_scores_and_topk)
from mmyolo.registry import MODELS
from mmyolo.models.dense_heads import YOLOv8HeadModule, YOLOv8Head
from mmyolo.models.utils import gt_instances_preprocess
from mmcv.cnn.bricks import build_norm_layer
@MODELS.register_module()
class ContrastiveHead(BaseModule):
    """Contrastive Head for YOLO-World

    compute the region-text scores according to the
    similarity between image and text features

    Args:
        embed_dims (int): embed dim of text and image features
    """

    def __init__(self,
                 embed_dims: int,
                 init_cfg: OptConfigType = None) -> None:
        super().__init__(init_cfg=init_cfg)
        # Learnable logit bias and log-temperature (CLIP-style 1/0.07 init).
        self.bias = nn.Parameter(torch.zeros([]))
        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

    def forward(self, x: Tensor, w: Tensor) -> Tensor:
        """Score image features ``x`` (B,C,H,W) against text embeds ``w`` (B,K,C)."""
        image = F.normalize(x, dim=1, p=2)
        text = F.normalize(w, dim=-1, p=2)
        # Cosine similarity per spatial location and text, scaled and shifted.
        scores = torch.einsum('bchw,bkc->bkhw', image, text)
        return scores * self.logit_scale.exp() + self.bias
@MODELS.register_module()
class BNContrastiveHead(BaseModule):
    """ Batch Norm Contrastive Head for YOLO-World
    using batch norm instead of l2-normalization

    Args:
        embed_dims (int): embed dim of text and image features
        norm_cfg (dict): normalization params
    """

    def __init__(self,
                 embed_dims: int,
                 norm_cfg: ConfigDict,
                 init_cfg: OptConfigType = None) -> None:
        super().__init__(init_cfg=init_cfg)
        # Normalization layer applied to image features instead of l2-norm.
        self.norm = build_norm_layer(norm_cfg, embed_dims)[1]
        # Learnable logit bias and log-temperature (initialized to -1).
        self.bias = nn.Parameter(torch.zeros([]))
        self.logit_scale = nn.Parameter(-1.0 * torch.ones([]))

    def forward(self, x: Tensor, w: Tensor) -> Tensor:
        """Score batch-normed image features against l2-normed text embeds."""
        image = self.norm(x)
        text = F.normalize(w, dim=-1, p=2)
        scores = torch.einsum('bchw,bkc->bkhw', image, text)
        return scores * self.logit_scale.exp() + self.bias
@MODELS.register_module()
class YOLOWorldHeadModule(YOLOv8HeadModule):
    """Head Module for YOLO-World

    Args:
        embed_dims (int): embed dim for text features and image features
        use_bn_head (bool): use batch normalization head
    """

    def __init__(self,
                 *args,
                 embed_dims: int,
                 use_bn_head: bool = False,
                 **kwargs) -> None:
        # Must be set before super().__init__, which builds the layers.
        self.embed_dims = embed_dims
        self.use_bn_head = use_bn_head
        super().__init__(*args, **kwargs)

    def init_weights(self, prior_prob=0.01):
        """Initialize the weights and biases of the head."""
        super().init_weights()
        for cls_pred, cls_contrast, stride in zip(self.cls_preds,
                                                  self.cls_contrasts,
                                                  self.featmap_strides):
            cls_pred[-1].bias.data[:] = 0.0
            if hasattr(cls_contrast, 'bias'):
                # Prior-probability-style init for the contrastive logit bias.
                nn.init.constant_(
                    cls_contrast.bias.data,
                    math.log(5 / self.num_classes / (640 / stride)**2))

    def forward(self, img_feats: Tuple[Tensor],
                txt_feats: Tensor) -> Tuple[List]:
        """Forward features from the upstream network."""
        assert len(img_feats) == self.num_levels
        # The same text features are shared across all pyramid levels.
        txt_feats = [txt_feats for _ in range(self.num_levels)]
        return multi_apply(self.forward_single, img_feats, txt_feats,
                           self.cls_preds, self.reg_preds, self.cls_contrasts)

    def forward_single(self, img_feat: Tensor, txt_feat: Tensor,
                       cls_pred: nn.ModuleList, reg_pred: nn.ModuleList,
                       cls_contrast: nn.ModuleList) -> Tuple:
        """Forward feature of a single scale level."""
        b, _, h, w = img_feat.shape
        cls_embed = cls_pred(img_feat)
        cls_logit = cls_contrast(cls_embed, txt_feat)
        bbox_dist_preds = reg_pred(img_feat)
        if self.reg_max > 1:
            # Distribution-focal-loss style: project the per-side logits
            # onto the discrete bin positions to get expected distances.
            bbox_dist_preds = bbox_dist_preds.reshape(
                [-1, 4, self.reg_max, h * w]).permute(0, 3, 1, 2)
            bbox_preds = bbox_dist_preds.softmax(3).matmul(
                self.proj.view([-1, 1])).squeeze(-1)
            bbox_preds = bbox_preds.transpose(1, 2).reshape(b, -1, h, w)
        else:
            bbox_preds = bbox_dist_preds
        if self.training:
            # Raw distributions are also needed by the DFL loss.
            return cls_logit, bbox_preds, bbox_dist_preds
        else:
            return cls_logit, bbox_preds
@MODELS.register_module()
class YOLOWorldHead(YOLOv8Head):
    """YOLO-World head module, registered and inheriting from YOLOv8Head."""
    """YOLO-World head"""

    def __init__(self, world_size=-1, *args, **kwargs) -> None:
        """Initialize the head and store the ``world_size`` parameter."""
        super().__init__(*args, **kwargs)
        self.world_size = world_size
    """YOLO World v8 head."""

    def loss(self, img_feats: Tuple[Tensor], txt_feats: Tensor,
             batch_data_samples: Union[list, dict]) -> dict:
        """Run the head forward on upstream features and compute losses."""
        outs = self(img_feats, txt_feats)
        # Fast path: batch_data_samples is the dict layout produced by the
        # YOLO-World data preprocessor.
        loss_inputs = outs + (batch_data_samples['bboxes_labels'],
                              batch_data_samples['img_metas'])
        losses = self.loss_by_feat(*loss_inputs)
        return losses

    def loss_and_predict(
        self,
        img_feats: Tuple[Tensor],
        txt_feats: Tensor,
        batch_data_samples: SampleList,
        proposal_cfg: Optional[ConfigDict] = None
    ) -> Tuple[dict, InstanceList]:
        """Run the head forward, then compute both losses and predictions
        from the features and data samples."""
        outputs = unpack_gt_instances(batch_data_samples)
        (batch_gt_instances, batch_gt_instances_ignore,
         batch_img_metas) = outputs
        outs = self(img_feats, txt_feats)
        loss_inputs = outs + (batch_gt_instances, batch_img_metas,
                              batch_gt_instances_ignore)
        losses = self.loss_by_feat(*loss_inputs)
        predictions = self.predict_by_feat(*outs,
                                           batch_img_metas=batch_img_metas,
                                           cfg=proposal_cfg)
        return losses, predictions

    def forward(self, img_feats: Tuple[Tensor],
                txt_feats: Tensor) -> Tuple[List]:
        """Forward features from the upstream network."""
        return self.head_module(img_feats, txt_feats)

    def predict(self,
                img_feats: Tuple[Tensor],
                txt_feats: Tensor,
                batch_data_samples: SampleList,
                rescale: bool = False) -> InstanceList:
        """Perform forward propagation of the detection head and predict
        detection results on the features of the upstream network.
        """
        batch_img_metas = [
            data_samples.metainfo for data_samples in batch_data_samples
        ]
        outs = self(img_feats, txt_feats)
        predictions = self.predict_by_feat(*outs,
                                           batch_img_metas=batch_img_metas,
                                           rescale=rescale)
        return predictions

    def aug_test(self,
                 aug_batch_feats,
                 aug_batch_img_metas,
                 rescale=False,
                 with_ori_nms=False,
                 **kwargs):
        """Test function with test time augmentation."""
        raise NotImplementedError('aug_test is not implemented yet.')
.\YOLO-World\yolo_world\models\dense_heads\yolo_world_seg_head.py
import math
from typing import List, Optional, Tuple, Union, Sequence
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from torch.nn.modules.batchnorm import _BatchNorm
from mmcv.cnn import ConvModule
from mmengine.config import ConfigDict
from mmengine.dist import get_dist_info
from mmengine.structures import InstanceData
from mmdet.structures import SampleList
from mmdet.utils import (ConfigType, OptConfigType, OptInstanceList,
OptMultiConfig, InstanceList)
from mmdet.models.utils import multi_apply, unpack_gt_instances
from mmyolo.models.dense_heads import YOLOv8HeadModule
from mmyolo.models.utils import gt_instances_preprocess
from mmyolo.registry import MODELS, TASK_UTILS
from mmyolo.models.dense_heads.yolov5_ins_head import (
ProtoModule, YOLOv5InsHead
)
from .yolo_world_head import ContrastiveHead, BNContrastiveHead
@MODELS.register_module()
class YOLOWorldSegHeadModule(YOLOv8HeadModule):
    """Head module for YOLO-World instance segmentation.

    Adds a mask-prototype branch and per-level mask-coefficient
    predictions on top of the YOLOv8 head module.
    """

    def __init__(self,
                 *args,
                 embed_dims: int,
                 proto_channels: int,
                 mask_channels: int,
                 freeze_bbox: bool = False,
                 use_bn_head: bool = False,
                 **kwargs) -> None:
        # Must be set before super().__init__, which builds the layers.
        self.freeze_bbox = freeze_bbox
        self.embed_dims = embed_dims
        self.proto_channels = proto_channels
        self.mask_channels = mask_channels
        self.use_bn_head = use_bn_head
        super().__init__(*args, **kwargs)

    def init_weights(self, prior_prob=0.01):
        """Initialize the weights and biases of the head."""
        super().init_weights()
        for cls_pred, cls_contrast, stride in zip(self.cls_preds,
                                                  self.cls_contrasts,
                                                  self.featmap_strides):
            cls_pred[-1].bias.data[:] = 0.0
            if hasattr(cls_contrast, 'bias'):
                # Prior-probability-style init for the contrastive logit bias.
                nn.init.constant_(
                    cls_contrast.bias.data,
                    math.log(5 / self.num_classes / (640 / stride)**2))

    def head_norm_eval(self):
        # Put all BatchNorm layers of the cls/reg branches into eval mode.
        for m in self.cls_preds:
            for q in m.modules():
                if isinstance(q, _BatchNorm):
                    q.eval()
        for m in self.reg_preds:
            for q in m.modules():
                if isinstance(q, _BatchNorm):
                    q.eval()

    def train(self, mode: bool = True):
        """Convert the model to training mode while keeping the bbox-branch
        normalization layers frozen when ``freeze_bbox`` is set."""
        super().train(mode)
        if self.freeze_bbox:
            self.head_norm_eval()

    def forward(self, img_feats: Tuple[Tensor],
                txt_feats: Tensor) -> Tuple[List]:
        """Forward features from the upstream network."""
        assert len(img_feats) == self.num_levels
        # The same text features are shared across all pyramid levels.
        txt_feats = [txt_feats for _ in range(self.num_levels)]
        # Mask prototypes come from the highest-resolution level only.
        mask_protos = self.proto_pred(img_feats[0])
        cls_logit, bbox_preds, bbox_dist_preds, coeff_preds = multi_apply(
            self.forward_single, img_feats, txt_feats, self.cls_preds,
            self.reg_preds, self.cls_contrasts, self.seg_preds)
        if self.training:
            return cls_logit, bbox_preds, bbox_dist_preds, coeff_preds, mask_protos
        else:
            return cls_logit, bbox_preds, None, coeff_preds, mask_protos

    def forward_single(self, img_feat: Tensor, txt_feat: Tensor,
                       cls_pred: nn.ModuleList, reg_pred: nn.ModuleList,
                       cls_contrast: nn.ModuleList,
                       seg_pred: nn.ModuleList) -> Tuple:
        """Forward feature of a single scale level."""
        b, _, h, w = img_feat.shape
        cls_embed = cls_pred(img_feat)
        cls_logit = cls_contrast(cls_embed, txt_feat)
        bbox_dist_preds = reg_pred(img_feat)
        coeff_pred = seg_pred(img_feat)
        if self.reg_max > 1:
            # Distribution-focal-loss style projection onto bin positions.
            bbox_dist_preds = bbox_dist_preds.reshape(
                [-1, 4, self.reg_max, h * w]).permute(0, 3, 1, 2)
            bbox_preds = bbox_dist_preds.softmax(3).matmul(
                self.proj.view([-1, 1])).squeeze(-1)
            bbox_preds = bbox_preds.transpose(1, 2).reshape(b, -1, h, w)
        else:
            bbox_preds = bbox_dist_preds
        if self.training:
            return cls_logit, bbox_preds, bbox_dist_preds, coeff_pred
        else:
            return cls_logit, bbox_preds, None, coeff_pred
@MODELS.register_module()
class YOLOWorldSegHead(YOLOv5InsHead):
    """YOLO World segmentation head.

    BUG FIX: in the original file the ``loss_and_predict`` definition was
    truncated mid-signature and its body was spliced into a second,
    duplicate ``forward`` definition that referenced undefined names
    (``batch_data_samples``, ``proposal_cfg``) and shadowed the real
    ``forward``. The methods are reconstructed here to mirror
    ``YOLOWorldHead``.
    """

    def special_init(self):
        """Since YOLO series algorithms will inherit from YOLOv5Head, but
        different algorithms have special initialization process.

        The special_init function is designed to deal with this situation.
        """
        if self.train_cfg:
            self.assigner = TASK_UTILS.build(self.train_cfg.assigner)
            # Training-time caches, filled lazily on the first loss step.
            self.featmap_sizes_train = None
            self.num_level_priors = None
            self.flatten_priors_train = None
            self.stride_tensor = None

    def loss(self, img_feats: Tuple[Tensor], txt_feats: Tensor,
             batch_data_samples: Union[list, dict]) -> dict:
        """Perform forward propagation and loss calculation of the detection
        head on the features of the upstream network."""
        outs = self(img_feats, txt_feats)
        # Fast path: batch_data_samples is the dict layout produced by the
        # YOLO-World data preprocessor (includes instance masks).
        loss_inputs = outs + (batch_data_samples['bboxes_labels'],
                              batch_data_samples['masks'],
                              batch_data_samples['img_metas'])
        losses = self.loss_by_feat(*loss_inputs)
        return losses

    def loss_and_predict(
        self,
        img_feats: Tuple[Tensor],
        txt_feats: Tensor,
        batch_data_samples: SampleList,
        proposal_cfg: Optional[ConfigDict] = None
    ) -> Tuple[dict, InstanceList]:
        """Perform forward propagation of the head, then calculate loss and
        predictions from the features and data samples."""
        outputs = unpack_gt_instances(batch_data_samples)
        (batch_gt_instances, batch_gt_instances_ignore,
         batch_img_metas) = outputs
        outs = self(img_feats, txt_feats)
        loss_inputs = outs + (batch_gt_instances, batch_img_metas,
                              batch_gt_instances_ignore)
        losses = self.loss_by_feat(*loss_inputs)
        predictions = self.predict_by_feat(*outs,
                                           batch_img_metas=batch_img_metas,
                                           cfg=proposal_cfg)
        return losses, predictions

    def forward(self, img_feats: Tuple[Tensor],
                txt_feats: Tensor) -> Tuple[List]:
        """Forward features from the upstream network."""
        return self.head_module(img_feats, txt_feats)

    def predict(self,
                img_feats: Tuple[Tensor],
                txt_feats: Tensor,
                batch_data_samples: SampleList,
                rescale: bool = False) -> InstanceList:
        """Perform forward propagation of the detection head and predict
        detection results on the features of the upstream network.
        """
        batch_img_metas = [
            data_samples.metainfo for data_samples in batch_data_samples
        ]
        outs = self(img_feats, txt_feats)
        predictions = self.predict_by_feat(*outs,
                                           batch_img_metas=batch_img_metas,
                                           rescale=rescale)
        return predictions

    def aug_test(self,
                 aug_batch_feats,
                 aug_batch_img_metas,
                 rescale=False,
                 with_ori_nms=False,
                 **kwargs):
        """Test function with test time augmentation."""
        raise NotImplementedError('aug_test is not implemented yet.')
.\YOLO-World\yolo_world\models\dense_heads\__init__.py
from .yolo_world_head import YOLOWorldHead, YOLOWorldHeadModule
from .yolo_world_seg_head import YOLOWorldSegHead, YOLOWorldSegHeadModule
# Public API of the dense_heads subpackage.
__all__ = [
    'YOLOWorldHead', 'YOLOWorldHeadModule', 'YOLOWorldSegHead',
    'YOLOWorldSegHeadModule'
]
.\YOLO-World\yolo_world\models\detectors\yolo_world.py
from typing import List, Tuple, Union
from torch import Tensor
from mmdet.structures import OptSampleList, SampleList
from mmyolo.models.detectors import YOLODetector
from mmyolo.registry import MODELS
@MODELS.register_module()
class YOLOWorldDetector(YOLODetector):
    """Implementation of YOLOW Series

    Args:
        mm_neck (bool): Whether the neck also consumes text features.
        num_train_classes (int): Number of classes assumed during training.
        num_test_classes (int): Number of classes assumed during testing.
            NOTE(review): not read in this chunk — at predict time the
            class count is taken from the text features instead; verify.
    """

    def __init__(self,
                 *args,
                 mm_neck: bool = False,
                 num_train_classes=80,
                 num_test_classes=80,
                 **kwargs) -> None:
        self.mm_neck = mm_neck
        self.num_train_classes = num_train_classes
        self.num_test_classes = num_test_classes
        super().__init__(*args, **kwargs)

    def loss(self, batch_inputs: Tensor,
             batch_data_samples: SampleList) -> Union[dict, list]:
        """Calculate losses from a batch of inputs and data samples."""
        # The head's class count follows the training vocabulary size.
        self.bbox_head.num_classes = self.num_train_classes
        img_feats, txt_feats = self.extract_feat(batch_inputs,
                                                 batch_data_samples)
        losses = self.bbox_head.loss(img_feats, txt_feats, batch_data_samples)
        return losses

    def predict(self,
                batch_inputs: Tensor,
                batch_data_samples: SampleList,
                rescale: bool = True) -> SampleList:
        """Predict results from a batch of inputs and data samples with post-
        processing.
        """
        img_feats, txt_feats = self.extract_feat(batch_inputs,
                                                 batch_data_samples)
        # At inference the vocabulary size is whatever the text branch
        # produced for the first image.
        self.bbox_head.num_classes = txt_feats[0].shape[0]
        results_list = self.bbox_head.predict(img_feats,
                                              txt_feats,
                                              batch_data_samples,
                                              rescale=rescale)
        batch_data_samples = self.add_pred_to_datasample(
            batch_data_samples, results_list)
        return batch_data_samples

    def _forward(
            self,
            batch_inputs: Tensor,
            batch_data_samples: OptSampleList = None) -> Tuple[List[Tensor]]:
        """Network forward process. Usually includes backbone, neck and head
        forward without any post-processing.
        """
        img_feats, txt_feats = self.extract_feat(batch_inputs,
                                                 batch_data_samples)
        results = self.bbox_head.forward(img_feats, txt_feats)
        return results

    def extract_feat(
            self, batch_inputs: Tensor,
            batch_data_samples: SampleList) -> Tuple[Tuple[Tensor], Tensor]:
        """Extract features."""
        # Texts arrive either in the collated-dict layout (training fast
        # path) or attached to individual data samples (inference).
        if isinstance(batch_data_samples, dict):
            texts = batch_data_samples['texts']
        elif isinstance(batch_data_samples, list):
            texts = [data_sample.texts for data_sample in batch_data_samples]
        else:
            raise TypeError('batch_data_samples should be dict or list.')
        img_feats, txt_feats = self.backbone(batch_inputs, texts)
        if self.with_neck:
            if self.mm_neck:
                # Multi-modal neck fuses text features into image features.
                img_feats = self.neck(img_feats, txt_feats)
            else:
                img_feats = self.neck(img_feats)
        return img_feats, txt_feats