Using the Multimodal Model CLIP to Obtain Embeddings


Environment

conda create -n clip python=3.8

Open a Command Prompt and activate the environment:

conda activate clip

pip install torch

pip install torchvision

pip install ftfy regex tqdm

The next step installs CLIP directly from GitHub, so a Git client must be installed first:

pip install git+https://github.com/openai/CLIP.git
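To verify that the installation worked, list the model weights that ship with the package (run inside the activated clip environment):

python -c "import clip; print(clip.available_models())"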

requirements.txt

pip freeze > requirements.txt

clip @ git+https://github.com/openai/CLIP.git@dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
colorama==0.4.6
filelock==3.16.1
fsspec==2025.2.0
ftfy==6.2.3
Jinja2==3.1.5
MarkupSafe==2.1.5
mpmath==1.3.0
networkx==3.1
numpy==1.24.4
packaging==24.2
pillow==10.4.0
regex==2024.11.6
sympy==1.13.3
torch==2.4.1
torchvision==0.19.1
tqdm==4.67.1
typing_extensions==4.12.2
wcwidth==0.2.13
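To recreate this environment on another machine, the frozen list can be installed directly:

pip install -r requirements.txt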

main.py

import torch
import clip
from PIL import Image

# models available in the clip package
print(clip.available_models())

device = "cuda" if torch.cuda.is_available() else "cpu"
# downloads the ViT-B/32 weights to the local cache on first run
model, preprocess = clip.load("ViT-B/32", device=device)
print(type(model))

# preprocess resizes and normalizes the image; unsqueeze(0) adds a batch dimension
image = preprocess(Image.open("cat1.jpg")).unsqueeze(0).to(device)
# CLIP was trained on English image-text pairs, so English labels work best
text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device)

with torch.no_grad():
    # ViT-B/32 maps both images and text into a shared 512-dimensional space
    image_features = model.encode_image(image)
    print("image_features", image_features.shape)  # torch.Size([1, 512])
    text_features = model.encode_text(text)
    print("text_features", text_features.shape)  # torch.Size([3, 512])
    # the forward pass returns scaled cosine similarities between each image and each text
    logits_per_image, logits_per_text = model(image, text)
    print("logits_per_image", logits_per_image.shape, logits_per_image)
    print("logits_per_text", logits_per_text.shape, logits_per_text)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

print("Label probs:", probs)  # prints: [[0.9927937  0.00421068 0.00299572]]