环境
conda create -n clip python=3.8
打开 Command Prompt
conda activate clip
pip install torch
pip install torchvision
pip install ftfy regex tqdm
需要安装 git 客户端
pip install git+https://github.com/openai/CLIP.git
requirements.txt
pip freeze > requirements.txt
clip @ git+https://github.com/openai/CLIP.git@dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
colorama==0.4.6
filelock==3.16.1
fsspec==2025.2.0
ftfy==6.2.3
Jinja2==3.1.5
MarkupSafe==2.1.5
mpmath==1.3.0
networkx==3.1
numpy==1.24.4
packaging==24.2
pillow==10.4.0
regex==2024.11.6
sympy==1.13.3
torch==2.4.1
torchvision==0.19.1
tqdm==4.67.1
typing_extensions==4.12.2
wcwidth==0.2.13
main.py
import torch
import clip
from PIL import Image

# List the model names available in this CLIP release (e.g. "ViT-B/32", "RN50", ...).
print(clip.available_models())

# Prefer GPU when available; both the model and all tensors must live on the same device.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the pretrained ViT-B/32 model plus its matching image preprocessing pipeline
# (resize, center-crop, normalize). Downloads weights on first use.
model, preprocess = clip.load("ViT-B/32", device=device)
print(type(model))

# Preprocess the image and add a batch dimension: (3, 224, 224) -> (1, 3, 224, 224).
image = preprocess(Image.open("cat1.jpg")).unsqueeze(0).to(device)

# Tokenize the candidate text labels into a (3, 77) tensor of token ids.
text = clip.tokenize(["图表", "狗", "猫"]).to(device)

# Inference only — disable autograd to save memory and compute.
with torch.no_grad():
    image_features = model.encode_image(image)   # (1, 512) for ViT-B/32
    print("image_features", image_features.shape)
    text_features = model.encode_text(text)      # (3, 512)
    print("text_features", text_features.shape)

    # The forward pass returns cosine-similarity logits (scaled by the learned
    # temperature): per-image shape (1, 3), per-text shape (3, 1).
    logits_per_image, logits_per_text = model(image, text)
    print("logits_per_image", logits_per_image.shape, logits_per_image)
    print("logits_per_text", logits_per_text.shape, logits_per_text)

    # Softmax over the text labels gives the probability of each label for the image.
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()
    print("Label probs:", probs)  # prints: [[0.9927937 0.00421068 0.00299572]]