# 基于 CLIP
的图像物体辨别
具身智能依赖多模态,因此学 CLIP
是必不可少的。看了朱毅老师对 CLIP
论文的逐段精读,觉得 CLIP
这篇工作简直无愧于多模态的开山之作。于是自然而然想到做一些基于 CLIP
的实践,以表示有所学成。
本实践是基于 CLIP
实现对图像内物体的分类和辨别。 CLIP
是基于对比学习的多模态模型,能够学习到文本和图像之间的联系,可以做到文本和图像的匹配。
# 运行环境
# 基础
- IDE:我的 IDE 是 Pycharm 2023.3.4 专业版,当然用 Visual Studio Code 也行,只需要有 Python 插件即可。
- Python 版本:尽量在 3.8 以上。
- 运行代码方式:在 Pycharm 中右键选择绿色的三角形,或在 Visual Studio Code 中点击运行,均可运行代码;但为了统一,本文选择在 IDE 的终端直接运行 python 代码。方式很简单,只要输入 python 再加上 py 文件的路径即可。
- 操作系统:Windows 和 Linux 均可。
# (可选)进阶
更推荐在 IDE
的终端切入 Anaconda
虚拟环境,选择在虚拟环境中运行 python
代码。
如图 Retinexformer_env
就是一个自定义的虚拟环境,这个虚拟环境中的包和 python
版本与其他虚拟环境不同。有兴趣可以了解 Anaconda
。
# 安装相关依赖包与 CLIP
源
在终端输入:
pip install ftfy regex tqdm | |
pip install git+https://github.com/openai/CLIP.git |
两行 pip
代码会在当前环境下安装 CLIP
运行的相关依赖包。
# 核心代码
# 导入相关包
import torch | |
import clip | |
from PIL import Image |
# 规定推理运算设备
device = "cuda" if torch.cuda.is_available() else "cpu" |
如果你的电脑没有 GPU
就会使用 CPU
去推理。
# 下载模型
model, preprocess = clip.load("ViT-L/14", device=device) |
这里使用最佳的模型: ViT-L/14 进行推理。常见的模型还有: ViT-B/32、ViT-B/16、RN50、RN101、RN50x4、RN50x16、RN50x64。
# 导入图片
image = preprocess(Image.open(r"a-racing-car.jpg")).unsqueeze(0).to(device) |
使用 Image
包导入图像,并装载到计算设备上。
# 定义图像类别的文本并符号化
我参考了 ImageNet
中 1000 个类名,归纳最主要的类名以用于图像的辨别和分类。相较于实际的 ImageNet
的 1000 个类,辨别和分类的类别有所减少。将所有归纳好的类名标签组成一个列表,名为 my_text
。
然后,将标签进行 `prompt engineering` 并符号化(tokenize),再装载到计算设备上:
text_token = clip.tokenize(my_text).to(device) |
# 计算图像与文本的相关性
# Run the forward pass without autograd tracking. Otherwise the logits still
# require grad and the .numpy() conversion below raises a RuntimeError.
with torch.no_grad():
    logits_per_image, logits_per_text = model(image, text_token)
# Softmax over the label axis turns the image-to-text scores into probabilities.
probs = logits_per_image.softmax(dim=-1).cpu().numpy()
logits_per_image
是对每一张图像而言的,多个文本标签与这个图像的相关性得分。 logits_per_text
则是对每个文本标签而言的,多个图像在这个文本标签的相似度。 probs
也可以理解为相关性得分,只是用 softmax
映射了一下。
# 打印分类结果
# argmax returns the index of the highest-probability label for the first
# (and only) image, replacing the two-pass list(...).index(max(...)) lookup.
print("A photo of " + my_text[probs[0].argmax()])
# 核心代码
import torch | |
import clip | |
from PIL import Image | |
device = "cuda" if torch.cuda.is_available() else "cpu" | |
model, preprocess = clip.load("ViT-L/14", device=device) | |
# ViT-B/32 or ViT-B/16 or ViT-L/14 or RN50 or RN101 or RN50x4 or RN50x16 or RN50x64 | |
image = preprocess(Image.open(r"a-racing-car.jpg")).unsqueeze(0).to(device) | |
my_text = ["fish", "cock", "hen", "ostrich", "bird", "hawk", "gecko", "toad", "frog", | |
"turtle", "lizard", "alligator", "dinosaur", "snake", "fossil", "spider", | |
"centipede", "peacock", "duck", "goose", "elephant", "Hedgehog", "platypus", "kangaroo", "koala", "marmot", "jellyfish", | |
"coral", "sea life", "sea snake", "conch", "snail", "crab", "lobster", "hermit crab", "penguin", "whale", "walrus", | |
"sea lion", "dog", "wolf", "fox", "cat", "leopard", "lion", "tiger", "bear", "mongoose", "insects", "dragonfly", | |
"butterfly", "starfish", "sea urchin", "rabbit", "rat", "squirrel", "horse", "zebra", "pig", "hippopotamus", | |
"cow", "sheep", "camel", "alpaca", "raccoon", "pangolin", "sloth", "baboon", "orangutan", "monkey", "panda", | |
"abacus", "muslim", "bachelor's uniform", "accordion", "raccoon", "guitar", "aircraft carrier", "airplane", | |
"airship", "church", "ambulance", "amphibious vehicle", "box", "clock", "apron", "trash can", "gun", "backpack", | |
"Breadbox", "Gymnastics", "Hot Air Balloon", "Pen", "Bondi", "Musical Instruments", "Stairs", "Barbells", "Seats", | |
"Haircuts", "Chalets", "Tables", "Wine Barrels", "Trolleys", "Baseball", "Basketball", "Baby", | |
"Musical Instruments", "Swimming", "Baby Towels", "Bathtubs", "cars", "lighthouses", "beakers", "guards", | |
"beer", "buildings", "bibs", "double bikes", "bikinis", "notebooks", "telescopes", "letterboxes", "huts", | |
"sledding", "necklaces", "hats", "bookcases", "bookstores", "bottle caps", "bows and arrows", "bow ties", | |
"tombstones", "brassieres", "armor", "brooms", "brooms", "bucket", "belt", "bulletproof vest", "train", "butcher shop", | |
"taxi", "pot", "candle", "cannon", "canoe", "stapler", "sweater", "car mirror", "carousel", "tool kit", | |
"box", "car wheel", "ATM", "cassette", "cassette player", "castle", "catamaran", "CD player", "cello", | |
"cell phone", "chain", "chainlink fence", "chainsaw", "chest", "commode", "chime", "cabinet", "sock", | |
"church", "cinema", "cleaver", "cliff dwelling", "cloak", "shoe", "bottle", "cup", "pot", "coil", "lock", | |
"keyboard", "candy", "ship", "car", "corkscrew", "horn", "boot", "hat", "crib", "crane", "helmet", "crate", | |
"crib", "Crock Pot", "croquet ball", "crutch", "cuirass", "dam", "desk", "desktop computer", "dial telephone", | |
"diaper", "digital clock", "digital watch", "dining table", "dishcloth", "dishwasher", "car wheel", "dock", | |
"dogsleigh", "dome", "doormat", "oilfield", "drum", "drumstick", "dumbbell", "Dutch oven", "electric fan", | |
"electric guitar", "train", "television", "letter", "coffee maker", "cosmetics","feather boa", "cabinet", | |
"fountain", "fire engine", "fireplace", "flagpole", "flute", "seat", "football", "forklift", "fountain", | |
"fountain pen", "bed", "train", "French horn", "frying pan", "fur coat", "truck", "mask", "gas pump", | |
"goblet", "go-kart", "golf", "golf cart", "boat", "gong", "wedding dress", "piano", "greenhouse", "car", | |
"grocery store", "guillotine", "hair slide", "hair spray", "tank", "hammer", "bamboo tube", "hair dryer", | |
"POS terminal", "handkerchief", "hard disk", "harmonica", "harp", "crane", "hatchet", "pistol", "television", | |
"honeycomb", "hook", "skirt", "gymnastics", "carriage", "hourglass", "music player", "iron", "jack-o’-lantern", | |
"jeans", "jeep", "T-shirt", "jigsaw puzzle", "rickshaw", "joystick", "kimono", "knee pad", "knot", "lab coat", | |
"ladle", "lamp", "laptop", "lawn mower", "lens cap", "knife", "library", "lifeboat", "lighter", "limousine", | |
"ship", "lipstick", "shoe", "lotion", "loudspeaker", "loupe", "log", "compass", "bag", "mailbox", "swimsuit", | |
"swimsuit", "manhole cover", "maraca", "marimba", "mask", "match", "rope", "maze", "beaker", "refrigerator", | |
"pillar", "microphone", "microwave", "military uniform", "water jug", "minibus", "miniskirt", "minivan", "missile", | |
"gloves", "bowl", "motor home", "vintage car", "router", "building", "monitor", "motorcycle", "inkstone", "mortarboard", | |
"building", "mosquito net", "scooter", "bicycle", "tent", "keyboard and mouse", "mousetrap", "truck", "muzzle", | |
"nail", "neck brace", "necklace", "baby bottle", "notebook", "building", "clarinet", "ocarina", "dashboard", | |
"oil filter", "organ", "oscilloscope", "dress", "oxcart", "respirator", "snack", "paddle", "paddle wheel", | |
"padlock", "brush", "pajamas", "building", "musical instrument", "paper towel", "parachute", "parallel bars", | |
"bench", "parking meter", "train", "yard", "payphone", "pillar", "pencil box", "pencil sharpener", "perfume", | |
"Petri dish", "printer", "pick", "helmet", "fence", "car", "bridge", "piggy bank", "pill", "pillow", "ping-pong ball", | |
"pinwheel", "sailboat", "teapot", "plane", "building", "plastic bag", "plate rack", "bulldozer", "plunger", | |
"camera", "pole", "police car", "poncho", "pool table", "bottle", "potted plant", "pottery", "drill", "blanket", | |
"printer", "prison", "missile", "projector", "puck", "punching bag", "handbag", "quill pen", "bed", "racing car", | |
"tennis", "heater", "radio", "satellite receiver", "wine barrel", "mobile home", "fishing rod", "camera", "refrigerator", | |
"remote control", "restaurant", "pistol", "sniper rifle", "rocking chair", "oven", "eraser", "rugby ball", "ruler", | |
"running shoes", "safe", "paperclip", "salt shaker", "slippers", "long skirt", "saxophone", "sword", "scale", | |
"school bus", "sailboat", "scoreboard", "monitor", "screw", "screwdriver", "seatbelt", "sewing machine", | |
"shield", "shoe store", "tatami", "shopping basket", "shopping cart", "shovel", "shower cap", "shower curtain", | |
"skiing", "ski mask", "sleeping bag", "slide caliper", "sliding door", "slot machine", "swimming goggles", "snowmobile", | |
"snowplow", "soap dispenser", "soccer ball", "socks", "solar panel", "hat", "bowl", "keyboard", "space heater", | |
"space shuttle", "spatula", "speedboat", "spider web", "yarn", "sports car", "spotlight", "band", "steam locomotive", | |
"bridge", "drum", "stethoscope", "stretcher", "stone pile", "stopwatch", "stove", "strainer", "streetcar", "stretcher", | |
"sofa", "palace", "ship", "suit", "sundial", "sunglasses", "sunglasses", "sunscreen", "bridge", "mop", "hoodie", | |
"swimming trunks", "swing", "switch", "syringe", "table lamp", "tank", "tape player", "teapot", "teddy bear", | |
"television", "tennis ball", "thatch", "theater curtain", "thimble", "armored vehicle", "throne", "tile", "toaster", | |
"tobacco shop", "toilet", "torch", "totem pole", "tow truck", "toy store", "tractor", "trailer truck", "plate", | |
"trench coat", "tricycle", "boat", "tripod", "arch", "bus", "trombone", "bathtub", "turnstile", "typewriter", | |
"umbrella", "unicycle", "piano", "vacuum cleaner", "vase", "vault", "velvet", "vending machine", "vestment", | |
"bridge", "violin", "volleyball", "waffle iron", "wall clock", "wallet", "wardrobe", "airplane", "washbasin", | |
"washing machine", "water bottle", "water jug", "water tower", "water jug", "whistle", "wig", "window", "window shade", | |
"tie", "wine", "airplane", "wok", "spoon", "scarf", "fence", "wreck", "sailboat", "yurt", "website", "poster", | |
"crossword puzzle", "traffic sign", "traffic light", "book", "menu", "dish", "cake", "ice cream", "popsicle", | |
"bread", "doughnut", "bread", "hamburger", "hot dog", "mashed potato", "vegetable", "broccoli", "cauliflower", | |
"vegetable", "spaghetti squash", "pumpkin", "cucumber", "vegetable", "bell pepper", "flower", "mushroom", | |
"apple", "strawberry", "orange", "lemon", "fruit", "pineapple", "banana", "durian", "fruit", "pomegranate", | |
"alp", "bubble", "cliff", "coral", "hot spring", "scenery", "island", "beach", "seashore", "waterfall", "volcano", | |
"wedding", "diving", "rapeseed", "daisy", "plant", "corn", "pinecone", "chestnut", "mushroom", "corn", "toilet paper", | |
"woman", "man" | |
] # 类名 | |
your_idea_text = "a car"  # Your own guess for what the image shows.
my_text.append(your_idea_text)  # The guess competes with the predefined labels.
text_token = clip.tokenize(my_text).to(device)
# Inference only: disable autograd so the outputs can be converted to numpy.
with torch.no_grad():
    # Stand-alone embeddings; they are not needed for the prediction itself.
    image_features = model.encode_image(image)
    text_features = model.encode_text(text_token)
    logits_per_image, logits_per_text = model(image, text_token)
    # Softmax over the label axis turns the scores into probabilities.
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()
# Report the label with the highest probability for the first (only) image.
print("A photo of " + my_text[probs[0].argmax()])
# 效果
以这张图片为例:
运行代码后,识别出的图片内容:
# 探索代码
可以发现总代码的 image_features
和 text_features
并没有用上。其实, image_features
是所有图像特征的集合, text_features
是所有文本标签的特征集合。可以用 PCA
主成分分析法可视化出来。为了简便,删除大部分 my_text
文本标签,只保留若干个标签。
my_text = ["fish", "cock", "hen", "ostrich", "bird", "hawk", "gecko", "toad", "frog"] |
依然,你的猜测是:
your_idea_text = "a car" # 你对这张图片的辨别。 |
导入相关包:
from sklearn.decomposition import PCA | |
import matplotlib.pyplot as plt |
先将图像特征和文本特征拼接到一起方便处理:
all_features = torch.cat([image_features, text_features], dim=0).cpu().numpy() |
建立模型,处理图像特征和文本特征,将高维特征映射到二维平面特征:
pca = PCA(n_components=2) | |
features_2d = pca.fit_transform(all_features) |
可视化,首先将图像的特征打点:
plt.figure(figsize=(10, 10)) | |
plt.scatter(features_2d[0, 0], features_2d[0, 1], color='red', label='Image Feature') |
接下来打点文本特征:
# Row 0 of features_2d was plotted above as the image feature, so the text
# features start at row 1.
for i, text in enumerate(my_text, start=1):
    plt.scatter(features_2d[i, 0], features_2d[i, 1], label=f'Text Feature: {text}')
显示图例,横纵轴标签等,并最终显示:
plt.legend() | |
plt.title("PCA of Image and Text Features") | |
plt.xlabel("PCA Component 1") | |
plt.ylabel("PCA Component 2") | |
plt.show() |
可视化图像为:
我们丰富一下猜测的文本:
your_idea_text = "a racing car with road-blocks" # 你对这张图片的识别。 |
可视化出来可以看到猜测的文本特征与图像特征更接近了:
更进一步的丰富,加上了路障的颜色以及车子在地上的描述:
your_idea_text = "a racing car on the floor with some red and blue road-blocks besides" # 你对这张图片的识别。 |
可视化出来,可见这次修改后,猜测文本在横轴上的数值超过了 5,进一步向图像特征靠近:
这也就说明,跨模态的信息可以被统一地映射到同一个特征空间进行学习,这就为多模态的研究开辟了道路。
# 总代码
import numpy as np | |
import torch | |
import clip | |
from PIL import Image | |
device = "cuda" if torch.cuda.is_available() else "cpu" | |
model, preprocess = clip.load("ViT-L/14", device=device) | |
# ViT-B/32 or ViT-B/16 or ViT-L/14 or RN50 or RN101 or RN50x4 or RN50x16 or RN50x64 | |
image = preprocess(Image.open(r"a-racing-car.jpg")).unsqueeze(0).to(device) | |
my_text = ["fish", "cock", "hen", "ostrich", "bird", "hawk", "gecko", "toad", "frog", | |
"turtle", "lizard", "alligator", "dinosaur", "snake", "fossil", "spider", | |
"centipede", "peacock", "duck", "goose", "elephant", "Hedgehog", "platypus", "kangaroo", "koala", "marmot", "jellyfish", | |
"coral", "sea life", "sea snake", "conch", "snail", "crab", "lobster", "hermit crab", "penguin", "whale", "walrus", | |
"sea lion", "dog", "wolf", "fox", "cat", "leopard", "lion", "tiger", "bear", "mongoose", "insects", "dragonfly", | |
"butterfly", "starfish", "sea urchin", "rabbit", "rat", "squirrel", "horse", "zebra", "pig", "hippopotamus", | |
"cow", "sheep", "camel", "alpaca", "raccoon", "pangolin", "sloth", "baboon", "orangutan", "monkey", "panda", | |
"abacus", "muslim", "bachelor's uniform", "accordion", "raccoon", "guitar", "aircraft carrier", "airplane", | |
"airship", "church", "ambulance", "amphibious vehicle", "box", "clock", "apron", "trash can", "gun", "backpack", | |
"Breadbox", "Gymnastics", "Hot Air Balloon", "Pen", "Bondi", "Musical Instruments", "Stairs", "Barbells", "Seats", | |
"Haircuts", "Chalets", "Tables", "Wine Barrels", "Trolleys", "Baseball", "Basketball", "Baby", | |
"Musical Instruments", "Swimming", "Baby Towels", "Bathtubs", "cars", "lighthouses", "beakers", "guards", | |
"beer", "buildings", "bibs", "double bikes", "bikinis", "notebooks", "telescopes", "letterboxes", "huts", | |
"sledding", "necklaces", "hats", "bookcases", "bookstores", "bottle caps", "bows and arrows", "bow ties", | |
"tombstones", "brassieres", "armor", "brooms", "brooms", "bucket", "belt", "bulletproof vest", "train", "butcher shop", | |
"taxi", "pot", "candle", "cannon", "canoe", "stapler", "sweater", "car mirror", "carousel", "tool kit", | |
"box", "car wheel", "ATM", "cassette", "cassette player", "castle", "catamaran", "CD player", "cello", | |
"cell phone", "chain", "chainlink fence", "chainsaw", "chest", "commode", "chime", "cabinet", "sock", | |
"church", "cinema", "cleaver", "cliff dwelling", "cloak", "shoe", "bottle", "cup", "pot", "coil", "lock", | |
"keyboard", "candy", "ship", "car", "corkscrew", "horn", "boot", "hat", "crib", "crane", "helmet", "crate", | |
"crib", "Crock Pot", "croquet ball", "crutch", "cuirass", "dam", "desk", "desktop computer", "dial telephone", | |
"diaper", "digital clock", "digital watch", "dining table", "dishcloth", "dishwasher", "car wheel", "dock", | |
"dogsleigh", "dome", "doormat", "oilfield", "drum", "drumstick", "dumbbell", "Dutch oven", "electric fan", | |
"electric guitar", "train", "television", "letter", "coffee maker", "cosmetics","feather boa", "cabinet", | |
"fountain", "fire engine", "fireplace", "flagpole", "flute", "seat", "football", "forklift", "fountain", | |
"fountain pen", "bed", "train", "French horn", "frying pan", "fur coat", "truck", "mask", "gas pump", | |
"goblet", "go-kart", "golf", "golf cart", "boat", "gong", "wedding dress", "piano", "greenhouse", "car", | |
"grocery store", "guillotine", "hair slide", "hair spray", "tank", "hammer", "bamboo tube", "hair dryer", | |
"POS terminal", "handkerchief", "hard disk", "harmonica", "harp", "crane", "hatchet", "pistol", "television", | |
"honeycomb", "hook", "skirt", "gymnastics", "carriage", "hourglass", "music player", "iron", "jack-o’-lantern", | |
"jeans", "jeep", "T-shirt", "jigsaw puzzle", "rickshaw", "joystick", "kimono", "knee pad", "knot", "lab coat", | |
"ladle", "lamp", "laptop", "lawn mower", "lens cap", "knife", "library", "lifeboat", "lighter", "limousine", | |
"ship", "lipstick", "shoe", "lotion", "loudspeaker", "loupe", "log", "compass", "bag", "mailbox", "swimsuit", | |
"swimsuit", "manhole cover", "maraca", "marimba", "mask", "match", "rope", "maze", "beaker", "refrigerator", | |
"pillar", "microphone", "microwave", "military uniform", "water jug", "minibus", "miniskirt", "minivan", "missile", | |
"gloves", "bowl", "motor home", "vintage car", "router", "building", "monitor", "motorcycle", "inkstone", "mortarboard", | |
"building", "mosquito net", "scooter", "bicycle", "tent", "keyboard and mouse", "mousetrap", "truck", "muzzle", | |
"nail", "neck brace", "necklace", "baby bottle", "notebook", "building", "clarinet", "ocarina", "dashboard", | |
"oil filter", "organ", "oscilloscope", "dress", "oxcart", "respirator", "snack", "paddle", "paddle wheel", | |
"padlock", "brush", "pajamas", "building", "musical instrument", "paper towel", "parachute", "parallel bars", | |
"bench", "parking meter", "train", "yard", "payphone", "pillar", "pencil box", "pencil sharpener", "perfume", | |
"Petri dish", "printer", "pick", "helmet", "fence", "car", "bridge", "piggy bank", "pill", "pillow", "ping-pong ball", | |
"pinwheel", "sailboat", "teapot", "plane", "building", "plastic bag", "plate rack", "bulldozer", "plunger", | |
"camera", "pole", "police car", "poncho", "pool table", "bottle", "potted plant", "pottery", "drill", "blanket", | |
"printer", "prison", "missile", "projector", "puck", "punching bag", "handbag", "quill pen", "bed", "racing car", | |
"tennis", "heater", "radio", "satellite receiver", "wine barrel", "mobile home", "fishing rod", "camera", "refrigerator", | |
"remote control", "restaurant", "pistol", "sniper rifle", "rocking chair", "oven", "eraser", "rugby ball", "ruler", | |
"running shoes", "safe", "paperclip", "salt shaker", "slippers", "long skirt", "saxophone", "sword", "scale", | |
"school bus", "sailboat", "scoreboard", "monitor", "screw", "screwdriver", "seatbelt", "sewing machine", | |
"shield", "shoe store", "tatami", "shopping basket", "shopping cart", "shovel", "shower cap", "shower curtain", | |
"skiing", "ski mask", "sleeping bag", "slide caliper", "sliding door", "slot machine", "swimming goggles", "snowmobile", | |
"snowplow", "soap dispenser", "soccer ball", "socks", "solar panel", "hat", "bowl", "keyboard", "space heater", | |
"space shuttle", "spatula", "speedboat", "spider web", "yarn", "sports car", "spotlight", "band", "steam locomotive", | |
"bridge", "drum", "stethoscope", "stretcher", "stone pile", "stopwatch", "stove", "strainer", "streetcar", "stretcher", | |
"sofa", "palace", "ship", "suit", "sundial", "sunglasses", "sunglasses", "sunscreen", "bridge", "mop", "hoodie", | |
"swimming trunks", "swing", "switch", "syringe", "table lamp", "tank", "tape player", "teapot", "teddy bear", | |
"television", "tennis ball", "thatch", "theater curtain", "thimble", "armored vehicle", "throne", "tile", "toaster", | |
"tobacco shop", "toilet", "torch", "totem pole", "tow truck", "toy store", "tractor", "trailer truck", "plate", | |
"trench coat", "tricycle", "boat", "tripod", "arch", "bus", "trombone", "bathtub", "turnstile", "typewriter", | |
"umbrella", "unicycle", "piano", "vacuum cleaner", "vase", "vault", "velvet", "vending machine", "vestment", | |
"bridge", "violin", "volleyball", "waffle iron", "wall clock", "wallet", "wardrobe", "airplane", "washbasin", | |
"washing machine", "water bottle", "water jug", "water tower", "water jug", "whistle", "wig", "window", "window shade", | |
"tie", "wine", "airplane", "wok", "spoon", "scarf", "fence", "wreck", "sailboat", "yurt", "website", "poster", | |
"crossword puzzle", "traffic sign", "traffic light", "book", "menu", "dish", "cake", "ice cream", "popsicle", | |
"bread", "doughnut", "bread", "hamburger", "hot dog", "mashed potato", "vegetable", "broccoli", "cauliflower", | |
"vegetable", "spaghetti squash", "pumpkin", "cucumber", "vegetable", "bell pepper", "flower", "mushroom", | |
"apple", "strawberry", "orange", "lemon", "fruit", "pineapple", "banana", "durian", "fruit", "pomegranate", | |
"alp", "bubble", "cliff", "coral", "hot spring", "scenery", "island", "beach", "seashore", "waterfall", "volcano", | |
"wedding", "diving", "rapeseed", "daisy", "plant", "corn", "pinecone", "chestnut", "mushroom", "corn", "toilet paper", | |
"woman", "man" | |
] | |
# Alternative, more descriptive guesses for the image:
# your_idea_text = "a racing car on the floor with some red and blue road-blocks besides"
# your_idea_text = "a racing car with road-blocks"
your_idea_text = "a car "  # Your own guess for what the image shows.
my_text.append(your_idea_text)  # The guess competes with the predefined labels.
text_token = clip.tokenize(my_text).to(device)
# Inference only: disable autograd so the outputs can be converted to numpy.
with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text_token)
    logits_per_image, logits_per_text = model(image, text_token)
    # Softmax over the label axis turns the scores into probabilities.
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()
# Report the label with the highest probability for the first (only) image.
print("A photo of " + my_text[probs[0].argmax()])

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Stack the embeddings: row 0 is the image, rows 1.. follow the order of my_text.
all_features = torch.cat([image_features, text_features], dim=0).cpu().numpy()
# Project the high-dimensional embeddings onto two principal components.
pca = PCA(n_components=2)
features_2d = pca.fit_transform(all_features)
plt.figure(figsize=(10, 10))
plt.scatter(features_2d[0, 0], features_2d[0, 1], color='red', label='Image Feature')
# NOTE: each label gets its own legend entry, so shorten my_text to a handful
# of labels before plotting if the legend should stay readable.
for i, text in enumerate(my_text, start=1):
    plt.scatter(features_2d[i, 0], features_2d[i, 1], label=f'Text Feature: {text}')
plt.legend()
plt.title("PCA of Image and Text Features")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.show()