CLIP Basics: Zero-Shot Vision with CLIP¶
OpenAI CLIP (Contrastive Language-Image Pre-training) connects images and text in a shared embedding space, enabling zero-shot image classification, semantic image search, and multimodal retrieval without any task-specific training.
# Install required packages
# !pip install transformers pillow requests torch torchvision
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import requests
from io import BytesIO
import matplotlib.pyplot as plt
import numpy as np
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
1. Load CLIP Model¶
# Load CLIP model (December 2025)
model_name = "openai/clip-vit-large-patch14"
# Alternatives:
# "openai/clip-vit-base-patch32" - Faster, smaller (default in 2023)
# "openai/clip-vit-large-patch14" - Better quality (recommended 2025)
# For modern vision-language: Consider GPT-4V, LLaVA 1.6, Qwen2-VL
model = CLIPModel.from_pretrained(model_name)
processor = CLIPProcessor.from_pretrained(model_name)
# Move to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
print(f"Model: {model_name}")
print(f"Device: {device}")
2. Load and Display Images¶
def load_image(url_or_path):
"""Load image from URL or local path."""
if url_or_path.startswith('http'):
response = requests.get(url_or_path)
image = Image.open(BytesIO(response.content))
else:
image = Image.open(url_or_path)
return image
# Sample images (replace with your own)
image_urls = [
"https://images.unsplash.com/photo-1574158622682-e40e69881006", # Cat
"https://images.unsplash.com/photo-1552053831-71594a27632d", # Dog
"https://images.unsplash.com/photo-1511593358241-7eea1f3c84e5", # Leopard
]
images = [load_image(url) for url in image_urls]
# Display
fig, axes = plt.subplots(1, len(images), figsize=(15, 5))
for ax, img in zip(axes, images):
ax.imshow(img)
ax.axis('off')
plt.tight_layout()
plt.show()
3. Zero-Shot Image Classification¶
Classify images without any training!
def classify_image(image, labels):
"""Classify image with zero-shot CLIP."""
# Prepare inputs
inputs = processor(
text=labels,
images=image,
return_tensors="pt",
padding=True
).to(device)
# Get predictions
with torch.no_grad():
outputs = model(**inputs)
logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)
return probs.cpu().numpy()[0]
# Define categories
labels = [
"a photo of a cat",
"a photo of a dog",
"a photo of a wild animal",
"a photo of a bird",
"a photo of a car"
]
# Classify each image
for i, img in enumerate(images):
probs = classify_image(img, labels)
print(f"\nImage {i+1} probabilities:")
for label, prob in zip(labels, probs):
print(f" {label}: {prob:.2%}")
# Show top prediction
top_idx = np.argmax(probs)
print(f" β
Prediction: {labels[top_idx]} ({probs[top_idx]:.2%})")
4. Image-Text Similarity¶
Find which text descriptions match an image best.
def compute_similarity(image, texts):
"""Compute similarity between image and multiple texts."""
inputs = processor(
text=texts,
images=image,
return_tensors="pt",
padding=True
).to(device)
with torch.no_grad():
outputs = model(**inputs)
    # Note: logits_per_image is cosine similarity scaled by CLIP's learned temperature (logit_scale)
similarity = outputs.logits_per_image
return similarity.cpu().numpy()[0]
# Detailed descriptions
descriptions = [
"a cute fluffy cat sleeping on a couch",
"an energetic dog playing in a park",
"a dangerous predator hunting in the wild",
"a colorful bird perched on a branch",
"modern sports car on a highway"
]
# Test with first image
scores = compute_similarity(images[0], descriptions)
print("Similarity scores for first image:")
for desc, score in zip(descriptions, scores):
print(f" {score:6.2f} - {desc}")
# Visualize
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
ax1.imshow(images[0])
ax1.set_title("Image")
ax1.axis('off')
ax2.barh(range(len(descriptions)), scores)
ax2.set_yticks(range(len(descriptions)))
ax2.set_yticklabels([d[:40] + '...' if len(d) > 40 else d for d in descriptions])
ax2.set_xlabel('Similarity Score')
ax2.set_title('Text Similarity')
plt.tight_layout()
plt.show()
5. Image Search by Text¶
Find the best matching image for a text query.
def search_images(images, query):
"""Search for images matching text query."""
scores = []
for img in images:
inputs = processor(
text=[query],
images=img,
return_tensors="pt"
).to(device)
with torch.no_grad():
outputs = model(**inputs)
score = outputs.logits_per_image.item()
scores.append(score)
# Sort by score
ranked = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)
return ranked
# Search queries
queries = [
"a small domestic pet",
"a wild predator with spots",
"man's best friend"
]
for query in queries:
print(f"\nQuery: '{query}'")
results = search_images(images, query)
print("Top matches:")
for rank, (idx, score) in enumerate(results, 1):
print(f" {rank}. Image {idx+1} (score: {score:.2f})")
# Show top result
top_idx = results[0][0]
plt.figure(figsize=(6, 4))
plt.imshow(images[top_idx])
plt.title(f"Best match for: {query}")
plt.axis('off')
plt.show()
6. Image Embeddings¶
Extract feature vectors for downstream tasks.
def get_image_embedding(image):
"""Get image embedding vector."""
inputs = processor(images=image, return_tensors="pt").to(device)
with torch.no_grad():
image_features = model.get_image_features(**inputs)
return image_features.cpu().numpy()[0]
def get_text_embedding(text):
"""Get text embedding vector."""
inputs = processor(text=text, return_tensors="pt", padding=True).to(device)
with torch.no_grad():
text_features = model.get_text_features(**inputs)
return text_features.cpu().numpy()[0]
# Get embeddings
img_embedding = get_image_embedding(images[0])
text_embedding = get_text_embedding("a photo of a cat")
print(f"Image embedding shape: {img_embedding.shape}")
print(f"Text embedding shape: {text_embedding.shape}")
# Compute cosine similarity manually
from numpy.linalg import norm
similarity = np.dot(img_embedding, text_embedding) / (norm(img_embedding) * norm(text_embedding))
print(f"\nCosine similarity: {similarity:.4f}")
7. Practical Application: Content Moderation¶
def moderate_image(image, threshold=0.5):
"""Flag potentially inappropriate content."""
safety_labels = [
"safe for work content",
"inappropriate content",
"violent content",
"adult content"
]
probs = classify_image(image, safety_labels)
# Check if any inappropriate category is above threshold
if max(probs[1:]) > threshold:
        flagged_idx = np.argmax(probs[1:]) + 1  # argmax over the unsafe categories only
return True, safety_labels[flagged_idx], probs[flagged_idx]
else:
return False, "safe", probs[0]
# Test
for i, img in enumerate(images):
is_flagged, category, confidence = moderate_image(img)
status = "π« FLAGGED" if is_flagged else "β
SAFE"
print(f"Image {i+1}: {status} - {category} ({confidence:.2%})")
Tips & Best PracticesΒΆ
Prompt Engineering for CLIPΒΆ
✅ Good prompts (see the prompt-ensembling sketch below):
"a photo of a {object}"
"a {adjective} {object}"
"a {object} in {context}"
❌ Bad prompts:
Just the object name ("cat")
Too generic ("thing")
Overly complex descriptions
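A common refinement (used in the original CLIP paper) is prompt ensembling: embed several templates per class and average the normalized text embeddings. A minimal sketch, assuming the model, processor, device, and get_image_embedding defined above; the template list is an illustrative assumption:
# Prompt ensembling: average (re-normalized) text embeddings over several templates
templates = [
    "a photo of a {}",
    "a close-up photo of a {}",
    "a photo of a {} outdoors",
]

def ensembled_text_embedding(class_name):
    """Average text embeddings over prompt templates, then re-normalize."""
    prompts = [t.format(class_name) for t in templates]
    inputs = processor(text=prompts, return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        feats = model.get_text_features(**inputs)        # (n_templates, dim)
    feats = feats / feats.norm(dim=-1, keepdim=True)     # unit-normalize each template
    mean_feat = feats.mean(dim=0)
    return (mean_feat / mean_feat.norm()).cpu().numpy()

# Compare an image against ensembled class embeddings via cosine similarity
img_emb = get_image_embedding(images[0])
img_emb = img_emb / np.linalg.norm(img_emb)
for name in ["cat", "dog", "leopard"]:
    score = float(np.dot(img_emb, ensembled_text_embedding(name)))
    print(f"{name}: {score:.3f}")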
Performance Tips¶
# Batch processing for efficiency
inputs = processor(
text=labels,
images=images, # Multiple images
return_tensors="pt",
padding=True
).to(device)
# Use larger model for better accuracy
# "openai/clip-vit-large-patch14"
# Cache embeddings for repeated queries
image_embeddings = {}
for i, img in enumerate(images):
image_embeddings[i] = get_image_embedding(img)
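With embeddings cached and L2-normalized, ranking all indexed images against a new text query reduces to a single matrix multiply. A minimal sketch, assuming the image_embeddings cache from the cell above and get_text_embedding from section 6; the query string is just an example:
# Score every cached image embedding against one text query in a single matmul
img_matrix = np.stack([image_embeddings[i] for i in range(len(images))])
img_matrix = img_matrix / np.linalg.norm(img_matrix, axis=1, keepdims=True)

query_emb = get_text_embedding("a wild animal")
query_emb = query_emb / np.linalg.norm(query_emb)

cosine_scores = img_matrix @ query_emb      # one cosine score per indexed image
best = int(np.argmax(cosine_scores))
print(f"Best match: Image {best + 1} ({cosine_scores[best]:.3f})")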
Common Use Cases¶
Product search: Find products by description
Content moderation: Flag inappropriate images
Image tagging: Auto-generate tags (see the sketch below)
Recommendation: "Similar images"
Quality control: "Is this a good product photo?"
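A sketch of the auto-tagging use case mentioned above, reusing classify_image from section 3. The tag vocabulary and probability threshold are illustrative assumptions; note that the probabilities come from a softmax over the tag list, so this is a rough heuristic rather than true multi-label classification:
# Illustrative auto-tagging: keep tags whose softmax probability clears a cutoff
tag_vocabulary = [
    "a photo of an animal",
    "a photo of a pet",
    "an outdoor scene",
    "an indoor scene",
    "a photo of a vehicle",
]

def auto_tag(image, tags, threshold=0.15):
    """Return (tag, probability) pairs above the threshold, best first."""
    probs = classify_image(image, tags)
    keep = [(t, float(p)) for t, p in zip(tags, probs) if p > threshold]
    return sorted(keep, key=lambda x: x[1], reverse=True)

for i, img in enumerate(images):
    print(f"Image {i+1} tags: {auto_tag(img, tag_vocabulary)}")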
Exercise: Build Image Search Engine¶
Create a simple image search with your own photos.
class ImageSearchEngine:
def __init__(self, model, processor):
self.model = model
self.processor = processor
self.images = []
self.embeddings = []
def add_image(self, image, metadata=None):
"""Add image to search index."""
embedding = get_image_embedding(image)
self.images.append({"image": image, "metadata": metadata})
self.embeddings.append(embedding)
def search(self, query, top_k=5):
"""Search for images by text query."""
query_embedding = get_text_embedding(query)
# Compute similarities
scores = []
for img_emb in self.embeddings:
similarity = np.dot(query_embedding, img_emb) / \
(norm(query_embedding) * norm(img_emb))
scores.append(similarity)
# Get top k
top_indices = np.argsort(scores)[-top_k:][::-1]
results = [(i, scores[i]) for i in top_indices]
return results
# Create search engine
search_engine = ImageSearchEngine(model, processor)
# Index images
for img in images:
search_engine.add_image(img)
# Search
results = search_engine.search("wild animal", top_k=3)
print("Search results for 'wild animal':")
for idx, score in results:
print(f" Image {idx}: {score:.4f}")
Key Takeaways¶
Zero-shot power: Classify without training!
Flexible: Works with any text description
Fast: Real-time inference on GPU
Embeddings: Universal image & text features
Production-ready: Used by many companies
Limitations¶
Not great for fine-grained distinctions
Sensitive to prompt wording
May have biases from training data
Better with common objects/scenes
Modern Alternatives (December 2025)¶
For Better Vision-Language Understanding:
GPT-4V / GPT-4o - Best overall (multimodal, API)
LLaVA 1.6 - Open-source, runs locally (7B/13B/34B)
Qwen2-VL - Qwen's vision model (competitive with GPT-4V)
CogVLM2 - Strong open-source option
CLIP - Still good for embeddings/zero-shot classification
When to Use CLIP:
Zero-shot image classification
Image-text similarity/search
Embedding generation
Real-time applications (fast)
When to Use Modern VLMs:
Complex image understanding
Image captioning/description
Visual question answering
OCR + understanding
Next Steps¶
02_llava.ipynb - Open-source vision-language model (2025 recommended)
03_qwen2_vl.ipynb - Qwen's vision model
04_gpt4v.ipynb - GPT-4 Vision for complex understanding
05_image_captioning.ipynb - Generate image descriptions