CLIP Basics: Zero-Shot Vision with CLIP

OpenAI CLIP (Contrastive Language-Image Pre-training) connects images and text in a shared embedding space, enabling zero-shot image classification, semantic image search, and multimodal retrieval without any task-specific training.

# Install required packages
# !pip install transformers pillow requests torch torchvision
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import requests
from io import BytesIO
import matplotlib.pyplot as plt
import numpy as np

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

1. Load CLIP Model

# Load CLIP model
model_name = "openai/clip-vit-large-patch14"  # better quality (recommended)
# Smaller, faster alternative:
# "openai/clip-vit-base-patch32"
# For full vision-language understanding, consider GPT-4V, LLaVA 1.6, or Qwen2-VL
# (see "Modern Alternatives" at the end of this notebook)

model = CLIPModel.from_pretrained(model_name)
processor = CLIPProcessor.from_pretrained(model_name)

# Move to GPU if available and switch to inference mode
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model.eval()

print(f"Model: {model_name}")
print(f"Device: {device}")

2. Load and Display Images

def load_image(url_or_path):
    """Load an image from a URL or local path and convert it to RGB."""
    if url_or_path.startswith('http'):
        response = requests.get(url_or_path, timeout=30)
        response.raise_for_status()
        image = Image.open(BytesIO(response.content))
    else:
        image = Image.open(url_or_path)
    return image.convert('RGB')

# Sample images (replace with your own)
image_urls = [
    "https://images.unsplash.com/photo-1574158622682-e40e69881006",  # Cat
    "https://images.unsplash.com/photo-1552053831-71594a27632d",  # Dog
    "https://images.unsplash.com/photo-1511593358241-7eea1f3c84e5",  # Leopard
]

images = [load_image(url) for url in image_urls]

# Display
fig, axes = plt.subplots(1, len(images), figsize=(15, 5))
for ax, img in zip(axes, images):
    ax.imshow(img)
    ax.axis('off')
plt.tight_layout()
plt.show()

3. Zero-Shot Image Classification

Classify images without any training!

def classify_image(image, labels):
    """Classify image with zero-shot CLIP."""
    # Prepare inputs
    inputs = processor(
        text=labels,
        images=image,
        return_tensors="pt",
        padding=True
    ).to(device)
    
    # Get predictions
    with torch.no_grad():
        outputs = model(**inputs)
        logits_per_image = outputs.logits_per_image
        probs = logits_per_image.softmax(dim=1)
    
    return probs.cpu().numpy()[0]

# Define categories
labels = [
    "a photo of a cat",
    "a photo of a dog",
    "a photo of a wild animal",
    "a photo of a bird",
    "a photo of a car"
]

# Classify each image
for i, img in enumerate(images):
    probs = classify_image(img, labels)
    
    print(f"\nImage {i+1} probabilities:")
    for label, prob in zip(labels, probs):
        print(f"  {label}: {prob:.2%}")
    
    # Show top prediction
    top_idx = np.argmax(probs)
    print(f"  βœ… Prediction: {labels[top_idx]} ({probs[top_idx]:.2%})")

4. Image-Text Similarity

Find which text descriptions match an image best.

def compute_similarity(image, texts):
    """Compute similarity between image and multiple texts."""
    inputs = processor(
        text=texts,
        images=image,
        return_tensors="pt",
        padding=True
    ).to(device)
    
    with torch.no_grad():
        outputs = model(**inputs)
        # Image-text logits (cosine similarity scaled by CLIP's learned temperature)
        similarity = outputs.logits_per_image
    
    return similarity.cpu().numpy()[0]

# Detailed descriptions
descriptions = [
    "a cute fluffy cat sleeping on a couch",
    "an energetic dog playing in a park",
    "a dangerous predator hunting in the wild",
    "a colorful bird perched on a branch",
    "modern sports car on a highway"
]

# Test with first image
scores = compute_similarity(images[0], descriptions)

print("Similarity scores for first image:")
for desc, score in zip(descriptions, scores):
    print(f"  {score:6.2f} - {desc}")

# Visualize
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

ax1.imshow(images[0])
ax1.set_title("Image")
ax1.axis('off')

ax2.barh(range(len(descriptions)), scores)
ax2.set_yticks(range(len(descriptions)))
ax2.set_yticklabels([d[:40] + '...' if len(d) > 40 else d for d in descriptions])
ax2.set_xlabel('Similarity Score')
ax2.set_title('Text Similarity')

plt.tight_layout()
plt.show()
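
Note that logits_per_image is not a plain cosine similarity: it is the cosine similarity scaled by CLIP's learned temperature. A minimal sketch for recovering the unscaled values, assuming the Hugging Face CLIPModel exposes the log-temperature as the logit_scale parameter:

# Recover unscaled cosine similarities by dividing out the learned temperature
# (assumes model.logit_scale as in the Hugging Face CLIPModel)
with torch.no_grad():
    temperature = model.logit_scale.exp().item()

raw_cosine = scores / temperature  # scores from compute_similarity above
print(f"Learned temperature: {temperature:.2f}")
for desc, cos in zip(descriptions, raw_cosine):
    print(f"  {cos:.3f} - {desc}")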

5. Image Search by Text

Find the best matching image for a text query.

def search_images(images, query):
    """Search for images matching text query."""
    scores = []
    
    for img in images:
        inputs = processor(
            text=[query],
            images=img,
            return_tensors="pt"
        ).to(device)
        
        with torch.no_grad():
            outputs = model(**inputs)
            score = outputs.logits_per_image.item()
            scores.append(score)
    
    # Sort by score
    ranked = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)
    return ranked

# Search queries
queries = [
    "a small domestic pet",
    "a wild predator with spots",
    "man's best friend"
]

for query in queries:
    print(f"\nQuery: '{query}'")
    results = search_images(images, query)
    
    print("Top matches:")
    for rank, (idx, score) in enumerate(results, 1):
        print(f"  {rank}. Image {idx+1} (score: {score:.2f})")
    
    # Show top result
    top_idx = results[0][0]
    plt.figure(figsize=(6, 4))
    plt.imshow(images[top_idx])
    plt.title(f"Best match for: {query}")
    plt.axis('off')
    plt.show()
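
The loop above runs one forward pass per image. Since the processor also accepts a list of images, a single query can be scored against every image in one call; a small sketch:

# Score one query against all images in a single forward pass
query = "a wild predator with spots"
inputs = processor(
    text=[query],
    images=images,  # list of images in one batch
    return_tensors="pt",
    padding=True
).to(device)

with torch.no_grad():
    logits = model(**inputs).logits_per_image.squeeze(-1)  # shape: (num_images,)

for i, score in enumerate(logits.cpu().tolist()):
    print(f"Image {i+1}: {score:.2f}")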

6. Image Embeddings

Extract feature vectors for downstream tasks.

def get_image_embedding(image):
    """Get image embedding vector."""
    inputs = processor(images=image, return_tensors="pt").to(device)
    
    with torch.no_grad():
        image_features = model.get_image_features(**inputs)
    
    return image_features.cpu().numpy()[0]

def get_text_embedding(text):
    """Get text embedding vector."""
    inputs = processor(text=text, return_tensors="pt", padding=True).to(device)
    
    with torch.no_grad():
        text_features = model.get_text_features(**inputs)
    
    return text_features.cpu().numpy()[0]

# Get embeddings
img_embedding = get_image_embedding(images[0])
text_embedding = get_text_embedding("a photo of a cat")

print(f"Image embedding shape: {img_embedding.shape}")
print(f"Text embedding shape: {text_embedding.shape}")

# Compute cosine similarity manually
from numpy.linalg import norm
similarity = np.dot(img_embedding, text_embedding) / (norm(img_embedding) * norm(text_embedding))
print(f"\nCosine similarity: {similarity:.4f}")

7. Practical Application: Content Moderation

def moderate_image(image, threshold=0.5):
    """Flag potentially inappropriate content."""
    safety_labels = [
        "safe for work content",
        "inappropriate content",
        "violent content",
        "adult content"
    ]
    
    probs = classify_image(image, safety_labels)
    
    # Check whether any non-safe category exceeds the threshold
    if max(probs[1:]) > threshold:
        flagged_idx = np.argmax(probs[1:]) + 1  # offset by 1: index 0 is the safe label
        return True, safety_labels[flagged_idx], probs[flagged_idx]
    else:
        return False, "safe", probs[0]

# Test
for i, img in enumerate(images):
    is_flagged, category, confidence = moderate_image(img)
    status = "🚫 FLAGGED" if is_flagged else "✅ SAFE"
    print(f"Image {i+1}: {status} - {category} ({confidence:.2%})")

Tips & Best Practices

Prompt Engineering for CLIP

✅ Good prompts (see the prompt-ensembling sketch after this list):

  • "a photo of a {object}"

  • "a {adjective} {object}"

  • "a {object} in {context}"

❌ Bad prompts:

  • Just the object name ("cat")

  • Too generic ("thing")

  • Overly complex descriptions
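
Building on the good-prompt templates above, the CLIP paper's prompt-ensembling trick averages text embeddings over several templates per class, which usually makes zero-shot classification less sensitive to exact wording. A minimal sketch; the templates, class names, and the 100.0 scale below are illustrative choices:

# Prompt ensembling: average text embeddings over several templates per class
templates = ["a photo of a {}", "a close-up photo of a {}", "a blurry photo of a {}"]
class_names = ["cat", "dog", "leopard"]

text_embs = []
for name in class_names:
    prompts = [t.format(name) for t in templates]
    inputs = processor(text=prompts, return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        feats = model.get_text_features(**inputs)
    feats = feats / feats.norm(dim=-1, keepdim=True)
    text_embs.append(feats.mean(dim=0))  # average over templates
text_embs = torch.stack(text_embs)
text_embs = text_embs / text_embs.norm(dim=-1, keepdim=True)

# Score the first image against the ensembled class embeddings
img_inputs = processor(images=images[0], return_tensors="pt").to(device)
with torch.no_grad():
    img_feat = model.get_image_features(**img_inputs)
img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
ens_probs = (100.0 * img_feat @ text_embs.T).softmax(dim=-1)[0]
for name, p in zip(class_names, ens_probs.cpu().tolist()):
    print(f"  {name}: {p:.2%}")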

Performance Tips

# Batch processing for efficiency
inputs = processor(
    text=labels,
    images=images,  # multiple images in one call
    return_tensors="pt",
    padding=True
).to(device)
with torch.no_grad():
    batch_probs = model(**inputs).logits_per_image.softmax(dim=1)  # (num_images, num_labels)

# Use the larger model for better accuracy (already loaded above)
# "openai/clip-vit-large-patch14"

# Cache embeddings for repeated queries
image_embeddings = {}
for i, img in enumerate(images):
    image_embeddings[i] = get_image_embedding(img)

Common Use Cases

  1. Product search: Find products by description

  2. Content moderation: Flag inappropriate images

  3. Image tagging: Auto-generate tags

  4. Recommendation: "Similar images"

  5. Quality control: "Is this a good product photo?"

Exercise: Build Image Search Engine

Create a simple image search with your own photos.

class ImageSearchEngine:
    def __init__(self, model, processor):
        self.model = model
        self.processor = processor
        self.images = []
        self.embeddings = []
    
    def add_image(self, image, metadata=None):
        """Add image to search index."""
        embedding = get_image_embedding(image)
        self.images.append({"image": image, "metadata": metadata})
        self.embeddings.append(embedding)
    
    def search(self, query, top_k=5):
        """Search for images by text query."""
        query_embedding = get_text_embedding(query)
        
        # Compute similarities
        scores = []
        for img_emb in self.embeddings:
            similarity = np.dot(query_embedding, img_emb) / \
                        (norm(query_embedding) * norm(img_emb))
            scores.append(similarity)
        
        # Get top k
        top_indices = np.argsort(scores)[-top_k:][::-1]
        results = [(i, scores[i]) for i in top_indices]
        
        return results

# Create search engine
search_engine = ImageSearchEngine(model, processor)

# Index images
for img in images:
    search_engine.add_image(img)

# Search
results = search_engine.search("wild animal", top_k=3)

print("Search results for 'wild animal':")
for idx, score in results:
    print(f"  Image {idx}: {score:.4f}")

Key Takeaways

  1. Zero-shot power: Classify without training!

  2. Flexible: Works with any text description

  3. Fast: Real-time inference on GPU

  4. Embeddings: Universal image & text features

  5. Production-ready: widely used in real retrieval, tagging, and moderation systems

Limitations

  • Not great for fine-grained distinctions

  • Sensitive to prompt wording

  • May have biases from training data

  • Better with common objects/scenes

Modern Alternatives (December 2025)

For Better Vision-Language Understanding:

  • GPT-4V / GPT-4o - Best overall (multimodal, API)

  • LLaVA 1.6 - Open-source, runs locally (7B/13B/34B)

  • Qwen2-VL - Qwen's vision model (competitive with GPT-4V)

  • CogVLM2 - Strong open-source option

  • CLIP - Still good for embeddings/zero-shot classification

When to Use CLIP:

  • Zero-shot image classification

  • Image-text similarity/search

  • Embedding generation

  • Real-time applications (fast)

When to Use Modern VLMs:

  • Complex image understanding

  • Image captioning/description

  • Visual question answering

  • OCR + understanding

Next Steps

  • 02_llava.ipynb - Open-source vision-language model (2025 recommended)

  • 03_qwen2_vl.ipynb - Qwen's vision model

  • 04_gpt4v.ipynb - GPT-4 Vision for complex understanding

  • 05_image_captioning.ipynb - Generate image descriptions