Multimodal RAG: Retrieval-Augmented Generation with Images

Extend RAG beyond text — index images alongside documents and retrieve the right visual context to answer questions about charts, diagrams, product photos, and more.

What Is Multimodal RAG?

Standard RAG: question → retrieve text chunks → LLM answers

Multimodal RAG: question → retrieve text + images → VLM answers

Images + Documents
      │
      ▼
 [CLIP Encoder]          ← shared embedding space
      │
      ▼
 [Vector Store]          ← images stored as embeddings
      │
  Query time:
      │
 User Question → [CLIP text encoder] → similarity search
      │
      ▼
 Top-K images retrieved
      │
      ▼
 [GPT-4o / Claude] ← question + retrieved images → answer

Use Cases

Use Case

What Gets Indexed

Query Example

E-commerce

Product photos

“Show me red running shoes”

Technical docs

Diagrams + text

“How does the circuit work?”

Medical

X-rays, scans

“Find cases with similar pathology”

News archive

Photos + articles

“Photos of the 2024 election”

Slide decks

Presentation slides

“Which slide covers pricing?”

# Install dependencies
# !pip install openai anthropic torch transformers pillow chromadb numpy requests

1. CLIP Embeddings for Image Indexing

import torch
import numpy as np
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import requests
from io import BytesIO

# Load CLIP for multimodal embeddings.
# One shared model/processor pair is created at module level and reused by
# every embed_* helper below; all tensors are moved to `device`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
clip_model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32').to(device)
clip_processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')

def embed_image(image: Image.Image) -> np.ndarray:
    """Encode a PIL image into a unit-norm 512-dim CLIP vector."""
    batch = clip_processor(images=image, return_tensors='pt').to(device)
    with torch.no_grad():
        vectors = clip_model.get_image_features(**batch)
        # L2-normalize so a plain dot product equals cosine similarity.
        vectors = vectors / vectors.norm(dim=-1, keepdim=True)
    return vectors.cpu().numpy()[0]

def embed_text(text: str) -> np.ndarray:
    """Encode a text query into a unit-norm 512-dim CLIP vector.

    Uses CLIP's text tower, so the result lives in the same embedding
    space as embed_image() outputs.
    """
    batch = clip_processor(text=[text], return_tensors='pt', padding=True).to(device)
    with torch.no_grad():
        vectors = clip_model.get_text_features(**batch)
        # L2-normalize so a plain dot product equals cosine similarity.
        vectors = vectors / vectors.norm(dim=-1, keepdim=True)
    return vectors.cpu().numpy()[0]

def load_image_from_url(url: str) -> Image.Image:
    """Download an image over HTTP and return it as an RGB PIL image.

    Raises:
        requests.HTTPError: on a non-2xx response. Previously a 404/500
        body was handed straight to PIL, which failed with a confusing
        "cannot identify image file" error instead.
    """
    response = requests.get(url, timeout=10)
    response.raise_for_status()  # fail fast on HTTP errors instead of feeding an error page to PIL
    return Image.open(BytesIO(response.content)).convert('RGB')

print(f'CLIP loaded on {device}. Embedding dim: 512')

2. Build a Simple Image Vector Store

import json
from pathlib import Path
from typing import List, Dict, Optional

class ImageVectorStore:
    """
    Simple in-memory vector store for images using CLIP embeddings.
    For production, use ChromaDB or Qdrant (see Section 3).
    """
    def __init__(self):
        self.embeddings: List[np.ndarray] = []  # unit-norm CLIP vectors, parallel to metadata/images
        self.metadata: List[Dict] = []  # image path, caption, tags, etc.
        self.images: List[Image.Image] = []  # originals kept in memory for display / VLM calls
    
    def add_image(self, image: Image.Image, metadata: Dict) -> int:
        """Embed an image and add it to the store. Returns its index.

        A shallow copy of ``metadata`` is stored so later mutation by the
        caller cannot silently change what is indexed (and vice versa).
        """
        embedding = embed_image(image)
        self.embeddings.append(embedding)
        self.metadata.append(dict(metadata))  # defensive copy — don't alias the caller's dict
        self.images.append(image)
        return len(self.embeddings) - 1
    
    def add_image_from_url(self, url: str, metadata: Dict) -> int:
        """Fetch an image from a URL and add it; the URL is recorded in metadata.

        Fix: previously this wrote 'url' into the caller's dict as a side
        effect; it now builds an augmented copy instead.
        """
        image = load_image_from_url(url)
        return self.add_image(image, {**metadata, 'url': url})
    
    def search_by_text(self, query: str, top_k: int = 3) -> List[Dict]:
        """Find images most similar to a text query (CLIP text encoder)."""
        query_embedding = embed_text(query)
        return self._search(query_embedding, top_k)
    
    def search_by_image(self, query_image: Image.Image, top_k: int = 3) -> List[Dict]:
        """Find images similar to a query image (reverse image search)."""
        query_embedding = embed_image(query_image)
        return self._search(query_embedding, top_k)
    
    def _search(self, query_embedding: np.ndarray, top_k: int) -> List[Dict]:
        """Rank all stored embeddings against the query; return up to top_k hits."""
        if not self.embeddings:
            return []
        
        # Cosine similarity: embeddings are already normalized, so dot == cosine.
        scores = np.array(self.embeddings) @ query_embedding
        top_indices = np.argsort(scores)[-top_k:][::-1]  # best score first
        
        return [
            {
                'index': int(i),
                'score': float(scores[i]),
                'metadata': self.metadata[i],
                'image': self.images[i]
            }
            for i in top_indices
        ]
    
    def __len__(self) -> int:
        """Number of images currently indexed."""
        return len(self.embeddings)

print('ImageVectorStore class ready.')
import matplotlib.pyplot as plt  # imported for notebook display; not referenced in the visible code

# Create store and index some sample images
store = ImageVectorStore()

# Demo corpus: each entry is fetched over HTTP and CLIP-embedded on add.
sample_images = [
    {
        'url': 'https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/PNG_transparency_demonstration_1.png/280px-PNG_transparency_demonstration_1.png',
        'caption': 'Geometric shapes with transparency',
        'tags': ['shapes', 'geometry', 'colors']
    },
]

print('Indexing sample images...')
# Network fetches can fail (offline, 404); report per item and keep going.
for item in sample_images:
    try:
        idx = store.add_image_from_url(
            item['url'],
            {'caption': item['caption'], 'tags': item['tags']}
        )
        print(f'  [{idx}] {item["caption"]}')
    except Exception as e:
        print(f'  Failed: {e}')

print(f'\nIndexed {len(store)} images.')

3. ChromaDB for Persistent Image Storage

# ChromaDB stores embeddings persistently on disk
import chromadb
import base64
from io import BytesIO

def image_to_base64(image: Image.Image) -> str:
    """Serialize a PIL image to a base64-encoded PNG string."""
    raw = BytesIO()
    image.save(raw, format='PNG')
    encoded = base64.b64encode(raw.getvalue())
    return encoded.decode()

def base64_to_image(b64: str) -> Image.Image:
    """Decode a base64 string (as produced by image_to_base64) into a PIL image."""
    raw_bytes = base64.b64decode(b64)
    return Image.open(BytesIO(raw_bytes))

class ChromaImageStore:
    """Persistent image store using ChromaDB.

    Embeddings come from CLIP; the image bytes themselves are kept
    base64-encoded inside the ChromaDB metadata, so the store is fully
    self-contained on disk.
    """
    
    def __init__(self, persist_dir: str = './image_store'):
        self.client = chromadb.PersistentClient(path=persist_dir)
        self.collection = self.client.get_or_create_collection(
            name='images',
            metadata={'hnsw:space': 'cosine'}  # cosine distance matches the normalized CLIP vectors
        )
    
    def add_image(self, image: Image.Image, doc_id: str, metadata: Dict) -> None:
        """Index an image with its CLIP embedding.

        Fix: the base64 payload is added to an internal copy of ``metadata``;
        previously the caller's dict was mutated as a side effect.
        """
        embedding = embed_image(image).tolist()
        record = {**metadata, 'image_b64': image_to_base64(image)}
        self.collection.add(
            embeddings=[embedding],
            ids=[doc_id],
            metadatas=[record]
        )
    
    def search(self, query: str, n_results: int = 3) -> List[Dict]:
        """Search by text query; returns dicts with id, similarity score, metadata, image."""
        query_embedding = embed_text(query).tolist()
        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=n_results
        )
        
        output = []
        for i, meta in enumerate(results['metadatas'][0]):
            b64 = meta.pop('image_b64', None)  # strip the bulky payload from the returned metadata
            output.append({
                'id': results['ids'][0][i],
                'score': 1 - results['distances'][0][i],  # cosine distance -> similarity
                'metadata': meta,
                'image': base64_to_image(b64) if b64 else None
            })
        return output
    
    def count(self) -> int:
        """Number of images currently indexed."""
        return self.collection.count()

print('ChromaImageStore class ready.')
print('Persists to disk — survives restarts.')

4. Visual QA Pipeline (RAG + VLM)

import base64
from openai import OpenAI

# Default OpenAI client — reads OPENAI_API_KEY from the environment.
openai_client = OpenAI()

def image_to_data_url(image: Image.Image) -> str:
    """Encode a PIL image as a PNG data URL for the OpenAI vision API."""
    png_buffer = BytesIO()
    image.save(png_buffer, format='PNG')
    encoded = base64.b64encode(png_buffer.getvalue()).decode()
    return f'data:image/png;base64,{encoded}'

def multimodal_rag_answer(
    question: str,
    image_store: ImageVectorStore,
    top_k: int = 2,
    model: str = 'gpt-4o'
) -> Dict:
    """
    Full Multimodal RAG pipeline:
    1. Search the image store for images relevant to the question
    2. Pass the question plus retrieved images to a vision model
    3. Return the answer together with source attribution
    """
    # Step 1: retrieve the top-k images for the question.
    hits = image_store.search_by_text(question, top_k=top_k)
    if not hits:
        return {'answer': 'No relevant images found.', 'sources': []}
    
    # Step 2: interleave caption text and image payloads in one user message.
    content = [{
        'type': 'text',
        'text': f'Answer the question using the provided images as context.\n\nQuestion: {question}'
    }]
    sources = []
    for i, hit in enumerate(hits):
        caption = hit['metadata'].get('caption', 'unknown')
        content.append({
            'type': 'text',
            'text': f'\n[Image {i+1}] Caption: {caption} (relevance: {hit["score"]:.2f})'
        })
        content.append({
            'type': 'image_url',
            # 'low' detail is cheaper; use 'high' when fine detail matters.
            'image_url': {'url': image_to_data_url(hit['image']), 'detail': 'low'}
        })
        sources.append(hit['metadata'])
    
    # Step 3: ask the vision model.
    response = openai_client.chat.completions.create(
        model=model,
        messages=[{'role': 'user', 'content': content}],
        max_tokens=500
    )
    
    return {
        'answer': response.choices[0].message.content,
        'sources': sources,
        'retrieved_images': [h['image'] for h in hits]
    }

# Usage hint shown when the cell runs (single write, identical output).
print('\n'.join((
    'multimodal_rag_answer() ready.',
    '',
    'Usage:',
    '  result = multimodal_rag_answer("What shapes are in the diagram?", store)',
    '  print(result["answer"])',
)))

5. Document + Image Hybrid RAG

Index PDFs/documents with embedded images — retrieve both text chunks and figures.

# Hybrid store: text chunks + images in the same ChromaDB collection
# Useful for: technical manuals, research papers, slide decks

class HybridDocumentStore:
    """
    Mixed vector store for both text and image content.
    Text uses OpenAI embeddings (text-embedding-3-small), images use CLIP.
    Stored as two collections inside one persistent ChromaDB client so a
    single query can hit both.

    (Docstring corrected: the code has always used text-embedding-3-small,
    not ada-002, and keeps text and images in separate collections.)
    """
    def __init__(self, persist_dir: str = './hybrid_store'):
        self.client = chromadb.PersistentClient(path=persist_dir)
        self.text_collection = self.client.get_or_create_collection('text_chunks')
        self.image_collection = self.client.get_or_create_collection('images')
    
    def add_text_chunk(self, text: str, chunk_id: str, metadata: Dict) -> None:
        """Index a text chunk with OpenAI embeddings.

        Fix: stores the chunk text in an internal copy of ``metadata``
        instead of mutating the caller's dict.
        """
        response = openai_client.embeddings.create(
            model='text-embedding-3-small',
            input=text
        )
        embedding = response.data[0].embedding
        record = {**metadata, 'text': text}
        self.text_collection.add(
            embeddings=[embedding],
            ids=[chunk_id],
            metadatas=[record]
        )
    
    def add_image(self, image: Image.Image, image_id: str, metadata: Dict) -> None:
        """Index an image with CLIP embeddings.

        Fix: the base64 payload goes into an internal copy of ``metadata``
        instead of being written into the caller's dict.
        """
        embedding = embed_image(image).tolist()
        buffer = BytesIO()
        image.save(buffer, format='PNG')
        record = {**metadata, 'image_b64': base64.b64encode(buffer.getvalue()).decode()}
        self.image_collection.add(
            embeddings=[embedding],
            ids=[image_id],
            metadatas=[record]
        )
    
    def search(self, query: str, n_text: int = 2, n_images: int = 2) -> Dict:
        """Search both text and image collections with one natural-language query."""
        # Text side: embed the query with the same OpenAI model used for indexing.
        text_response = openai_client.embeddings.create(
            model='text-embedding-3-small', input=query
        )
        text_embedding = text_response.data[0].embedding
        
        text_results = self.text_collection.query(
            query_embeddings=[text_embedding], n_results=n_text
        )
        
        # Image side: route the text query through CLIP's text encoder.
        image_embedding = embed_text(query).tolist()
        image_results = self.image_collection.query(
            query_embeddings=[image_embedding], n_results=n_images
        )
        
        return {
            'text_chunks': text_results['metadatas'][0] if text_results['metadatas'] else [],
            'images': [
                {
                    'metadata': m,
                    'image': base64_to_image(m.get('image_b64', '')) if m.get('image_b64') else None
                }
                for m in (image_results['metadatas'][0] if image_results['metadatas'] else [])
            ]
        }

print('HybridDocumentStore ready.')
print('Stores text chunks + images, searchable with a single query.')

6. Using Claude for Multimodal RAG

import anthropic

# Default Anthropic client — reads ANTHROPIC_API_KEY from the environment.
claude = anthropic.Anthropic()

def multimodal_rag_claude(
    question: str,
    image_store: ImageVectorStore,
    top_k: int = 2
) -> str:
    """
    Multimodal RAG using Claude Vision.
    Claude handles up to 20 images per request and 200K token context.

    Returns the model's answer, or a fixed message when retrieval comes
    back empty (mirrors multimodal_rag_answer, which also short-circuits
    instead of calling the model with zero images).
    """
    results = image_store.search_by_text(question, top_k=top_k)
    if not results:
        # Consistency fix: previously an empty retrieval still hit the API
        # with a context-free prompt.
        return 'No relevant images found.'
    
    content = []
    for i, result in enumerate(results):
        # Claude expects raw base64 (no data-URL prefix) plus an explicit media type.
        buffer = BytesIO()
        result['image'].save(buffer, format='PNG')
        b64 = base64.b64encode(buffer.getvalue()).decode()
        
        content.append({'type': 'text', 'text': f'[Image {i+1}: {result["metadata"].get("caption", "")}]'})
        content.append({
            'type': 'image',
            'source': {
                'type': 'base64',
                'media_type': 'image/png',
                'data': b64
            }
        })
    
    content.append({'type': 'text', 'text': f'\nBased on the images above, {question}'})
    
    response = claude.messages.create(
        model='claude-sonnet-4-6',
        max_tokens=500,
        messages=[{'role': 'user', 'content': content}]
    )
    return response.content[0].text

print('multimodal_rag_claude() ready — uses Claude Vision for answers.')
print('Claude advantage: 200K context → can handle many more retrieved images.')

7. E-Commerce Image Search Example

# Realistic use case: product catalog search
# User types a description β†’ find visually matching products

class ProductCatalogSearch:
    """
    Product catalog with visual + text search.
    Indexes product images and descriptions together.
    """
    def __init__(self):
        self.store = ImageVectorStore()
    
    def index_product(self, product: Dict) -> None:
        """
        Index a product. product dict should have:
          - image: PIL Image
          - name, price, category, description

        Fix: operates on a copy, so the caller's dict no longer loses its
        'image' key as a side effect of indexing.
        """
        fields = dict(product)       # defensive copy — don't mutate the caller's dict
        image = fields.pop('image')  # the image is embedded; remaining fields become metadata
        self.store.add_image(image, fields)
    
    def search(self, query: str, top_k: int = 5) -> List[Dict]:
        """Find products matching a text description."""
        results = self.store.search_by_text(query, top_k=top_k)
        return [
            {
                'name': r['metadata'].get('name'),
                'price': r['metadata'].get('price'),
                'category': r['metadata'].get('category'),
                'score': r['score'],
                'image': r['image']
            }
            for r in results
        ]
    
    def find_similar(self, product_image: Image.Image, top_k: int = 5) -> List[Dict]:
        """Find visually similar products (reverse image search)."""
        return self.store.search_by_image(product_image, top_k=top_k)

print('ProductCatalogSearch ready.')
print()
print('Workflow:')
print('  1. Index all product images with metadata')
print('  2. User searches: "red leather jacket under $200"')
print('  3. CLIP finds visually + semantically matching products')
print('  4. GPT-4o ranks and explains why each result matches')

8. Performance Tips for Production

# 1. Batch embedding for large catalogs
def embed_images_batch(images: list, batch_size: int = 32) -> np.ndarray:
    """Process images in batches for efficiency."""
    all_embeddings = []
    for i in range(0, len(images), batch_size):
        batch = images[i:i+batch_size]
        inputs = clip_processor(images=batch, return_tensors='pt', padding=True).to(device)
        with torch.no_grad():
            features = clip_model.get_image_features(**inputs)
            features = features / features.norm(dim=-1, keepdim=True)
        all_embeddings.append(features.cpu().numpy())
    return np.vstack(all_embeddings)

# 2. Use larger CLIP for better quality
# 'openai/clip-vit-large-patch14' → 768-dim (better)
# 'openai/clip-vit-base-patch32'  → 512-dim (faster)

# 3. Use Qdrant for scale (millions of images)
# from qdrant_client import QdrantClient
# client = QdrantClient(':memory:')  # or host='localhost'

# 4. Cache embeddings — don't re-embed unchanged images
# Store embeddings to disk: np.save('embeddings.npy', embeddings)

# 5. Use 'detail: low' for GPT-4o to reduce cost
# 'detail: low'  → 85 tokens per image (~$0.00085)
# 'detail: high' → up to 1700 tokens per image (for fine details)

Exercises

  1. Build a fashion search engine: index 20 clothing items and search by description.

  2. Index a PDF with diagrams (use pdf2image to extract pages as images) and answer questions.

  3. Compare retrieval quality: CLIP embeddings vs. GPT-4o captions → text embeddings.

  4. Build a “find similar product” reverse image search using search_by_image().

  5. Create a slide deck Q&A: convert presentation slides to images and query with natural language.