Multimodal RAG: Retrieval-Augmented Generation with Images
Extend RAG beyond text — index images alongside documents and retrieve the right visual context to answer questions about charts, diagrams, product photos, and more.
What Is Multimodal RAG?
Standard RAG: question → retrieve text chunks → LLM answers
Multimodal RAG: question → retrieve text + images → VLM answers
Indexing time:

Images + Documents
        │
        ▼
[CLIP Encoder] → shared embedding space
        │
        ▼
[Vector Store] ← images stored as embeddings

Query time:

User Question → [CLIP text encoder] → similarity search
        │
        ▼
Top-K images retrieved
        │
        ▼
[GPT-4o / Claude] ← question + retrieved images → answer
Use CasesΒΆ
| Use Case | What Gets Indexed | Query Example |
|---|---|---|
| E-commerce | Product photos | "Show me red running shoes" |
| Technical docs | Diagrams + text | "How does the circuit work?" |
| Medical | X-rays, scans | "Find cases with similar pathology" |
| News archive | Photos + articles | "Photos of the 2024 election" |
| Slide decks | Presentation slides | "Which slide covers pricing?" |
# Install dependencies
# !pip install openai anthropic torch transformers pillow chromadb numpy requests
1. CLIP Embeddings for Image Indexing
import torch
import numpy as np
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import requests
from io import BytesIO
# Load CLIP once at module level for multimodal embeddings: the image and
# text encoders project into one shared space, so text queries can
# retrieve images directly by dot product.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# ViT-B/32 checkpoint → both encoders emit 512-dim vectors (see embed_image/embed_text).
clip_model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32').to(device)
clip_processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')
def embed_image(image: Image.Image) -> np.ndarray:
    """Return the L2-normalized 512-dim CLIP embedding of a PIL image."""
    batch = clip_processor(images=image, return_tensors='pt').to(device)
    with torch.no_grad():
        vec = clip_model.get_image_features(**batch)
        # Unit length → dot product with other embeddings equals cosine similarity.
        vec = vec / vec.norm(dim=-1, keepdim=True)
    return vec.cpu().numpy()[0]
def embed_text(text: str) -> np.ndarray:
    """Return the L2-normalized 512-dim CLIP embedding for a text query.

    truncation=True caps input at CLIP's 77-token context; without it a
    long query overruns the text encoder's position embeddings and raises
    an error instead of returning a vector.
    """
    inputs = clip_processor(
        text=[text], return_tensors='pt', padding=True, truncation=True
    ).to(device)
    with torch.no_grad():
        features = clip_model.get_text_features(**inputs)
        features = features / features.norm(dim=-1, keepdim=True)  # normalize
    return features.cpu().numpy()[0]
def load_image_from_url(url: str) -> Image.Image:
    """Download an image over HTTP and return it as an RGB PIL image.

    raise_for_status() surfaces HTTP errors (404/500) as a clear
    requests.HTTPError instead of letting PIL choke on an HTML error page.
    """
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return Image.open(BytesIO(response.content)).convert('RGB')
print(f'CLIP loaded on {device}. Embedding dim: 512')
2. Build a Simple Image Vector Store
import json
from pathlib import Path
from typing import List, Dict, Optional
class ImageVectorStore:
    """
    Simple in-memory vector store for images using CLIP embeddings.

    Embeddings, metadata, and PIL images live in three parallel lists
    indexed by insertion order. Search is a brute-force dot product, which
    equals cosine similarity because embed_image/embed_text return
    unit-normalized vectors. For production, use ChromaDB or Qdrant
    (see Section 3).
    """

    def __init__(self):
        self.embeddings: List[np.ndarray] = []
        self.metadata: List[Dict] = []  # image path, caption, tags, etc.
        self.images: List[Image.Image] = []

    def add_image(self, image: Image.Image, metadata: Dict) -> int:
        """Embed and store an image. Returns its index."""
        embedding = embed_image(image)
        self.embeddings.append(embedding)
        self.metadata.append(metadata)
        self.images.append(image)
        return len(self.embeddings) - 1

    def add_image_from_url(self, url: str, metadata: Dict) -> int:
        """Fetch image from URL and add to store.

        Copies the metadata dict before adding the 'url' key, so the
        caller's argument is never mutated.
        """
        image = load_image_from_url(url)
        meta = dict(metadata)
        meta['url'] = url
        return self.add_image(image, meta)

    def search_by_text(self, query: str, top_k: int = 3) -> List[Dict]:
        """Find images most similar to a text query."""
        return self._search(embed_text(query), top_k)

    def search_by_image(self, query_image: Image.Image, top_k: int = 3) -> List[Dict]:
        """Find images similar to a query image."""
        return self._search(embed_image(query_image), top_k)

    def _search(self, query_embedding: np.ndarray, top_k: int) -> List[Dict]:
        """Rank stored images by cosine similarity to query_embedding, best first."""
        if not self.embeddings:
            return []
        # Cosine similarity (embeddings are already normalized)
        scores = np.array(self.embeddings) @ query_embedding
        top_indices = np.argsort(scores)[-top_k:][::-1]  # highest score first
        return [
            {
                'index': int(i),
                'score': float(scores[i]),
                'metadata': self.metadata[i],
                'image': self.images[i]
            }
            for i in top_indices
        ]

    def __len__(self) -> int:
        return len(self.embeddings)
print('ImageVectorStore class ready.')
import matplotlib.pyplot as plt
# Build a store and index a few demo images fetched from the web.
store = ImageVectorStore()
sample_images = [
    {
        'url': 'https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/PNG_transparency_demonstration_1.png/280px-PNG_transparency_demonstration_1.png',
        'caption': 'Geometric shapes with transparency',
        'tags': ['shapes', 'geometry', 'colors']
    },
]
print('Indexing sample images...')
for entry in sample_images:
    # Network fetches can fail; report and keep going rather than abort the run.
    try:
        position = store.add_image_from_url(
            entry['url'],
            {'caption': entry['caption'], 'tags': entry['tags']}
        )
        print(f' [{position}] {entry["caption"]}')
    except Exception as e:
        print(f' Failed: {e}')
print(f'\nIndexed {len(store)} images.')
3. ChromaDB for Persistent Image Storage
# ChromaDB stores embeddings persistently on disk
import chromadb
import base64
from io import BytesIO
def image_to_base64(image: Image.Image) -> str:
    """Serialize a PIL image to a base64-encoded PNG string."""
    raw = BytesIO()
    image.save(raw, format='PNG')
    return base64.b64encode(raw.getvalue()).decode()
def base64_to_image(b64: str) -> Image.Image:
    """Inverse of image_to_base64: decode a base64 string into a PIL image."""
    raw = base64.b64decode(b64)
    return Image.open(BytesIO(raw))
class ChromaImageStore:
    """Persistent image store backed by ChromaDB.

    CLIP embeddings are indexed with cosine distance; the image bytes
    themselves ride along as base64 PNG in the metadata, so the store is
    fully self-contained on disk.
    """

    def __init__(self, persist_dir: str = './image_store'):
        self.client = chromadb.PersistentClient(path=persist_dir)
        self.collection = self.client.get_or_create_collection(
            name='images',
            metadata={'hnsw:space': 'cosine'}  # cosine distance suits normalized CLIP vectors
        )

    def add_image(self, image: Image.Image, doc_id: str, metadata: Dict) -> None:
        """Index an image under doc_id with its CLIP embedding.

        Copies the metadata dict before attaching the base64 payload, so
        the caller's argument is never mutated.
        """
        embedding = embed_image(image).tolist()
        meta = dict(metadata)
        meta['image_b64'] = image_to_base64(image)
        self.collection.add(
            embeddings=[embedding],
            ids=[doc_id],
            metadatas=[meta]
        )

    def search(self, query: str, n_results: int = 3) -> List[Dict]:
        """Search by text query; returns dicts with id, score, metadata, image."""
        query_embedding = embed_text(query).tolist()
        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=n_results
        )
        output = []
        for i, meta in enumerate(results['metadatas'][0]):
            b64 = meta.pop('image_b64', None)
            output.append({
                'id': results['ids'][0][i],
                'score': 1 - results['distances'][0][i],  # cosine distance → similarity
                'metadata': meta,
                'image': base64_to_image(b64) if b64 else None
            })
        return output

    def count(self) -> int:
        return self.collection.count()
print('ChromaImageStore class ready.')
print('Persists to disk — survives restarts.')
4. Visual QA Pipeline (RAG + VLM)
import base64
from openai import OpenAI
# Module-level client; NOTE(review): presumably picks up OPENAI_API_KEY
# from the environment (SDK default) — confirm in deployment config.
openai_client = OpenAI()
def image_to_data_url(image: Image.Image) -> str:
    """Encode a PIL image as a base64 PNG data URL for the OpenAI API."""
    sink = BytesIO()
    image.save(sink, format='PNG')
    encoded = base64.b64encode(sink.getvalue()).decode()
    return f'data:image/png;base64,{encoded}'
def multimodal_rag_answer(
    question: str,
    image_store: ImageVectorStore,
    top_k: int = 2,
    model: str = 'gpt-4o'
) -> Dict:
    """
    Full Multimodal RAG pipeline:
    1. Search the image store for images relevant to the question
    2. Pass question + retrieved images to GPT-4o
    3. Return the answer with source attribution
    """
    # Step 1: Retrieve relevant images
    hits = image_store.search_by_text(question, top_k=top_k)
    if not hits:
        return {'answer': 'No relevant images found.', 'sources': []}
    # Step 2: Assemble the multimodal user message (text + image parts)
    content = [{
        'type': 'text',
        'text': f'Answer the question using the provided images as context.\n\nQuestion: {question}'
    }]
    sources = []
    for position, hit in enumerate(hits):
        caption = hit["metadata"].get("caption", "unknown")
        content.append({
            'type': 'text',
            'text': f'\n[Image {position+1}] Caption: {caption} (relevance: {hit["score"]:.2f})'
        })
        content.append({
            'type': 'image_url',
            # 'low' = cheaper, 'high' = more detail
            'image_url': {'url': image_to_data_url(hit['image']), 'detail': 'low'}
        })
        sources.append(hit['metadata'])
    # Step 3: Ask the VLM
    response = openai_client.chat.completions.create(
        model=model,
        messages=[{'role': 'user', 'content': content}],
        max_tokens=500
    )
    return {
        'answer': response.choices[0].message.content,
        'sources': sources,
        'retrieved_images': [h['image'] for h in hits]
    }
print('multimodal_rag_answer() ready.')
print()
print('Usage:')
print(' result = multimodal_rag_answer("What shapes are in the diagram?", store)')
print(' print(result["answer"])')
5. Document + Image Hybrid RAG
Index PDFs/documents with embedded images — retrieve both text chunks and figures.
# Hybrid store: text chunks + images in the same ChromaDB collection
# Useful for: technical manuals, research papers, slide decks
class HybridDocumentStore:
    """
    Mixed vector store for both text and image content.

    Text chunks use OpenAI embeddings (text-embedding-3-small); images use
    CLIP. The two live in separate ChromaDB collections (their embedding
    dimensions differ), and search() queries both so a single call returns
    text and figures together.
    """

    def __init__(self, persist_dir: str = './hybrid_store'):
        self.client = chromadb.PersistentClient(path=persist_dir)
        self.text_collection = self.client.get_or_create_collection('text_chunks')
        self.image_collection = self.client.get_or_create_collection('images')

    def add_text_chunk(self, text: str, chunk_id: str, metadata: Dict) -> None:
        """Index a text chunk with OpenAI embeddings (metadata is not mutated)."""
        response = openai_client.embeddings.create(
            model='text-embedding-3-small',
            input=text
        )
        embedding = response.data[0].embedding
        meta = dict(metadata)  # copy: don't mutate the caller's dict
        meta['text'] = text
        self.text_collection.add(
            embeddings=[embedding],
            ids=[chunk_id],
            metadatas=[meta]
        )

    def add_image(self, image: Image.Image, image_id: str, metadata: Dict) -> None:
        """Index an image with CLIP embeddings (metadata is not mutated)."""
        embedding = embed_image(image).tolist()
        meta = dict(metadata)
        # Reuse the Section 3 helper instead of duplicating the PNG/base64 dance.
        meta['image_b64'] = image_to_base64(image)
        self.image_collection.add(
            embeddings=[embedding],
            ids=[image_id],
            metadatas=[meta]
        )

    def search(self, query: str, n_text: int = 2, n_images: int = 2) -> Dict:
        """Search both collections; returns {'text_chunks': [...], 'images': [...]}."""
        # Text search via OpenAI embeddings
        text_response = openai_client.embeddings.create(
            model='text-embedding-3-small', input=query
        )
        text_embedding = text_response.data[0].embedding
        text_results = self.text_collection.query(
            query_embeddings=[text_embedding], n_results=n_text
        )
        # Image search: route the text query through CLIP's text encoder
        image_embedding = embed_text(query).tolist()
        image_results = self.image_collection.query(
            query_embeddings=[image_embedding], n_results=n_images
        )
        return {
            'text_chunks': text_results['metadatas'][0] if text_results['metadatas'] else [],
            'images': [
                {
                    'metadata': m,
                    'image': base64_to_image(m.get('image_b64', '')) if m.get('image_b64') else None
                }
                for m in (image_results['metadatas'][0] if image_results['metadatas'] else [])
            ]
        }
print('HybridDocumentStore ready.')
print('Stores text chunks + images, searchable with a single query.')
6. Using Claude for Multimodal RAG
import anthropic
# Module-level client; NOTE(review): presumably picks up ANTHROPIC_API_KEY
# from the environment (SDK default) — confirm in deployment config.
claude = anthropic.Anthropic()
def multimodal_rag_claude(
    question: str,
    image_store: ImageVectorStore,
    top_k: int = 2
) -> str:
    """
    Multimodal RAG using Claude Vision.

    Claude handles up to 20 images per request and 200K token context.
    Short-circuits when retrieval comes back empty (consistent with
    multimodal_rag_answer) instead of asking Claude about images that
    were never attached.
    """
    results = image_store.search_by_text(question, top_k=top_k)
    if not results:
        return 'No relevant images found.'
    content = []
    for i, result in enumerate(results):
        # Claude's image blocks take raw base64 (no data-URL prefix)
        buffer = BytesIO()
        result['image'].save(buffer, format='PNG')
        b64 = base64.b64encode(buffer.getvalue()).decode()
        content.append({'type': 'text', 'text': f'[Image {i+1}: {result["metadata"].get("caption", "")}]'})
        content.append({
            'type': 'image',
            'source': {
                'type': 'base64',
                'media_type': 'image/png',
                'data': b64
            }
        })
    content.append({'type': 'text', 'text': f'\nBased on the images above, {question}'})
    response = claude.messages.create(
        model='claude-sonnet-4-6',
        max_tokens=500,
        messages=[{'role': 'user', 'content': content}]
    )
    return response.content[0].text
print('multimodal_rag_claude() ready — uses Claude Vision for answers.')
print('Claude advantage: 200K context — can handle many more retrieved images.')
7. E-Commerce Image Search Example
# Realistic use case: product catalog search
# User types a description β find visually matching products
class ProductCatalogSearch:
    """
    Product catalog with visual + text search.

    Wraps an ImageVectorStore: product images are CLIP-indexed and the
    remaining product fields (name, price, category, ...) ride along as
    metadata.
    """

    def __init__(self):
        self.store = ImageVectorStore()

    def index_product(self, product: Dict) -> None:
        """
        Index a product. product dict should have:
          - image: PIL Image
          - name, price, category, description

        The dict is copied before 'image' is removed, so the caller's
        argument is not mutated (previously .pop() destroyed it).
        """
        details = dict(product)
        image = details.pop('image')
        self.store.add_image(image, details)

    def search(self, query: str, top_k: int = 5) -> List[Dict]:
        """Find products matching a text description."""
        results = self.store.search_by_text(query, top_k=top_k)
        return [
            {
                'name': r['metadata'].get('name'),
                'price': r['metadata'].get('price'),
                'category': r['metadata'].get('category'),
                'score': r['score'],
                'image': r['image']
            }
            for r in results
        ]

    def find_similar(self, product_image: Image.Image, top_k: int = 5) -> List[Dict]:
        """Find visually similar products (reverse image search)."""
        return self.store.search_by_image(product_image, top_k=top_k)
print('ProductCatalogSearch ready.')
print()
print('Workflow:')
print(' 1. Index all product images with metadata')
print(' 2. User searches: "red leather jacket under $200"')
print(' 3. CLIP finds visually + semantically matching products')
print(' 4. GPT-4o ranks and explains why each result matches')
8. Performance Tips for Production
# 1. Batch embedding for large catalogs
def embed_images_batch(images: list, batch_size: int = 32) -> np.ndarray:
    """Embed many images at once, running the CLIP encoder batch_size at a time."""
    chunks = []
    for start in range(0, len(images), batch_size):
        window = images[start:start + batch_size]
        inputs = clip_processor(images=window, return_tensors='pt', padding=True).to(device)
        with torch.no_grad():
            feats = clip_model.get_image_features(**inputs)
            feats = feats / feats.norm(dim=-1, keepdim=True)  # unit-normalize
        chunks.append(feats.cpu().numpy())
    return np.vstack(chunks)
# 2. Use larger CLIP for better quality
# 'openai/clip-vit-large-patch14' → 768-dim (better)
# 'openai/clip-vit-base-patch32' → 512-dim (faster)
# 3. Use Qdrant for scale (millions of images)
# from qdrant_client import QdrantClient
# client = QdrantClient(':memory:')  # or host='localhost'
# 4. Cache embeddings — don't re-embed unchanged images
# Store embeddings to disk: np.save('embeddings.npy', embeddings)
# 5. Use 'detail: low' for GPT-4o to reduce cost
# 'detail: low' → 85 tokens per image (~$0.00085)
# 'detail: high' → up to 1700 tokens per image (for fine details)
ExercisesΒΆ
Build a fashion search engine: index 20 clothing items and search by description.
Index a PDF with diagrams (use pdf2image to extract pages as images) and answer questions.
Compare retrieval quality: CLIP embeddings vs. GPT-4o captions → text embeddings.
Build a "find similar product" reverse image search using search_by_image().
Create a slide deck Q&A: convert presentation slides to images and query with natural language.