Stable Diffusion & Image Generation¶
Text-to-image, img2img, inpainting, ControlNet, and fine-tuning diffusion models with LoRA.
# Install dependencies
# !pip install diffusers transformers accelerate torch pillow
Diffusion Process Explained¶
import numpy as np
from typing import List, Tuple, Optional
from dataclasses import dataclass
@dataclass
class DiffusionConfig:
    """Sampling configuration for a diffusion image generator."""
    num_inference_steps: int = 50  # Number of denoising steps (more = higher quality, slower)
    guidance_scale: float = 7.5  # Classifier-free guidance strength (higher = closer prompt adherence)
    height: int = 512  # Output image height in pixels
    width: int = 512  # Output image width in pixels
    seed: Optional[int] = None  # RNG seed for reproducibility; None = draw a fresh random seed
class SimpleDiffusion:
    """Minimal DDPM-style diffusion demo: closed-form forward noising plus
    a single-step reverse (denoising) update."""

    def __init__(self, timesteps: int = 1000):
        self.timesteps = timesteps
        # Precompute the noise schedule and its cumulative products once.
        self.betas = self._linear_beta_schedule()
        self.alphas = 1 - self.betas
        self.alpha_cumprod = np.cumprod(self.alphas)

    def _linear_beta_schedule(self) -> np.ndarray:
        """Evenly spaced per-step noise variances from 1e-4 up to 2e-2."""
        return np.linspace(0.0001, 0.02, self.timesteps)

    def add_noise(self, x: np.ndarray, t: int) -> Tuple[np.ndarray, np.ndarray]:
        """Forward process: corrupt clean data x to timestep t in one shot.

        Returns the noisy sample and the Gaussian noise that was applied.
        """
        eps = np.random.randn(*x.shape)
        a_bar = self.alpha_cumprod[t]
        # x_t = sqrt(a_bar_t) * x_0 + sqrt(1 - a_bar_t) * eps
        return np.sqrt(a_bar) * x + np.sqrt(1 - a_bar) * eps, eps

    def denoise_step(self, x_t: np.ndarray, t: int, predicted_noise: np.ndarray) -> np.ndarray:
        """Reverse process: one update x_t -> x_{t-1} given a noise estimate."""
        a_bar = self.alpha_cumprod[t]
        # Closed-form estimate of the clean sample x_0 from x_t and the noise.
        x0_hat = (x_t - np.sqrt(1 - a_bar) * predicted_noise) / np.sqrt(a_bar)
        if t == 0:
            # Last step: the x_0 estimate itself is the output (no noise added).
            return x0_hat
        a_bar_prev = self.alpha_cumprod[t - 1]
        # Posterior variance for this step.
        var = self.betas[t] * (1 - a_bar_prev) / (1 - a_bar)
        eps = np.random.randn(*x_t.shape)
        return (
            np.sqrt(a_bar_prev) * x0_hat
            + np.sqrt(1 - a_bar_prev - var) * predicted_noise
            + np.sqrt(var) * eps
        )
# Demonstrate the forward (noising) process on a tiny 1-D "image".
diffusion = SimpleDiffusion(timesteps=100)
original = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
print("Forward Process (Adding Noise):")
for step in (0, 25, 50, 75, 99):
    corrupted, _ = diffusion.add_noise(original, step)
    print(f" t={step:2d}: {corrupted}")
print("\nAt t=99, the signal is almost pure noise!")
Text-to-Image Generation¶
# Real Stable Diffusion usage via the `diffusers` library (kept as a string:
# running it needs `pip install diffusers` and a CUDA GPU).
'''
from diffusers import StableDiffusionPipeline
import torch
# Load model
model_id = "runwayml/stable-diffusion-v1-5"
pipe = StableDiffusionPipeline.from_pretrained(
model_id,
torch_dtype=torch.float16 # Use FP16 for speed
)
pipe = pipe.to("cuda") # Use GPU
# Generate image
prompt = "a cat astronaut in space, digital art, highly detailed"
negative_prompt = "blurry, low quality, distorted"
image = pipe(
prompt=prompt,
negative_prompt=negative_prompt,
num_inference_steps=50,
guidance_scale=7.5,
height=512,
width=512
).images[0]
image.save("cat_astronaut.png")
'''
print("Stable Diffusion example (commented - requires diffusers & GPU)")
print("\nPopular Models:")
# (repo id, short description) pairs, printed in order.
model_notes = [
    ("runwayml/stable-diffusion-v1-5", "Original SD 1.5"),
    ("stabilityai/stable-diffusion-2-1", "SD 2.1 (768x768)"),
    ("stabilityai/stable-diffusion-xl-base-1.0", "SDXL (best quality)"),
    ("dreamlike-art/dreamlike-photoreal-2.0", "Photorealistic"),
]
for repo, note in model_notes:
    print(f" {repo} - {note}")
Image Generator Wrapper¶
@dataclass
class GeneratedImage:
    """Result of a single image-generation call."""
    image: np.ndarray  # Pixel array, (height, width, 3) uint8 in this demo
    prompt: str  # The positive text prompt used
    seed: int  # Seed actually used (for reproducibility)
    config: DiffusionConfig  # Settings the image was generated with
    metadata: Optional[dict] = None  # Extra info, e.g. negative_prompt / model name
class ImageGenerator:
    """Text-to-image generator wrapper (simulated backend).

    In production `generate` would drive a real Stable Diffusion pipeline;
    here it produces random pixels so the surrounding plumbing (configs,
    seeds, batching) can be exercised without a GPU.
    """

    def __init__(self, model_name: str = "sd-v1.5"):
        self.model_name = model_name
        self.default_config = DiffusionConfig()

    def generate(self,
                 prompt: str,
                 negative_prompt: Optional[str] = None,
                 config: Optional[DiffusionConfig] = None) -> GeneratedImage:
        """Generate an image from a text prompt.

        Args:
            prompt: Positive text prompt.
            negative_prompt: Features to steer away from (recorded in metadata).
            config: Generation settings; falls back to `self.default_config`.

        Returns:
            GeneratedImage carrying the pixels and the seed actually used.
        """
        cfg = config or self.default_config
        # BUG FIX: the original wrote a freshly drawn seed back into cfg,
        # permanently mutating the shared default_config (and any caller's
        # config), so every subsequent seedless call reused the same seed
        # and produced the identical image. Resolve the seed locally instead.
        seed = cfg.seed if cfg.seed is not None else int(np.random.randint(0, 2**32 - 1))
        # Use a dedicated RandomState so a fixed seed is reproducible
        # without clobbering NumPy's global RNG state.
        rng = np.random.RandomState(seed)
        # Simulated generation — in production: run the diffusion pipeline.
        image = rng.randint(0, 255, (cfg.height, cfg.width, 3), dtype=np.uint8)
        return GeneratedImage(
            image=image,
            prompt=prompt,
            seed=seed,
            config=cfg,
            metadata={
                "negative_prompt": negative_prompt,
                "model": self.model_name,
            },
        )

    def generate_batch(self,
                       prompts: List[str],
                       config: Optional[DiffusionConfig] = None) -> List[GeneratedImage]:
        """Generate one image per prompt with a shared config."""
        return [self.generate(prompt, config=config) for prompt in prompts]
# Test generator
generator = ImageGenerator()
# Generate single image
config = DiffusionConfig(num_inference_steps=50, guidance_scale=7.5, seed=42)
result = generator.generate(
prompt="a beautiful sunset over mountains, oil painting",
negative_prompt="blurry, low quality",
config=config
)
print(f"Generated image:")
print(f" Prompt: {result.prompt}")
print(f" Size: {result.image.shape}")
print(f" Seed: {result.seed}")
print(f" Steps: {result.config.num_inference_steps}")
# Generate batch
prompts = [
"a cute cat",
"a futuristic city",
"a fantasy dragon"
]
batch_results = generator.generate_batch(prompts, config=config)
print(f"\nGenerated {len(batch_results)} images from batch")
Image-to-Image Translation¶
# Image-to-image with Stable Diffusion (kept as a string: requires the
# `diffusers` package and a CUDA GPU; `strength` controls how far the
# output may drift from the init image).
'''
from diffusers import StableDiffusionImg2ImgPipeline
from PIL import Image
# Load img2img pipeline
pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
"runwayml/stable-diffusion-v1-5",
torch_dtype=torch.float16
).to("cuda")
# Load init image
init_image = Image.open("input.jpg").resize((512, 512))
# Transform image
prompt = "convert to watercolor painting style"
image = pipe(
prompt=prompt,
image=init_image,
strength=0.75, # 0=no change, 1=completely new
guidance_scale=7.5,
num_inference_steps=50
).images[0]
image.save("watercolor.png")
'''
class Image2ImageGenerator:
    """Image-to-image translation (simulated backend)."""

    def __init__(self, model_name: str = "sd-v1.5"):
        self.model_name = model_name

    def transform(self,
                  image: np.ndarray,
                  prompt: str,
                  strength: float = 0.75) -> GeneratedImage:
        """Transform an input image according to a text prompt.

        Args:
            image: Input pixel array.
            prompt: Transformation description.
            strength: How much to change (0=none, 1=complete).
        """
        # In production this would call StableDiffusionImg2ImgPipeline.
        # Here the "transformation" is a blend of the input with random
        # pixels, weighted by `strength`.
        random_pixels = np.random.randint(0, 255, image.shape)
        blended = image * (1 - strength) + random_pixels * strength
        return GeneratedImage(
            image=blended.astype(np.uint8),
            prompt=prompt,
            seed=42,
            config=DiffusionConfig(),
            metadata={"strength": strength, "type": "img2img"},
        )
# Exercise the img2img wrapper on a random input image.
img2img = Image2ImageGenerator()
input_img = np.random.randint(0, 255, (512, 512, 3), dtype=np.uint8)
result = img2img.transform(
    input_img,
    prompt="convert to oil painting style",
    strength=0.75,
)
print("Image-to-Image Translation:")
for line in (
    f" Prompt: {result.prompt}",
    f" Strength: {result.metadata['strength']}",
    f" Output shape: {result.image.shape}",
):
    print(line)
ControlNet (Structure Preservation)¶
# ControlNet conditioning example (kept as a string: requires `diffusers`,
# OpenCV and a CUDA GPU).
'''
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
import cv2
# Load ControlNet (Canny edge)
controlnet = ControlNetModel.from_pretrained(
"lllyasviel/sd-controlnet-canny",
torch_dtype=torch.float16
)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
"runwayml/stable-diffusion-v1-5",
controlnet=controlnet,
torch_dtype=torch.float16
).to("cuda")
# Load image and detect edges
image = cv2.imread("input.jpg")
edges = cv2.Canny(image, 100, 200)
edges = Image.fromarray(edges)
# Generate with structure preserved
prompt = "a beautiful landscape painting"
output = pipe(
prompt=prompt,
image=edges,
num_inference_steps=50
).images[0]
output.save("controlled.png")
'''
# Conditioning type -> what it controls (printed in insertion order).
control_types = {
    "Canny": "Edge-based control",
    "Depth": "Depth map control",
    "Pose": "Human pose control",
    "Scribble": "Sketch control",
    "Segmentation": "Semantic control",
}
print("ControlNet Types:")
for name, desc in control_types.items():
    print(f" {name}: {desc}")
print("\nUse Case: Preserve structure while changing style/content")
Production Pipeline¶
import time
from collections import deque
import hashlib
class ProductionImageGenerator:
    """Production-oriented image generator: safety filtering, result caching,
    and rolling performance statistics around an `ImageGenerator`."""

    def __init__(self, model_name: str = "sd-v1.5"):
        self.generator = ImageGenerator(model_name)
        self.safety_filter = SafetyFilter()
        # NOTE(review): unbounded cache — fine for a demo, but a real
        # deployment should bound it (e.g. an LRU) to cap memory.
        self.prompt_cache = {}
        self.stats = {
            "total_generations": 0,
            "filtered_prompts": 0,
            "cache_hits": 0,
            "avg_generation_time": 0,
            # Rolling window so the average tracks recent performance only.
            "generation_times": deque(maxlen=100),
        }

    def generate_safe(self,
                      prompt: str,
                      negative_prompt: Optional[str] = None,
                      config: Optional[DiffusionConfig] = None,
                      use_cache: bool = True) -> Optional[GeneratedImage]:
        """Generate with safety filtering and optional caching.

        Returns:
            The generated (or cached) image, or None when the prompt is
            rejected by the safety filter.
        """
        start = time.time()
        # Safety check first: blocked prompts never reach the model.
        if not self.safety_filter.is_safe(prompt):
            self.stats["filtered_prompts"] += 1
            print("⚠️ Prompt blocked by safety filter")
            return None
        # BUG FIX: the cache key previously ignored negative_prompt and the
        # output resolution, so the same prompt with a different negative
        # prompt or size returned the wrong cached image.
        cache_key = self._get_cache_key(prompt, negative_prompt, config)
        if use_cache and cache_key in self.prompt_cache:
            self.stats["cache_hits"] += 1
            self.stats["total_generations"] += 1
            return self.prompt_cache[cache_key]
        result = self.generator.generate(prompt, negative_prompt, config)
        if use_cache:
            self.prompt_cache[cache_key] = result
        # Timing is tracked only for real generations (cache hits return above).
        gen_time = time.time() - start
        self.stats["generation_times"].append(gen_time)
        self.stats["total_generations"] += 1
        self.stats["avg_generation_time"] = np.mean(self.stats["generation_times"])
        return result

    def _get_cache_key(self,
                       prompt: str,
                       negative_prompt: Optional[str],
                       config: Optional[DiffusionConfig]) -> str:
        """Hash every input that affects the output into a stable cache key."""
        cfg = config or self.generator.default_config
        key_str = (
            f"{prompt}_{negative_prompt}_{cfg.num_inference_steps}_"
            f"{cfg.guidance_scale}_{cfg.height}_{cfg.width}_{cfg.seed}"
        )
        return hashlib.md5(key_str.encode()).hexdigest()

    def get_performance_stats(self) -> dict:
        """Summarize the counters; guards against division by zero."""
        total = max(self.stats["total_generations"], 1)
        return {
            "total_generations": self.stats["total_generations"],
            "filtered_prompts": self.stats["filtered_prompts"],
            "cache_hits": self.stats["cache_hits"],
            "cache_hit_rate": self.stats["cache_hits"] / total,
            "avg_generation_time_s": self.stats["avg_generation_time"],
        }
class SafetyFilter:
    """Keyword-based content safety filter.

    A production system would use a trained safety classifier; this demo
    rejects any prompt containing a blocked substring (case-insensitive).
    """

    def __init__(self):
        # In production: replace with an AI safety classifier.
        self.blocked_keywords = ["violence", "explicit", "harmful"]

    def is_safe(self, prompt: str) -> bool:
        """Return True when no blocked keyword appears in the prompt."""
        text = prompt.lower()
        for keyword in self.blocked_keywords:
            if keyword in text:
                return False
        return True
# Smoke-test the production wrapper. The third prompt repeats the first,
# so it should be served from the cache.
prod_gen = ProductionImageGenerator()
prompts = [
    "a beautiful sunset",
    "a cute puppy",
    "a beautiful sunset",  # repeat of the first -> expected cache hit
]
config = DiffusionConfig(seed=42)
for prompt in prompts:
    result = prod_gen.generate_safe(prompt, config=config)
    if result:
        print(f"✅ Generated: {prompt}")
stats = prod_gen.get_performance_stats()
print("\nPerformance Stats:")
print(f" Total generations: {stats['total_generations']}")
print(f" Cache hits: {stats['cache_hits']}")
print(f" Cache hit rate: {stats['cache_hit_rate']:.1%}")
print(f" Avg time: {stats['avg_generation_time_s']:.3f}s")
Best Practices¶
1. Prompt Engineering¶
Be specific and detailed
Include style: “oil painting”, “digital art”, “photorealistic”
Add quality modifiers: “highly detailed”, “8k”, “masterpiece”
Use negative prompts: “blurry, low quality, distorted”
2. Hyperparameters¶
Steps: 20-50 (more = better quality, slower)
Guidance scale: 7-12 (higher = more prompt adherence)
Strength (img2img): 0.5-0.8 (lower = more original preserved)
Seed: Fix for reproducibility
3. Optimization¶
Use FP16 precision (2x faster, same quality)
Enable attention slicing for low VRAM
Use xformers for memory efficiency
Batch generation when possible
4. Fine-Tuning¶
Textual Inversion: Learn new concepts (5-10 images)
DreamBooth: Personalize subjects (20-50 images)
LoRA: Efficient fine-tuning (low rank adaptation)
Hypernetworks: Style transfer
Common Use Cases¶
Art generation: Digital art, paintings, illustrations
Product visualization: Product mockups, variations
Interior design: Room layouts, furniture arrangements
Fashion: Clothing designs, style variations
Marketing: Ad creatives, social media content
Game development: Concept art, textures, assets
Key Takeaways¶
✅ Diffusion models gradually denoise random noise
✅ Text guides the denoising process
✅ More steps = better quality but slower
✅ Negative prompts remove unwanted features
✅ ControlNet preserves structure while changing content
✅ LoRA enables efficient fine-tuning
Next: 05_multimodal_rag.ipynb - Combine vision & language for Q&A