Stable Diffusion & Image Generation

Text-to-image, img2img, inpainting, ControlNet, and fine-tuning diffusion models with LoRA.

# Install dependencies
# !pip install diffusers transformers accelerate torch pillow

Diffusion Process Explained

import numpy as np
from typing import List, Tuple, Optional
from dataclasses import dataclass

@dataclass
class DiffusionConfig:
    """Settings for one diffusion sampling run.

    Field names mirror the standard Stable Diffusion pipeline arguments so a
    config can be passed through to a real pipeline unchanged.
    """
    num_inference_steps: int = 50  # Number of denoising steps (more = higher quality, slower)
    guidance_scale: float = 7.5    # Classifier-free guidance strength (higher = closer to prompt)
    height: int = 512              # Output image height in pixels
    width: int = 512               # Output image width in pixels
    seed: Optional[int] = None     # RNG seed; None means "pick one at generation time"

class SimpleDiffusion:
    """Toy DDPM-style diffusion: forward noising and reverse denoising.

    Precomputes a linear beta schedule and its cumulative alpha products so
    both processes can be evaluated in closed form at any timestep.
    """

    def __init__(self, timesteps: int = 1000):
        self.timesteps = timesteps
        # Per-step noise variances and the derived cumulative signal weights.
        self.betas = self._linear_beta_schedule()
        self.alphas = 1 - self.betas
        self.alpha_cumprod = np.cumprod(self.alphas)

    def _linear_beta_schedule(self) -> np.ndarray:
        """Evenly spaced noise variances from 1e-4 up to 2e-2."""
        return np.linspace(0.0001, 0.02, self.timesteps)

    def add_noise(self, x: np.ndarray, t: int) -> Tuple[np.ndarray, np.ndarray]:
        """Forward process: jump directly to the noisy sample at timestep t.

        Returns:
            (noisy sample, the Gaussian noise that was mixed in).
        """
        eps = np.random.randn(*x.shape)
        abar = self.alpha_cumprod[t]
        # Closed form: x_t = sqrt(abar_t) * x_0 + sqrt(1 - abar_t) * eps
        return np.sqrt(abar) * x + np.sqrt(1 - abar) * eps, eps

    def denoise_step(self, x_t: np.ndarray, t: int, predicted_noise: np.ndarray) -> np.ndarray:
        """Reverse process: one stochastic step from x_t toward x_{t-1}."""
        abar_t = self.alpha_cumprod[t]

        # Estimate the clean signal x_0 implied by the predicted noise.
        x0_hat = (x_t - np.sqrt(1 - abar_t) * predicted_noise) / np.sqrt(abar_t)

        # Final step is deterministic: return the x_0 estimate directly.
        if t == 0:
            return x0_hat

        abar_prev = self.alpha_cumprod[t - 1]
        # Posterior variance of q(x_{t-1} | x_t, x_0).
        var = self.betas[t] * (1 - abar_prev) / (1 - abar_t)
        fresh_noise = np.random.randn(*x_t.shape)
        return (
            np.sqrt(abar_prev) * x0_hat
            + np.sqrt(1 - abar_prev - var) * predicted_noise
            + np.sqrt(var) * fresh_noise
        )

# Walk a tiny 1-D "image" through the forward (noising) process to show
# how the signal degrades as t grows.
toy = SimpleDiffusion(timesteps=100)

# The pristine signal we will gradually corrupt.
signal = np.array([1.0, 2.0, 3.0, 4.0, 5.0])

print("Forward Process (Adding Noise):")
for step in (0, 25, 50, 75, 99):
    corrupted, _ = toy.add_noise(signal, step)
    print(f"  t={step:2d}: {corrupted}")

print("\nAt t=99, the signal is almost pure noise!")

Text-to-Image Generation

# Stable Diffusion with Diffusers (requires installation)
# NOTE: the example below is wrapped in a bare string literal (a no-op at
# runtime) so this file stays runnable without a GPU or the diffusers
# package installed; paste its body into a CUDA-enabled session to run it.
'''
from diffusers import StableDiffusionPipeline
import torch

# Load model
model_id = "runwayml/stable-diffusion-v1-5"
pipe = StableDiffusionPipeline.from_pretrained(
    model_id,
    torch_dtype=torch.float16  # Use FP16 for speed
)
pipe = pipe.to("cuda")  # Use GPU

# Generate image
prompt = "a cat astronaut in space, digital art, highly detailed"
negative_prompt = "blurry, low quality, distorted"

image = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    num_inference_steps=50,
    guidance_scale=7.5,
    height=512,
    width=512
).images[0]

image.save("cat_astronaut.png")
'''

# Point readers at commonly used checkpoints on the Hugging Face Hub.
print("Stable Diffusion example (commented - requires diffusers & GPU)")
print("\nPopular Models:")
print("  runwayml/stable-diffusion-v1-5 - Original SD 1.5")
print("  stabilityai/stable-diffusion-2-1 - SD 2.1 (768x768)")
print("  stabilityai/stable-diffusion-xl-base-1.0 - SDXL (best quality)")
print("  dreamlike-art/dreamlike-photoreal-2.0 - Photorealistic")

Image Generator Wrapper

@dataclass
class GeneratedImage:
    """Result of a single image-generation call.

    Bundles the pixel data with the prompt, seed, and config needed to
    reproduce it, plus free-form metadata (model name, negative prompt, ...).
    """
    image: np.ndarray        # generated pixels (HxWx3 uint8 in this file's simulators)
    prompt: str              # text prompt that produced the image
    seed: int                # RNG seed actually used, for reproducibility
    config: DiffusionConfig  # sampling settings used for this image
    # FIX: annotated Optional[dict] instead of bare `dict` — the default is
    # None, so the original annotation mis-stated the type for checkers.
    metadata: Optional[dict] = None

class ImageGenerator:
    """Text-to-image generator wrapper (simulated backend).

    In production this would drive a real Stable Diffusion pipeline; here it
    returns random pixels so the surrounding plumbing (configs, caching,
    safety filtering) can be exercised without a GPU.
    """

    def __init__(self, model_name: str = "sd-v1.5"):
        self.model_name = model_name
        self.default_config = DiffusionConfig()

    def generate(self,
                 prompt: str,
                 negative_prompt: Optional[str] = None,
                 config: Optional[DiffusionConfig] = None) -> GeneratedImage:
        """Generate an image from a text prompt.

        Args:
            prompt: Positive text prompt.
            negative_prompt: Features to steer away from (recorded in metadata).
            config: Sampling settings; falls back to ``self.default_config``.

        Returns:
            GeneratedImage whose ``config.seed`` is always the seed used.
        """
        from dataclasses import replace

        cfg = config or self.default_config

        # BUG FIX: the original mutated `cfg` in place when seed was None,
        # permanently stamping a random seed onto the shared default_config
        # (and onto the caller's config object), so every later call reused
        # the same seed. Work on a copy instead.
        if cfg.seed is None:
            # dtype=np.int64 keeps the 2**32 bound legal on platforms where
            # randint's default dtype is 32-bit.
            cfg = replace(cfg, seed=int(np.random.randint(0, 2**32 - 1, dtype=np.int64)))

        # Seed unconditionally so the recorded seed actually reproduces the
        # output (the original only seeded when the caller supplied one,
        # making the recorded auto-picked seed meaningless).
        np.random.seed(cfg.seed)

        # Simulate image generation.
        # In production: use an actual Stable Diffusion pipeline.
        image = np.random.randint(0, 255, (cfg.height, cfg.width, 3), dtype=np.uint8)

        return GeneratedImage(
            image=image,
            prompt=prompt,
            seed=cfg.seed,
            config=cfg,
            metadata={
                "negative_prompt": negative_prompt,
                "model": self.model_name
            }
        )

    def generate_batch(self,
                       prompts: List[str],
                       config: Optional[DiffusionConfig] = None) -> List[GeneratedImage]:
        """Generate one image per prompt, all sharing the same settings."""
        return [self.generate(prompt, config=config) for prompt in prompts]

# Exercise the wrapper end to end: one seeded image, then a small batch.
generator = ImageGenerator()

# Fixed seed so the run is reproducible.
demo_cfg = DiffusionConfig(num_inference_steps=50, guidance_scale=7.5, seed=42)
result = generator.generate(
    prompt="a beautiful sunset over mountains, oil painting",
    negative_prompt="blurry, low quality",
    config=demo_cfg
)

print("Generated image:")
print(f"  Prompt: {result.prompt}")
print(f"  Size: {result.image.shape}")
print(f"  Seed: {result.seed}")
print(f"  Steps: {result.config.num_inference_steps}")

# Several prompts sharing the same settings.
batch_prompts = [
    "a cute cat",
    "a futuristic city",
    "a fantasy dragon",
]
batch_results = generator.generate_batch(batch_prompts, config=demo_cfg)
print(f"\nGenerated {len(batch_results)} images from batch")

Image-to-Image Translation

# Image-to-Image with Stable Diffusion (requires installation)
# NOTE: kept as a no-op string literal so the file runs without diffusers
# or a GPU; copy the body into a CUDA-enabled session to run it.
'''
from diffusers import StableDiffusionImg2ImgPipeline
from PIL import Image

# Load img2img pipeline
pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16
).to("cuda")

# Load init image
init_image = Image.open("input.jpg").resize((512, 512))

# Transform image
prompt = "convert to watercolor painting style"
image = pipe(
    prompt=prompt,
    image=init_image,
    strength=0.75,  # 0=no change, 1=completely new
    guidance_scale=7.5,
    num_inference_steps=50
).images[0]

image.save("watercolor.png")
'''

class Image2ImageGenerator:
    """Image-to-image translation (simulated).

    Stands in for StableDiffusionImg2ImgPipeline: blends the input with
    random pixels so callers can be tested without a GPU.
    """

    def __init__(self, model_name: str = "sd-v1.5"):
        self.model_name = model_name

    def transform(self,
                  image: np.ndarray,
                  prompt: str,
                  strength: float = 0.75) -> GeneratedImage:
        """Transform image according to prompt.

        Args:
            image: Input image (HxWxC array).
            prompt: Transformation description.
            strength: Blend factor in [0, 1]; 0 keeps the original untouched,
                1 replaces it entirely.

        Raises:
            ValueError: If strength is outside [0, 1] — the blend would
                otherwise silently wrap around when cast to uint8.
        """
        # ROBUSTNESS FIX: the original accepted any strength and produced
        # garbage pixels via uint8 wraparound for values outside [0, 1].
        if not 0.0 <= strength <= 1.0:
            raise ValueError(f"strength must be in [0, 1], got {strength}")

        # In production: use StableDiffusionImg2ImgPipeline.
        # Simulate the transformation as a linear blend with noise.
        noise_layer = np.random.randint(0, 255, image.shape)
        transformed = (image * (1 - strength) +
                       noise_layer * strength).astype(np.uint8)

        return GeneratedImage(
            image=transformed,
            prompt=prompt,
            seed=42,  # simulated: fixed placeholder, not a real sampling seed
            config=DiffusionConfig(),
            metadata={"strength": strength, "type": "img2img"}
        )

# Smoke-test the simulated img2img path on random pixels.
img2img = Image2ImageGenerator()
source = np.random.randint(0, 255, (512, 512, 3), dtype=np.uint8)

styled = img2img.transform(
    source,
    prompt="convert to oil painting style",
    strength=0.75
)

print("Image-to-Image Translation:")
print(f"  Prompt: {styled.prompt}")
print(f"  Strength: {styled.metadata['strength']}")
print(f"  Output shape: {styled.image.shape}")

ControlNet (Structure Preservation)

# ControlNet (requires installation)
# NOTE: kept as a no-op string literal so the file runs without diffusers,
# opencv, or a GPU; copy the body into a CUDA-enabled session to run it.
'''
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
import cv2

# Load ControlNet (Canny edge)
controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/sd-controlnet-canny",
    torch_dtype=torch.float16
)

pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    controlnet=controlnet,
    torch_dtype=torch.float16
).to("cuda")

# Load image and detect edges
image = cv2.imread("input.jpg")
edges = cv2.Canny(image, 100, 200)
edges = Image.fromarray(edges)

# Generate with structure preserved
prompt = "a beautiful landscape painting"
output = pipe(
    prompt=prompt,
    image=edges,
    num_inference_steps=50
).images[0]

output.save("controlled.png")
'''

# Summarize the conditioning modalities ControlNet supports.
print("ControlNet Types:")
print("  Canny: Edge-based control")
print("  Depth: Depth map control")
print("  Pose: Human pose control")
print("  Scribble: Sketch control")
print("  Segmentation: Semantic control")
print("\nUse Case: Preserve structure while changing style/content")

Production Pipeline

import time
from collections import deque
import hashlib

class ProductionImageGenerator:
    """Production-ready image generator: safety filtering, caching, stats.

    Wraps ImageGenerator with a prompt safety check, a result cache keyed on
    the full request (prompt + negative prompt + sampling settings), and
    rolling performance statistics.
    """

    def __init__(self, model_name: str = "sd-v1.5"):
        self.generator = ImageGenerator(model_name)
        self.safety_filter = SafetyFilter()
        # NOTE(review): unbounded cache — consider an LRU bound if the
        # prompt space is large in deployment.
        self.prompt_cache = {}
        self.stats = {
            "total_generations": 0,
            "filtered_prompts": 0,
            "cache_hits": 0,
            "avg_generation_time": 0,
            # Rolling window so the average tracks recent performance only.
            "generation_times": deque(maxlen=100)
        }

    def generate_safe(self,
                      prompt: str,
                      negative_prompt: Optional[str] = None,
                      config: Optional[DiffusionConfig] = None,
                      use_cache: bool = True) -> Optional[GeneratedImage]:
        """Generate with safety filtering and caching.

        Returns:
            The generated (or cached) image, or None when the prompt is
            rejected by the safety filter.
        """
        start = time.time()

        # Safety check: blocked prompts are counted and return None.
        if not self.safety_filter.is_safe(prompt):
            self.stats["filtered_prompts"] += 1
            print(f"⚠️  Prompt blocked by safety filter")
            return None

        # BUG FIX: the cache key now includes negative_prompt; previously two
        # requests differing only in negative prompt collided and returned
        # the wrong cached image.
        cache_key = self._get_cache_key(prompt, config, negative_prompt)
        if use_cache and cache_key in self.prompt_cache:
            self.stats["cache_hits"] += 1
            self.stats["total_generations"] += 1
            return self.prompt_cache[cache_key]

        # Generate a fresh image.
        result = self.generator.generate(prompt, negative_prompt, config)

        if use_cache:
            self.prompt_cache[cache_key] = result

        # Timing stats cover uncached generations only (cache hits return
        # above without touching the rolling window).
        gen_time = time.time() - start
        self.stats["generation_times"].append(gen_time)
        self.stats["total_generations"] += 1
        self.stats["avg_generation_time"] = np.mean(self.stats["generation_times"])

        return result

    def _get_cache_key(self,
                       prompt: str,
                       config: Optional[DiffusionConfig],
                       negative_prompt: Optional[str] = None) -> str:
        """Build a deterministic cache key from the full request.

        negative_prompt defaults to None for backward compatibility with any
        existing two-argument callers.
        """
        cfg = config or self.generator.default_config
        key_str = f"{prompt}_{negative_prompt}_{cfg.num_inference_steps}_{cfg.guidance_scale}_{cfg.seed}"
        # md5 is fine here: the key is a non-adversarial cache identifier,
        # not a security boundary.
        return hashlib.md5(key_str.encode()).hexdigest()

    def get_performance_stats(self) -> dict:
        """Return counters plus derived rates; safe when nothing ran yet."""
        total = max(self.stats["total_generations"], 1)  # avoid div-by-zero
        return {
            "total_generations": self.stats["total_generations"],
            "filtered_prompts": self.stats["filtered_prompts"],
            "cache_hits": self.stats["cache_hits"],
            "cache_hit_rate": self.stats["cache_hits"] / total,
            "avg_generation_time_s": self.stats["avg_generation_time"]
        }

class SafetyFilter:
    """Keyword-based content safety filter.

    A lightweight stand-in for a real AI safety classifier: a prompt is
    rejected when any blocked keyword occurs as a case-insensitive substring.
    """

    def __init__(self):
        # In production: use AI safety classifier
        self.blocked_keywords = ["violence", "explicit", "harmful"]

    def is_safe(self, prompt: str) -> bool:
        """Return True when no blocked keyword appears in the prompt."""
        lowered = prompt.lower()
        for keyword in self.blocked_keywords:
            if keyword in lowered:
                return False
        return True

# Drive the production wrapper: three requests, one of which is an exact
# duplicate and should be served from the cache.
prod_gen = ProductionImageGenerator()

demo_prompts = [
    "a beautiful sunset",
    "a cute puppy",
    "a beautiful sunset",  # Duplicate (cache hit)
]

run_cfg = DiffusionConfig(seed=42)
for prompt in demo_prompts:
    outcome = prod_gen.generate_safe(prompt, config=run_cfg)
    if outcome:
        print(f"✅ Generated: {prompt}")

# Report counters and derived rates.
perf = prod_gen.get_performance_stats()
print("\nPerformance Stats:")
print(f"  Total generations: {perf['total_generations']}")
print(f"  Cache hits: {perf['cache_hits']}")
print(f"  Cache hit rate: {perf['cache_hit_rate']:.1%}")
print(f"  Avg time: {perf['avg_generation_time_s']:.3f}s")

Best Practices

1. Prompt Engineering

  • Be specific and detailed

  • Include style: “oil painting”, “digital art”, “photorealistic”

  • Add quality modifiers: “highly detailed”, “8k”, “masterpiece”

  • Use negative prompts: “blurry, low quality, distorted”

2. Hyperparameters

  • Steps: 20-50 (more = better quality, slower)

  • Guidance scale: 7-12 (higher = more prompt adherence)

  • Strength (img2img): 0.5-0.8 (lower = more original preserved)

  • Seed: Fix for reproducibility

3. Optimization

  • Use FP16 precision (≈2x faster with negligible quality loss)

  • Enable attention slicing for low VRAM

  • Use xformers for memory efficiency

  • Batch generation when possible

4. Fine-Tuning

  • Textual Inversion: Learn new concepts (3-10 example images)

  • DreamBooth: Personalize subjects (typically 3-20 images)

  • LoRA: Efficient fine-tuning (low rank adaptation)

  • Hypernetworks: Style transfer

Common Use Cases

  • Art generation: Digital art, paintings, illustrations

  • Product visualization: Product mockups, variations

  • Interior design: Room layouts, furniture arrangements

  • Fashion: Clothing designs, style variations

  • Marketing: Ad creatives, social media content

  • Game development: Concept art, textures, assets

Key Takeaways

✅ Diffusion models gradually denoise random noise

✅ Text guides the denoising process

✅ More steps = better quality but slower

✅ Negative prompts remove unwanted features

✅ ControlNet preserves structure while changing content

✅ LoRA enables efficient fine-tuning

Next: 05_multimodal_rag.ipynb - Combine vision & language for Q&A