ControlNet: Precise Control Over Image Generation

ControlNet adds spatial conditioning to Stable Diffusion — letting you control pose, edges, depth, and composition precisely while still using natural language prompts.

Why ControlNet?

Stable Diffusion alone is unpredictable — you can't reliably control:

  • Subject pose

  • Image composition

  • Object placement

  • Structural layout

ControlNet solves this with conditioning signals:

Control Type

Input

Use Case

Canny edges

Edge map

Preserve structure from photo

OpenPose

Skeleton

Control human pose

Depth

Depth map

3D scene composition

Scribble

Rough sketch

Quick concept art

Segmentation

Semantic mask

Control object placement

Normal map

Surface normals

Relighting and 3D-like effects

Lineart

Clean lines

Anime/illustration style

# Install dependencies
# !pip install diffusers>=0.31.0 transformers accelerate safetensors
# !pip install opencv-python controlnet-aux
import torch
from diffusers import (
    StableDiffusionControlNetPipeline,
    ControlNetModel,
    UniPCMultistepScheduler
)
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import cv2
import requests
from io import BytesIO

# Pick the compute device once, up front; report GPU memory when available.
use_cuda = torch.cuda.is_available()
device = 'cuda' if use_cuda else 'cpu'
print(f'Device: {device}')
if use_cuda:
    vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f'VRAM: {vram_gb:.1f} GB')

1. Load Image Helpers

def load_image(url_or_path: str, size: tuple = (512, 512)) -> Image.Image:
    """Load an RGB image from a URL or local path and resize it.

    Args:
        url_or_path: HTTP(S) URL or filesystem path to an image file.
        size: Target (width, height) passed to ``Image.resize``.

    Returns:
        The image converted to RGB and resized to ``size``.

    Raises:
        requests.HTTPError: If a URL download returns an error status.
    """
    if url_or_path.startswith('http'):
        # Bound the download so a dead host cannot hang the notebook,
        # and surface HTTP errors instead of trying to decode an error page.
        response = requests.get(url_or_path, timeout=30)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content)).convert('RGB')
    else:
        img = Image.open(url_or_path).convert('RGB')
    return img.resize(size)

def show_images(images: list, titles: list = None, figsize=(15, 5)) -> None:
    """Display multiple images side by side in one matplotlib row.

    Args:
        images: PIL images (or arrays) accepted by ``ax.imshow``.
        titles: Optional per-image titles; missing entries default to ''.
        figsize: Figure size forwarded to ``plt.subplots``.
    """
    fig, axes = plt.subplots(1, len(images), figsize=figsize)
    if len(images) == 1:
        # A single subplot comes back as a bare Axes, not an array.
        axes = [axes]
    # Pad (or create) titles so a short titles list never silently
    # drops trailing images from the zip below.
    titles = list(titles or [])
    titles += [''] * (len(images) - len(titles))
    for ax, img, title in zip(axes, images, titles):
        ax.imshow(img)
        ax.set_title(title)
        ax.axis('off')
    plt.tight_layout()
    plt.show()

# Sanity message: confirms the helper cell executed successfully.
print('Image helpers ready.')

2. ControlNet with Canny Edges

Extract an edge map from a photo → generate a new image preserving the structure.

def extract_canny_edges(image: Image.Image, low_threshold: int = 100, high_threshold: int = 200) -> Image.Image:
    """Run Canny edge detection on *image* and return a 3-channel edge map."""
    # Canny operates on a single grayscale channel; the result is expanded
    # back to RGB because ControlNet expects a 3-channel conditioning image.
    gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
    edges = cv2.Canny(gray, low_threshold, high_threshold)
    return Image.fromarray(cv2.cvtColor(edges, cv2.COLOR_GRAY2RGB))

# Load a sample image
# Using a simple geometric test image
sample_url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/PNG_transparency_demonstration_1.png/280px-PNG_transparency_demonstration_1.png'
source_image = load_image(sample_url)

# Extract edges
edge_map = extract_canny_edges(source_image)

# Show the source next to its edge map so the conditioning signal is visible.
show_images(
    [source_image, edge_map],
    ['Source Image', 'Canny Edge Map']
)

print('Edge map extracted β€” will guide the ControlNet generation.')
# Load ControlNet Canny model
controlnet = ControlNetModel.from_pretrained(
    'lllyasviel/sd-controlnet-canny',
    torch_dtype=torch.float16
)

# Load pipeline
# The ControlNet plugs into a standard SD 1.5 pipeline; the safety checker
# is disabled here to save memory in this demo.
# NOTE(review): float16 weights on CPU can fail for many ops — confirm a
# CPU fallback path is actually intended.
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    'runwayml/stable-diffusion-v1-5',
    controlnet=controlnet,
    torch_dtype=torch.float16,
    safety_checker=None
)

# Use fast scheduler
# UniPC reaches good quality in ~20 steps vs ~50 for the default scheduler.
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to(device)

# Memory optimization
# NOTE(review): attention slicing is normally enabled to reduce *GPU* VRAM
# use; guarding it behind device == 'cpu' looks inverted — confirm intent.
if device == 'cpu':
    pipe.enable_attention_slicing()

print('ControlNet Canny pipeline loaded!')
def generate_with_canny(
    source_image: Image.Image,
    prompt: str,
    negative_prompt: str = 'low quality, blurry',
    num_steps: int = 20,
    guidance_scale: float = 7.5,
    controlnet_scale: float = 1.0,
    seed: int = 42
) -> tuple:
    """Run the Canny ControlNet pipeline on *source_image*.

    Returns a (generated_image, edge_map) pair so the conditioning input
    can be displayed next to the result.
    """
    control_image = extract_canny_edges(source_image)

    # A seeded generator makes runs reproducible for comparison.
    rng = torch.manual_seed(seed)
    result = pipe(
        prompt,
        negative_prompt=negative_prompt,
        image=control_image,
        num_inference_steps=num_steps,
        guidance_scale=guidance_scale,
        controlnet_conditioning_scale=controlnet_scale,
        generator=rng,
    )
    return result.images[0], control_image

# Try different prompts on same structure
# Every prompt reuses the identical edge map, so only the style changes.
prompts = [
    'a watercolor painting of colorful shapes, vibrant, artistic',
    'a futuristic neon glowing version, cyberpunk style',
    'pencil sketch illustration, hand drawn, detailed',
]

# Usage reminder for the function defined above.
print('generate_with_canny() ready.')
print('Usage: generated_img, edge_map = generate_with_canny(source_image, "your prompt")')

3. ControlNet Strength (Conditioning Scale)

controlnet_conditioning_scale controls how strictly the model follows the control signal:

  • 0.0 — Ignores control (pure Stable Diffusion)

  • 0.5 — Loose guidance

  • 1.0 — Strong guidance (default)

  • 1.5+ — Very strict (may look unnatural)

# Demonstrate controlnet_conditioning_scale effect
# The scales/prompt below are illustrative; the prints summarize the
# qualitative effect of each value.
conditioning_scales = [0.3, 0.7, 1.0, 1.5]
prompt = 'a beautiful oil painting, masterpiece, vibrant colors'

print('ControlNet conditioning scale comparison:')
print('scale=0.3  β†’ Loosely follows structure (more creative freedom)')
print('scale=0.7  β†’ Moderate structure adherence')
print('scale=1.0  β†’ Strong structure (default, recommended)')
print('scale=1.5  β†’ Very strict β€” may look rigid or unnatural')
print()
print('Run generate_with_canny(source_image, prompt, controlnet_scale=X) to compare.')

4. OpenPose Control (Pose Transfer)

# OpenPose extracts body skeleton keypoints β€” use them to transfer pose to new characters
# !pip install controlnet-aux

from controlnet_aux import OpenposeDetector
from diffusers import ControlNetModel, StableDiffusionControlNetPipeline

def setup_openpose_pipeline():
    """Create the OpenPose-conditioned SD 1.5 pipeline and its pose annotator."""
    # The detector turns a photo into a stick-figure skeleton image.
    detector = OpenposeDetector.from_pretrained('lllyasviel/ControlNet')

    control_model = ControlNetModel.from_pretrained(
        'lllyasviel/sd-controlnet-openpose',
        torch_dtype=torch.float16,
    )
    pose_pipe = StableDiffusionControlNetPipeline.from_pretrained(
        'runwayml/stable-diffusion-v1-5',
        controlnet=control_model,
        torch_dtype=torch.float16,
        safety_checker=None,
    ).to(device)
    pose_pipe.scheduler = UniPCMultistepScheduler.from_config(pose_pipe.scheduler.config)
    return detector, pose_pipe

def generate_with_pose(
    pose_image: Image.Image,
    prompt: str,
    pose_detector,
    pose_pipe,
    seed: int = 42
) -> tuple:
    """Generate an image whose subject matches the pose found in *pose_image*.

    Returns a (generated_image, pose_map) pair.
    """
    skeleton = pose_detector(pose_image)
    rng = torch.manual_seed(seed)
    result = pose_pipe(
        prompt,
        image=skeleton,
        num_inference_steps=20,
        generator=rng,
    )
    return result.images[0], skeleton

# Usage notes for the OpenPose helpers defined above.
print('OpenPose pipeline functions defined.')
print('Use case: extract pose from a photo, generate new person in same pose.')
print()
print('Example:')
print('  detector, pipe = setup_openpose_pipeline()')
print('  result, pose = generate_with_pose(athlete_photo, "an astronaut in a spacesuit")')

5. Scribble to Image

Turn rough sketches into polished images — great for rapid concept art.

from controlnet_aux import HEDdetector

def setup_scribble_pipeline():
    """Create the scribble-conditioned SD 1.5 pipeline and its HED annotator."""
    annotator = HEDdetector.from_pretrained('lllyasviel/Annotators')

    control_model = ControlNetModel.from_pretrained(
        'lllyasviel/sd-controlnet-scribble',
        torch_dtype=torch.float16,
    )
    scribble_pipe = StableDiffusionControlNetPipeline.from_pretrained(
        'runwayml/stable-diffusion-v1-5',
        controlnet=control_model,
        torch_dtype=torch.float16,
        safety_checker=None,
    ).to(device)
    scribble_pipe.scheduler = UniPCMultistepScheduler.from_config(scribble_pipe.scheduler.config)
    return annotator, scribble_pipe

def scribble_to_image(sketch_path: str, prompt: str, hed, pipe, seed: int = 42) -> tuple:
    """Render a rough sketch file into a finished image guided by *prompt*.

    Returns a (generated_image, scribble_map) pair.
    """
    raw_sketch = load_image(sketch_path)
    # scribble=True keeps the HED output rough, matching the scribble model.
    control_map = hed(raw_sketch, scribble=True)
    rng = torch.manual_seed(seed)
    result = pipe(
        prompt,
        image=control_map,
        num_inference_steps=25,
        guidance_scale=9.0,
        generator=rng,
    )
    return result.images[0], control_map

# Usage notes for the scribble helpers defined above.
print('Scribble pipeline defined.')
print()
print('Use case: hand-drawn sketch β†’ realistic or artistic render.')
print('  hed, pipe = setup_scribble_pipeline()')
print('  result, sketch_map = scribble_to_image("my_sketch.png", "a mountain cabin at dusk")')

6. Depth Map Control

# Depth control preserves the 3D structure of a scene
# Useful for: recomposing environments, changing style while keeping layout

from transformers import pipeline as hf_pipeline

# Lazily-created, reused depth estimator. Loading DPT-Large is slow, so
# re-creating the HF pipeline on every call would be very expensive.
_depth_estimator = None

def estimate_depth(image: Image.Image) -> Image.Image:
    """Estimate a normalized depth map for *image* using Intel DPT-Large.

    Args:
        image: Input RGB image.

    Returns:
        The depth map as a 3-channel PIL image with values scaled to 0-255.
    """
    global _depth_estimator
    if _depth_estimator is None:
        _depth_estimator = hf_pipeline('depth-estimation', model='Intel/dpt-large')
    depth_array = np.array(_depth_estimator(image)['depth'], dtype=np.float32)
    # Normalize to 0-255, guarding against a flat map (max == min),
    # which would otherwise divide by zero.
    span = depth_array.max() - depth_array.min()
    if span == 0:
        depth_normalized = np.zeros_like(depth_array, dtype=np.uint8)
    else:
        depth_normalized = ((depth_array - depth_array.min()) / span * 255).astype(np.uint8)
    return Image.fromarray(depth_normalized).convert('RGB')

# Workflow notes for depth-based control (no code executed here beyond prints).
print('Depth estimation function defined.')
print()
print('ControlNet depth workflow:')
print('  1. Load source image (e.g., a room photo)')
print('  2. Extract depth map: depth_map = estimate_depth(source_image)')
print('  3. Load controlnet: "lllyasviel/sd-controlnet-depth"')
print('  4. Generate with prompt: "a futuristic space station interior"')
print('     β†’ Same 3D layout, completely new visual style')

7. Multi-ControlNet (Combine Multiple Controls)

# Combine Canny + Depth simultaneously for maximum control
from diffusers import StableDiffusionControlNetPipeline

def setup_multi_controlnet():
    """Create one SD 1.5 pipeline driven by Canny and Depth ControlNets together."""
    # Order matters: downstream callers pass [edge_map, depth_map] and
    # per-net conditioning scales in this same order.
    nets = [
        ControlNetModel.from_pretrained(
            'lllyasviel/sd-controlnet-canny', torch_dtype=torch.float16
        ),
        ControlNetModel.from_pretrained(
            'lllyasviel/sd-controlnet-depth', torch_dtype=torch.float16
        ),
    ]

    # Passing a list of ControlNets makes the pipeline expect one control
    # image per net at generation time.
    multi_pipe = StableDiffusionControlNetPipeline.from_pretrained(
        'runwayml/stable-diffusion-v1-5',
        controlnet=nets,
        torch_dtype=torch.float16,
        safety_checker=None
    ).to(device)
    multi_pipe.scheduler = UniPCMultistepScheduler.from_config(multi_pipe.scheduler.config)
    return multi_pipe

def generate_multi_control(
    source_image: Image.Image,
    prompt: str,
    pipe,
    seed: int = 42
) -> Image.Image:
    """Generate an image conditioned jointly on Canny edges and a depth map."""
    control_images = [
        extract_canny_edges(source_image),
        estimate_depth(source_image),
    ]

    rng = torch.manual_seed(seed)
    result = pipe(
        prompt,
        image=control_images,
        num_inference_steps=25,
        # Per-ControlNet weights: edges dominant, depth as a secondary guide.
        controlnet_conditioning_scale=[1.0, 0.8],
        generator=rng,
    )
    return result.images[0]

# Tuning tips for per-ControlNet conditioning weights.
print('Multi-ControlNet pipeline defined.')
print()
print('Tip: adjust controlnet_conditioning_scale per control:')
print('  [1.0, 0.8] β†’ Canny edges dominant, depth as secondary guide')
print('  [0.5, 1.2] β†’ Loose edges, strong depth (good for landscapes)')

8. ControlNet with SDXL (2024 Models)

For better quality, use SDXL-based ControlNet:

# SDXL ControlNet β€” higher quality (1024x1024 native resolution)
from diffusers import StableDiffusionXLControlNetPipeline

# Mapping of human-readable control names to SDXL ControlNet checkpoint IDs
# on the Hugging Face Hub.
SDXL_CONTROLNETS = {
    'Canny':   'diffusers/controlnet-canny-sdxl-1.0',
    'Depth':   'diffusers/controlnet-depth-sdxl-1.0',
    'Scribble':'xinsir/controlnet-scribble-sdxl-1.0',
    'Union':   'xinsir/controlnet-union-sdxl-1.0',  # All-in-one!
}

def setup_sdxl_controlnet(controlnet_id: str = 'diffusers/controlnet-canny-sdxl-1.0'):
    """Load SDXL ControlNet pipeline for high-quality 1024x1024 output.

    NOTE(review): unlike the SD 1.5 setup helpers in this file, this keeps
    the pipeline's default scheduler — confirm that is intentional.
    """
    control_model = ControlNetModel.from_pretrained(controlnet_id, torch_dtype=torch.float16)
    sdxl_pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
        'stabilityai/stable-diffusion-xl-base-1.0',
        controlnet=control_model,
        torch_dtype=torch.float16,
        variant='fp16'
    ).to(device)
    return sdxl_pipe

# List the available SDXL checkpoints plus hardware guidance.
print('SDXL ControlNet models:')
for name, model_id in SDXL_CONTROLNETS.items():
    print(f'  {name:10s} β†’ {model_id}')
print()
print('Note: SDXL ControlNet requires 8GB+ VRAM for 1024x1024 output.')
print('Tip: Use xinsir/controlnet-union-sdxl-1.0 β€” supports all control types in one model.')

Summary: Choosing Your Control Type

Use Case                          → Control Type
─────────────────────────────────────────────────
Preserve photo composition        → Canny + Depth
Transfer human pose               → OpenPose
Concept art from rough sketch     → Scribble / HED
Restyle a 3D scene                → Depth
Anime/illustration from photo     → Lineart
Architecture visualization        → Canny + Depth
Motion/video frame consistency    → Optical Flow

Exercises

  1. Take a photo and use Canny ControlNet to generate it in 3 different art styles.

  2. Find a photo of a person dancing and use OpenPose to recreate the pose with a robot character.

  3. Draw a rough house sketch and use Scribble to generate a photorealistic render.

  4. Combine Canny + Depth controls and compare to using each alone.

  5. Use SDXL ControlNet (if you have 8GB+ VRAM) and compare quality to SD 1.5.