Image Processing Basics: NumPy Arrays, PIL, and Computer Vision Fundamentals

Every image is a NumPy array. Understanding pixels, channels, color spaces, and basic transformations is the foundation of all computer vision work — from preprocessing for deep learning to classical vision algorithms.

# !pip install Pillow opencv-python matplotlib
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image, ImageFilter, ImageEnhance, ImageDraw
from io import BytesIO
import warnings
warnings.filterwarnings('ignore')

# Optional OpenCV dependency: fall back gracefully to PIL-only mode when
# opencv-python is not installed.
try:
    import cv2
    HAS_CV2 = True
except ImportError:
    HAS_CV2 = False
    print('OpenCV not installed — using PIL only (pip install opencv-python)')

# Create a synthetic test image (no external files needed)
def create_synthetic_image(size=(256, 256)):
    """Create a colorful test image with geometric shapes."""
    img = np.zeros((size[0], size[1], 3), dtype=np.uint8)
    
    # Background gradient
    for i in range(size[0]):
        img[i, :, 0] = int(255 * i / size[0])  # Red gradient
    
    # Blue rectangle
    img[50:120, 50:150, 2] = 200
    img[50:120, 50:150, 0] = 50
    
    # Green circle (approximated with array ops)
    cy, cx, r = 180, 180, 40
    y, x = np.ogrid[:size[0], :size[1]]
    mask = (x - cx)**2 + (y - cy)**2 <= r**2
    img[mask, 1] = 220
    img[mask, 0] = 30
    img[mask, 2] = 30
    
    # White stripe
    img[120:130, :, :] = 255
    
    return img

# Build the demo image once; keep both the NumPy array and a PIL view of it.
img_array = create_synthetic_image()
pil_image = Image.fromarray(img_array)

print(f'Image array shape: {img_array.shape}  → (height, width, channels)')
print(f'Data type: {img_array.dtype}')
print(f'Value range: [{img_array.min()}, {img_array.max()}]')
print(f'Total pixels: {img_array.shape[0] * img_array.shape[1]:,}')

1. Images as NumPy Arrays

# --- Section 1 figure: RGB image anatomy -------------------------------------
fig, axes = plt.subplots(2, 4, figsize=(16, 8))

# Full-color image
axes[0, 0].imshow(img_array)
axes[0, 0].set_title(f'Original RGB\n{img_array.shape}')
axes[0, 0].axis('off')

# One panel per color channel, rendered with a matching colormap
for i, (channel, color, cmap) in enumerate([
    (img_array[:, :, 0], 'Red',   'Reds'),
    (img_array[:, :, 1], 'Green', 'Greens'),
    (img_array[:, :, 2], 'Blue',  'Blues'),
]):
    axes[0, i + 1].imshow(channel, cmap=cmap)
    axes[0, i + 1].set_title(f'{color} Channel\nrange [{channel.min()},{channel.max()}]')
    axes[0, i + 1].axis('off')

# Grayscale via the ITU-R BT.601 luma weights
gray = np.dot(img_array[..., :3], [0.299, 0.587, 0.114]).astype(np.uint8)
axes[1, 0].imshow(gray, cmap='gray')
axes[1, 0].set_title('Grayscale\n(weighted average)')
axes[1, 0].axis('off')

# Per-channel histogram of pixel intensities
axes[1, 1].hist(img_array[:, :, 0].flatten(), bins=50, color='red', alpha=0.5, label='R')
axes[1, 1].hist(img_array[:, :, 1].flatten(), bins=50, color='green', alpha=0.5, label='G')
axes[1, 1].hist(img_array[:, :, 2].flatten(), bins=50, color='blue', alpha=0.5, label='B')
axes[1, 1].set_title('Pixel Value Histogram')
axes[1, 1].legend()

# Cropping is just 2D slicing on the first two (y, x) axes
axes[1, 2].imshow(img_array)
axes[1, 2].set_title('Pixel crop [50:150, 50:150]')
axes[1, 2].axis('off')
crop = img_array[50:150, 50:150]
axes[1, 3].imshow(crop)
axes[1, 3].set_title(f'Cropped region\n{crop.shape}')
axes[1, 3].axis('off')

plt.tight_layout()
plt.show()

print('Key array operations:')
print('  img[y, x, channel]  →  pixel at (x,y), channel 0=R, 1=G, 2=B')
print('  img[50:150, 50:150] →  crop (y slice, x slice)')
print('  img[:, :, 0]        →  red channel')
print('  img[::-1, :, :]     →  vertical flip')

2. Color Spaces — RGB, HSV, Grayscale, Lab

def rgb_to_hsv(img: np.ndarray) -> np.ndarray:
    """Manual RGB → HSV conversion, for understanding.

    Parameters
    ----------
    img : np.ndarray
        uint8 RGB image of shape (H, W, 3).

    Returns
    -------
    np.ndarray
        Float HSV image of shape (H, W, 3): H in [0, 360) degrees,
        S in [0, 1], V in [0, 1].
    """
    img_float = img / 255.0
    r, g, b = img_float[:, :, 0], img_float[:, :, 1], img_float[:, :, 2]

    cmax = np.maximum(np.maximum(r, g), b)
    cmin = np.minimum(np.minimum(r, g), b)
    delta = cmax - cmin

    # Value: the brightest channel.
    v = cmax

    # Saturation: chroma relative to value (0 for pure black).
    # Use a safe denominator so no divide-by-zero warning is ever raised,
    # rather than relying on a global warnings filter.
    s = np.where(cmax > 0, delta / np.where(cmax > 0, cmax, 1), 0)

    # Hue: piecewise formula depending on which channel is the max.
    # safe_delta avoids 0/0 where delta == 0; those pixels are excluded
    # by the mask `m` and keep hue 0.
    safe_delta = np.where(delta > 0, delta, 1)
    h = np.zeros_like(r)
    m = delta > 0
    mr = m & (cmax == r)
    mg = m & (cmax == g)
    mb = m & (cmax == b)
    h[mr] = (60 * (((g - b) / safe_delta) % 6))[mr]
    h[mg] = (60 * (((b - r) / safe_delta) + 2))[mg]
    h[mb] = (60 * (((r - g) / safe_delta) + 4))[mb]

    return np.stack([h, s, v], axis=2)

# --- Section 2 figure: the same image in different color spaces --------------
hsv = rgb_to_hsv(img_array)

fig, axes = plt.subplots(1, 5, figsize=(18, 4))

axes[0].imshow(img_array)
axes[0].set_title('Original RGB')
axes[0].axis('off')

# Hue / saturation / value panels from the manual conversion above
for i, (channel, title, cmap) in enumerate([
    (hsv[:, :, 0], 'Hue (color)', 'hsv'),
    (hsv[:, :, 1], 'Saturation (color purity)', 'gray'),
    (hsv[:, :, 2], 'Value (brightness)', 'gray'),
]):
    axes[i + 1].imshow(channel, cmap=cmap)
    axes[i + 1].set_title(title)
    axes[i + 1].axis('off')

# PIL's built-in luminance conversion for comparison
gray_pil = pil_image.convert('L')
axes[4].imshow(np.array(gray_pil), cmap='gray')
axes[4].set_title('Grayscale (PIL)')
axes[4].axis('off')

plt.suptitle('Color Space Representations')
plt.tight_layout()
plt.show()

print('Color space guide:')
print('  RGB:  Standard display format. Each channel 0-255.')
print('  HSV:  Hue (color), Saturation (purity), Value (brightness) — easier for color filtering')
print('  Lab:  Perceptually uniform — L*=lightness, a*=green-red, b*=blue-yellow')
print('  Gray: Single channel — luminance = 0.299R + 0.587G + 0.114B')

3. Transformations — Preprocessing for Deep Learning

# Common preprocessing steps before feeding images to a CNN

fig, axes = plt.subplots(2, 4, figsize=(16, 8))

# 1. Resize to the model's expected input size
resized = np.array(pil_image.resize((64, 64)))
axes[0, 0].imshow(resized)
axes[0, 0].set_title('Resized (64×64)\nfor CNN input')
axes[0, 0].axis('off')

# 2. Scale pixel values into [0, 1]
normalized = img_array / 255.0
axes[0, 1].imshow(normalized)
axes[0, 1].set_title('Normalized [0,1]\nfor neural nets')
axes[0, 1].axis('off')

# 3. Z-score standardization using ImageNet channel statistics
imagenet_mean = np.array([0.485, 0.456, 0.406])
imagenet_std  = np.array([0.229, 0.224, 0.225])
standardized = (normalized - imagenet_mean) / imagenet_std
# Rescale into [0, 1] only so imshow can display the standardized values
axes[0, 2].imshow(np.clip(standardized * 0.5 + 0.5, 0, 1))
axes[0, 2].set_title('ImageNet Normalized\n(μ,σ standardized)')
axes[0, 2].axis('off')

# 4. Horizontal flip: reverse the width (second) axis
flipped = img_array[:, ::-1, :]
axes[0, 3].imshow(flipped)
axes[0, 3].set_title('Horizontal Flip\n(data augmentation)')
axes[0, 3].axis('off')

# 5. Rotation (PIL keeps the original canvas size with expand=False)
rotated = np.array(pil_image.rotate(15, expand=False))
axes[1, 0].imshow(rotated)
axes[1, 0].set_title('Rotated 15°')
axes[1, 0].axis('off')

# 6. Color jitter: boost the red channel in float space, clip back to uint8
jittered = img_array.copy().astype(np.float32)
jittered[:, :, 0] *= 1.3  # Boost red
jittered = np.clip(jittered, 0, 255).astype(np.uint8)
axes[1, 1].imshow(jittered)
axes[1, 1].set_title('Color Jitter\n(brightness/contrast aug.)')
axes[1, 1].axis('off')

# 7. Gaussian blur
blurred = np.array(pil_image.filter(ImageFilter.GaussianBlur(radius=3)))
axes[1, 2].imshow(blurred)
axes[1, 2].set_title('Gaussian Blur\n(smoothing)')
axes[1, 2].axis('off')

# 8. Fixed-offset crop (randomized in real augmentation pipelines)
crop_y, crop_x = 30, 30
cropped = img_array[crop_y:crop_y + 180, crop_x:crop_x + 180]
axes[1, 3].imshow(cropped)
axes[1, 3].set_title('Random Crop\n(common augmentation)')
axes[1, 3].axis('off')

plt.suptitle('Image Preprocessing Transformations', fontsize=14)
plt.tight_layout()
plt.show()

# Reference torchvision preprocessing pipeline (printed as a snippet,
# not executed — torchvision is not imported here).
pipeline_snippet = """
transforms.Compose([
    transforms.Resize(256),
    transforms.RandomCrop(224),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])"""
print('PyTorch/torchvision augmentation pipeline:')
print(pipeline_snippet)

4. Classical CV — Edge Detection & Filtering

# Float32 grayscale copy of the demo image, used by the convolution demos below.
gray = np.array(pil_image.convert('L')).astype(np.float32)

# Sobel edge detection (manual convolution)
# Sobel edge detection (manual convolution)
def convolve2d(img: np.ndarray, kernel: np.ndarray) -> np.ndarray:
    """Valid-mode 2D "convolution" without padding.

    NOTE: like CNN layers, this does not flip the kernel, so it is
    technically cross-correlation; the result is identical for the
    symmetric kernels used here.

    Parameters
    ----------
    img : np.ndarray
        2D input of shape (H, W).
    kernel : np.ndarray
        2D kernel of shape (kh, kw), no larger than `img`.

    Returns
    -------
    np.ndarray
        float64 output of shape (H - kh + 1, W - kw + 1).
    """
    # sliding_window_view exposes every (kh, kw) patch as a zero-copy view,
    # replacing the O(H*W) Python double loop with one vectorized reduction.
    windows = np.lib.stride_tricks.sliding_window_view(img, kernel.shape)
    # Cast to float64 so the result dtype matches the original np.zeros output.
    return (windows.astype(np.float64) * kernel).sum(axis=(-2, -1))

# Classic 3x3 kernels
sobel_x = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]])  # horizontal gradient → responds to vertical edges
sobel_y = np.array([[-1,-2,-1], [ 0, 0, 0], [ 1, 2, 1]])  # vertical gradient → responds to horizontal edges
gaussian = np.array([[1,2,1],[2,4,2],[1,2,1]]) / 16  # blur; weights sum to 1 (low-pass)
laplacian = np.array([[0,1,0],[1,-4,1],[0,1,0]])  # discrete second derivative; edges in all directions

# --- Section 4 figure: hand-rolled kernels applied to the grayscale image ----
fig, axes = plt.subplots(2, 3, figsize=(15, 9))

axes[0, 0].imshow(gray, cmap='gray')
axes[0, 0].set_title('Grayscale Original')
axes[0, 0].axis('off')

# Directional gradients, then the gradient magnitude for an edge map
gx = convolve2d(gray, sobel_x)
gy = convolve2d(gray, sobel_y)
edges = np.sqrt(gx**2 + gy**2)
edges_normalized = (edges / edges.max() * 255).astype(np.uint8)

axes[0, 1].imshow(gx, cmap='RdBu')
axes[0, 1].set_title('Sobel X (vertical edges)')
axes[0, 1].axis('off')

axes[0, 2].imshow(gy, cmap='RdBu')
axes[0, 2].set_title('Sobel Y (horizontal edges)')
axes[0, 2].axis('off')

axes[1, 0].imshow(edges_normalized, cmap='gray')
axes[1, 0].set_title('Edge Magnitude (√Gx²+Gy²)')
axes[1, 0].axis('off')

# Low-pass (blur) and second-derivative (Laplacian) responses
blurred = convolve2d(gray, gaussian)
axes[1, 1].imshow(blurred, cmap='gray')
axes[1, 1].set_title('Gaussian Smoothing')
axes[1, 1].axis('off')

laplace_result = np.abs(convolve2d(gray, laplacian))
axes[1, 2].imshow(laplace_result, cmap='gray')
axes[1, 2].set_title('Laplacian (edge sharpness)')
axes[1, 2].axis('off')

plt.suptitle('Convolution Kernels — the Building Blocks of CNNs', fontsize=13)
plt.tight_layout()
plt.show()

print('Kernel intuition:')
print('  Gaussian smoothing: reduces noise (low-pass filter)')
# Note: Sobel X responds to VERTICAL edges, Sobel Y to HORIZONTAL ones
# (matches the panel titles above).
print('  Sobel X/Y: detects vertical/horizontal edges')
print('  Laplacian: detects all edges, sensitive to noise')
print()
print('CNNs learn these kernels automatically during training.')
print('Early layers ≈ edge detectors, deeper layers ≈ complex patterns.')

Image Processing Cheat Sheet

Operation                 NumPy / PIL                  OpenCV
────────────────────────────────────────────────────────────────────
Load image                np.array(Image.open(path))   cv2.imread(path)
Save image                Image.fromarray(arr).save()  cv2.imwrite(path, img)
Resize                    img.resize((W, H))            cv2.resize(img, (W, H))
Grayscale                 img.convert('L')              cv2.cvtColor(img, COLOR_BGR2GRAY)
Flip horizontal           img[:, ::-1, :]              cv2.flip(img, 1)
Rotate                    img.rotate(angle)             cv2.rotate(img, ...)
Normalize [0,1]           img.astype(float) / 255       same
Blur                      ImageFilter.GaussianBlur(r)   cv2.GaussianBlur(img, (k,k), σ)
Edge detect               manual Sobel                  cv2.Canny(img, low, high)

⚠️  OpenCV loads images as BGR, not RGB!
   Fix: img = cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2RGB)

For deep learning input:
  1. Resize to model's expected size (e.g., 224×224 for ResNet)
  2. Convert to float32 and normalize to [0, 1]
  3. Standardize: (pixel - mean) / std (use ImageNet stats for pretrained models)
  4. Add batch dimension: img[np.newaxis, ...]  → (1, H, W, C)
  5. Transpose for PyTorch: (H, W, C) → (C, H, W)

Exercises

  1. Load any JPEG from disk and plot its RGB histogram — what does the histogram tell you about the image?

  2. Implement a sharpening kernel manually (unsharp masking: original + α * (original - blurred)).

  3. Detect all pixels where the hue is in the 'red' range in HSV — create a binary mask.

  4. Compare PIL's Image.resize with nearest-neighbor vs LANCZOS antialiasing — which is better for upscaling?

  5. Write a function that tiles 9 random crops of a single image in a 3×3 grid (data augmentation preview).