Image Processing Basics: NumPy Arrays, PIL, and Computer Vision Fundamentals
Every image is a NumPy array. Understanding pixels, channels, color spaces, and basic transformations is the foundation of all computer vision work — from preprocessing for deep learning to classical vision algorithms.
# !pip install Pillow opencv-python matplotlib
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image, ImageFilter, ImageEnhance, ImageDraw
from io import BytesIO
import warnings
warnings.filterwarnings('ignore')
try:
import cv2
HAS_CV2 = True
except ImportError:
HAS_CV2 = False
print('OpenCV not installed β using PIL only (pip install opencv-python)')
# Create a synthetic test image (no external files needed)
def create_synthetic_image(size=(256, 256)):
"""Create a colorful test image with geometric shapes."""
img = np.zeros((size[0], size[1], 3), dtype=np.uint8)
# Background gradient
for i in range(size[0]):
img[i, :, 0] = int(255 * i / size[0]) # Red gradient
# Blue rectangle
img[50:120, 50:150, 2] = 200
img[50:120, 50:150, 0] = 50
# Green circle (approximated with array ops)
cy, cx, r = 180, 180, 40
y, x = np.ogrid[:size[0], :size[1]]
mask = (x - cx)**2 + (y - cy)**2 <= r**2
img[mask, 1] = 220
img[mask, 0] = 30
img[mask, 2] = 30
# White stripe
img[120:130, :, :] = 255
return img
# Materialize the demo image once; both the NumPy view and the PIL wrapper
# are reused by every section below.
img_array = create_synthetic_image()
pil_image = Image.fromarray(img_array)

# Print a quick summary of the array's geometry and value range.
for line in (
    f'Image array shape: {img_array.shape} β (height, width, channels)',
    f'Data type: {img_array.dtype}',
    f'Value range: [{img_array.min()}, {img_array.max()}]',
    f'Total pixels: {img_array.shape[0] * img_array.shape[1]:,}',
):
    print(line)
1. Images as NumPy Arrays
# Section 1 figure: an image is just a (H, W, 3) uint8 array.
# Top row: full RGB image + the three individual channels.
# Bottom row: grayscale, per-channel histogram, and slicing/cropping.
fig, axes = plt.subplots(2, 4, figsize=(16, 8))

# Full image
axes[0, 0].imshow(img_array)
axes[0, 0].set_title(f'Original RGB\n{img_array.shape}')
axes[0, 0].axis('off')

# Individual channels, each rendered with a matching monochrome colormap
for i, (channel, color, cmap) in enumerate([
    (img_array[:,:,0], 'Red', 'Reds'),
    (img_array[:,:,1], 'Green', 'Greens'),
    (img_array[:,:,2], 'Blue', 'Blues'),
]):
    axes[0, i+1].imshow(channel, cmap=cmap)
    axes[0, i+1].set_title(f'{color} Channel\nrange [{channel.min()},{channel.max()}]')
    axes[0, i+1].axis('off')

# Grayscale conversion — ITU-R BT.601 luma weights via a dot product
gray = np.dot(img_array[..., :3], [0.299, 0.587, 0.114]).astype(np.uint8)
axes[1, 0].imshow(gray, cmap='gray')
axes[1, 0].set_title('Grayscale\n(weighted average)')
axes[1, 0].axis('off')

# Pixel histogram — alpha blending keeps all three channels visible
axes[1, 1].hist(img_array[:,:,0].flatten(), bins=50, color='red', alpha=0.5, label='R')
axes[1, 1].hist(img_array[:,:,1].flatten(), bins=50, color='green', alpha=0.5, label='G')
axes[1, 1].hist(img_array[:,:,2].flatten(), bins=50, color='blue', alpha=0.5, label='B')
axes[1, 1].set_title('Pixel Value Histogram')
axes[1, 1].legend()

# Pixel access — left panel shows the full image for context, right panel
# shows the sub-array produced by plain NumPy slicing.
axes[1, 2].imshow(img_array)
axes[1, 2].set_title('Pixel crop [50:150, 50:150]')
axes[1, 2].axis('off')
crop = img_array[50:150, 50:150]
axes[1, 3].imshow(crop)
axes[1, 3].set_title(f'Cropped region\n{crop.shape}')
axes[1, 3].axis('off')
plt.tight_layout()
plt.show()

# Cheat-sheet prints. NOTE: these were needless f-strings (no placeholders,
# ruff F541) — the f prefixes are dropped; string contents are unchanged.
print('Key array operations:')
print(' img[y, x, channel] β pixel at (x,y), channel 0=R, 1=G, 2=B')
print(' img[50:150, 50:150] β crop (y slice, x slice)')
print(' img[:, :, 0] β red channel')
print(' img[::-1, :, :] β vertical flip')
2. Color Spaces — RGB, HSV, Grayscale, Lab
def rgb_to_hsv(img: np.ndarray) -> np.ndarray:
    """Manual RGB β HSV conversion, written out for understanding.

    Args:
        img: (H, W, 3) RGB image; uint8 values in [0, 255].

    Returns:
        (H, W, 3) float array stacked as [hue, saturation, value] where
        hue is in degrees [0, 360) and saturation/value are in [0, 1].
        Achromatic pixels (delta == 0) get hue 0; black gets saturation 0.
    """
    img_float = img / 255.0
    r, g, b = img_float[:, :, 0], img_float[:, :, 1], img_float[:, :, 2]
    cmax = np.maximum(np.maximum(r, g), b)
    cmin = np.minimum(np.minimum(r, g), b)
    delta = cmax - cmin
    # Value = brightness = the largest channel
    v = cmax
    # The divisions below hit zero denominators for black / achromatic
    # pixels.  Those lanes are discarded by np.where and the boolean masks,
    # but NumPy would still emit RuntimeWarnings — suppress them locally
    # instead of relying on the notebook-global warning filter.
    with np.errstate(divide='ignore', invalid='ignore'):
        # Saturation: chroma relative to brightness (0 where cmax == 0)
        s = np.where(cmax > 0, delta / cmax, 0)
        # Hue: sextant formula — which 60-degree slice, picked by the
        # dominant channel.  Masks are hoisted so each is computed once.
        h = np.zeros_like(r)
        m = delta > 0
        mask_r = m & (cmax == r)
        mask_g = m & (cmax == g)
        mask_b = m & (cmax == b)
        h[mask_r] = 60 * (((g - b) / delta) % 6)[mask_r]
        h[mask_g] = 60 * (((b - r) / delta) + 2)[mask_g]
        h[mask_b] = 60 * (((r - g) / delta) + 4)[mask_b]
    return np.stack([h, s, v], axis=2)
# Convert the demo image and display each HSV channel next to the RGB
# original plus a PIL grayscale conversion for comparison.
hsv = rgb_to_hsv(img_array)
fig, axes = plt.subplots(1, 5, figsize=(18, 4))
axes[0].imshow(img_array)
axes[0].set_title('Original RGB')
axes[0].axis('off')
# HSV channels
for i, (channel, title, cmap) in enumerate([
    (hsv[:,:,0], 'Hue (color)', 'hsv'),
    (hsv[:,:,1], 'Saturation (color purity)', 'gray'),
    (hsv[:,:,2], 'Value (brightness)', 'gray'),
]):
    # The cyclic 'hsv' colormap suits the angular hue channel;
    # plain grayscale suits the [0, 1] saturation and value channels.
    axes[i+1].imshow(channel, cmap=cmap)
    axes[i+1].set_title(title)
    axes[i+1].axis('off')
# Grayscale — PIL 'L' mode is single-channel 8-bit luminance
gray_pil = pil_image.convert('L')
axes[4].imshow(np.array(gray_pil), cmap='gray')
axes[4].set_title('Grayscale (PIL)')
axes[4].axis('off')
plt.suptitle('Color Space Representations')
plt.tight_layout()
plt.show()
# Quick reference for picking a color space per task.
print('Color space guide:')
print(' RGB: Standard display format. Each channel 0-255.')
print(' HSV: Hue (color), Saturation (purity), Value (brightness) β easier for color filtering')
print(' Lab: Perceptually uniform β L*=lightness, a*=green-red, b*=blue-yellow')
print(' Gray: Single channel β luminance = 0.299R + 0.587G + 0.114B')
3. Transformations — Preprocessing for Deep Learning
# Common preprocessing steps before feeding images to a CNN — one panel
# per transformation, using the same source image throughout.
fig, axes = plt.subplots(2, 4, figsize=(16, 8))

# 1. Resize — CNNs expect a fixed spatial input size
resized = np.array(pil_image.resize((64, 64)))
axes[0, 0].imshow(resized)
axes[0, 0].set_title('Resized (64Γ64)\nfor CNN input')
axes[0, 0].axis('off')

# 2. Normalize to [0, 1] (note: title was a needless f-string, prefix dropped)
normalized = img_array / 255.0
axes[0, 1].imshow(normalized)
axes[0, 1].set_title('Normalized [0,1]\nfor neural nets')
axes[0, 1].axis('off')

# 3. Z-score normalization (ImageNet stats) — standard for pretrained models
imagenet_mean = np.array([0.485, 0.456, 0.406])
imagenet_std = np.array([0.229, 0.224, 0.225])
standardized = (normalized - imagenet_mean) / imagenet_std
# Clip for display — standardized values fall outside [0, 1]
axes[0, 2].imshow(np.clip(standardized * 0.5 + 0.5, 0, 1))
axes[0, 2].set_title('ImageNet Normalized\n(ΞΌ,Ο standardized)')
axes[0, 2].axis('off')

# 4. Horizontal flip — just reverse the column axis
flipped = img_array[:, ::-1, :]
axes[0, 3].imshow(flipped)
axes[0, 3].set_title('Horizontal Flip\n(data augmentation)')
axes[0, 3].axis('off')

# 5. Rotation (PIL) — expand=False keeps the original canvas size
rotated = np.array(pil_image.rotate(15, expand=False))
axes[1, 0].imshow(rotated)
axes[1, 0].set_title('Rotated 15Β°')
axes[1, 0].axis('off')

# 6. Color jitter — work in float32 so the boost doesn't wrap uint8 math
jittered = img_array.copy().astype(np.float32)
jittered[:, :, 0] *= 1.3  # Boost red
jittered = np.clip(jittered, 0, 255).astype(np.uint8)
axes[1, 1].imshow(jittered)
axes[1, 1].set_title('Color Jitter\n(brightness/contrast aug.)')
axes[1, 1].axis('off')

# 7. Gaussian blur
blurred = np.array(pil_image.filter(ImageFilter.GaussianBlur(radius=3)))
axes[1, 2].imshow(blurred)
axes[1, 2].set_title('Gaussian Blur\n(smoothing)')
axes[1, 2].axis('off')

# 8. Random crop (offsets fixed here so the figure is reproducible)
crop_y, crop_x = 30, 30
cropped = img_array[crop_y:crop_y+180, crop_x:crop_x+180]  # fixed typo: was 'croped'
axes[1, 3].imshow(cropped)
axes[1, 3].set_title('Random Crop\n(common augmentation)')
axes[1, 3].axis('off')

plt.suptitle('Image Preprocessing Transformations', fontsize=14)
plt.tight_layout()
plt.show()

# Equivalent torchvision pipeline for reference (printed verbatim).
print('PyTorch/torchvision augmentation pipeline:')
print("""
transforms.Compose([
transforms.Resize(256),
transforms.RandomCrop(224),
transforms.RandomHorizontalFlip(p=0.5),
transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
transforms.ToTensor(),
transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
])""")
4. Classical CV — Edge Detection & Filtering
# Float32 grayscale working copy — uint8 would overflow/clip during the
# signed convolution arithmetic below.
gray = np.array(pil_image.convert('L')).astype(np.float32)
# Sobel edge detection (manual convolution)
def convolve2d(img: np.ndarray, kernel: np.ndarray) -> np.ndarray:
    """Valid-mode 2D sliding-window filtering (no padding).

    Note: the kernel is applied WITHOUT flipping, so this is strictly
    cross-correlation rather than mathematical convolution — which is
    exactly what CNN layers compute, and identical for the symmetric
    Gaussian/Laplacian kernels used below.

    Args:
        img: 2D array of shape (H, W).
        kernel: 2D array of shape (kh, kw) with kh <= H and kw <= W.

    Returns:
        float64 array of shape (H - kh + 1, W - kw + 1); each entry is the
        dot product of the kernel with the window anchored at that offset.
    """
    # Vectorized replacement for the original O(H*W*kh*kw) Python double
    # loop: build a zero-copy (out_h, out_w, kh, kw) view of every sliding
    # window, then contract each window against the kernel in one einsum.
    windows = np.lib.stride_tricks.sliding_window_view(img, kernel.shape)
    out = np.einsum('ijkl,kl->ij', windows, kernel)
    # The loop version accumulated into float64 zeros; keep that contract.
    return out.astype(np.float64, copy=False)
# Kernels
# Sobel operators: smoothed first derivatives — sobel_x responds to
# horizontal intensity change (vertical edges), sobel_y to vertical change.
sobel_x = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]])
sobel_y = np.array([[-1,-2,-1], [ 0, 0, 0], [ 1, 2, 1]])
# Binomial approximation of a Gaussian; /16 makes the weights sum to 1
# so overall brightness is preserved.
gaussian = np.array([[1,2,1],[2,4,2],[1,2,1]]) / 16
# Discrete Laplacian: second derivative, responds to edges of any
# orientation (and to noise — hence the smoothing pairing in practice).
laplacian = np.array([[0,1,0],[1,-4,1],[0,1,0]])
# Apply each classic kernel to the grayscale image and show the results
# side by side: original, Sobel gradients, magnitude, smoothing, Laplacian.
fig, axes = plt.subplots(2, 3, figsize=(15, 9))
axes[0, 0].imshow(gray, cmap='gray')
axes[0, 0].set_title('Grayscale Original')
axes[0, 0].axis('off')
# Sobel edges
gx = convolve2d(gray, sobel_x)  # horizontal gradient (vertical edges)
gy = convolve2d(gray, sobel_y)  # vertical gradient (horizontal edges)
edges = np.sqrt(gx**2 + gy**2)  # gradient magnitude per pixel
edges_normalized = (edges / edges.max() * 255).astype(np.uint8)  # rescale for display
# Diverging colormap: signed responses — red/blue show edge polarity
axes[0, 1].imshow(gx, cmap='RdBu')
axes[0, 1].set_title('Sobel X (vertical edges)')
axes[0, 1].axis('off')
axes[0, 2].imshow(gy, cmap='RdBu')
axes[0, 2].set_title('Sobel Y (horizontal edges)')
axes[0, 2].axis('off')
axes[1, 0].imshow(edges_normalized, cmap='gray')
axes[1, 0].set_title('Edge Magnitude (βGxΒ²+GyΒ²)')
axes[1, 0].axis('off')
# Low-pass filtering with the normalized Gaussian kernel
blurred = convolve2d(gray, gaussian)
axes[1, 1].imshow(blurred, cmap='gray')
axes[1, 1].set_title('Gaussian Smoothing')
axes[1, 1].axis('off')
# Laplacian responses are signed; abs() shows edge strength only
laplace_result = np.abs(convolve2d(gray, laplacian))
axes[1, 2].imshow(laplace_result, cmap='gray')
axes[1, 2].set_title('Laplacian (edge sharpness)')
axes[1, 2].axis('off')
plt.suptitle('Convolution Kernels β the Building Blocks of CNNs', fontsize=13)
plt.tight_layout()
plt.show()
# Takeaway prints tying classical kernels to learned CNN filters.
print('Kernel intuition:')
print(' Gaussian smoothing: reduces noise (low-pass filter)')
print(' Sobel X/Y: detects horizontal/vertical edges')
print(' Laplacian: detects all edges, sensitive to noise')
print()
print('CNNs learn these kernels automatically during training.')
print('Early layers β edge detectors, deeper layers β complex patterns.')
Image Processing Cheat Sheet
Operation NumPy / PIL OpenCV
────────────────────────────────────────────────────────────────────
Load image np.array(Image.open(path)) cv2.imread(path)
Save image Image.fromarray(arr).save() cv2.imwrite(path, img)
Resize img.resize((W, H)) cv2.resize(img, (W, H))
Grayscale img.convert('L') cv2.cvtColor(img, COLOR_BGR2GRAY)
Flip horizontal img[:, ::-1, :] cv2.flip(img, 1)
Rotate img.rotate(angle) cv2.rotate(img, ...)
Normalize [0,1] img.astype(float) / 255 same
Blur ImageFilter.GaussianBlur(r) cv2.GaussianBlur(img, (k,k), σ)
Edge detect manual Sobel cv2.Canny(img, low, high)
⚠️ OpenCV loads images as BGR, not RGB!
Fix: img = cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2RGB)
For deep learning input:
1. Resize to model's expected size (e.g., 224×224 for ResNet)
2. Convert to float32 and normalize to [0, 1]
3. Standardize: (pixel - mean) / std (use ImageNet stats for pretrained models)
4. Add batch dimension: img[np.newaxis, ...] → (1, H, W, C)
5. Transpose for PyTorch: (H, W, C) → (C, H, W)
Exercises
Load any JPEG from disk and plot its RGB histogram — what does the histogram tell you about the image?
Implement a sharpening kernel manually (unsharp masking: original + α * (original - blurred)).
Detect all pixels where the hue is in the "red" range in HSV — create a binary mask.
Compare PIL's Image.resize with nearest-neighbor vs LANCZOS antialiasing — which is better for upscaling?
Write a function that tiles 9 random crops of a single image in a 3×3 grid (data augmentation preview).