# Setup
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC, LinearSVC
from sklearn.datasets import make_classification, make_circles, make_moons
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from matplotlib.colors import ListedColormap

# Notebook-wide configuration: consistent plot styling, reproducible
# randomness, and compact numpy printing for the numeric output below.
sns.set_style('whitegrid')
np.random.seed(42)
np.set_printoptions(precision=4, suppress=True)

12.1 Separating Hyperplanes

Binary Classification:ΒΆ

Given data \(\{(\mathbf{x}_n, y_n)\}_{n=1}^N\) where \(y_n \in \{-1, +1\}\)

Goal: Find hyperplane that separates classes:

\[\mathbf{w}^T\mathbf{x} + b = 0\]

Decision Function:ΒΆ

\[f(\mathbf{x}) = \text{sign}(\mathbf{w}^T\mathbf{x} + b)\]

where:

  • w: weight vector (normal to hyperplane)

  • b: bias (offset from origin)

  • sign(·): Returns +1 or -1

Geometric Interpretation:ΒΆ

  • w points perpendicular to decision boundary

  • Distance from origin: \(|b| / ||w||\)

  • Distance from point x to hyperplane: \(|w^T x + b| / ||w||\)

# Generate linearly separable data

np.random.seed(42)

def generate_separable_data(n_samples=100, offset=2.0):
    """Generate linearly separable 2D data as two Gaussian clusters.

    Parameters
    ----------
    n_samples : int
        Total number of points; split evenly between the two classes
        (an odd count drops the last point).
    offset : float
        Distance of each cluster centre from the origin along both axes.
        Larger values widen the gap between the classes; the default
        (2.0) reproduces the original hard-coded centres (+-2, +-2).

    Returns
    -------
    X : ndarray of shape (2 * (n_samples // 2), 2)
        Stacked 2D points, positive class first.
    y : ndarray of shape (2 * (n_samples // 2),)
        Labels in {+1.0, -1.0} matching the rows of X.
    """
    half = n_samples // 2
    # Two isotropic unit-variance Gaussian clusters centred symmetrically
    # about the origin; with offset=2 they are (almost surely) separable.
    X1 = np.random.randn(half, 2) + np.array([offset, offset])
    X2 = np.random.randn(half, 2) + np.array([-offset, -offset])

    X = np.vstack([X1, X2])
    y = np.hstack([np.ones(half), -np.ones(half)])

    return X, y

# Build the demo dataset, report its class balance, and draw it; this
# motivates the central question of which separating hyperplane to pick.
X, y = generate_separable_data(n_samples=100)

print("Linearly Separable Data")
print(f"Shape: {X.shape}")
print(f"Classes: {np.unique(y)}")
print(f"Class distribution: {(y == 1).sum()} positive, {(y == -1).sum()} negative")

# Visualize: circles for the +1 class, squares for the -1 class.
plt.figure(figsize=(10, 6))
plt.scatter(X[y == 1, 0], X[y == 1, 1], c='blue', s=50, marker='o', 
           edgecolors='black', label='Class +1')
plt.scatter(X[y == -1, 0], X[y == -1, 1], c='red', s=50, marker='s', 
           edgecolors='black', label='Class -1')
plt.xlabel('x₁', fontsize=12)
plt.ylabel('xβ‚‚', fontsize=12)
plt.title('Linearly Separable Data', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
# Equal aspect ratio so geometric distances (and later, margins) are
# not visually distorted.
plt.axis('equal')
plt.show()

print("\nQuestion: Which hyperplane should we choose?")
print("β†’ Many hyperplanes can separate the data!")
# Show multiple separating hyperplanes

def plot_hyperplane(w, b, ax, color='black', linestyle='-', label=None,
                    x_range=None):
    """Draw the hyperplane w^T x + b = 0 as a line on *ax*.

    Parameters
    ----------
    w : array-like of shape (2,)
        Weight (normal) vector; ``w[1]`` must be non-zero so the line can
        be expressed as x2 = f(x1).
    b : float
        Bias term.
    ax : matplotlib Axes
        Axes to draw on.
    color, linestyle, label
        Passed through to ``ax.plot``.
    x_range : tuple (lo, hi), optional
        Horizontal extent of the drawn line.  When omitted, falls back to
        the extent of the module-level dataset ``X`` padded by 1, which
        preserves the original behaviour of this notebook cell.
    """
    if x_range is None:
        # Original behaviour: span the global dataset X (plus padding).
        x_range = (X[:, 0].min() - 1, X[:, 0].max() + 1)
    x1_range = np.linspace(x_range[0], x_range[1], 100)
    # Solve w1*x1 + w2*x2 + b = 0 for x2:  x2 = -(w1*x1 + b) / w2
    x2_line = -(w[0] * x1_range + b) / w[1]
    ax.plot(x1_range, x2_line, color=color, linestyle=linestyle,
            linewidth=2, label=label)

# Define several separating hyperplanes
# Each tuple is (w, b, color, linestyle, label).  All three lines separate
# the two clusters, illustrating that the separating hyperplane is not
# unique -- which motivates the maximum-margin criterion below.
hyperplanes = [
    (np.array([1, 1]), -0.5, 'blue', '--', 'Hyperplane 1'),
    (np.array([1, 0.5]), -0.2, 'green', '--', 'Hyperplane 2'),
    (np.array([0.8, 1]), -0.3, 'purple', '--', 'Hyperplane 3'),
]

plt.figure(figsize=(10, 6))
plt.scatter(X[y == 1, 0], X[y == 1, 1], c='blue', s=50, marker='o', 
           edgecolors='black', alpha=0.6)
plt.scatter(X[y == -1, 0], X[y == -1, 1], c='red', s=50, marker='s', 
           edgecolors='black', alpha=0.6)

# Overlay each candidate hyperplane on the current axes.
for w, b, color, style, label in hyperplanes:
    plot_hyperplane(w, b, plt.gca(), color=color, linestyle=style, label=label)

plt.xlabel('x₁', fontsize=12)
plt.ylabel('xβ‚‚', fontsize=12)
plt.title('Multiple Separating Hyperplanes', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
# Clip the view to the data extent so the lines do not dominate the plot.
plt.xlim([X[:, 0].min() - 1, X[:, 0].max() + 1])
plt.ylim([X[:, 1].min() - 1, X[:, 1].max() + 1])
plt.show()

print("All hyperplanes separate the data perfectly!")
print("Question: Which one is best?")
print("Answer: The one with maximum margin!")

12.2 Maximum Margin

Margin:ΒΆ

Distance from hyperplane to closest point:

\[\text{margin} = \min_n \frac{|w^T x_n + b|}{||w||}\]

Support Vectors:ΒΆ

Points closest to the hyperplane (on the margin)

SVM Objective:ΒΆ

Maximize margin ≑ Minimize \(||w||^2\) subject to:

\[y_n(w^T x_n + b) \geq 1, \quad \forall n\]

Canonical Hyperplane:ΒΆ

Scale w, b such that:

  • \(w^T x + b = +1\) for support vectors in class +1

  • \(w^T x + b = -1\) for support vectors in class -1

  • Margin = \(2 / ||w||\)

Key: Maximizing margin = minimizing \(||w||^2\)

# Fit linear SVM

# Standardize data for numerical stability
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit SVM.  A very large C heavily penalizes slack, so the soft-margin
# solver effectively behaves like a hard-margin SVM on separable data.
svm = SVC(kernel='linear', C=1000)  # Large C for hard margin
svm.fit(X_scaled, y)

# Get parameters of the learned hyperplane w^T x + b = 0 (scaled space).
w = svm.coef_[0]
b = svm.intercept_[0]

print("Linear SVM Results")
print(f"\nWeight vector w: {w}")
print(f"Bias b: {b}")
print(f"Norm ||w||: {np.linalg.norm(w):.4f}")
# For the canonical hyperplane the margin width is 2 / ||w||.
print(f"Margin: {2 / np.linalg.norm(w):.4f}")
print(f"\nNumber of support vectors: {len(svm.support_)}")

# Identify support vectors: svm.support_ holds indices into the
# (scaled) training data.
support_vectors = X_scaled[svm.support_]
support_vector_labels = y[svm.support_]
# Visualize
def plot_svm_decision_boundary(X, y, svm, scaler):
    """Plot SVM decision boundary and margins."""
    fig, ax = plt.subplots(figsize=(12, 8))
    
    # Create mesh
    h = 0.02
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, h),
                           np.arange(x2_min, x2_max, h))
    
    # Predict on mesh
    X_mesh = np.c_[xx1.ravel(), xx2.ravel()]
    X_mesh_scaled = scaler.transform(X_mesh)
    Z = svm.decision_function(X_mesh_scaled).reshape(xx1.shape)
    
    # Plot decision boundary and margins
    ax.contour(xx1, xx2, Z, levels=[-1, 0, 1], colors=['red', 'black', 'blue'],
              linestyles=['--', '-', '--'], linewidths=[2, 3, 2])
    ax.contourf(xx1, xx2, Z, levels=[-np.inf, 0, np.inf], 
               colors=['#FFAAAA', '#AAAAFF'], alpha=0.3)
    
    # Plot points
    ax.scatter(X[y == 1, 0], X[y == 1, 1], c='blue', s=50, marker='o',
              edgecolors='black', label='Class +1')
    ax.scatter(X[y == -1, 0], X[y == -1, 1], c='red', s=50, marker='s',
              edgecolors='black', label='Class -1')
    
    # Highlight support vectors
    sv_original = scaler.inverse_transform(support_vectors)
    ax.scatter(sv_original[:, 0], sv_original[:, 1], s=200, 
              facecolors='none', edgecolors='green', linewidths=3,
              label='Support Vectors')
    
    ax.set_xlabel('x₁', fontsize=12)
    ax.set_ylabel('xβ‚‚', fontsize=12)
    ax.set_title('SVM: Maximum Margin Classifier', fontsize=14)
    ax.legend(fontsize=11)
    ax.grid(True, alpha=0.3)
    
    return fig

# Render the fitted model and explain the plot's visual conventions.
plot_svm_decision_boundary(X, y, svm, scaler)
plt.show()

print("\nVisualization:")
print("  - Black line: Decision boundary (w^T x + b = 0)")
print("  - Dashed lines: Margins (w^T x + b = Β±1)")
print("  - Green circles: Support vectors (on the margin)")
print("  - Distance between margins: 2/||w||")

12.3 Primal Optimization Problem

Hard Margin SVM (Primal):ΒΆ

\[\min_{\mathbf{w}, b} \frac{1}{2}||\mathbf{w}||^2\]

subject to: \(y_n(\mathbf{w}^T\mathbf{x}_n + b) \geq 1, \quad n = 1, \dots, N\)

This is a quadratic programming problem with linear constraints.

Why \(\frac{1}{2}||w||^2\)?ΒΆ

  • Minimizing \(||w||^2\) ≑ maximizing margin \(2/||w||\)

  • Factor \(\frac{1}{2}\) makes derivative cleaner

  • Quadratic objective β†’ unique global optimum (convex!)

Solution Approach:ΒΆ

Use Lagrange multipliers to convert to dual problem

# Demonstrate convexity of SVM objective

# Objective function: (1/2) ||w||^2, plotted in one dimension where it is
# simply a parabola with its unique minimum at w = 0.
w_range = np.linspace(-3, 3, 100)
objective = 0.5 * w_range**2

plt.figure(figsize=(10, 6))
plt.plot(w_range, objective, 'b-', linewidth=3)
plt.xlabel('w', fontsize=12)
plt.ylabel('(1/2) ||w||Β²', fontsize=12)
plt.title('SVM Objective Function (Convex)', fontsize=14)
plt.grid(True, alpha=0.3)
plt.axhline(y=0, color='black', linewidth=0.5)
plt.axvline(x=0, color='black', linewidth=0.5)

# Mark minimum
plt.plot(0, 0, 'ro', markersize=15, label='Global minimum')
plt.legend(fontsize=11)
plt.show()

print("Convex Optimization:")
print("  - Objective (1/2)||w||Β² is convex (quadratic)")
print("  - Constraints y_n(w^T x_n + b) β‰₯ 1 are linear")
print("  - β†’ Unique global optimum!")
print("  - Can be solved efficiently with QP solvers")

12.4 Dual Problem and Lagrange Multipliers

Lagrangian:ΒΆ

\[\mathcal{L}(\mathbf{w}, b, \boldsymbol{\alpha}) = \frac{1}{2}||\mathbf{w}||^2 - \sum_{n=1}^N \alpha_n[y_n(\mathbf{w}^T\mathbf{x}_n + b) - 1]\]

where \(\alpha_n \geq 0\) are Lagrange multipliers.

KKT Conditions:ΒΆ

  1. \(\frac{\partial \mathcal{L}}{\partial \mathbf{w}} = 0 \Rightarrow \mathbf{w} = \sum_n \alpha_n y_n \mathbf{x}_n\)

  2. \(\frac{\partial \mathcal{L}}{\partial b} = 0 \Rightarrow \sum_n \alpha_n y_n = 0\)

  3. \(\alpha_n[y_n(\mathbf{w}^T\mathbf{x}_n + b) - 1] = 0\) (complementary slackness)

Dual Problem:ΒΆ

\[\max_{\boldsymbol{\alpha}} \sum_{n=1}^N \alpha_n - \frac{1}{2}\sum_{n,m}\alpha_n\alpha_m y_n y_m \mathbf{x}_n^T\mathbf{x}_m\]

subject to: \(\alpha_n \geq 0, \quad \sum_n \alpha_n y_n = 0\)

Key: Solution depends only on inner products \(\mathbf{x}_n^T\mathbf{x}_m\)!

# Examine dual solution

# Get dual coefficients: sklearn stores alpha_n * y_n for each support
# vector, so |dual_coef| recovers the Lagrange multipliers alpha_n.
dual_coef = svm.dual_coef_[0]  # alpha_n * y_n for support vectors
alphas = np.abs(dual_coef)  # |alpha_n|

print("Dual Problem Solution")
print(f"\nSupport vectors: {len(svm.support_)}")
print(f"\nDual coefficients Ξ± (for support vectors):")
for i, (idx, alpha, label) in enumerate(zip(svm.support_, alphas, support_vector_labels)):
    print(f"  SV {i+1} (point {idx}): Ξ± = {alpha:.4f}, y = {label:+.0f}")

# Verify KKT stationarity in b: sum over n of alpha_n * y_n must be 0.
print(f"\nKKT Condition: Ξ£ Ξ±_n y_n = 0")
print(f"  Ξ£ Ξ±_n y_n = {np.sum(dual_coef):.6f} β‰ˆ 0 βœ“")

# Verify w = sum_n alpha_n y_n x_n (stationarity in w); only support
# vectors contribute because all other alpha_n are zero.
w_from_dual = np.sum(dual_coef[:, np.newaxis] * support_vectors, axis=0)
print(f"\nWeight vector from dual:")
print(f"  w (from dual) = {w_from_dual}")
print(f"  w (from svm)  = {w}")
print(f"  Match: {np.allclose(w, w_from_dual)} βœ“")

# Complementary slackness: alpha_n > 0 only for points exactly on the
# margin, so y_n (w^T x_n + b) should equal 1 for every support vector.
X_scaled_sv = X_scaled[svm.support_]
y_sv = y[svm.support_]
decision_values_sv = X_scaled_sv @ w + b

print(f"\nComplementary Slackness:")
print(f"  For support vectors: y_n(w^T x_n + b) should equal 1")
for i in range(min(5, len(support_vectors))):
    margin_val = y_sv[i] * decision_values_sv[i]
    print(f"    SV {i+1}: y * (w^T x + b) = {margin_val:.4f} β‰ˆ 1 βœ“")

12.5 Kernel Trick

Motivation:ΒΆ

Data may not be linearly separable in original space, but linearly separable in higher-dimensional space!

Feature Map:ΒΆ

\[\phi: \mathbb{R}^d \rightarrow \mathbb{R}^D, \quad D \gg d\]

Kernel Function:ΒΆ

Instead of computing \(\phi(\mathbf{x})^T\phi(\mathbf{x}')\) explicitly, use kernel:

\[k(\mathbf{x}, \mathbf{x}') = \phi(\mathbf{x})^T\phi(\mathbf{x}')\]

Common Kernels:ΒΆ

  1. Linear: \(k(\mathbf{x}, \mathbf{x}') = \mathbf{x}^T\mathbf{x}'\)

  2. Polynomial: \(k(\mathbf{x}, \mathbf{x}') = (\mathbf{x}^T\mathbf{x}' + c)^d\)

  3. RBF (Gaussian): \(k(\mathbf{x}, \mathbf{x}') = \exp(-\gamma||\mathbf{x} - \mathbf{x}'||^2)\)

Key: Can work in infinite-dimensional space without computing \(\phi\) explicitly!

# Generate non-linearly separable data (XOR pattern)

np.random.seed(42)

# Label is +1 when the point lies in quadrants II or IV (exactly one
# coordinate positive), -1 otherwise -- the classic XOR layout that no
# single line can separate.
X_xor = np.random.randn(200, 2)
y_xor = np.logical_xor(X_xor[:, 0] > 0, X_xor[:, 1] > 0)
y_xor = np.where(y_xor, 1, -1)

# Add some noise: flip the label of 20 randomly chosen points.
noise_indices = np.random.choice(200, 20, replace=False)
y_xor[noise_indices] *= -1

print("Non-linearly Separable Data (XOR pattern)")
print(f"Shape: {X_xor.shape}")

plt.figure(figsize=(10, 6))
plt.scatter(X_xor[y_xor == 1, 0], X_xor[y_xor == 1, 1], c='blue', s=50, 
           marker='o', edgecolors='black', label='Class +1')
plt.scatter(X_xor[y_xor == -1, 0], X_xor[y_xor == -1, 1], c='red', s=50, 
           marker='s', edgecolors='black', label='Class -1')
plt.xlabel('x₁', fontsize=12)
plt.ylabel('xβ‚‚', fontsize=12)
plt.title('XOR Pattern (Not Linearly Separable)', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.axis('equal')
plt.show()

print("\nLinear SVM will fail on this data!")
print("Solution: Use kernel trick!")
# Compare different kernels
# Fit one SVC per kernel on the XOR data and plot each decision region
# side by side, highlighting support vectors and training accuracy.

kernels = ['linear', 'poly', 'rbf']
kernel_names = ['Linear', 'Polynomial (degree 3)', 'RBF (Gaussian)']

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, kernel, name in zip(axes, kernels, kernel_names):
    # Fit SVM with kernel; only the polynomial kernel takes a degree.
    if kernel == 'poly':
        clf = SVC(kernel=kernel, degree=3, C=1.0)
    else:
        clf = SVC(kernel=kernel, C=1.0)
    
    clf.fit(X_xor, y_xor)
    
    # Create mesh covering the data extent with 1-unit padding.
    h = 0.02
    x1_min, x1_max = X_xor[:, 0].min() - 1, X_xor[:, 0].max() + 1
    x2_min, x2_max = X_xor[:, 1].min() - 1, X_xor[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, h),
                           np.arange(x2_min, x2_max, h))
    
    # Predict the class label at every mesh point.
    Z = clf.predict(np.c_[xx1.ravel(), xx2.ravel()])
    Z = Z.reshape(xx1.shape)
    
    # Plot shaded decision regions and the boundary between them.
    ax.contourf(xx1, xx2, Z, alpha=0.3, cmap=ListedColormap(['#FFAAAA', '#AAAAFF']))
    ax.contour(xx1, xx2, Z, levels=[0], colors='black', linewidths=2)
    
    ax.scatter(X_xor[y_xor == 1, 0], X_xor[y_xor == 1, 1], c='blue', s=30, 
              marker='o', edgecolors='black', alpha=0.6)
    ax.scatter(X_xor[y_xor == -1, 0], X_xor[y_xor == -1, 1], c='red', s=30, 
              marker='s', edgecolors='black', alpha=0.6)
    
    # Highlight support vectors as hollow green rings.
    ax.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
              s=150, facecolors='none', edgecolors='green', linewidths=2)
    
    # Compute accuracy (on the training data itself).
    acc = clf.score(X_xor, y_xor)
    
    ax.set_xlabel('x₁')
    ax.set_ylabel('xβ‚‚')
    ax.set_title(f'{name}\nAccuracy: {acc:.2%}\nSVs: {len(clf.support_vectors_)}')
    ax.set_xlim([x1_min, x1_max])
    ax.set_ylim([x2_min, x2_max])

plt.tight_layout()
plt.show()

print("Kernel Comparison:")
print("  - Linear: Fails (can't separate XOR with line)")
print("  - Polynomial: Works (creates curved boundary)")
print("  - RBF: Works best (flexible, smooth boundary)")
# Visualize RBF kernel transformation

# Simple 1D example to visualize: five points with alternating labels,
# which no threshold on the single coordinate can separate.
np.random.seed(42)
X_1d = np.array([-2, -1, 0, 1, 2]).reshape(-1, 1)
y_1d = np.array([-1, 1, -1, 1, -1])

# Compute RBF features manually
gamma = 1.0
centers = X_1d  # Use data points as centers

def rbf_features(X, centers, gamma):
    """Map each row of X to its vector of RBF (Gaussian) similarities.

    Feature i of a point x is exp(-gamma * ||x - centers[i]||^2), so each
    output row is the feature map phi(x) evaluated against every centre.

    Parameters
    ----------
    X : ndarray of shape (n, d)
        Input points.
    centers : ndarray of shape (m, d)
        RBF centres (here, the data points themselves).
    gamma : float
        Width parameter; larger gamma gives more localized bumps.

    Returns
    -------
    ndarray of shape (n, m)
        RBF feature matrix.
    """
    # Broadcast to all pairwise squared distances in a single vectorized
    # step -- (n, 1, d) minus (1, m, d) -> (n, m, d), summed over d --
    # instead of the original Python loop over centers.
    sq_dists = np.sum((X[:, None, :] - centers[None, :, :]) ** 2, axis=2)
    return np.exp(-gamma * sq_dists)

# Transform to RBF space: each 1D point becomes a 5D vector of
# similarities to the five centres.
phi_X = rbf_features(X_1d, centers, gamma)

print("RBF Kernel Transformation (1D β†’ 5D)")
print(f"\nOriginal space (1D):")
print(X_1d.ravel())
print(f"\nLabels:")
print(y_1d)

print(f"\nRBF feature space (5D):")
print(phi_X)

# Visualize original space vs. the first two RBF feature dimensions.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Original space: all points on a line, labels alternating.
axes[0].scatter(X_1d[y_1d == 1], np.zeros(sum(y_1d == 1)), c='blue', s=100, 
               marker='o', edgecolors='black', label='Class +1')
axes[0].scatter(X_1d[y_1d == -1], np.zeros(sum(y_1d == -1)), c='red', s=100, 
               marker='s', edgecolors='black', label='Class -1')
axes[0].set_xlabel('x')
axes[0].set_ylabel('')
axes[0].set_title('Original Space (1D)\n Not Linearly Separable')
axes[0].set_ylim([-0.5, 0.5])
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Feature space (show first 2 dimensions of the 5D embedding).
axes[1].scatter(phi_X[y_1d == 1, 0], phi_X[y_1d == 1, 1], c='blue', s=100, 
               marker='o', edgecolors='black', label='Class +1')
axes[1].scatter(phi_X[y_1d == -1, 0], phi_X[y_1d == -1, 1], c='red', s=100, 
               marker='s', edgecolors='black', label='Class -1')
axes[1].set_xlabel('φ₁(x)')
axes[1].set_ylabel('Ο†β‚‚(x)')
axes[1].set_title('RBF Feature Space (2 of 5 dims)\n Linearly Separable!')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nKey insight:")
print("  - Data not separable in 1D")
print("  - RBF kernel maps to high-dimensional space")
print("  - Data becomes separable in feature space!")
print("  - Never need to compute Ο†(x) explicitly (use kernel!)")

12.6 Soft Margin SVM

Problem with Hard Margin:ΒΆ

  • Requires data to be perfectly separable

  • Sensitive to outliers

  • May overfit

Solution: Soft MarginΒΆ

Allow some misclassifications with slack variables ΞΎβ‚™:

\[y_n(\mathbf{w}^T\mathbf{x}_n + b) \geq 1 - \xi_n, \quad \xi_n \geq 0\]

Soft Margin Objective:ΒΆ

\[\min_{\mathbf{w}, b, \boldsymbol{\xi}} \frac{1}{2}||\mathbf{w}||^2 + C\sum_{n=1}^N \xi_n\]

where:

  • C: Regularization parameter (tradeoff)

  • Large C: Focus on correct classification (hard margin)

  • Small C: Focus on large margin (soft margin)

Interpretation: Balance between margin size and training errors

# Generate overlapping data

np.random.seed(42)

# Two Gaussian clusters whose centres are close enough that the tails
# overlap, so no hyperplane classifies everything correctly.
X_overlap = np.vstack([
    np.random.randn(60, 2) + np.array([1.5, 1.5]),
    np.random.randn(60, 2) + np.array([-1.5, -1.5])
])
y_overlap = np.hstack([np.ones(60), -np.ones(60)])

# Add overlap: shrink 20 random points toward the origin, pushing them
# into the region between the two clusters.
overlap_indices = np.random.choice(120, 20, replace=False)
X_overlap[overlap_indices] *= 0.3

print("Overlapping Data (Not Perfectly Separable)")
print(f"Shape: {X_overlap.shape}")

plt.figure(figsize=(10, 6))
plt.scatter(X_overlap[y_overlap == 1, 0], X_overlap[y_overlap == 1, 1], 
           c='blue', s=50, marker='o', edgecolors='black', alpha=0.6, label='Class +1')
plt.scatter(X_overlap[y_overlap == -1, 0], X_overlap[y_overlap == -1, 1], 
           c='red', s=50, marker='s', edgecolors='black', alpha=0.6, label='Class -1')
plt.xlabel('x₁', fontsize=12)
plt.ylabel('xβ‚‚', fontsize=12)
plt.title('Overlapping Data', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.axis('equal')
plt.show()

print("\nHard margin SVM would fail!")
print("Solution: Use soft margin (allow some misclassifications)")
# Compare different C values
# Sweep the soft-margin penalty C over four orders of magnitude and plot
# each resulting boundary, margin, and support-vector set.

C_values = [0.01, 0.1, 1, 10, 100]

fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

for idx, C in enumerate(C_values):
    ax = axes[idx]
    
    # Fit SVM with the current slack penalty.
    clf = SVC(kernel='linear', C=C)
    clf.fit(X_overlap, y_overlap)
    
    # Create mesh over the data extent (1-unit padding).
    h = 0.02
    x1_min, x1_max = X_overlap[:, 0].min() - 1, X_overlap[:, 0].max() + 1
    x2_min, x2_max = X_overlap[:, 1].min() - 1, X_overlap[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, h),
                           np.arange(x2_min, x2_max, h))
    
    # Signed decision function, so the +/-1 margin levels can be drawn.
    Z = clf.decision_function(np.c_[xx1.ravel(), xx2.ravel()])
    Z = Z.reshape(xx1.shape)
    
    # Plot boundary (level 0), margins (levels +/-1), shaded regions.
    ax.contour(xx1, xx2, Z, levels=[-1, 0, 1], colors=['red', 'black', 'blue'],
              linestyles=['--', '-', '--'], linewidths=2)
    ax.contourf(xx1, xx2, Z, levels=[-np.inf, 0, np.inf],
               colors=['#FFAAAA', '#AAAAFF'], alpha=0.3)
    
    ax.scatter(X_overlap[y_overlap == 1, 0], X_overlap[y_overlap == 1, 1],
              c='blue', s=30, marker='o', edgecolors='black', alpha=0.6)
    ax.scatter(X_overlap[y_overlap == -1, 0], X_overlap[y_overlap == -1, 1],
              c='red', s=30, marker='s', edgecolors='black', alpha=0.6)
    
    # Support vectors as hollow green rings.
    ax.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
              s=150, facecolors='none', edgecolors='green', linewidths=2)
    
    # Training accuracy for the subplot title.
    acc = clf.score(X_overlap, y_overlap)
    
    ax.set_xlabel('x₁')
    ax.set_ylabel('xβ‚‚')
    ax.set_title(f'C = {C}\nAccuracy: {acc:.2%}\nSVs: {len(clf.support_vectors_)}')
    ax.set_xlim([x1_min, x1_max])
    ax.set_ylim([x2_min, x2_max])

# Remove extra subplot (5 values of C in a 2x3 grid leaves one empty).
fig.delaxes(axes[5])

plt.tight_layout()
plt.show()

print("Effect of C (Regularization Parameter):")
print("\n  Small C (e.g., 0.01):")
print("    - Large margin (wide separation)")
print("    - Many support vectors")
print("    - More misclassifications allowed")
print("    - Better generalization (less overfitting)")

print("\n  Large C (e.g., 100):")
print("    - Small margin (narrow separation)")
print("    - Fewer support vectors")
print("    - Few misclassifications")
print("    - Risk of overfitting")

print("\n  β†’ C controls bias-variance tradeoff!")
# Real-world example: Handwritten digits (3 vs 8)
# Binary classification on sklearn's 8x8 digit images: compare kernels by
# cross-validation, then inspect the support vectors of the best model.

from sklearn.datasets import load_digits
from sklearn.model_selection import cross_val_score

# Load digits (each sample is a flattened 8x8 grayscale image).
digits = load_digits()
X_digits = digits.data
y_digits = digits.target

# Select only 3s and 8s; relabel 3 -> +1 and 8 -> -1.
mask = (y_digits == 3) | (y_digits == 8)
X_binary = X_digits[mask]
y_binary = np.where(y_digits[mask] == 3, 1, -1)

print("Digits Classification: 3 vs 8")
print(f"Dataset size: {X_binary.shape}")
print(f"Class distribution: {(y_binary == 1).sum()} threes, {(y_binary == -1).sum()} eights")

# Visualize sample digits: top row 3s, bottom row 8s.
fig, axes = plt.subplots(2, 10, figsize=(14, 3))
for i in range(10):
    axes[0, i].imshow(X_binary[y_binary == 1][i].reshape(8, 8), cmap='gray')
    axes[0, i].axis('off')
    axes[1, i].imshow(X_binary[y_binary == -1][i].reshape(8, 8), cmap='gray')
    axes[1, i].axis('off')

# NOTE(review): these row labels follow axis('off') above, so they will
# not actually be visible -- confirm intended.
axes[0, 0].set_ylabel('3s', fontsize=12)
axes[1, 0].set_ylabel('8s', fontsize=12)
plt.suptitle('Sample Digits', fontsize=14)
plt.tight_layout()
plt.show()

# Compare different kernels via 5-fold cross-validation.
kernels_eval = {
    'Linear': SVC(kernel='linear', C=1.0),
    'RBF': SVC(kernel='rbf', C=1.0, gamma='scale'),
    'Polynomial': SVC(kernel='poly', degree=3, C=1.0)
}

print("\nCross-Validation Results (5-fold):")
for name, clf in kernels_eval.items():
    scores = cross_val_score(clf, X_binary, y_binary, cv=5)
    print(f"  {name:12s}: {scores.mean():.3f} Β± {scores.std():.3f}")

# Fit best model (RBF) on the full binary dataset.
svm_digits = SVC(kernel='rbf', C=1.0, gamma='scale')
svm_digits.fit(X_binary, y_binary)

print(f"\nBest model: RBF kernel")
print(f"  Number of support vectors: {len(svm_digits.support_vectors_)}/{len(X_binary)}")
print(f"  Percentage: {len(svm_digits.support_vectors_)/len(X_binary)*100:.1f}%")

# Visualize some support vectors -- the borderline, hard-to-classify digits.
sv_indices = svm_digits.support_[:20]  # First 20 SVs
fig, axes = plt.subplots(2, 10, figsize=(14, 3))
for i, ax in enumerate(axes.flat):
    if i < len(sv_indices):
        ax.imshow(X_binary[sv_indices[i]].reshape(8, 8), cmap='gray')
        label = '3' if y_binary[sv_indices[i]] == 1 else '8'
        ax.set_title(label, fontsize=10)
    ax.axis('off')

plt.suptitle('Support Vectors (First 20)', fontsize=14)
plt.tight_layout()
plt.show()

print("\nSupport vectors are the 'hardest' examples to classify!")

Summary

Key Concepts from Chapter 12:ΒΆ

  1. Goal: Find hyperplane with maximum margin

  2. Margin: Distance from hyperplane to closest points

  3. Support Vectors: Points on the margin

  4. Primal: Minimize ||w||Β² subject to constraints

  5. Dual: Maximize based on Lagrange multipliers

  6. Kernel Trick: Nonlinear classification via feature mapping

  7. Soft Margin: Allow misclassifications with penalty C

SVM Formulations:ΒΆ

Problem

Objective

Constraints

Hard Margin (Primal)

min (1/2)||w||²

y_n(w^T x_n + b) ≥ 1 for all n

Soft Margin (Primal)

min (1/2)||w||² + C Σ ξ_n

y_n(w^T x_n + b) ≥ 1 − ξ_n,  ξ_n ≥ 0

Dual

max Σα_n - (1/2)ΣΣ α_n α_m y_n y_m k(x_n, x_m)

α_n ≥ 0, Σα_n y_n = 0

Common Kernels:ΒΆ

Kernel

Formula

Use Case

Linear

k(x, x’) = x^T x’

Linearly separable data

Polynomial

k(x, x’) = (x^T x’ + c)^d

Polynomial boundaries

RBF

k(x, x’) = exp(-γ ||x - x’||²)

Flexible nonlinear boundaries (default choice)

Sigmoid

k(x, x’) = tanh(γ x^T x’ + c)

Neural network-like

Hyperparameter C:ΒΆ

Value

Effect

Best For

Small C

Large margin, many SVs

Noisy data, better generalization

Large C

Small margin, few SVs

Clean data, fewer outliers

When to Use SVM:ΒΆ

βœ… Good for:

  • Binary classification

  • High-dimensional data

  • Clear margin of separation

  • Kernel methods needed

  • Small to medium datasets

❌ Not good for:

  • Very large datasets (O(N²) or O(N³))

  • Many classes (use one-vs-one or one-vs-all)

  • Probabilistic outputs needed

  • Speed is critical

Connection to Other Chapters:ΒΆ

Concept

Chapter

Hyperplanes

Chapter 2, 3

Optimization (QP)

Chapter 7

Lagrange multipliers

Chapter 7

Inner products

Chapter 3

Kernels (Mercer’s theorem)

Chapter 3

Practical Tips:ΒΆ

  1. Always standardize features before SVM

  2. Use RBF kernel as default (very flexible)

  3. Grid search for C and Ξ³ (use cross-validation)

  4. Check support vectors: Too many β†’ overfit, too few β†’ underfit

  5. For large data: Use LinearSVC or SGDClassifier

  6. For probabilities: Use SVC with probability=True (slower)

Key Advantages:ΒΆ

  1. Maximum margin β†’ good generalization

  2. Kernel trick β†’ nonlinear decision boundaries

  3. Convex optimization β†’ global optimum

  4. Sparse solution β†’ only support vectors matter

  5. Effective in high dimensions β†’ works well for images, text

Next Steps:ΒΆ

  • Multi-class SVM: One-vs-one, one-vs-all

  • SVM Regression: Ξ΅-insensitive loss

  • Advanced: Sequential Minimal Optimization (SMO)

  • Related: Logistic regression, kernel methods, neural networks

Practice: Apply SVM to a real classification problem and tune hyperparameters!