# Setup
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC, LinearSVC
from sklearn.datasets import make_classification, make_circles, make_moons
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from matplotlib.colors import ListedColormap

# Notebook-wide configuration: consistent plot styling, reproducible
# randomness, and compact numpy printing for the numeric output below.
sns.set_style('whitegrid')
np.random.seed(42)
np.set_printoptions(precision=4, suppress=True)

12.1 Separating Hyperplanes

Binary Classification:ΒΆ

Given data \(\{(\mathbf{x}_n, y_n)\}_{n=1}^N\) where \(y_n \in \{-1, +1\}\)

Goal: Find hyperplane that separates classes:

\[\mathbf{w}^T\mathbf{x} + b = 0\]

Decision Function:ΒΆ

\[f(\mathbf{x}) = \text{sign}(\mathbf{w}^T\mathbf{x} + b)\]

where:

  • w: weight vector (normal to hyperplane)

  • b: bias (offset from origin)

  • sign(·): Returns +1 or -1

Geometric Interpretation:ΒΆ

  • w points perpendicular to decision boundary

  • Distance from origin: \(|b| / ||w||\)

  • Distance from point x to hyperplane: \(|w^T x + b| / ||w||\)

# Generate linearly separable data

np.random.seed(42)

def generate_separable_data(n_samples=100, offset=2.0):
    """Generate linearly separable 2D data as two Gaussian clusters.

    Parameters
    ----------
    n_samples : int
        Total number of points; split evenly between the two classes
        (an odd count drops the last point).
    offset : float
        Distance of each cluster centre from the origin along both axes.
        Larger values widen the gap between the classes; the default
        (2.0) reproduces the original hard-coded centres (+-2, +-2).

    Returns
    -------
    X : ndarray of shape (2 * (n_samples // 2), 2)
        Stacked 2D points, positive class first.
    y : ndarray of shape (2 * (n_samples // 2),)
        Labels in {+1.0, -1.0} matching the rows of X.
    """
    half = n_samples // 2
    # Two isotropic unit-variance Gaussian clusters centred symmetrically
    # about the origin; with offset=2 they are (almost surely) separable.
    X1 = np.random.randn(half, 2) + np.array([offset, offset])
    X2 = np.random.randn(half, 2) + np.array([-offset, -offset])

    X = np.vstack([X1, X2])
    y = np.hstack([np.ones(half), -np.ones(half)])

    return X, y

# Build the demo dataset, report its class balance, and draw it; this
# motivates the central question of which separating hyperplane to pick.
X, y = generate_separable_data(n_samples=100)

print("Linearly Separable Data")
print(f"Shape: {X.shape}")
print(f"Classes: {np.unique(y)}")
print(f"Class distribution: {(y == 1).sum()} positive, {(y == -1).sum()} negative")

# Visualize: circles for the +1 class, squares for the -1 class.
plt.figure(figsize=(10, 6))
plt.scatter(X[y == 1, 0], X[y == 1, 1], c='blue', s=50, marker='o', 
           edgecolors='black', label='Class +1')
plt.scatter(X[y == -1, 0], X[y == -1, 1], c='red', s=50, marker='s', 
           edgecolors='black', label='Class -1')
plt.xlabel('x₁', fontsize=12)
plt.ylabel('xβ‚‚', fontsize=12)
plt.title('Linearly Separable Data', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
# Equal aspect ratio so geometric distances (and later, margins) are
# not visually distorted.
plt.axis('equal')
plt.show()

print("\nQuestion: Which hyperplane should we choose?")
print("β†’ Many hyperplanes can separate the data!")
# Show multiple separating hyperplanes

def plot_hyperplane(w, b, ax, color='black', linestyle='-', label=None,
                    x_range=None):
    """Draw the hyperplane w^T x + b = 0 as a line on *ax*.

    Parameters
    ----------
    w : array-like of shape (2,)
        Weight (normal) vector; ``w[1]`` must be non-zero so the line can
        be expressed as x2 = f(x1).
    b : float
        Bias term.
    ax : matplotlib Axes
        Axes to draw on.
    color, linestyle, label
        Passed through to ``ax.plot``.
    x_range : tuple (lo, hi), optional
        Horizontal extent of the drawn line.  When omitted, falls back to
        the extent of the module-level dataset ``X`` padded by 1, which
        preserves the original behaviour of this notebook cell.
    """
    if x_range is None:
        # Original behaviour: span the global dataset X (plus padding).
        x_range = (X[:, 0].min() - 1, X[:, 0].max() + 1)
    x1_range = np.linspace(x_range[0], x_range[1], 100)
    # Solve w1*x1 + w2*x2 + b = 0 for x2:  x2 = -(w1*x1 + b) / w2
    x2_line = -(w[0] * x1_range + b) / w[1]
    ax.plot(x1_range, x2_line, color=color, linestyle=linestyle,
            linewidth=2, label=label)

# Define several separating hyperplanes
# Each tuple is (w, b, color, linestyle, label).  All three lines separate
# the two clusters, illustrating that the separating hyperplane is not
# unique -- which motivates the maximum-margin criterion below.
hyperplanes = [
    (np.array([1, 1]), -0.5, 'blue', '--', 'Hyperplane 1'),
    (np.array([1, 0.5]), -0.2, 'green', '--', 'Hyperplane 2'),
    (np.array([0.8, 1]), -0.3, 'purple', '--', 'Hyperplane 3'),
]

plt.figure(figsize=(10, 6))
plt.scatter(X[y == 1, 0], X[y == 1, 1], c='blue', s=50, marker='o', 
           edgecolors='black', alpha=0.6)
plt.scatter(X[y == -1, 0], X[y == -1, 1], c='red', s=50, marker='s', 
           edgecolors='black', alpha=0.6)

# Overlay each candidate hyperplane on the current axes.
for w, b, color, style, label in hyperplanes:
    plot_hyperplane(w, b, plt.gca(), color=color, linestyle=style, label=label)

plt.xlabel('x₁', fontsize=12)
plt.ylabel('xβ‚‚', fontsize=12)
plt.title('Multiple Separating Hyperplanes', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
# Clip the view to the data extent so the lines do not dominate the plot.
plt.xlim([X[:, 0].min() - 1, X[:, 0].max() + 1])
plt.ylim([X[:, 1].min() - 1, X[:, 1].max() + 1])
plt.show()

print("All hyperplanes separate the data perfectly!")
print("Question: Which one is best?")
print("Answer: The one with maximum margin!")

12.2 Maximum Margin

Margin:ΒΆ

Distance from hyperplane to closest point:

\[\text{margin} = \min_n \frac{|w^T x_n + b|}{||w||}\]

Support Vectors:ΒΆ

Points closest to the hyperplane (on the margin)

SVM Objective:ΒΆ

Maximize margin ≑ Minimize \(||w||^2\) subject to:

\[y_n(w^T x_n + b) \geq 1, \quad \forall n\]

Canonical Hyperplane:ΒΆ

Scale w, b such that:

  • \(w^T x + b = +1\) for support vectors in class +1

  • \(w^T x + b = -1\) for support vectors in class -1

  • Margin = \(2 / ||w||\)

Key: Maximizing margin = minimizing \(||w||^2\)

# Fit linear SVM

# Standardize data for numerical stability
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit SVM.  A very large C heavily penalizes slack, so the soft-margin
# solver effectively behaves like a hard-margin SVM on separable data.
svm = SVC(kernel='linear', C=1000)  # Large C for hard margin
svm.fit(X_scaled, y)

# Get parameters of the learned hyperplane w^T x + b = 0 (scaled space).
w = svm.coef_[0]
b = svm.intercept_[0]

print("Linear SVM Results")
print(f"\nWeight vector w: {w}")
print(f"Bias b: {b}")
print(f"Norm ||w||: {np.linalg.norm(w):.4f}")
# For the canonical hyperplane the margin width is 2 / ||w||.
print(f"Margin: {2 / np.linalg.norm(w):.4f}")
print(f"\nNumber of support vectors: {len(svm.support_)}")

# Identify support vectors: svm.support_ holds indices into the
# (scaled) training data.
support_vectors = X_scaled[svm.support_]
support_vector_labels = y[svm.support_]
# Visualize
def plot_svm_decision_boundary(X, y, svm, scaler):
    """Plot SVM decision boundary and margins."""
    fig, ax = plt.subplots(figsize=(12, 8))
    
    # Create mesh
    h = 0.02
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, h),
                           np.arange(x2_min, x2_max, h))
    
    # Predict on mesh
    X_mesh = np.c_[xx1.ravel(), xx2.ravel()]
    X_mesh_scaled = scaler.transform(X_mesh)
    Z = svm.decision_function(X_mesh_scaled).reshape(xx1.shape)
    
    # Plot decision boundary and margins
    ax.contour(xx1, xx2, Z, levels=[-1, 0, 1], colors=['red', 'black', 'blue'],
              linestyles=['--', '-', '--'], linewidths=[2, 3, 2])
    ax.contourf(xx1, xx2, Z, levels=[-np.inf, 0, np.inf], 
               colors=['#FFAAAA', '#AAAAFF'], alpha=0.3)
    
    # Plot points
    ax.scatter(X[y == 1, 0], X[y == 1, 1], c='blue', s=50, marker='o',
              edgecolors='black', label='Class +1')
    ax.scatter(X[y == -1, 0], X[y == -1, 1], c='red', s=50, marker='s',
              edgecolors='black', label='Class -1')
    
    # Highlight support vectors
    sv_original = scaler.inverse_transform(support_vectors)
    ax.scatter(sv_original[:, 0], sv_original[:, 1], s=200, 
              facecolors='none', edgecolors='green', linewidths=3,
              label='Support Vectors')
    
    ax.set_xlabel('x₁', fontsize=12)
    ax.set_ylabel('xβ‚‚', fontsize=12)
    ax.set_title('SVM: Maximum Margin Classifier', fontsize=14)
    ax.legend(fontsize=11)
    ax.grid(True, alpha=0.3)
    
    return fig

# Render the fitted model and explain the plot's visual conventions.
plot_svm_decision_boundary(X, y, svm, scaler)
plt.show()

print("\nVisualization:")
print("  - Black line: Decision boundary (w^T x + b = 0)")
print("  - Dashed lines: Margins (w^T x + b = Β±1)")
print("  - Green circles: Support vectors (on the margin)")
print("  - Distance between margins: 2/||w||")

12.3 Primal Optimization Problem

Hard Margin SVM (Primal):ΒΆ

\[\min_{\mathbf{w}, b} \frac{1}{2}||\mathbf{w}||^2\]

subject to: \(y_n(\mathbf{w}^T\mathbf{x}_n + b) \geq 1, \quad n = 1, \dots, N\)

This is a quadratic programming problem with linear constraints.

Why \(\frac{1}{2}||w||^2\)?ΒΆ

  • Minimizing \(||w||^2\) ≑ maximizing margin \(2/||w||\)

  • Factor \(\frac{1}{2}\) makes derivative cleaner

  • Quadratic objective β†’ unique global optimum (convex!)

Solution Approach:ΒΆ

Use Lagrange multipliers to convert to dual problem

# Demonstrate convexity of SVM objective

# Objective function: (1/2) ||w||^2, plotted in one dimension where it is
# simply a parabola with its unique minimum at w = 0.
w_range = np.linspace(-3, 3, 100)
objective = 0.5 * w_range**2

plt.figure(figsize=(10, 6))
plt.plot(w_range, objective, 'b-', linewidth=3)
plt.xlabel('w', fontsize=12)
plt.ylabel('(1/2) ||w||Β²', fontsize=12)
plt.title('SVM Objective Function (Convex)', fontsize=14)
plt.grid(True, alpha=0.3)
plt.axhline(y=0, color='black', linewidth=0.5)
plt.axvline(x=0, color='black', linewidth=0.5)

# Mark minimum
plt.plot(0, 0, 'ro', markersize=15, label='Global minimum')
plt.legend(fontsize=11)
plt.show()

print("Convex Optimization:")
print("  - Objective (1/2)||w||Β² is convex (quadratic)")
print("  - Constraints y_n(w^T x_n + b) β‰₯ 1 are linear")
print("  - β†’ Unique global optimum!")
print("  - Can be solved efficiently with QP solvers")

12.4 Dual Problem and Lagrange Multipliers

Lagrangian:ΒΆ

\[\mathcal{L}(\mathbf{w}, b, \boldsymbol{\alpha}) = \frac{1}{2}||\mathbf{w}||^2 - \sum_{n=1}^N \alpha_n[y_n(\mathbf{w}^T\mathbf{x}_n + b) - 1]\]

where \(\alpha_n \geq 0\) are Lagrange multipliers.

KKT Conditions:ΒΆ

  1. \(\frac{\partial \mathcal{L}}{\partial \mathbf{w}} = 0 \Rightarrow \mathbf{w} = \sum_n \alpha_n y_n \mathbf{x}_n\)

  2. \(\frac{\partial \mathcal{L}}{\partial b} = 0 \Rightarrow \sum_n \alpha_n y_n = 0\)

  3. \(\alpha_n[y_n(\mathbf{w}^T\mathbf{x}_n + b) - 1] = 0\) (complementary slackness)

Dual Problem:ΒΆ

\[\max_{\boldsymbol{\alpha}} \sum_{n=1}^N \alpha_n - \frac{1}{2}\sum_{n,m}\alpha_n\alpha_m y_n y_m \mathbf{x}_n^T\mathbf{x}_m\]

subject to: \(\alpha_n \geq 0, \quad \sum_n \alpha_n y_n = 0\)

Key: Solution depends only on inner products \(\mathbf{x}_n^T\mathbf{x}_m\)!

# Examine dual solution

# Get dual coefficients: sklearn stores alpha_n * y_n for each support
# vector, so |dual_coef| recovers the Lagrange multipliers alpha_n.
dual_coef = svm.dual_coef_[0]  # alpha_n * y_n for support vectors
alphas = np.abs(dual_coef)  # |alpha_n|

print("Dual Problem Solution")
print(f"\nSupport vectors: {len(svm.support_)}")
print(f"\nDual coefficients Ξ± (for support vectors):")
for i, (idx, alpha, label) in enumerate(zip(svm.support_, alphas, support_vector_labels)):
    print(f"  SV {i+1} (point {idx}): Ξ± = {alpha:.4f}, y = {label:+.0f}")

# Verify KKT stationarity in b: sum over n of alpha_n * y_n must be 0.
print(f"\nKKT Condition: Ξ£ Ξ±_n y_n = 0")
print(f"  Ξ£ Ξ±_n y_n = {np.sum(dual_coef):.6f} β‰ˆ 0 βœ“")

# Verify w = sum_n alpha_n y_n x_n (stationarity in w); only support
# vectors contribute because all other alpha_n are zero.
w_from_dual = np.sum(dual_coef[:, np.newaxis] * support_vectors, axis=0)
print(f"\nWeight vector from dual:")
print(f"  w (from dual) = {w_from_dual}")
print(f"  w (from svm)  = {w}")
print(f"  Match: {np.allclose(w, w_from_dual)} βœ“")

# Complementary slackness: alpha_n > 0 only for points exactly on the
# margin, so y_n (w^T x_n + b) should equal 1 for every support vector.
X_scaled_sv = X_scaled[svm.support_]
y_sv = y[svm.support_]
decision_values_sv = X_scaled_sv @ w + b

print(f"\nComplementary Slackness:")
print(f"  For support vectors: y_n(w^T x_n + b) should equal 1")
for i in range(min(5, len(support_vectors))):
    margin_val = y_sv[i] * decision_values_sv[i]
    print(f"    SV {i+1}: y * (w^T x + b) = {margin_val:.4f} β‰ˆ 1 βœ“")

12.5 Kernel Trick

Motivation:ΒΆ

Data may not be linearly separable in original space, but linearly separable in higher-dimensional space!

Feature Map:ΒΆ

\[\phi: \mathbb{R}^d \rightarrow \mathbb{R}^D, \quad D \gg d\]

Kernel Function:ΒΆ

Instead of computing \(\phi(\mathbf{x})^T\phi(\mathbf{x}')\) explicitly, use kernel:

\[k(\mathbf{x}, \mathbf{x}') = \phi(\mathbf{x})^T\phi(\mathbf{x}')\]

Common Kernels:ΒΆ

  1. Linear: \(k(\mathbf{x}, \mathbf{x}') = \mathbf{x}^T\mathbf{x}'\)

  2. Polynomial: \(k(\mathbf{x}, \mathbf{x}') = (\mathbf{x}^T\mathbf{x}' + c)^d\)

  3. RBF (Gaussian): \(k(\mathbf{x}, \mathbf{x}') = \exp(-\gamma||\mathbf{x} - \mathbf{x}'||^2)\)

Key: Can work in infinite-dimensional space without computing \(\phi\) explicitly!

# Generate non-linearly separable data (XOR pattern)

np.random.seed(42)

# Label is +1 when the point lies in quadrants II or IV (exactly one
# coordinate positive), -1 otherwise -- the classic XOR layout that no
# single line can separate.
X_xor = np.random.randn(200, 2)
y_xor = np.logical_xor(X_xor[:, 0] > 0, X_xor[:, 1] > 0)
y_xor = np.where(y_xor, 1, -1)

# Add some noise: flip the label of 20 randomly chosen points.
noise_indices = np.random.choice(200, 20, replace=False)
y_xor[noise_indices] *= -1

print("Non-linearly Separable Data (XOR pattern)")
print(f"Shape: {X_xor.shape}")

plt.figure(figsize=(10, 6))
plt.scatter(X_xor[y_xor == 1, 0], X_xor[y_xor == 1, 1], c='blue', s=50, 
           marker='o', edgecolors='black', label='Class +1')
plt.scatter(X_xor[y_xor == -1, 0], X_xor[y_xor == -1, 1], c='red', s=50, 
           marker='s', edgecolors='black', label='Class -1')
plt.xlabel('x₁', fontsize=12)
plt.ylabel('xβ‚‚', fontsize=12)
plt.title('XOR Pattern (Not Linearly Separable)', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.axis('equal')
plt.show()

print("\nLinear SVM will fail on this data!")
print("Solution: Use kernel trick!")
# Compare different kernels
# Fit one SVC per kernel on the XOR data and plot each decision region
# side by side, highlighting support vectors and training accuracy.

kernels = ['linear', 'poly', 'rbf']
kernel_names = ['Linear', 'Polynomial (degree 3)', 'RBF (Gaussian)']

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, kernel, name in zip(axes, kernels, kernel_names):
    # Fit SVM with kernel; only the polynomial kernel takes a degree.
    if kernel == 'poly':
        clf = SVC(kernel=kernel, degree=3, C=1.0)
    else:
        clf = SVC(kernel=kernel, C=1.0)
    
    clf.fit(X_xor, y_xor)
    
    # Create mesh covering the data extent with 1-unit padding.
    h = 0.02
    x1_min, x1_max = X_xor[:, 0].min() - 1, X_xor[:, 0].max() + 1
    x2_min, x2_max = X_xor[:, 1].min() - 1, X_xor[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, h),
                           np.arange(x2_min, x2_max, h))
    
    # Predict the class label at every mesh point.
    Z = clf.predict(np.c_[xx1.ravel(), xx2.ravel()])
    Z = Z.reshape(xx1.shape)
    
    # Plot shaded decision regions and the boundary between them.
    ax.contourf(xx1, xx2, Z, alpha=0.3, cmap=ListedColormap(['#FFAAAA', '#AAAAFF']))
    ax.contour(xx1, xx2, Z, levels=[0], colors='black', linewidths=2)
    
    ax.scatter(X_xor[y_xor == 1, 0], X_xor[y_xor == 1, 1], c='blue', s=30, 
              marker='o', edgecolors='black', alpha=0.6)
    ax.scatter(X_xor[y_xor == -1, 0], X_xor[y_xor == -1, 1], c='red', s=30, 
              marker='s', edgecolors='black', alpha=0.6)
    
    # Highlight support vectors as hollow green rings.
    ax.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
              s=150, facecolors='none', edgecolors='green', linewidths=2)
    
    # Compute accuracy (on the training data itself).
    acc = clf.score(X_xor, y_xor)
    
    ax.set_xlabel('x₁')
    ax.set_ylabel('xβ‚‚')
    ax.set_title(f'{name}\nAccuracy: {acc:.2%}\nSVs: {len(clf.support_vectors_)}')
    ax.set_xlim([x1_min, x1_max])
    ax.set_ylim([x2_min, x2_max])

plt.tight_layout()
plt.show()

print("Kernel Comparison:")
print("  - Linear: Fails (can't separate XOR with line)")
print("  - Polynomial: Works (creates curved boundary)")
print("  - RBF: Works best (flexible, smooth boundary)")
# Visualize RBF kernel transformation

# Simple 1D example to visualize: five points with alternating labels,
# which no threshold on the single coordinate can separate.
np.random.seed(42)
X_1d = np.array([-2, -1, 0, 1, 2]).reshape(-1, 1)
y_1d = np.array([-1, 1, -1, 1, -1])

# Compute RBF features manually
gamma = 1.0
centers = X_1d  # Use data points as centers

def rbf_features(X, centers, gamma):
    """Map each row of X to its vector of RBF (Gaussian) similarities.

    Feature i of a point x is exp(-gamma * ||x - centers[i]||^2), so each
    output row is the feature map phi(x) evaluated against every centre.

    Parameters
    ----------
    X : ndarray of shape (n, d)
        Input points.
    centers : ndarray of shape (m, d)
        RBF centres (here, the data points themselves).
    gamma : float
        Width parameter; larger gamma gives more localized bumps.

    Returns
    -------
    ndarray of shape (n, m)
        RBF feature matrix.
    """
    # Broadcast to all pairwise squared distances in a single vectorized
    # step -- (n, 1, d) minus (1, m, d) -> (n, m, d), summed over d --
    # instead of the original Python loop over centers.
    sq_dists = np.sum((X[:, None, :] - centers[None, :, :]) ** 2, axis=2)
    return np.exp(-gamma * sq_dists)

# Transform to RBF space: each 1D point becomes a 5D vector of
# similarities to the five centres.
phi_X = rbf_features(X_1d, centers, gamma)

print("RBF Kernel Transformation (1D β†’ 5D)")
print(f"\nOriginal space (1D):")
print(X_1d.ravel())
print(f"\nLabels:")
print(y_1d)

print(f"\nRBF feature space (5D):")
print(phi_X)

# Visualize original space vs. the first two RBF feature dimensions.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Original space: all points on a line, labels alternating.
axes[0].scatter(X_1d[y_1d == 1], np.zeros(sum(y_1d == 1)), c='blue', s=100, 
               marker='o', edgecolors='black', label='Class +1')
axes[0].scatter(X_1d[y_1d == -1], np.zeros(sum(y_1d == -1)), c='red', s=100, 
               marker='s', edgecolors='black', label='Class -1')
axes[0].set_xlabel('x')
axes[0].set_ylabel('')
axes[0].set_title('Original Space (1D)\n Not Linearly Separable')
axes[0].set_ylim([-0.5, 0.5])
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Feature space (show first 2 dimensions of the 5D embedding).
axes[1].scatter(phi_X[y_1d == 1, 0], phi_X[y_1d == 1, 1], c='blue', s=100, 
               marker='o', edgecolors='black', label='Class +1')
axes[1].scatter(phi_X[y_1d == -1, 0], phi_X[y_1d == -1, 1], c='red', s=100, 
               marker='s', edgecolors='black', label='Class -1')
axes[1].set_xlabel('φ₁(x)')
axes[1].set_ylabel('Ο†β‚‚(x)')
axes[1].set_title('RBF Feature Space (2 of 5 dims)\n Linearly Separable!')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nKey insight:")
print("  - Data not separable in 1D")
print("  - RBF kernel maps to high-dimensional space")
print("  - Data becomes separable in feature space!")
print("  - Never need to compute Ο†(x) explicitly (use kernel!)")

12.6 Soft Margin SVM

Problem with Hard Margin:ΒΆ

  • Requires data to be perfectly separable

  • Sensitive to outliers

  • May overfit

Solution: Soft MarginΒΆ

Allow some misclassifications with slack variables ΞΎβ‚™:

\[y_n(\mathbf{w}^T\mathbf{x}_n + b) \geq 1 - \xi_n, \quad \xi_n \geq 0\]

Soft Margin Objective:ΒΆ

\[\min_{\mathbf{w}, b, \boldsymbol{\xi}} \frac{1}{2}||\mathbf{w}||^2 + C\sum_{n=1}^N \xi_n\]

where:

  • C: Regularization parameter (tradeoff)

  • Large C: Focus on correct classification (hard margin)

  • Small C: Focus on large margin (soft margin)

Interpretation: Balance between margin size and training errors

# Generate overlapping data

np.random.seed(42)

# Two Gaussian clusters whose centres are close enough that the tails
# overlap, so no hyperplane classifies everything correctly.
X_overlap = np.vstack([
    np.random.randn(60, 2) + np.array([1.5, 1.5]),
    np.random.randn(60, 2) + np.array([-1.5, -1.5])
])
y_overlap = np.hstack([np.ones(60), -np.ones(60)])

# Add overlap: shrink 20 random points toward the origin, pushing them
# into the region between the two clusters.
overlap_indices = np.random.choice(120, 20, replace=False)
X_overlap[overlap_indices] *= 0.3

print("Overlapping Data (Not Perfectly Separable)")
print(f"Shape: {X_overlap.shape}")

plt.figure(figsize=(10, 6))
plt.scatter(X_overlap[y_overlap == 1, 0], X_overlap[y_overlap == 1, 1], 
           c='blue', s=50, marker='o', edgecolors='black', alpha=0.6, label='Class +1')
plt.scatter(X_overlap[y_overlap == -1, 0], X_overlap[y_overlap == -1, 1], 
           c='red', s=50, marker='s', edgecolors='black', alpha=0.6, label='Class -1')
plt.xlabel('x₁', fontsize=12)
plt.ylabel('xβ‚‚', fontsize=12)
plt.title('Overlapping Data', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.axis('equal')
plt.show()

print("\nHard margin SVM would fail!")
print("Solution: Use soft margin (allow some misclassifications)")
# Compare different C values
# Sweep the soft-margin penalty C over four orders of magnitude and plot
# each resulting boundary, margin, and support-vector set.

C_values = [0.01, 0.1, 1, 10, 100]

fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

for idx, C in enumerate(C_values):
    ax = axes[idx]
    
    # Fit SVM with the current slack penalty.
    clf = SVC(kernel='linear', C=C)
    clf.fit(X_overlap, y_overlap)
    
    # Create mesh over the data extent (1-unit padding).
    h = 0.02
    x1_min, x1_max = X_overlap[:, 0].min() - 1, X_overlap[:, 0].max() + 1
    x2_min, x2_max = X_overlap[:, 1].min() - 1, X_overlap[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, h),
                           np.arange(x2_min, x2_max, h))
    
    # Signed decision function, so the +/-1 margin levels can be drawn.
    Z = clf.decision_function(np.c_[xx1.ravel(), xx2.ravel()])
    Z = Z.reshape(xx1.shape)
    
    # Plot boundary (level 0), margins (levels +/-1), shaded regions.
    ax.contour(xx1, xx2, Z, levels=[-1, 0, 1], colors=['red', 'black', 'blue'],
              linestyles=['--', '-', '--'], linewidths=2)
    ax.contourf(xx1, xx2, Z, levels=[-np.inf, 0, np.inf],
               colors=['#FFAAAA', '#AAAAFF'], alpha=0.3)
    
    ax.scatter(X_overlap[y_overlap == 1, 0], X_overlap[y_overlap == 1, 1],
              c='blue', s=30, marker='o', edgecolors='black', alpha=0.6)
    ax.scatter(X_overlap[y_overlap == -1, 0], X_overlap[y_overlap == -1, 1],
              c='red', s=30, marker='s', edgecolors='black', alpha=0.6)
    
    # Support vectors as hollow green rings.
    ax.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
              s=150, facecolors='none', edgecolors='green', linewidths=2)
    
    # Training accuracy for the subplot title.
    acc = clf.score(X_overlap, y_overlap)
    
    ax.set_xlabel('x₁')
    ax.set_ylabel('xβ‚‚')
    ax.set_title(f'C = {C}\nAccuracy: {acc:.2%}\nSVs: {len(clf.support_vectors_)}')
    ax.set_xlim([x1_min, x1_max])
    ax.set_ylim([x2_min, x2_max])

# Remove extra subplot (5 values of C in a 2x3 grid leaves one empty).
fig.delaxes(axes[5])

plt.tight_layout()
plt.show()

print("Effect of C (Regularization Parameter):")
print("\n  Small C (e.g., 0.01):")
print("    - Large margin (wide separation)")
print("    - Many support vectors")
print("    - More misclassifications allowed")
print("    - Better generalization (less overfitting)")

print("\n  Large C (e.g., 100):")
print("    - Small margin (narrow separation)")
print("    - Fewer support vectors")
print("    - Few misclassifications")
print("    - Risk of overfitting")

print("\n  β†’ C controls bias-variance tradeoff!")
# Real-world example: Handwritten digits (3 vs 8)
# Binary classification on sklearn's 8x8 digit images: compare kernels by
# cross-validation, then inspect the support vectors of the best model.

from sklearn.datasets import load_digits
from sklearn.model_selection import cross_val_score

# Load digits (each sample is a flattened 8x8 grayscale image).
digits = load_digits()
X_digits = digits.data
y_digits = digits.target

# Select only 3s and 8s; relabel 3 -> +1 and 8 -> -1.
mask = (y_digits == 3) | (y_digits == 8)
X_binary = X_digits[mask]
y_binary = np.where(y_digits[mask] == 3, 1, -1)

print("Digits Classification: 3 vs 8")
print(f"Dataset size: {X_binary.shape}")
print(f"Class distribution: {(y_binary == 1).sum()} threes, {(y_binary == -1).sum()} eights")

# Visualize sample digits: top row 3s, bottom row 8s.
fig, axes = plt.subplots(2, 10, figsize=(14, 3))
for i in range(10):
    axes[0, i].imshow(X_binary[y_binary == 1][i].reshape(8, 8), cmap='gray')
    axes[0, i].axis('off')
    axes[1, i].imshow(X_binary[y_binary == -1][i].reshape(8, 8), cmap='gray')
    axes[1, i].axis('off')

# NOTE(review): these row labels follow axis('off') above, so they will
# not actually be visible -- confirm intended.
axes[0, 0].set_ylabel('3s', fontsize=12)
axes[1, 0].set_ylabel('8s', fontsize=12)
plt.suptitle('Sample Digits', fontsize=14)
plt.tight_layout()
plt.show()

# Compare different kernels via 5-fold cross-validation.
kernels_eval = {
    'Linear': SVC(kernel='linear', C=1.0),
    'RBF': SVC(kernel='rbf', C=1.0, gamma='scale'),
    'Polynomial': SVC(kernel='poly', degree=3, C=1.0)
}

print("\nCross-Validation Results (5-fold):")
for name, clf in kernels_eval.items():
    scores = cross_val_score(clf, X_binary, y_binary, cv=5)
    print(f"  {name:12s}: {scores.mean():.3f} Β± {scores.std():.3f}")

# Fit best model (RBF) on the full binary dataset.
svm_digits = SVC(kernel='rbf', C=1.0, gamma='scale')
svm_digits.fit(X_binary, y_binary)

print(f"\nBest model: RBF kernel")
print(f"  Number of support vectors: {len(svm_digits.support_vectors_)}/{len(X_binary)}")
print(f"  Percentage: {len(svm_digits.support_vectors_)/len(X_binary)*100:.1f}%")

# Visualize some support vectors -- the borderline, hard-to-classify digits.
sv_indices = svm_digits.support_[:20]  # First 20 SVs
fig, axes = plt.subplots(2, 10, figsize=(14, 3))
for i, ax in enumerate(axes.flat):
    if i < len(sv_indices):
        ax.imshow(X_binary[sv_indices[i]].reshape(8, 8), cmap='gray')
        label = '3' if y_binary[sv_indices[i]] == 1 else '8'
        ax.set_title(label, fontsize=10)
    ax.axis('off')

plt.suptitle('Support Vectors (First 20)', fontsize=14)
plt.tight_layout()
plt.show()

print("\nSupport vectors are the 'hardest' examples to classify!")

Summary

Key Concepts from Chapter 12:ΒΆ

  1. Goal: Find hyperplane with maximum margin

  2. Margin: Distance from hyperplane to closest points

  3. Support Vectors: Points on the margin

  4. Primal: Minimize ||w||Β² subject to constraints

  5. Dual: Maximize based on Lagrange multipliers

  6. Kernel Trick: Nonlinear classification via feature mapping

  7. Soft Margin: Allow misclassifications with penalty C

SVM Formulations:ΒΆ

Problem

Objective

Constraints

Hard Margin (Primal)

min (1/2)||w||²

y_n(w^T x_n + b) ≥ 1 for all n

Soft Margin (Primal)

min (1/2)||w||² + C Σ ξ_n

y_n(w^T x_n + b) ≥ 1 − ξ_n,  ξ_n ≥ 0

Dual

max Σα_n - (1/2)ΣΣ α_n α_m y_n y_m k(x_n, x_m)

α_n ≥ 0, Σα_n y_n = 0

Common Kernels:ΒΆ

Kernel

Formula

Use Case

Linear

k(x, x’) = x^T x’

Linearly separable data

Polynomial

k(x, x’) = (x^T x’ + c)^d

Polynomial boundaries

RBF

k(x, x’) = exp(-γ ||x - x’||²)

Flexible nonlinear boundaries (default choice)

Sigmoid

k(x, x’) = tanh(γ x^T x’ + c)

Neural network-like

Hyperparameter C:ΒΆ

Value

Effect

Best For

Small C

Large margin, many SVs

Noisy data, better generalization

Large C

Small margin, few SVs

Clean data, fewer outliers

When to Use SVM:ΒΆ

βœ… Good for:

  • Binary classification

  • High-dimensional data

  • Clear margin of separation

  • Kernel methods needed

  • Small to medium datasets

❌ Not good for:

  • Very large datasets (O(N²) or O(N³))

  • Many classes (use one-vs-one or one-vs-all)

  • Probabilistic outputs needed

  • Speed is critical

Connection to Other Chapters:ΒΆ

Concept

Chapter

Hyperplanes

Chapter 2, 3

Optimization (QP)

Chapter 7

Lagrange multipliers

Chapter 7

Inner products

Chapter 3

Kernels (Mercer’s theorem)

Chapter 3

Practical Tips:ΒΆ

  1. Always standardize features before SVM

  2. Use RBF kernel as default (very flexible)

  3. Grid search for C and Ξ³ (use cross-validation)

  4. Check support vectors: Too many β†’ overfit, too few β†’ underfit

  5. For large data: Use LinearSVC or SGDClassifier

  6. For probabilities: Use SVC with probability=True (slower)

Key Advantages:ΒΆ

  1. Maximum margin β†’ good generalization

  2. Kernel trick β†’ nonlinear decision boundaries

  3. Convex optimization β†’ global optimum

  4. Sparse solution β†’ only support vectors matter

  5. Effective in high dimensions β†’ works well for images, text

Next Steps:ΒΆ

  • Multi-class SVM: One-vs-one, one-vs-all

  • SVM Regression: Ξ΅-insensitive loss

  • Advanced: Sequential Minimal Optimization (SMO)

  • Related: Logistic regression, kernel methods, neural networks

Practice: Apply SVM to a real classification problem and tune hyperparameters!