# Setup: third-party imports (plotting, SVM models, synthetic datasets,
# preprocessing) and global notebook configuration.
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC, LinearSVC
from sklearn.datasets import make_classification, make_circles, make_moons
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from matplotlib.colors import ListedColormap
# Consistent plot style, reproducible RNG, compact float printing.
sns.set_style('whitegrid')
np.random.seed(42)
np.set_printoptions(precision=4, suppress=True)
12.1 Separating Hyperplanes
Binary Classification:
Given data \(\{(\mathbf{x}_n, y_n)\}_{n=1}^N\) where \(y_n \in \{-1, +1\}\)
Goal: Find hyperplane that separates classes:
Decision Function:
\(f(\mathbf{x}) = \text{sign}(\mathbf{w}^T\mathbf{x} + b)\)
where:
w: weight vector (normal to hyperplane)
b: bias (offset from origin)
sign(·): Returns +1 or -1
Geometric Interpretation:
w points perpendicular to decision boundary
Distance from origin: \(|b| / ||w||\)
Distance from point x to hyperplane: \(|w^T x + b| / ||w||\)
# Generate linearly separable data
np.random.seed(42)
def generate_separable_data(n_samples=100):
    """Generate linearly separable 2D data.

    Returns an (n_samples, 2) feature matrix and a label vector in
    {-1, +1}: the first half of the rows is a Gaussian cluster centered
    at (2, 2) with label +1, the second half a cluster centered at
    (-2, -2) with label -1.
    """
    half = n_samples // 2
    # Draw the two clusters in this exact order so seeded runs are
    # reproducible.
    pos_cluster = np.random.randn(half, 2) + np.array([2, 2])
    neg_cluster = np.random.randn(half, 2) + np.array([-2, -2])
    features = np.vstack([pos_cluster, neg_cluster])
    labels = np.hstack([np.ones(half), -np.ones(half)])
    return features, labels
# Build the dataset and report basic statistics.
X, y = generate_separable_data(n_samples=100)
print("Linearly Separable Data")
print(f"Shape: {X.shape}")
print(f"Classes: {np.unique(y)}")
print(f"Class distribution: {(y == 1).sum()} positive, {(y == -1).sum()} negative")
# Visualize: blue circles for class +1, red squares for class -1.
plt.figure(figsize=(10, 6))
plt.scatter(X[y == 1, 0], X[y == 1, 1], c='blue', s=50, marker='o',
            edgecolors='black', label='Class +1')
plt.scatter(X[y == -1, 0], X[y == -1, 1], c='red', s=50, marker='s',
            edgecolors='black', label='Class -1')
plt.xlabel('xβ', fontsize=12)
plt.ylabel('xβ', fontsize=12)
plt.title('Linearly Separable Data', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.axis('equal')
plt.show()
# Motivate the next cell: separability alone does not pick a unique plane.
print("\nQuestion: Which hyperplane should we choose?")
print("β Many hyperplanes can separate the data!")
# Show multiple separating hyperplanes
def plot_hyperplane(w, b, ax, color='black', linestyle='-', label=None,
                    x1_range=None):
    """Plot the line w^T x + b = 0 on the given axes.

    Parameters
    ----------
    w : array-like of shape (2,)
        Hyperplane normal; w[1] must be nonzero (the line is drawn as
        x2 expressed in terms of x1).
    b : float
        Bias term.
    ax : matplotlib Axes to draw on.
    color, linestyle, label : passed through to ``ax.plot``.
    x1_range : 1D array of x1 values to evaluate, optional.
        If None, falls back to the original behavior of spanning the
        module-level data matrix ``X`` (a hidden global dependency —
        prefer passing an explicit range).
    """
    if x1_range is None:
        # NOTE(review): default path relies on the global X defined above.
        x1_range = np.linspace(X[:, 0].min() - 1, X[:, 0].max() + 1, 100)
    # Solve w1*x1 + w2*x2 + b = 0 for x2.
    x2_line = -(w[0] * x1_range + b) / w[1]
    ax.plot(x1_range, x2_line, color=color, linestyle=linestyle,
            linewidth=2, label=label)
# Define several separating hyperplanes as (w, b, color, linestyle, label).
# All three separate the two clusters, illustrating non-uniqueness.
hyperplanes = [
    (np.array([1, 1]), -0.5, 'blue', '--', 'Hyperplane 1'),
    (np.array([1, 0.5]), -0.2, 'green', '--', 'Hyperplane 2'),
    (np.array([0.8, 1]), -0.3, 'purple', '--', 'Hyperplane 3'),
]
plt.figure(figsize=(10, 6))
plt.scatter(X[y == 1, 0], X[y == 1, 1], c='blue', s=50, marker='o',
            edgecolors='black', alpha=0.6)
plt.scatter(X[y == -1, 0], X[y == -1, 1], c='red', s=50, marker='s',
            edgecolors='black', alpha=0.6)
# Overlay each candidate hyperplane on the current axes.
for w, b, color, style, label in hyperplanes:
    plot_hyperplane(w, b, plt.gca(), color=color, linestyle=style, label=label)
plt.xlabel('xβ', fontsize=12)
plt.ylabel('xβ', fontsize=12)
plt.title('Multiple Separating Hyperplanes', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.xlim([X[:, 0].min() - 1, X[:, 0].max() + 1])
plt.ylim([X[:, 1].min() - 1, X[:, 1].max() + 1])
plt.show()
print("All hyperplanes separate the data perfectly!")
print("Question: Which one is best?")
print("Answer: The one with maximum margin!")
12.2 Maximum Margin
Margin:
Distance from hyperplane to closest point:
Support Vectors:
Points closest to the hyperplane (on the margin)
SVM Objective:
Maximize margin ≡ Minimize \(||w||^2\) subject to: \(y_n(\mathbf{w}^T\mathbf{x}_n + b) \geq 1\)
Canonical Hyperplane:
Scale w, b such that:
\(w^T x + b = +1\) for support vectors in class +1
\(w^T x + b = -1\) for support vectors in class -1
Margin = \(2 / ||w||\)
Key: Maximizing margin = minimizing \(||w||^2\)
# Fit linear SVM
# Standardize data for numerical stability
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Fit SVM; a very large C makes the soft-margin solver approximate a
# hard-margin SVM (misclassification heavily penalized).
svm = SVC(kernel='linear', C=1000)  # Large C for hard margin
svm.fit(X_scaled, y)
# Get parameters: w is normal to the boundary, b is the offset.
w = svm.coef_[0]
b = svm.intercept_[0]
print("Linear SVM Results")
print(f"\nWeight vector w: {w}")
print(f"Bias b: {b}")
print(f"Norm ||w||: {np.linalg.norm(w):.4f}")
# Geometric margin width of the canonical hyperplane is 2/||w||.
print(f"Margin: {2 / np.linalg.norm(w):.4f}")
print(f"\nNumber of support vectors: {len(svm.support_)}")
# Identify support vectors (in scaled space) and their labels.
support_vectors = X_scaled[svm.support_]
support_vector_labels = y[svm.support_]
# Visualize
def plot_svm_decision_boundary(X, y, svm, scaler):
    """Plot an SVM decision boundary, its +/-1 margins, and support vectors.

    Parameters
    ----------
    X : (N, 2) array of points in the ORIGINAL (unscaled) space.
    y : (N,) array of labels in {-1, +1}.
    svm : fitted classifier exposing ``decision_function`` and
        ``support_vectors_`` (support vectors live in the scaled space).
    scaler : transformer with ``transform`` / ``inverse_transform``
        mapping between the original and the scaled space.

    Returns
    -------
    matplotlib Figure with shaded decision regions, boundary/margin
    contours, the data points, and highlighted support vectors.
    """
    fig, ax = plt.subplots(figsize=(12, 8))
    # Dense mesh over the data range (original coordinates).
    h = 0.02
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, h),
                           np.arange(x2_min, x2_max, h))
    # Evaluate the signed decision function on the mesh (scaled space).
    X_mesh = np.c_[xx1.ravel(), xx2.ravel()]
    X_mesh_scaled = scaler.transform(X_mesh)
    Z = svm.decision_function(X_mesh_scaled).reshape(xx1.shape)
    # Decision boundary (level 0) and the two margins (levels -1/+1).
    ax.contour(xx1, xx2, Z, levels=[-1, 0, 1], colors=['red', 'black', 'blue'],
               linestyles=['--', '-', '--'], linewidths=[2, 3, 2])
    ax.contourf(xx1, xx2, Z, levels=[-np.inf, 0, np.inf],
                colors=['#FFAAAA', '#AAAAFF'], alpha=0.3)
    # Plot points
    ax.scatter(X[y == 1, 0], X[y == 1, 1], c='blue', s=50, marker='o',
               edgecolors='black', label='Class +1')
    ax.scatter(X[y == -1, 0], X[y == -1, 1], c='red', s=50, marker='s',
               edgecolors='black', label='Class -1')
    # Highlight support vectors. FIX: read them from the fitted model
    # (svm.support_vectors_) instead of the module-level `support_vectors`
    # global the original relied on, so the function is self-contained and
    # works with any fitted model (identical value for this notebook).
    sv_original = scaler.inverse_transform(svm.support_vectors_)
    ax.scatter(sv_original[:, 0], sv_original[:, 1], s=200,
               facecolors='none', edgecolors='green', linewidths=3,
               label='Support Vectors')
    ax.set_xlabel('xβ', fontsize=12)
    ax.set_ylabel('xβ', fontsize=12)
    ax.set_title('SVM: Maximum Margin Classifier', fontsize=14)
    ax.legend(fontsize=11)
    ax.grid(True, alpha=0.3)
    return fig
# Render the fitted hard-margin SVM with its margins and support vectors.
plot_svm_decision_boundary(X, y, svm, scaler)
plt.show()
print("\nVisualization:")
print(" - Black line: Decision boundary (w^T x + b = 0)")
print(" - Dashed lines: Margins (w^T x + b = Β±1)")
print(" - Green circles: Support vectors (on the margin)")
print(" - Distance between margins: 2/||w||")
12.3 Primal Optimization Problem
Hard Margin SVM (Primal):
\(\min_{\mathbf{w}, b} \frac{1}{2}||\mathbf{w}||^2\)
subject to: \(y_n(\mathbf{w}^T\mathbf{x}_n + b) \geq 1, \quad n = 1, ..., N\)
This is a quadratic programming problem with linear constraints.
Why \(\frac{1}{2}||w||^2\)?
Minimizing \(||w||^2\) ≡ maximizing margin \(2/||w||\)
Factor \(\frac{1}{2}\) makes the derivative cleaner
Quadratic objective → unique global optimum (convex!)
Solution Approach:
Use Lagrange multipliers to convert to the dual problem
# Demonstrate convexity of SVM objective in the 1D case.
# Objective function: (1/2) ||w||^2
w_range = np.linspace(-3, 3, 100)
objective = 0.5 * w_range**2
plt.figure(figsize=(10, 6))
plt.plot(w_range, objective, 'b-', linewidth=3)
plt.xlabel('w', fontsize=12)
plt.ylabel('(1/2) ||w||Β²', fontsize=12)
plt.title('SVM Objective Function (Convex)', fontsize=14)
plt.grid(True, alpha=0.3)
plt.axhline(y=0, color='black', linewidth=0.5)
plt.axvline(x=0, color='black', linewidth=0.5)
# Mark the minimum (unique because the quadratic is strictly convex).
plt.plot(0, 0, 'ro', markersize=15, label='Global minimum')
plt.legend(fontsize=11)
plt.show()
print("Convex Optimization:")
print(" - Objective (1/2)||w||Β² is convex (quadratic)")
print(" - Constraints y_n(w^T x_n + b) β₯ 1 are linear")
print(" - β Unique global optimum!")
print(" - Can be solved efficiently with QP solvers")
12.4 Dual Problem and Lagrange Multipliers
Lagrangian:
\(\mathcal{L}(\mathbf{w}, b, \boldsymbol{\alpha}) = \frac{1}{2}||\mathbf{w}||^2 - \sum_n \alpha_n [y_n(\mathbf{w}^T\mathbf{x}_n + b) - 1]\)
where \(\alpha_n \geq 0\) are Lagrange multipliers.
KKT Conditions:
\(\frac{\partial \mathcal{L}}{\partial \mathbf{w}} = 0 \Rightarrow \mathbf{w} = \sum_n \alpha_n y_n \mathbf{x}_n\)
\(\frac{\partial \mathcal{L}}{\partial b} = 0 \Rightarrow \sum_n \alpha_n y_n = 0\)
\(\alpha_n[y_n(\mathbf{w}^T\mathbf{x}_n + b) - 1] = 0\) (complementary slackness)
Dual Problem:
\(\max_{\boldsymbol{\alpha}} \sum_n \alpha_n - \frac{1}{2}\sum_n \sum_m \alpha_n \alpha_m y_n y_m \mathbf{x}_n^T\mathbf{x}_m\)
subject to: \(\alpha_n \geq 0, \quad \sum_n \alpha_n y_n = 0\)
Key: Solution depends only on inner products \(\mathbf{x}_n^T\mathbf{x}_m\)!
# Examine dual solution.
# sklearn stores dual_coef_ = alpha_n * y_n for the support vectors only,
# so |dual_coef_| recovers the multipliers alpha_n.
dual_coef = svm.dual_coef_[0]  # alpha_n * y_n for support vectors
alphas = np.abs(dual_coef)  # |alpha_n|
print("Dual Problem Solution")
print(f"\nSupport vectors: {len(svm.support_)}")
print(f"\nDual coefficients Ξ± (for support vectors):")
for i, (idx, alpha, label) in enumerate(zip(svm.support_, alphas, support_vector_labels)):
    print(f" SV {i+1} (point {idx}): Ξ± = {alpha:.4f}, y = {label:+.0f}")
# Verify the KKT stationarity condition w.r.t. b: dual coefficients sum to 0.
print(f"\nKKT Condition: Ξ£ Ξ±_n y_n = 0")
print(f" Ξ£ Ξ±_n y_n = {np.sum(dual_coef):.6f} β 0 β")
# Recover the primal weights from the dual: w = sum_n alpha_n y_n x_n.
w_from_dual = np.sum(dual_coef[:, np.newaxis] * support_vectors, axis=0)
print(f"\nWeight vector from dual:")
print(f" w (from dual) = {w_from_dual}")
print(f" w (from svm) = {w}")
print(f" Match: {np.allclose(w, w_from_dual)} β")
# Complementary slackness: alpha_n > 0 only for points ON the margin,
# so support vectors should satisfy y_n(w^T x_n + b) = 1 exactly.
X_scaled_sv = X_scaled[svm.support_]
y_sv = y[svm.support_]
decision_values_sv = X_scaled_sv @ w + b
print(f"\nComplementary Slackness:")
print(f" For support vectors: y_n(w^T x_n + b) should equal 1")
for i in range(min(5, len(support_vectors))):
    margin_val = y_sv[i] * decision_values_sv[i]
    print(f" SV {i+1}: y * (w^T x + b) = {margin_val:.4f} β 1 β")
12.5 Kernel Trick
Motivation:
Data may not be linearly separable in the original space, but linearly separable in a higher-dimensional space!
Feature Map:
\(\phi: \mathbb{R}^D \to \mathbb{R}^M\), mapping inputs into a higher-dimensional feature space.
Kernel Function:
Instead of computing \(\phi(\mathbf{x})^T\phi(\mathbf{x}')\) explicitly, use kernel: \(k(\mathbf{x}, \mathbf{x}') = \phi(\mathbf{x})^T\phi(\mathbf{x}')\)
Common Kernels:
Linear: \(k(\mathbf{x}, \mathbf{x}') = \mathbf{x}^T\mathbf{x}'\)
Polynomial: \(k(\mathbf{x}, \mathbf{x}') = (\mathbf{x}^T\mathbf{x}' + c)^d\)
RBF (Gaussian): \(k(\mathbf{x}, \mathbf{x}') = \exp(-\gamma||\mathbf{x} - \mathbf{x}'||^2)\)
Key: Can work in infinite-dimensional space without computing \(\phi\) explicitly!
# Generate non-linearly separable data (XOR pattern)
np.random.seed(42)
X_xor = np.random.randn(200, 2)
# Label = XOR of the two coordinate signs, mapped to {-1, +1}.
y_xor = np.logical_xor(X_xor[:, 0] > 0, X_xor[:, 1] > 0)
y_xor = np.where(y_xor, 1, -1)
# Add some noise: flip 20 of the 200 labels (10%).
noise_indices = np.random.choice(200, 20, replace=False)
y_xor[noise_indices] *= -1
print("Non-linearly Separable Data (XOR pattern)")
print(f"Shape: {X_xor.shape}")
plt.figure(figsize=(10, 6))
plt.scatter(X_xor[y_xor == 1, 0], X_xor[y_xor == 1, 1], c='blue', s=50,
            marker='o', edgecolors='black', label='Class +1')
plt.scatter(X_xor[y_xor == -1, 0], X_xor[y_xor == -1, 1], c='red', s=50,
            marker='s', edgecolors='black', label='Class -1')
plt.xlabel('xβ', fontsize=12)
plt.ylabel('xβ', fontsize=12)
plt.title('XOR Pattern (Not Linearly Separable)', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.axis('equal')
plt.show()
print("\nLinear SVM will fail on this data!")
print("Solution: Use kernel trick!")
# Compare different kernels on the XOR data: fit one SVC per kernel and
# draw its decision regions, boundary, support vectors, and accuracy.
kernels = ['linear', 'poly', 'rbf']
kernel_names = ['Linear', 'Polynomial (degree 3)', 'RBF (Gaussian)']
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for ax, kernel, name in zip(axes, kernels, kernel_names):
    # Fit SVM with kernel ('poly' additionally needs a degree).
    if kernel == 'poly':
        clf = SVC(kernel=kernel, degree=3, C=1.0)
    else:
        clf = SVC(kernel=kernel, C=1.0)
    clf.fit(X_xor, y_xor)
    # Create mesh covering the data range, step h.
    h = 0.02
    x1_min, x1_max = X_xor[:, 0].min() - 1, X_xor[:, 0].max() + 1
    x2_min, x2_max = X_xor[:, 1].min() - 1, X_xor[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, h),
                           np.arange(x2_min, x2_max, h))
    # Predict class labels over the mesh to shade decision regions.
    Z = clf.predict(np.c_[xx1.ravel(), xx2.ravel()])
    Z = Z.reshape(xx1.shape)
    # Plot regions (level 0 sits between the -1/+1 predictions).
    ax.contourf(xx1, xx2, Z, alpha=0.3, cmap=ListedColormap(['#FFAAAA', '#AAAAFF']))
    ax.contour(xx1, xx2, Z, levels=[0], colors='black', linewidths=2)
    ax.scatter(X_xor[y_xor == 1, 0], X_xor[y_xor == 1, 1], c='blue', s=30,
               marker='o', edgecolors='black', alpha=0.6)
    ax.scatter(X_xor[y_xor == -1, 0], X_xor[y_xor == -1, 1], c='red', s=30,
               marker='s', edgecolors='black', alpha=0.6)
    # Highlight support vectors
    ax.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
               s=150, facecolors='none', edgecolors='green', linewidths=2)
    # Compute accuracy (on the training data itself).
    acc = clf.score(X_xor, y_xor)
    ax.set_xlabel('xβ')
    ax.set_ylabel('xβ')
    ax.set_title(f'{name}\nAccuracy: {acc:.2%}\nSVs: {len(clf.support_vectors_)}')
    ax.set_xlim([x1_min, x1_max])
    ax.set_ylim([x2_min, x2_max])
plt.tight_layout()
plt.show()
print("Kernel Comparison:")
print(" - Linear: Fails (can't separate XOR with line)")
print(" - Polynomial: Works (creates curved boundary)")
print(" - RBF: Works best (flexible, smooth boundary)")
# Visualize RBF kernel transformation
# Simple 1D example to visualize: 5 alternating-label points on a line.
np.random.seed(42)
X_1d = np.array([-2, -1, 0, 1, 2]).reshape(-1, 1)
y_1d = np.array([-1, 1, -1, 1, -1])
# Compute RBF features manually
gamma = 1.0
centers = X_1d  # Use data points as centers
def rbf_features(X, centers, gamma):
    """Compute RBF features phi(x) = [exp(-gamma*||x - c_i||^2) for each center c_i].

    Parameters
    ----------
    X : (n, d) array of points.
    centers : (m, d) array of RBF centers.
    gamma : float, kernel width parameter.

    Returns
    -------
    (n, m) array whose (i, j) entry is exp(-gamma * ||X[i] - centers[j]||^2).
    """
    # Vectorized replacement of the original per-center Python loop:
    # broadcasting (n, 1, d) - (1, m, d) gives all pairwise differences,
    # summed over the feature axis to get squared distances in one pass.
    sq_dists = ((X[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2)
    return np.exp(-gamma * sq_dists)
# Transform the 1D points into the 5D RBF feature space.
phi_X = rbf_features(X_1d, centers, gamma)
print("RBF Kernel Transformation (1D β 5D)")
print(f"\nOriginal space (1D):")
print(X_1d.ravel())
print(f"\nLabels:")
print(y_1d)
print(f"\nRBF feature space (5D):")
print(phi_X)
# Visualize original 1D data next to two coordinates of the feature space.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Original space: points plotted on a line (y = 0).
axes[0].scatter(X_1d[y_1d == 1], np.zeros(sum(y_1d == 1)), c='blue', s=100,
                marker='o', edgecolors='black', label='Class +1')
axes[0].scatter(X_1d[y_1d == -1], np.zeros(sum(y_1d == -1)), c='red', s=100,
                marker='s', edgecolors='black', label='Class -1')
axes[0].set_xlabel('x')
axes[0].set_ylabel('')
axes[0].set_title('Original Space (1D)\n Not Linearly Separable')
axes[0].set_ylim([-0.5, 0.5])
axes[0].legend()
axes[0].grid(True, alpha=0.3)
# Feature space (show first 2 of the 5 dimensions)
axes[1].scatter(phi_X[y_1d == 1, 0], phi_X[y_1d == 1, 1], c='blue', s=100,
                marker='o', edgecolors='black', label='Class +1')
axes[1].scatter(phi_X[y_1d == -1, 0], phi_X[y_1d == -1, 1], c='red', s=100,
                marker='s', edgecolors='black', label='Class -1')
axes[1].set_xlabel('Οβ(x)')
axes[1].set_ylabel('Οβ(x)')
axes[1].set_title('RBF Feature Space (2 of 5 dims)\n Linearly Separable!')
axes[1].legend()
axes[1].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("\nKey insight:")
print(" - Data not separable in 1D")
print(" - RBF kernel maps to high-dimensional space")
print(" - Data becomes separable in feature space!")
print(" - Never need to compute Ο(x) explicitly (use kernel!)")
12.6 Soft Margin SVM
Problem with Hard Margin:
Requires data to be perfectly separable
Sensitive to outliers
May overfit
Solution: Soft Margin
Allow some misclassifications with slack variables ξ_n:
Soft Margin Objective:
\(\min_{\mathbf{w}, b, \boldsymbol{\xi}} \frac{1}{2}||\mathbf{w}||^2 + C\sum_n \xi_n\)
where:
C: Regularization parameter (tradeoff)
Large C: Focus on correct classification (hard margin)
Small C: Focus on large margin (soft margin)
Interpretation: Balance between margin size and training errors
# Generate overlapping data: two Gaussian clusters with closer centers.
np.random.seed(42)
X_overlap = np.vstack([
    np.random.randn(60, 2) + np.array([1.5, 1.5]),
    np.random.randn(60, 2) + np.array([-1.5, -1.5])
])
y_overlap = np.hstack([np.ones(60), -np.ones(60)])
# Add overlap: shrink 20 random points toward the origin so classes mix.
overlap_indices = np.random.choice(120, 20, replace=False)
X_overlap[overlap_indices] *= 0.3
print("Overlapping Data (Not Perfectly Separable)")
print(f"Shape: {X_overlap.shape}")
plt.figure(figsize=(10, 6))
plt.scatter(X_overlap[y_overlap == 1, 0], X_overlap[y_overlap == 1, 1],
            c='blue', s=50, marker='o', edgecolors='black', alpha=0.6, label='Class +1')
plt.scatter(X_overlap[y_overlap == -1, 0], X_overlap[y_overlap == -1, 1],
            c='red', s=50, marker='s', edgecolors='black', alpha=0.6, label='Class -1')
plt.xlabel('xβ', fontsize=12)
plt.ylabel('xβ', fontsize=12)
plt.title('Overlapping Data', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.axis('equal')
plt.show()
print("\nHard margin SVM would fail!")
print("Solution: Use soft margin (allow some misclassifications)")
# Compare different C values: one subplot per regularization strength,
# showing how the margin width and support-vector count change with C.
C_values = [0.01, 0.1, 1, 10, 100]
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()
for idx, C in enumerate(C_values):
    ax = axes[idx]
    # Fit a linear soft-margin SVM with this C.
    clf = SVC(kernel='linear', C=C)
    clf.fit(X_overlap, y_overlap)
    # Create mesh over the data range.
    h = 0.02
    x1_min, x1_max = X_overlap[:, 0].min() - 1, X_overlap[:, 0].max() + 1
    x2_min, x2_max = X_overlap[:, 1].min() - 1, X_overlap[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, h),
                           np.arange(x2_min, x2_max, h))
    # Signed decision values; levels -1/0/+1 give margins and boundary.
    Z = clf.decision_function(np.c_[xx1.ravel(), xx2.ravel()])
    Z = Z.reshape(xx1.shape)
    # Plot boundary, margins, and shaded decision regions.
    ax.contour(xx1, xx2, Z, levels=[-1, 0, 1], colors=['red', 'black', 'blue'],
               linestyles=['--', '-', '--'], linewidths=2)
    ax.contourf(xx1, xx2, Z, levels=[-np.inf, 0, np.inf],
                colors=['#FFAAAA', '#AAAAFF'], alpha=0.3)
    ax.scatter(X_overlap[y_overlap == 1, 0], X_overlap[y_overlap == 1, 1],
               c='blue', s=30, marker='o', edgecolors='black', alpha=0.6)
    ax.scatter(X_overlap[y_overlap == -1, 0], X_overlap[y_overlap == -1, 1],
               c='red', s=30, marker='s', edgecolors='black', alpha=0.6)
    # Support vectors
    ax.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
               s=150, facecolors='none', edgecolors='green', linewidths=2)
    # Training accuracy
    acc = clf.score(X_overlap, y_overlap)
    ax.set_xlabel('xβ')
    ax.set_ylabel('xβ')
    ax.set_title(f'C = {C}\nAccuracy: {acc:.2%}\nSVs: {len(clf.support_vectors_)}')
    ax.set_xlim([x1_min, x1_max])
    ax.set_ylim([x2_min, x2_max])
# Remove extra subplot (5 values on a 2x3 grid leaves one cell unused).
fig.delaxes(axes[5])
plt.tight_layout()
plt.show()
print("Effect of C (Regularization Parameter):")
print("\n Small C (e.g., 0.01):")
print(" - Large margin (wide separation)")
print(" - Many support vectors")
print(" - More misclassifications allowed")
print(" - Better generalization (less overfitting)")
print("\n Large C (e.g., 100):")
print(" - Small margin (narrow separation)")
print(" - Fewer support vectors")
print(" - Few misclassifications")
print(" - Risk of overfitting")
print("\n β C controls bias-variance tradeoff!")
# Real-world example: Handwritten digits (3 vs 8)
from sklearn.datasets import load_digits
from sklearn.model_selection import cross_val_score
# Load digits (8x8 grayscale images flattened to 64 features).
digits = load_digits()
X_digits = digits.data
y_digits = digits.target
# Select only 3s and 8s for a binary problem; map 3 -> +1, 8 -> -1.
mask = (y_digits == 3) | (y_digits == 8)
X_binary = X_digits[mask]
y_binary = np.where(y_digits[mask] == 3, 1, -1)
print("Digits Classification: 3 vs 8")
print(f"Dataset size: {X_binary.shape}")
print(f"Class distribution: {(y_binary == 1).sum()} threes, {(y_binary == -1).sum()} eights")
# Visualize sample digits: top row 3s, bottom row 8s.
fig, axes = plt.subplots(2, 10, figsize=(14, 3))
for i in range(10):
    axes[0, i].imshow(X_binary[y_binary == 1][i].reshape(8, 8), cmap='gray')
    axes[0, i].axis('off')
    axes[1, i].imshow(X_binary[y_binary == -1][i].reshape(8, 8), cmap='gray')
    axes[1, i].axis('off')
axes[0, 0].set_ylabel('3s', fontsize=12)
axes[1, 0].set_ylabel('8s', fontsize=12)
plt.suptitle('Sample Digits', fontsize=14)
plt.tight_layout()
plt.show()
# Compare different kernels via 5-fold cross-validation.
kernels_eval = {
    'Linear': SVC(kernel='linear', C=1.0),
    'RBF': SVC(kernel='rbf', C=1.0, gamma='scale'),
    'Polynomial': SVC(kernel='poly', degree=3, C=1.0)
}
print("\nCross-Validation Results (5-fold):")
for name, clf in kernels_eval.items():
    scores = cross_val_score(clf, X_binary, y_binary, cv=5)
    print(f" {name:12s}: {scores.mean():.3f} Β± {scores.std():.3f}")
# Fit best model (RBF) on the full binary dataset.
svm_digits = SVC(kernel='rbf', C=1.0, gamma='scale')
svm_digits.fit(X_binary, y_binary)
print(f"\nBest model: RBF kernel")
print(f" Number of support vectors: {len(svm_digits.support_vectors_)}/{len(X_binary)}")
print(f" Percentage: {len(svm_digits.support_vectors_)/len(X_binary)*100:.1f}%")
# Visualize some support vectors with their true labels.
sv_indices = svm_digits.support_[:20]  # First 20 SVs
fig, axes = plt.subplots(2, 10, figsize=(14, 3))
for i, ax in enumerate(axes.flat):
    if i < len(sv_indices):
        ax.imshow(X_binary[sv_indices[i]].reshape(8, 8), cmap='gray')
        label = '3' if y_binary[sv_indices[i]] == 1 else '8'
        ax.set_title(label, fontsize=10)
    ax.axis('off')
plt.suptitle('Support Vectors (First 20)', fontsize=14)
plt.tight_layout()
plt.show()
print("\nSupport vectors are the 'hardest' examples to classify!")
Summary
Key Concepts from Chapter 12:
Goal: Find hyperplane with maximum margin
Margin: Distance from hyperplane to closest points
Support Vectors: Points on the margin
Primal: Minimize ||w||² subject to constraints
Dual: Maximize based on Lagrange multipliers
Kernel Trick: Nonlinear classification via feature mapping
Soft Margin: Allow misclassifications with penalty C
SVM Formulations:
| Problem | Objective | Constraints |
|---|---|---|
| Hard Margin (Primal) | min (1/2)||w||² | y_n(w^T x_n + b) ≥ 1 |
| Soft Margin (Primal) | min (1/2)||w||² + C Σ ξ_n | y_n(w^T x_n + b) ≥ 1 − ξ_n, ξ_n ≥ 0 |
| Dual | max Σα_n − (1/2)ΣΣ α_n α_m y_n y_m k(x_n, x_m) | α_n ≥ 0, Σα_n y_n = 0 |
Common Kernels:
| Kernel | Formula | Use Case |
|---|---|---|
| Linear | k(x, x′) = x^T x′ | Linearly separable data |
| Polynomial | k(x, x′) = (x^T x′ + c)^d | Polynomial boundaries |
| RBF | k(x, x′) = exp(−γ||x − x′||²) | Smooth nonlinear boundaries |
| Sigmoid | k(x, x′) = tanh(γ x^T x′ + c) | Neural network-like |
Hyperparameter C:
| Value | Effect | Best For |
|---|---|---|
| Small C | Large margin, many SVs | Noisy data, better generalization |
| Large C | Small margin, few SVs | Clean data, fewer outliers |
When to Use SVM:
✓ Good for:
Binary classification
High-dimensional data
Clear margin of separation
Kernel methods needed
Small to medium datasets
✗ Not good for:
Very large datasets (O(N²) or O(N³) training)
Many classes (use one-vs-one or one-vs-all)
Probabilistic outputs needed
Speed is critical
Connection to Other Chapters:
| Concept | Chapter |
|---|---|
| Hyperplanes | Chapter 2, 3 |
| Optimization (QP) | Chapter 7 |
| Lagrange multipliers | Chapter 7 |
| Inner products | Chapter 3 |
| Kernels (Mercer's theorem) | Chapter 3 |
Practical Tips:
Always standardize features before SVM
Use RBF kernel as default (very flexible)
Grid search for C and γ (use cross-validation)
Check support vectors: Too many → overfit, too few → underfit
For large data: Use LinearSVC or SGDClassifier
For probabilities: Use SVC with probability=True (slower)
Key Advantages:
Maximum margin → good generalization
Kernel trick → nonlinear decision boundaries
Convex optimization → global optimum
Sparse solution → only support vectors matter
Effective in high dimensions → works well for images, text
Next Steps:
Multi-class SVM: One-vs-one, one-vs-all
SVM Regression: ε-insensitive loss
Advanced: Sequential Minimal Optimization (SMO)
Related: Logistic regression, kernel methods, neural networks
Practice: Apply SVM to a real classification problem and tune hyperparameters!