import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.linalg import eigh, det

# Notebook-wide setup: seaborn plot style and a fixed RNG seed so the
# DPP samples below are reproducible.
sns.set_style('whitegrid')
np.random.seed(42)

1. Motivation: Diversity

Problem

Sample diverse subset from \(\{1,\ldots,N\}\).

Examples:

  • Document summarization

  • Minibatch selection

  • Recommendation diversity

DPP Solution

Point process with repulsion between items.

\[P(Y) \propto \det(K_Y)\]

where \(K\) is positive semidefinite kernel matrix.

2. DPP Definition

Kernel Matrix

Let \(K \in \mathbb{R}^{N \times N}\) with \(0 \preceq K \preceq I\).

Probability

For subset \(Y \subseteq \{1,\ldots,N\}\):

\[P(Y) = \frac{\det(L_Y)}{\det(I + L)}\]

where \(L_Y\) is the submatrix with rows/cols in \(Y\). (This exact-set formula uses the L-ensemble kernel \(L\) of Section 3; the marginal kernel \(K = L(I+L)^{-1}\) satisfies \(0 \preceq K \preceq I\) and gives the inclusion probabilities below.)

Marginal Probability

\[P(i \in Y) = K_{ii}\]

Repulsion

\[P(i, j \in Y) = K_{ii}K_{jj} - K_{ij}^2 \leq P(i \in Y)P(j \in Y)\]

Negative correlation!

# Visualize repulsion
def compute_kernel_rbf(X, sigma=1.0):
    """Compute the RBF (Gaussian) kernel matrix for a set of feature vectors.

    Parameters
    ----------
    X : ndarray of shape (n, d)
        Feature vectors, one per row.
    sigma : float, default 1.0
        Kernel bandwidth.

    Returns
    -------
    ndarray of shape (n, n)
        K[i, j] = exp(-||x_i - x_j||^2 / (2 * sigma^2)).
    """
    # Vectorized pairwise squared distances via
    # ||x_i - x_j||^2 = ||x_i||^2 + ||x_j||^2 - 2 <x_i, x_j>,
    # replacing the original O(n^2) Python-level double loop.
    sq_norms = np.sum(X**2, axis=1)
    sq_dists = sq_norms[:, None] + sq_norms[None, :] - 2.0 * (X @ X.T)
    np.maximum(sq_dists, 0.0, out=sq_dists)  # clip tiny negatives from round-off
    return np.exp(-sq_dists / (2 * sigma**2))

# Generate 2D points, build their RBF similarity kernel, and show both
# the kernel heatmap and the points themselves.
np.random.seed(42)
X = np.random.randn(20, 2)
K = compute_kernel_rbf(X, sigma=1.5)

# Rescale so the kernel has trace 5 (keeps expected DPP size moderate)
K = K / (np.trace(K) + 1e-8) * 5

fig, (ax_kernel, ax_feat) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: heatmap of the kernel matrix
heat = ax_kernel.imshow(K, cmap='viridis')
ax_kernel.set_title('Kernel Matrix K', fontsize=12)
ax_kernel.set_xlabel('Item j', fontsize=11)
ax_kernel.set_ylabel('Item i', fontsize=11)
plt.colorbar(heat, ax=ax_kernel)

# Right panel: the points in feature space, labeled by index
ax_feat.scatter(X[:, 0], X[:, 1], s=100, alpha=0.6)
for idx, (px, py) in enumerate(X):
    ax_feat.text(px + 0.1, py + 0.1, str(idx), fontsize=8)
ax_feat.set_title('Feature Space', fontsize=12)
ax_feat.set_xlabel('Feature 1', fontsize=11)
ax_feat.set_ylabel('Feature 2', fontsize=11)
ax_feat.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

3. L-ensemble DPP

Quality-Diversity Tradeoff

Define \(L = B^T B\) where \(B \in \mathbb{R}^{d \times N}\).

\[L_{ij} = \langle b_i, b_j \rangle\]

Interpretation:

  • \(\|b_i\|\) = quality of item \(i\)

  • \(\langle b_i, b_j \rangle\) = similarity

Probability

\[P_L(Y) \propto \det(L_Y)\]

Volume interpretation: \(\det(L_Y) = \text{volume}^2\) of parallelepiped.

# Visualize volume interpretation
# Two candidate partners for b1: a near-duplicate (small spanned volume)
# and a near-orthogonal vector (large spanned volume).
np.random.seed(10)
b1 = np.array([1.5, 0.2])
b2_similar = np.array([1.4, 0.3])  # Similar to b1
b2_diverse = np.array([0.3, 1.5])  # Diverse from b1

def volume_squared(v1, v2):
    """Squared area of the parallelogram spanned by 2-vectors v1 and v2."""
    pair = np.vstack((v1, v2))
    determinant = np.linalg.det(pair)
    return determinant ** 2

# Squared volumes for the two pairs: similar vectors span little area,
# diverse vectors span a lot, so a DPP strongly prefers the latter.
vol_similar = volume_squared(b1, b2_similar)
vol_diverse = volume_squared(b1, b2_diverse)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Draw each pair of vectors plus the shaded parallelogram they span;
# the shaded area visualizes det(L_Y) = squared volume.
for ax, b2, label, vol in (
    (axes[0], b2_similar, 'Similar', vol_similar),
    (axes[1], b2_diverse, 'Diverse', vol_diverse),
):
    ax.arrow(0, 0, b1[0], b1[1], head_width=0.1, color='blue', label='b1')
    ax.arrow(0, 0, b2[0], b2[1], head_width=0.1, color='red', label='b2')
    ax.fill([0, b1[0], b1[0] + b2[0], b2[0]],
            [0, b1[1], b1[1] + b2[1], b2[1]], alpha=0.3)
    # Title fixed: original contained mojibake 'detΒ²' for 'det²'.
    ax.set_title(f'{label}: det²={vol:.3f}', fontsize=12)
    ax.set_xlim(-0.5, 3)
    ax.set_ylim(-0.5, 2)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Volume ratio: {vol_diverse/vol_similar:.2f}x larger for diverse!")

4. Sampling Algorithm

Eigendecomposition Method

  1. Compute \(L = V\Lambda V^T\)

  2. For each eigenvalue \(\lambda_i\), include eigenvector with prob \(\frac{\lambda_i}{1+\lambda_i}\)

  3. Sample from selected eigenvectors

Complexity: \(O(N^3)\) for eigendecomposition, \(O(Nk^3)\) for sampling.

def sample_dpp(L, max_iter=100):
    """Draw one exact sample from the DPP with L-ensemble kernel ``L``.

    Spectral algorithm (Hough et al.; Kulesza & Taskar, Alg. 1):
    Phase 1 keeps each eigenvector independently with probability
    lambda/(1+lambda); Phase 2 sequentially samples one item per kept
    eigenvector from the span of the remaining basis.

    Bug fix vs. the previous version: the Phase-2 update built a (k, k)
    matrix ``np.outer(V[i], V[i])`` and subtracted it from the (N, k)
    matrix ``V`` — a broadcasting error whenever 1 < k != N — and never
    removed the consumed column, so items could repeat. We now eliminate
    one column per selected item and re-orthonormalize.

    Parameters
    ----------
    L : ndarray of shape (N, N)
        Symmetric positive semidefinite kernel.
    max_iter : int
        Unused; retained for backward compatibility.

    Returns
    -------
    ndarray
        Indices of the sampled subset (possibly empty), without duplicates.
    """
    N = L.shape[0]

    # Eigendecomposition of the (symmetric) kernel.
    eigvals, eigvecs = eigh(L)
    eigvals = np.maximum(eigvals, 0)  # clip tiny negative round-off

    # Phase 1: Bernoulli selection of eigenvectors with prob l/(1+l).
    selected = [i for i in range(N)
                if np.random.rand() < eigvals[i] / (1 + eigvals[i])]
    if not selected:
        return np.array([], dtype=int)

    V = eigvecs[:, selected]

    # Phase 2: sample exactly one item per remaining column of V.
    Y = []
    while V.shape[1] > 0:
        # P(item i) is proportional to the squared norm of row i of V.
        probs = np.sum(V**2, axis=1)
        probs /= np.sum(probs)
        i = np.random.choice(N, p=probs)
        Y.append(i)

        if V.shape[1] == 1:
            break
        # Eliminate a column with a nonzero entry in row i (argmax is safe:
        # row i cannot be all-zero, else P(i) would have been 0), then zero
        # out row i in the remaining columns.
        j = int(np.argmax(np.abs(V[i, :])))
        Vj = V[:, j].copy()
        V = np.delete(V, j, axis=1)
        V = V - np.outer(Vj, V[i, :] / Vj[i])
        # Re-orthonormalize so row norms remain valid selection weights.
        V, _ = np.linalg.qr(V)

    return np.array(Y)

# Empirically exercise the sampler on a random low-rank L-ensemble and
# compare the empirical subset-size distribution with the trace formula.
n_items = 30
feat_dim = 5
feats = np.random.randn(feat_dim, n_items)
L_mat = feats.T @ feats

# Draw repeated samples and record each subset's size
draws = [sample_dpp(L_mat) for _ in range(100)]
subset_sizes = [len(draw) for draw in draws]

plt.figure(figsize=(8, 5))
plt.hist(subset_sizes, bins=range(max(subset_sizes) + 2), alpha=0.7, edgecolor='black')
plt.xlabel('Subset Size', fontsize=11)
plt.ylabel('Frequency', fontsize=11)
plt.title('DPP Sample Size Distribution', fontsize=12)
plt.grid(True, alpha=0.3, axis='y')
plt.show()

print(f"Mean size: {np.mean(subset_sizes):.2f}")
print(f"Expected size (trace formula): {np.trace(L_mat @ np.linalg.inv(L_mat + np.eye(n_items))):.2f}")

5. Application: Diverse Subset Selection

# Compare DPP vs random sampling
np.random.seed(42)
n_points = 50
X = np.random.randn(n_points, 2)

# RBF similarity kernel, rescaled to trace 10
K_sim = compute_kernel_rbf(X, sigma=1.0)
L_ens = K_sim / (np.trace(K_sim) + 1e-8) * 10

# One DPP draw, and a uniform random subset of the same size as baseline
dpp_sample = sample_dpp(L_ens)
k = len(dpp_sample)
random_sample = np.random.choice(n_points, size=k, replace=False)

fig, (ax_dpp, ax_rnd) = plt.subplots(1, 2, figsize=(14, 5))

# Same rendering for both panels: all points in gray, chosen subset on top
for ax, chosen, color, label in (
    (ax_dpp, dpp_sample, 'red', 'DPP'),
    (ax_rnd, random_sample, 'blue', 'Random'),
):
    ax.scatter(X[:, 0], X[:, 1], s=30, alpha=0.3, color='gray', label='All')
    ax.scatter(X[chosen, 0], X[chosen, 1], s=100, color=color, label=label)
    ax.set_title(f'{label} Sample (k={k})', fontsize=12)
    ax.set_xlabel('Feature 1', fontsize=11)
    ax.set_ylabel('Feature 2', fontsize=11)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Diversity metrics
def min_distance(X, indices):
    """Smallest pairwise Euclidean distance among the points X[indices].

    Returns 0 when fewer than two indices are supplied.
    """
    if len(indices) < 2:
        return 0
    pair_dists = [
        np.linalg.norm(X[indices[a]] - X[indices[b]])
        for a in range(len(indices))
        for b in range(a + 1, len(indices))
    ]
    return min(pair_dists)

# A larger minimum pairwise distance indicates a more spread-out
# (diverse) subset; the DPP sample should beat the random baseline.
print(f"DPP min distance: {min_distance(X, dpp_sample):.3f}")
print(f"Random min distance: {min_distance(X, random_sample):.3f}")

Summary

DPP Definition:

\[P(Y) \propto \det(L_Y)\]

Repulsion: Negative correlation between items.

Properties:

  • \(P(i \in Y) = K_{ii}\)

  • Expected size: \(\mathbb{E}[|Y|] = \text{tr}(K(I+K)^{-1})\)

  • Volume interpretation: maximize diversity

Sampling:

  1. Eigendecompose \(L = V\Lambda V^T\)

  2. Select eigenvectors stochastically

  3. Sequential sampling from span

Applications:

  • Document summarization

  • Minibatch selection for SGD

  • Recommendation systems

  • Neural architecture search

Next Steps:

  • 07_concentration_inequalities.ipynb - Theoretical bounds

  • Explore conditional DPPs

  • Fast sampling approximations