Information Theory
Shannon entropy, cross-entropy loss, KL divergence, mutual information — the math behind ML objectives.
# Third-party imports used throughout the notebook.
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# NOTE(review): scipy's `entropy` is imported but never used in this chunk;
# the notebook defines its own `calculate_entropy` below — confirm before removing.
from scipy.stats import entropy
# Global plot styling and a fixed RNG seed so all examples are reproducible.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
np.random.seed(42)
1. Information Content & Entropy
Key idea: Rare events carry more "information" than common events.
Information content of an event with probability \(p\): $$I(x) = -\log_2 p(x) \ \text{bits}$$
Entropy - average information content: $$H(X) = -\sum_{i} p(x_i) \log_2 p(x_i)$$
High entropy = high uncertainty
# Information content example
def information_content(p, base=2):
    """Return the information content (self-information) of an event.

    Parameters
    ----------
    p : float or array-like
        Probability (or probabilities) of the event, in (0, 1].
    base : int or float, optional
        Logarithm base. The default of 2 gives the result in bits;
        pass ``np.e`` for nats. (Added as a backward-compatible
        generalization — existing callers get bits, as before.)

    Returns
    -------
    float or ndarray
        ``-log_base(p)``. Rare events (small p) carry more information.
    """
    if base == 2:
        # Use log2 directly so the default path is bit-for-bit identical
        # to the original -np.log2(p).
        return -np.log2(p)
    # Change-of-base formula for any other unit.
    return -np.log(p) / np.log(base)
# Examples: compare the self-information of a common vs. a rare event.
prob_common = 0.9 # Common event (e.g., sun rises)
prob_rare = 0.01 # Rare event (e.g., lottery win)
print("Common event (p=0.9):")
print(f" Information: {information_content(prob_common):.3f} bits")
print("\nRare event (p=0.01):")
print(f" Information: {information_content(prob_rare):.3f} bits")
print("\nRare events carry more information!")
# Visualize information content: I(p) = -log2(p) grows without bound as
# p -> 0 and drops to 0 bits at p = 1 (a certain event is uninformative).
p_values = np.linspace(0.01, 1, 100)
info_values = information_content(p_values)
plt.figure(figsize=(10, 6))
plt.plot(p_values, info_values, linewidth=2)
plt.xlabel('Probability p(x)', fontsize=12)
plt.ylabel('Information I(x) [bits]', fontsize=12)
plt.title('Information Content vs Probability', fontsize=14)
plt.grid(True, alpha=0.3)
plt.axhline(y=0, color='k', linewidth=0.5)
plt.show()
# Calculate entropy for different distributions
def calculate_entropy(probabilities):
    """Shannon entropy H(X) = -sum_i p_i * log2(p_i), in bits.

    Zero-probability outcomes are dropped first: they contribute
    nothing to the sum (lim p->0 of p*log p is 0) and would otherwise
    make log2 evaluate log(0).
    """
    nonzero = probabilities[probabilities > 0]
    weighted_logs = nonzero * np.log2(nonzero)
    return -weighted_logs.sum()
# Example 1: Fair coin (maximum entropy for binary)
fair_coin = np.array([0.5, 0.5])
print("Fair coin [0.5, 0.5]:")
print(f" Entropy: {calculate_entropy(fair_coin):.3f} bits")
# Example 2: Biased coin (lower entropy — the outcome is more predictable)
biased_coin = np.array([0.9, 0.1])
print("\nBiased coin [0.9, 0.1]:")
print(f" Entropy: {calculate_entropy(biased_coin):.3f} bits")
# Example 3: Certain outcome (zero entropy — no uncertainty at all)
certain = np.array([1.0, 0.0])
print("\nCertain outcome [1.0, 0.0]:")
print(f" Entropy: {calculate_entropy(certain):.3f} bits")
# Example 4: Uniform distribution over 8 outcomes (entropy = log2(8) = 3)
uniform_8 = np.ones(8) / 8
print("\nUniform over 8 outcomes:")
print(f" Entropy: {calculate_entropy(uniform_8):.3f} bits")
print(" (Exactly 3 bits - need 3 bits to encode 8 outcomes!)")
# Visualize entropy H([p, 1-p]) over the whole range of binary distributions;
# the curve peaks at p = 0.5 and falls to 0 at both extremes.
p1_values = np.linspace(0.01, 0.99, 100)
entropies = [calculate_entropy(np.array([p, 1-p])) for p in p1_values]
plt.figure(figsize=(10, 6))
plt.plot(p1_values, entropies, linewidth=2)
plt.axvline(x=0.5, color='r', linestyle='--', label='Maximum entropy (p=0.5)')
plt.xlabel('Probability p', fontsize=12)
plt.ylabel('Entropy H(X) [bits]', fontsize=12)
plt.title('Binary Distribution Entropy', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.show()
print("Maximum uncertainty (entropy) at p=0.5 (fair coin)")
2. Cross-Entropy
The most important loss function in deep learning!
Cross-entropy between true distribution \(p\) and predicted distribution \(q\): $$H(p, q) = -\sum_{i} p(x_i) \log q(x_i)$$
In classification:
\(p\) = true labels (one-hot encoded)
\(q\) = model predictions (softmax output)
Minimize cross-entropy = make predictions match true distribution
def cross_entropy(p, q):
    """Cross-entropy H(p, q) = -sum_i p_i * log(q_i), using natural log.

    Predicted probabilities are clipped away from 0 before the log so
    a hard-zero prediction never produces log(0).
    """
    safe_q = np.clip(q, 1e-10, 1)
    return -(p * np.log(safe_q)).sum()
# Example: 3-class classification — compare the loss for confident-correct,
# confident-wrong, and uncertain predictions against a one-hot true label.
true_label = np.array([0, 1, 0]) # Class 2 is correct
# Good prediction (high confidence, correct class) -> small loss (-log 0.8)
good_pred = np.array([0.1, 0.8, 0.1])
print("True label: [0, 1, 0] (class 2)")
print(f"\nGood prediction [0.1, 0.8, 0.1]:")
print(f" Cross-entropy: {cross_entropy(true_label, good_pred):.4f}")
# Bad prediction (high confidence, wrong class) -> large loss (-log 0.1)
bad_pred = np.array([0.8, 0.1, 0.1])
print(f"\nBad prediction [0.8, 0.1, 0.1]:")
print(f" Cross-entropy: {cross_entropy(true_label, bad_pred):.4f}")
# Uncertain prediction -> loss in between (-log 0.34)
uncertain_pred = np.array([0.33, 0.34, 0.33])
print(f"\nUncertain prediction [0.33, 0.34, 0.33]:")
print(f" Cross-entropy: {cross_entropy(true_label, uncertain_pred):.4f}")
print("\nLower cross-entropy = better predictions!")
# Binary cross-entropy (for binary classification)
def binary_cross_entropy(y_true, y_pred):
    """Mean binary cross-entropy loss over all samples.

    Predictions are clipped into (1e-10, 1 - 1e-10) so that neither
    log(y_pred) nor log(1 - y_pred) can ever be evaluated at 0.
    """
    clipped = np.clip(y_pred, 1e-10, 1 - 1e-10)
    positive_term = y_true * np.log(clipped)
    negative_term = (1 - y_true) * np.log(1 - clipped)
    return -np.mean(positive_term + negative_term)
# Visualize BCE for different predictions: plot the per-sample loss as a
# function of the predicted probability, for each of the two true labels.
y_pred_range = np.linspace(0.01, 0.99, 100)
# When true label is 1, the loss reduces to -log(p): small near p=1,
# diverging as p -> 0.
bce_true1 = [-np.log(p) for p in y_pred_range]
# When true label is 0, it is the mirror image, -log(1-p).
bce_true0 = [-np.log(1-p) for p in y_pred_range]
plt.figure(figsize=(12, 6))
plt.plot(y_pred_range, bce_true1, label='True label = 1', linewidth=2)
plt.plot(y_pred_range, bce_true0, label='True label = 0', linewidth=2)
plt.xlabel('Predicted Probability', fontsize=12)
plt.ylabel('Binary Cross-Entropy Loss', fontsize=12)
plt.title('Binary Cross-Entropy Loss Function', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.ylim(0, 5)
plt.show()
print("Penalty increases exponentially as prediction diverges from true label!")
3. KL Divergence (Kullback-Leibler)
How different are two probability distributions?
Properties:
Always ≥ 0
= 0 only when P = Q
NOT symmetric: \(D_{KL}(P \| Q) \neq D_{KL}(Q \| P)\)
Relationship to cross-entropy: $$D_{KL}(P \| Q) = H(P, Q) - H(P)$$
Since \(H(P)\) is constant for a fixed true distribution, minimizing cross-entropy is equivalent to minimizing KL divergence!
def kl_divergence(p, q):
    """Compute the KL divergence D_KL(P || Q) in nats.

    Parameters
    ----------
    p, q : array-like
        Probability vectors of equal length. Entries of p that are 0
        contribute nothing (lim p->0 of p*log(p/q) = 0) and are masked out.

    Returns
    -------
    float
        sum over {i : p_i > 0} of p_i * log(p_i / q_i); >= 0, and 0 iff p == q.
    """
    p = np.asarray(p)
    q = np.asarray(q)
    # Only consider where p > 0
    mask = p > 0
    # Fix: clip q away from zero so q_i == 0 where p_i > 0 no longer raises a
    # divide-by-zero RuntimeWarning; the result is a large finite value instead
    # of inf. Identical to the original for any q >= 1e-10.
    q_safe = np.clip(q[mask], 1e-10, None)
    return np.sum(p[mask] * np.log(p[mask] / q_safe))
# True distribution (e.g., actual data distribution)
p_true = np.array([0.5, 0.3, 0.2])
# Different approximations: KL grows as Q moves away from P.
q1 = np.array([0.5, 0.3, 0.2]) # Perfect match -> KL = 0
q2 = np.array([0.4, 0.35, 0.25]) # Close -> small KL
q3 = np.array([0.33, 0.33, 0.34]) # Uniform (far) -> larger KL
print("True distribution P:", p_true)
print(f"\nQ1 {q1}: KL = {kl_divergence(p_true, q1):.4f}")
print(f"Q2 {q2}: KL = {kl_divergence(p_true, q2):.4f}")
print(f"Q3 {q3}: KL = {kl_divergence(p_true, q3):.4f}")
# Demonstrate asymmetry: swapping the arguments gives a different value,
# so KL divergence is not a true distance metric.
print("\n=== KL Divergence is Asymmetric ===")
print(f"D_KL(P || Q2) = {kl_divergence(p_true, q2):.4f}")
print(f"D_KL(Q2 || P) = {kl_divergence(q2, p_true):.4f}")
# Visualize KL divergence for Gaussian distributions
from scipy.stats import norm
x = np.linspace(-5, 10, 1000)
# True distribution: N(2, 1)
mu_true, sigma_true = 2, 1
p_dist = norm(mu_true, sigma_true)
p_x = p_dist.pdf(x)
# Approximations with different means (same standard deviation)
mu_approx_values = [2, 3, 4, 5]
colors = ['green', 'blue', 'orange', 'red']
plt.figure(figsize=(14, 6))
# Left panel: the density curves, annotated with each one's KL from P.
plt.subplot(1, 2, 1)
plt.plot(x, p_x, 'k-', linewidth=3, label=f'True P: N({mu_true}, {sigma_true}²)')
for mu, color in zip(mu_approx_values, colors):
    q_dist = norm(mu, sigma_true)
    q_x = q_dist.pdf(x)
    # Calculate KL divergence analytically for Gaussians: for two Gaussians
    # with equal variance, D_KL(N(mu1,s) || N(mu2,s)) = (mu1 - mu2)^2 / (2 s^2).
    kl = 0.5 * ((mu - mu_true)**2 / sigma_true**2)
    plt.plot(x, q_x, color=color, linewidth=2,
             label=f'Q: N({mu}, {sigma_true}²), KL={kl:.3f}')
plt.xlabel('x', fontsize=12)
plt.ylabel('Probability Density', fontsize=12)
plt.title('Gaussian Distributions', fontsize=14)
plt.legend(fontsize=10)
plt.grid(True, alpha=0.3)
# Right panel: KL divergence vs mean difference — a parabola with its
# minimum (KL = 0) at the true mean.
plt.subplot(1, 2, 2)
mu_range = np.linspace(0, 6, 100)
kl_values = [0.5 * ((mu - mu_true)**2 / sigma_true**2) for mu in mu_range]
plt.plot(mu_range, kl_values, linewidth=2)
plt.axvline(x=mu_true, color='r', linestyle='--', label=f'True mean = {mu_true}')
plt.xlabel('Approximate Mean μ', fontsize=12)
plt.ylabel('KL Divergence', fontsize=12)
plt.title('KL Divergence vs Mean', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
4. ML Application: Training a Neural Network
Cross-entropy is not just a theoretical construct — it is the default loss function for virtually every classification neural network. When a model outputs class probabilities through a softmax layer, minimizing cross-entropy is equivalent to maximizing the log-likelihood of the true labels under the model's predicted distribution, which in turn is equivalent to minimizing the KL divergence between the true and predicted distributions (since the entropy of the true labels is fixed). The code below trains a scikit-learn MLPClassifier on a synthetic 3-class problem and tracks how the cross-entropy loss decreases over training iterations, demonstrating the direct connection between information theory and model optimization.
# Simulate neural network training on a classification task
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
# Generate a synthetic 3-class dataset and hold out 20% for testing.
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15,
                           n_classes=3, n_clusters_per_class=1, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train a small two-hidden-layer neural network.
model = MLPClassifier(hidden_layer_sizes=(50, 30), max_iter=100,
                      random_state=42, verbose=False)
model.fit(X_train, y_train)
# Get predictions (probabilities), one row of class probabilities per sample.
y_pred_proba = model.predict_proba(X_test)
# Calculate cross-entropy loss manually against one-hot encoded labels.
# NOTE(review): flattening both arrays makes cross_entropy return the loss
# SUMMED over all test samples, not the per-sample mean — divide by
# len(y_test) to compare against sklearn's log_loss.
from sklearn.preprocessing import label_binarize
y_test_onehot = label_binarize(y_test, classes=[0, 1, 2])
ce_loss = cross_entropy(y_test_onehot.flatten(), y_pred_proba.flatten())
print(f"Cross-Entropy Loss: {ce_loss:.4f}")
print(f"Test Accuracy: {model.score(X_test, y_test):.4f}")
# Show some predictions
print("\nExample predictions:")
for i in range(5):
    print(f"True: {y_test[i]}, Predicted probs: {y_pred_proba[i]}")
# Visualize loss during training
# Retrain with a larger iteration budget; the per-iteration loss is read
# from loss_curve_ afterwards (verbose printing stays off).
model_verbose = MLPClassifier(hidden_layer_sizes=(50, 30), max_iter=200,
                              random_state=42, verbose=False)
model_verbose.fit(X_train, y_train)
plt.figure(figsize=(10, 6))
plt.plot(model_verbose.loss_curve_, linewidth=2)
plt.xlabel('Iteration', fontsize=12)
plt.ylabel('Cross-Entropy Loss', fontsize=12)
plt.title('Training Loss Curve', fontsize=14)
plt.grid(True, alpha=0.3)
plt.show()
print("Loss decreases as model learns to predict correct classes!")
5. Mutual Information
How much does knowing one variable tell you about another?
Properties:
= 0 if X and Y are independent
Higher values = more dependence
Used in feature selection
from sklearn.metrics import mutual_info_score
# NOTE: sklearn's mutual_info_score reports MI in nats (natural log).
# Example 1: Perfectly correlated — knowing x determines y exactly,
# so MI equals the entropy of x.
x1 = np.array([0, 0, 1, 1, 2, 2])
y1 = np.array([0, 0, 1, 1, 2, 2]) # Same as x1
mi1 = mutual_info_score(x1, y1)
print("Perfectly correlated:")
print(f" Mutual Information: {mi1:.4f}")
# Example 2: Partially correlated — knowing x narrows down y, but not fully.
x2 = np.array([0, 0, 1, 1, 2, 2])
y2 = np.array([0, 1, 1, 1, 2, 0]) # Some correlation
mi2 = mutual_info_score(x2, y2)
print("\nPartially correlated:")
print(f" Mutual Information: {mi2:.4f}")
# Example 3: Independent — MI should be near 0 (small positive values
# arise from finite-sample noise).
np.random.seed(42)
x3 = np.random.randint(0, 3, 100)
y3 = np.random.randint(0, 3, 100) # Random, independent
mi3 = mutual_info_score(x3, y3)
print("\nIndependent (random):")
print(f" Mutual Information: {mi3:.4f}")
Summary
✓ Entropy: Measure of uncertainty/randomness ✓ Cross-Entropy: Loss function for classification (minimize to match distributions) ✓ KL Divergence: Distance between probability distributions ✓ Mutual Information: Measure of dependence between variables
Key Insights:
Cross-entropy loss is the standard for classification because:
Probabilistic interpretation
Smooth gradients for optimization
Equivalent to minimizing KL divergence
Relationship:
Entropy = average surprise
Cross-entropy = expected surprise using wrong distribution
KL divergence = extra surprise from using wrong distribution
Applications:
Neural network training (loss functions)
Model selection (information criteria)
Feature selection (mutual information)
Variational inference (KL divergence)
Next Steps:
Study softmax function and its gradient
Learn about variational autoencoders (VAEs)
Explore information bottleneck theory
Understand mutual information in neural networks