Information Theory

Shannon entropy, cross-entropy loss, KL divergence, and mutual information: the math behind ML objectives.

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import entropy

plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
np.random.seed(42)

1. Information Content & Entropy

Key idea: Rare events carry more "information" than common events.

Information content of an event with probability \(p(x)\):

\[I(x) = -\log_2 p(x) \quad \text{bits}\]

Entropy, the average information content:

\[H(X) = -\sum_{i} p(x_i) \log_2 p(x_i)\]

High entropy = high uncertainty

# Information content example
def information_content(p):
    """Calculate information content in bits"""
    return -np.log2(p)

# Examples
prob_common = 0.9  # Common event (e.g., sun rises)
prob_rare = 0.01   # Rare event (e.g., lottery win)

print("Common event (p=0.9):")
print(f"  Information: {information_content(prob_common):.3f} bits")
print("\nRare event (p=0.01):")
print(f"  Information: {information_content(prob_rare):.3f} bits")
print("\nRare events carry more information!")
# Visualize information content
p_values = np.linspace(0.01, 1, 100)
info_values = information_content(p_values)

plt.figure(figsize=(10, 6))
plt.plot(p_values, info_values, linewidth=2)
plt.xlabel('Probability p(x)', fontsize=12)
plt.ylabel('Information I(x) [bits]', fontsize=12)
plt.title('Information Content vs Probability', fontsize=14)
plt.grid(True, alpha=0.3)
plt.axhline(y=0, color='k', linewidth=0.5)
plt.show()
# Calculate entropy for different distributions
def calculate_entropy(probabilities):
    """Calculate Shannon entropy in bits"""
    # Remove zeros to avoid log(0)
    p = probabilities[probabilities > 0]
    return -np.sum(p * np.log2(p))

# Example 1: Fair coin (maximum entropy for binary)
fair_coin = np.array([0.5, 0.5])
print("Fair coin [0.5, 0.5]:")
print(f"  Entropy: {calculate_entropy(fair_coin):.3f} bits")

# Example 2: Biased coin (lower entropy)
biased_coin = np.array([0.9, 0.1])
print("\nBiased coin [0.9, 0.1]:")
print(f"  Entropy: {calculate_entropy(biased_coin):.3f} bits")

# Example 3: Certain outcome (zero entropy)
certain = np.array([1.0, 0.0])
print("\nCertain outcome [1.0, 0.0]:")
print(f"  Entropy: {calculate_entropy(certain):.3f} bits")

# Example 4: Uniform distribution over 8 outcomes
uniform_8 = np.ones(8) / 8
print("\nUniform over 8 outcomes:")
print(f"  Entropy: {calculate_entropy(uniform_8):.3f} bits")
print("  (Exactly 3 bits: log2(8) = 3, so 3 bits are needed to encode 8 equally likely outcomes!)")
# Visualize entropy for binary distributions
p1_values = np.linspace(0.01, 0.99, 100)
entropies = [calculate_entropy(np.array([p, 1-p])) for p in p1_values]

plt.figure(figsize=(10, 6))
plt.plot(p1_values, entropies, linewidth=2)
plt.axvline(x=0.5, color='r', linestyle='--', label='Maximum entropy (p=0.5)')
plt.xlabel('Probability p', fontsize=12)
plt.ylabel('Entropy H(X) [bits]', fontsize=12)
plt.title('Binary Distribution Entropy', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.show()

print("Maximum uncertainty (entropy) at p=0.5 (fair coin)")

2. Cross-Entropy

The most important loss function in deep learning!

Cross-entropy between true distribution \(p\) and predicted distribution \(q\):

\[H(p, q) = -\sum_{i} p(x_i) \log q(x_i)\]

In classification:

  • \(p\) = true labels (one-hot encoded)

  • \(q\) = model predictions (softmax output)

  • Minimize cross-entropy = make predictions match true distribution
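In practice, the predicted distribution \(q\) comes from a softmax over the model's raw logits. A minimal sketch of that pipeline (the `logits` values here are made up for illustration):

```python
import numpy as np

def softmax(z):
    """Convert raw logits into a probability distribution (numerically stable)."""
    z = z - np.max(z)  # shift for stability; softmax is shift-invariant
    e = np.exp(z)
    return e / e.sum()

# Hypothetical logits for a 3-class problem
logits = np.array([1.0, 3.0, 0.5])
q = softmax(logits)                # model's predicted distribution
p = np.array([0.0, 1.0, 0.0])      # one-hot true label (class 1)

loss = -np.sum(p * np.log(q))      # cross-entropy H(p, q)
print(f"softmax probabilities: {q}")
print(f"cross-entropy loss:    {loss:.4f}")
```

Because \(p\) is one-hot, the loss reduces to the negative log-probability the model assigns to the correct class.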

def cross_entropy(p, q):
    """Calculate cross-entropy H(p, q)"""
    # Clip q to avoid log(0)
    q = np.clip(q, 1e-10, 1)
    return -np.sum(p * np.log(q))

# Example: 3-class classification
true_label = np.array([0, 1, 0])  # class 1 (0-indexed) is correct

# Good prediction (high confidence, correct class)
good_pred = np.array([0.1, 0.8, 0.1])
print("True label: [0, 1, 0] (class 1)")
print(f"\nGood prediction [0.1, 0.8, 0.1]:")
print(f"  Cross-entropy: {cross_entropy(true_label, good_pred):.4f}")

# Bad prediction (high confidence, wrong class)
bad_pred = np.array([0.8, 0.1, 0.1])
print(f"\nBad prediction [0.8, 0.1, 0.1]:")
print(f"  Cross-entropy: {cross_entropy(true_label, bad_pred):.4f}")

# Uncertain prediction
uncertain_pred = np.array([0.33, 0.34, 0.33])
print(f"\nUncertain prediction [0.33, 0.34, 0.33]:")
print(f"  Cross-entropy: {cross_entropy(true_label, uncertain_pred):.4f}")

print("\nLower cross-entropy = better predictions!")
# Binary cross-entropy (for binary classification)
def binary_cross_entropy(y_true, y_pred):
    """Binary cross-entropy loss"""
    y_pred = np.clip(y_pred, 1e-10, 1 - 1e-10)
    return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

# Visualize BCE for different predictions
y_pred_range = np.linspace(0.01, 0.99, 100)

# When true label is 1
bce_true1 = [-np.log(p) for p in y_pred_range]
# When true label is 0
bce_true0 = [-np.log(1-p) for p in y_pred_range]

plt.figure(figsize=(12, 6))
plt.plot(y_pred_range, bce_true1, label='True label = 1', linewidth=2)
plt.plot(y_pred_range, bce_true0, label='True label = 0', linewidth=2)
plt.xlabel('Predicted Probability', fontsize=12)
plt.ylabel('Binary Cross-Entropy Loss', fontsize=12)
plt.title('Binary Cross-Entropy Loss Function', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.ylim(0, 5)
plt.show()

print("Penalty grows without bound as the prediction diverges from the true label!")
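As a sanity check, the manual binary cross-entropy should agree with scikit-learn's `log_loss`. A short sketch (the sample labels and predicted probabilities below are made up):

```python
import numpy as np
from sklearn.metrics import log_loss

y_true = np.array([1, 0, 1, 1, 0])
y_pred = np.array([0.9, 0.2, 0.7, 0.6, 0.1])  # hypothetical predicted probabilities

# Manual binary cross-entropy (mean over samples)
manual = -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
sklearn_loss = log_loss(y_true, y_pred)

print(f"manual BCE:  {manual:.6f}")
print(f"sklearn BCE: {sklearn_loss:.6f}")
```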

3. KL Divergence (Kullback-Leibler)

How different are two probability distributions?

\[D_{KL}(P \| Q) = \sum_{i} p(x_i) \log \frac{p(x_i)}{q(x_i)}\]

Properties:

  • Always ≥ 0

  • = 0 only when P = Q

  • NOT symmetric: \(D_{KL}(P \| Q) \neq D_{KL}(Q \| P)\)

Relationship to cross-entropy:

\[D_{KL}(P \| Q) = H(P, Q) - H(P)\]

Since \(H(P)\) is constant for fixed true distribution, minimizing cross-entropy = minimizing KL divergence!
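This identity is easy to verify numerically. A self-contained sketch (natural log throughout, so the units are nats rather than bits):

```python
import numpy as np

p = np.array([0.5, 0.3, 0.2])   # true distribution
q = np.array([0.4, 0.35, 0.25]) # approximation

H_p  = -np.sum(p * np.log(p))      # entropy H(P), in nats
H_pq = -np.sum(p * np.log(q))      # cross-entropy H(P, Q)
D_kl = np.sum(p * np.log(p / q))   # KL divergence D_KL(P || Q)

print(f"H(P)           = {H_p:.6f}")
print(f"H(P, Q)        = {H_pq:.6f}")
print(f"H(P,Q) - H(P)  = {H_pq - H_p:.6f}  (equals D_KL = {D_kl:.6f})")
```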

def kl_divergence(p, q):
    """Calculate KL divergence D_KL(P || Q)"""
    p = np.asarray(p)
    q = np.asarray(q)
    # Only consider where p > 0
    mask = p > 0
    return np.sum(p[mask] * np.log(p[mask] / q[mask]))

# True distribution (e.g., actual data distribution)
p_true = np.array([0.5, 0.3, 0.2])

# Different approximations
q1 = np.array([0.5, 0.3, 0.2])  # Perfect match
q2 = np.array([0.4, 0.35, 0.25])  # Close
q3 = np.array([0.33, 0.33, 0.34])  # Uniform (far)

print("True distribution P:", p_true)
print(f"\nQ1 {q1}: KL = {kl_divergence(p_true, q1):.4f}")
print(f"Q2 {q2}: KL = {kl_divergence(p_true, q2):.4f}")
print(f"Q3 {q3}: KL = {kl_divergence(p_true, q3):.4f}")

# Demonstrate asymmetry
print("\n=== KL Divergence is Asymmetric ===")
print(f"D_KL(P || Q2) = {kl_divergence(p_true, q2):.4f}")
print(f"D_KL(Q2 || P) = {kl_divergence(q2, p_true):.4f}")
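`scipy.stats.entropy` (imported at the top of the notebook) returns exactly this KL divergence when given two distributions, so it can serve as a cross-check of the manual implementation:

```python
import numpy as np
from scipy.stats import entropy

p_true = np.array([0.5, 0.3, 0.2])
q2 = np.array([0.4, 0.35, 0.25])

manual = np.sum(p_true * np.log(p_true / q2))
scipy_kl = entropy(p_true, q2)  # entropy(pk, qk) computes D_KL(pk || qk) in nats

print(f"manual KL: {manual:.6f}")
print(f"scipy KL:  {scipy_kl:.6f}")
```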
# Visualize KL divergence for Gaussian distributions
from scipy.stats import norm

x = np.linspace(-5, 10, 1000)

# True distribution
mu_true, sigma_true = 2, 1
p_dist = norm(mu_true, sigma_true)
p_x = p_dist.pdf(x)

# Approximations with different means
mu_approx_values = [2, 3, 4, 5]
colors = ['green', 'blue', 'orange', 'red']

plt.figure(figsize=(14, 6))

# Plot distributions
plt.subplot(1, 2, 1)
plt.plot(x, p_x, 'k-', linewidth=3, label=f'True P: N({mu_true}, {sigma_true}²)')

for mu, color in zip(mu_approx_values, colors):
    q_dist = norm(mu, sigma_true)
    q_x = q_dist.pdf(x)
    
    # Calculate KL divergence analytically for Gaussians
    kl = 0.5 * ((mu - mu_true)**2 / sigma_true**2)
    
    plt.plot(x, q_x, color=color, linewidth=2,
             label=f'Q: N({mu}, {sigma_true}²), KL={kl:.3f}')

plt.xlabel('x', fontsize=12)
plt.ylabel('Probability Density', fontsize=12)
plt.title('Gaussian Distributions', fontsize=14)
plt.legend(fontsize=10)
plt.grid(True, alpha=0.3)

# Plot KL divergence vs mean difference
plt.subplot(1, 2, 2)
mu_range = np.linspace(0, 6, 100)
kl_values = [0.5 * ((mu - mu_true)**2 / sigma_true**2) for mu in mu_range]

plt.plot(mu_range, kl_values, linewidth=2)
plt.axvline(x=mu_true, color='r', linestyle='--', label=f'True mean = {mu_true}')
plt.xlabel('Approximate Mean μ', fontsize=12)
plt.ylabel('KL Divergence', fontsize=12)
plt.title('KL Divergence vs Mean', fontsize=14)
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
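The closed-form expression used above, \(D_{KL} = (\mu_q - \mu_p)^2 / (2\sigma^2)\) for equal-variance Gaussians, can itself be checked by numerical integration of the KL integral. A sketch using `scipy.integrate.quad` (the means and variance here are arbitrary):

```python
import numpy as np
from scipy.stats import norm
from scipy.integrate import quad

mu_p, mu_q, sigma = 2.0, 4.0, 1.0
p = norm(mu_p, sigma)
q = norm(mu_q, sigma)

# D_KL(P || Q) = integral of p(x) * log(p(x)/q(x)) dx, computed numerically
integrand = lambda x: p.pdf(x) * (p.logpdf(x) - q.logpdf(x))
kl_numeric, _ = quad(integrand, -10, 15)

kl_analytic = (mu_p - mu_q) ** 2 / (2 * sigma ** 2)
print(f"numerical: {kl_numeric:.6f}, analytic: {kl_analytic:.6f}")
```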

4. ML Application: Training a Neural Network

Cross-entropy is not just a theoretical construct: it is the default loss function for virtually every classification neural network. When a model outputs class probabilities through a softmax layer, minimizing cross-entropy is equivalent to maximizing the log-likelihood of the true labels under the model's predicted distribution, which in turn is equivalent to minimizing the KL divergence between the true and predicted distributions (since the entropy of the true labels is fixed). The code below trains a scikit-learn MLPClassifier on a synthetic 3-class problem and tracks how the cross-entropy loss decreases over training iterations, demonstrating the direct connection between information theory and model optimization.

# Simulate neural network training on a classification task
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

# Generate dataset
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15,
                          n_classes=3, n_clusters_per_class=1, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train neural network
model = MLPClassifier(hidden_layer_sizes=(50, 30), max_iter=100, 
                     random_state=42, verbose=False)
model.fit(X_train, y_train)

# Get predictions (probabilities)
y_pred_proba = model.predict_proba(X_test)

# Calculate mean cross-entropy loss manually
from sklearn.preprocessing import label_binarize
y_test_onehot = label_binarize(y_test, classes=[0, 1, 2])

# Average per-sample cross-entropy (the quantity sklearn's log_loss reports)
ce_loss = -np.mean(np.sum(y_test_onehot * np.log(np.clip(y_pred_proba, 1e-10, 1)), axis=1))
print(f"Cross-Entropy Loss: {ce_loss:.4f}")
print(f"Test Accuracy: {model.score(X_test, y_test):.4f}")

# Show some predictions
print("\nExample predictions:")
for i in range(5):
    print(f"True: {y_test[i]}, Predicted probs: {y_pred_proba[i]}")
# Visualize loss during training
# MLPClassifier records the cross-entropy loss at each iteration in loss_curve_
model_verbose = MLPClassifier(hidden_layer_sizes=(50, 30), max_iter=200,
                              random_state=42, verbose=False)
model_verbose.fit(X_train, y_train)

plt.figure(figsize=(10, 6))
plt.plot(model_verbose.loss_curve_, linewidth=2)
plt.xlabel('Iteration', fontsize=12)
plt.ylabel('Cross-Entropy Loss', fontsize=12)
plt.title('Training Loss Curve', fontsize=14)
plt.grid(True, alpha=0.3)
plt.show()

print("Loss decreases as model learns to predict correct classes!")

5. Mutual Information

How much does knowing one variable tell you about another?

\[I(X; Y) = \sum_{x,y} p(x,y) \log \frac{p(x,y)}{p(x)p(y)}\]

Properties:

  • = 0 if X and Y are independent

  • Higher values = more dependence

  • Used in feature selection

from sklearn.metrics import mutual_info_score

# Example 1: Perfectly correlated
x1 = np.array([0, 0, 1, 1, 2, 2])
y1 = np.array([0, 0, 1, 1, 2, 2])  # Same as x1
mi1 = mutual_info_score(x1, y1)
print("Perfectly correlated:")
print(f"  Mutual Information: {mi1:.4f}")

# Example 2: Partially correlated
x2 = np.array([0, 0, 1, 1, 2, 2])
y2 = np.array([0, 1, 1, 1, 2, 0])  # Some correlation
mi2 = mutual_info_score(x2, y2)
print("\nPartially correlated:")
print(f"  Mutual Information: {mi2:.4f}")

# Example 3: Independent
np.random.seed(42)
x3 = np.random.randint(0, 3, 100)
y3 = np.random.randint(0, 3, 100)  # Random, independent
mi3 = mutual_info_score(x3, y3)
print("\nIndependent (random):")
print(f"  Mutual Information: {mi3:.4f}")
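Mutual information is the basis of filter-style feature selection in scikit-learn. A sketch using `mutual_info_classif` on a toy dataset (dataset parameters chosen for illustration):

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import mutual_info_classif

# Toy dataset: 5 features, only 2 of which are informative
X, y = make_classification(n_samples=500, n_features=5, n_informative=2,
                           n_redundant=0, n_repeated=0, random_state=42)

# Estimate MI between each feature and the class label
mi_scores = mutual_info_classif(X, y, random_state=42)
for i, score in enumerate(mi_scores):
    print(f"feature {i}: MI = {score:.4f}")
```

Features with higher MI scores carry more information about the label; the noise features should score near zero.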

Summary

✅ Entropy: measure of uncertainty/randomness
✅ Cross-Entropy: loss function for classification (minimize to match distributions)
✅ KL Divergence: measure of difference between probability distributions (asymmetric, so not a true distance)
✅ Mutual Information: measure of dependence between variables

Key Insights:

  1. Cross-entropy loss is the standard for classification because:

    • Probabilistic interpretation

    • Smooth gradients for optimization

    • Equivalent to minimizing KL divergence

  2. Relationship:

    • Entropy = average surprise

    • Cross-entropy = expected surprise using wrong distribution

    • KL divergence = extra surprise from using wrong distribution

  3. Applications:

    • Neural network training (loss functions)

    • Model selection (information criteria)

    • Feature selection (mutual information)

    • Variational inference (KL divergence)

Next Steps:

  • Study softmax function and its gradient

  • Learn about variational autoencoders (VAEs)

  • Explore information bottleneck theory

  • Understand mutual information in neural networks