# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_diabetes, make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.metrics import accuracy_score
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
# Seed the global NumPy RNG so every cell's random draws are repeatable.
np.random.seed(42)

# Plotting defaults for all figures in this notebook.
plt.style.use('default')
sns.set_palette('husl')

# The original print statement was split across two lines mid-string
# (a SyntaxError) by a mojibake'd emoji; restored as a single line.
print("Libraries imported successfully!")
Missing Data: The Silent Model Killer
Missing data is one of the most common data quality issues in ML pipelines, yet its impact is frequently underestimated. Missing Completely at Random (MCAR) means the absence is unrelated to any variable — the safest scenario, where dropping rows introduces no bias. Missing at Random (MAR) means missingness depends on observed variables (e.g., younger users skip income fields), allowing imputation from correlated features. Missing Not at Random (MNAR) is the most dangerous: the missingness depends on the unobserved value itself (e.g., high-income individuals refusing to report income), making any simple imputation strategy biased.
Why this matters for ML: Tree-based models like XGBoost handle missing values natively by learning optimal split directions, but linear models and neural networks require explicit imputation. Naive strategies like mean imputation shrink variance and distort correlations — for a feature \(X\) with missing fraction \(f\), the imputed variance becomes \((1 - f) \cdot \text{Var}(X)\), systematically underestimating uncertainty. In production, monitoring the pattern of missingness over time (not just the rate) is critical: a sudden spike in missing values for a specific feature often signals an upstream data pipeline failure rather than natural variation.
# Build a synthetic numeric dataset and blank out ~10% of its cells
# completely at random (MCAR) so the analysis below has gaps to find.
np.random.seed(42)
n_samples = 1000
n_features = 5

# Gaussian features centred at 50 with spread 10
values = np.random.randn(n_samples, n_features) * 10 + 50

# Knock out roughly 10% of cells, chosen uniformly at random
nan_positions = np.random.random((n_samples, n_features)) < 0.1
values[nan_positions] = np.nan

# Wrap in a DataFrame and attach a random binary target column
feature_names = [f'feature_{i}' for i in range(n_features)]
df = pd.DataFrame(values, columns=feature_names)
df['target'] = np.random.randint(0, 2, n_samples)

print("Dataset shape:", df.shape)
print("\nFirst few rows:")
df.head()
def analyze_missing_values(df):
    """
    Comprehensive missing value analysis.

    Prints overall and per-column missingness statistics, draws a
    missing-value heatmap and a per-feature missing-percentage bar
    chart, then prints a threshold-based handling recommendation for
    every column that has gaps.

    Parameters
    ----------
    df : pd.DataFrame
        Dataset to inspect; numeric and non-numeric columns allowed.
    """
    print("=" * 60)
    print("MISSING VALUE ANALYSIS")
    print("=" * 60)

    # Overall statistics. df.size replaces np.product(df.shape):
    # np.product is deprecated and removed in NumPy 2.0, and df.size
    # is the direct cell count anyway.
    total_cells = df.size
    total_missing = df.isnull().sum().sum()
    print(f"\nTotal cells: {total_cells:,}")
    print(f"Missing cells: {total_missing:,} ({100*total_missing/total_cells:.2f}%)")

    # Per-column analysis, restricted to columns that actually have gaps
    print("\nPer-Column Missing Values:")
    missing_stats = pd.DataFrame({
        'Missing Count': df.isnull().sum(),
        'Missing %': 100 * df.isnull().sum() / len(df),
        'Data Type': df.dtypes
    })
    missing_stats = missing_stats[missing_stats['Missing Count'] > 0].sort_values(
        'Missing %', ascending=False
    )
    print(missing_stats)

    # Visualization
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Missing-value heatmap (rows x features; highlighted cells are NaN)
    sns.heatmap(df.isnull(), cbar=True, cmap='viridis', ax=axes[0])
    axes[0].set_title('Missing Value Pattern', fontsize=12, fontweight='bold')
    axes[0].set_xlabel('Features')
    axes[0].set_ylabel('Samples')

    # Bar chart of missing percentages per affected feature
    missing_pct = 100 * df.isnull().sum() / len(df)
    missing_pct = missing_pct[missing_pct > 0]
    missing_pct.plot(kind='bar', ax=axes[1], color='coral', edgecolor='black')
    axes[1].set_title('Missing Value Percentage by Feature', fontsize=12, fontweight='bold')
    axes[1].set_xlabel('Features')
    axes[1].set_ylabel('Missing %')
    axes[1].axhline(y=5, color='red', linestyle='--', label='5% threshold')
    axes[1].legend()
    axes[1].set_xticklabels(axes[1].get_xticklabels(), rotation=45)
    plt.tight_layout()
    plt.show()

    # Threshold-based recommendation per affected column.
    # (Status markers restored from mojibake; the original also had a
    # string literal broken across two lines, which was a SyntaxError.)
    print("\n" + "=" * 60)
    print("RECOMMENDATIONS")
    print("=" * 60)
    for col in missing_stats.index:
        pct = missing_stats.loc[col, 'Missing %']
        if pct > 50:
            print(f"❌ {col}: {pct:.1f}% missing - Consider dropping this feature")
        elif pct > 20:
            print(f"⚠️ {col}: {pct:.1f}% missing - Investigate pattern, consider advanced imputation")
        elif pct > 5:
            print(f"⚠️ {col}: {pct:.1f}% missing - Use median/mode imputation")
        else:
            print(f"✅ {col}: {pct:.1f}% missing - Simple imputation or drop rows")
analyze_missing_values(df)
# Compare the four basic missing-value handling strategies side by side.
print("MISSING VALUE HANDLING STRATEGIES:\n")

# Strategy 1: Drop rows with any missing value (listwise deletion)
df_droprows = df.dropna()
print(f"1. Drop rows: {len(df)} → {len(df_droprows)} samples ({100*(1-len(df_droprows)/len(df)):.1f}% loss)")

# Strategy 2: Drop columns whose missing fraction exceeds 50%
high_missing_cols = df.columns[df.isnull().sum() / len(df) > 0.5]
df_dropcols = df.drop(columns=high_missing_cols)
print(f"2. Drop high-missing columns: {df.shape[1]} → {df_dropcols.shape[1]} features")

# Strategy 3: Median imputation for every numeric column.
# Assign the filled column back instead of calling fillna(inplace=True)
# on a column selection: chained in-place fills are deprecated in
# pandas and may silently operate on a copy.
df_imputed = df.copy()
for col in df.select_dtypes(include=np.number).columns:
    df_imputed[col] = df_imputed[col].fillna(df[col].median())
print(f"3. Median imputation: {df_imputed.isnull().sum().sum()} missing values")

# Strategy 4: Forward fill. DataFrame.ffill() replaces the deprecated
# fillna(method='ffill'); note that leading NaNs in a column remain.
df_ffill = df.ffill()
print(f"4. Forward fill: {df_ffill.isnull().sum().sum()} missing values")

print("\n💡 Best practice: Choose strategy based on data characteristics and missingness pattern")
Duplicate Detection: Preventing Data Leakage and Inflated Metrics
Duplicate records silently corrupt ML experiments in two ways. Exact duplicates that span both train and test splits create data leakage — the model has literally memorized test examples, inflating accuracy by as much as 5-15% on real-world datasets. Near-duplicates (rows that differ by only rounding errors or whitespace) are harder to catch but equally problematic. The pandas.DataFrame.duplicated() method detects exact matches using hash-based comparison in \(O(n)\) time, but near-duplicates require fuzzy matching with libraries like fuzzywuzzy or locality-sensitive hashing (LSH).
Critical workflow rule: always deduplicate before performing a train/test split. If duplicates survive into both partitions, your reported generalization error is optimistically biased. In recommendation systems and NLP tasks, duplicate detection extends beyond row-level checks to semantic similarity — two differently worded product descriptions for the same item, or paraphrased text samples, can cause identical leakage problems.
# Build a clean classification dataset, then append exact copies of 50
# randomly chosen rows (~5%) so the duplicate checker has work to do.
X, y = make_classification(n_samples=1000, n_features=10, random_state=42)

df_dup = pd.DataFrame(X, columns=[f'f{i}' for i in range(10)])
df_dup['target'] = y

# Sample 50 distinct rows and concatenate their copies onto the end
n_duplicates = 50
dup_indices = np.random.choice(df_dup.index, n_duplicates, replace=False)
df_dup = pd.concat([df_dup, df_dup.loc[dup_indices]], ignore_index=True)

print(f"Dataset size: {len(df_dup)}")
print(f"Expected duplicates: ~{n_duplicates}")
def analyze_duplicates(df):
    """
    Analyze exact duplicate rows in a DataFrame.

    Reports the duplicate count, prints up to three example duplicate
    rows (each with the index of the first matching original), shows a
    unique-vs-duplicate pie chart, and prints a handling recommendation.

    Parameters
    ----------
    df : pd.DataFrame
        Dataset to check. Only exact (cell-for-cell) duplicates are
        detected; near-duplicates require fuzzy matching.
    """
    print("=" * 60)
    print("DUPLICATE ANALYSIS")
    print("=" * 60)

    # duplicated() marks every occurrence after the first as a duplicate
    duplicates = df.duplicated()
    n_duplicates = duplicates.sum()
    print(f"\nTotal rows: {len(df)}")
    print(f"Duplicate rows: {n_duplicates} ({100*n_duplicates/len(df):.2f}%)")

    if n_duplicates > 0:
        print("\nExample duplicate rows:")
        dup_indices = df[duplicates].index[:3]
        for idx in dup_indices:
            print(f"\nRow {idx}:")
            print(df.loc[idx])
            # First row whose values match in every column.
            # NOTE(review): this equality scan assumes no NaNs in the
            # duplicated rows (NaN != NaN would hide the original).
            original = df[(df == df.loc[idx]).all(axis=1)].index[0]
            if original != idx:
                print(f"  → Duplicate of row {original}")

    # Visualize the unique-vs-duplicate share
    fig, ax = plt.subplots(figsize=(8, 5))
    labels = ['Unique', 'Duplicates']
    sizes = [len(df) - n_duplicates, n_duplicates]
    colors = ['lightblue', 'coral']
    ax.pie(sizes, labels=labels, autopct='%1.1f%%', colors=colors,
           startangle=90, wedgeprops={'edgecolor': 'black'})
    ax.set_title('Duplicate Distribution', fontsize=12, fontweight='bold')
    plt.show()

    # Recommendation. (Markers restored from mojibake; the original
    # "No duplicates found" print was split mid-string — a SyntaxError.)
    print("\n" + "=" * 60)
    print("RECOMMENDATION")
    print("=" * 60)
    if n_duplicates > 0:
        print(f"⚠️ Found {n_duplicates} duplicates")
        print("  • Review if duplicates are intentional")
        print("  • Remove using: df.drop_duplicates()")
        print("  • Consider keeping first/last occurrence")
    else:
        print("✅ No duplicates found")
# Run the analysis, then drop the duplicates (first occurrence kept).
analyze_duplicates(df_dup)

df_clean = df_dup.drop_duplicates()
print(f"Before: {len(df_dup)} rows")
print(f"After: {len(df_clean)} rows")
print(f"Removed: {len(df_dup) - len(df_clean)} duplicate rows")
# Original final print was split across two lines mid-string
# (a SyntaxError caused by a mojibake'd emoji); restored.
print("\n✅ Duplicates removed successfully!")
Outlier Detection: Distinguishing Signal from Noise
Outliers demand careful treatment because they can be either valuable rare events (fraud transactions, equipment failures) or data corruption (sensor glitches, entry errors). Z-score detection assumes normality and flags points beyond \(|z| > 3\) standard deviations, but breaks down for skewed distributions or small samples. The IQR method is distribution-free, defining outliers as points below \(Q_1 - 1.5 \cdot \text{IQR}\) or above \(Q_3 + 1.5 \cdot \text{IQR}\), making it robust for non-Gaussian data. Isolation Forest takes a fundamentally different approach: it builds random trees and measures how few splits are needed to isolate each point — anomalies are isolated quickly (short average path length), with a score near \(-1\).
Practical guidance: never blindly remove outliers. In fraud detection, the outliers are the signal. In regression, a single extreme point can shift a least-squares fit dramatically because the loss is quadratic — \(\sum(y_i - \hat{y}_i)^2\) amplifies large residuals. Robust alternatives like Huber loss or tree-based models (which are inherently outlier-resistant due to rank-based splits) often outperform outlier removal.
# Two-dimensional toy data: a tight Gaussian core plus 25 uniformly
# scattered points that serve as ground-truth outliers.
np.random.seed(42)
n_samples = 500
n_outliers = 25
n_inliers = n_samples - n_outliers

# Inliers: zero-mean Gaussian with std 2
core = np.random.randn(n_inliers, 2) * 2
# Outliers: uniform over a much wider box
scattered = np.random.uniform(low=-10, high=10, size=(n_outliers, 2))

# Stack inliers first, outliers last; label 1 marks an outlier
X_with_outliers = np.vstack([core, scattered])
y_true_outliers = np.hstack([np.zeros(n_inliers), np.ones(n_outliers)])

print(f"Total samples: {len(X_with_outliers)}")
print(f"Outliers: {n_outliers} ({100*n_outliers/len(X_with_outliers):.1f}%)")
# Method 1: Z-score
def detect_outliers_zscore(data, threshold=3):
    """Flag rows whose |z-score| exceeds ``threshold`` in any column.

    Assumes roughly Gaussian features. Returns a boolean array with
    one entry per row (True = outlier).
    """
    abs_z = np.abs(stats.zscore(data))
    return (abs_z > threshold).any(axis=1)
# Method 2: IQR
def detect_outliers_iqr(data):
    """Flag rows falling outside Tukey's fences in any column.

    A value is an outlier when it lies below Q1 - 1.5*IQR or above
    Q3 + 1.5*IQR for its column. Distribution-free, so it tolerates
    skewed data. Returns a boolean array, one entry per row.
    """
    q1, q3 = np.percentile(data, [25, 75], axis=0)
    spread = q3 - q1
    low_fence = q1 - 1.5 * spread
    high_fence = q3 + 1.5 * spread
    beyond = (data < low_fence) | (data > high_fence)
    return beyond.any(axis=1)
# Method 3: Isolation Forest, then compare all three detectors.
# contamination=0.05 matches the true outlier fraction (25/500).
outliers_z = detect_outliers_zscore(X_with_outliers)
outliers_iqr = detect_outliers_iqr(X_with_outliers)

iso_forest = IsolationForest(contamination=0.05, random_state=42)
outliers_iso = iso_forest.fit_predict(X_with_outliers) == -1  # -1 = anomaly

print("Outlier Detection Results:\n")
print(f"Z-score (|z| > 3): {outliers_z.sum()} outliers")
print(f"IQR method: {outliers_iqr.sum()} outliers")
print(f"Isolation Forest: {outliers_iso.sum()} outliers")
print(f"\nActual outliers: {n_outliers}")
# Visualize what each detector flags, side by side with ground truth.
# (Trailing 💡 print restored from mojibake.)
fig, axes = plt.subplots(2, 2, figsize=(14, 12))
methods = [
    ('Original Data', y_true_outliers.astype(bool)),
    ('Z-score', outliers_z),
    ('IQR', outliers_iqr),
    ('Isolation Forest', outliers_iso)
]
for ax, (title, outliers) in zip(axes.flat, methods):
    # Inliers as small blue dots
    ax.scatter(X_with_outliers[~outliers, 0], X_with_outliers[~outliers, 1],
               c='blue', alpha=0.5, s=20, label='Normal')
    # Flagged points as larger red crosses
    ax.scatter(X_with_outliers[outliers, 0], X_with_outliers[outliers, 1],
               c='red', alpha=0.8, s=50, marker='x', label='Outlier')
    ax.set_title(f'{title}\n({outliers.sum()} outliers)', fontsize=11, fontweight='bold')
    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')
    ax.legend()
    ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("\n💡 Different methods detect different outliers based on assumptions")
Label Noise Detection: When Your Ground Truth Lies
Label noise — incorrect annotations in supervised learning data — is pervasive in real-world datasets. Crowdsourced labels typically have 5-20% error rates, and even expert-annotated medical imaging datasets show 3-5% disagreement. The impact scales non-linearly: models trained on 10% noisy labels can lose 2-5% accuracy, but 30% noise can cause complete training collapse for sensitive architectures.
Confident learning detects mislabeled examples by training a model via cross-validation and identifying samples where the model's confident prediction disagrees with the given label. The intuition is that a well-trained model will learn the majority pattern and flag labels that contradict it. Formally, a sample \((x_i, \tilde{y}_i)\) is flagged when \(\hat{P}(y \neq \tilde{y}_i | x_i) > \tau\) for some threshold \(\tau\). The cleanlab library automates this process using a principled confusion-matrix-based approach. In production, investing in label quality (better annotation guidelines, multi-annotator consensus, active learning) almost always yields larger accuracy gains than switching to a more complex model architecture.
# Classification data with 10% of the labels deliberately flipped,
# simulating annotation errors in the training set.
X, y_clean = make_classification(
    n_samples=1000, n_features=20, n_informative=15,
    n_redundant=5, random_state=42
)

# Pick 10% of the indices without replacement and flip those labels
y_noisy = y_clean.copy()
noise_indices = np.random.choice(len(y_noisy), size=int(0.1 * len(y_noisy)), replace=False)
y_noisy[noise_indices] = 1 - y_noisy[noise_indices]  # binary flip: 0 <-> 1

print(f"Total samples: {len(y_noisy)}")
print(f"Noisy labels: {len(noise_indices)} ({100*len(noise_indices)/len(y_noisy):.1f}%)")
print(f"Label changes: {(y_clean != y_noisy).sum()}")
# Confident-learning-style label-noise detection: out-of-fold
# predictions that confidently disagree with the given label are
# flagged as suspected annotation errors.
from sklearn.model_selection import cross_val_predict

clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Each sample is scored by a fold model that never trained on it
oof_proba = cross_val_predict(
    clf, X, y_noisy, cv=5, method='predict_proba'
)

pred_labels = oof_proba.argmax(axis=1)
pred_conf = oof_proba.max(axis=1)

# Suspect = confident (>0.7) prediction contradicting the recorded label
suspects = (pred_labels != y_noisy) & (pred_conf > 0.7)

print(f"\nPotential label errors detected: {suspects.sum()}")
print(f"Actual noisy labels: {len(noise_indices)}")
print(f"Detection accuracy: {100 * np.isin(np.where(suspects)[0], noise_indices).sum() / len(noise_indices):.1f}%")
# Quantify the accuracy cost of label noise: train identical models on
# clean vs noisy labels and evaluate both against clean test labels.
# A single split call keeps X, y_clean and y_noisy perfectly aligned;
# the original code made two separate splits and relied on a shared
# random_state for alignment, which is fragile.
X_train, X_test, y_train_clean, y_test_clean, y_train_noisy, y_test_noisy = train_test_split(
    X, y_clean, y_noisy, test_size=0.2, random_state=42
)

# Train with clean labels
model_clean = RandomForestClassifier(n_estimators=100, random_state=42)
model_clean.fit(X_train, y_train_clean)
acc_clean = model_clean.score(X_test, y_test_clean)

# Train with noisy labels; still evaluate against clean ground truth
model_noisy = RandomForestClassifier(n_estimators=100, random_state=42)
model_noisy.fit(X_train, y_train_noisy)
acc_noisy = model_noisy.score(X_test, y_test_clean)

# Plot comparison
fig, ax = plt.subplots(figsize=(8, 6))
labels = ['Clean Labels', 'Noisy Labels\n(10% flipped)']
accuracies = [acc_clean, acc_noisy]
colors = ['green', 'red']
bars = ax.bar(labels, accuracies, color=colors, alpha=0.7, edgecolor='black')
ax.set_ylabel('Test Accuracy', fontsize=12)
ax.set_title('Impact of Label Noise on Model Performance', fontsize=14, fontweight='bold')
ax.set_ylim([0, 1])
# Annotate each bar with its exact accuracy
for bar, acc in zip(bars, accuracies):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{acc:.3f}', ha='center', va='bottom', fontsize=12, fontweight='bold')
plt.tight_layout()
plt.show()

print(f"\n📉 Performance drop: {100*(acc_clean - acc_noisy):.2f}% due to label noise")
Distribution Shift Detection: When the World Changes Under Your Model
Distribution shift occurs when the statistical properties of data change between training and deployment, and it is the leading cause of silent model degradation in production. Covariate shift means the input distribution \(P(X)\) changes while \(P(Y|X)\) stays the same — for example, a model trained on daytime images deployed at night. Concept drift is more dangerous: the relationship \(P(Y|X)\) itself changes, as when customer purchasing behavior shifts during a recession. Prior probability shift means class proportions change (e.g., spam rates increasing from 10% to 40%).
The Kolmogorov-Smirnov (KS) test compares two empirical distributions by measuring the maximum absolute difference between their CDFs: \(D = \sup_x |F_{\text{train}}(x) - F_{\text{test}}(x)|\). A small p-value (below the significance level \(\alpha\)) rejects the null hypothesis that both samples come from the same distribution. For multivariate shift detection, Population Stability Index (PSI) and Maximum Mean Discrepancy (MMD) are commonly used in production monitoring systems. When shift is detected, the standard response escalation is: (1) alert and log, (2) fall back to a simpler baseline model, (3) trigger retraining with recent data.
# Simulate covariate shift: test data drawn from a different Gaussian
# (mean 2, std 1.5) than the training data (standard normal).
np.random.seed(42)

X_train_original = np.random.randn(1000, 5)            # mean 0, std 1
X_test_shifted = np.random.randn(200, 5) * 1.5 + 2     # mean 2, std 1.5

print("Train statistics:")
print(f" Mean: {X_train_original.mean(axis=0)}")
print(f" Std: {X_train_original.std(axis=0)}")
print("\nTest statistics:")
print(f" Mean: {X_test_shifted.mean(axis=0)}")
print(f" Std: {X_test_shifted.std(axis=0)}")
# Statistical test for distribution shift
from scipy.stats import ks_2samp

def detect_distribution_shift(X_train, X_test, alpha=0.05):
    """
    Detect per-feature distribution shift with the two-sample KS test.

    For each feature column, compares the train and test empirical
    distributions; a p-value below ``alpha`` rejects the null
    hypothesis that both samples come from the same distribution.

    Parameters
    ----------
    X_train, X_test : np.ndarray
        2-D arrays with the same number of feature columns.
    alpha : float, default 0.05
        Significance level for each per-feature test.

    Returns
    -------
    list[bool]
        One flag per feature; True means shift was detected.
    """
    print("=" * 60)
    print("DISTRIBUTION SHIFT ANALYSIS")
    print("=" * 60)
    n_features = X_train.shape[1]
    shift_detected = []
    # "α" and the status markers restored from mojibake; the original
    # status string was split across two lines (a SyntaxError).
    print(f"\nTesting {n_features} features (α = {alpha}):\n")
    for i in range(n_features):
        statistic, p_value = ks_2samp(X_train[:, i], X_test[:, i])
        shift = p_value < alpha
        shift_detected.append(shift)
        status = "⚠️ SHIFT" if shift else "✅ OK"
        print(f"Feature {i}: p-value = {p_value:.4f} - {status}")
    print("\n" + "=" * 60)
    print(f"Summary: {sum(shift_detected)}/{n_features} features show distribution shift")
    print("=" * 60)
    return shift_detected
# Run the detector on the simulated shift, then overlay train/test
# histograms per feature with the verdict in each title. (Status and
# 💡 strings restored from mojibake; the original status string was
# split across two lines — a SyntaxError.)
shift_results = detect_distribution_shift(X_train_original, X_test_shifted)

fig, axes = plt.subplots(2, 3, figsize=(15, 8))
axes = axes.flatten()
for i in range(5):
    ax = axes[i]
    ax.hist(X_train_original[:, i], bins=30, alpha=0.5, label='Train', color='blue', edgecolor='black')
    ax.hist(X_test_shifted[:, i], bins=30, alpha=0.5, label='Test', color='red', edgecolor='black')
    status = "⚠️ SHIFT" if shift_results[i] else "✅ OK"
    ax.set_title(f'Feature {i} - {status}', fontsize=10, fontweight='bold')
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    ax.legend()
    ax.grid(True, alpha=0.3)
# Only 5 features fill a 2x3 grid; blank the unused sixth subplot
axes[5].axis('off')
plt.tight_layout()
plt.show()

print("\n💡 Distribution shift can severely impact model performance!")
print(" Solutions: Retraining, domain adaptation, or feature engineering")
🎯 Key Takeaways
Missing Values:
Always check percentage missing per feature
Drop features with >50% missing
Choose imputation strategy based on data type and pattern
Duplicates:
Can inflate performance metrics
Remove before train/test split
Verify if duplicates are intentional
Outliers:
Use multiple detection methods
Don't always remove — outliers might be valid data
Consider robust models (Random Forest, etc.)
Label Noise:
Can significantly hurt performance
Use confident learning to detect
Consider robust loss functions
Distribution Shift:
Test before deployment
Monitor in production
Retrain or adapt when detected
📝 Practice Exercise
Task: Given a dataset, perform comprehensive data quality analysis:
Check for missing values and recommend handling strategy
Detect and remove duplicates
Identify outliers using 2+ methods
Compare model performance before/after data cleaning
Continue to Notebook 3 to learn about performance profiling! 🚀