# Install fairness libraries (run once)
# !pip install fairlearn aif360

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from fairlearn.metrics import (
    demographic_parity_difference,
    demographic_parity_ratio,
    equalized_odds_difference,
    MetricFrame
)

np.random.seed(42)
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)

print("โœ… Setup complete")

Part 1: Understanding Bias in AI

Types of Bias

1. Historical Bias

  • What: Bias exists in the world and is reflected in data

  • Example: Historical hiring data shows men hired more often for tech roles

  • Result: Model learns to prefer male candidates

2. Representation Bias

  • What: Training data doesn't represent reality

  • Example: Face recognition trained mostly on light-skinned faces

  • Result: Poor performance on dark-skinned faces

3. Measurement Bias

  • What: Features measured differently for different groups

  • Example: Credit scores calculated differently by region

  • Result: Unfair predictions across groups

4. Aggregation Bias

  • What: One model for all groups

  • Example: Diabetes risk model using same threshold for all ages

  • Result: Misses disease in elderly, over-diagnoses in young

5. Evaluation Bias

  • What: Testing on non-representative data

  • Example: Speech recognition tested only on native speakers

  • Result: Poor performance on accents

6. Deployment Bias

  • What: Model used in inappropriate context

  • Example: Recidivism model trained on one state, used in another

  • Result: Unfair predictions in new context

Protected Attributes

Protected attributes are characteristics that anti-discrimination law shields from being used as a basis for adverse decisions. In machine learning, even when these features are explicitly excluded from the model's input, correlated proxy variables (such as zip code correlating with race, or name correlating with gender) can reintroduce the same biases. This is why simply removing the sensitive column is insufficient – you must audit the model's outputs across protected groups regardless.

Common protected attributes:

  • Race/Ethnicity

  • Gender/Sex

  • Age

  • Religion

  • Disability

  • Sexual Orientation

  • National Origin

Legal context: Regulations like the U.S. Equal Credit Opportunity Act, Title VII of the Civil Rights Act, and the EU AI Act impose specific obligations on automated decision systems. Violations can result in lawsuits, fines, and reputational damage – making fairness auditing a business necessity, not just an ethical ideal.
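Proxy variables are easy to demonstrate. The sketch below uses an invented toy dataset (the zip_risk feature, its distributions, and the labeling rule are hypothetical, and it relies on the imports from the setup cell): gender is never passed to the model, yet a feature correlated with gender carries the signal in, and predicted approval rates still differ sharply by group.

# Sketch: a proxy feature leaks the sensitive attribute (hypothetical data)
rng = np.random.default_rng(0)
n = 2000
proxy_gender = rng.choice(['Male', 'Female'], n)
zip_risk = np.where(proxy_gender == 'Male',
                    rng.normal(0.3, 0.1, n),    # proxy feature distribution differs by gender
                    rng.normal(0.6, 0.1, n))
income_k = rng.normal(50, 10, n)                # income in $1000s
label = ((income_k / 100 - zip_risk + rng.normal(0, 0.1, n)) > 0).astype(int)

proxy_model = LogisticRegression().fit(np.column_stack([income_k, zip_risk]), label)
proxy_preds = proxy_model.predict(np.column_stack([income_k, zip_risk]))

# Gender was never a feature, but predicted approval rates still differ by group
print(pd.Series(proxy_preds).groupby(pd.Series(proxy_gender)).mean())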

# Create synthetic loan approval dataset with bias
np.random.seed(42)
n_samples = 1000

# Generate data
data = pd.DataFrame({
    'income': np.random.normal(50000, 20000, n_samples),
    'credit_score': np.random.normal(650, 100, n_samples),
    'age': np.random.randint(18, 70, n_samples),
    'gender': np.random.choice(['Male', 'Female'], n_samples)
})

# Clip values
data['income'] = data['income'].clip(20000, 150000)
data['credit_score'] = data['credit_score'].clip(300, 850)

# Create biased approval (favors males)
# Legitimate factors
approval_score = (
    (data['income'] - 20000) / 130000 * 0.4 +
    (data['credit_score'] - 300) / 550 * 0.4
)

# Add gender bias (males get 20% boost)
approval_score += np.where(data['gender'] == 'Male', 0.2, 0)

# Random noise
approval_score += np.random.normal(0, 0.1, n_samples)

# Convert to binary
data['approved'] = (approval_score > 0.5).astype(int)

print("Loan Approval Dataset (with bias):")
print("=" * 60)
print(data.head(10))

# Check approval rates by gender
print("\nApproval Rates by Gender:")
approval_by_gender = data.groupby('gender')['approved'].agg(['sum', 'count', 'mean'])
approval_by_gender.columns = ['Approved', 'Total', 'Approval Rate']
print(approval_by_gender)

print("\nโš ๏ธ Notice: Males have higher approval rate!")

Part 2: Fairness Metrics

1. Demographic Parity (Statistical Parity)

Definition: Positive outcome rate should be equal across groups

Formula: P(Ŷ=1 | Gender=Male) = P(Ŷ=1 | Gender=Female)

When satisfied: Both groups have same approval/acceptance rate

Criticism: Ignores differences in qualification

2. Equalized Odds

Definition: True positive and false positive rates equal across groups

Formula:

  • P(Ŷ=1 | Y=1, Gender=Male) = P(Ŷ=1 | Y=1, Gender=Female) (TPR)

  • P(Ŷ=1 | Y=0, Gender=Male) = P(Ŷ=1 | Y=0, Gender=Female) (FPR)

When satisfied: Model performs equally well across groups

3. Equal Opportunity

Definition: True positive rate equal across groups (relaxed equalized odds)

Formula: P(Ŷ=1 | Y=1, Gender=Male) = P(Ŷ=1 | Y=1, Gender=Female)

When satisfied: Qualified people have equal chance regardless of group
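To make the definitions concrete, here is a tiny hand-worked sketch on invented toy labels. Each criterion compares one or two of the per-group quantities computed below: demographic parity compares selection rates, equal opportunity compares TPRs, and equalized odds compares both TPR and FPR. In this toy case both groups have the same selection rate (demographic parity holds), yet their TPR and FPR differ (equalized odds fails).

# Tiny worked example (hypothetical toy labels) of the per-group quantities
toy = pd.DataFrame({
    'group':  ['A', 'A', 'A', 'A', 'B', 'B', 'B', 'B'],
    'y_true': [1, 1, 0, 0, 1, 1, 0, 0],
    'y_pred': [1, 0, 1, 0, 1, 1, 0, 0],
})

for g, sub in toy.groupby('group'):
    sel_rate = sub['y_pred'].mean()                     # P(Ŷ=1 | group)
    tpr = sub.loc[sub['y_true'] == 1, 'y_pred'].mean()  # P(Ŷ=1 | Y=1, group)
    fpr = sub.loc[sub['y_true'] == 0, 'y_pred'].mean()  # P(Ŷ=1 | Y=0, group)
    print(f"Group {g}: selection rate={sel_rate:.2f}, TPR={tpr:.2f}, FPR={fpr:.2f}")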

# Train a biased model
X = data[['income', 'credit_score', 'age']]
y = data['approved']
gender = data['gender']

X_train, X_test, y_train, y_test, gender_train, gender_test = train_test_split(
    X, y, gender, test_size=0.3, random_state=42
)

# Train model (without using gender as feature)
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Calculate overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Overall Accuracy: {accuracy:.3f}")

# Calculate fairness metrics
dp_diff = demographic_parity_difference(
    y_test, y_pred, sensitive_features=gender_test
)
dp_ratio = demographic_parity_ratio(
    y_test, y_pred, sensitive_features=gender_test
)
eo_diff = equalized_odds_difference(
    y_test, y_pred, sensitive_features=gender_test
)

print("\nFairness Metrics:")
print("=" * 60)
print(f"Demographic Parity Difference: {dp_diff:.3f}")
print(f"  Interpretation: {abs(dp_diff)*100:.1f}% difference in approval rates")
print(f"  โœ… Fair if close to 0.0")

print(f"\nDemographic Parity Ratio: {dp_ratio:.3f}")
print(f"  Interpretation: Ratio of approval rates")
print(f"  โœ… Fair if close to 1.0")

print(f"\nEqualized Odds Difference: {eo_diff:.3f}")
print(f"  Interpretation: Max difference in TPR/FPR")
print(f"  โœ… Fair if close to 0.0")

# Four-fifths (80%) rule: the selection-rate ratio should be at least 0.8
# (fairlearn's demographic_parity_ratio is min/max, so it never exceeds 1.0)
if dp_ratio >= 0.8:
    print("\n✅ Model passes demographic parity test (80% rule)")
else:
    print("\n❌ Model FAILS demographic parity test (80% rule)")

Performance by Group

Aggregate fairness metrics like Demographic Parity Difference give you a single number, but they can mask important details. Breaking down standard classification metrics – accuracy, precision, recall, and F1 – by each protected group reveals where the model is failing. A model might have equal selection rates (passing demographic parity) yet still have vastly different false positive rates across groups, meaning different groups experience different types of errors. The MetricFrame class from Fairlearn makes this disaggregated analysis straightforward, and the confusion matrices per group show exactly how prediction errors are distributed.

# Detailed breakdown by group
from sklearn.metrics import precision_score, recall_score, f1_score

metrics = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1': f1_score
}

metric_frame = MetricFrame(
    metrics=metrics,
    y_true=y_test,
    y_pred=y_pred,
    sensitive_features=gender_test
)

print("Performance by Gender:")
print("=" * 60)
print(metric_frame.by_group)

# Visualize
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# Plot 1: Metrics by group
metric_frame.by_group.plot(kind='bar', ax=axes[0])
axes[0].set_title('Model Performance by Gender', fontsize=14, fontweight='bold')
axes[0].set_ylabel('Score', fontsize=12)
axes[0].set_xlabel('Gender', fontsize=12)
axes[0].legend(title='Metric', loc='lower right')
axes[0].grid(alpha=0.3, axis='y')
axes[0].tick_params(axis='x', rotation=0)

# Plots 2-3: Confusion matrix for each gender group
for i, gender_val in enumerate(['Female', 'Male']):
    mask = gender_test == gender_val
    cm = confusion_matrix(y_test[mask], y_pred[mask])

    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=axes[i + 1])
    axes[i + 1].set_title(f'{gender_val} - Confusion Matrix')
    axes[i + 1].set_ylabel('Actual')
    axes[i + 1].set_xlabel('Predicted')

plt.tight_layout()
plt.show()
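Beyond the per-group table and plots above, MetricFrame also exposes aggregations over groups. A quick sketch of three of them follows; overall, difference, and group_min are part of Fairlearn's MetricFrame API.

# MetricFrame aggregations over groups (a quick sketch)
print("Overall metrics (whole test set):")
print(metric_frame.overall)

print("\nLargest between-group gap per metric:")
print(metric_frame.difference(method='between_groups'))

print("\nWorst-case (minimum) group value per metric:")
print(metric_frame.group_min())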

Part 3: Detecting Bias

Data Bias Detection

Before touching the model, the first step is auditing the training data itself. Statistical tests like the Chi-squared independence test reveal whether the sensitive attribute and the target label are associated in the dataset – which would mean any model trained on this data will likely inherit that association. Beyond statistical tests, examining feature distributions by group, checking for representation imbalance, and understanding the data collection process are all essential steps. Bias introduced at the data stage propagates through every downstream decision.

def analyze_dataset_bias(df, target_col, sensitive_col):
    """Analyze bias in dataset"""
    print(f"Dataset Bias Analysis: {target_col} by {sensitive_col}")
    print("=" * 70)
    
    # 1. Class distribution by group
    print("\n1. Class Distribution:")
    dist = pd.crosstab(
        df[sensitive_col], 
        df[target_col], 
        normalize='index'
    ) * 100
    print(dist.round(2))
    
    # 2. Group representation
    print("\n2. Group Representation:")
    group_counts = df[sensitive_col].value_counts()
    group_pct = (group_counts / len(df) * 100).round(2)
    representation = pd.DataFrame({
        'Count': group_counts,
        'Percentage': group_pct
    })
    print(representation)
    
    # 3. Feature statistics by group
    print("\n3. Feature Statistics by Group:")
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    stats = df.groupby(sensitive_col)[numeric_cols].mean()
    print(stats.round(2))
    
    # 4. Statistical tests
    from scipy.stats import chi2_contingency
    
    contingency = pd.crosstab(df[sensitive_col], df[target_col])
    chi2, p_value, dof, expected = chi2_contingency(contingency)
    
    print(f"\n4. Statistical Independence Test:")
    print(f"   Chi-square statistic: {chi2:.3f}")
    print(f"   P-value: {p_value:.4f}")
    
    if p_value < 0.05:
        print(f"   โš ๏ธ SIGNIFICANT relationship between {sensitive_col} and {target_col}!")
        print(f"      Data shows potential bias.")
    else:
        print(f"   โœ… No significant relationship detected")

analyze_dataset_bias(data, 'approved', 'gender')

Model Bias Detection

Once the model is trained, we need to measure bias in its predictions independently of data bias. Even a model trained on perfectly balanced data can develop biased decision boundaries. Model bias detection involves computing group-specific metrics – selection rate, true positive rate (TPR), false positive rate (FPR), and precision – and checking whether the ratios between groups fall within acceptable bounds. The commonly used "80% rule" (the four-fifths rule from U.S. employment law) states that the selection rate for any protected group should be at least 80% of the rate for the group with the highest selection rate. The function below provides a comprehensive bias detection report with both ratio and difference tests.

def detect_model_bias(y_true, y_pred, sensitive_features, feature_name='Group'):
    """Comprehensive model bias detection"""
    results = []
    
    for group in np.unique(sensitive_features):
        mask = sensitive_features == group
        
        # Calculate metrics
        # Fix the label order so ravel() always unpacks as tn, fp, fn, tp
        cm = confusion_matrix(y_true[mask], y_pred[mask], labels=[0, 1])
        tn, fp, fn, tp = cm.ravel()
        
        # Rates
        tpr = tp / (tp + fn) if (tp + fn) > 0 else 0  # Recall
        fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        selection_rate = (tp + fp) / len(y_true[mask]) if len(y_true[mask]) > 0 else 0
        
        results.append({
            feature_name: group,
            'Selection Rate': selection_rate,
            'TPR (Recall)': tpr,
            'FPR': fpr,
            'Precision': precision,
            'Accuracy': accuracy_score(y_true[mask], y_pred[mask])
        })
    
    df_results = pd.DataFrame(results)
    
    print("Model Bias Detection Report:")
    print("=" * 80)
    print(df_results.round(3).to_string(index=False))
    
    # Check disparities
    print("\nDisparity Analysis:")
    print("-" * 80)
    
    for col in ['Selection Rate', 'TPR (Recall)', 'FPR', 'Precision', 'Accuracy']:
        max_val = df_results[col].max()
        min_val = df_results[col].min()
        ratio = min_val / max_val if max_val > 0 else 0
        diff = max_val - min_val
        
        print(f"{col}:")
        print(f"  Range: {min_val:.3f} - {max_val:.3f}")
        print(f"  Ratio: {ratio:.3f} {'โœ…' if ratio >= 0.8 else 'โŒ'}")
        print(f"  Difference: {diff:.3f} {'โœ…' if diff <= 0.1 else 'โŒ'}")
    
    return df_results

bias_report = detect_model_bias(y_test, y_pred, gender_test, 'Gender')

Part 4: Mitigation Strategies

1. Pre-Processing: Fix the Data

Techniques:

  • Reweighting: Assign different weights to samples

  • Resampling: Oversample underrepresented groups

  • Data Augmentation: Generate synthetic fair data

# Reweighting example
from sklearn.utils.class_weight import compute_sample_weight

# Compute sample weights that balance each gender x outcome combination
balance_key = gender_train.astype(str) + "_" + y_train.astype(str)
sample_weights = compute_sample_weight(
    class_weight='balanced',
    y=balance_key
)

# Train model with sample weights
model_weighted = LogisticRegression(random_state=42)
model_weighted.fit(X_train, y_train, sample_weight=sample_weights)
y_pred_weighted = model_weighted.predict(X_test)

print("Comparison: Original vs Weighted Model")
print("=" * 70)

print("\nOriginal Model:")
detect_model_bias(y_test, y_pred, gender_test, 'Gender')

print("\n" + "=" * 70)
print("\nWeighted Model:")
detect_model_bias(y_test, y_pred_weighted, gender_test, 'Gender')
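Resampling, the second technique listed above, can be sketched in the same setting: oversample each gender x outcome combination of the training set to the size of the largest one, then retrain. The names train_balanced and model_resampled are illustrative, and whether this actually improves the fairness metrics here depends on the data, so detect_model_bias is reused to check.

# Resampling sketch: oversample each gender x outcome combination to equal size
from sklearn.utils import resample

train_df = X_train.copy()
train_df['approved'] = y_train
train_df['gender'] = gender_train

target_size = train_df.groupby(['gender', 'approved']).size().max()
balanced_parts = [
    resample(part, replace=True, n_samples=target_size, random_state=42)
    for _, part in train_df.groupby(['gender', 'approved'])
]
train_balanced = pd.concat(balanced_parts)

model_resampled = LogisticRegression(random_state=42)
model_resampled.fit(train_balanced[['income', 'credit_score', 'age']],
                    train_balanced['approved'])
y_pred_resampled = model_resampled.predict(X_test)

print("Resampled Model:")
detect_model_bias(y_test, y_pred_resampled, gender_test, 'Gender')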

2. In-Processing: Constrain the Model

In-processing methods modify the training algorithm itself to enforce fairness constraints while optimizing for accuracy. Fairlearn's ExponentiatedGradient reformulates fair classification as a sequence of cost-sensitive classification problems, iteratively reweighting samples to find a model that satisfies the specified fairness constraint (e.g., Demographic Parity) while minimizing the accuracy loss.

Techniques:

  • Exponentiated Gradient – Reduces constrained optimization to a sequence of weighted classification problems

  • Adversarial debiasing – Trains a classifier alongside an adversary that tries to predict the sensitive attribute from the classifier's output; the main model learns to make predictions that the adversary cannot distinguish by group

  • Fairness-aware algorithms – Custom loss functions that include fairness penalty terms, such as \(\mathcal{L} = \mathcal{L}_{\text{task}} + \lambda \cdot \mathcal{L}_{\text{fairness}}\) (a minimal hand-rolled sketch appears at the end of this subsection)

# Fairlearn's ExponentiatedGradient for demographic parity
from fairlearn.reductions import ExponentiatedGradient, DemographicParity

# Create fairness-constrained model
mitigator = ExponentiatedGradient(
    LogisticRegression(random_state=42),
    constraints=DemographicParity()
)

mitigator.fit(X_train, y_train, sensitive_features=gender_train)
y_pred_fair = mitigator.predict(X_test)

print("Comparison: Original vs Fair Model")
print("=" * 70)

# Compare fairness metrics
models = [
    ('Original', y_pred),
    ('Fair (Demographic Parity)', y_pred_fair)
]

comparison = []
for name, preds in models:
    acc = accuracy_score(y_test, preds)
    dp_diff = demographic_parity_difference(y_test, preds, sensitive_features=gender_test)
    dp_ratio = demographic_parity_ratio(y_test, preds, sensitive_features=gender_test)
    
    comparison.append({
        'Model': name,
        'Accuracy': acc,
        'DP Difference': dp_diff,
        'DP Ratio': dp_ratio
    })

df_comparison = pd.DataFrame(comparison)
print(df_comparison.round(3).to_string(index=False))

print("\n๐Ÿ’ก Notice: Fair model may have lower accuracy but better fairness!")

3. Post-Processing: Adjust Predictions

Post-processing methods leave the trained model untouched and instead adjust its predictions to satisfy fairness criteria. This is especially useful when you cannot retrain the model (e.g., a vendor-provided black-box model) or when you want to separate the accuracy optimization from fairness enforcement. Fairlearn's ThresholdOptimizer finds group-specific classification thresholds that achieve the best accuracy subject to equalized odds or demographic parity constraints.

Techniques:

  • Threshold optimization – Different classification thresholds per group, calibrated to equalize TPR/FPR

  • Calibration – Ensure predicted probabilities are equally well-calibrated across groups (a quick check is sketched at the end of this subsection)

  • Reject option classification – Defer uncertain predictions near the decision boundary to human reviewers

# Threshold Optimizer for equalized odds
from fairlearn.postprocessing import ThresholdOptimizer

# Get probability predictions
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Create threshold optimizer
threshold_optimizer = ThresholdOptimizer(
    estimator=model,
    constraints='equalized_odds',
    objective='accuracy_score',
    predict_method='predict_proba'
)

threshold_optimizer.fit(X_train, y_train, sensitive_features=gender_train)
y_pred_thresh = threshold_optimizer.predict(X_test, sensitive_features=gender_test)

print("Post-Processing Results:")
print("=" * 70)

models = [
    ('Original', y_pred),
    ('Threshold Optimized', y_pred_thresh)
]

for name, preds in models:
    print(f"\n{name}:")
    acc = accuracy_score(y_test, preds)
    eo_diff = equalized_odds_difference(y_test, preds, sensitive_features=gender_test)
    
    print(f"  Accuracy: {acc:.3f}")
    print(f"  Equalized Odds Difference: {eo_diff:.3f}")

Part 5: Real-World Case Studies

Case Study 1: COMPAS Recidivism

Background: Algorithm used to predict criminal reoffending

Findings (ProPublica 2016):

  • Black defendants: 45% false positive rate

  • White defendants: 23% false positive rate

  • Nearly 2x more likely to wrongly label Black defendants as high risk

Impact: Influenced bail and sentencing decisions

Lesson: Even without race as a feature, proxies (zip code, etc.) can cause bias

Case Study 2: Amazon Hiring Tool

Background: ML tool to screen resumes

Problem: Trained on 10 years of historical hiring data (mostly male candidates)

Result:

  • Penalized resumes containing the word "women's" (e.g., "women's chess club")

  • Downranked graduates from all-women's colleges

Action: Amazon scrapped the tool

Lesson: Historical bias gets learned by models

Case Study 3: Face Recognition

Study (MIT, 2018):

  • Error rates by demographic:

    • Light-skinned males: 0.8%

    • Dark-skinned females: 34.7%

Cause: Training data heavily skewed toward light-skinned faces

Impact: Misidentification in security, law enforcement

Lesson: Representation matters!

# Simulate COMPAS-like scenario
def simulate_compas():
    """Simulate biased recidivism prediction"""
    np.random.seed(42)
    n = 1000
    
    # Create synthetic data
    race = np.random.choice(['Black', 'White'], n, p=[0.4, 0.6])
    
    # True recidivism (similar rates)
    true_recidivism = np.random.choice([0, 1], n, p=[0.6, 0.4])
    
    # Biased predictions (higher false positive for Black defendants)
    predictions = true_recidivism.copy()
    
    # Increase false positives for Black defendants
    black_mask = (race == 'Black') & (true_recidivism == 0)
    flip_indices = np.where(black_mask)[0]
    flip_count = int(len(flip_indices) * 0.3)  # 30% false positive rate
    predictions[np.random.choice(flip_indices, flip_count, replace=False)] = 1
    
    # Calculate false positive rates
    print("COMPAS Simulation - False Positive Rates:")
    print("=" * 60)
    
    for race_val in ['Black', 'White']:
        mask = race == race_val
        actual_neg = (true_recidivism[mask] == 0)
        pred_pos = (predictions[mask] == 1)
        
        fp = np.sum(actual_neg & pred_pos)
        tn = np.sum(actual_neg & ~pred_pos)
        fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
        
        print(f"{race_val}:")
        print(f"  False Positives: {fp}")
        print(f"  True Negatives: {tn}")
        print(f"  False Positive Rate: {fpr:.3f} ({fpr*100:.1f}%)")
    
    print("\nโš ๏ธ This demonstrates how bias manifests in predictions!")

simulate_compas()

Part 6: Fairness Toolkits
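Two toolkits are installed in the setup cell: Fairlearn, used throughout this notebook, and AIF360. As a hedged sketch of the latter, the snippet below computes two group fairness metrics directly on the biased loan data; the class and method names (BinaryLabelDataset, BinaryLabelDatasetMetric, disparate_impact, statistical_parity_difference) follow AIF360's documented API, but treat this as illustrative rather than a verified recipe.

# Sketch: auditing the biased loan data with AIF360 (assumes aif360 is installed)
# AIF360 expects an all-numeric DataFrame, so gender is encoded 0/1 first.
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric

aif_df = data[['income', 'credit_score', 'age', 'gender', 'approved']].copy()
aif_df['gender'] = (aif_df['gender'] == 'Male').astype(int)  # 1 = Male, 0 = Female

loan_dataset = BinaryLabelDataset(
    df=aif_df,
    label_names=['approved'],
    protected_attribute_names=['gender'],
    favorable_label=1,
    unfavorable_label=0
)

aif_metric = BinaryLabelDatasetMetric(
    loan_dataset,
    privileged_groups=[{'gender': 1}],
    unprivileged_groups=[{'gender': 0}]
)

print(f"Disparate impact (unprivileged/privileged rate): {aif_metric.disparate_impact():.3f}")
print(f"Statistical parity difference: {aif_metric.statistical_parity_difference():.3f}")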

🎯 Knowledge Check

Q1: What's the difference between demographic parity and equalized odds?
Q2: Can removing sensitive features (like race) eliminate bias?
Q3: What are the three stages for bias mitigation?
Q4: Why might improving fairness reduce accuracy?

Answers:

A1: Demographic parity: equal outcome rates. Equalized odds: equal TPR and FPR across groups
A2: No! Proxy features (zip code, name, etc.) can still encode sensitive info
A3: Pre-processing (data), in-processing (model), post-processing (predictions)
A4: Trade-off: model optimized for overall accuracy may perform unequally across groups

📚 Summary

Fairness Checklist

Before Training:

  • Analyze dataset for bias

  • Check representation of protected groups

  • Identify potential proxy features

  • Define fairness criteria for your use case

During Training:

  • Track metrics by group

  • Consider fairness constraints

  • Use appropriate mitigation if needed

After Training:

  • Calculate fairness metrics

  • Test on diverse data

  • Document limitations

  • Monitor in production

Remember: Fairness is context-dependent. Always consider:

  • Legal requirements

  • Stakeholder needs

  • Potential harms

  • Trade-offs

🚀 Next Steps

  1. Complete Bias & Fairness Challenge

  2. Read Notebook 5: Model Comparison & Selection

  3. Audit your own models for bias

  4. Explore Fairlearn and AIF360 documentation

Important work! Fairness is crucial for responsible AI! ⚖️