Run this notebook: Open in Colab Open in Kaggle

Exploratory Data Analysis: A Systematic Framework¶

EDA is not just running .describe() — it’s a structured investigation. This notebook gives you a repeatable framework: univariate → bivariate → multivariate → anomaly detection.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Use the Titanic dataset — classic EDA substrate
df = sns.load_dataset('titanic')
print(f'Shape: {df.shape}')
df.head()

Step 1: Data Overview — The First 5 Minutes¶

def quick_overview(df: pd.DataFrame) -> pd.DataFrame:
    """One-table summary of every column."""
    report = pd.DataFrame({
        'dtype':    df.dtypes,
        'nulls':    df.isnull().sum(),
        'null_pct': (df.isnull().sum() / len(df) * 100).round(1),
        'unique':   df.nunique(),
        'sample':   df.iloc[0],
    })
    report['cardinality'] = report['unique'].apply(
        lambda x: 'binary' if x == 2 else ('low' if x < 20 else ('medium' if x < 100 else 'high'))
    )
    return report

print(quick_overview(df))

Step 2: Univariate Analysis — Each Column in Isolation¶

# Numerical columns
num_cols = ['age', 'fare', 'sibsp', 'parch']

fig, axes = plt.subplots(2, len(num_cols), figsize=(16, 8))

for i, col in enumerate(num_cols):
    data = df[col].dropna()
    
    # Histogram + KDE
    axes[0, i].hist(data, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
    axes[0, i].set_title(f'{col}\nskew={data.skew():.2f}')
    axes[0, i].set_xlabel(col)
    
    # Boxplot
    axes[1, i].boxplot(data, vert=True)
    axes[1, i].set_title(f'{col} distribution')

plt.suptitle('Numerical Feature Distributions', fontsize=14)
plt.tight_layout()
plt.show()

# Summary stats with more percentiles
df[num_cols].describe(percentiles=[.05, .25, .5, .75, .95]).round(2)

# Categorical columns
cat_cols = ['sex', 'pclass', 'embarked', 'who', 'class']

fig, axes = plt.subplots(1, len(cat_cols), figsize=(18, 4))

for ax, col in zip(axes, cat_cols):
    counts = df[col].value_counts()
    ax.bar(counts.index.astype(str), counts.values, color='coral', edgecolor='black')
    ax.set_title(col)
    ax.set_xlabel('')
    for i, v in enumerate(counts.values):
        ax.text(i, v + 2, str(v), ha='center', fontsize=9)

plt.suptitle('Categorical Feature Distributions', fontsize=14)
plt.tight_layout()
plt.show()

Step 3: Bivariate Analysis — Feature vs. Target¶

target = 'survived'

fig, axes = plt.subplots(2, 3, figsize=(16, 10))

# Categorical features vs survival
for ax, col in zip(axes[0], ['sex', 'pclass', 'embarked']):
    survival_rate = df.groupby(col)[target].mean().sort_values(ascending=False)
    bars = ax.bar(survival_rate.index.astype(str), survival_rate.values,
                  color=['#2ecc71' if v > 0.5 else '#e74c3c' for v in survival_rate.values])
    ax.set_title(f'Survival Rate by {col}')
    ax.set_ylim(0, 1)
    ax.axhline(y=df[target].mean(), color='black', linestyle='--', alpha=0.5, label='Overall avg')
    for bar, v in zip(bars, survival_rate.values):
        ax.text(bar.get_x() + bar.get_width()/2., v + 0.02, f'{v:.1%}', ha='center', fontsize=10)

# Numerical features vs survival (KDE per class)
for ax, col in zip(axes[1], ['age', 'fare', 'sibsp']):
    for survived, color, label in [(0, 'red', 'Died'), (1, 'green', 'Survived')]:
        data = df[df[target] == survived][col].dropna()
        ax.hist(data, bins=25, alpha=0.5, color=color, label=label, density=True)
    ax.set_title(f'{col} by Survival')
    ax.legend()

plt.suptitle('Feature vs. Survival Rate', fontsize=14)
plt.tight_layout()
plt.show()

Step 4: Correlation Analysis¶

# Numerical correlations
corr_cols = ['survived', 'pclass', 'age', 'sibsp', 'parch', 'fare']
corr_matrix = df[corr_cols].corr()

fig, ax = plt.subplots(figsize=(8, 6))
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))  # upper triangle
sns.heatmap(
    corr_matrix,
    annot=True, fmt='.2f', cmap='coolwarm',
    mask=mask, vmin=-1, vmax=1, center=0,
    ax=ax, square=True
)
ax.set_title('Correlation Matrix (lower triangle)')
plt.tight_layout()
plt.show()

# Features most correlated with target
target_corr = corr_matrix['survived'].drop('survived').sort_values(key=abs, ascending=False)
print('\nCorrelation with survival (magnitude order):')
for col, val in target_corr.items():
    direction = '↑' if val > 0 else '↓'
    print(f'  {col:10s}: {val:+.3f} {direction}')

Step 5: Multivariate — Interaction Effects¶

# pclass × sex interaction — survival heatmap
pivot = df.groupby(['pclass', 'sex'])['survived'].mean().unstack()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

sns.heatmap(pivot, annot=True, fmt='.1%', cmap='RdYlGn', ax=ax1, vmin=0, vmax=1)
ax1.set_title('Survival Rate: Class × Sex')

# Scatter with multiple dimensions
survived_colors = {0: 'red', 1: 'green'}
pclass_markers = {1: 'o', 2: 's', 3: '^'}

for pclass in [1, 2, 3]:
    for survived in [0, 1]:
        subset = df[(df['pclass'] == pclass) & (df['survived'] == survived) & df['age'].notna()]
        ax2.scatter(subset['age'], subset['fare'],
                    c=survived_colors[survived], marker=pclass_markers[pclass],
                    alpha=0.4, s=30,
                    label=f'Class {pclass}, {"Survived" if survived else "Died"}')

ax2.set_xlabel('Age')
ax2.set_ylabel('Fare')
ax2.set_title('Age vs Fare (color=survival, shape=class)')
ax2.legend(loc='upper right', fontsize=7, ncol=2)

plt.tight_layout()
plt.show()

Step 6: Outlier Detection¶

def detect_outliers(series: pd.Series, method: str = 'iqr') -> pd.Series:
    """Return boolean mask of outliers."""
    if method == 'iqr':
        Q1, Q3 = series.quantile(0.25), series.quantile(0.75)
        IQR = Q3 - Q1
        return (series < Q1 - 1.5 * IQR) | (series > Q3 + 1.5 * IQR)
    elif method == 'zscore':
        return pd.Series(np.abs(stats.zscore(series.dropna())) > 3, index=series.dropna().index)

print('Outlier counts by method:')
for col in ['age', 'fare', 'sibsp']:
    data = df[col].dropna()
    iqr_outliers = detect_outliers(data, 'iqr').sum()
    z_outliers = detect_outliers(data, 'zscore').sum()
    print(f'  {col:8s}: IQR={iqr_outliers}, Z-score={z_outliers}')

Step 7: Missing Value Analysis¶

# Visualize missing patterns
missing = df.isnull().mean().sort_values(ascending=False)
missing = missing[missing > 0]

if len(missing) > 0:
    fig, ax = plt.subplots(figsize=(8, 4))
    bars = ax.barh(missing.index, missing.values * 100, color='salmon')
    ax.set_xlabel('Missing %')
    ax.set_title('Missing Values by Column')
    for bar, val in zip(bars, missing.values):
        ax.text(val * 100 + 0.5, bar.get_y() + bar.get_height()/2,
                f'{val:.1%}', va='center')
    plt.tight_layout()
    plt.show()

# Is 'age' missing at random? Check vs survival rate
df['age_missing'] = df['age'].isnull()
print('Survival rate when age is missing vs. present:')
print(df.groupby('age_missing')['survived'].agg(['mean', 'count']))

EDA Checklist¶

✅ Shape, dtypes, memory usage
✅ Missing values — how much, is it random?
✅ Univariate: distributions, skew, cardinality
✅ Bivariate: feature vs. target
✅ Correlation matrix
✅ Multivariate: interactions between features
✅ Outliers — detect and decide: fix, keep, or drop?
✅ Class balance (classification tasks)
✅ Temporal patterns (time series data)

Exercises¶

Load the penguins dataset (sns.load_dataset('penguins')). Run the full EDA framework on it.
For the Titanic data, check if fare outliers should be removed — what’s the impact on the fare distribution?
Build a correlation_with_target function that tests both Pearson (numerical) and Cramér’s V (categorical) against a target.
Create an EDA report function that saves all plots to a PDF automatically.
Check if age is MCAR, MAR, or MNAR by testing its relationship with other variables.