Exploratory Data Analysis: A Systematic FrameworkΒΆ
EDA is not just running .describe() β itβs a structured investigation. This notebook gives you a repeatable framework: univariate β bivariate β multivariate β anomaly detection.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# Use the Titanic dataset β classic EDA substrate
df = sns.load_dataset('titanic')
print(f'Shape: {df.shape}')
df.head()
Step 1: Data Overview β The First 5 MinutesΒΆ
def quick_overview(df: pd.DataFrame) -> pd.DataFrame:
"""One-table summary of every column."""
report = pd.DataFrame({
'dtype': df.dtypes,
'nulls': df.isnull().sum(),
'null_pct': (df.isnull().sum() / len(df) * 100).round(1),
'unique': df.nunique(),
'sample': df.iloc[0],
})
report['cardinality'] = report['unique'].apply(
lambda x: 'binary' if x == 2 else ('low' if x < 20 else ('medium' if x < 100 else 'high'))
)
return report
print(quick_overview(df))
Step 2: Univariate Analysis β Each Column in IsolationΒΆ
# Numerical columns
num_cols = ['age', 'fare', 'sibsp', 'parch']
fig, axes = plt.subplots(2, len(num_cols), figsize=(16, 8))
for i, col in enumerate(num_cols):
data = df[col].dropna()
# Histogram + KDE
axes[0, i].hist(data, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
axes[0, i].set_title(f'{col}\nskew={data.skew():.2f}')
axes[0, i].set_xlabel(col)
# Boxplot
axes[1, i].boxplot(data, vert=True)
axes[1, i].set_title(f'{col} distribution')
plt.suptitle('Numerical Feature Distributions', fontsize=14)
plt.tight_layout()
plt.show()
# Summary stats with more percentiles
df[num_cols].describe(percentiles=[.05, .25, .5, .75, .95]).round(2)
# Categorical columns
cat_cols = ['sex', 'pclass', 'embarked', 'who', 'class']
fig, axes = plt.subplots(1, len(cat_cols), figsize=(18, 4))
for ax, col in zip(axes, cat_cols):
counts = df[col].value_counts()
ax.bar(counts.index.astype(str), counts.values, color='coral', edgecolor='black')
ax.set_title(col)
ax.set_xlabel('')
for i, v in enumerate(counts.values):
ax.text(i, v + 2, str(v), ha='center', fontsize=9)
plt.suptitle('Categorical Feature Distributions', fontsize=14)
plt.tight_layout()
plt.show()
Step 3: Bivariate Analysis β Feature vs. TargetΒΆ
target = 'survived'
fig, axes = plt.subplots(2, 3, figsize=(16, 10))
# Categorical features vs survival
for ax, col in zip(axes[0], ['sex', 'pclass', 'embarked']):
survival_rate = df.groupby(col)[target].mean().sort_values(ascending=False)
bars = ax.bar(survival_rate.index.astype(str), survival_rate.values,
color=['#2ecc71' if v > 0.5 else '#e74c3c' for v in survival_rate.values])
ax.set_title(f'Survival Rate by {col}')
ax.set_ylim(0, 1)
ax.axhline(y=df[target].mean(), color='black', linestyle='--', alpha=0.5, label='Overall avg')
for bar, v in zip(bars, survival_rate.values):
ax.text(bar.get_x() + bar.get_width()/2., v + 0.02, f'{v:.1%}', ha='center', fontsize=10)
# Numerical features vs survival (KDE per class)
for ax, col in zip(axes[1], ['age', 'fare', 'sibsp']):
for survived, color, label in [(0, 'red', 'Died'), (1, 'green', 'Survived')]:
data = df[df[target] == survived][col].dropna()
ax.hist(data, bins=25, alpha=0.5, color=color, label=label, density=True)
ax.set_title(f'{col} by Survival')
ax.legend()
plt.suptitle('Feature vs. Survival Rate', fontsize=14)
plt.tight_layout()
plt.show()
Step 4: Correlation AnalysisΒΆ
# Numerical correlations
corr_cols = ['survived', 'pclass', 'age', 'sibsp', 'parch', 'fare']
corr_matrix = df[corr_cols].corr()
fig, ax = plt.subplots(figsize=(8, 6))
mask = np.triu(np.ones_like(corr_matrix, dtype=bool)) # upper triangle
sns.heatmap(
corr_matrix,
annot=True, fmt='.2f', cmap='coolwarm',
mask=mask, vmin=-1, vmax=1, center=0,
ax=ax, square=True
)
ax.set_title('Correlation Matrix (lower triangle)')
plt.tight_layout()
plt.show()
# Features most correlated with target
target_corr = corr_matrix['survived'].drop('survived').sort_values(key=abs, ascending=False)
print('\nCorrelation with survival (magnitude order):')
for col, val in target_corr.items():
direction = 'β' if val > 0 else 'β'
print(f' {col:10s}: {val:+.3f} {direction}')
Step 5: Multivariate β Interaction EffectsΒΆ
# pclass Γ sex interaction β survival heatmap
pivot = df.groupby(['pclass', 'sex'])['survived'].mean().unstack()
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
sns.heatmap(pivot, annot=True, fmt='.1%', cmap='RdYlGn', ax=ax1, vmin=0, vmax=1)
ax1.set_title('Survival Rate: Class Γ Sex')
# Scatter with multiple dimensions
survived_colors = {0: 'red', 1: 'green'}
pclass_markers = {1: 'o', 2: 's', 3: '^'}
for pclass in [1, 2, 3]:
for survived in [0, 1]:
subset = df[(df['pclass'] == pclass) & (df['survived'] == survived) & df['age'].notna()]
ax2.scatter(subset['age'], subset['fare'],
c=survived_colors[survived], marker=pclass_markers[pclass],
alpha=0.4, s=30,
label=f'Class {pclass}, {"Survived" if survived else "Died"}')
ax2.set_xlabel('Age')
ax2.set_ylabel('Fare')
ax2.set_title('Age vs Fare (color=survival, shape=class)')
ax2.legend(loc='upper right', fontsize=7, ncol=2)
plt.tight_layout()
plt.show()
Step 6: Outlier DetectionΒΆ
def detect_outliers(series: pd.Series, method: str = 'iqr') -> pd.Series:
"""Return boolean mask of outliers."""
if method == 'iqr':
Q1, Q3 = series.quantile(0.25), series.quantile(0.75)
IQR = Q3 - Q1
return (series < Q1 - 1.5 * IQR) | (series > Q3 + 1.5 * IQR)
elif method == 'zscore':
return pd.Series(np.abs(stats.zscore(series.dropna())) > 3, index=series.dropna().index)
print('Outlier counts by method:')
for col in ['age', 'fare', 'sibsp']:
data = df[col].dropna()
iqr_outliers = detect_outliers(data, 'iqr').sum()
z_outliers = detect_outliers(data, 'zscore').sum()
print(f' {col:8s}: IQR={iqr_outliers}, Z-score={z_outliers}')
Step 7: Missing Value AnalysisΒΆ
# Visualize missing patterns
missing = df.isnull().mean().sort_values(ascending=False)
missing = missing[missing > 0]
if len(missing) > 0:
fig, ax = plt.subplots(figsize=(8, 4))
bars = ax.barh(missing.index, missing.values * 100, color='salmon')
ax.set_xlabel('Missing %')
ax.set_title('Missing Values by Column')
for bar, val in zip(bars, missing.values):
ax.text(val * 100 + 0.5, bar.get_y() + bar.get_height()/2,
f'{val:.1%}', va='center')
plt.tight_layout()
plt.show()
# Is 'age' missing at random? Check vs survival rate
df['age_missing'] = df['age'].isnull()
print('Survival rate when age is missing vs. present:')
print(df.groupby('age_missing')['survived'].agg(['mean', 'count']))
EDA ChecklistΒΆ
β
Shape, dtypes, memory usage
β
Missing values β how much, is it random?
β
Univariate: distributions, skew, cardinality
β
Bivariate: feature vs. target
β
Correlation matrix
β
Multivariate: interactions between features
β
Outliers β detect and decide: fix, keep, or drop?
β
Class balance (classification tasks)
β
Temporal patterns (time series data)
ExercisesΒΆ
Load the
penguinsdataset (sns.load_dataset('penguins')). Run the full EDA framework on it.For the Titanic data, check if fare outliers should be removed β whatβs the impact on the fare distribution?
Build a
correlation_with_targetfunction that tests both Pearson (numerical) and CramΓ©rβs V (categorical) against a target.Create an EDA report function that saves all plots to a PDF automatically.
Check if
ageis MCAR, MAR, or MNAR by testing its relationship with other variables.