Data Visualization: From Exploratory Plots to Publication-Quality Figures
Matplotlib fundamentals, seaborn statistical plots, and plotly interactive charts — with a focus on making visualizations that actually communicate insights.
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
import pandas as pd
import numpy as np
# Plot styling: matplotlib style sheet first, then the seaborn colour palette
# (same order as before).
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# Example datasets, loaded through seaborn in a fixed order and bound to the
# module-level names that every figure below reads from.
tips, flights, iris = (
    sns.load_dataset(dataset_name)
    for dataset_name in ('tips', 'flights', 'iris')
)
print('Datasets loaded: tips, flights, iris')
1. Matplotlib Subplots — Full Control
# 2x2 canvas shared by the four panels of this figure.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(14, 10))
fig.suptitle('Tips Dataset: Four Perspectives', fontsize=16, fontweight='bold')

# Panel 1 (top-left): histogram of the total bill, with dashed vertical
# reference lines at the mean and the median.
bill_ax = axes[0, 0]
total_bill = tips['total_bill']
mean_bill = total_bill.mean()
median_bill = total_bill.median()

bill_ax.hist(total_bill, bins=25, color='steelblue', edgecolor='white', alpha=0.8)
bill_ax.axvline(mean_bill, color='red', linestyle='--',
                label=f'Mean: ${mean_bill:.2f}')
bill_ax.axvline(median_bill, color='orange', linestyle='--',
                label=f'Median: ${median_bill:.2f}')
bill_ax.set_title('Total Bill Distribution')
bill_ax.set_xlabel('Total Bill ($)')
bill_ax.legend()
# 2. Tip % by day (top-right panel)
# tip_pct is stored on the DataFrame itself because later figures read the
# same column.
tips['tip_pct'] = tips['tip'] / tips['total_bill'] * 100
day_order = ['Thur', 'Fri', 'Sat', 'Sun']
bp = axes[0, 1].boxplot(
    [tips.loc[tips['day'] == d, 'tip_pct'].values for d in day_order],
    patch_artist=True,  # draw boxes as patches so they can be filled below
    medianprops={'color': 'red', 'linewidth': 2},
)
# Name the boxes through the axis instead of boxplot(labels=...): that keyword
# is deprecated since Matplotlib 3.9 (renamed to tick_labels), whereas
# set_xticklabels behaves the same on older and newer releases.
axes[0, 1].set_xticklabels(day_order)

# One fill colour per day, paired with the boxes in day_order.
colors = ['#3498db', '#e74c3c', '#2ecc71', '#f39c12']
for patch, color in zip(bp['boxes'], colors):
    patch.set_facecolor(color)
    patch.set_alpha(0.7)
axes[0, 1].set_title('Tip % Distribution by Day')
axes[0, 1].set_ylabel('Tip %')
# 3. Scatter: bill vs tip, colored by sex (bottom-left panel)
for sex, color, marker in [('Male', 'steelblue', 'o'), ('Female', 'salmon', 's')]:
    subset = tips[tips['sex'] == sex]
    axes[1, 0].scatter(subset['total_bill'], subset['tip'],
                       c=color, marker=marker, alpha=0.6, s=50, label=sex)

# Degree-1 least-squares fit over all diners (both groups pooled), drawn once
# after the per-group scatter loop.
m, b = np.polyfit(tips['total_bill'], tips['tip'], 1)
x_line = np.linspace(tips['total_bill'].min(), tips['total_bill'].max(), 100)
# The legend reports the squared Pearson correlation between bill and tip.
r_squared = np.corrcoef(tips['total_bill'], tips['tip'])[0, 1] ** 2
axes[1, 0].plot(x_line, m * x_line + b, 'k--', alpha=0.5,
                label=f'Trend (r²={r_squared:.2f})')
axes[1, 0].set_title('Total Bill vs Tip')
axes[1, 0].set_xlabel('Total Bill ($)')
axes[1, 0].set_ylabel('Tip ($)')
axes[1, 0].legend()
# 4. Stacked bar: smoker × time (bottom-right panel)
# Rows are meal times, columns are smoker status, cells are diner counts.
# observed=False is spelled out so the result keeps every category
# combination regardless of the pandas default, and so pandas versions that
# deprecate the implicit default do not emit a FutureWarning.
# NOTE(review): this matters only if 'time'/'smoker' are categorical columns,
# as they are expected to be in seaborn's tips dataset — confirm.
counts = (
    tips.groupby(['time', 'smoker'], observed=False)
    .size()
    .unstack(fill_value=0)
)
counts.plot(kind='bar', stacked=True, ax=axes[1, 1],
            color=['#e74c3c', '#3498db'], alpha=0.8, edgecolor='white')
axes[1, 1].set_title('Diners by Meal Time & Smoking')
axes[1, 1].set_xlabel('Time')
# Keep the category names horizontal instead of the rotated default.
axes[1, 1].tick_params(axis='x', rotation=0)

# Finalise the whole 2x2 figure.
plt.tight_layout()
plt.show()
2. Seaborn — Statistical Visualizations
# One row of three seaborn plots: violin, heatmap, scatter.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
violin_ax, heatmap_ax, scatter_ax = axes

# Left: split violins of tip percentage per day, one half per sex, with
# quartile lines drawn inside each half.
sns.violinplot(
    data=tips,
    x='day',
    y='tip_pct',
    hue='sex',
    order=['Thur', 'Fri', 'Sat', 'Sun'],
    split=True,
    inner='quart',
    ax=violin_ax,
)
violin_ax.set_title('Tip % by Day & Sex (Violin)')
violin_ax.set_ylabel('Tip %')

# Middle: month × year grid of passenger counts, each cell annotated with its
# integer value. flights_pivot keeps its name because a later cell reuses it.
flights_pivot = flights.pivot(index='month', columns='year', values='passengers')
sns.heatmap(
    flights_pivot,
    annot=True,
    fmt='d',
    cmap='YlOrRd',
    linewidths=0.5,
    cbar_kws={'label': 'Passengers'},
    ax=heatmap_ax,
)
heatmap_ax.set_title('Monthly Airline Passengers (1949-1960)')

# Right: a single iris feature pair, with species encoded by both colour and
# marker style.
sns.scatterplot(
    data=iris,
    x='sepal_length',
    y='petal_length',
    hue='species',
    style='species',
    s=80,
    ax=scatter_ax,
)
scatter_ax.set_title('Iris: Sepal vs Petal Length')

plt.tight_layout()
plt.show()
# FacetGrid — small multiples: one total-bill histogram (with KDE overlay)
# per combination of sex (rows) and meal time (columns).
g = sns.FacetGrid(tips, col='time', row='sex', height=4, aspect=1.2)
g.map_dataframe(sns.histplot, x='total_bill', bins=20, kde=True)
g.set_axis_labels('Total Bill ($)', 'Count')
g.set_titles(col_template='{col_name}', row_template='{row_name}')
g.add_legend()
# y=1.02 positions the overall title slightly above the top of the figure.
plt.suptitle('Bill Distribution by Meal × Sex (Small Multiples)', y=1.02, fontsize=13)
plt.tight_layout()
plt.show()
3. Publication-Quality Figures
# Techniques for making figures presentable
# A small named palette keeps colour choices consistent across the figure.
COLORS = {'primary': '#2c3e50', 'accent': '#e74c3c', 'highlight': '#f39c12', 'muted': '#95a5a6'}
fig, ax = plt.subplots(figsize=(10, 6))

# Annual trend: passengers are summed per year (the variable keeps its
# original name, `monthly`, but holds one row per year).
monthly = flights.groupby('year')['passengers'].sum().reset_index()
ax.fill_between(monthly['year'], monthly['passengers'],
                alpha=0.15, color=COLORS['primary'])
ax.plot(monthly['year'], monthly['passengers'],
        color=COLORS['primary'], linewidth=2.5, marker='o', markersize=6)

# Highlight peak year: the row with the largest annual total.
peak = monthly.loc[monthly['passengers'].idxmax()]
ax.scatter(peak['year'], peak['passengers'],
           color=COLORS['accent'], s=150, zorder=5)  # zorder draws it above the line
# The label is offset 2 years left and 5000 passengers down in data
# coordinates; the arrow points back at the peak marker.
ax.annotate(f'Peak: {peak["passengers"]:,}',
            xy=(peak['year'], peak['passengers']),
            xytext=(peak['year'] - 2, peak['passengers'] - 5000),
            fontsize=11, color=COLORS['accent'],
            arrowprops={'arrowstyle': '->', 'color': COLORS['accent']})

# Polish
ax.set_title('Annual Airline Passengers: 1949–1960', fontsize=15, fontweight='bold', pad=15)
ax.set_xlabel('Year', fontsize=12)
ax.set_ylabel('Total Passengers', fontsize=12)
# Show y ticks in thousands, e.g. 3000 -> "3K".
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{x/1000:.0f}K'))
ax.spines[['top', 'right']].set_visible(False)  # Remove top/right borders
ax.grid(axis='y', alpha=0.4)
plt.tight_layout()
plt.show()
4. Plotly — Interactive Charts
# !pip install plotly
# Only the imports are guarded: an ImportError raised later, while building or
# showing a figure, should surface as-is rather than be reported as
# "plotly is not installed".
try:
    import plotly.express as px
    # Not used in this cell; kept so the names remain available afterwards.
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots
except ImportError:
    print('Install plotly: pip install plotly')
    print('Example code shown above — run after installing.')
else:
    # Interactive scatter: colour by sex, marker size by tip percentage,
    # marker symbol by meal time; day and party size appear on hover.
    fig = px.scatter(
        tips, x='total_bill', y='tip',
        color='sex', size='tip_pct', symbol='time',
        hover_data=['day', 'size'],
        title='Tips: Interactive Explorer (hover for details)',
        labels={'total_bill': 'Total Bill ($)', 'tip': 'Tip ($)'}
    )
    fig.show()

    # Interactive heatmap of the month × year passenger table built earlier.
    fig2 = px.imshow(
        flights_pivot,
        title='Airline Passengers Heatmap (interactive)',
        color_continuous_scale='YlOrRd',
        aspect='auto'
    )
    fig2.show()
Visualization Cheat Sheet
Data type / Goal → Best chart
──────────────────────────────────────────────
Single continuous variable → histogram + KDE
Distribution comparison → violin plot or overlaid KDE
Category comparison → bar chart with error bars
Trend over time → line chart with shaded area
Two continuous variables → scatter with regression line
Many variables pairwise → pairplot (seaborn)
Matrix of values → heatmap
Part of whole → stacked bar (not pie!)
Multiple group distributions → FacetGrid small multiples
Interactive exploration → plotly express
Exercises
1. Recreate the matplotlib multi-panel figure using seaborn API only.
2. Load sns.load_dataset('penguins') and create a pairplot colored by species.
3. Build a Plotly animated scatter plot showing flight passengers growing over time using animation_frame='year'.
4. Create a dashboard-style 3×2 figure for the tips dataset with a consistent color scheme.
5. Use matplotlib.gridspec.GridSpec to create an asymmetric layout (large scatter + two small histograms as marginals).