Pandas Fundamentals: The Operations Every Data Scientist Must Know
Beyond read_csv and head() — master groupby, merge, pivot, method chaining, and memory optimization to write pandas code that's fast, readable, and production-ready.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
pd.set_option('display.max_columns', 20)
pd.set_option('display.float_format', '{:.2f}'.format)

# Synthetic sales dataset we'll use throughout.
np.random.seed(42)
n = 1000
df = pd.DataFrame({
    # FIX: dropped the no-op `.repeat(1)[:n]` — date_range with periods=n
    # already yields exactly n daily timestamps.
    'date': pd.date_range('2023-01-01', periods=n, freq='D'),
    'region': np.random.choice(['North', 'South', 'East', 'West'], n),
    'product': np.random.choice(['Laptop', 'Phone', 'Tablet', 'Watch'], n),
    'salesperson': np.random.choice([f'Rep_{i}' for i in range(1, 11)], n),
    'quantity': np.random.randint(1, 20, n),
    'unit_price': np.random.choice([299, 499, 799, 1299, 1999], n).astype(float),
    'discount': np.random.uniform(0, 0.3, n).round(2),
    'customer_id': np.random.randint(1000, 2000, n),
})
# Net revenue after discount; month as a Period for calendar-based grouping.
df['revenue'] = (df['quantity'] * df['unit_price'] * (1 - df['discount'])).round(2)
df['month'] = df['date'].dt.to_period('M')
# FIX: restored the multiplication sign that was garbled by a bad encoding round-trip.
print(f'Dataset: {df.shape[0]} rows × {df.shape[1]} columns')
df.head()
1. GroupBy — The Most Powerful Aggregation Tool
# Basic groupby: one pass over the data, three aggregates, named result columns.
revenue_by_region = df.groupby('region')['revenue'].agg(
    total_revenue='sum',
    avg_revenue='mean',
    transactions='count',
)
print('Revenue by region:')
print(revenue_by_region.sort_values('total_revenue', ascending=False))
# Multi-level groupby with named aggregations (pandas 0.25+):
# the spec maps output-column name -> (source column, aggregation).
agg_spec = {
    'total_revenue': ('revenue', 'sum'),
    'avg_quantity': ('quantity', 'mean'),
    'max_discount': ('discount', 'max'),
    'unique_customers': ('customer_id', 'nunique'),
    'transactions': ('revenue', 'count'),
}
summary = (
    df.groupby(['region', 'product'])
      .agg(**agg_spec)
      .round(2)
      .reset_index()
)
print(summary.head(8))
# transform() broadcasts a group-level statistic back onto every row,
# so the frame keeps its original shape.
# Use case: each sale's share of its region's total revenue.
df['region_total'] = df.groupby('region')['revenue'].transform('sum')
df['pct_of_region'] = df['revenue'].div(df['region_total']).mul(100).round(2)
print('Top 5 transactions by % of regional revenue:')
top_share = df[['region', 'product', 'revenue', 'pct_of_region']]
print(top_share.sort_values('pct_of_region', ascending=False).head())
# filter() drops entire groups that fail a predicate.
# Here: keep rows only from regions with more than 250 transactions.
min_transactions = 250
high_volume_regions = df.groupby('region').filter(
    lambda grp: grp.shape[0] > min_transactions
)
print(f'Original: {len(df)} rows')
print(f'High volume regions only: {len(high_volume_regions)} rows')
print(f'Regions included: {high_volume_regions["region"].unique()}')
2. Merge — Joining DataFrames Like a Pro
# Two lookup tables to join against the sales data.
customers = pd.DataFrame({
    'customer_id': range(1000, 2000),
    'tier': np.random.choice(['Bronze', 'Silver', 'Gold', 'Platinum'], 1000),
    'signup_year': np.random.choice([2019, 2020, 2021, 2022, 2023], 1000),
})
region_targets = pd.DataFrame({
    'region': ['North', 'South', 'East', 'West'],
    'q4_target': [500_000, 450_000, 600_000, 520_000],
})

# Left join: keep every sale, whether or not a customer record matches.
merged = pd.merge(df, customers, on='customer_id', how='left')
print(f'After left join: {merged.shape}')

# Default inner join on 'region'; every sale matches one of the four targets.
merged2 = pd.merge(merged, region_targets, on='region')
print(f'After adding targets: {merged2.shape}')
# Diagnose merge issues: check for duplicates and non-matches
def merge_diagnostic(left_df, right_df, on, how='left'):
    """Run a merge and report row-count symptoms before you trust it.

    Parameters
    ----------
    left_df, right_df : pd.DataFrame
        Tables to join.
    on : label or list of labels
        Join key(s), passed straight through to DataFrame.merge.
    how : str, default 'left'
        Join type, passed straight through to DataFrame.merge.

    Returns
    -------
    pd.DataFrame
        The merged result (always returned, even when a warning fires).
    """
    left_n = len(left_df)
    right_n = len(right_df)
    # indicator=True tags each row's provenance so non-matches are countable.
    # NOTE(review): assumes neither input already has a '_merge' column.
    result = left_df.merge(right_df, on=on, how=how, indicator=True)
    unmatched = int((result['_merge'] == 'left_only').sum())
    result = result.drop(columns='_merge')
    print(f'Left rows: {left_n:,}')
    print(f'Right rows: {right_n:,}')
    print(f'Result rows: {len(result):,}')
    if len(result) > left_n:
        # The result can only outgrow the left side if the right table
        # repeats join keys.
        print(f'⚠️ DUPLICATES: result is {len(result) - left_n} rows larger than left!')
        print('   → Right table has duplicate keys. Deduplicate before merging.')
    elif unmatched and how == 'left':
        # FIX: the original compared len(result) < left_n, which can never be
        # true for a left join (unmatched rows are kept with NaNs, not dropped),
        # so the branch was unreachable. Count 'left_only' rows instead.
        print(f'⚠️ NON-MATCHES: {unmatched} rows did not find a match.')
    else:
        # FIX: the original success message was a mojibake'd emoji split across
        # two source lines — an unterminated string literal (SyntaxError).
        print('✅ Merge looks clean.')
    return result
_ = merge_diagnostic(df, customers, on='customer_id')
3. Pivot Tables — Reshaping Data
# pivot_table reshapes long data into a region-by-product revenue matrix.
pivot = df.pivot_table(
    values='revenue',
    index='region',
    columns='product',
    aggfunc='sum',
    margins=True,         # add a row and column of totals
    margins_name='Total'
).round(0)
# FIX: restored the multiplication sign that was garbled by a bad encoding
# round-trip in the original print string.
print('Revenue: Region × Product')
print(pivot)
# Visualize the pivot as a heatmap, with the totals removed so the color
# scale reflects individual cells rather than the grand totals.
plt.figure(figsize=(10, 5))
pivot_data = pivot.drop(index='Total', columns='Total')
sns.heatmap(
    pivot_data,
    annot=True,
    fmt='.0f',
    cmap='YlOrRd',
    cbar_kws={'label': 'Revenue ($)'},
)
plt.title('Revenue by Region and Product')
plt.tight_layout()
plt.show()
4. Method Chaining — Readable Transformations
# Without chaining: messy, lots of temp variables
# (kept deliberately un-refactored as the "before" example — each temp
# name exists only to feed the very next line and is never reused)
temp1 = df[df['revenue'] > 1000]
temp2 = temp1.groupby(['region', 'product'])['revenue'].sum().reset_index()
temp3 = temp2.rename(columns={'revenue': 'total_revenue'})
temp4 = temp3.sort_values('total_revenue', ascending=False)
result_bad = temp4.head(5)
# With chaining: one readable pipeline, no throwaway names.
result_good = (
    df
    .query('revenue > 1000')                           # keep big-ticket sales
    .groupby(['region', 'product'])['revenue']         # one series per combo
    .sum()                                             # total revenue each
    .reset_index(name='total_revenue')                 # flatten + name column
    .sort_values('total_revenue', ascending=False)     # biggest first
    .head(5)                                           # top combinations only
)
print('Top 5 region-product combinations (revenue > $1000):')
print(result_good)
# assign() creates columns inline, so the pipeline never mutates df itself.
result = (
    df
    .assign(
        revenue_k=lambda d: d['revenue'] / 1000,
        is_high_value=lambda d: d['revenue'] > d['revenue'].quantile(0.9),
        month_num=lambda d: d['date'].dt.month,
    )
    .query('is_high_value')
    .groupby('region')
    .agg(
        high_value_count=('revenue', 'count'),
        avg_revenue_k=('revenue_k', 'mean'),
    )
    .round(2)
)
print('High-value transactions (top 10%) by region:')
print(result)
5. Apply — Custom Functions on DataFrames
# apply() on a Series: vectorized when possible, else row-wise
def classify_revenue(revenue: float) -> str:
    """Bucket a revenue figure into one of four ascending tiers."""
    for upper_bound, tier in ((500, 'Low'), (2000, 'Medium'), (5000, 'High')):
        if revenue < upper_bound:
            return tier
    return 'Premium'
# ⚠️ Gotcha: apply() is Python-level — slow for large DataFrames
# Prefer: pd.cut() or np.select() for classification tasks
# SLOW (avoid):
# df['revenue_tier'] = df['revenue'].apply(classify_revenue)
# FAST (preferred):
df['revenue_tier'] = pd.cut(
    df['revenue'],
    bins=[0, 500, 2000, 5000, float('inf')],
    labels=['Low', 'Medium', 'High', 'Premium'],
    # FIX: right=False makes the bins left-closed ([0, 500), [500, 2000), ...)
    # so boundary values agree with classify_revenue, which uses `revenue < 500`
    # etc. The pandas default (right=True) would label exactly 500 as 'Low'.
    right=False,
)
print(df['revenue_tier'].value_counts())
# When apply() is appropriate: complex multi-column logic
def compute_commission(row):
    """Commission for one sale: tier base rate plus a per-region bonus."""
    tier_rates = {'Low': 0.02, 'Medium': 0.05, 'High': 0.08, 'Premium': 0.12}
    regional_bonus = {'North': 0.01, 'South': 0.0, 'East': 0.015, 'West': 0.005}
    effective_rate = tier_rates[str(row['revenue_tier'])] + regional_bonus[row['region']]
    return round(row['revenue'] * effective_rate, 2)
# Row-wise apply is acceptable here: the rate depends on two columns at once
# (revenue_tier and region); axis=1 hands each row to compute_commission.
df['commission'] = df.apply(compute_commission, axis=1)
print(f'Total commission paid: ${df["commission"].sum():,.0f}')
print(f'Avg commission rate: {(df["commission"] / df["revenue"]).mean():.2%}')
6. Memory Optimization
def memory_usage_mb(df: pd.DataFrame) -> float:
    """Deep memory footprint of *df*, in megabytes (object contents included)."""
    total_bytes = df.memory_usage(deep=True).sum()
    return total_bytes / 1024**2
print(f'Before optimization: {memory_usage_mb(df):.2f} MB')
print()
print('Column dtypes before:')
print(df.dtypes)

df_opt = df.copy()

# 1. Low-cardinality strings -> category (one small int per row + a lookup table)
for cat_col in ['region', 'product', 'salesperson']:
    df_opt[cat_col] = df_opt[cat_col].astype('category')

# 2 & 3. Downcast numeric columns to the narrowest dtype that holds the data.
downcast_plan = {
    'integer': ['quantity', 'customer_id'],
    'float': ['unit_price', 'discount', 'revenue', 'commission'],
}
for target, cols in downcast_plan.items():
    for num_col in cols:
        df_opt[num_col] = pd.to_numeric(df_opt[num_col], downcast=target)

print(f'\nAfter optimization: {memory_usage_mb(df_opt):.2f} MB')
reduction = (1 - memory_usage_mb(df_opt) / memory_usage_mb(df)) * 100
print(f'Reduction: {reduction:.1f}%')
Common Gotchas
# 1. SettingWithCopyWarning — always use .copy() or .loc
subset = df[df['region'] == 'North'].copy()  # ✅
subset['new_col'] = 1  # no warning

# 2. groupby sorts group keys by default — disable for speed
df.groupby('region', sort=False)['revenue'].sum()  # faster on large data

# 3. merge adds suffixes on column name conflicts
# FIX: the original referenced undefined `left`/`right` frames and raised
# NameError at runtime — define a minimal demo pair so the cell actually runs.
left = pd.DataFrame({'id': [1, 2], 'value': [10, 20]})
right = pd.DataFrame({'id': [1, 2], 'value': [30, 40]})
result = left.merge(right, on='id')  # creates 'value_x', 'value_y'
# Fix: rename before merging, or use suffixes=('_left', '_right')

# 4. Period objects don't play well with Parquet — convert to string
df['month'] = df['date'].dt.to_period('M').astype(str)  # '2023-01'
Exercises

1. Find the top 3 salespeople in each region by total revenue using groupby + nlargest.
2. Add a rolling_30d_revenue column — the 30-day rolling sum of revenue per region.
3. Build a month-over-month revenue growth table using pct_change() after pivoting by month.
4. Optimize a 10-column DataFrame from float64 to float32 and category where appropriate — measure the memory savings.
5. Use pd.merge_asof to do a time-based merge between sales and a daily price table.