Pandas Fundamentals: The Operations Every Data Scientist Must Know
Beyond read_csv and head() — master groupby, merge, pivot, method chaining, and memory optimization to write pandas code that's fast, readable, and production-ready.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
pd.set_option('display.max_columns', 20)
pd.set_option('display.float_format', '{:.2f}'.format)

# Synthetic sales dataset we'll use throughout.
np.random.seed(42)
n = 1000
df = pd.DataFrame({
    # FIX: dropped the no-op `.repeat(1)[:n]` — date_range with periods=n
    # already yields exactly n daily timestamps.
    'date': pd.date_range('2023-01-01', periods=n, freq='D'),
    'region': np.random.choice(['North', 'South', 'East', 'West'], n),
    'product': np.random.choice(['Laptop', 'Phone', 'Tablet', 'Watch'], n),
    'salesperson': np.random.choice([f'Rep_{i}' for i in range(1, 11)], n),
    'quantity': np.random.randint(1, 20, n),
    'unit_price': np.random.choice([299, 499, 799, 1299, 1999], n).astype(float),
    'discount': np.random.uniform(0, 0.3, n).round(2),
    'customer_id': np.random.randint(1000, 2000, n),
})
# Net revenue after discount; month as a Period for calendar-based grouping.
df['revenue'] = (df['quantity'] * df['unit_price'] * (1 - df['discount'])).round(2)
df['month'] = df['date'].dt.to_period('M')
# FIX: restored the multiplication sign that was garbled by a bad encoding round-trip.
print(f'Dataset: {df.shape[0]} rows × {df.shape[1]} columns')
df.head()
1. GroupBy — The Most Powerful Aggregation Tool
# Basic groupby: one pass over the data, three aggregates, named result columns.
revenue_by_region = df.groupby('region')['revenue'].agg(
    total_revenue='sum',
    avg_revenue='mean',
    transactions='count',
)
print('Revenue by region:')
print(revenue_by_region.sort_values('total_revenue', ascending=False))
# Multi-level groupby with named aggregations (pandas 0.25+):
# the spec maps output-column name -> (source column, aggregation).
agg_spec = {
    'total_revenue': ('revenue', 'sum'),
    'avg_quantity': ('quantity', 'mean'),
    'max_discount': ('discount', 'max'),
    'unique_customers': ('customer_id', 'nunique'),
    'transactions': ('revenue', 'count'),
}
summary = (
    df.groupby(['region', 'product'])
      .agg(**agg_spec)
      .round(2)
      .reset_index()
)
print(summary.head(8))
# transform() broadcasts a group-level statistic back onto every row,
# so the frame keeps its original shape.
# Use case: each sale's share of its region's total revenue.
df['region_total'] = df.groupby('region')['revenue'].transform('sum')
df['pct_of_region'] = df['revenue'].div(df['region_total']).mul(100).round(2)
print('Top 5 transactions by % of regional revenue:')
top_share = df[['region', 'product', 'revenue', 'pct_of_region']]
print(top_share.sort_values('pct_of_region', ascending=False).head())
# filter() drops entire groups that fail a predicate.
# Here: keep rows only from regions with more than 250 transactions.
min_transactions = 250
high_volume_regions = df.groupby('region').filter(
    lambda grp: grp.shape[0] > min_transactions
)
print(f'Original: {len(df)} rows')
print(f'High volume regions only: {len(high_volume_regions)} rows')
print(f'Regions included: {high_volume_regions["region"].unique()}')
2. Merge — Joining DataFrames Like a Pro
# Two lookup tables to join against the sales data.
customers = pd.DataFrame({
    'customer_id': range(1000, 2000),
    'tier': np.random.choice(['Bronze', 'Silver', 'Gold', 'Platinum'], 1000),
    'signup_year': np.random.choice([2019, 2020, 2021, 2022, 2023], 1000),
})
region_targets = pd.DataFrame({
    'region': ['North', 'South', 'East', 'West'],
    'q4_target': [500_000, 450_000, 600_000, 520_000],
})

# Left join: keep every sale, whether or not a customer record matches.
merged = pd.merge(df, customers, on='customer_id', how='left')
print(f'After left join: {merged.shape}')

# Default inner join on 'region'; every sale matches one of the four targets.
merged2 = pd.merge(merged, region_targets, on='region')
print(f'After adding targets: {merged2.shape}')
# Diagnose merge issues: check for duplicates and non-matches
def merge_diagnostic(left_df, right_df, on, how='left'):
    """Run a merge and report row-count symptoms before you trust it.

    Parameters
    ----------
    left_df, right_df : pd.DataFrame
        Tables to join.
    on : label or list of labels
        Join key(s), passed straight through to DataFrame.merge.
    how : str, default 'left'
        Join type, passed straight through to DataFrame.merge.

    Returns
    -------
    pd.DataFrame
        The merged result (always returned, even when a warning fires).
    """
    left_n = len(left_df)
    right_n = len(right_df)
    # indicator=True tags each row's provenance so non-matches are countable.
    # NOTE(review): assumes neither input already has a '_merge' column.
    result = left_df.merge(right_df, on=on, how=how, indicator=True)
    unmatched = int((result['_merge'] == 'left_only').sum())
    result = result.drop(columns='_merge')
    print(f'Left rows: {left_n:,}')
    print(f'Right rows: {right_n:,}')
    print(f'Result rows: {len(result):,}')
    if len(result) > left_n:
        # The result can only outgrow the left side if the right table
        # repeats join keys.
        print(f'⚠️ DUPLICATES: result is {len(result) - left_n} rows larger than left!')
        print('   → Right table has duplicate keys. Deduplicate before merging.')
    elif unmatched and how == 'left':
        # FIX: the original compared len(result) < left_n, which can never be
        # true for a left join (unmatched rows are kept with NaNs, not dropped),
        # so the branch was unreachable. Count 'left_only' rows instead.
        print(f'⚠️ NON-MATCHES: {unmatched} rows did not find a match.')
    else:
        # FIX: the original success message was a mojibake'd emoji split across
        # two source lines — an unterminated string literal (SyntaxError).
        print('✅ Merge looks clean.')
    return result
_ = merge_diagnostic(df, customers, on='customer_id')
3. Pivot Tables — Reshaping Data
# pivot_table reshapes long data into a region-by-product revenue matrix.
pivot = df.pivot_table(
    values='revenue',
    index='region',
    columns='product',
    aggfunc='sum',
    margins=True,         # add a row and column of totals
    margins_name='Total'
).round(0)
# FIX: restored the multiplication sign that was garbled by a bad encoding
# round-trip in the original print string.
print('Revenue: Region × Product')
print(pivot)
# Visualize the pivot as a heatmap, with the totals removed so the color
# scale reflects individual cells rather than the grand totals.
plt.figure(figsize=(10, 5))
pivot_data = pivot.drop(index='Total', columns='Total')
sns.heatmap(
    pivot_data,
    annot=True,
    fmt='.0f',
    cmap='YlOrRd',
    cbar_kws={'label': 'Revenue ($)'},
)
plt.title('Revenue by Region and Product')
plt.tight_layout()
plt.show()
4. Method Chaining — Readable Transformations
# Without chaining: messy, lots of temp variables
# (kept deliberately un-refactored as the "before" example — each temp
# name exists only to feed the very next line and is never reused)
temp1 = df[df['revenue'] > 1000]
temp2 = temp1.groupby(['region', 'product'])['revenue'].sum().reset_index()
temp3 = temp2.rename(columns={'revenue': 'total_revenue'})
temp4 = temp3.sort_values('total_revenue', ascending=False)
result_bad = temp4.head(5)
# With chaining: one readable pipeline, no throwaway names.
result_good = (
    df
    .query('revenue > 1000')                           # keep big-ticket sales
    .groupby(['region', 'product'])['revenue']         # one series per combo
    .sum()                                             # total revenue each
    .reset_index(name='total_revenue')                 # flatten + name column
    .sort_values('total_revenue', ascending=False)     # biggest first
    .head(5)                                           # top combinations only
)
print('Top 5 region-product combinations (revenue > $1000):')
print(result_good)
# assign() creates columns inline, so the pipeline never mutates df itself.
result = (
    df
    .assign(
        revenue_k=lambda d: d['revenue'] / 1000,
        is_high_value=lambda d: d['revenue'] > d['revenue'].quantile(0.9),
        month_num=lambda d: d['date'].dt.month,
    )
    .query('is_high_value')
    .groupby('region')
    .agg(
        high_value_count=('revenue', 'count'),
        avg_revenue_k=('revenue_k', 'mean'),
    )
    .round(2)
)
print('High-value transactions (top 10%) by region:')
print(result)
5. Apply — Custom Functions on DataFrames
# apply() on a Series: vectorized when possible, else row-wise
def classify_revenue(revenue: float) -> str:
    """Bucket a revenue figure into one of four ascending tiers."""
    for upper_bound, tier in ((500, 'Low'), (2000, 'Medium'), (5000, 'High')):
        if revenue < upper_bound:
            return tier
    return 'Premium'
# ⚠️ Gotcha: apply() is Python-level — slow for large DataFrames
# Prefer: pd.cut() or np.select() for classification tasks
# SLOW (avoid):
# df['revenue_tier'] = df['revenue'].apply(classify_revenue)
# FAST (preferred):
df['revenue_tier'] = pd.cut(
    df['revenue'],
    bins=[0, 500, 2000, 5000, float('inf')],
    labels=['Low', 'Medium', 'High', 'Premium'],
    # FIX: right=False makes the bins left-closed ([0, 500), [500, 2000), ...)
    # so boundary values agree with classify_revenue, which uses `revenue < 500`
    # etc. The pandas default (right=True) would label exactly 500 as 'Low'.
    right=False,
)
print(df['revenue_tier'].value_counts())
# When apply() is appropriate: complex multi-column logic
def compute_commission(row):
    """Commission for one sale: tier base rate plus a per-region bonus."""
    tier_rates = {'Low': 0.02, 'Medium': 0.05, 'High': 0.08, 'Premium': 0.12}
    regional_bonus = {'North': 0.01, 'South': 0.0, 'East': 0.015, 'West': 0.005}
    effective_rate = tier_rates[str(row['revenue_tier'])] + regional_bonus[row['region']]
    return round(row['revenue'] * effective_rate, 2)
# Row-wise apply is acceptable here: the rate depends on two columns at once
# (revenue_tier and region); axis=1 hands each row to compute_commission.
df['commission'] = df.apply(compute_commission, axis=1)
print(f'Total commission paid: ${df["commission"].sum():,.0f}')
print(f'Avg commission rate: {(df["commission"] / df["revenue"]).mean():.2%}')
6. Memory Optimization
def memory_usage_mb(df: pd.DataFrame) -> float:
    """Deep memory footprint of *df*, in megabytes (object contents included)."""
    total_bytes = df.memory_usage(deep=True).sum()
    return total_bytes / 1024**2
print(f'Before optimization: {memory_usage_mb(df):.2f} MB')
print()
print('Column dtypes before:')
print(df.dtypes)

df_opt = df.copy()

# 1. Low-cardinality strings -> category (one small int per row + a lookup table)
for cat_col in ['region', 'product', 'salesperson']:
    df_opt[cat_col] = df_opt[cat_col].astype('category')

# 2 & 3. Downcast numeric columns to the narrowest dtype that holds the data.
downcast_plan = {
    'integer': ['quantity', 'customer_id'],
    'float': ['unit_price', 'discount', 'revenue', 'commission'],
}
for target, cols in downcast_plan.items():
    for num_col in cols:
        df_opt[num_col] = pd.to_numeric(df_opt[num_col], downcast=target)

print(f'\nAfter optimization: {memory_usage_mb(df_opt):.2f} MB')
reduction = (1 - memory_usage_mb(df_opt) / memory_usage_mb(df)) * 100
print(f'Reduction: {reduction:.1f}%')
Common Gotchas
# 1. SettingWithCopyWarning — always use .copy() or .loc
subset = df[df['region'] == 'North'].copy()  # ✅
subset['new_col'] = 1  # no warning

# 2. groupby sorts group keys by default — disable for speed
df.groupby('region', sort=False)['revenue'].sum()  # faster on large data

# 3. merge adds suffixes on column name conflicts
# FIX: the original referenced undefined `left`/`right` frames and raised
# NameError at runtime — define a minimal demo pair so the cell actually runs.
left = pd.DataFrame({'id': [1, 2], 'value': [10, 20]})
right = pd.DataFrame({'id': [1, 2], 'value': [30, 40]})
result = left.merge(right, on='id')  # creates 'value_x', 'value_y'
# Fix: rename before merging, or use suffixes=('_left', '_right')

# 4. Period objects don't play well with Parquet — convert to string
df['month'] = df['date'].dt.to_period('M').astype(str)  # '2023-01'
Exercises

1. Find the top 3 salespeople in each region by total revenue using groupby + nlargest.
2. Add a rolling_30d_revenue column — the 30-day rolling sum of revenue per region.
3. Build a month-over-month revenue growth table using pct_change() after pivoting by month.
4. Optimize a 10-column DataFrame from float64 to float32 and category where appropriate — measure the memory savings.
5. Use pd.merge_asof to do a time-based merge between sales and a daily price table.