Useful Data Tips

Pandas GroupBy Explained: Complete Guide with Examples

⏱️ 35 sec read 🐍 Python

GroupBy is one of pandas' most powerful features for data aggregation. It follows the split-apply-combine pattern to transform your data efficiently:

1. Basic GroupBy Operations

import pandas as pd

# Sample data
df = pd.DataFrame({
    'category': ['A', 'B', 'A', 'B', 'A', 'C'],
    'value': [10, 20, 15, 25, 12, 30],
    'quantity': [1, 2, 3, 4, 5, 6]
})

# Group by single column and aggregate ✅
df.groupby('category')['value'].sum()
# A    37
# B    45
# C    30

# Multiple aggregations
df.groupby('category')['value'].agg(['sum', 'mean', 'count'])

# Group and get all statistics
df.groupby('category')['value'].describe()

2. Common Aggregation Functions

# Sum
df.groupby('category')['value'].sum()

# Mean (average)
df.groupby('category')['value'].mean()

# Count
df.groupby('category')['value'].count()

# Min and Max
df.groupby('category')['value'].min()
df.groupby('category')['value'].max()

# Standard deviation
df.groupby('category')['value'].std()

# First and Last values
df.groupby('category')['value'].first()
df.groupby('category')['value'].last()

# Median
df.groupby('category')['value'].median()

3. Group by Multiple Columns

sales = pd.DataFrame({
    'region': ['East', 'East', 'West', 'West', 'East'],
    'product': ['A', 'B', 'A', 'B', 'A'],
    'sales': [100, 150, 120, 200, 110]
})

# Group by region and product
result = sales.groupby(['region', 'product'])['sales'].sum()
# region  product
# East    A          210
#         B          150
# West    A          120
#         B          200
# Name: sales, dtype: int64

# Reset index to get normal DataFrame
result = result.reset_index()

# Alternative: use as_index=False to keep the group keys as regular columns
sales.groupby(['region', 'product'], as_index=False)['sales'].sum()

4. Multiple Aggregations with agg()

# Multiple functions on single column
df.groupby('category')['value'].agg(['sum', 'mean', 'std', 'min', 'max'])

# Different functions on different columns ✅
result = df.groupby('category').agg({
    'value': ['sum', 'mean'],
    'quantity': ['sum', 'count']
})

# Named aggregations (pandas 0.25+)
result = df.groupby('category').agg(
    total_value=('value', 'sum'),
    avg_value=('value', 'mean'),
    total_quantity=('quantity', 'sum'),
    count=('value', 'count')
)

# Custom aggregation function
df.groupby('category')['value'].agg(lambda x: x.max() - x.min())

5. Transform - Keep Original Shape

# Calculate group mean and broadcast to all rows
df['category_mean'] = df.groupby('category')['value'].transform('mean')

# Calculate percentage of group total
df['pct_of_group'] = df['value'] / df.groupby('category')['value'].transform('sum')

# Z-score within groups
df['z_score'] = (df['value'] - df.groupby('category')['value'].transform('mean')) / \
                df.groupby('category')['value'].transform('std')

# Rank within groups
df['rank_in_group'] = df.groupby('category')['value'].rank(ascending=False)

6. Filter Groups

# Keep only groups with sum > 50
df.groupby('category').filter(lambda x: x['value'].sum() > 50)

# Keep groups with more than 2 members
df.groupby('category').filter(lambda x: len(x) > 2)

# Keep groups where max value > 20
df.groupby('category').filter(lambda x: x['value'].max() > 20)

7. Apply Custom Functions

# Apply custom function to each group
def custom_stats(group):
    """Summarize one group: total, spread (max - min), and row count."""
    values = group['value']
    return pd.Series({
        'sum': values.sum(),
        'range': values.max() - values.min(),
        'count': len(group),
    })

result = df.groupby('category').apply(custom_stats)

# Apply function that returns DataFrame
def top_2(group, n=2):
    """Return the n rows of *group* with the largest 'value' (default 2).

    The hard-coded 2 is now a parameter so the same helper works for any
    top-n selection; existing callers (``df.groupby(...).apply(top_2)``)
    are unaffected.
    """
    return group.nlargest(n, 'value')

df.groupby('category').apply(top_2)

8. Practical Examples

# Example: Sales analysis
sales_data = pd.DataFrame({
    'date': pd.date_range('2024-01-01', periods=100),
    'product': ['A', 'B', 'C'] * 33 + ['A'],
    'sales': range(100, 200),
    'region': ['North', 'South'] * 50
})

# Total sales by product
sales_data.groupby('product')['sales'].sum()

# Average sales by region and product
sales_data.groupby(['region', 'product'])['sales'].mean()

# Sales by month
sales_data['month'] = sales_data['date'].dt.month
monthly_sales = sales_data.groupby('month')['sales'].sum()

# Top selling product per region
sales_data.groupby('region').apply(
    lambda x: x.nlargest(1, 'sales')[['product', 'sales']]
)

# Cumulative sum within groups
sales_data['cumulative_sales'] = sales_data.groupby('product')['sales'].cumsum()

# Running average
sales_data['running_avg'] = sales_data.groupby('product')['sales'].expanding().mean().reset_index(level=0, drop=True)

9. Performance Tips

# Use categorical data type for groupby columns (faster)
df['category'] = df['category'].astype('category')
df.groupby('category')['value'].sum()  # Much faster!

# Use sort=False if you don't need sorted results
df.groupby('category', sort=False)['value'].sum()  # Faster

# Use as_index=False to avoid reset_index() later
df.groupby('category', as_index=False)['value'].sum()

# Avoid apply() when possible - use built-in functions
# Slow ❌
df.groupby('category').apply(lambda x: x['value'].sum())

# Fast ✅
df.groupby('category')['value'].sum()

10. Advanced Patterns

# Percent of total for each group
df['pct_of_total'] = df.groupby('category')['value'].transform(
    lambda x: x / x.sum() * 100
)

# Group statistics
group_stats = df.groupby('category').agg({
    'value': ['sum', 'mean', 'std', 'count'],
    'quantity': 'sum'
})

# Flatten multi-level columns
group_stats.columns = ['_'.join(col).strip() for col in group_stats.columns]

# Pivot-like with groupby
result = df.groupby(['category', 'quantity'])['value'].sum().unstack()

# Multiple groups with different operations
df.groupby('category').agg(
    total_value=('value', 'sum'),
    avg_value=('value', 'mean'),
    max_quantity=('quantity', 'max'),
    count=('value', 'size')  # Use 'size' to count all rows including NaN
)

Best Practices

Pro Tip: Use df.groupby('col', observed=True) when grouping by categorical columns to only include categories that actually appear in the data. This can significantly speed up operations on large datasets with many unused categories!

← Back to Python Tips