-
Notifications
You must be signed in to change notification settings - Fork 1
Examples
Yasser Mustafa edited this page Feb 15, 2026
·
1 revision
Real-world examples of using PipeFrame for data analysis.
from pipeframe import *
cleaned_data = (raw_data
# Remove missing values
>> drop_na('critical_column')
# Remove duplicates
>> distinct()
# Fix data types
>> define(
date='pd.to_datetime(date)',
amount='amount.astype(float)'
)
# Remove outliers
>> filter('amount > 0 & amount < 10000')
# Standardize text
>> define(category='category.str.upper().str.strip()')
)Analyze product sales by region and quarter:
from pipeframe import *
sales_summary = (sales_data
# Data preparation
>> filter('date >= "2024-01-01" & revenue > 0')
>> define(
quarter='pd.to_datetime(date).dt.quarter',
year='pd.to_datetime(date).dt.year',
profit='revenue - cost',
margin='(profit / revenue) * 100'
)
# Grouping and aggregation
>> group_by('region', 'product', 'quarter')
>> summarize(
total_revenue='sum(revenue)',
total_profit='sum(profit)',
avg_margin='mean(margin)',
num_transactions='count()'
)
# Filtering and sorting
>> filter('total_revenue > 10000')
>> arrange('region', 'quarter', '-total_revenue')
# Final selection
>> select('region', 'product', 'quarter',
'total_revenue', 'total_profit', 'avg_margin')
)
print(sales_summary)Segment customers based on purchase behavior:
from pipeframe import *
customer_segments = (transactions
# Calculate customer metrics
>> group_by('customer_id')
>> summarize(
total_spent='sum(amount)',
num_purchases='count()',
avg_purchase='mean(amount)',
last_purchase='max(date)'
)
# Create segments
>> define(
value_segment='''
"High" if total_spent > 1000 else
"Medium" if total_spent > 500 else
"Low"
''',
frequency_segment='''
"Frequent" if num_purchases > 10 else
"Occasional" if num_purchases > 3 else
"Rare"
'''
)
# Analyze segments
>> group_by('value_segment', 'frequency_segment')
>> summarize(
customer_count='count()',
avg_spent='mean(total_spent)',
avg_purchases='mean(num_purchases)'
)
>> arrange('-customer_count')
)Analyze trends over time:
from pipeframe import *
monthly_trends = (daily_data
# Prepare time data
>> define(
date='pd.to_datetime(date)',
year_month='date.dt.to_period("M")'
)
# Monthly aggregation
>> group_by('year_month', 'category')
>> summarize(
total_sales='sum(sales)',
avg_daily_sales='mean(sales)',
max_sales='max(sales)',
min_sales='min(sales)'
)
# Calculate month-over-month growth
>> arrange('category', 'year_month')
>> group_by('category')
>> define(mom_growth='(total_sales / total_sales.shift(1) - 1) * 100')
# Filter recent months
>> filter('year_month >= "2024-01"')
)Combine multiple datasets:
from pipeframe import *
# Simple join
combined = (orders
>> left_join(customers, on='customer_id')
>> left_join(products, on='product_id')
)
# Join and aggregate
result = (orders
>> inner_join(customers, on='customer_id')
>> filter('customer_age > 25')
>> group_by('customer_city')
>> summarize(
total_orders='count()',
total_revenue='sum(order_amount)',
unique_customers='customer_id.nunique()'
)
>> arrange('-total_revenue')
)A complete end-to-end analysis:
from pipeframe import *
final_report = (raw_sales
# 1. Data Cleaning
>> filter('date.notna() & amount > 0')
>> drop_na('customer_id', 'product_id')
>> distinct()
# 2. Feature Engineering
>> define(
date='pd.to_datetime(date)',
year='date.dt.year',
quarter='date.dt.quarter',
is_weekend='date.dt.dayofweek >= 5',
revenue_category='''
"Premium" if amount > 1000 else
"Standard" if amount > 100 else
"Budget"
'''
)
# 3. Join with reference data
>> left_join(products, on='product_id')
>> left_join(customers, on='customer_id')
# 4. Filter relevant data
>> filter('year == 2024 & customer_status == "Active"')
# 5. Grouping and aggregation
>> group_by('product_category', 'quarter', 'revenue_category')
>> summarize(
total_revenue='sum(amount)',
total_transactions='count()',
unique_customers='customer_id.nunique()',
avg_transaction='mean(amount)',
weekend_sales='sum(amount * is_weekend)'
)
# 6. Calculate percentages
>> define(
weekend_pct='(weekend_sales / total_revenue) * 100',
avg_per_customer='total_revenue / unique_customers'
)
# 7. Final sorting and selection
>> arrange('quarter', '-total_revenue')
>> select('product_category', 'quarter', 'revenue_category',
'total_revenue', 'total_transactions',
'unique_customers', 'weekend_pct')
# 8. Top results only
>> head(20)
)
# Export results
final_report.to_csv('quarterly_report.csv', index=False)- One step at a time: Test each operation before adding the next
-
Use meaningful names:
total_revenueis better thanrev - Comment complex logic: Help future you understand the code
- Chain related operations: Group logical steps together
-
Use
peek(): Debug by viewing intermediate results
- Quick Start Guide - Learn the basics
- API Reference - All available functions
- FAQ - Common questions