-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcreate_sample_data.py
More file actions
80 lines (68 loc) · 2.33 KB
/
create_sample_data.py
File metadata and controls
80 lines (68 loc) · 2.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
# Generate a synthetic customer table for a CAC/LTV model and save it as CSV.
#
# All randomness comes from NumPy's global legacy RNG, seeded once here so that
# repeated runs of the script produce the same file.
np.random.seed(42)

# Generate sample data
n_customers = 1000
regions = ['North America', 'Europe', 'Asia Pacific', 'Latin America', 'Africa', 'Middle East']
channels = ['Organic', 'Paid Search', 'Paid Social', 'Email', 'Referral', 'Direct']

# Region-based pricing tiers (monthly subscription fee options per region)
region_pricing = {
    'North America': [29.99, 49.99, 79.99],
    'Europe': [39.99, 59.99, 89.99],
    'Asia Pacific': [19.99, 39.99, 59.99],
    'Latin America': [14.99, 29.99, 49.99],
    'Africa': [9.99, 19.99, 39.99],
    'Middle East': [24.99, 44.99, 74.99],
}

# Channel CAC ranges as (low, high) bounds for the uniform marketing-spend draw
channel_cac = {
    'Organic': (30, 80),
    'Paid Search': (100, 250),
    'Paid Social': (80, 200),
    'Email': (50, 120),
    'Referral': (20, 60),
    'Direct': (40, 100),
}

start_date = datetime(2023, 1, 1)
output_path = 'cac_ltv_model.csv'


def _churn_probability(price_tier: float) -> float:
    """Return the churn probability for a monthly fee (higher for lower tiers)."""
    if price_tier < 30:
        return 0.6
    if price_tier < 60:
        return 0.4
    return 0.2


def generate_customers(count: int, start: datetime, first_id: int = 1001) -> list:
    """Build ``count`` synthetic customer records.

    Values are drawn from NumPy's global RNG (``np.random``), so the result is
    reproducible only if the caller seeds it first. The order of the draws
    inside the loop is significant: changing it changes the data generated
    for a given seed.

    Args:
        count: Number of customer records to generate; 0 yields an empty list.
        start: Earliest signup date; each signup falls 0-364 days after it.
        first_id: ``customer_id`` of the first record; later records count up.

    Returns:
        A list of dicts with the keys ``customer_id``, ``signup_date``
        (``YYYY-MM-DD`` string), ``region``, ``acquisition_channel``,
        ``monthly_subscription_fee``, ``churn_month`` (an integer from 1 to 12,
        or ``''`` when the customer did not churn), ``marketing_spend`` and
        ``sales_spend`` (both rounded to 2 decimals).
    """
    records = []
    for offset in range(count):
        signup_date = start + timedelta(days=np.random.randint(0, 365))
        region = np.random.choice(regions)
        channel = np.random.choice(channels)

        # Select price tier based on region
        price_tier = np.random.choice(region_pricing[region])

        # CAC components: sales spend is 20-50% of the marketing spend
        cac_low, cac_high = channel_cac[channel]
        marketing_spend = np.random.uniform(cac_low, cac_high)
        sales_spend = marketing_spend * np.random.uniform(0.2, 0.5)

        # Churn month is left blank for customers that did not churn. The
        # randint draw only happens for churned customers, as in the
        # original sequence of draws.
        if np.random.random() < _churn_probability(price_tier):
            churn_month = np.random.randint(1, 13)  # upper bound is exclusive
        else:
            churn_month = ''

        records.append({
            'customer_id': first_id + offset,
            'signup_date': signup_date.strftime('%Y-%m-%d'),
            'region': region,
            'acquisition_channel': channel,
            'monthly_subscription_fee': price_tier,
            'churn_month': churn_month,
            'marketing_spend': round(marketing_spend, 2),
            'sales_spend': round(sales_spend, 2),
        })
    return records


data = generate_customers(n_customers, start_date)

# Create DataFrame and save to CSV
df = pd.DataFrame(data)
df.to_csv(output_path, index=False)

print(f"Created sample data with {len(df)} customers")
print(f"Saved to {output_path}")
print("\nSample data:")
print(df.head(10))