CAC_LTV_Model_Analysis/generate_visualizations.py at main · 419vive/CAC_LTV_Model_Analysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#!/usr/bin/env python3
"""
Generate CAC-LTV Visualizations Based on YOUR Actual Data
This replaces the demo visualizations with accurate ones
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence library warnings so console output is limited to the progress
# messages printed by this script.
warnings.filterwarnings("ignore")

# Set style for professional visualizations.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*".
# Older releases only know the unprefixed name and raise OSError for the new
# one, so fall back instead of aborting before any chart is drawn.
try:
    plt.style.use('seaborn-v0_8-darkgrid')
except OSError:
    plt.style.use('seaborn-darkgrid')
sns.set_palette("husl")

print("="*60)
print("GENERATING VISUALIZATIONS FROM YOUR DATA")
print("="*60)

# Modelling assumptions, named so they can be audited and tuned in one place.
GROSS_MARGIN = 0.75            # flat margin applied to every customer
DEFAULT_MONTHLY_CHURN = 0.08   # rate used when no churn month is recorded
MAX_MONTHLY_CHURN = 1.0        # a monthly churn rate cannot exceed 100%
MIN_MONTHLY_CHURN = 0.0001     # stand-in for a zero rate so LTV stays finite

# Load your data
df = pd.read_csv('cac_ltv_model.csv')

# Calculate necessary metrics
df['total_cac'] = df['marketing_spend'] + df['sales_spend']
df['arpu'] = df['monthly_subscription_fee']
df['gross_margin'] = GROSS_MARGIN

# Customers with a recorded churn month get a monthly rate of 1 / churn_month.
# A churn month below 1 (including 0, which divides to +inf) would imply a
# rate above 100%: that zeroes LTV and makes the int() conversion in the
# cohort retention simulation overflow, so the implied rate is capped.
# Rows without a churn month stay NaN here and receive the default below.
implied_churn = (1 / df['churn_month']).clip(upper=MAX_MONTHLY_CHURN)
df['churn_rate'] = np.where(
    df['churn_month'].notna(),
    implied_churn,
    DEFAULT_MONTHLY_CHURN
)
# LTV = (ARPU * gross margin) / churn; swap an exact-zero rate for a tiny one
# so the division cannot blow up.
df['churn_rate_adj'] = df['churn_rate'].replace(0, MIN_MONTHLY_CHURN)
df['ltv'] = (df['arpu'] * df['gross_margin']) / df['churn_rate_adj']

# Convert signup_date to datetime and bucket customers by signup month
df['signup_date'] = pd.to_datetime(df['signup_date'])
df['cohort_month'] = df['signup_date'].dt.to_period('M')

print("\n1. Creating Cohort Retention Heatmap...")

# Simulate retention for cohorts: every signup-month cohort starts at its
# observed size and decays geometrically at that cohort's mean churn rate.
cohort_sizes = df.groupby('cohort_month').size()
tracked_months = range(13)  # month 0 (acquisition) through month 12


def _survivors_by_month(cohort):
    """Return the simulated customer count of `cohort` for each tracked month."""
    members = df[df['cohort_month'] == cohort]
    headcount = len(members)
    monthly_churn = members['churn_rate'].mean()
    # int() truncates, so partial customers are dropped rather than rounded.
    return [int(headcount * (1 - monthly_churn) ** elapsed)
            for elapsed in tracked_months]


# One column per cohort, one row per month since acquisition.
retention_matrix = pd.DataFrame(
    {str(cohort): _survivors_by_month(cohort) for cohort in cohort_sizes.index}
)

# Plot 1: Cohort Retention Heatmap (transposed so cohorts run down the y-axis)
cohorts_as_rows = retention_matrix.T
plt.figure(figsize=(14, 8))
sns.heatmap(
    cohorts_as_rows,
    annot=True,
    fmt='d',
    cmap='YlGnBu',
    cbar_kws={'label': 'Customers Remaining'},
)
plt.title('Customer Retention by Cohort Month (YOUR DATA)',
          fontsize=16, fontweight='bold')
plt.xlabel('Months Since Acquisition', fontsize=12)
plt.ylabel('Cohort Month', fontsize=12)
plt.tight_layout()
plt.savefig('plot1_cohort_retention.png', dpi=150, bbox_inches='tight')
print("   ✓ Saved: plot1_cohort_retention.png")

# Plot 2: LTV vs CAC by Channel
print("\n2. Creating LTV vs CAC by Channel...")

# Mean LTV and mean CAC per acquisition channel, rounded to cents.
channel_metrics = df.groupby('acquisition_channel').agg({
    'ltv': 'mean',
    'total_cac': 'mean'
}).round(2)

plt.figure(figsize=(10, 6))
x = np.arange(len(channel_metrics.index))
width = 0.35  # bar width; each pair is centred on its channel's tick

# TODO(human): Customize these colors if you prefer different ones
ltv_bars = plt.bar(x - width/2, channel_metrics['ltv'], width,
                   label='LTV', color='#2E7D32', alpha=0.8)
cac_bars = plt.bar(x + width/2, channel_metrics['total_cac'], width,
                   label='CAC', color='#D84315', alpha=0.8)

# Add value labels on bars. bar_label offsets the text in points rather than
# in data units, so the labels sit just above the bars whatever the dollar
# scale of the data is (a fixed "+10" offset floats or overlaps otherwise).
for bar_group in (ltv_bars, cac_bars):
    plt.bar_label(bar_group, fmt='$%.0f', padding=3, fontsize=9)
# Leave headroom above the tallest bar for its label.
plt.margins(y=0.1)

plt.xlabel('Acquisition Channel', fontsize=12)
plt.ylabel('Amount ($)', fontsize=12)
plt.title('LTV vs CAC by Acquisition Channel (YOUR DATA)', fontsize=14, fontweight='bold')
plt.xticks(x, channel_metrics.index, rotation=45, ha='right')
plt.legend(loc='upper left')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('plot2_ltv_vs_cac.png', dpi=150, bbox_inches='tight')
print("   ✓ Saved: plot2_ltv_vs_cac.png")

# Plot 3: LTV:CAC Ratio by Channel
print("\n3. Creating LTV:CAC Ratio chart...")

TARGET_RATIO = 3      # "Target" reference line; bars at or above it are green
WARNING_RATIO = 2     # bars from here up to the target are orange, below red
BREAKEVEN_RATIO = 1   # "Breakeven" reference line

# A channel whose average CAC is zero divides to +/-inf. Infinite values
# cannot be drawn and make plt.ylim() raise, so they are treated as missing.
raw_ratio = channel_metrics['ltv'] / channel_metrics['total_cac']
channel_metrics['ltv_cac_ratio'] = raw_ratio.replace([np.inf, -np.inf], np.nan)
# Best channel first; sort_values places missing ratios at the end.
channel_metrics = channel_metrics.sort_values('ltv_cac_ratio', ascending=False)

plt.figure(figsize=(10, 6))
# A missing ratio fails both comparisons and therefore falls through to red.
colors = ['#2E7D32' if ratio >= TARGET_RATIO
          else '#FFA726' if ratio >= WARNING_RATIO
          else '#D84315'
          for ratio in channel_metrics['ltv_cac_ratio']]
bars = plt.bar(range(len(channel_metrics)), channel_metrics['ltv_cac_ratio'], color=colors)

# Add value labels and reference lines
for i, ratio in enumerate(channel_metrics['ltv_cac_ratio']):
    if pd.isna(ratio):
        # No bar is drawn for an undefined ratio; mark the slot explicitly.
        plt.text(i, 0.1, 'n/a', ha='center', fontweight='bold')
    else:
        plt.text(i, ratio + 0.1, f'{ratio:.1f}x', ha='center', fontweight='bold')

plt.axhline(y=TARGET_RATIO, color='green', linestyle='--', alpha=0.5,
            label=f'Target ({TARGET_RATIO}x)')
plt.axhline(y=BREAKEVEN_RATIO, color='red', linestyle='--', alpha=0.5,
            label=f'Breakeven ({BREAKEVEN_RATIO}x)')

plt.xlabel('Acquisition Channel', fontsize=12)
plt.ylabel('LTV:CAC Ratio', fontsize=12)
plt.title('LTV:CAC Ratio by Channel (YOUR DATA)', fontsize=14, fontweight='bold')
plt.xticks(range(len(channel_metrics)), channel_metrics.index, rotation=45, ha='right')
plt.legend(loc='upper right')
plt.grid(True, alpha=0.3, axis='y')

# Series.max() skips missing values (the builtin max() is order-dependent with
# NaN). Keep the target line inside the axes even when every channel is below it.
tallest_ratio = channel_metrics['ltv_cac_ratio'].max()
if pd.isna(tallest_ratio):
    tallest_ratio = 0
plt.ylim(0, max(tallest_ratio, TARGET_RATIO) * 1.2)
plt.tight_layout()
plt.savefig('plot3_ltv_cac_ratio.png', dpi=150, bbox_inches='tight')
print("   ✓ Saved: plot3_ltv_cac_ratio.png")

# Plot 4: ARPU by Region
print("\n4. Creating ARPU by Region chart...")

# Mean monthly subscription fee per region, sorted ascending.
arpu_by_region = df.groupby('region')['monthly_subscription_fee'].mean().sort_values()
region_count = len(arpu_by_region)
row_positions = range(region_count)

plt.figure(figsize=(10, 6))
# One viridis shade per region, sampled from the 0.3-0.9 part of the colormap.
colors = plt.cm.viridis(np.linspace(0.3, 0.9, region_count))
bars = plt.barh(row_positions, arpu_by_region.values, color=colors)

# Add value labels just to the right of each bar's end
for row, region_arpu in zip(row_positions, arpu_by_region.values):
    plt.text(region_arpu + 1, row, f'${region_arpu:.2f}',
             va='center', fontweight='bold')

plt.title('Average ARPU by Region (YOUR DATA)', fontsize=14, fontweight='bold')
plt.xlabel('Average Revenue Per User ($)', fontsize=12)
plt.ylabel('Region', fontsize=12)
plt.yticks(row_positions, arpu_by_region.index)
plt.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.savefig('plot4_arpu_by_region.png', dpi=150, bbox_inches='tight')
print("   ✓ Saved: plot4_arpu_by_region.png")

# Summary statistics: recap the headline numbers behind the four charts.
rule = "=" * 60
print("\n" + rule)
print("VISUALIZATION SUMMARY")
print(rule)
print("\nKey Metrics Visualized:")

mean_ltv = df['ltv'].mean()
print(f"  • Average LTV: ${mean_ltv:.2f}")

mean_cac = df['total_cac'].mean()
print(f"  • Average CAC: ${mean_cac:.2f}")

# Highest per-channel ratio together with the channel it belongs to.
ratio_by_channel = channel_metrics['ltv_cac_ratio']
print(f"  • Best LTV:CAC Ratio: {ratio_by_channel.max():.1f}x ({ratio_by_channel.idxmax()})")

lowest_arpu = arpu_by_region.min()
highest_arpu = arpu_by_region.max()
print(f"  • ARPU Range: ${lowest_arpu:.2f} - ${highest_arpu:.2f}")

print(f"  • Regions: {len(arpu_by_region)} markets")
print(f"  • Channels: {len(channel_metrics)} acquisition sources")

print("\nAll visualizations have been generated successfully!")
print("Files created:")
for saved_plot in (
    "plot1_cohort_retention.png",
    "plot2_ltv_vs_cac.png",
    "plot3_ltv_cac_ratio.png",
    "plot4_arpu_by_region.png",
):
    print(f"  ✓ {saved_plot}")