Skip to content

Commit e9f6be9

Browse files
authored
Merge pull request #186 from Leothosine/fix/issue-143-real-analytics-training-data
feat: replace synthetic training data with feature extraction from real analytics data
2 parents e207aaf + f583863 commit e9f6be9

1 file changed

Lines changed: 59 additions & 0 deletions

File tree

src/analytics_training_data.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
"""Extract ML training features from real analytics data instead of synthetic data.
2+
3+
Builds (X, y) arrays from AnalyticsStats rows (aggregated scan, transfer, and invalid-attempt counts)
4+
stored in the database so the scalper-detection model trains on actual behaviour.
5+
"""
6+
from typing import Any, List, Tuple
7+
8+
import numpy as np # type: ignore[import-untyped]
9+
10+
from src.analytics.models import AnalyticsStats, InvalidAttempt, TicketScan, TicketTransfer, get_session
11+
from sqlalchemy import func
12+
13+
14+
def extract_features_from_analytics(
    *,
    invalid_scan_threshold: float = 0.3,
    failed_transfer_threshold: float = 0.5,
) -> Tuple[Any, Any]:
    """Return (X, y) arrays built from the stored AnalyticsStats rows.

    One feature row is produced per AnalyticsStats record:
        0 - total scans
        1 - invalid scan rate (invalid scans / total scans)
        2 - total transfers
        3 - failed transfer rate (failed transfers / total transfers)
        4 - invalid attempt count

    Both rates clamp their denominator to at least 1, so a record with no
    scans (or no transfers) never divides by zero.

    Label: 1 (suspicious) when the invalid scan rate is strictly greater than
    ``invalid_scan_threshold`` or the failed transfer rate is strictly greater
    than ``failed_transfer_threshold``; 0 otherwise.

    Args:
        invalid_scan_threshold: Invalid scan rate above which a record is
            labelled suspicious. Defaults to 0.3.
        failed_transfer_threshold: Failed transfer rate above which a record
            is labelled suspicious. Defaults to 0.5.

    Returns:
        A ``(X, y)`` tuple where ``X`` is a float array with one 5-column row
        per record and ``y`` is the matching int label array. When no records
        exist, ``X`` has shape ``(0, 5)`` and ``y`` has shape ``(0,)``.
    """
    session = get_session()
    try:
        stats = session.query(AnalyticsStats).all()

        if not stats:
            return np.empty((0, 5), dtype=float), np.empty((0,), dtype=int)

        rows: List[List[float]] = []
        labels: List[int] = []

        for s in stats:
            # Coalesce missing counters to 0 so one incomplete row cannot abort
            # the whole extraction with a TypeError.
            # NOTE(review): assumes a None counter means "nothing recorded" -
            # confirm against the AnalyticsStats schema.
            scan_count = s.scan_count or 0
            invalid_scan_count = s.invalid_scan_count or 0
            transfer_count = s.transfer_count or 0
            failed_transfer_count = s.failed_transfer_count or 0
            invalid_attempt_count = s.invalid_attempt_count or 0

            # Clamp denominators to 1 to avoid division by zero.
            invalid_scan_rate = invalid_scan_count / max(scan_count, 1)
            failed_transfer_rate = failed_transfer_count / max(transfer_count, 1)

            rows.append(
                [
                    float(scan_count),
                    invalid_scan_rate,
                    float(transfer_count),
                    failed_transfer_rate,
                    float(invalid_attempt_count),
                ]
            )
            labels.append(
                int(
                    invalid_scan_rate > invalid_scan_threshold
                    or failed_transfer_rate > failed_transfer_threshold
                )
            )

        return np.array(rows, dtype=float), np.array(labels, dtype=int)
    finally:
        # Always release the session, including on the early empty return.
        session.close()

0 commit comments

Comments
 (0)