|
| 1 | +"""Extract ML training features from real analytics data instead of synthetic data. |
| 2 | +
|
| 3 | +Builds (X, y) arrays from the AnalyticsStats counters (scans, transfers, and invalid |
| 4 | +attempts) stored in the database so the scalper-detection model trains on actual behaviour. |
| 5 | +""" |
| 6 | +from typing import Any, List, Tuple |
| 7 | + |
| 8 | +import numpy as np # type: ignore[import-untyped] |
| 9 | + |
| 10 | +from src.analytics.models import AnalyticsStats, InvalidAttempt, TicketScan, TicketTransfer, get_session |
| 11 | +from sqlalchemy import func |
| 12 | + |
| 13 | + |
def extract_features_from_analytics() -> Tuple[Any, Any]:
    """Return (X, y) training arrays built from stored AnalyticsStats rows.

    One sample is produced per AnalyticsStats row, with five features:
        0 - total scans
        1 - invalid scan rate (invalid / total scans, or 0 when there are
            no scans)
        2 - total transfers
        3 - failed transfer rate (failed / total transfers, or 0 when there
            are no transfers)
        4 - invalid attempt count

    Label: 1 (suspicious) when invalid_scan_rate > 0.3 or
    failed_transfer_rate > 0.5, 0 otherwise.

    Returns:
        A pair ``(X, y)``: ``X`` is a float array of shape ``(n, 5)`` and
        ``y`` is an int array of shape ``(n,)``. When no rows are available
        both arrays are empty (``n == 0``) but keep those shapes.
    """
    session = get_session()
    try:
        stats = session.query(AnalyticsStats).all()

        # np.array([]) would collapse to shape (0,); return explicitly shaped
        # empties so callers can always rely on X having five columns.
        if not stats:
            return np.empty((0, 5), dtype=float), np.empty((0,), dtype=int)

        rows: List[List[float]] = []
        labels: List[int] = []

        for s in stats:
            # A NULL counter is treated as zero instead of raising TypeError
            # in the arithmetic below.
            scan_count = s.scan_count or 0
            transfer_count = s.transfer_count or 0
            invalid_scan_count = s.invalid_scan_count or 0
            failed_transfer_count = s.failed_transfer_count or 0
            invalid_attempt_count = s.invalid_attempt_count or 0

            # With a zero denominator the rate is defined as 0 (as documented)
            # rather than the raw numerator, so a rate can never be produced
            # from an empty total and trip the label thresholds.
            invalid_scan_rate = (
                invalid_scan_count / scan_count if scan_count > 0 else 0.0
            )
            failed_transfer_rate = (
                failed_transfer_count / transfer_count
                if transfer_count > 0
                else 0.0
            )

            rows.append(
                [
                    float(scan_count),
                    float(invalid_scan_rate),
                    float(transfer_count),
                    float(failed_transfer_rate),
                    float(invalid_attempt_count),
                ]
            )
            labels.append(
                int(invalid_scan_rate > 0.3 or failed_transfer_rate > 0.5)
            )

        return np.array(rows, dtype=float), np.array(labels, dtype=int)
    finally:
        # Always release the session, including on the early empty return
        # and when the query or feature computation raises.
        session.close()
0 commit comments