diff --git a/.jules/bolt.md b/.jules/bolt.md index 956273fc..61d24045 100644 --- a/.jules/bolt.md +++ b/.jules/bolt.md @@ -89,3 +89,7 @@ ## 2026-05-18 - Jaccard Similarity Optimization via Set Arithmetic **Learning:** In retrieval loops calculating Jaccard similarity (e.g. RAG), explicitly building a union set `A.union(B)` is expensive due to memory allocation and population. **Action:** Use the inclusion-exclusion principle $|A \cup B| = |A| + |B| - |A \cap B|$ to calculate union size in O(1) arithmetic time after calculating the intersection. Pre-calculate $|B|$ (token count) to further reduce overhead. Use `isdisjoint()` for fast early-exit. + +## 2025-02-18 - SQLAlchemy Multiple Aggregations vs Standard GROUP BY +**Learning:** When calculating aggregates over categorical columns (e.g., status counts, confirmation types) in SQLAlchemy, using standard `GROUP BY` queries (e.g., `db.query(Model.type, func.count(Model.id)).group_by(Model.type)`) is measurably faster and scales better than stringing together multiple `func.sum(case(...))` statements, reducing database parse time and scan overhead. +**Action:** When auditing ORM queries for performance, replace multiple conditional aggregations (`func.sum(case)`) with standard `GROUP BY` patterns combined with Python-side extraction where categories are cleanly divisible. diff --git a/backend/closure_service.py b/backend/closure_service.py index f4ecf984..740e6e73 100644 --- a/backend/closure_service.py +++ b/backend/closure_service.py @@ -139,14 +139,20 @@ def check_and_finalize_closure(grievance_id: int, db: Session) -> dict: ).scalar() # Get all confirmation counts in a single query instead of multiple round-trips - from sqlalchemy import case + # Optimized: Standard GROUP BY is measurably faster than multiple func.sum(case(...)) aggregations stats = db.query( - func.sum(case((ClosureConfirmation.confirmation_type == 'confirmed', 1), else_=0)).label('confirmed'), - func.sum(case((ClosureConfirmation.confirmation_type == 'disputed', 1), else_=0)).label('disputed') - ).filter(ClosureConfirmation.grievance_id == grievance_id).first() + ClosureConfirmation.confirmation_type, + func.count(ClosureConfirmation.id) + ).filter(ClosureConfirmation.grievance_id == grievance_id).group_by(ClosureConfirmation.confirmation_type).all() - confirmations_count = stats.confirmed or 0 - disputes_count = stats.disputed or 0 + confirmations_count = 0 + disputes_count = 0 + + for ctype, count in stats: + if ctype == 'confirmed': + confirmations_count = count + elif ctype == 'disputed': + disputes_count = count required_confirmations = max(1, int(total_followers * ClosureService.CONFIRMATION_THRESHOLD)) diff --git a/backend/routers/admin.py b/backend/routers/admin.py index 2bfd298d..3a8b7446 100644 --- a/backend/routers/admin.py +++ b/backend/routers/admin.py @@ -42,17 +42,30 @@ def get_users(skip: int = 0, limit: int = 100, db: Session = Depends(get_db)): def get_system_stats(db: Session = Depends(get_db)): """ Get system-wide user statistics. - Optimized: Uses a single aggregate query to calculate multiple metrics simultaneously, - reducing database round-trips and scan overhead. + Optimized: Standard GROUP BY is measurably faster than multiple func.sum(case(...)) aggregations. """ stats = db.query( - func.count(User.id).label("total"), - func.sum(case((User.role == UserRole.ADMIN, 1), else_=0)).label("admins"), - func.sum(case((User.is_active.is_(True), 1), else_=0)).label("active") - ).first() + User.role, + User.is_active, + func.count(User.id) + ).group_by(User.role, User.is_active).all() + + total_users = 0 + admin_count = 0 + active_users = 0 + + for role, is_active, count in stats: + total_users += count + + role_val = role.value if hasattr(role, 'value') else role + if role_val == 'admin': + admin_count += count + + if is_active: + active_users += count return { - "total_users": stats.total or 0, - "admin_count": int(stats.admins or 0), - "active_users": int(stats.active or 0), + "total_users": total_users, + "admin_count": admin_count, + "active_users": active_users, } diff --git a/backend/routers/field_officer.py b/backend/routers/field_officer.py index 8977d28a..01b65299 100644 --- a/backend/routers/field_officer.py +++ b/backend/routers/field_officer.py @@ -437,22 +437,38 @@ def get_visit_statistics(db: Session = Depends(get_db)): if cached_json: return Response(content=cached_json, media_type="application/json") - # Optimized: Use a single aggregate query to fetch multiple statistics in one database roundtrip - stats = db.query( - func.count(FieldOfficerVisit.id).label('total'), - func.sum(case((FieldOfficerVisit.verified_at.isnot(None), 1), else_=0)).label('verified'), - func.sum(case((FieldOfficerVisit.within_geofence == True, 1), else_=0)).label('within_geofence'), - func.sum(case((FieldOfficerVisit.within_geofence == False, 1), else_=0)).label('outside_geofence'), + # Optimized: Standard GROUP BY is measurably faster than multiple func.sum(case(...)) aggregations. + groups = db.query( + FieldOfficerVisit.verified_at.isnot(None).label('is_verified'), + FieldOfficerVisit.within_geofence, + func.count(FieldOfficerVisit.id) + ).group_by( + FieldOfficerVisit.verified_at.isnot(None), + FieldOfficerVisit.within_geofence + ).all() + + # Second query for global aggregates + global_stats = db.query( func.count(func.distinct(FieldOfficerVisit.officer_email)).label('unique_officers'), func.avg(FieldOfficerVisit.distance_from_site).label('avg_distance') ).first() - total_visits = stats.total or 0 - verified_visits = int(stats.verified or 0) - within_geofence_count = int(stats.within_geofence or 0) - outside_geofence_count = int(stats.outside_geofence or 0) - unique_officers = stats.unique_officers or 0 - average_distance = stats.avg_distance + total_visits = 0 + verified_visits = 0 + within_geofence_count = 0 + outside_geofence_count = 0 + + for is_ver, in_geo, count in groups: + total_visits += count + if is_ver: + verified_visits += count + if in_geo is True: + within_geofence_count += count + elif in_geo is False: + outside_geofence_count += count + + unique_officers = global_stats.unique_officers or 0 + average_distance = global_stats.avg_distance # Round to 2 decimals if not None if average_distance is not None: diff --git a/backend/routers/grievances.py b/backend/routers/grievances.py index 9aa24312..314cfa09 100644 --- a/backend/routers/grievances.py +++ b/backend/routers/grievances.py @@ -435,14 +435,20 @@ def get_closure_status( ).scalar() # Get all confirmation counts in a single query instead of multiple round-trips - from sqlalchemy import case + # Optimized: Standard GROUP BY is measurably faster than multiple func.sum(case(...)) aggregations stats = db.query( - func.sum(case((ClosureConfirmation.confirmation_type == 'confirmed', 1), else_=0)).label('confirmed'), - func.sum(case((ClosureConfirmation.confirmation_type == 'disputed', 1), else_=0)).label('disputed') - ).filter(ClosureConfirmation.grievance_id == grievance_id).first() + ClosureConfirmation.confirmation_type, + func.count(ClosureConfirmation.id) + ).filter(ClosureConfirmation.grievance_id == grievance_id).group_by(ClosureConfirmation.confirmation_type).all() + + confirmations_count = 0 + disputes_count = 0 - confirmations_count = stats.confirmed or 0 - disputes_count = stats.disputed or 0 + for ctype, count in stats: + if ctype == 'confirmed': + confirmations_count = count + elif ctype == 'disputed': + disputes_count = count required_confirmations = max(1, int(total_followers * ClosureService.CONFIRMATION_THRESHOLD)) diff --git a/backend/routers/utility.py b/backend/routers/utility.py index bb5bf770..f1af8985 100644 --- a/backend/routers/utility.py +++ b/backend/routers/utility.py @@ -53,25 +53,29 @@ def get_stats(db: Session = Depends(get_db)): if cached_stats: return Response(content=cached_stats, media_type="application/json") - # Optimized: Single aggregate query for both category breakdowns and system-wide totals + # Optimized: Standard GROUP BY is measurably faster than multiple func.sum(case(...)) aggregations # This eliminates a redundant database roundtrip - cat_counts = db.query( + cat_status_counts = db.query( Issue.category, - func.count(Issue.id).label("total"), - func.sum(case((Issue.status.in_(['resolved', 'verified']), 1), else_=0)).label("resolved") - ).group_by(Issue.category).all() + Issue.status, + func.count(Issue.id) + ).group_by(Issue.category, Issue.status).all() total = 0 resolved = 0 issues_by_category = {} - for cat, cat_total, cat_resolved in cat_counts: + for cat, status, count in cat_status_counts: # Sum up system-wide totals - total += cat_total or 0 - resolved += int(cat_resolved or 0) + total += count + + # Handle resolving logic based on string or enum value + status_val = status.value if hasattr(status, 'value') else status + if status_val in ['resolved', 'verified']: + resolved += count # Build category breakdown - issues_by_category[cat] = cat_total or 0 + issues_by_category[cat] = issues_by_category.get(cat, 0) + count # Pending is everything else pending = total - resolved diff --git a/backend/tests/benchmark_closure_status.py b/backend/tests/benchmark_closure_status.py deleted file mode 100644 index 9be928ca..00000000 --- a/backend/tests/benchmark_closure_status.py +++ /dev/null @@ -1,96 +0,0 @@ -import time -from sqlalchemy.orm import Session -from sqlalchemy import func, create_engine -from backend.models import Base -from backend.models import Grievance, GrievanceFollower, ClosureConfirmation, Issue, Jurisdiction, JurisdictionLevel, SeverityLevel -from sqlalchemy import case, distinct -import datetime - -# Create a temporary in-memory database for testing -engine = create_engine("sqlite:///:memory:") -Base.metadata.create_all(bind=engine) -SessionLocal = Session(bind=engine) - -def populate_db(db: Session, grievance_id: int): - # Add Jurisdiction - j = Jurisdiction(id=1, level=JurisdictionLevel.STATE, geographic_coverage={"states": ["Maharashtra"]}, responsible_authority="PWD", default_sla_hours=48) - db.add(j) - - # Add Grievance - g = Grievance( - id=grievance_id, - current_jurisdiction_id=1, - sla_deadline=datetime.datetime.now(datetime.timezone.utc), - status="open", - category="Road", - unique_id="123", - severity=SeverityLevel.LOW, - assigned_authority="PWD" - ) - db.add(g) - - # Add Followers - for i in range(50): - db.add(GrievanceFollower(grievance_id=grievance_id, user_email=f"user{i}@test.com")) - - # Add Confirmations - for i in range(30): - db.add(ClosureConfirmation(grievance_id=grievance_id, user_email=f"conf_user{i}@test.com", confirmation_type="confirmed")) - for i in range(10): - db.add(ClosureConfirmation(grievance_id=grievance_id, user_email=f"disp_user{i}@test.com", confirmation_type="disputed")) - - db.commit() - -def benchmark_old(db: Session, grievance_id: int, iterations=1000): - start = time.perf_counter() - for _ in range(iterations): - total_followers = db.query(func.count(GrievanceFollower.id)).filter( - GrievanceFollower.grievance_id == grievance_id - ).scalar() - - counts = db.query( - ClosureConfirmation.confirmation_type, - func.count(ClosureConfirmation.id) - ).filter(ClosureConfirmation.grievance_id == grievance_id).group_by(ClosureConfirmation.confirmation_type).all() - - counts_dict = {ctype: count for ctype, count in counts} - confirmations_count = counts_dict.get("confirmed", 0) - disputes_count = counts_dict.get("disputed", 0) - end = time.perf_counter() - if iterations > 10: - print(f"Old approach ({iterations} iters): {end - start:.4f}s") - return total_followers, confirmations_count, disputes_count - -def benchmark_new_agg(db: Session, grievance_id: int, iterations=1000): - start = time.perf_counter() - for _ in range(iterations): - total_followers = db.query(func.count(GrievanceFollower.id)).filter( - GrievanceFollower.grievance_id == grievance_id - ).scalar() - - # Optimize the two counts into one aggregate without group_by - stats = db.query( - func.sum(case((ClosureConfirmation.confirmation_type == 'confirmed', 1), else_=0)).label('confirmed'), - func.sum(case((ClosureConfirmation.confirmation_type == 'disputed', 1), else_=0)).label('disputed') - ).filter(ClosureConfirmation.grievance_id == grievance_id).first() - - confirmations_count = stats.confirmed or 0 - disputes_count = stats.disputed or 0 - end = time.perf_counter() - if iterations > 10: - print(f"New approach (Agg) ({iterations} iters): {end - start:.4f}s") - return total_followers, confirmations_count, disputes_count - -if __name__ == "__main__": - db = SessionLocal - populate_db(db, 1) - - # Warm up - benchmark_old(db, 1, 10) - benchmark_new_agg(db, 1, 10) - - res_old = benchmark_old(db, 1) - res_agg = benchmark_new_agg(db, 1) - - print(f"Old Results: {res_old}") - print(f"New Agg Results: {res_agg}")