Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .jules/bolt.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,3 +93,7 @@
## 2026-05-20 - Joined Queries for Integrity Verification
**Learning:** Performing multiple sequential database queries to verify cryptographically chained records (e.g., fetching a record and then its associated token/metadata from another table) introduces unnecessary latency and increases database load.
**Action:** Consolidate associated data retrieval into a single SQL `JOIN` query within the verification hot-path. This reduces database round-trips and improves end-to-end latency for blockchain-style integrity checks.

## 2024-05-21 - Spatial Distance Hot Loop Optimization
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

Fix the year in the date header.

The date shows "2024-05-21" but should be "2026-05-21" based on the PR creation date (2026-05-17) and consistency with surrounding entries.

📅 Proposed fix
-## 2024-05-21 - Spatial Distance Hot Loop Optimization
+## 2026-05-21 - Spatial Distance Hot Loop Optimization
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
## 2024-05-21 - Spatial Distance Hot Loop Optimization
## 2026-05-21 - Spatial Distance Hot Loop Optimization
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In @.jules/bolt.md at line 97, Update the markdown header string "## 2024-05-21
- Spatial Distance Hot Loop Optimization" to use the correct year: change
"2024-05-21" to "2026-05-21" so the entry matches the PR date and surrounding
entries.

**Learning:** Converting coordinates to radians with `math.radians` inside a hot loop (such as the candidate loop in `find_nearby_issues`) adds significant overhead when evaluating thousands of candidates.
**Action:** Precompute the constant conversion factors (meters per degree of latitude, and meters per degree of longitude at the target latitude) once, outside the loop. Distances can then be computed entirely from subtraction and multiplication on degree values, bypassing the per-candidate radian conversion overhead.
104 changes: 62 additions & 42 deletions backend/spatial_utils.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
"""
Spatial utilities for geospatial operations and deduplication.
"""

import math
from typing import List, Tuple, Optional
import logging

try:
from sklearn.cluster import DBSCAN
import numpy as np

HAS_SKLEARN = True
except ImportError:
HAS_SKLEARN = False
Expand All @@ -18,7 +20,10 @@

logger = logging.getLogger(__name__)

def get_bounding_box(lat: float, lon: float, radius_meters: float) -> Tuple[float, float, float, float]:

def get_bounding_box(
lat: float, lon: float, radius_meters: float
) -> Tuple[float, float, float, float]:
"""
Calculate the bounding box coordinates for a given radius.
Returns (min_lat, max_lat, min_lon, max_lon).
Expand Down Expand Up @@ -59,13 +64,18 @@ def haversine_distance(lat1: float, lon1: float, lat2: float, lon2: float) -> fl
dlambda = math.radians(lon2 - lon1)

# Haversine formula
a = math.sin(dphi / 2)**2 + math.cos(phi1) * math.cos(phi2) * math.sin(dlambda / 2)**2
a = (
math.sin(dphi / 2) ** 2
+ math.cos(phi1) * math.cos(phi2) * math.sin(dlambda / 2) ** 2
)
c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

return R * c


def equirectangular_distance(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
def equirectangular_distance(
lat1: float, lon1: float, lat2: float, lon2: float
) -> float:
"""
Calculate the distance between two points on the earth (specified in decimal degrees)
using the Equirectangular approximation. This is faster than Haversine for small distances.
Expand All @@ -89,14 +99,14 @@ def equirectangular_distance(lat1: float, lon1: float, lat2: float, lon2: float)
x = dlon * math.cos((lat1_rad + lat2_rad) / 2)
y = dlat

return R * math.sqrt(x*x + y*y)
return R * math.sqrt(x * x + y * y)


def find_nearby_issues(
issues: List[Issue],
target_lat: float,
target_lon: float,
radius_meters: float = 50.0
radius_meters: float = 50.0,
) -> List[Tuple[Issue, float]]:
"""
Find issues within a specified radius of a target location.
Expand All @@ -113,7 +123,9 @@ def find_nearby_issues(
nearby_issues = []

# Optimization: pre-filter using a bounding box to avoid math on distant points
min_lat, max_lat, min_lon, max_lon = get_bounding_box(target_lat, target_lon, radius_meters)
min_lat, max_lat, min_lon, max_lon = get_bounding_box(
target_lat, target_lon, radius_meters
)

# Optimization: Use inline Equirectangular approximation for short distances (< 10km)
# This avoids function call overhead and repeated radian conversions.
Expand All @@ -124,51 +136,54 @@ def find_nearby_issues(
continue

# Apply bounding box pre-filter
if issue.latitude < min_lat or issue.latitude > max_lat or \
issue.longitude < min_lon or issue.longitude > max_lon:
if (
issue.latitude < min_lat
or issue.latitude > max_lat
or issue.longitude < min_lon
or issue.longitude > max_lon
):
continue

distance = haversine_distance(target_lat, target_lon, issue.latitude, issue.longitude)
distance = haversine_distance(
target_lat, target_lon, issue.latitude, issue.longitude
)
if distance <= radius_meters:
nearby_issues.append((issue, distance))
else:
# Optimized path for common case (small radius)
R = 6371000.0
radius_sq = radius_meters * radius_meters

target_lat_rad = math.radians(target_lat)
target_lon_rad = math.radians(target_lon)
# Cosine term is constant for the target latitude in equirectangular projection
cos_lat = math.cos(target_lat_rad)
# Precompute the constant conversion factors (meters per degree of latitude/longitude)
lat_meters_per_deg = 6371000.0 * (math.pi / 180.0)
lon_meters_per_deg = lat_meters_per_deg * math.cos(math.radians(target_lat))

for issue in issues:
if issue.latitude is None or issue.longitude is None:
continue

# Apply bounding box pre-filter
if issue.latitude < min_lat or issue.latitude > max_lat or \
issue.longitude < min_lon or issue.longitude > max_lon:
if (
issue.latitude < min_lat
or issue.latitude > max_lat
or issue.longitude < min_lon
or issue.longitude > max_lon
):
continue

# Inline conversion to radians
lat_rad = math.radians(issue.latitude)
lon_rad = math.radians(issue.longitude)

dlat = lat_rad - target_lat_rad
dlon = lon_rad - target_lon_rad
dlat = issue.latitude - target_lat
dlon = issue.longitude - target_lon

# Handle longitude wrapping (dateline crossing)
if dlon > math.pi:
dlon -= 2 * math.pi
elif dlon < -math.pi:
dlon += 2 * math.pi
if dlon > 180.0:
dlon -= 360.0
elif dlon < -180.0:
dlon += 360.0

x = dlon * cos_lat
y = dlat
x = dlon * lon_meters_per_deg
y = dlat * lat_meters_per_deg

# Squared distance check avoids expensive sqrt()
# (x*R)^2 + (y*R)^2 = R^2 * (x^2 + y^2)
dist_sq = (x*x + y*y) * R * R
dist_sq = x * x + y * y

if dist_sq <= radius_sq:
nearby_issues.append((issue, math.sqrt(dist_sq)))
Expand All @@ -179,7 +194,9 @@ def find_nearby_issues(
return nearby_issues


def cluster_issues_dbscan(issues: List[Issue], eps_meters: float = 30.0) -> List[List[Issue]]:
def cluster_issues_dbscan(
issues: List[Issue], eps_meters: float = 30.0
) -> List[List[Issue]]:
"""
Cluster issues using DBSCAN algorithm based on spatial proximity.

Expand All @@ -194,21 +211,26 @@ def cluster_issues_dbscan(issues: List[Issue], eps_meters: float = 30.0) -> List
if not HAS_SKLEARN:
logger.warning("Scikit-learn not available, returning unclustered issues.")
# Return each issue as its own cluster to ensure visibility
return [[issue] for issue in issues if issue.latitude is not None and issue.longitude is not None]
return [
[issue]
for issue in issues
if issue.latitude is not None and issue.longitude is not None
]

# Filter issues with valid coordinates
valid_issues = [
issue for issue in issues
issue
for issue in issues
if issue.latitude is not None and issue.longitude is not None
]

if not valid_issues:
return []

# Convert to numpy array for DBSCAN
coordinates = np.array([
[issue.latitude, issue.longitude] for issue in valid_issues
])
coordinates = np.array(
[[issue.latitude, issue.longitude] for issue in valid_issues]
)

# Convert eps from meters to degrees (approximate)
# 1 degree latitude ≈ 111,000 meters
Expand All @@ -217,7 +239,7 @@ def cluster_issues_dbscan(issues: List[Issue], eps_meters: float = 30.0) -> List

# Perform DBSCAN clustering
try:
db = DBSCAN(eps=eps_degrees, min_samples=1, metric='haversine').fit(
db = DBSCAN(eps=eps_degrees, min_samples=1, metric="haversine").fit(
np.radians(coordinates)
)

Expand Down Expand Up @@ -250,10 +272,7 @@ def get_cluster_representative(cluster: List[Issue]) -> Issue:
raise ValueError("Cluster cannot be empty")

# Sort by upvotes (descending), then by creation date (ascending)
sorted_issues = sorted(
cluster,
key=lambda x: (-(x.upvotes or 0), x.created_at)
)
sorted_issues = sorted(cluster, key=lambda x: (-(x.upvotes or 0), x.created_at))

return sorted_issues[0]

Expand All @@ -269,7 +288,8 @@ def calculate_cluster_centroid(cluster: List[Issue]) -> Tuple[float, float]:
Tuple of (latitude, longitude) representing the centroid
"""
valid_issues = [
issue for issue in cluster
issue
for issue in cluster
if issue.latitude is not None and issue.longitude is not None
]

Expand Down
Loading