Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .jules/bolt.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,11 @@
## 2024-05-24 - Avoid Copying Large Sets for Membership Checks
**Learning:** Copying a large set (e.g. 100k items) to create a snapshot for read-only membership checks is expensive O(N) and unnecessary. Under CPython, individual set membership tests are effectively atomic thanks to the GIL (though this is an implementation detail, and concurrent mutation during a full iteration can still raise errors).
**Action:** When filtering data against a shared large set, iterate and check membership directly instead of snapshotting, unless strict transactional consistency across the entire iteration is required.

## 2024-05-24 - Deduplicate before API calls
**Learning:** Sending duplicate items in API requests wastes bandwidth and processing time. If the input list might contain duplicates (common in aggregated blocklists), deduplicate it locally before sending.
**Action:** Use `set` logic to filter duplicates from input lists before batching for API calls.

## 2024-05-24 - Parallelize independent batches
**Learning:** When sending large amounts of data in batches to an API, processing batches sequentially blocks on network latency. Using a thread pool to send multiple batches concurrently can significantly speed up the process, provided the API rate limits are respected.
**Action:** Refactor sequential batch processing loops to use `ThreadPoolExecutor` with a conservative number of workers (e.g., 3-5) for write operations.
48 changes: 36 additions & 12 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -460,26 +460,32 @@
log.error(f"Failed to create folder {sanitize_for_log(name)}: {sanitize_for_log(e)}")
return None

def push_rules(
profile_id: str,
folder_name: str,
folder_id: str,
do: int,
status: int,
hostnames: List[str],
existing_rules: Set[str],
client: httpx.Client,
existing_rules_lock: Optional[threading.Lock] = None,
) -> bool:
if not hostnames:
log.info("Folder %s - no rules to push", sanitize_for_log(folder_name))
return True

original_count = len(hostnames)

# Optimization: Check directly against existing_rules to avoid O(N) copy.
# Membership testing in set is thread-safe, and we don't need a strict snapshot for deduplication.
filtered_hostnames = [h for h in hostnames if h not in existing_rules]
# Optimization 1: Deduplicate input list while preserving order
# Optimization 2: Check directly against existing_rules to avoid O(N) copy.
seen = set()
filtered_hostnames = []
for h in hostnames:

Check warning

Code scanning / Pylint (reported by Codacy)

Variable name "h" doesn't conform to snake_case naming style Warning

Variable name "h" doesn't conform to snake_case naming style

Check warning

Code scanning / Pylintpython3 (reported by Codacy)

Variable name "h" doesn't conform to snake_case naming style Warning

Variable name "h" doesn't conform to snake_case naming style
if h not in existing_rules and h not in seen:
filtered_hostnames.append(h)
seen.add(h)

duplicates_count = original_count - len(filtered_hostnames)

if duplicates_count > 0:
Expand All @@ -490,34 +496,52 @@
return True

successful_batches = 0
total_batches = len(range(0, len(filtered_hostnames), BATCH_SIZE))

for i, start in enumerate(range(0, len(filtered_hostnames), BATCH_SIZE), 1):
batch = filtered_hostnames[start : start + BATCH_SIZE]
# Prepare batches
batches = []
for start in range(0, len(filtered_hostnames), BATCH_SIZE):
batches.append(filtered_hostnames[start : start + BATCH_SIZE])

total_batches = len(batches)

def process_batch(batch_idx: int, batch_data: List[str]) -> bool:
    """Push one batch of hostnames to the API as rules in *folder_id*.

    Args:
        batch_idx: 1-based batch number, used only for log messages.
        batch_data: Hostnames to submit in this request.

    Returns:
        True if the API call succeeded, False on HTTP error. The caller
        tallies successes from the returned value — this function must
        NOT touch ``successful_batches`` itself. (The previous version
        incremented it here, which both double-counted and, lacking a
        ``nonlocal`` declaration, raised UnboundLocalError at runtime.)
    """
    # Build the form payload: rule metadata plus indexed hostname fields.
    data = {
        "do": str(do),
        "status": str(status),
        "group": str(folder_id),
    }
    for j, hostname in enumerate(batch_data):
        data[f"hostnames[{j}]"] = hostname

    try:
        _api_post_form(client, f"{API_BASE}/{profile_id}/rules", data=data)
        log.info(
            "Folder %s – batch %d: added %d rules",
            sanitize_for_log(folder_name), batch_idx, len(batch_data)
        )
        # Record the newly created rules so later calls skip them.
        # Guard with the lock when running concurrently with other
        # push_rules invocations that share this set.
        if existing_rules_lock:
            with existing_rules_lock:
                existing_rules.update(batch_data)
        else:
            existing_rules.update(batch_data)
        return True
    except httpx.HTTPError as e:
        # Lazy %-formatting keeps log args unevaluated unless emitted.
        log.error(
            "Failed to push batch %d for folder %s: %s",
            batch_idx, sanitize_for_log(folder_name), sanitize_for_log(e),
        )
        # Not every HTTPError carries a response (e.g. transport errors);
        # getattr avoids an AttributeError in that case.
        response = getattr(e, "response", None)
        if response is not None:
            log.debug("Response content: %s", response.text)
        return False

# Optimization 3: Parallelize batch processing
# Using 3 workers to speed up writes without hitting aggressive rate limits.
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
futures = {
executor.submit(process_batch, i, batch): i
for i, batch in enumerate(batches, 1)
}

for future in concurrent.futures.as_completed(futures):
if future.result():
successful_batches += 1

if successful_batches == total_batches:
log.info("Folder %s – finished (%d new rules added)", sanitize_for_log(folder_name), len(filtered_hostnames))
Expand Down
Loading