diff --git a/.jules/bolt.md b/.jules/bolt.md index da05510..d648ac6 100644 --- a/.jules/bolt.md +++ b/.jules/bolt.md @@ -51,3 +51,7 @@ ## 2024-05-24 - [Skip Validation for Known Data] **Learning:** Performing expensive validation (e.g. regex) on data that is already known to be valid (e.g. exists in trusted remote state) is redundant. Checking existence in a local set (O(1)) before validation avoids CPU overhead for duplicates. **Action:** In filtering loops, check "is already processed/known" before "is valid", especially if "valid" implies "safe to process" and "known" implies "already processed". + +## 2026-02-04 - [Optimize Buffer for Large Downloads] +**Learning:** When downloading large files (e.g., blocklists), the default chunk size of HTTP libraries might be small, leading to excessive loop iterations and list operations. Increasing the buffer size (e.g., to 16KB) reduces CPU overhead during I/O-bound operations. +**Action:** When using `iter_bytes()` or similar streaming methods for large resources, explicitly set a larger `chunk_size` (e.g., 16384) to improve throughput and reduce CPU usage. diff --git a/main.py b/main.py index d27790a..4b786ac 100644 --- a/main.py +++ b/main.py @@ -1000,7 +1000,8 @@ def _gh_get(url: str) -> Dict: # 2. Stream and check actual size chunks = [] current_size = 0 - for chunk in r.iter_bytes(): + # Optimization: Use 16KB chunks to reduce loop overhead/appends for large files + for chunk in r.iter_bytes(chunk_size=16 * 1024): current_size += len(chunk) if current_size > MAX_RESPONSE_SIZE: raise ValueError(