Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .jules/bolt.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,7 @@
## 2024-05-22 - Ordered Deduplication Optimization
**Learning:** `dict.fromkeys(list)` is significantly faster (~2x) than a Python loop with `seen = set()` for deduplicating large lists while preserving order. It also naturally deduplicates invalid items if validation happens after, which prevents log spam.
**Action:** Use `dict.fromkeys()` for ordered deduplication of large inputs instead of manual loop with `seen` set.

## 2026-01-28 - [Avoid ThreadPoolExecutor Overhead]

Check notice

Code scanning / Remark-lint (reported by Codacy)

Warn when references to undefined definitions are found.

[no-undefined-references] Found reference to undefined definition

Check notice

Code scanning / Remark-lint (reported by Codacy)

Warn when shortcut reference links are used.

[no-shortcut-reference-link] Use the trailing `[]` on reference links
**Learning:** `ThreadPoolExecutor` context management and thread creation overhead is non-negligible for single-item or very small workloads. If a parallelizable task only has 1 unit of work (e.g., 1 batch), running it synchronously in the main thread is faster and uses less memory than spinning up a pool.
**Action:** Check the size of the workload before creating a `ThreadPoolExecutor`. If `len(tasks) == 1`, bypass the executor and run directly.
45 changes: 29 additions & 16 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -1172,23 +1172,36 @@ def process_batch(batch_idx: int, batch_data: List[str]) -> Optional[List[str]]:

# Optimization 3: Parallelize batch processing
# Using 3 workers to speed up writes without hitting aggressive rate limits.
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
futures = {
executor.submit(process_batch, i, batch): i
for i, batch in enumerate(batches, 1)
}
# If only 1 batch, run it synchronously to avoid ThreadPoolExecutor overhead.
if total_batches == 1:
result = process_batch(1, batches[0])
if result:
successful_batches += 1
existing_rules.update(result)

render_progress_bar(
successful_batches,
total_batches,
f"Folder {sanitize_for_log(folder_name)}",
)
else:
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
futures = {
executor.submit(process_batch, i, batch): i
for i, batch in enumerate(batches, 1)
}

for future in concurrent.futures.as_completed(futures):
result = future.result()
if result:
successful_batches += 1
existing_rules.update(result)

render_progress_bar(
successful_batches,
total_batches,
f"Folder {sanitize_for_log(folder_name)}",
)
for future in concurrent.futures.as_completed(futures):
result = future.result()
if result:
successful_batches += 1
existing_rules.update(result)

render_progress_bar(
successful_batches,
total_batches,
f"Folder {sanitize_for_log(folder_name)}",
)
Comment on lines +1176 to +1204

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

There is code duplication between the if and else blocks for handling the result of process_batch. Both branches contain the logic to increment successful_batches, update existing_rules, and render the progress bar. This makes the code harder to maintain, as any change to this logic must be applied in two places.

To improve maintainability and follow the DRY (Don't Repeat Yourself) principle, you can extract the result handling logic into a local helper function.

    def _handle_batch_result(result: Optional[List[str]]):
        nonlocal successful_batches
        if result:
            successful_batches += 1
            existing_rules.update(result)

        render_progress_bar(
            successful_batches,
            total_batches,
            f"Folder {sanitize_for_log(folder_name)}",
        )

    if total_batches == 1:
        result = process_batch(1, batches[0])
        _handle_batch_result(result)
    else:
        with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
            futures = {
                executor.submit(process_batch, i, batch): i
                for i, batch in enumerate(batches, 1)
            }

            for future in concurrent.futures.as_completed(futures):
                result = future.result()
                _handle_batch_result(result)


if successful_batches == total_batches:
if USE_COLORS:
Expand Down
45 changes: 23 additions & 22 deletions tests/test_plan_details.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,24 @@
import sys
from unittest.mock import patch

import main

# NOTE: Avoid importing `main` at module import time.
# Some tests delete `sys.modules["main"]` to force a clean import under different env/TTY
# settings; holding a stale module reference can cause patches to target the wrong module.

def test_print_plan_details_no_colors(capsys):
"""Test print_plan_details output when colors are disabled."""
with patch("main.USE_COLORS", False):
import main as m

with patch.object(m, "USE_COLORS", False):
plan_entry = {
"profile": "test_profile",
"folders": [
{"name": "Folder B", "rules": 5},
{"name": "Folder A", "rules": 10},
],
}
main.print_plan_details(plan_entry)
m.print_plan_details(plan_entry)

captured = capsys.readouterr()
output = captured.out
Expand All @@ -32,9 +36,11 @@ def test_print_plan_details_no_colors(capsys):

def test_print_plan_details_empty_folders(capsys):
"""Test print_plan_details with no folders."""
with patch("main.USE_COLORS", False):
import main as m

with patch.object(m, "USE_COLORS", False):
plan_entry = {"profile": "test_profile", "folders": []}
main.print_plan_details(plan_entry)
m.print_plan_details(plan_entry)

captured = capsys.readouterr()
output = captured.out
Expand All @@ -45,27 +51,22 @@ def test_print_plan_details_empty_folders(capsys):

def test_print_plan_details_with_colors(capsys):
"""Test print_plan_details output when colors are enabled."""
# Force USE_COLORS=True for this test, but also ensure Colors class is populated
# The Colors class is defined at import time based on USE_COLORS.
# If main was imported previously with USE_COLORS=False, Colors attributes are empty strings.
# We must reload main with an environment that forces USE_COLORS=True, or mock Colors.

with patch.dict(os.environ, {"NO_COLOR": ""}):
with patch("sys.stderr.isatty", return_value=True), patch("sys.stdout.isatty", return_value=True):
# Robust reload: handle case where main module reference is stale
if "main" in sys.modules:
importlib.reload(sys.modules["main"])
else:
import main
importlib.reload(main)

# Now verify output with colors
# Force USE_COLORS=True for this test, and reload `main` so the `Colors` class is
# created with non-empty ANSI codes.

with patch.dict(os.environ, {"NO_COLOR": ""}, clear=False):
with patch("sys.stderr.isatty", return_value=True), patch(
"sys.stdout.isatty", return_value=True
):
import main as m

m = importlib.reload(m)

plan_entry = {
"profile": "test_profile",
"folders": [{"name": "Folder A", "rules": 10}],
}
# Use the module from sys.modules to ensure we use the reloaded one
sys.modules["main"].print_plan_details(plan_entry)
m.print_plan_details(plan_entry)

captured = capsys.readouterr()
output = captured.out
Expand Down
102 changes: 102 additions & 0 deletions tests/test_push_rules_perf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@

Check warning

Code scanning / Pylint (reported by Codacy)

Missing module docstring

Missing module docstring

Check warning

Code scanning / Pylintpython3 (reported by Codacy)

Missing module docstring

Missing module docstring
import unittest
from unittest.mock import MagicMock, patch, ANY

Check warning

Code scanning / Prospector (reported by Codacy)

Unused ANY imported from unittest.mock (unused-import)

Unused ANY imported from unittest.mock (unused-import)

Check notice

Code scanning / Pylint (reported by Codacy)

Unused ANY imported from unittest.mock

Unused ANY imported from unittest.mock

Check notice

Code scanning / Pylintpython3 (reported by Codacy)

Unused ANY imported from unittest.mock

Unused ANY imported from unittest.mock
Copy link

Copilot AI Feb 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ANY is imported from unittest.mock but never used in this test module. Please remove it to avoid unused-import lint noise and keep the test focused.

Suggested change
from unittest.mock import MagicMock, patch, ANY
from unittest.mock import MagicMock, patch

Copilot uses AI. Check for mistakes.
import sys
import os

# Add root to path to import main
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import main

Check warning

Code scanning / Pylint (reported by Codacy)

Import "import main" should be placed at the top of the module

Import "import main" should be placed at the top of the module

Check warning

Code scanning / Prospector (reported by Codacy)

Import "import main" should be placed at the top of the module (wrong-import-position)

Import "import main" should be placed at the top of the module (wrong-import-position)

Check warning

Code scanning / Pylintpython3 (reported by Codacy)

Import "import main" should be placed at the top of the module

Import "import main" should be placed at the top of the module

class TestPushRulesPerf(unittest.TestCase):

Check warning

Code scanning / Pylint (reported by Codacy)

Missing class docstring

Missing class docstring

Check warning

Code scanning / Pylintpython3 (reported by Codacy)

Missing class docstring

Missing class docstring
def setUp(self):
self.client = MagicMock()
self.profile_id = "test_profile"
self.folder_name = "test_folder"
self.folder_id = "test_folder_id"
self.do = 1

Check warning

Code scanning / Pylint (reported by Codacy)

Attribute name "do" doesn't conform to snake_case naming style

Attribute name "do" doesn't conform to snake_case naming style

Check warning

Code scanning / Pylintpython3 (reported by Codacy)

Attribute name "do" doesn't conform to snake_case naming style

Attribute name "do" doesn't conform to snake_case naming style
self.status = 1
self.existing_rules = set()

@patch("main.concurrent.futures.as_completed")
@patch("main.concurrent.futures.ThreadPoolExecutor")
def test_push_rules_single_batch_optimization(self, mock_executor, mock_as_completed):
"""
Test that push_rules avoids ThreadPoolExecutor for single batch (< 500 rules).
"""
# Create < 500 rules (1 batch)
hostnames = [f"example{i}.com" for i in range(100)]

# Mock executor context manager
mock_executor_instance = mock_executor.return_value
mock_executor_instance.__enter__.return_value = mock_executor_instance
mock_executor_instance.__exit__.return_value = None

# Mock future
mock_future = MagicMock()
mock_future.result.return_value = hostnames # Success
mock_executor_instance.submit.return_value = mock_future

# Mock as_completed to yield the future immediately
mock_as_completed.return_value = [mock_future]

# Since we are bypassing TPE, we might need to mock API call?
# The code will call process_batch(1, batch).
# process_batch calls _api_post_form.
# client is mocked, so _api_post_form works (retries mocked).
# But we need to ensure process_batch works correctly in isolation.

# For this test, we mock _api_post_form?
# No, _api_post_form calls client.post.
Comment on lines +31 to +51

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The mocks for the ThreadPoolExecutor instance and the commented-out development notes are unnecessary for this test. Since this test verifies the single-batch scenario where the thread pool is not used, these lines are redundant and can be removed. This will make the test cleaner and more focused on its primary assertion.


main.push_rules(
self.profile_id,
self.folder_name,
self.folder_id,
self.do,
self.status,
hostnames,
self.existing_rules,
self.client
)

# Verify if Executor was called.
# AFTER OPTIMIZATION: This should be False.
self.assertFalse(mock_executor.called, "ThreadPoolExecutor should NOT be called for single batch")

Check warning

Code scanning / Pylint (reported by Codacy)

Line too long (106/100)

Line too long (106/100)

Check warning

Code scanning / Pylintpython3 (reported by Codacy)

Line too long (106/100)

Line too long (106/100)

@patch("main.concurrent.futures.as_completed")
@patch("main.concurrent.futures.ThreadPoolExecutor")
def test_push_rules_multi_batch(self, mock_executor, mock_as_completed):
"""
Test that push_rules uses ThreadPoolExecutor for multiple batches (> 500 rules).
"""
# Create > 500 rules (2 batches)
hostnames = [f"example{i}.com" for i in range(600)]

mock_executor_instance = mock_executor.return_value
mock_executor_instance.__enter__.return_value = mock_executor_instance

# Mock submit to return a Future
mock_future = MagicMock()
mock_future.result.return_value = ["some_rule"]
mock_executor_instance.submit.return_value = mock_future

mock_as_completed.return_value = [mock_future, mock_future] # 2 batches

main.push_rules(
self.profile_id,
self.folder_name,
self.folder_id,
self.do,
self.status,
hostnames,
self.existing_rules,
self.client
)

# This should ALWAYS be True
self.assertTrue(mock_executor.called, "ThreadPoolExecutor should be called for multi-batch")

if __name__ == '__main__':

Check warning

Code scanning / Prospector (reported by Codacy)

expected 2 blank lines after class or function definition, found 1 (E305)

expected 2 blank lines after class or function definition, found 1 (E305)
unittest.main()
Loading