From 1e04cab1ff6b2e649a033cea72bdc86e2a4c2ce1 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 16 Dec 2025 07:10:18 +0000 Subject: [PATCH 1/8] Initial plan From 1bc67d821ded255cf66809be460deb0c8881cfed Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 16 Dec 2025 07:15:42 +0000 Subject: [PATCH 2/8] Implement exponential backoff for reactive consumer retries Co-authored-by: cbartz <4182921+cbartz@users.noreply.github.com> --- .../reactive/consumer.py | 25 ++++++++++++-- .../tests/unit/reactive/test_consumer.py | 33 ++++++++++++++++--- 2 files changed, 51 insertions(+), 7 deletions(-) diff --git a/github-runner-manager/src/github_runner_manager/reactive/consumer.py b/github-runner-manager/src/github_runner_manager/reactive/consumer.py index 93601bbca4..a9300c6d23 100644 --- a/github-runner-manager/src/github_runner_manager/reactive/consumer.py +++ b/github-runner-manager/src/github_runner_manager/reactive/consumer.py @@ -29,6 +29,9 @@ PROCESS_COUNT_HEADER_NAME = "X-Process-Count" WAIT_TIME_IN_SEC = 60 RETRY_LIMIT = 5 +# Exponential backoff configuration for message retries +BACKOFF_BASE_SECONDS = 10 +BACKOFF_MAX_SECONDS = 300 # This control message is for testing. The reactive process will stop consuming messages # when the message is sent. This message does not come from the router. END_PROCESSING_PAYLOAD = "__END__" @@ -72,6 +75,19 @@ class QueueError(Exception): """Raised when an error when communicating with the queue occurs.""" +def _calculate_backoff_time(retry_count: int) -> int: + """Calculate exponential backoff time for retries. + + Args: + retry_count: The current retry count (starting from 1). + + Returns: + The backoff time in seconds, capped at BACKOFF_MAX_SECONDS. + """ + backoff_time = BACKOFF_BASE_SECONDS * (2 ** (retry_count - 1)) + return min(backoff_time, BACKOFF_MAX_SECONDS) + + def get_queue_size(queue_config: QueueConfig) -> int: """Get the size of the message queue. @@ -146,11 +162,14 @@ def consume( # noqa: C901 continue if msg_process_count > 1: + backoff_time = _calculate_backoff_time(msg_process_count) logger.info( - "Pause job %s with retry count %s", job_details.url, msg_process_count + "Pause job %s with retry count %s for %s seconds (exponential backoff)", + job_details.url, + msg_process_count, + backoff_time, ) - # Avoid rapid retrying to prevent overloading services, e.g., OpenStack API. - sleep(WAIT_TIME_IN_SEC) + sleep(backoff_time) if not _validate_labels( labels=job_details.labels, supported_labels=supported_labels diff --git a/github-runner-manager/tests/unit/reactive/test_consumer.py b/github-runner-manager/tests/unit/reactive/test_consumer.py index 7ea21d0954..3fae0c9628 100644 --- a/github-runner-manager/tests/unit/reactive/test_consumer.py +++ b/github-runner-manager/tests/unit/reactive/test_consumer.py @@ -331,7 +331,7 @@ def test_consume_retried_job_success(queue_config: QueueConfig, mock_sleep: Magi arrange: A job placed in the message queue which is processed before. act: Call consume. assert: A runner is spawned, the message is removed from the queue, and sleep is called two - times. + times with exponential backoff for the retry and normal wait time for spawn check. """ labels = {secrets.token_hex(16), secrets.token_hex(16)} job_details = consumer.JobDetails( @@ -357,14 +357,15 @@ def test_consume_retried_job_success(queue_config: QueueConfig, mock_sleep: Magi _assert_queue_is_empty(queue_config.queue_name) - mock_sleep.assert_has_calls([mock.call(WAIT_TIME_IN_SEC), mock.call(WAIT_TIME_IN_SEC)]) + # First sleep is exponential backoff for retry (count=2: 20s), second is from _spawn_runner (60s) + mock_sleep.assert_has_calls([mock.call(20), mock.call(WAIT_TIME_IN_SEC)]) def test_consume_retried_job_failure(queue_config: QueueConfig, mock_sleep: MagicMock): """ arrange: A job placed in the message queue which is processed before. Mock runner spawn fail. act: Call consume. - assert: The message requeued. Sleep called once. + assert: The message requeued. Sleep called once with exponential backoff. """ labels = {secrets.token_hex(16), secrets.token_hex(16)} job_details = consumer.JobDetails( @@ -392,7 +393,8 @@ def test_consume_retried_job_failure(queue_config: QueueConfig, mock_sleep: Magi queue_config.queue_name, job_details.json(), headers={PROCESS_COUNT_HEADER_NAME: 2} ) - mock_sleep.assert_called_once_with(WAIT_TIME_IN_SEC) + # Sleep with exponential backoff for retry count 2: 20 seconds + mock_sleep.assert_called_once_with(20) def test_consume_retried_job_failure_past_limit(queue_config: QueueConfig, mock_sleep: MagicMock): @@ -484,3 +486,26 @@ def _assert_msg_has_been_requeued( assert msg.payload == payload if headers is not None: assert msg.headers == headers + + +@pytest.mark.parametrize( + "retry_count,expected_backoff", + [ + pytest.param(1, 10, id="first retry - 10 seconds"), + pytest.param(2, 20, id="second retry - 20 seconds"), + pytest.param(3, 40, id="third retry - 40 seconds"), + pytest.param(4, 80, id="fourth retry - 80 seconds"), + pytest.param(5, 160, id="fifth retry - 160 seconds"), + pytest.param(6, 300, id="sixth retry - capped at max 300 seconds"), + pytest.param(10, 300, id="high retry count - capped at max 300 seconds"), + ], +) +def test_calculate_backoff_time(retry_count: int, expected_backoff: int): + """ + arrange: Given a retry count. + act: Call _calculate_backoff_time. + assert: The correct exponential backoff time is returned, capped at the maximum. + """ + from github_runner_manager.reactive.consumer import _calculate_backoff_time + + assert _calculate_backoff_time(retry_count) == expected_backoff From 7206b99943ef3db665bde0a350e5962e42e357d5 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 16 Dec 2025 07:18:01 +0000 Subject: [PATCH 3/8] Fix line length in test comments Co-authored-by: cbartz <4182921+cbartz@users.noreply.github.com> --- github-runner-manager/tests/unit/reactive/test_consumer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/github-runner-manager/tests/unit/reactive/test_consumer.py b/github-runner-manager/tests/unit/reactive/test_consumer.py index 3fae0c9628..3ae97a0842 100644 --- a/github-runner-manager/tests/unit/reactive/test_consumer.py +++ b/github-runner-manager/tests/unit/reactive/test_consumer.py @@ -357,7 +357,8 @@ def test_consume_retried_job_success(queue_config: QueueConfig, mock_sleep: Magi _assert_queue_is_empty(queue_config.queue_name) - # First sleep is exponential backoff for retry (count=2: 20s), second is from _spawn_runner (60s) + # First sleep is exponential backoff for retry (count=2: 20s), + # second is from _spawn_runner (60s) mock_sleep.assert_has_calls([mock.call(20), mock.call(WAIT_TIME_IN_SEC)]) From cfd18b91058e0db59b84803f7b8f7e9aec3393f4 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 16 Dec 2025 07:45:41 +0000 Subject: [PATCH 4/8] Address PR feedback: move helper below public function, change base to 60s Co-authored-by: cbartz <4182921+cbartz@users.noreply.github.com> --- .../reactive/consumer.py | 28 +++++++++---------- .../tests/unit/reactive/test_consumer.py | 19 ++++++------- 2 files changed, 23 insertions(+), 24 deletions(-) diff --git a/github-runner-manager/src/github_runner_manager/reactive/consumer.py b/github-runner-manager/src/github_runner_manager/reactive/consumer.py index a9300c6d23..38080d4769 100644 --- a/github-runner-manager/src/github_runner_manager/reactive/consumer.py +++ b/github-runner-manager/src/github_runner_manager/reactive/consumer.py @@ -30,7 +30,7 @@ WAIT_TIME_IN_SEC = 60 RETRY_LIMIT = 5 # Exponential backoff configuration for message retries -BACKOFF_BASE_SECONDS = 10 +BACKOFF_BASE_SECONDS = 60 BACKOFF_MAX_SECONDS = 300 # This control message is for testing. The reactive process will stop consuming messages # when the message is sent. This message does not come from the router. @@ -75,19 +75,6 @@ class QueueError(Exception): """Raised when an error when communicating with the queue occurs.""" -def _calculate_backoff_time(retry_count: int) -> int: - """Calculate exponential backoff time for retries. - - Args: - retry_count: The current retry count (starting from 1). - - Returns: - The backoff time in seconds, capped at BACKOFF_MAX_SECONDS. - """ - backoff_time = BACKOFF_BASE_SECONDS * (2 ** (retry_count - 1)) - return min(backoff_time, BACKOFF_MAX_SECONDS) - - def get_queue_size(queue_config: QueueConfig) -> int: """Get the size of the message queue. @@ -108,6 +95,19 @@ def get_queue_size(queue_config: QueueConfig) -> int: raise QueueError("Error when communicating with the queue") from exc +def _calculate_backoff_time(retry_count: int) -> int: + """Calculate exponential backoff time for retries. + + Args: + retry_count: The current retry count (starting from 1). + + Returns: + The backoff time in seconds, capped at BACKOFF_MAX_SECONDS. + """ + backoff_time = BACKOFF_BASE_SECONDS * (2 ** (retry_count - 1)) + return min(backoff_time, BACKOFF_MAX_SECONDS) + + # Ignore `consume` too complex as it is pending re-design. def consume( # noqa: C901 queue_config: QueueConfig, diff --git a/github-runner-manager/tests/unit/reactive/test_consumer.py b/github-runner-manager/tests/unit/reactive/test_consumer.py index 3ae97a0842..99aafa272e 100644 --- a/github-runner-manager/tests/unit/reactive/test_consumer.py +++ b/github-runner-manager/tests/unit/reactive/test_consumer.py @@ -357,9 +357,9 @@ def test_consume_retried_job_success(queue_config: QueueConfig, mock_sleep: Magi _assert_queue_is_empty(queue_config.queue_name) - # First sleep is exponential backoff for retry (count=2: 20s), + # First sleep is exponential backoff for retry (count=2: 120s), # second is from _spawn_runner (60s) - mock_sleep.assert_has_calls([mock.call(20), mock.call(WAIT_TIME_IN_SEC)]) + mock_sleep.assert_has_calls([mock.call(120), mock.call(WAIT_TIME_IN_SEC)]) def test_consume_retried_job_failure(queue_config: QueueConfig, mock_sleep: MagicMock): @@ -394,8 +394,8 @@ def test_consume_retried_job_failure(queue_config: QueueConfig, mock_sleep: Magi queue_config.queue_name, job_details.json(), headers={PROCESS_COUNT_HEADER_NAME: 2} ) - # Sleep with exponential backoff for retry count 2: 20 seconds - mock_sleep.assert_called_once_with(20) + # Sleep with exponential backoff for retry count 2: 120 seconds + mock_sleep.assert_called_once_with(120) def test_consume_retried_job_failure_past_limit(queue_config: QueueConfig, mock_sleep: MagicMock): @@ -492,12 +492,11 @@ def _assert_msg_has_been_requeued( @pytest.mark.parametrize( "retry_count,expected_backoff", [ - pytest.param(1, 10, id="first retry - 10 seconds"), - pytest.param(2, 20, id="second retry - 20 seconds"), - pytest.param(3, 40, id="third retry - 40 seconds"), - pytest.param(4, 80, id="fourth retry - 80 seconds"), - pytest.param(5, 160, id="fifth retry - 160 seconds"), - pytest.param(6, 300, id="sixth retry - capped at max 300 seconds"), + pytest.param(1, 60, id="first retry - 60 seconds"), + pytest.param(2, 120, id="second retry - 120 seconds"), + pytest.param(3, 240, id="third retry - 240 seconds"), + pytest.param(4, 300, id="fourth retry - capped at max 300 seconds"), + pytest.param(5, 300, id="fifth retry - capped at max 300 seconds"), pytest.param(10, 300, id="high retry count - capped at max 300 seconds"), ], ) From f457ae83223ead226580e9ebb88238f3ca73e51f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 16 Dec 2025 09:18:39 +0000 Subject: [PATCH 5/8] Remove 300s cap on exponential backoff Co-authored-by: cbartz <4182921+cbartz@users.noreply.github.com> --- .../src/github_runner_manager/reactive/consumer.py | 6 ++---- .../tests/unit/reactive/test_consumer.py | 8 ++++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/github-runner-manager/src/github_runner_manager/reactive/consumer.py b/github-runner-manager/src/github_runner_manager/reactive/consumer.py index 38080d4769..b70f29553a 100644 --- a/github-runner-manager/src/github_runner_manager/reactive/consumer.py +++ b/github-runner-manager/src/github_runner_manager/reactive/consumer.py @@ -31,7 +31,6 @@ RETRY_LIMIT = 5 # Exponential backoff configuration for message retries BACKOFF_BASE_SECONDS = 60 -BACKOFF_MAX_SECONDS = 300 # This control message is for testing. The reactive process will stop consuming messages # when the message is sent. This message does not come from the router. END_PROCESSING_PAYLOAD = "__END__" @@ -102,10 +101,9 @@ def _calculate_backoff_time(retry_count: int) -> int: retry_count: The current retry count (starting from 1). Returns: - The backoff time in seconds, capped at BACKOFF_MAX_SECONDS. + The backoff time in seconds. """ - backoff_time = BACKOFF_BASE_SECONDS * (2 ** (retry_count - 1)) - return min(backoff_time, BACKOFF_MAX_SECONDS) + return BACKOFF_BASE_SECONDS * (2 ** (retry_count - 1)) # Ignore `consume` too complex as it is pending re-design. diff --git a/github-runner-manager/tests/unit/reactive/test_consumer.py b/github-runner-manager/tests/unit/reactive/test_consumer.py index 99aafa272e..848891229c 100644 --- a/github-runner-manager/tests/unit/reactive/test_consumer.py +++ b/github-runner-manager/tests/unit/reactive/test_consumer.py @@ -495,16 +495,16 @@ def _assert_msg_has_been_requeued( pytest.param(1, 60, id="first retry - 60 seconds"), pytest.param(2, 120, id="second retry - 120 seconds"), pytest.param(3, 240, id="third retry - 240 seconds"), - pytest.param(4, 300, id="fourth retry - capped at max 300 seconds"), - pytest.param(5, 300, id="fifth retry - capped at max 300 seconds"), - pytest.param(10, 300, id="high retry count - capped at max 300 seconds"), + pytest.param(4, 480, id="fourth retry - 480 seconds"), + pytest.param(5, 960, id="fifth retry - 960 seconds"), + pytest.param(10, 30720, id="high retry count - 30720 seconds"), ], ) def test_calculate_backoff_time(retry_count: int, expected_backoff: int): """ arrange: Given a retry count. act: Call _calculate_backoff_time. - assert: The correct exponential backoff time is returned, capped at the maximum. + assert: The correct exponential backoff time is returned. """ from github_runner_manager.reactive.consumer import _calculate_backoff_time From 700ad60417c9086a949c1bb4916961e50340ccd3 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 16 Dec 2025 14:17:00 +0000 Subject: [PATCH 6/8] Add 1800s (30 min) cap to exponential backoff Co-authored-by: cbartz <4182921+cbartz@users.noreply.github.com> --- .../src/github_runner_manager/reactive/consumer.py | 6 ++++-- github-runner-manager/tests/unit/reactive/test_consumer.py | 5 +++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/github-runner-manager/src/github_runner_manager/reactive/consumer.py b/github-runner-manager/src/github_runner_manager/reactive/consumer.py index b70f29553a..76936a1062 100644 --- a/github-runner-manager/src/github_runner_manager/reactive/consumer.py +++ b/github-runner-manager/src/github_runner_manager/reactive/consumer.py @@ -31,6 +31,7 @@ RETRY_LIMIT = 5 # Exponential backoff configuration for message retries BACKOFF_BASE_SECONDS = 60 +BACKOFF_MAX_SECONDS = 1800 # This control message is for testing. The reactive process will stop consuming messages # when the message is sent. This message does not come from the router. END_PROCESSING_PAYLOAD = "__END__" @@ -101,9 +102,10 @@ def _calculate_backoff_time(retry_count: int) -> int: retry_count: The current retry count (starting from 1). Returns: - The backoff time in seconds. + The backoff time in seconds, capped at BACKOFF_MAX_SECONDS. """ - return BACKOFF_BASE_SECONDS * (2 ** (retry_count - 1)) + backoff_time = BACKOFF_BASE_SECONDS * (2 ** (retry_count - 1)) + return min(backoff_time, BACKOFF_MAX_SECONDS) # Ignore `consume` too complex as it is pending re-design. diff --git a/github-runner-manager/tests/unit/reactive/test_consumer.py b/github-runner-manager/tests/unit/reactive/test_consumer.py index 848891229c..abf3eb35c2 100644 --- a/github-runner-manager/tests/unit/reactive/test_consumer.py +++ b/github-runner-manager/tests/unit/reactive/test_consumer.py @@ -497,14 +497,15 @@ def _assert_msg_has_been_requeued( pytest.param(3, 240, id="third retry - 240 seconds"), pytest.param(4, 480, id="fourth retry - 480 seconds"), pytest.param(5, 960, id="fifth retry - 960 seconds"), - pytest.param(10, 30720, id="high retry count - 30720 seconds"), + pytest.param(6, 1800, id="sixth retry - capped at max 1800 seconds"), + pytest.param(10, 1800, id="high retry count - capped at max 1800 seconds"), ], ) def test_calculate_backoff_time(retry_count: int, expected_backoff: int): """ arrange: Given a retry count. act: Call _calculate_backoff_time. - assert: The correct exponential backoff time is returned. + assert: The correct exponential backoff time is returned, capped at the maximum. """ from github_runner_manager.reactive.consumer import _calculate_backoff_time From 55a0e29884525c3bc92b7abd7bf16a60823b6ad8 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 16 Dec 2025 14:22:09 +0000 Subject: [PATCH 7/8] Update changelog and bump version to 0.10.0 Co-authored-by: cbartz <4182921+cbartz@users.noreply.github.com> --- docs/changelog.md | 4 ++++ github-runner-manager/pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/changelog.md b/docs/changelog.md index 5aa851b0f2..a71691f15c 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -2,6 +2,10 @@ This changelog documents user-relevant changes to the GitHub runner charm. +## 2025-12-16 + +- Implemented exponential backoff strategy for reactive consumer message retries to reduce load on dependencies during sustained failures. The backoff starts at 60 seconds and doubles with each retry, capped at 1800 seconds (30 minutes). + ## 2025-12-10 - Removed apt update step in cloud-init of the VM creation step since it is now applied in the diff --git a/github-runner-manager/pyproject.toml b/github-runner-manager/pyproject.toml index e5646262d0..6348112c4e 100644 --- a/github-runner-manager/pyproject.toml +++ b/github-runner-manager/pyproject.toml @@ -3,7 +3,7 @@ [project] name = "github-runner-manager" -version = "0.9.0" +version = "0.10.0" authors = [ { name = "Canonical IS DevOps", email = "is-devops-team@canonical.com" }, ] From 58423b9f26f8ecc6eb0b1913726cef164b7e892a Mon Sep 17 00:00:00 2001 From: Christopher Bartz Date: Wed, 17 Dec 2025 13:17:22 +0100 Subject: [PATCH 8/8] Update CODEOWNERS --- CODEOWNERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CODEOWNERS b/CODEOWNERS index 42c75211ff..0aa95e7c45 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1 +1 @@ -* @cbartz @yhaliaw @javierdelapuente @yanksyoon +* @cbartz @yhaliaw @javierdelapuente @yanksyoon @florentianayuwono @weiiwang01