diff --git a/CODEOWNERS b/CODEOWNERS index 42c75211ff..0aa95e7c45 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1 +1 @@ -* @cbartz @yhaliaw @javierdelapuente @yanksyoon +* @cbartz @yhaliaw @javierdelapuente @yanksyoon @florentianayuwono @weiiwang01 diff --git a/docs/changelog.md b/docs/changelog.md index 5aa851b0f2..a71691f15c 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -2,6 +2,10 @@ This changelog documents user-relevant changes to the GitHub runner charm. +## 2025-12-16 + +- Implemented exponential backoff strategy for reactive consumer message retries to reduce load on dependencies during sustained failures. The backoff starts at 60 seconds and doubles with each retry, capped at 1800 seconds (30 minutes). + ## 2025-12-10 - Removed apt update step in cloud-init of the VM creation step since it is now applied in the diff --git a/github-runner-manager/pyproject.toml b/github-runner-manager/pyproject.toml index e5646262d0..6348112c4e 100644 --- a/github-runner-manager/pyproject.toml +++ b/github-runner-manager/pyproject.toml @@ -3,7 +3,7 @@ [project] name = "github-runner-manager" -version = "0.9.0" +version = "0.10.0" authors = [ { name = "Canonical IS DevOps", email = "is-devops-team@canonical.com" }, ] diff --git a/github-runner-manager/src/github_runner_manager/reactive/consumer.py b/github-runner-manager/src/github_runner_manager/reactive/consumer.py index 93601bbca4..76936a1062 100644 --- a/github-runner-manager/src/github_runner_manager/reactive/consumer.py +++ b/github-runner-manager/src/github_runner_manager/reactive/consumer.py @@ -29,6 +29,9 @@ PROCESS_COUNT_HEADER_NAME = "X-Process-Count" WAIT_TIME_IN_SEC = 60 RETRY_LIMIT = 5 +# Exponential backoff configuration for message retries +BACKOFF_BASE_SECONDS = 60 +BACKOFF_MAX_SECONDS = 1800 # This control message is for testing. The reactive process will stop consuming messages # when the message is sent. This message does not come from the router. END_PROCESSING_PAYLOAD = "__END__" @@ -92,6 +95,19 @@ def get_queue_size(queue_config: QueueConfig) -> int: raise QueueError("Error when communicating with the queue") from exc +def _calculate_backoff_time(retry_count: int) -> int: + """Calculate exponential backoff time for retries. + + Args: + retry_count: The current retry count (starting from 1). + + Returns: + The backoff time in seconds, capped at BACKOFF_MAX_SECONDS. + """ + backoff_time = BACKOFF_BASE_SECONDS * (2 ** (retry_count - 1)) + return min(backoff_time, BACKOFF_MAX_SECONDS) + + # Ignore `consume` too complex as it is pending re-design. def consume( # noqa: C901 queue_config: QueueConfig, @@ -146,11 +162,14 @@ def consume( # noqa: C901 continue if msg_process_count > 1: + backoff_time = _calculate_backoff_time(msg_process_count) logger.info( - "Pause job %s with retry count %s", job_details.url, msg_process_count + "Pause job %s with retry count %s for %s seconds (exponential backoff)", + job_details.url, + msg_process_count, + backoff_time, ) - # Avoid rapid retrying to prevent overloading services, e.g., OpenStack API. - sleep(WAIT_TIME_IN_SEC) + sleep(backoff_time) if not _validate_labels( labels=job_details.labels, supported_labels=supported_labels diff --git a/github-runner-manager/tests/unit/reactive/test_consumer.py b/github-runner-manager/tests/unit/reactive/test_consumer.py index 7ea21d0954..abf3eb35c2 100644 --- a/github-runner-manager/tests/unit/reactive/test_consumer.py +++ b/github-runner-manager/tests/unit/reactive/test_consumer.py @@ -331,7 +331,7 @@ def test_consume_retried_job_success(queue_config: QueueConfig, mock_sleep: Magi arrange: A job placed in the message queue which is processed before. act: Call consume. assert: A runner is spawned, the message is removed from the queue, and sleep is called two - times. + times with exponential backoff for the retry and normal wait time for spawn check. """ labels = {secrets.token_hex(16), secrets.token_hex(16)} job_details = consumer.JobDetails( @@ -357,14 +357,16 @@ def test_consume_retried_job_success(queue_config: QueueConfig, mock_sleep: Magi _assert_queue_is_empty(queue_config.queue_name) - mock_sleep.assert_has_calls([mock.call(WAIT_TIME_IN_SEC), mock.call(WAIT_TIME_IN_SEC)]) + # First sleep is exponential backoff for retry (count=2: 120s), + # second is from _spawn_runner (60s) + mock_sleep.assert_has_calls([mock.call(120), mock.call(WAIT_TIME_IN_SEC)]) def test_consume_retried_job_failure(queue_config: QueueConfig, mock_sleep: MagicMock): """ arrange: A job placed in the message queue which is processed before. Mock runner spawn fail. act: Call consume. - assert: The message requeued. Sleep called once. + assert: The message requeued. Sleep called once with exponential backoff. """ labels = {secrets.token_hex(16), secrets.token_hex(16)} job_details = consumer.JobDetails( @@ -392,7 +394,8 @@ def test_consume_retried_job_failure(queue_config: QueueConfig, mock_sleep: Magi queue_config.queue_name, job_details.json(), headers={PROCESS_COUNT_HEADER_NAME: 2} ) - mock_sleep.assert_called_once_with(WAIT_TIME_IN_SEC) + # Sleep with exponential backoff for retry count 2: 120 seconds + mock_sleep.assert_called_once_with(120) def test_consume_retried_job_failure_past_limit(queue_config: QueueConfig, mock_sleep: MagicMock): @@ -484,3 +487,26 @@ def _assert_msg_has_been_requeued( assert msg.payload == payload if headers is not None: assert msg.headers == headers + + +@pytest.mark.parametrize( + "retry_count,expected_backoff", + [ + pytest.param(1, 60, id="first retry - 60 seconds"), + pytest.param(2, 120, id="second retry - 120 seconds"), + pytest.param(3, 240, id="third retry - 240 seconds"), + pytest.param(4, 480, id="fourth retry - 480 seconds"), + pytest.param(5, 960, id="fifth retry - 960 seconds"), + pytest.param(6, 1800, id="sixth retry - capped at max 1800 seconds"), + pytest.param(10, 1800, id="high retry count - capped at max 1800 seconds"), + ], +) +def test_calculate_backoff_time(retry_count: int, expected_backoff: int): + """ + arrange: Given a retry count. + act: Call _calculate_backoff_time. + assert: The correct exponential backoff time is returned, capped at the maximum. + """ + from github_runner_manager.reactive.consumer import _calculate_backoff_time + + assert _calculate_backoff_time(retry_count) == expected_backoff