Skip to content
Merged
2 changes: 1 addition & 1 deletion CODEOWNERS
Original file line number Diff line number Diff line change
@@ -1 +1 @@
* @cbartz @yhaliaw @javierdelapuente @yanksyoon
* @cbartz @yhaliaw @javierdelapuente @yanksyoon @florentianayuwono @weiiwang01
4 changes: 4 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

This changelog documents user-relevant changes to the GitHub runner charm.

## 2025-12-16

- Implemented exponential backoff strategy for reactive consumer message retries to reduce load on dependencies during sustained failures. The backoff starts at 60 seconds and doubles with each retry, capped at 1800 seconds (30 minutes).

Check failure on line 7 in docs/changelog.md

View workflow job for this annotation

GitHub Actions / docs-checks / vale

[vale] reported by reviewdog 🐶 [Canonical.000-US-spellcheck] The word 'backoff' seems to be misspelled. Raw Output: {"message": "[Canonical.000-US-spellcheck] The word 'backoff' seems to be misspelled.", "location": {"path": "docs/changelog.md", "range": {"start": {"line": 7, "column": 144}}}, "severity": "ERROR"}

Check failure on line 7 in docs/changelog.md

View workflow job for this annotation

GitHub Actions / docs-checks / vale

[vale] reported by reviewdog 🐶 [Canonical.000-US-spellcheck] The word 'backoff' seems to be misspelled. Raw Output: {"message": "[Canonical.000-US-spellcheck] The word 'backoff' seems to be misspelled.", "location": {"path": "docs/changelog.md", "range": {"start": {"line": 7, "column": 27}}}, "severity": "ERROR"}

## 2025-12-10

- Removed apt update step in cloud-init of the VM creation step since it is now applied in the
Expand Down
2 changes: 1 addition & 1 deletion github-runner-manager/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

[project]
name = "github-runner-manager"
version = "0.9.0"
version = "0.10.0"
authors = [
{ name = "Canonical IS DevOps", email = "is-devops-team@canonical.com" },
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@
PROCESS_COUNT_HEADER_NAME = "X-Process-Count"
WAIT_TIME_IN_SEC = 60
RETRY_LIMIT = 5
# Exponential backoff configuration for message retries
BACKOFF_BASE_SECONDS = 60
BACKOFF_MAX_SECONDS = 1800
# This control message is for testing. The reactive process will stop consuming messages
# when the message is sent. This message does not come from the router.
END_PROCESSING_PAYLOAD = "__END__"
Expand Down Expand Up @@ -92,6 +95,19 @@ def get_queue_size(queue_config: QueueConfig) -> int:
raise QueueError("Error when communicating with the queue") from exc


def _calculate_backoff_time(retry_count: int) -> int:
"""Calculate exponential backoff time for retries.

Args:
retry_count: The current retry count (starting from 1).

Returns:
The backoff time in seconds, capped at BACKOFF_MAX_SECONDS.
"""
backoff_time = BACKOFF_BASE_SECONDS * (2 ** (retry_count - 1))
return min(backoff_time, BACKOFF_MAX_SECONDS)


# Ignore `consume` too complex as it is pending re-design.
def consume( # noqa: C901
queue_config: QueueConfig,
Expand Down Expand Up @@ -146,11 +162,14 @@ def consume( # noqa: C901
continue

if msg_process_count > 1:
backoff_time = _calculate_backoff_time(msg_process_count)
logger.info(
"Pause job %s with retry count %s", job_details.url, msg_process_count
"Pause job %s with retry count %s for %s seconds (exponential backoff)",
job_details.url,
msg_process_count,
backoff_time,
)
# Avoid rapid retrying to prevent overloading services, e.g., OpenStack API.
sleep(WAIT_TIME_IN_SEC)
sleep(backoff_time)

if not _validate_labels(
labels=job_details.labels, supported_labels=supported_labels
Expand Down
34 changes: 30 additions & 4 deletions github-runner-manager/tests/unit/reactive/test_consumer.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,7 @@ def test_consume_retried_job_success(queue_config: QueueConfig, mock_sleep: Magi
arrange: A job placed in the message queue which is processed before.
act: Call consume.
assert: A runner is spawned, the message is removed from the queue, and sleep is called two
times.
times with exponential backoff for the retry and normal wait time for spawn check.
"""
labels = {secrets.token_hex(16), secrets.token_hex(16)}
job_details = consumer.JobDetails(
Expand All @@ -357,14 +357,16 @@ def test_consume_retried_job_success(queue_config: QueueConfig, mock_sleep: Magi

_assert_queue_is_empty(queue_config.queue_name)

mock_sleep.assert_has_calls([mock.call(WAIT_TIME_IN_SEC), mock.call(WAIT_TIME_IN_SEC)])
# First sleep is exponential backoff for retry (count=2: 120s),
# second is from _spawn_runner (60s)
mock_sleep.assert_has_calls([mock.call(120), mock.call(WAIT_TIME_IN_SEC)])


def test_consume_retried_job_failure(queue_config: QueueConfig, mock_sleep: MagicMock):
"""
arrange: A job placed in the message queue which is processed before. Mock runner spawn fail.
act: Call consume.
assert: The message requeued. Sleep called once.
assert: The message requeued. Sleep called once with exponential backoff.
"""
labels = {secrets.token_hex(16), secrets.token_hex(16)}
job_details = consumer.JobDetails(
Expand Down Expand Up @@ -392,7 +394,8 @@ def test_consume_retried_job_failure(queue_config: QueueConfig, mock_sleep: Magi
queue_config.queue_name, job_details.json(), headers={PROCESS_COUNT_HEADER_NAME: 2}
)

mock_sleep.assert_called_once_with(WAIT_TIME_IN_SEC)
# Sleep with exponential backoff for retry count 2: 120 seconds
mock_sleep.assert_called_once_with(120)


def test_consume_retried_job_failure_past_limit(queue_config: QueueConfig, mock_sleep: MagicMock):
Expand Down Expand Up @@ -484,3 +487,26 @@ def _assert_msg_has_been_requeued(
assert msg.payload == payload
if headers is not None:
assert msg.headers == headers


@pytest.mark.parametrize(
"retry_count,expected_backoff",
[
pytest.param(1, 60, id="first retry - 60 seconds"),
pytest.param(2, 120, id="second retry - 120 seconds"),
pytest.param(3, 240, id="third retry - 240 seconds"),
pytest.param(4, 480, id="fourth retry - 480 seconds"),
pytest.param(5, 960, id="fifth retry - 960 seconds"),
pytest.param(6, 1800, id="sixth retry - capped at max 1800 seconds"),
pytest.param(10, 1800, id="high retry count - capped at max 1800 seconds"),
],
)
def test_calculate_backoff_time(retry_count: int, expected_backoff: int):
"""
arrange: Given a retry count.
act: Call _calculate_backoff_time.
assert: The correct exponential backoff time is returned, capped at the maximum.
"""
from github_runner_manager.reactive.consumer import _calculate_backoff_time

assert _calculate_backoff_time(retry_count) == expected_backoff
Loading