canonical · cbartz · Dec 17, 2025 · Dec 16, 2025 · Dec 16, 2025 · Dec 16, 2025
@@ -1 +1 @@
-*       @cbartz @yhaliaw @javierdelapuente @yanksyoon
+*       @cbartz @yhaliaw @javierdelapuente @yanksyoon @florentianayuwono @weiiwang01
@@ -2,6 +2,10 @@
 
 This changelog documents user-relevant changes to the GitHub runner charm.
 
+## 2025-12-16
+
+- Implemented exponential backoff strategy for reactive consumer message retries to reduce load on dependencies during sustained failures. The backoff starts at 60 seconds and doubles with each retry, capped at 1800 seconds (30 minutes).
+
 ## 2025-12-10
 
 - Removed apt update step in cloud-init of the VM creation step since it is now applied in the

@@ -3,7 +3,7 @@
 
 [project]
 name = "github-runner-manager"
-version = "0.9.0"
+version = "0.10.0"
 authors = [
     { name = "Canonical IS DevOps", email = "is-devops-team@canonical.com" },
 ]

@@ -29,6 +29,9 @@
 PROCESS_COUNT_HEADER_NAME = "X-Process-Count"
 WAIT_TIME_IN_SEC = 60
 RETRY_LIMIT = 5
+# Exponential backoff configuration for message retries
+BACKOFF_BASE_SECONDS = 60
+BACKOFF_MAX_SECONDS = 1800
 # This control message is for testing. The reactive process will stop consuming messages
 # when the message is sent. This message does not come from the router.
 END_PROCESSING_PAYLOAD = "__END__"
@@ -92,6 +95,19 @@ def get_queue_size(queue_config: QueueConfig) -> int:
         raise QueueError("Error when communicating with the queue") from exc
 
 
+def _calculate_backoff_time(retry_count: int) -> int:
+    """Calculate exponential backoff time for retries.
+
+    Args:
+        retry_count: The current retry count (starting from 1).
+
+    Returns:
+        The backoff time in seconds, capped at BACKOFF_MAX_SECONDS.
+    """
+    backoff_time = BACKOFF_BASE_SECONDS * (2 ** (retry_count - 1))
+    return min(backoff_time, BACKOFF_MAX_SECONDS)
+
+
 # Ignore `consume` too complex as it is pending re-design.
 def consume(  # noqa: C901
     queue_config: QueueConfig,
@@ -146,11 +162,14 @@ def consume(  # noqa: C901
                     continue
 
                 if msg_process_count > 1:
+                    backoff_time = _calculate_backoff_time(msg_process_count)
                     logger.info(
-                        "Pause job %s with retry count %s", job_details.url, msg_process_count
+                        "Pause job %s with retry count %s for %s seconds (exponential backoff)",
+                        job_details.url,
+                        msg_process_count,
+                        backoff_time,
                     )
-                    # Avoid rapid retrying to prevent overloading services, e.g., OpenStack API.
-                    sleep(WAIT_TIME_IN_SEC)
+                    sleep(backoff_time)
 
                 if not _validate_labels(
                     labels=job_details.labels, supported_labels=supported_labels

@@ -331,7 +331,7 @@ def test_consume_retried_job_success(queue_config: QueueConfig, mock_sleep: Magi
     arrange: A job placed in the message queue which is processed before.
     act: Call consume.
     assert: A runner is spawned, the message is removed from the queue, and sleep is called two
-        times.
+        times with exponential backoff for the retry and normal wait time for spawn check.
     """
     labels = {secrets.token_hex(16), secrets.token_hex(16)}
     job_details = consumer.JobDetails(
@@ -357,14 +357,16 @@ def test_consume_retried_job_success(queue_config: QueueConfig, mock_sleep: Magi
 
     _assert_queue_is_empty(queue_config.queue_name)
 
-    mock_sleep.assert_has_calls([mock.call(WAIT_TIME_IN_SEC), mock.call(WAIT_TIME_IN_SEC)])
+    # First sleep is exponential backoff for retry (count=2: 120s),
+    # second is from _spawn_runner (60s)
+    mock_sleep.assert_has_calls([mock.call(120), mock.call(WAIT_TIME_IN_SEC)])
 
 
 def test_consume_retried_job_failure(queue_config: QueueConfig, mock_sleep: MagicMock):
     """
     arrange: A job placed in the message queue which is processed before. Mock runner spawn fail.
     act: Call consume.
-    assert: The message requeued. Sleep called once.
+    assert: The message requeued. Sleep called once with exponential backoff.
     """
     labels = {secrets.token_hex(16), secrets.token_hex(16)}
     job_details = consumer.JobDetails(
@@ -392,7 +394,8 @@ def test_consume_retried_job_failure(queue_config: QueueConfig, mock_sleep: Magi
         queue_config.queue_name, job_details.json(), headers={PROCESS_COUNT_HEADER_NAME: 2}
     )
 
-    mock_sleep.assert_called_once_with(WAIT_TIME_IN_SEC)
+    # Sleep with exponential backoff for retry count 2: 120 seconds
+    mock_sleep.assert_called_once_with(120)
 
 
 def test_consume_retried_job_failure_past_limit(queue_config: QueueConfig, mock_sleep: MagicMock):
@@ -484,3 +487,26 @@ def _assert_msg_has_been_requeued(
             assert msg.payload == payload
             if headers is not None:
                 assert msg.headers == headers
+
+
+@pytest.mark.parametrize(
+    "retry_count,expected_backoff",
+    [
+        pytest.param(1, 60, id="first retry - 60 seconds"),
+        pytest.param(2, 120, id="second retry - 120 seconds"),
+        pytest.param(3, 240, id="third retry - 240 seconds"),
+        pytest.param(4, 480, id="fourth retry - 480 seconds"),
+        pytest.param(5, 960, id="fifth retry - 960 seconds"),
+        pytest.param(6, 1800, id="sixth retry - capped at max 1800 seconds"),
+        pytest.param(10, 1800, id="high retry count - capped at max 1800 seconds"),
+    ],
+)
+def test_calculate_backoff_time(retry_count: int, expected_backoff: int):
+    """
+    arrange: Given a retry count.
+    act: Call _calculate_backoff_time.
+    assert: The correct exponential backoff time is returned, capped at the maximum.
+    """
+    from github_runner_manager.reactive.consumer import _calculate_backoff_time
+
+    assert _calculate_backoff_time(retry_count) == expected_backoff
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		* @cbartz @yhaliaw @javierdelapuente @yanksyoon
		* @cbartz @yhaliaw @javierdelapuente @yanksyoon @florentianayuwono @weiiwang01
cbartz marked this conversation as resolved. Show resolved Hide resolved