Skip to content

Commit b95dba1

Browse files
[patch] Retry for rbac permissions when timeout
1 parent 184bf9b commit b95dba1

1 file changed

Lines changed: 49 additions & 5 deletions

File tree

src/mas/devops/tekton.py

Lines changed: 49 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020

2121
from kubeconfig import kubectl
2222
from openshift.dynamic import DynamicClient
23-
from openshift.dynamic.exceptions import NotFoundError, UnprocessibleEntityError
23+
from openshift.dynamic.exceptions import NotFoundError, UnprocessibleEntityError, ApiException
2424

2525
from jinja2 import Environment, FileSystemLoader
2626

@@ -1152,7 +1152,51 @@ def prepareInstallRBAC(dynClient: DynamicClient, namespace: str, instanceId: str
11521152

11531153
logger.debug(f"Applying RBAC resource {kind}/{name} in namespace {namespace} for instance {instanceId}")
11541154
resourceAPI = dynClient.resources.get(api_version=apiVersion, kind=kind)
1155-
if namespace:
1156-
resourceAPI.apply(body=resourceBody, namespace=namespace)
1157-
else:
1158-
resourceAPI.apply(body=resourceBody)
1155+
1156+
# Retry logic for transient API server errors (capped exponential backoff; the waits between attempts total ~13 minutes)
1157+
max_retries = 30 # 30 attempts; with a 2s initial delay and 30s cap the 29 waits total 2+4+8+16 + 25*30 = 780s (~13 minutes)
1158+
retry_delay = 2 # Initial delay in seconds
1159+
max_delay = 30 # Maximum delay between retries
1160+
1161+
for attempt in range(max_retries):
1162+
try:
1163+
if namespace:
1164+
resourceAPI.apply(body=resourceBody, namespace=namespace)
1165+
else:
1166+
resourceAPI.apply(body=resourceBody)
1167+
break # Success, exit retry loop
1168+
1169+
except ApiException as e:
1170+
# Check if it's a retryable error: HTTP 429, 503 or 504, or an error message indicating API server shutdown, connection refused, or too many requests
1171+
is_retryable = (
1172+
e.status in [429, 503, 504] or
1173+
"apiserver is shutting down" in str(e).lower() or
1174+
"connection refused" in str(e).lower() or
1175+
"too many requests" in str(e).lower()
1176+
)
1177+
1178+
if is_retryable and attempt < max_retries - 1:
1179+
# Exponential backoff with max delay cap
1180+
wait_time = min(retry_delay * (2 ** attempt), max_delay)
1181+
logger.warning(
1182+
f"API server temporarily unavailable for {kind}/{name} "
1183+
f"(attempt {attempt + 1}/{max_retries}). "
1184+
f"Retrying in {wait_time}s... Error: {e.status} - {str(e)[:100]}"
1185+
)
1186+
sleep(wait_time)
1187+
elif is_retryable:
1188+
# Exhausted all retries
1189+
logger.error(
1190+
f"Failed to apply RBAC resource {kind}/{name} after {max_retries} attempts "
1191+
f"(~5 minutes). API server may be unavailable."
1192+
)
1193+
raise
1194+
else:
1195+
# Non-retryable error (permissions, invalid resource, etc.)
1196+
logger.error(f"Failed to apply RBAC resource {kind}/{name}: {e}")
1197+
raise
1198+
1199+
except Exception as e:
1200+
# Catch any other unexpected errors
1201+
logger.error(f"Unexpected error applying RBAC resource {kind}/{name}: {e}")
1202+
raise

0 commit comments

Comments
 (0)