|
20 | 20 |
|
21 | 21 | from kubeconfig import kubectl |
22 | 22 | from openshift.dynamic import DynamicClient |
23 | | -from openshift.dynamic.exceptions import NotFoundError, UnprocessibleEntityError |
| 23 | +from openshift.dynamic.exceptions import NotFoundError, UnprocessibleEntityError, ApiException |
24 | 24 |
|
25 | 25 | from jinja2 import Environment, FileSystemLoader |
26 | 26 |
|
@@ -1152,7 +1152,54 @@ def prepareInstallRBAC(dynClient: DynamicClient, namespace: str, instanceId: str |
1152 | 1152 |
|
1153 | 1153 | logger.debug(f"Applying RBAC resource {kind}/{name} in namespace {namespace} for instance {instanceId}") |
1154 | 1154 | resourceAPI = dynClient.resources.get(api_version=apiVersion, kind=kind) |
1155 | | - if namespace: |
1156 | | - resourceAPI.apply(body=resourceBody, namespace=namespace) |
1157 | | - else: |
1158 | | - resourceAPI.apply(body=resourceBody) |
| 1155 | + |
| 1156 | + # Optimized retry logic for transient API server errors |
| 1157 | + max_retries = 10 # Reduced from 30 to 10 retries |
| 1158 | + base_delay = 1 # Reduced initial delay from 2s to 1s |
| 1159 | + max_delay = 15 # Reduced max delay from 30s to 15s |
| 1160 | + |
| 1161 | + for attempt in range(max_retries): |
| 1162 | + try: |
| 1163 | + if namespace: |
| 1164 | + resourceAPI.apply(body=resourceBody, namespace=namespace) |
| 1165 | + else: |
| 1166 | + resourceAPI.apply(body=resourceBody) |
| 1167 | + |
| 1168 | + # Log success only if there were previous failures |
| 1169 | + if attempt > 0: |
| 1170 | + logger.info(f"Successfully applied {kind}/{name} after {attempt + 1} attempts") |
| 1171 | + break # Success, exit retry loop |
| 1172 | + |
| 1173 | + except ApiException as e: |
| 1174 | + # Check if it's a retryable error (429, 503, 504, or API server shutdown) |
| 1175 | + is_retryable = (e.status in [429, 503, 504] or "apiserver is shutting down" in str(e).lower() or "connection refused" in str(e).lower() or "too many requests" in str(e).lower()) |
| 1176 | + |
| 1177 | + if is_retryable and attempt < max_retries - 1: |
| 1178 | + # Exponential backoff with jitter to avoid thundering herd |
| 1179 | + import random |
| 1180 | + wait_time = min(base_delay * (2 ** attempt), max_delay) |
| 1181 | + jitter = random.uniform(0, 0.1 * wait_time) # Add up to 10% jitter |
| 1182 | + total_wait = wait_time + jitter |
| 1183 | + |
| 1184 | + logger.warning( |
| 1185 | + f"API server temporarily unavailable for {kind}/{name} " |
| 1186 | + f"(attempt {attempt + 1}/{max_retries}, status: {e.status}). " |
| 1187 | + f"Retrying in {total_wait:.1f}s..." |
| 1188 | + ) |
| 1189 | + sleep(total_wait) |
| 1190 | + elif is_retryable: |
| 1191 | + # Exhausted all retries |
| 1192 | + logger.error( |
| 1193 | + f"Failed to apply RBAC resource {kind}/{name} after {max_retries} attempts. " |
| 1194 | + f"API server may be unavailable. Last error: {e.status} - {str(e)[:200]}" |
| 1195 | + ) |
| 1196 | + raise |
| 1197 | + else: |
| 1198 | + # Non-retryable error (permissions, invalid resource, etc.) |
| 1199 | + logger.error(f"Failed to apply RBAC resource {kind}/{name}: {e.status} - {str(e)[:200]}") |
| 1200 | + raise |
| 1201 | + |
| 1202 | + except Exception as e: |
| 1203 | + # Catch any other unexpected errors |
| 1204 | + logger.error(f"Unexpected error applying RBAC resource {kind}/{name}: {type(e).__name__} - {str(e)[:200]}") |
| 1205 | + raise |
0 commit comments