From fe11a6babcc7ef6d4f3a189eb2adcc27350f0d13 Mon Sep 17 00:00:00 2001 From: Morgan Wowk Date: Wed, 17 Jun 2026 15:52:43 -0700 Subject: [PATCH] kubernetes: decode pod logs with errors=replace to survive torn UTF-8 bytes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Kubernetes client's default read_namespaced_pod_log path does a strict .decode('utf8') over the full log payload before checking HTTP status. When a pod with high-volume tqdm progress bars (block glyphs █▉▊▋▌▍▎▏, 3-byte UTF-8) runs with num_proc>1, concurrent writes to the same fd can split a multi-byte glyph across a chunk boundary, leaving an orphaned continuation byte. The strict decode throws UnicodeDecodeError, which bubbles through the log-upload retry wrapper and marks an otherwise-healthy training run as SYSTEM_ERROR. Fix: pass _preload_content=False to get the raw urllib3 response and decode manually with errors="replace". This is applied to both the single-pod (LaunchedKubernetesContainer.get_log) and multi-pod (LaunchedKubernetesJob._get_log_by_pod_key) log-read paths. A warning is logged whenever replacement characters are injected, so the next occurrence is observable in Observe without requiring a separate debug build. The existing "Bad Request" catch for PodInitializing is unaffected: the kubernetes client's status check runs outside the _preload_content block and still raises ApiException with the correct reason phrase. --- .../launchers/kubernetes_launchers.py | 29 +++++++++++++++++-- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/cloud_pipelines_backend/launchers/kubernetes_launchers.py b/cloud_pipelines_backend/launchers/kubernetes_launchers.py index 6c481baa..25cd267b 100644 --- a/cloud_pipelines_backend/launchers/kubernetes_launchers.py +++ b/cloud_pipelines_backend/launchers/kubernetes_launchers.py @@ -922,7 +922,8 @@ def get_refreshed(self) -> "LaunchedKubernetesContainer": def get_log(self) -> str: launcher = self._get_launcher() core_api_client = k8s_client_lib.CoreV1Api(api_client=launcher._api_client) - return core_api_client.read_namespaced_pod_log( + # _preload_content=False bypasses the kubernetes client's strict .decode('utf8'); see _get_log_by_pod_key. + response = core_api_client.read_namespaced_pod_log( name=self._pod_name, namespace=self._namespace, container=_MAIN_CONTAINER_NAME, @@ -931,7 +932,17 @@ def get_log(self) -> str: # HTTP response body: {"kind":"Status","apiVersion":"v1","metadata":{},"status":"Failure","message":"PodLogOptions \"task-pod-xxxxx\" is invalid: stream: Forbidden: may not be specified","reason":"Invalid","details":{"name":"task-pod-xxxxx","kind":"PodLogOptions","causes":[{"reason":"FieldValueForbidden","message":"Forbidden: may not be specified","field":"stream"}]},"code":422} # stream="All", _request_timeout=launcher._request_timeout, + _preload_content=False, ) + try: + log = response.data.decode("utf-8", errors="replace") + if "\N{REPLACEMENT CHARACTER}" in log: + _logger.warning( + f"Pod log for {self._pod_name} contained invalid UTF-8 bytes; substituted replacement characters." + ) + return log + finally: + response.release_conn() def upload_log(self): launcher = self._get_launcher() @@ -1490,14 +1501,26 @@ def _get_log_by_pod_key(self, pod_name: str) -> str | None: launcher = self._get_launcher() core_api_client = k8s_client_lib.CoreV1Api(api_client=launcher._api_client) try: - log = core_api_client.read_namespaced_pod_log( + # _preload_content=False bypasses the kubernetes client's strict .decode('utf8'), + # which would raise UnicodeDecodeError on torn multi-byte chars in pod logs (e.g. + # tqdm block glyphs split across concurrent-writer chunk boundaries). + response = core_api_client.read_namespaced_pod_log( name=pod_name, namespace=self._namespace, container=_MAIN_CONTAINER_NAME, timestamps=True, _request_timeout=launcher._request_timeout, + _preload_content=False, ) - return log + try: + log = response.data.decode("utf-8", errors="replace") + if "\N{REPLACEMENT CHARACTER}" in log: + _logger.warning( + f"Pod log for {pod_name} contained invalid UTF-8 bytes; substituted replacement characters." + ) + return log + finally: + response.release_conn() except kubernetes.client.exceptions.ApiException as ex: if ex.reason == "Bad Request": # Kubernetes client raises kubernetes.client.exceptions.ApiException when Pod is still in PodInitializing phase