From 059b2f8576244f8a22a17f5f0b8697cac23f157d Mon Sep 17 00:00:00 2001 From: Morgan Wowk Date: Wed, 17 Jun 2026 17:38:25 -0700 Subject: [PATCH] kubernetes: suppress tqdm progress bars in all container pods MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Inject TQDM_DISABLE=1 and HF_DATASETS_DISABLE_PROGRESS_BARS=1 into every Kubernetes container's environment unless the component has already set those keys explicitly (user values take precedence). High-volume tqdm block-glyph output (█▉▊▋▌▍▎▏, 3-byte UTF-8) from concurrent HF datasets workers (num_proc>1) is the dominant source of non-ASCII bytes in pod log streams. Eliminating the glyphs at the source makes the log stream pure ASCII for tokenization/packing phases, removing any possibility of torn multi-byte sequences reaching the Kubernetes API read path regardless of the defensive decode added in the previous commit. Side effect: log sizes for heavy tokenization jobs drop significantly (observed ~6 MB → tens of KB), since tqdm progress bars account for the bulk of the raw byte volume. --- .../launchers/kubernetes_launchers.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/cloud_pipelines_backend/launchers/kubernetes_launchers.py b/cloud_pipelines_backend/launchers/kubernetes_launchers.py index 6c481baa..a0108eed 100644 --- a/cloud_pipelines_backend/launchers/kubernetes_launchers.py +++ b/cloud_pipelines_backend/launchers/kubernetes_launchers.py @@ -69,6 +69,16 @@ # Environment variables for multi-node execution. _MULTI_NODE_NODE_INDEX_ENV_VAR_NAME = "_TANGLE_MULTI_NODE_NODE_INDEX" +# Environment variables injected into every container to suppress tqdm progress +# bar output. High-volume block-glyph writes (3-byte UTF-8: █▉▊▋▌▍▎▏) from +# concurrent worker processes interleave at the OS level, producing torn +# multi-byte sequences in the pod log stream that cause UnicodeDecodeError. +# Components may override these by setting the same keys in their own env. +_TQDM_SUPPRESS_ENV_VARS: dict[str, str] = { + "TQDM_DISABLE": "1", + "HF_DATASETS_DISABLE_PROGRESS_BARS": "1", +} + _T = typing.TypeVar("_T") @@ -352,6 +362,10 @@ def get_output_path(output_name: str) -> str: k8s_client_lib.V1EnvVar(name=name, value=value) for name, value in (container_spec.env or {}).items() ] + user_env_names = {env.name for env in container_env} + for name, value in _TQDM_SUPPRESS_ENV_VARS.items(): + if name not in user_env_names: + container_env.append(k8s_client_lib.V1EnvVar(name=name, value=value)) main_container_spec = k8s_client_lib.V1Container( name=_MAIN_CONTAINER_NAME, image=container_spec.image,