From 9de2b3789a11c8cf9a1d37e145e93203f5475300 Mon Sep 17 00:00:00 2001
From: ActivePeter <1020401660@qq.com>
Date: Sat, 20 Jun 2026 13:45:44 +0800
Subject: [PATCH 01/13] fix: doc page url in readme
---
README.md | 25 +++++++++++++------------
README_CN.md | 25 +++++++++++++------------
2 files changed, 26 insertions(+), 24 deletions(-)
diff --git a/README.md b/README.md
index 53a2110..3f4cd2f 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@
[](./fluxon_release)
[](#interface-capabilities)
-[English](./README.md) | [中文](./README_CN.md) | [Docs](https://tele-ai.github.io/fluxon/) | [中文文档](https://tele-ai.github.io/fluxon/cn/) |
+[English](./README.md) | [中文](./README_CN.md) | [Docs](https://tele-ai.github.io/Fluxon/) | [中文文档](https://tele-ai.github.io/Fluxon/cn/) |
@@ -145,7 +145,7 @@ The benchmark results show that small-file reads and large-file writes are alrea
## 🚀 Quick Start
-Quick Start is the shortest path to try Fluxon. For formal installation, deployment, and operations, see [User Docs](https://tele-ai.github.io/fluxon/user_doc/).
+Quick Start is the shortest path to try Fluxon. For formal installation, deployment, and operations, see [User Docs](https://tele-ai.github.io/Fluxon/user_doc/).
### KV Quick Start
@@ -178,7 +178,7 @@ Open the printed link to view the KV Web UI:
Related interface docs:
-- [KV and RPC Interface](https://tele-ai.github.io/fluxon/user_doc/User---3---KV-and-RPC-Interface/)
+- [KV and RPC Interface](https://tele-ai.github.io/Fluxon/user_doc/User---3---KV-and-RPC-Interface/)
### MQ Quick Start
@@ -209,7 +209,7 @@ Runtime view:
Related interface docs:
-- [MQ Interface](https://tele-ai.github.io/fluxon/user_doc/User---4---MQ-Interface/)
+- [MQ Interface](https://tele-ai.github.io/Fluxon/user_doc/User---4---MQ-Interface/)
### FS Quick Start
@@ -247,7 +247,7 @@ Open the printed link to view the FS Web UI:
Related interface docs:
-- [FS Interface](https://tele-ai.github.io/fluxon/user_doc/User---5---FS-Interface/)
+- [FS Interface](https://tele-ai.github.io/Fluxon/user_doc/User---5---FS-Interface/)
@@ -267,17 +267,18 @@ Related interface docs:
Contributions are welcome. Before you start, please read the developer docs on GitHub Pages:
-- [Developer Docs](https://tele-ai.github.io/fluxon/dev_doc/)
-- [Developer - 1 - Package core install artifacts](https://tele-ai.github.io/fluxon/dev_doc/Developer---1---Package-Core-Install-Artifacts/)
-- [Developer - 2 - Package middleware and images](https://tele-ai.github.io/fluxon/dev_doc/Developer---2---Package-Middleware-and-Images/)
-- [Developer - 4 - Publish a release](https://tele-ai.github.io/fluxon/dev_doc/Developer---4---Publish-a-Release/)
+- [Developer Docs](https://tele-ai.github.io/Fluxon/dev_doc/)
+- [Developer - 1 - Package core install artifacts](https://tele-ai.github.io/Fluxon/dev_doc/Developer---1---Package-Core-Install-Artifacts/)
+- [Developer - 2 - Package middleware and images](https://tele-ai.github.io/Fluxon/dev_doc/Developer---2---Package-Middleware-and-Images/)
+- [Developer - 3 - Documentation writing rules](https://tele-ai.github.io/Fluxon/dev_doc/Developer---3---Documentation-Writing-Rules/)
+- [Developer - 4 - Publish a release](https://tele-ai.github.io/Fluxon/dev_doc/Developer---4---Publish-a-Release/)
## 👥 Contributors
-
-
+
+
Some earlier contribution records are no longer fully reflected in the current commit history. Historical highlights:
@@ -312,4 +313,4 @@ Fluxon is open-sourced under Apache License 2.0, see [LICENSE](./LICENSE).
## ⭐ Stargazers over time
-[](https://www.star-history.com/?repos=Tele-AI%2Ffluxon&type=date&legend=top-left)
+[](https://www.star-history.com/?repos=Tele-AI%2FFluxon&type=date&legend=top-left)
diff --git a/README_CN.md b/README_CN.md
index 715511b..da2cdb4 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -20,7 +20,7 @@
[](./fluxon_release)
[](#接口能力)
-[中文](./README_CN.md) | [English](./README.md) | [用户文档](https://tele-ai.github.io/fluxon/cn/) | [English Docs](https://tele-ai.github.io/fluxon/) |
+[中文](./README_CN.md) | [English](./README.md) | [用户文档](https://tele-ai.github.io/Fluxon/cn/) | [English Docs](https://tele-ai.github.io/Fluxon/) |
@@ -147,7 +147,7 @@ benchmark 显示,小文件读和大文件写已显著领先 `Alluxio`,大文
## 🚀 快速开始
-Quick Start 用于最短路径体验;正式安装、部署和运维入口见 [用户文档](https://tele-ai.github.io/fluxon/cn/user_doc/)。
+Quick Start 用于最短路径体验;正式安装、部署和运维入口见 [用户文档](https://tele-ai.github.io/Fluxon/cn/user_doc/)。
### KV 快速开始
@@ -180,7 +180,7 @@ del demo:hello
对应接口文档:
-- [KV 和 RPC 接口](https://tele-ai.github.io/fluxon/cn/user_doc/%E7%94%A8%E6%88%B7---3---KV-RPC%E6%8E%A5%E5%8F%A3/)
+- [KV 和 RPC 接口](https://tele-ai.github.io/Fluxon/cn/user_doc/%E7%94%A8%E6%88%B7---3---KV-RPC%E6%8E%A5%E5%8F%A3/)
### MQ 快速开始
@@ -211,7 +211,7 @@ exit
对应接口文档:
-- [MQ 接口](https://tele-ai.github.io/fluxon/cn/user_doc/%E7%94%A8%E6%88%B7---4---MQ%E6%8E%A5%E5%8F%A3/)
+- [MQ 接口](https://tele-ai.github.io/Fluxon/cn/user_doc/%E7%94%A8%E6%88%B7---4---MQ%E6%8E%A5%E5%8F%A3/)
### FS 快速开始
@@ -249,7 +249,7 @@ FS Quick Start 会额外打印:
对应接口文档:
-- [FS 接口](https://tele-ai.github.io/fluxon/cn/user_doc/%E7%94%A8%E6%88%B7---5---FS%E6%8E%A5%E5%8F%A3/)
+- [FS 接口](https://tele-ai.github.io/Fluxon/cn/user_doc/%E7%94%A8%E6%88%B7---5---FS%E6%8E%A5%E5%8F%A3/)
@@ -269,17 +269,18 @@ FS Quick Start 会额外打印:
欢迎参与贡献。开始之前,建议先阅读 GitHub Pages 上的开发者文档:
-- [开发者文档总入口](https://tele-ai.github.io/fluxon/cn/dev_doc/)
-- [开发者 - 1 - 打包核心安装包](https://tele-ai.github.io/fluxon/cn/dev_doc/%E5%BC%80%E5%8F%91%E8%80%85---1---%E6%89%93%E5%8C%85%E6%A0%B8%E5%BF%83%E5%AE%89%E8%A3%85%E5%8C%85/)
-- [开发者 - 2 - 打包中间件和镜像](https://tele-ai.github.io/fluxon/cn/dev_doc/%E5%BC%80%E5%8F%91%E8%80%85---2---%E6%89%93%E5%8C%85%E4%B8%AD%E9%97%B4%E4%BB%B6%E5%92%8C%E9%95%9C%E5%83%8F/)
-- [开发者 - 4 - 发布 Release](https://tele-ai.github.io/fluxon/cn/dev_doc/%E5%BC%80%E5%8F%91%E8%80%85---4---%E5%8F%91%E5%B8%83-Release/)
+- [开发者文档总入口](https://tele-ai.github.io/Fluxon/cn/dev_doc/)
+- [开发者 - 1 - 打包核心安装包](https://tele-ai.github.io/Fluxon/cn/dev_doc/%E5%BC%80%E5%8F%91%E8%80%85---1---%E6%89%93%E5%8C%85%E6%A0%B8%E5%BF%83%E5%AE%89%E8%A3%85%E5%8C%85/)
+- [开发者 - 2 - 打包中间件和镜像](https://tele-ai.github.io/Fluxon/cn/dev_doc/%E5%BC%80%E5%8F%91%E8%80%85---2---%E6%89%93%E5%8C%85%E4%B8%AD%E9%97%B4%E4%BB%B6%E5%92%8C%E9%95%9C%E5%83%8F/)
+- [开发者 - 3 - 文档写作规约](https://tele-ai.github.io/Fluxon/cn/dev_doc/%E5%BC%80%E5%8F%91%E8%80%85---3---%E6%96%87%E6%A1%A3%E5%86%99%E4%BD%9C%E8%A7%84%E7%BA%A6/)
+- [开发者 - 4 - 发布 Release](https://tele-ai.github.io/Fluxon/cn/dev_doc/%E5%BC%80%E5%8F%91%E8%80%85---4---%E5%8F%91%E5%B8%83-Release/)
## 👥 Contributors
-
-
+
+
部分更早期的贡献记录已经无法从当前 commit 历史里完整反映,这里补充说明:
@@ -314,4 +315,4 @@ Fluxon 基于 Apache License 2.0 开源,见 [LICENSE](./LICENSE)。
## ⭐ Star 增长趋势
-[](https://www.star-history.com/?repos=Tele-AI%2Ffluxon&type=date&legend=top-left)
+[](https://www.star-history.com/?repos=Tele-AI%2FFluxon&type=date&legend=top-left)
From cf6c5c09a306bc354f5deeb85b302ebc2afd34c8 Mon Sep 17 00:00:00 2001
From: ActivePeter <1020401660@qq.com>
Date: Mon, 22 Jun 2026 18:02:48 +0800
Subject: [PATCH 02/13] test
---
.github/workflows/all_test.yml | 9 +-
deployment/gen_bare_deploy_bash.py | 588 ++++-------
.../atomic_group_node_resolution_tail.sh.tmpl | 14 +
.../atomic_group_service_block.sh.tmpl | 24 +
.../atomic_group_start.sh.tmpl | 7 +
.../atomic_group_stop.sh.tmpl | 6 +
.../bare_entrypoint.sh.tmpl | 5 +
.../common_node_resolution_tail.sh.tmpl | 15 +
.../etcd_health_wait_block.sh.tmpl | 4 +
.../gen_bare_deploy_bash/host_prelude.sh.tmpl | 57 +
.../selection_present_probe_fn.sh.tmpl | 19 +
...ction_supervisor_launch_wait_block.sh.tmpl | 9 +
...on_supervisor_path_from_script_dir.sh.tmpl | 7 +
.../standalone_start.sh.tmpl | 6 +
.../standalone_start_body.sh.tmpl | 27 +
.../standalone_stop.sh.tmpl | 15 +
.../start_lock_block.sh.tmpl | 14 +
.../tcp_ready_helpers.sh.tmpl | 120 +++
.../tcp_ready_wait_block.sh.tmpl | 6 +
deployment/tests/test_gen_bare_deploy_bash.py | 153 ++-
deployment/tests/test_gen_k8s_daemonset.py | 2 +-
deployment/tests/test_log_shard.py | 117 ++
.../test_selection_supervisor_codegen.py | 191 +++-
.../test_start_test_bed_bootstrap_log.py | 33 +-
deployment/utils/log_shard.py | 196 ++++
deployment/utils/proc_lifecycle_codegen.py | 41 +-
.../utils/selection_supervisor_codegen.py | 90 +-
...15\347\275\256\346\200\273\350\247\210.md" | 217 ++++
...74\345\207\272\351\223\276\350\267\257.md" | 414 ++++++++
fluxon_py/config.py | 15 +
fluxon_py/tests/test_config.py | 49 +
fluxon_rs/Cargo.lock | 2 +
fluxon_rs/fluxon_fs/src/agent.rs | 8 +-
.../fluxon_kv/src/client_seg_pool/mod.rs | 23 +
fluxon_rs/fluxon_kv/src/config.rs | 150 ++-
.../external_client_test.rs | 12 +-
.../fluxon_kv/src/external_client_api/mod.rs | 7 +
fluxon_rs/fluxon_kv/src/kvcore_test_lib.rs | 4 +
fluxon_rs/fluxon_kv/src/lib.rs | 653 +++++++-----
.../fluxon_kv/src/memholder/memholder_test.rs | 8 +
fluxon_rs/fluxon_ops/Cargo.toml | 4 +
fluxon_rs/fluxon_ops/build.rs | 13 +
fluxon_rs/fluxon_ops/src/lib.rs | 250 ++++-
fluxon_rs/fluxon_util/build.rs | 19 +-
fluxon_rs/fluxon_util/src/lib.rs | 7 +-
fluxon_rs/fluxon_util/src/log.rs | 380 +++++--
fluxon_rs/fluxon_util/tests/log_mgmt.rs | 120 +++
fluxon_test_stack/ci_2_virt_node.py | 2 +
fluxon_test_stack/ci_test_list.yaml | 16 +
fluxon_test_stack/deployconf_testbed.yml | 5 +-
fluxon_test_stack/pack_test_stack_rsc.py | 261 +----
fluxon_test_stack/start_test_bed.py | 32 +-
fluxon_test_stack/test_runner.py | 284 +++--
...fluxon_fs_s3_download_and_exec.sh.template | 108 ++
fluxon_test_stack/test_runner_ui.py | 4 +
.../tests/test_ci_2_virt_node_contract.py | 97 +-
.../tests/test_pack_test_stack_rsc_cli.py | 125 ++-
.../tests/test_runner_contract.py | 50 +
.../test_test_runner_testbed_contract.py | 99 ++
.../tests/test_test_runner_ui_contract.py | 37 +-
.../test_top_attention_log_mgmt_contract.py | 112 ++
.../top_attention_test_index/README.md | 1 +
.../top_attention_test_index/_log_mgmt.py | 54 +
scripts/git_source_selection.py | 163 +++
scripts/source_selection_profiles.py | 134 +++
setup_and_pack/nix/lib_layout.py | 35 +-
setup_and_pack/nix/pack_fluxonkv_pylib.py | 232 +---
setup_and_pack/public_workspace_contract.py | 56 +-
.../tests/test_git_source_selection_utils.py | 182 ++++
setup_and_pack/tests/test_lib_layout.py | 7 +
...est_pack_fluxonkv_pylib_bridge_prebuilt.py | 33 +
setup_and_pack/utils/__init__.py | 2 +
.../utils/artifact_cache_digest_utils.py | 29 +-
skills/browser-helm/SKILL.md | 232 ++++
skills/browser-helm/agents/openai.yaml | 6 +
skills/browser-helm/references/commands.md | 131 +++
skills/canvas-dag_organizer-v1/SKILL.md | 10 +
.../agents/openai.yaml | 6 +
skills/canvas-ops-v1/SKILL.md | 10 +
skills/canvas-ops-v1/agents/openai.yaml | 6 +
skills/canvas-tidy_selection-v1/SKILL.md | 10 +
.../agents/openai.yaml | 6 +
skills/find-skills/SKILL.md | 133 +++
skills/imagegen/LICENSE.txt | 201 ++++
skills/imagegen/SKILL.md | 356 +++++++
skills/imagegen/agents/openai.yaml | 6 +
skills/imagegen/assets/imagegen-small.svg | 5 +
skills/imagegen/assets/imagegen.png | Bin 0 -> 1711 bytes
skills/imagegen/references/cli.md | 242 +++++
skills/imagegen/references/codex-network.md | 33 +
skills/imagegen/references/image-api.md | 90 ++
skills/imagegen/references/prompting.md | 118 +++
skills/imagegen/references/sample-prompts.md | 433 ++++++++
skills/imagegen/scripts/image_gen.py | 995 ++++++++++++++++++
skills/imagegen/scripts/remove_chroma_key.py | 440 ++++++++
skills/openai-docs/LICENSE.txt | 201 ++++
skills/openai-docs/SKILL.md | 167 +++
skills/openai-docs/agents/openai.yaml | 14 +
skills/openai-docs/assets/openai-small.svg | 3 +
skills/openai-docs/assets/openai.png | Bin 0 -> 1429 bytes
skills/openai-docs/references/latest-model.md | 37 +
.../openai-docs/references/prompting-guide.md | 244 +++++
.../openai-docs/references/upgrade-guide.md | 181 ++++
.../scripts/fetch-codex-manual.mjs | 598 +++++++++++
.../scripts/resolve-latest-model-info.js | 147 +++
skills/plugin-creator/SKILL.md | 243 +++++
skills/plugin-creator/agents/openai.yaml | 6 +
.../assets/plugin-creator-small.svg | 3 +
.../plugin-creator/assets/plugin-creator.png | Bin 0 -> 1563 bytes
.../references/installing-and-updating.md | 143 +++
.../references/plugin-json-spec.md | 194 ++++
.../scripts/create_basic_plugin.py | 324 ++++++
.../scripts/read_marketplace_name.py | 48 +
.../scripts/update_plugin_cachebuster.py | 78 ++
.../plugin-creator/scripts/validate_plugin.py | 593 +++++++++++
.../SKILL.md | 11 +
.../agents/openai.yaml | 6 +
.../SKILL.md | 10 +
.../agents/openai.yaml | 6 +
.../SKILL.md | 10 +
.../agents/openai.yaml | 6 +
.../SKILL.md | 10 +
.../agents/openai.yaml | 6 +
.../SKILL.md | 10 +
.../agents/openai.yaml | 6 +
.../SKILL.md | 16 +
.../agents/openai.yaml | 6 +
.../SKILL.md | 27 +
.../agents/openai.yaml | 6 +
.../SKILL.md | 10 +
.../agents/openai.yaml | 6 +
.../SKILL.md | 10 +
.../agents/openai.yaml | 6 +
.../SKILL.md | 10 +
.../agents/openai.yaml | 6 +
.../SKILL.md | 10 +
.../agents/openai.yaml | 6 +
.../SKILL.md | 10 +
.../agents/openai.yaml | 6 +
.../SKILL.md | 10 +
.../agents/openai.yaml | 6 +
.../SKILL.md | 10 +
.../agents/openai.yaml | 6 +
.../SKILL.md | 10 +
.../agents/openai.yaml | 6 +
.../SKILL.md | 10 +
.../agents/openai.yaml | 6 +
.../SKILL.md | 15 +
.../agents/openai.yaml | 6 +
.../SKILL.md | 10 +
.../agents/openai.yaml | 6 +
.../SKILL.md | 10 +
.../agents/openai.yaml | 6 +
skills/rs-skill-smoke-09e1daf7/SKILL.md | 8 +
skills/rs-skill-smoke-529efbc9/SKILL.md | 8 +
skills/rs-skill-smoke-cde1029f/SKILL.md | 8 +
skills/skill-creator/SKILL.md | 416 ++++++++
skills/skill-creator/agents/openai.yaml | 5 +
.../assets/skill-creator-small.svg | 3 +
skills/skill-creator/assets/skill-creator.png | Bin 0 -> 1563 bytes
skills/skill-creator/license.txt | 202 ++++
.../skill-creator/references/openai_yaml.md | 49 +
.../scripts/generate_openai_yaml.py | 226 ++++
skills/skill-creator/scripts/init_skill.py | 400 +++++++
.../skill-creator/scripts/quick_validate.py | 101 ++
skills/skill-installer/LICENSE.txt | 202 ++++
skills/skill-installer/SKILL.md | 58 +
skills/skill-installer/agents/openai.yaml | 5 +
.../assets/skill-installer-small.svg | 3 +
.../assets/skill-installer.png | Bin 0 -> 1086 bytes
.../skill-installer/scripts/github_utils.py | 21 +
.../scripts/install-skill-from-github.py | 308 ++++++
skills/skill-installer/scripts/list-skills.py | 107 ++
173 files changed, 13984 insertions(+), 1462 deletions(-)
create mode 100644 deployment/templates/gen_bare_deploy_bash/atomic_group_node_resolution_tail.sh.tmpl
create mode 100644 deployment/templates/gen_bare_deploy_bash/atomic_group_service_block.sh.tmpl
create mode 100644 deployment/templates/gen_bare_deploy_bash/atomic_group_start.sh.tmpl
create mode 100644 deployment/templates/gen_bare_deploy_bash/atomic_group_stop.sh.tmpl
create mode 100644 deployment/templates/gen_bare_deploy_bash/bare_entrypoint.sh.tmpl
create mode 100644 deployment/templates/gen_bare_deploy_bash/common_node_resolution_tail.sh.tmpl
create mode 100644 deployment/templates/gen_bare_deploy_bash/etcd_health_wait_block.sh.tmpl
create mode 100644 deployment/templates/gen_bare_deploy_bash/host_prelude.sh.tmpl
create mode 100644 deployment/templates/gen_bare_deploy_bash/selection_present_probe_fn.sh.tmpl
create mode 100644 deployment/templates/gen_bare_deploy_bash/selection_supervisor_launch_wait_block.sh.tmpl
create mode 100644 deployment/templates/gen_bare_deploy_bash/selection_supervisor_path_from_script_dir.sh.tmpl
create mode 100644 deployment/templates/gen_bare_deploy_bash/standalone_start.sh.tmpl
create mode 100644 deployment/templates/gen_bare_deploy_bash/standalone_start_body.sh.tmpl
create mode 100644 deployment/templates/gen_bare_deploy_bash/standalone_stop.sh.tmpl
create mode 100644 deployment/templates/gen_bare_deploy_bash/start_lock_block.sh.tmpl
create mode 100644 deployment/templates/gen_bare_deploy_bash/tcp_ready_helpers.sh.tmpl
create mode 100644 deployment/templates/gen_bare_deploy_bash/tcp_ready_wait_block.sh.tmpl
create mode 100644 deployment/tests/test_log_shard.py
create mode 100644 deployment/utils/log_shard.py
create mode 100644 "fluxon_doc_cn/design/fluxon_0_\351\205\215\347\275\256\346\200\273\350\247\210.md"
create mode 100644 "fluxon_doc_cn/design/log_1_\346\234\254\345\234\260\346\226\207\344\273\266\346\227\245\345\277\227\344\270\216Greptime_OTLP\345\257\274\345\207\272\351\223\276\350\267\257.md"
create mode 100644 fluxon_rs/fluxon_util/tests/log_mgmt.rs
create mode 100644 fluxon_test_stack/test_runner_templates/payload_fluxon_fs_s3_download_and_exec.sh.template
create mode 100644 fluxon_test_stack/tests/test_top_attention_log_mgmt_contract.py
create mode 100644 fluxon_test_stack/top_attention_test_index/_log_mgmt.py
create mode 100644 scripts/git_source_selection.py
create mode 100644 scripts/source_selection_profiles.py
create mode 100644 setup_and_pack/tests/test_git_source_selection_utils.py
create mode 100644 skills/browser-helm/SKILL.md
create mode 100644 skills/browser-helm/agents/openai.yaml
create mode 100644 skills/browser-helm/references/commands.md
create mode 100644 skills/canvas-dag_organizer-v1/SKILL.md
create mode 100644 skills/canvas-dag_organizer-v1/agents/openai.yaml
create mode 100644 skills/canvas-ops-v1/SKILL.md
create mode 100644 skills/canvas-ops-v1/agents/openai.yaml
create mode 100644 skills/canvas-tidy_selection-v1/SKILL.md
create mode 100644 skills/canvas-tidy_selection-v1/agents/openai.yaml
create mode 100644 skills/find-skills/SKILL.md
create mode 100644 skills/imagegen/LICENSE.txt
create mode 100644 skills/imagegen/SKILL.md
create mode 100644 skills/imagegen/agents/openai.yaml
create mode 100644 skills/imagegen/assets/imagegen-small.svg
create mode 100644 skills/imagegen/assets/imagegen.png
create mode 100644 skills/imagegen/references/cli.md
create mode 100644 skills/imagegen/references/codex-network.md
create mode 100644 skills/imagegen/references/image-api.md
create mode 100644 skills/imagegen/references/prompting.md
create mode 100644 skills/imagegen/references/sample-prompts.md
create mode 100644 skills/imagegen/scripts/image_gen.py
create mode 100644 skills/imagegen/scripts/remove_chroma_key.py
create mode 100644 skills/openai-docs/LICENSE.txt
create mode 100644 skills/openai-docs/SKILL.md
create mode 100644 skills/openai-docs/agents/openai.yaml
create mode 100644 skills/openai-docs/assets/openai-small.svg
create mode 100644 skills/openai-docs/assets/openai.png
create mode 100644 skills/openai-docs/references/latest-model.md
create mode 100644 skills/openai-docs/references/prompting-guide.md
create mode 100644 skills/openai-docs/references/upgrade-guide.md
create mode 100644 skills/openai-docs/scripts/fetch-codex-manual.mjs
create mode 100644 skills/openai-docs/scripts/resolve-latest-model-info.js
create mode 100644 skills/plugin-creator/SKILL.md
create mode 100644 skills/plugin-creator/agents/openai.yaml
create mode 100644 skills/plugin-creator/assets/plugin-creator-small.svg
create mode 100644 skills/plugin-creator/assets/plugin-creator.png
create mode 100644 skills/plugin-creator/references/installing-and-updating.md
create mode 100644 skills/plugin-creator/references/plugin-json-spec.md
create mode 100644 skills/plugin-creator/scripts/create_basic_plugin.py
create mode 100644 skills/plugin-creator/scripts/read_marketplace_name.py
create mode 100644 skills/plugin-creator/scripts/update_plugin_cachebuster.py
create mode 100644 skills/plugin-creator/scripts/validate_plugin.py
create mode 100644 skills/prompt-0ca565e9-3d44-45f1-832d-caa438aceddb/SKILL.md
create mode 100644 skills/prompt-0ca565e9-3d44-45f1-832d-caa438aceddb/agents/openai.yaml
create mode 100644 skills/prompt-1309ed22-5b5e-4774-9b85-41bb1b7cc971/SKILL.md
create mode 100644 skills/prompt-1309ed22-5b5e-4774-9b85-41bb1b7cc971/agents/openai.yaml
create mode 100644 skills/prompt-1323c8c8-88a0-40d2-89df-14fc9533a122/SKILL.md
create mode 100644 skills/prompt-1323c8c8-88a0-40d2-89df-14fc9533a122/agents/openai.yaml
create mode 100644 skills/prompt-144929a0-ae69-404b-9f58-a8696378e4e3/SKILL.md
create mode 100644 skills/prompt-144929a0-ae69-404b-9f58-a8696378e4e3/agents/openai.yaml
create mode 100644 skills/prompt-15d9a907-a363-4ec7-81ad-806f9418ad72/SKILL.md
create mode 100644 skills/prompt-15d9a907-a363-4ec7-81ad-806f9418ad72/agents/openai.yaml
create mode 100644 skills/prompt-193dd3cd-2722-413b-b88c-12c2af645f80/SKILL.md
create mode 100644 skills/prompt-193dd3cd-2722-413b-b88c-12c2af645f80/agents/openai.yaml
create mode 100644 skills/prompt-2793a3a4-310f-40c8-ba5d-bc7f5c1cafd7/SKILL.md
create mode 100644 skills/prompt-2793a3a4-310f-40c8-ba5d-bc7f5c1cafd7/agents/openai.yaml
create mode 100644 skills/prompt-2d53cebd-afd4-4d35-94e9-74436da3148a/SKILL.md
create mode 100644 skills/prompt-2d53cebd-afd4-4d35-94e9-74436da3148a/agents/openai.yaml
create mode 100644 skills/prompt-2eaed145-d789-4b27-93b9-8ea990830b3a/SKILL.md
create mode 100644 skills/prompt-2eaed145-d789-4b27-93b9-8ea990830b3a/agents/openai.yaml
create mode 100644 skills/prompt-345530e6-2736-42c3-9d4e-da5f14b8b8cb/SKILL.md
create mode 100644 skills/prompt-345530e6-2736-42c3-9d4e-da5f14b8b8cb/agents/openai.yaml
create mode 100644 skills/prompt-566905c8-0ad8-4d7e-857a-1c38ac7e54ca/SKILL.md
create mode 100644 skills/prompt-566905c8-0ad8-4d7e-857a-1c38ac7e54ca/agents/openai.yaml
create mode 100644 skills/prompt-5e80deb4-c278-4424-a0f4-a3df4f3443d8/SKILL.md
create mode 100644 skills/prompt-5e80deb4-c278-4424-a0f4-a3df4f3443d8/agents/openai.yaml
create mode 100644 skills/prompt-615e1231-fe33-47f8-bf35-29fdf3766d98/SKILL.md
create mode 100644 skills/prompt-615e1231-fe33-47f8-bf35-29fdf3766d98/agents/openai.yaml
create mode 100644 skills/prompt-7ae16163-92c9-4fde-a74f-7c61eddd62f2/SKILL.md
create mode 100644 skills/prompt-7ae16163-92c9-4fde-a74f-7c61eddd62f2/agents/openai.yaml
create mode 100644 skills/prompt-8c5cc431-635c-4c94-9deb-a502e77160eb/SKILL.md
create mode 100644 skills/prompt-8c5cc431-635c-4c94-9deb-a502e77160eb/agents/openai.yaml
create mode 100644 skills/prompt-a7fb4e43-d1eb-4739-93b3-646d7a1c072c/SKILL.md
create mode 100644 skills/prompt-a7fb4e43-d1eb-4739-93b3-646d7a1c072c/agents/openai.yaml
create mode 100644 skills/prompt-ac42abf9-6df8-4539-99c7-e402e905a03b/SKILL.md
create mode 100644 skills/prompt-ac42abf9-6df8-4539-99c7-e402e905a03b/agents/openai.yaml
create mode 100644 skills/prompt-ae9ff67b-09d8-4848-bbde-aac1fb6e1315/SKILL.md
create mode 100644 skills/prompt-ae9ff67b-09d8-4848-bbde-aac1fb6e1315/agents/openai.yaml
create mode 100644 skills/prompt-f118ab91-390b-48e2-a962-3abe4d54211e/SKILL.md
create mode 100644 skills/prompt-f118ab91-390b-48e2-a962-3abe4d54211e/agents/openai.yaml
create mode 100644 skills/rs-skill-smoke-09e1daf7/SKILL.md
create mode 100644 skills/rs-skill-smoke-529efbc9/SKILL.md
create mode 100644 skills/rs-skill-smoke-cde1029f/SKILL.md
create mode 100644 skills/skill-creator/SKILL.md
create mode 100644 skills/skill-creator/agents/openai.yaml
create mode 100644 skills/skill-creator/assets/skill-creator-small.svg
create mode 100644 skills/skill-creator/assets/skill-creator.png
create mode 100644 skills/skill-creator/license.txt
create mode 100644 skills/skill-creator/references/openai_yaml.md
create mode 100644 skills/skill-creator/scripts/generate_openai_yaml.py
create mode 100644 skills/skill-creator/scripts/init_skill.py
create mode 100644 skills/skill-creator/scripts/quick_validate.py
create mode 100644 skills/skill-installer/LICENSE.txt
create mode 100644 skills/skill-installer/SKILL.md
create mode 100644 skills/skill-installer/agents/openai.yaml
create mode 100644 skills/skill-installer/assets/skill-installer-small.svg
create mode 100644 skills/skill-installer/assets/skill-installer.png
create mode 100644 skills/skill-installer/scripts/github_utils.py
create mode 100644 skills/skill-installer/scripts/install-skill-from-github.py
create mode 100644 skills/skill-installer/scripts/list-skills.py
diff --git a/.github/workflows/all_test.yml b/.github/workflows/all_test.yml
index 4300c60..33cdd5b 100644
--- a/.github/workflows/all_test.yml
+++ b/.github/workflows/all_test.yml
@@ -86,10 +86,15 @@ jobs:
# Scene selection:
# - ci_top_attention_doc_page_build keeps the doc-site build as a CI scene workload.
# - ci_top_attention_bin_kvtest keeps the Rust kv_test entry under the same CI scene contract.
+ # - ci_top_attention_log_mgmt keeps log rolling/sharding coverage under the same CI scene contract.
suite["scenes"] = {
key: value
for key, value in suite["scenes"].items()
- if key in ("ci_top_attention_doc_page_build", "ci_top_attention_bin_kvtest")
+ if key in (
+ "ci_top_attention_doc_page_build",
+ "ci_top_attention_bin_kvtest",
+ "ci_top_attention_log_mgmt",
+ )
}
# Profile selection:
@@ -107,11 +112,13 @@ jobs:
suite["profiles"]["fluxon_tcp"]["runtime"]["ci"]["scene_configs"]["ci_top_attention_doc_page_build"]["doc_site_base_url"] = (
"${{ github.repository_owner }}.github.io/${{ github.event.repository.name }}"
)
+ suite["profiles"]["fluxon_tcp"]["runtime"]["ci"]["scene_configs"]["ci_top_attention_log_mgmt"]["enabled"] = True
# Scale selection:
# - Keep the original per-scene scales from ci_test_list.yaml.
# - ci_top_attention_doc_page_build stays on n1_kvowner_dram_3gib.
# - ci_top_attention_bin_kvtest stays on n1_kvowner_dram_20gib.
+ # - ci_top_attention_log_mgmt stays on n1_kvowner_dram_20gib.
out_path.write_text(
yaml.safe_dump(suite, sort_keys=False, allow_unicode=False),
diff --git a/deployment/gen_bare_deploy_bash.py b/deployment/gen_bare_deploy_bash.py
index ce51025..5503658 100644
--- a/deployment/gen_bare_deploy_bash.py
+++ b/deployment/gen_bare_deploy_bash.py
@@ -4,8 +4,10 @@
import argparse
import json
import os
+import re
import shlex
import sys
+from functools import lru_cache
from pathlib import Path
from typing import Any, Dict, List
@@ -25,7 +27,9 @@
StopTimeouts,
render_bash_proc_lifecycle_funcs_pid_tree,
)
+from log_shard import render_module_source as render_log_shard_module_source # type: ignore
from selection_supervisor_codegen import ( # type: ignore
+ LOG_SHARD_HELPER_FILENAME,
PYTHON_SELECTION_SUPERVISOR_FILENAME,
render_python_selection_supervisor_module,
)
@@ -44,13 +48,36 @@
ATOMIC_GROUP_CRASHLOOP_CONSECUTIVE_RESTARTS = 10
ATOMIC_GROUP_CRASHLOOP_INTERVAL_LT_SECONDS = 30
ATOMIC_GROUP_PROBABLE_READY_SECONDS = 10
-STANDALONE_PROBABLE_READY_SECONDS = 3
-STANDALONE_STARTUP_DEADLINE_SECONDS = 60
-ATOMIC_GROUP_STARTUP_DEADLINE_SECONDS = 10 * 60
+STANDALONE_PROBABLE_READY_SECONDS = 10
+STANDALONE_STARTUP_DEADLINE_SECONDS = 10
+ATOMIC_GROUP_STARTUP_DEADLINE_SECONDS = 10
HOSTWORKDIR_RUNTIME_TOKEN = "${HOSTWORKDIR}"
REPO_ROOT = SCRIPT_DIR.parent
-TCP_READY_STABLE_SECONDS = 2
-TCP_READY_POLL_INTERVAL_SECONDS = 0.2
+BARE_TEMPLATE_DIR = SCRIPT_DIR / "templates" / "gen_bare_deploy_bash"
+_TEMPLATE_TOKEN_RE = re.compile(r"\{\{([A-Z0-9_]+)\}\}")
+
+
+@lru_cache(maxsize=None)
+def _load_bare_template(*, template_name: str) -> str:
+ template_path = BARE_TEMPLATE_DIR / template_name
+ if not template_path.is_file():
+ raise RuntimeError(f"missing bare deploy template: {template_path}")
+ return template_path.read_text(encoding="utf-8")
+
+
+def _render_bare_template(*, template_name: str, values: Dict[str, str]) -> str:
+ template = _load_bare_template(template_name=template_name)
+
+ def _replace(match: re.Match[str]) -> str:
+ key = match.group(1)
+ if key not in values:
+ raise RuntimeError(f"missing bare deploy template value: template={template_name} key={key}")
+ value = values[key]
+ if not isinstance(value, str):
+ raise ValueError(f"bare deploy template value must be a string: template={template_name} key={key}")
+ return value
+
+ return _TEMPLATE_TOKEN_RE.sub(_replace, template)
def _resolve_repo_root_cli_path(*, raw_path: Path, field_name: str) -> Path:
@@ -89,6 +116,10 @@ def main() -> None:
outdir / PYTHON_SELECTION_SUPERVISOR_FILENAME,
render_python_selection_supervisor_module(timeouts=STOP_TIMEOUTS),
)
+ (outdir / LOG_SHARD_HELPER_FILENAME).write_text(
+ render_log_shard_module_source(),
+ encoding="utf-8",
+ )
name_prefix = _require_str(cfg.get("name_prefix"), "name_prefix")
cluster_nodes_raw = _require_list(cfg.get("cluster_nodes"), "cluster_nodes")
@@ -306,12 +337,12 @@ def _bare_entrypoint_script_name(*, workload_name: str) -> str:
def _render_bare_entrypoint_script(*, service_name: str, entrypoint: str) -> str:
- return (
- "#!/usr/bin/env bash\n"
- "set -euo pipefail\n\n"
- f"export SERVICE={_sh_quote(service_name)}\n"
- + entrypoint.strip()
- + "\n"
+ return _render_bare_template(
+ template_name="bare_entrypoint.sh.tmpl",
+ values={
+ "SERVICE_EXPORT": _sh_quote(service_name),
+ "ENTRYPOINT": entrypoint.strip(),
+ },
)
@@ -353,29 +384,25 @@ def _render_standalone_start_script(
service_cfg: Dict[str, Any],
) -> str:
allowed_nodes = _extract_nodes(service_cfg)
- service_port = _extract_port(service_cfg)
- port_export = ""
- if service_port is not None:
- port_export = f"export {service_name.upper()}__PORT={_sh_quote(str(service_port))}\n"
- return (
- "#!/usr/bin/env bash\n"
- "set -euo pipefail\n\n"
- f"SERVICE={_sh_quote(service_name)}\n"
- f"NAME_PREFIX={_sh_quote(name_prefix)}\n"
- + _render_nodes_bash(name="ALLOWED_NODES", nodes=allowed_nodes)
- + _render_host_prelude(cluster_nodes=cluster_nodes)
- + _render_common_node_resolution_tail(service_name=service_name)
- + _render_selection_supervisor_path_from_script_dir()
- + _render_proc_lifecycle_pid_tree_helpers()
- + _render_tcp_ready_helpers()
- + _render_selection_present_probe_fn()
- + _render_start_lock_block()
- + _render_global_env_exports(global_envs)
- + port_export
- + _render_standalone_start_body(
- name_prefix=name_prefix,
- service_name=service_name,
- )
+ return _render_bare_template(
+ template_name="standalone_start.sh.tmpl",
+ values={
+ "SERVICE_ASSIGN": _sh_quote(service_name),
+ "NAME_PREFIX_ASSIGN": _sh_quote(name_prefix),
+ "ALLOWED_NODES_BLOCK": _render_nodes_bash(name="ALLOWED_NODES", nodes=allowed_nodes),
+ "HOST_PRELUDE": _render_host_prelude(cluster_nodes=cluster_nodes),
+ "COMMON_NODE_RESOLUTION_TAIL": _render_common_node_resolution_tail(service_name=service_name),
+ "SELECTION_SUPERVISOR_PATH_BLOCK": _render_selection_supervisor_path_from_script_dir(),
+ "PROC_LIFECYCLE_HELPERS": _render_proc_lifecycle_pid_tree_helpers(),
+ "SELECTION_PRESENT_PROBE_FN": _render_selection_present_probe_fn(),
+ "START_LOCK_BLOCK": _render_start_lock_block(),
+ "GLOBAL_ENV_EXPORTS": _render_global_env_exports(global_envs),
+ "PORT_EXPORT": _render_service_port_export(service_name=service_name, service_cfg=service_cfg),
+ "START_BODY": _render_standalone_start_body(
+ name_prefix=name_prefix,
+ service_name=service_name,
+ ),
+ },
)
@@ -387,25 +414,19 @@ def _render_standalone_stop_script(
service_cfg: Dict[str, Any],
) -> str:
allowed_nodes = _extract_nodes(service_cfg)
- return (
- "#!/usr/bin/env bash\n"
- "set -euo pipefail\n\n"
- f"SERVICE={_sh_quote(service_name)}\n"
- f"NAME_PREFIX={_sh_quote(name_prefix)}\n"
- + _render_nodes_bash(name="ALLOWED_NODES", nodes=allowed_nodes)
- + _render_host_prelude(cluster_nodes=cluster_nodes)
- + _render_common_node_resolution_tail(service_name=service_name)
- + _render_selection_supervisor_path_from_script_dir()
- + f'SUPERVISOR_LABEL={_sh_quote(_bare_plain_selection_supervisor_label(name_prefix=name_prefix, service_name=service_name))}\n'
- + "# English note:\n"
- + "# - Generated bare stop is retained as a manual operator tool.\n"
- + "# - Automation must not depend on this path for handover or rollout convergence.\n"
- + "# - The command only asks the shared selection supervisor to retire the concrete selection\n"
- + "# identity identified by label on this node.\n"
- + 'if ! python3 "$SELECTION_SUPERVISOR" stop --label "$SUPERVISOR_LABEL" --scope-key "$HOSTWORKDIR" --missing-ok >/dev/null; then\n'
- + ' echo "[bare] stop failed svc=$SERVICE label=$SUPERVISOR_LABEL hostworkdir=$HOSTWORKDIR"\n'
- + " exit 1\n"
- + "fi\n"
+ return _render_bare_template(
+ template_name="standalone_stop.sh.tmpl",
+ values={
+ "SERVICE_ASSIGN": _sh_quote(service_name),
+ "NAME_PREFIX_ASSIGN": _sh_quote(name_prefix),
+ "ALLOWED_NODES_BLOCK": _render_nodes_bash(name="ALLOWED_NODES", nodes=allowed_nodes),
+ "HOST_PRELUDE": _render_host_prelude(cluster_nodes=cluster_nodes),
+ "COMMON_NODE_RESOLUTION_TAIL": _render_common_node_resolution_tail(service_name=service_name),
+ "SELECTION_SUPERVISOR_PATH_BLOCK": _render_selection_supervisor_path_from_script_dir(),
+ "SUPERVISOR_LABEL_ASSIGN": _sh_quote(
+ _bare_plain_selection_supervisor_label(name_prefix=name_prefix, service_name=service_name)
+ ),
+ },
)
@@ -429,20 +450,19 @@ def _render_atomic_group_start_script(
service_cfg=service_cfg,
)
)
- return (
- "#!/usr/bin/env bash\n"
- "set -euo pipefail\n\n"
- f"GROUP={_sh_quote(group_name)}\n"
- f"NAME_PREFIX={_sh_quote(name_prefix)}\n"
- + _render_host_prelude(cluster_nodes=cluster_nodes)
- + _render_atomic_group_node_resolution_tail(group_cfg["nodes"])
- + _render_selection_supervisor_path_from_script_dir()
- + _render_proc_lifecycle_pid_tree_helpers()
- + _render_tcp_ready_helpers()
- + _render_global_env_exports(global_envs)
- + f"GROUP_STARTUP_DEADLINE_TS=$(( $(date +%s) + {ATOMIC_GROUP_STARTUP_DEADLINE_SECONDS} ))\n"
- + "".join(service_blocks)
- + 'echo "[atomic-group] ready group=$GROUP node=$NODE_ID"\n'
+ return _render_bare_template(
+ template_name="atomic_group_start.sh.tmpl",
+ values={
+ "GROUP_ASSIGN": _sh_quote(group_name),
+ "NAME_PREFIX_ASSIGN": _sh_quote(name_prefix),
+ "HOST_PRELUDE": _render_host_prelude(cluster_nodes=cluster_nodes),
+ "ATOMIC_GROUP_NODE_RESOLUTION_TAIL": _render_atomic_group_node_resolution_tail(group_cfg["nodes"]),
+ "SELECTION_SUPERVISOR_PATH_BLOCK": _render_selection_supervisor_path_from_script_dir(),
+ "PROC_LIFECYCLE_HELPERS": _render_proc_lifecycle_pid_tree_helpers(),
+ "GLOBAL_ENV_EXPORTS": _render_global_env_exports(global_envs),
+ "GROUP_STARTUP_DEADLINE_ASSIGN": str(ATOMIC_GROUP_STARTUP_DEADLINE_SECONDS),
+ "SERVICE_BLOCKS": "".join(service_blocks),
+ },
)
@@ -454,276 +474,105 @@ def _render_atomic_group_stop_script(
group_cfg: Dict[str, Any],
) -> str:
stop_services = list(reversed(group_cfg["services"]))
- return (
- "#!/usr/bin/env bash\n"
- "set -u -o pipefail\n\n"
- f"GROUP={_sh_quote(group_name)}\n"
- f"NAME_PREFIX={_sh_quote(name_prefix)}\n"
- + _render_host_prelude(cluster_nodes=cluster_nodes)
- + _render_atomic_group_node_resolution_tail(group_cfg["nodes"])
- + _render_selection_supervisor_path_from_script_dir()
- + _render_atomic_group_stop_fn(
- runtime_specs=[
- {
- "service_name": service_name,
- "supervisor_label": _bare_atomic_group_member_selection_supervisor_label(
- name_prefix=name_prefix,
- group_name=group_name,
- service_name=service_name,
- ),
- }
- for service_name in stop_services
- ],
- )
- + "stop_group\n"
+ return _render_bare_template(
+ template_name="atomic_group_stop.sh.tmpl",
+ values={
+ "GROUP_ASSIGN": _sh_quote(group_name),
+ "NAME_PREFIX_ASSIGN": _sh_quote(name_prefix),
+ "HOST_PRELUDE": _render_host_prelude(cluster_nodes=cluster_nodes),
+ "ATOMIC_GROUP_NODE_RESOLUTION_TAIL": _render_atomic_group_node_resolution_tail(group_cfg["nodes"]),
+ "SELECTION_SUPERVISOR_PATH_BLOCK": _render_selection_supervisor_path_from_script_dir(),
+ "ATOMIC_GROUP_STOP_FN": _render_atomic_group_stop_fn(
+ runtime_specs=[
+ {
+ "service_name": service_name,
+ "supervisor_label": _bare_atomic_group_member_selection_supervisor_label(
+ name_prefix=name_prefix,
+ group_name=group_name,
+ service_name=service_name,
+ ),
+ }
+ for service_name in stop_services
+ ],
+ ),
+ },
)
def _render_host_prelude(*, cluster_nodes: List[Dict[str, Any]]) -> str:
all_nodes = [_require_str(node.get("hostname"), "cluster_nodes[].hostname") for node in cluster_nodes]
- out = _render_nodes_bash(name="ALL_NODES", nodes=all_nodes)
- out += "\nLOCAL_HOSTNAME=$(hostname -s 2>/dev/null || hostname 2>/dev/null || echo unknown)\n"
- out += 'LOCAL_FQDN=$(hostname -f 2>/dev/null || echo "$LOCAL_HOSTNAME")\n'
- out += 'NODE_ID="${NODE_ID:-}"\n'
- out += 'if [ -n "$NODE_ID" ]; then\n'
- out += ' _node_id_known=false\n'
- out += ' for n in "${ALL_NODES[@]}"; do\n'
- out += ' if [ "$n" = "$NODE_ID" ]; then\n'
- out += ' _node_id_known=true\n'
- out += " break\n"
- out += " fi\n"
- out += " done\n"
- out += ' if [ "$_node_id_known" != true ]; then\n'
- out += ' echo "Unknown preset NODE_ID: $NODE_ID"\n'
- out += f' echo "Known nodes: {" ".join(all_nodes)}"\n'
- out += " exit 1\n"
- out += " fi\n"
- out += "fi\n"
- out += 'if [ -z "$NODE_ID" ]; then\n'
- out += 'for n in "${ALL_NODES[@]}"; do\n'
- out += ' if [ "$n" = "$LOCAL_HOSTNAME" ] || [ "$n" = "$LOCAL_FQDN" ]; then\n'
- out += ' NODE_ID="$n"\n'
- out += " break\n"
- out += " fi\n"
- out += "done\n"
- out += "fi\n"
- out += 'if [ -z "$NODE_ID" ] && [ ${#ALL_NODES[@]} -eq 1 ]; then\n'
- out += ' NODE_ID="${ALL_NODES[0]}"\n'
- out += "fi\n"
- out += 'if [ -z "$NODE_ID" ]; then\n'
- out += ' for ip in $(hostname -I 2>/dev/null); do\n'
- out += ' for n in "${ALL_NODES[@]}"; do\n'
- out += ' _ip_n=""\n'
- out += ' case "$n" in\n'
- for node in cluster_nodes:
- node_name = _require_str(node.get("hostname"), "cluster_nodes[].hostname")
- node_ip = _require_str(node.get("ip"), f"cluster_nodes[{node_name}].ip")
- out += f" {_sh_quote(node_name)}) _ip_n={_sh_quote(node_ip)};;\n"
- out += ' *) _ip_n="";;\n'
- out += " esac\n"
- out += ' if [ "$_ip_n" = "$ip" ]; then\n'
- out += ' NODE_ID="$n"\n'
- out += " break\n"
- out += " fi\n"
- out += " done\n"
- out += ' [ -n "$NODE_ID" ] && break\n'
- out += " done\n"
- out += "fi\n"
- out += 'if [ -z "$NODE_ID" ]; then\n'
- out += ' echo "Cannot map host to a configured node. Hostname=$LOCAL_HOSTNAME FQDN=$LOCAL_FQDN IPs=$(hostname -I 2>/dev/null)"\n'
- out += f' echo "Known nodes: {" ".join(all_nodes)}"\n'
- out += " exit 1\n"
- out += "fi\n\n"
- out += 'HOST_IP=""\nHOSTWORKDIR=""\ncase "$NODE_ID" in\n'
+ ip_case_lines: list[str] = []
+ host_case_lines: list[str] = []
for node in cluster_nodes:
node_name = _require_str(node.get("hostname"), "cluster_nodes[].hostname")
node_ip = _require_str(node.get("ip"), f"cluster_nodes[{node_name}].ip")
hostworkdir = _require_str(node.get("hostworkdir"), f"cluster_nodes[{node_name}].hostworkdir")
- out += f" {_sh_quote(node_name)}) HOST_IP={_sh_quote(node_ip)}; HOSTWORKDIR={_sh_quote(hostworkdir)};;\n"
- out += ' *) echo "Unknown NODE_ID: $NODE_ID"; exit 1;;\n'
- out += "esac\n"
- return out
+ ip_case_lines.append(f" {_sh_quote(node_name)}) _ip_n={_sh_quote(node_ip)};;")
+ host_case_lines.append(
+ f" {_sh_quote(node_name)}) HOST_IP={_sh_quote(node_ip)}; HOSTWORKDIR={_sh_quote(hostworkdir)};;"
+ )
+ return _render_bare_template(
+ template_name="host_prelude.sh.tmpl",
+ values={
+ "ALL_NODES_BLOCK": _render_nodes_bash(name="ALL_NODES", nodes=all_nodes),
+ "KNOWN_NODES": " ".join(all_nodes),
+ "IP_CASE_LINES": "\n".join(ip_case_lines),
+ "HOST_CASE_LINES": "\n".join(host_case_lines),
+ },
+ )
def _render_common_node_resolution_tail(*, service_name: str) -> str:
- return (
- 'if [ ${#ALLOWED_NODES[@]} -gt 0 ]; then\n'
- + ' _ok=false\n'
- + ' for n in "${ALLOWED_NODES[@]}"; do\n'
- + ' if [ "$n" = "$NODE_ID" ]; then _ok=true; fi\n'
- + " done\n"
- + ' if [ "$_ok" != true ]; then\n'
- + f' echo "Service {service_name} not scheduled on this node ($NODE_ID). Allowed: ${{ALLOWED_NODES[*]}}"\n'
- + " exit 0\n"
- + " fi\n"
- + "fi\n\n"
- + 'export NODE_ID="$NODE_ID"\n'
- + 'export HOST_IP="$HOST_IP"\n'
- + 'export HOSTWORKDIR="$HOSTWORKDIR"\n\n'
+ return _render_bare_template(
+ template_name="common_node_resolution_tail.sh.tmpl",
+ values={"SERVICE_NAME": service_name},
)
def _render_atomic_group_node_resolution_tail(allowed_nodes: List[str]) -> str:
- return (
- _render_nodes_bash(name="GROUP_NODES", nodes=allowed_nodes)
- + 'scheduled=false\n'
- + 'for n in "${GROUP_NODES[@]}"; do\n'
- + ' if [ "$n" = "$NODE_ID" ]; then scheduled=true; fi\n'
- + "done\n"
- + 'if [ "$scheduled" != true ]; then\n'
- + ' echo "[atomic-group] skip group=$GROUP node=$NODE_ID allowed=${GROUP_NODES[*]}"\n'
- + " exit 0\n"
- + "fi\n\n"
- + 'export NODE_ID="$NODE_ID"\n'
- + 'export HOST_IP="$HOST_IP"\n'
- + 'export HOSTWORKDIR="$HOSTWORKDIR"\n'
- + 'echo "[atomic-group] group=$GROUP node=$NODE_ID hostworkdir=$HOSTWORKDIR"\n\n'
+ return _render_bare_template(
+ template_name="atomic_group_node_resolution_tail.sh.tmpl",
+ values={"GROUP_NODES_BLOCK": _render_nodes_bash(name="GROUP_NODES", nodes=allowed_nodes)},
)
def _render_start_lock_block() -> str:
- return (
- 'PID_DIR="$HOSTWORKDIR/run"\n'
- + 'mkdir -p "$PID_DIR"\n'
- + 'START_LOCKFILE="$PID_DIR/${SERVICE}.start.lock"\n'
- + 'if ! command -v flock >/dev/null 2>&1; then\n'
- + ' echo "Missing required command: flock"\n'
- + " exit 1\n"
- + "fi\n"
- + 'exec 9>"$START_LOCKFILE"\n'
- + 'if ! flock -xn 9; then\n'
- + ' echo "[bare] start skipped svc=$SERVICE reason=another start is already running lockfile=$START_LOCKFILE"\n'
- + " exit 0\n"
- + "fi\n"
- + 'exec 9>&-\n\n'
- )
+ return _load_bare_template(template_name="start_lock_block.sh.tmpl")
def _render_proc_lifecycle_pid_tree_helpers() -> str:
return render_bash_proc_lifecycle_funcs_pid_tree(timeouts=STOP_TIMEOUTS) + "\n\n"
-def _render_tcp_ready_helpers() -> str:
- return (
- "wait_service_tcp_ready() {\n"
- + ' svc="$1"\n'
- + ' host="$2"\n'
- + ' port="$3"\n'
- + ' stable_seconds="$4"\n'
- + ' deadline_ts="$5"\n'
- + ' context="$6"\n'
- + ' if [[ ! "$port" =~ ^[0-9]+$ ]]; then\n'
- + ' echo "$context tcp-ready: invalid port svc=$svc port=$port"\n'
- + " return 1\n"
- + " fi\n"
- + ' if [[ ! "$stable_seconds" =~ ^[0-9]+$ ]] || [ "$stable_seconds" -le 0 ]; then\n'
- + ' echo "$context tcp-ready: invalid stable_seconds svc=$svc stable_seconds=$stable_seconds"\n'
- + " return 1\n"
- + " fi\n"
- + f" poll_interval_seconds={TCP_READY_POLL_INTERVAL_SECONDS}\n"
- + ' stable_checks=$(python3 - "$stable_seconds" "$poll_interval_seconds" <<\'__FLUXON_TCP_READY_CHECKS__\'\n'
- + "import math\n"
- + "import sys\n"
- + "stable_seconds = float(sys.argv[1])\n"
- + "poll_interval_seconds = float(sys.argv[2])\n"
- + "print(max(1, int(math.ceil(stable_seconds / poll_interval_seconds))))\n"
- + "__FLUXON_TCP_READY_CHECKS__\n"
- + ")\n"
- + ' if [[ ! "$stable_checks" =~ ^[0-9]+$ ]] || [ "$stable_checks" -le 0 ]; then\n'
- + ' echo "$context tcp-ready: failed to compute stable_checks svc=$svc"\n'
- + " return 1\n"
- + " fi\n"
- + " ok_checks=0\n"
- + " while true; do\n"
- + ' now=$(date +%s)\n'
- + ' if [ "$now" -ge "$deadline_ts" ]; then\n'
- + ' echo "$context tcp-ready: deadline exceeded svc=$svc host=$host port=$port"\n'
- + " return 1\n"
- + " fi\n"
- + ' if python3 - "$host" "$port" <<\'__FLUXON_TCP_READY_PROBE__\'\n'
- + "import socket\n"
- + "import sys\n"
- + "host = sys.argv[1]\n"
- + "port = int(sys.argv[2])\n"
- + "with socket.create_connection((host, port), timeout=1.0):\n"
- + " pass\n"
- + "__FLUXON_TCP_READY_PROBE__\n"
- + " then\n"
- + " ok_checks=$((ok_checks+1))\n"
- + ' if [ "$ok_checks" -ge "$stable_checks" ]; then\n'
- + ' echo "$context tcp-ready: ok svc=$svc host=$host port=$port stable_checks=$stable_checks"\n'
- + " return 0\n"
- + " fi\n"
- + " else\n"
- + ' if [ "$ok_checks" -ne 0 ]; then\n'
- + ' echo "$context tcp-ready: reset svc=$svc ok_checks=$ok_checks host=$host port=$port"\n'
- + " fi\n"
- + " ok_checks=0\n"
- + " fi\n"
- + ' sleep "$poll_interval_seconds"\n'
- + " done\n"
- + "}\n\n"
- )
-
-
def _render_selection_present_probe_fn() -> str:
- return (
- "selection_present() {\n"
- + " python3 - \"$SELECTION_SUPERVISOR\" \"$SUPERVISOR_LABEL\" \"$HOSTWORKDIR\" <<'__FLUXON_SELECTION_PRESENT__'\n"
- + "import importlib.util\n"
- + "import sys\n"
- + "from pathlib import Path\n"
- + "\n"
- + "supervisor_path = Path(sys.argv[1])\n"
- + "label = sys.argv[2]\n"
- + "scope_key = sys.argv[3]\n"
- + 'spec = importlib.util.spec_from_file_location("fluxon_selection_supervisor_probe", supervisor_path)\n'
- + "if spec is None or spec.loader is None:\n"
- + ' raise RuntimeError(f"failed to load selection supervisor module: {supervisor_path}")\n'
- + "module = importlib.util.module_from_spec(spec)\n"
- + "sys.modules[spec.name] = module\n"
- + "spec.loader.exec_module(module)\n"
- + "raise SystemExit(0 if module._selection_present(label, scope_key=scope_key) else 1)\n"
- + "__FLUXON_SELECTION_PRESENT__\n"
- + "}\n\n"
- )
+ return _load_bare_template(template_name="selection_present_probe_fn.sh.tmpl")
def _render_selection_supervisor_launch_wait_block(
*,
run_cmd: str,
- logfile_expr: str,
stable_seconds_expr: str,
deadline_ts_expr: str,
context: str,
) -> str:
- return (
- 'SUPERVISOR_PID=$( '
- + run_cmd
- + f' >>{logfile_expr} 2>&1 < /dev/null & echo "$!" )\n'
- + 'if [[ ! "$SUPERVISOR_PID" =~ ^[0-9]+$ ]]; then\n'
- + f' echo "{context} launch failed svc=$SERVICE label=$SUPERVISOR_LABEL supervisor_pid=$SUPERVISOR_PID"\n'
- + " exit 1\n"
- + "fi\n"
- + 'if ! wait_service_probably_ready_pid_tree "$SERVICE" "$SUPERVISOR_PID" '
- + stable_seconds_expr
- + " "
- + deadline_ts_expr
- + f' "{context}"; then\n'
- + f' echo "{context} probable-ready failed svc=$SERVICE label=$SUPERVISOR_LABEL supervisor_pid=$SUPERVISOR_PID"\n'
- + " exit 1\n"
- + "fi\n"
+ return _render_bare_template(
+ template_name="selection_supervisor_launch_wait_block.sh.tmpl",
+ values={
+ "RUN_CMD": run_cmd,
+ "STABLE_SECONDS_EXPR": stable_seconds_expr,
+ "DEADLINE_TS_EXPR": deadline_ts_expr,
+ "CONTEXT": context,
+ },
)
-def _render_tcp_ready_wait_block(*, context: str) -> str:
+def _render_service_port_export(*, service_name: str, service_cfg: Dict[str, Any], indent: str = "") -> str:
+ service_port = _extract_port(service_cfg)
+ if service_port is None:
+ return indent + "unset SERVICE_PORT\n"
return (
- 'if [[ "${SERVICE_PORT:-}" =~ ^[0-9]+$ ]]; then\n'
- + f' if ! wait_service_tcp_ready "$SERVICE" "$HOST_IP" "$SERVICE_PORT" {TCP_READY_STABLE_SECONDS} "$STARTUP_DEADLINE_TS" "{context}"; then\n'
- + f' echo "{context} tcp-ready failed svc=$SERVICE host=$HOST_IP port=$SERVICE_PORT"\n'
- + " exit 1\n"
- + " fi\n"
- + "fi\n"
+ indent + f"export {service_name.upper()}__PORT={_sh_quote(str(service_port))}\n"
+ + indent + f"export SERVICE_PORT={_sh_quote(str(service_port))}\n"
)
@@ -759,54 +608,28 @@ def _render_standalone_start_body(*, name_prefix: str, service_name: str) -> str
crashloop_interval_lt_seconds=0,
child_command=child_command,
)
- return (
- f'SUPERVISOR_LABEL={_sh_quote(_bare_plain_selection_supervisor_label(name_prefix=name_prefix, service_name=service_name))}\n'
- + f'RUNTIME_STATE_JSON={_sh_quote(runtime_state_json)}\n'
- + 'OWNER_TS_MS=$(python3 -c \'import time; print(int(time.time() * 1000))\')\n'
- + f"STARTUP_DEADLINE_TS=$(( $(date +%s) + {STANDALONE_STARTUP_DEADLINE_SECONDS} ))\n"
- + 'LOG_DIR="$HOSTWORKDIR/log"\n'
- + 'LOGFILE="$LOG_DIR/${SERVICE}.log"\n'
- + 'mkdir -p "$LOG_DIR"\n'
- + 'touch "$LOGFILE"\n'
- + 'echo "Starting $SERVICE on $NODE_ID (IP: $HOST_IP, workdir: $HOSTWORKDIR)"\n'
- + "# English note:\n"
- + "# - bootstrap bare start must be idempotent when the shared selection supervisor already owns\n"
- + "# a live child for the same label.\n"
- + "# - start_test_bed enables this path only for deployconf.bootstrap_bare_services.\n"
- + 'if [ "${FLUXON_BARE_ALLOW_ALREADY_PRESENT:-false}" = "true" ]; then\n'
- + " if selection_present; then\n"
- + ' echo "[bare] already present svc=$SERVICE label=$SUPERVISOR_LABEL"\n'
- + ' echo "Started $SERVICE (label: $SUPERVISOR_LABEL)"\n'
- + ' echo "Logs: $LOGFILE"\n'
- + " exit 0\n"
- + " fi\n"
- + "fi\n"
- + "# English note:\n"
- + "# - Bare start must not depend on extra supervisor observation subcommands because the shared\n"
- + "# runtime surface is intentionally reduced to run/stop.\n"
- + "# - We therefore launch the detached supervisor and wait until its pid subtree keeps a live child\n"
- + "# process for a short stable window.\n"
- + _render_selection_supervisor_launch_wait_block(
- run_cmd=run_cmd,
- logfile_expr='"$LOGFILE"',
- stable_seconds_expr=str(STANDALONE_PROBABLE_READY_SECONDS),
- deadline_ts_expr='"$STARTUP_DEADLINE_TS"',
- context="[bare]",
- )
- + _render_tcp_ready_wait_block(context="[bare]")
- + 'echo "Started $SERVICE (label: $SUPERVISOR_LABEL)"\n'
- + 'echo "Logs: $LOGFILE"\n'
+ return _render_bare_template(
+ template_name="standalone_start_body.sh.tmpl",
+ values={
+ "SUPERVISOR_LABEL_ASSIGN": _sh_quote(
+ _bare_plain_selection_supervisor_label(name_prefix=name_prefix, service_name=service_name)
+ ),
+ "RUNTIME_STATE_JSON_ASSIGN": _sh_quote(runtime_state_json),
+ "STARTUP_DEADLINE_SECONDS": str(STANDALONE_STARTUP_DEADLINE_SECONDS),
+ "SELECTION_SUPERVISOR_LAUNCH_WAIT_BLOCK": _render_selection_supervisor_launch_wait_block(
+ run_cmd=run_cmd,
+ stable_seconds_expr=str(STANDALONE_PROBABLE_READY_SECONDS),
+ deadline_ts_expr='"$STARTUP_DEADLINE_TS"',
+ context="[bare]",
+ ),
+ },
)
def _render_selection_supervisor_path_from_script_dir() -> str:
- return (
- 'DIR=$(cd "$(dirname "$0")" && pwd)\n'
- + f'SELECTION_SUPERVISOR="$DIR/{PYTHON_SELECTION_SUPERVISOR_FILENAME}"\n'
- + 'if [ ! -f "$SELECTION_SUPERVISOR" ]; then\n'
- + ' echo "Missing selection supervisor: $SELECTION_SUPERVISOR"\n'
- + " exit 1\n"
- + "fi\n\n"
+ return _render_bare_template(
+ template_name="selection_supervisor_path_from_script_dir.sh.tmpl",
+ values={"SELECTION_SUPERVISOR_FILENAME": PYTHON_SELECTION_SUPERVISOR_FILENAME},
)
@@ -833,10 +656,6 @@ def _render_atomic_group_service_block(
log_path=f"${{HOSTWORKDIR}}/log/{service_name}.log",
)
allowed_nodes = _extract_nodes(service_cfg)
- service_port = _extract_port(service_cfg)
- port_export = ""
- if service_port is not None:
- port_export = f" export {service_name.upper()}__PORT={_sh_quote(str(service_port))}\n"
run_cmd = _render_selection_supervisor_run_shell(
subcommand="run",
supervisor_expr='"$SELECTION_SUPERVISOR"',
@@ -850,54 +669,37 @@ def _render_atomic_group_service_block(
crashloop_interval_lt_seconds=ATOMIC_GROUP_CRASHLOOP_INTERVAL_LT_SECONDS,
child_command=child_command,
)
- return (
- f"\n# rollout: {service_name}\n"
- + _render_nodes_bash(name="ALLOWED_NODES", nodes=allowed_nodes)
- + "scheduled=false\n"
- + 'for n in "${ALLOWED_NODES[@]}"; do\n'
- + ' if [ "$n" = "$NODE_ID" ]; then scheduled=true; fi\n'
- + "done\n"
- + 'if [ "$scheduled" != true ]; then\n'
- + f' echo "[rollout] skip {service_name}: not scheduled on node $NODE_ID"\n'
- + "else\n"
- + f" export SERVICE={_sh_quote(service_name)}\n"
- + port_export
- + ' LOG_DIR="$HOSTWORKDIR/log"\n'
- + ' mkdir -p "$LOG_DIR"\n'
- + f' SUPERVISOR_LABEL={_sh_quote(_bare_atomic_group_member_selection_supervisor_label(name_prefix=name_prefix, group_name=group_name, service_name=service_name))}\n'
- + f' RUNTIME_STATE_JSON={_sh_quote(runtime_state_json)}\n'
- + ' OWNER_TS_MS=$(python3 -c \'import time; print(int(time.time() * 1000))\')\n'
- + f' LOGFILE="$HOSTWORKDIR/log/{service_name}.log"\n'
- + ' touch "$LOGFILE"\n'
- + f' echo "[rollout] start {service_name} node=$NODE_ID hostworkdir=$HOSTWORKDIR"\n'
- + " # English note:\n"
- + " # - Atomic-group order still depends on a readiness gate, but that gate now observes only the\n"
- + " # detached supervisor process subtree on this host.\n"
- + " # - Ownership stays inside the shared selection supervisor big loop; the group runner only waits\n"
- + " # until that loop has a stable live child before advancing to the next service.\n"
- # English note:
- # - The embedded `run_cmd` contains a nested `bash -lc` payload, and that payload may contain
- # heredocs used by real service entrypoints.
- # - A blind newline replacement would shift heredoc terminators away from column 0 inside the
- # child shell and silently turn valid entrypoints into immediate no-op exits.
- # - Indent only the outer block lines while preserving each inner line start exactly.
- + _indent_script_block(
- script=_render_selection_supervisor_launch_wait_block(
- run_cmd=run_cmd,
- logfile_expr='"$LOGFILE"',
- stable_seconds_expr=str(ATOMIC_GROUP_PROBABLE_READY_SECONDS),
- deadline_ts_expr='"$GROUP_STARTUP_DEADLINE_TS"',
- context="[rollout]",
- ).rstrip() + "\n",
- prefix=" ",
- ).rstrip()
- + "\n"
- + _indent_script_block(
- script=_render_tcp_ready_wait_block(context="[rollout]"),
- prefix=" ",
- ).rstrip()
- + "\n"
- + "fi\n"
+ return _render_bare_template(
+ template_name="atomic_group_service_block.sh.tmpl",
+ values={
+ "SERVICE_NAME": service_name,
+ "ALLOWED_NODES_BLOCK": _render_nodes_bash(name="ALLOWED_NODES", nodes=allowed_nodes),
+ "SERVICE_EXPORT": _sh_quote(service_name),
+ "PORT_EXPORT": _render_service_port_export(
+ service_name=service_name,
+ service_cfg=service_cfg,
+ indent=" ",
+ ),
+ "SUPERVISOR_LABEL_ASSIGN": _sh_quote(
+ _bare_atomic_group_member_selection_supervisor_label(
+ name_prefix=name_prefix,
+ group_name=group_name,
+ service_name=service_name,
+ )
+ ),
+ "RUNTIME_STATE_JSON_ASSIGN": _sh_quote(runtime_state_json),
+ "LOGFILE_PATH": f"$HOSTWORKDIR/log/{service_name}.log",
+ "INDENTED_SELECTION_SUPERVISOR_LAUNCH_WAIT_BLOCK": _indent_script_block(
+ script=_render_selection_supervisor_launch_wait_block(
+ run_cmd=run_cmd,
+ stable_seconds_expr=str(ATOMIC_GROUP_PROBABLE_READY_SECONDS),
+ deadline_ts_expr='"$GROUP_STARTUP_DEADLINE_TS"',
+ context="[rollout]",
+ ).rstrip()
+ + "\n",
+ prefix=" ",
+ ).rstrip(),
+ },
)
diff --git a/deployment/templates/gen_bare_deploy_bash/atomic_group_node_resolution_tail.sh.tmpl b/deployment/templates/gen_bare_deploy_bash/atomic_group_node_resolution_tail.sh.tmpl
new file mode 100644
index 0000000..d385995
--- /dev/null
+++ b/deployment/templates/gen_bare_deploy_bash/atomic_group_node_resolution_tail.sh.tmpl
@@ -0,0 +1,14 @@
+{{GROUP_NODES_BLOCK}}scheduled=false
+for n in "${GROUP_NODES[@]}"; do
+ if [ "$n" = "$NODE_ID" ]; then scheduled=true; fi
+done
+if [ "$scheduled" != true ]; then
+ echo "[atomic-group] skip group=$GROUP node=$NODE_ID allowed=${GROUP_NODES[*]}"
+ exit 0
+fi
+
+export NODE_ID="$NODE_ID"
+export HOST_IP="$HOST_IP"
+export HOSTWORKDIR="$HOSTWORKDIR"
+echo "[atomic-group] group=$GROUP node=$NODE_ID hostworkdir=$HOSTWORKDIR"
+
diff --git a/deployment/templates/gen_bare_deploy_bash/atomic_group_service_block.sh.tmpl b/deployment/templates/gen_bare_deploy_bash/atomic_group_service_block.sh.tmpl
new file mode 100644
index 0000000..6ad9a1a
--- /dev/null
+++ b/deployment/templates/gen_bare_deploy_bash/atomic_group_service_block.sh.tmpl
@@ -0,0 +1,24 @@
+
+# rollout: {{SERVICE_NAME}}
+{{ALLOWED_NODES_BLOCK}}scheduled=false
+for n in "${ALLOWED_NODES[@]}"; do
+ if [ "$n" = "$NODE_ID" ]; then scheduled=true; fi
+done
+if [ "$scheduled" != true ]; then
+ echo "[rollout] skip {{SERVICE_NAME}}: not scheduled on node $NODE_ID"
+else
+ export SERVICE={{SERVICE_EXPORT}}
+{{PORT_EXPORT}} LOG_DIR="$HOSTWORKDIR/log"
+ mkdir -p "$LOG_DIR"
+ SUPERVISOR_LABEL={{SUPERVISOR_LABEL_ASSIGN}}
+ RUNTIME_STATE_JSON={{RUNTIME_STATE_JSON_ASSIGN}}
+ OWNER_TS_MS=$(python3 -c 'import time; print(int(time.time() * 1000))')
+ LOGFILE="{{LOGFILE_PATH}}"
+ echo "[rollout] start {{SERVICE_NAME}} node=$NODE_ID hostworkdir=$HOSTWORKDIR"
+ # English note:
+ # - Atomic-group order still depends on a readiness gate, but that gate now observes only the
+ # detached supervisor process subtree on this host.
+ # - Ownership stays inside the shared selection supervisor big loop; the group runner only waits
+ # through the fixed startup observation window before advancing to the next service.
+{{INDENTED_SELECTION_SUPERVISOR_LAUNCH_WAIT_BLOCK}}
+fi
diff --git a/deployment/templates/gen_bare_deploy_bash/atomic_group_start.sh.tmpl b/deployment/templates/gen_bare_deploy_bash/atomic_group_start.sh.tmpl
new file mode 100644
index 0000000..d0c82ad
--- /dev/null
+++ b/deployment/templates/gen_bare_deploy_bash/atomic_group_start.sh.tmpl
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+GROUP={{GROUP_ASSIGN}}
+NAME_PREFIX={{NAME_PREFIX_ASSIGN}}
+{{HOST_PRELUDE}}{{ATOMIC_GROUP_NODE_RESOLUTION_TAIL}}{{SELECTION_SUPERVISOR_PATH_BLOCK}}{{PROC_LIFECYCLE_HELPERS}}{{GLOBAL_ENV_EXPORTS}}GROUP_STARTUP_DEADLINE_TS=$(( $(date +%s) + {{GROUP_STARTUP_DEADLINE_ASSIGN}} ))
+{{SERVICE_BLOCKS}}echo "[atomic-group] ready group=$GROUP node=$NODE_ID"
diff --git a/deployment/templates/gen_bare_deploy_bash/atomic_group_stop.sh.tmpl b/deployment/templates/gen_bare_deploy_bash/atomic_group_stop.sh.tmpl
new file mode 100644
index 0000000..5501b8f
--- /dev/null
+++ b/deployment/templates/gen_bare_deploy_bash/atomic_group_stop.sh.tmpl
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+set -u -o pipefail
+
+GROUP={{GROUP_ASSIGN}}
+NAME_PREFIX={{NAME_PREFIX_ASSIGN}}
+{{HOST_PRELUDE}}{{ATOMIC_GROUP_NODE_RESOLUTION_TAIL}}{{SELECTION_SUPERVISOR_PATH_BLOCK}}{{ATOMIC_GROUP_STOP_FN}}stop_group
diff --git a/deployment/templates/gen_bare_deploy_bash/bare_entrypoint.sh.tmpl b/deployment/templates/gen_bare_deploy_bash/bare_entrypoint.sh.tmpl
new file mode 100644
index 0000000..39db682
--- /dev/null
+++ b/deployment/templates/gen_bare_deploy_bash/bare_entrypoint.sh.tmpl
@@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+export SERVICE={{SERVICE_EXPORT}}
+{{ENTRYPOINT}}
diff --git a/deployment/templates/gen_bare_deploy_bash/common_node_resolution_tail.sh.tmpl b/deployment/templates/gen_bare_deploy_bash/common_node_resolution_tail.sh.tmpl
new file mode 100644
index 0000000..e0cb433
--- /dev/null
+++ b/deployment/templates/gen_bare_deploy_bash/common_node_resolution_tail.sh.tmpl
@@ -0,0 +1,15 @@
+if [ ${#ALLOWED_NODES[@]} -gt 0 ]; then
+ _ok=false
+ for n in "${ALLOWED_NODES[@]}"; do
+ if [ "$n" = "$NODE_ID" ]; then _ok=true; fi
+ done
+ if [ "$_ok" != true ]; then
+ echo "Service {{SERVICE_NAME}} not scheduled on this node ($NODE_ID). Allowed: ${ALLOWED_NODES[*]}"
+ exit 0
+ fi
+fi
+
+export NODE_ID="$NODE_ID"
+export HOST_IP="$HOST_IP"
+export HOSTWORKDIR="$HOSTWORKDIR"
+
diff --git a/deployment/templates/gen_bare_deploy_bash/etcd_health_wait_block.sh.tmpl b/deployment/templates/gen_bare_deploy_bash/etcd_health_wait_block.sh.tmpl
new file mode 100644
index 0000000..b424bc3
--- /dev/null
+++ b/deployment/templates/gen_bare_deploy_bash/etcd_health_wait_block.sh.tmpl
@@ -0,0 +1,4 @@
+if ! wait_service_etcd_endpoint_healthy "$SERVICE" "$HOSTWORKDIR/fluxon_release/ext_images/etcd/etcdctl" "http://$HOST_IP:$SERVICE_PORT" {{ETCD_HEALTH_STABLE_SECONDS}} {{ETCD_HEALTH_DEADLINE_TS}} "{{CONTEXT}}"; then
+ echo "{{CONTEXT}} etcd-health failed svc=$SERVICE endpoint=http://$HOST_IP:$SERVICE_PORT"
+ exit 1
+fi
diff --git a/deployment/templates/gen_bare_deploy_bash/host_prelude.sh.tmpl b/deployment/templates/gen_bare_deploy_bash/host_prelude.sh.tmpl
new file mode 100644
index 0000000..6075106
--- /dev/null
+++ b/deployment/templates/gen_bare_deploy_bash/host_prelude.sh.tmpl
@@ -0,0 +1,57 @@
+{{ALL_NODES_BLOCK}}
+LOCAL_HOSTNAME=$(hostname -s 2>/dev/null || hostname 2>/dev/null || echo unknown)
+LOCAL_FQDN=$(hostname -f 2>/dev/null || echo "$LOCAL_HOSTNAME")
+NODE_ID="${NODE_ID:-}"
+if [ -n "$NODE_ID" ]; then
+ _node_id_known=false
+ for n in "${ALL_NODES[@]}"; do
+ if [ "$n" = "$NODE_ID" ]; then
+ _node_id_known=true
+ break
+ fi
+ done
+ if [ "$_node_id_known" != true ]; then
+ echo "Unknown preset NODE_ID: $NODE_ID"
+ echo "Known nodes: {{KNOWN_NODES}}"
+ exit 1
+ fi
+fi
+if [ -z "$NODE_ID" ]; then
+for n in "${ALL_NODES[@]}"; do
+ if [ "$n" = "$LOCAL_HOSTNAME" ] || [ "$n" = "$LOCAL_FQDN" ]; then
+ NODE_ID="$n"
+ break
+ fi
+done
+fi
+if [ -z "$NODE_ID" ] && [ ${#ALL_NODES[@]} -eq 1 ]; then
+ NODE_ID="${ALL_NODES[0]}"
+fi
+if [ -z "$NODE_ID" ]; then
+ for ip in $(hostname -I 2>/dev/null); do
+ for n in "${ALL_NODES[@]}"; do
+ _ip_n=""
+ case "$n" in
+{{IP_CASE_LINES}}
+ *) _ip_n="";;
+ esac
+ if [ "$_ip_n" = "$ip" ]; then
+ NODE_ID="$n"
+ break
+ fi
+ done
+ [ -n "$NODE_ID" ] && break
+ done
+fi
+if [ -z "$NODE_ID" ]; then
+ echo "Cannot map host to a configured node. Hostname=$LOCAL_HOSTNAME FQDN=$LOCAL_FQDN IPs=$(hostname -I 2>/dev/null)"
+ echo "Known nodes: {{KNOWN_NODES}}"
+ exit 1
+fi
+
+HOST_IP=""
+HOSTWORKDIR=""
+case "$NODE_ID" in
+{{HOST_CASE_LINES}}
+ *) echo "Unknown NODE_ID: $NODE_ID"; exit 1;;
+esac
diff --git a/deployment/templates/gen_bare_deploy_bash/selection_present_probe_fn.sh.tmpl b/deployment/templates/gen_bare_deploy_bash/selection_present_probe_fn.sh.tmpl
new file mode 100644
index 0000000..0a7282b
--- /dev/null
+++ b/deployment/templates/gen_bare_deploy_bash/selection_present_probe_fn.sh.tmpl
@@ -0,0 +1,19 @@
+selection_present() {
+ python3 - "$SELECTION_SUPERVISOR" "$SUPERVISOR_LABEL" "$HOSTWORKDIR" <<'__FLUXON_SELECTION_PRESENT__'
+import importlib.util
+import sys
+from pathlib import Path
+
+supervisor_path = Path(sys.argv[1])
+label = sys.argv[2]
+scope_key = sys.argv[3]
+spec = importlib.util.spec_from_file_location("fluxon_selection_supervisor_probe", supervisor_path)
+if spec is None or spec.loader is None:
+ raise RuntimeError(f"failed to load selection supervisor module: {supervisor_path}")
+module = importlib.util.module_from_spec(spec)
+sys.modules[spec.name] = module
+spec.loader.exec_module(module)
+raise SystemExit(0 if module._selection_present(label, scope_key=scope_key) else 1)
+__FLUXON_SELECTION_PRESENT__
+}
+
diff --git a/deployment/templates/gen_bare_deploy_bash/selection_supervisor_launch_wait_block.sh.tmpl b/deployment/templates/gen_bare_deploy_bash/selection_supervisor_launch_wait_block.sh.tmpl
new file mode 100644
index 0000000..f466cbc
--- /dev/null
+++ b/deployment/templates/gen_bare_deploy_bash/selection_supervisor_launch_wait_block.sh.tmpl
@@ -0,0 +1,9 @@
+SUPERVISOR_PID=$( {{RUN_CMD}} < /dev/null & echo "$!" )
+if [[ ! "$SUPERVISOR_PID" =~ ^[0-9]+$ ]]; then
+ echo "{{CONTEXT}} launch failed svc=$SERVICE label=$SUPERVISOR_LABEL supervisor_pid=$SUPERVISOR_PID"
+ exit 1
+fi
+if ! wait_service_probably_ready_pid_tree "$SERVICE" "$SUPERVISOR_PID" {{STABLE_SECONDS_EXPR}} {{DEADLINE_TS_EXPR}} "{{CONTEXT}}"; then
+ echo "{{CONTEXT}} probable-ready failed svc=$SERVICE label=$SUPERVISOR_LABEL supervisor_pid=$SUPERVISOR_PID"
+ exit 1
+fi
diff --git a/deployment/templates/gen_bare_deploy_bash/selection_supervisor_path_from_script_dir.sh.tmpl b/deployment/templates/gen_bare_deploy_bash/selection_supervisor_path_from_script_dir.sh.tmpl
new file mode 100644
index 0000000..dac7dff
--- /dev/null
+++ b/deployment/templates/gen_bare_deploy_bash/selection_supervisor_path_from_script_dir.sh.tmpl
@@ -0,0 +1,7 @@
+DIR=$(cd "$(dirname "$0")" && pwd)
+SELECTION_SUPERVISOR="$DIR/{{SELECTION_SUPERVISOR_FILENAME}}"
+if [ ! -f "$SELECTION_SUPERVISOR" ]; then
+ echo "Missing selection supervisor: $SELECTION_SUPERVISOR"
+ exit 1
+fi
+
diff --git a/deployment/templates/gen_bare_deploy_bash/standalone_start.sh.tmpl b/deployment/templates/gen_bare_deploy_bash/standalone_start.sh.tmpl
new file mode 100644
index 0000000..5a565f1
--- /dev/null
+++ b/deployment/templates/gen_bare_deploy_bash/standalone_start.sh.tmpl
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SERVICE={{SERVICE_ASSIGN}}
+NAME_PREFIX={{NAME_PREFIX_ASSIGN}}
+{{ALLOWED_NODES_BLOCK}}{{HOST_PRELUDE}}{{COMMON_NODE_RESOLUTION_TAIL}}{{SELECTION_SUPERVISOR_PATH_BLOCK}}{{PROC_LIFECYCLE_HELPERS}}{{SELECTION_PRESENT_PROBE_FN}}{{START_LOCK_BLOCK}}{{GLOBAL_ENV_EXPORTS}}{{PORT_EXPORT}}{{START_BODY}}
diff --git a/deployment/templates/gen_bare_deploy_bash/standalone_start_body.sh.tmpl b/deployment/templates/gen_bare_deploy_bash/standalone_start_body.sh.tmpl
new file mode 100644
index 0000000..bc2fc40
--- /dev/null
+++ b/deployment/templates/gen_bare_deploy_bash/standalone_start_body.sh.tmpl
@@ -0,0 +1,27 @@
+SUPERVISOR_LABEL={{SUPERVISOR_LABEL_ASSIGN}}
+RUNTIME_STATE_JSON={{RUNTIME_STATE_JSON_ASSIGN}}
+OWNER_TS_MS=$(python3 -c 'import time; print(int(time.time() * 1000))')
+STARTUP_DEADLINE_TS=$(( $(date +%s) + {{STARTUP_DEADLINE_SECONDS}} ))
+LOG_DIR="$HOSTWORKDIR/log"
+LOGFILE="$LOG_DIR/${SERVICE}.log"
+mkdir -p "$LOG_DIR"
+echo "Starting $SERVICE on $NODE_ID (IP: $HOST_IP, workdir: $HOSTWORKDIR)"
+# English note:
+# - bootstrap bare start must be idempotent when the shared selection supervisor already owns
+# a live child for the same label.
+# - start_test_bed enables this path only for deployconf.bootstrap_bare_services.
+if [ "${FLUXON_BARE_ALLOW_ALREADY_PRESENT:-false}" = "true" ]; then
+ if selection_present; then
+ echo "[bare] already present svc=$SERVICE label=$SUPERVISOR_LABEL"
+ echo "Started $SERVICE (label: $SUPERVISOR_LABEL)"
+ echo "Logs: $LOGFILE"
+ exit 0
+ fi
+fi
+# English note:
+# - Bare start must not depend on extra supervisor observation subcommands because the shared
+# runtime surface is intentionally reduced to run/stop.
+# - We therefore launch the detached supervisor and wait until its pid subtree has kept a child
+# process alive across the fixed startup observation window.
+{{SELECTION_SUPERVISOR_LAUNCH_WAIT_BLOCK}}echo "Started $SERVICE (label: $SUPERVISOR_LABEL)"
+echo "Logs: $LOGFILE"
diff --git a/deployment/templates/gen_bare_deploy_bash/standalone_stop.sh.tmpl b/deployment/templates/gen_bare_deploy_bash/standalone_stop.sh.tmpl
new file mode 100644
index 0000000..4f7dc37
--- /dev/null
+++ b/deployment/templates/gen_bare_deploy_bash/standalone_stop.sh.tmpl
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+# Abort on any command failure, on use of an unset variable, and on failures inside pipelines.
+set -euo pipefail
+
+SERVICE={{SERVICE_ASSIGN}}
+NAME_PREFIX={{NAME_PREFIX_ASSIGN}}
+{{ALLOWED_NODES_BLOCK}}{{HOST_PRELUDE}}{{COMMON_NODE_RESOLUTION_TAIL}}{{SELECTION_SUPERVISOR_PATH_BLOCK}}SUPERVISOR_LABEL={{SUPERVISOR_LABEL_ASSIGN}}
+# English note:
+# - Generated bare stop is retained as a manual operator tool.
+# - Automation must not depend on this path for handover or rollout convergence.
+# - The command only asks the shared selection supervisor to retire the concrete selection
+# identity identified by label on this node.
+# - The supervisor's stdout is discarded; a non-zero exit is reported and turned into exit 1.
+# - NOTE(review): --missing-ok presumably makes an absent selection count as success; confirm
+#   against the supervisor's stop subcommand.
+if ! python3 "$SELECTION_SUPERVISOR" stop --label "$SUPERVISOR_LABEL" --scope-key "$HOSTWORKDIR" --missing-ok >/dev/null; then
+ echo "[bare] stop failed svc=$SERVICE label=$SUPERVISOR_LABEL hostworkdir=$HOSTWORKDIR"
+ exit 1
+fi
diff --git a/deployment/templates/gen_bare_deploy_bash/start_lock_block.sh.tmpl b/deployment/templates/gen_bare_deploy_bash/start_lock_block.sh.tmpl
new file mode 100644
index 0000000..47ec770
--- /dev/null
+++ b/deployment/templates/gen_bare_deploy_bash/start_lock_block.sh.tmpl
@@ -0,0 +1,14 @@
+# English note:
+# - Guards against two concurrent starts of the same service on this host work directory by
+#   taking a non-blocking exclusive flock on a per-service lock file under "$HOSTWORKDIR/run".
+# - When the lock is already held, the start is skipped and the script exits 0.
+PID_DIR="$HOSTWORKDIR/run"
+mkdir -p "$PID_DIR"
+START_LOCKFILE="$PID_DIR/${SERVICE}.start.lock"
+# flock is mandatory; fail loudly instead of starting without mutual exclusion.
+if ! command -v flock >/dev/null 2>&1; then
+ echo "Missing required command: flock"
+ exit 1
+fi
+# Open (creating if needed) the lock file on descriptor 9, then try to lock it without waiting.
+exec 9>"$START_LOCKFILE"
+if ! flock -xn 9; then
+ echo "[bare] start skipped svc=$SERVICE reason=another start is already running lockfile=$START_LOCKFILE"
+ exit 0
+fi
+# NOTE(review): descriptor 9 is closed immediately after the lock is taken. Closing the only
+# descriptor appears to release the flock right away, so the lock would cover just the check
+# above and not the rest of the start sequence. Confirm this is intended (for example to keep
+# the detached supervisor from inheriting the descriptor).
+exec 9>&-
+
diff --git a/deployment/templates/gen_bare_deploy_bash/tcp_ready_helpers.sh.tmpl b/deployment/templates/gen_bare_deploy_bash/tcp_ready_helpers.sh.tmpl
new file mode 100644
index 0000000..0c0cc3b
--- /dev/null
+++ b/deployment/templates/gen_bare_deploy_bash/tcp_ready_helpers.sh.tmpl
@@ -0,0 +1,120 @@
+# wait_service_tcp_ready SVC HOST PORT STABLE_SECONDS DEADLINE_TS CONTEXT
+#
+# Polls HOST:PORT with a 1-second TCP connect probe until the port has accepted connections
+# for enough consecutive polls to cover STABLE_SECONDS, or until DEADLINE_TS (epoch seconds).
+#   - SVC is only used in log lines; CONTEXT is the log-line prefix.
+#   - Returns 0 once the required number of consecutive probes succeeded.
+#   - Returns 1 on a non-numeric PORT, a non-positive STABLE_SECONDS, a failed stable-check
+#     computation, or when the deadline passes first.
+# A failed probe resets the consecutive-success counter to zero.
+# All working variables are function-local so the helper does not overwrite same-named
+# variables in the calling script.
+wait_service_tcp_ready() {
+  local svc host port stable_seconds deadline_ts context
+  local poll_interval_seconds stable_checks ok_checks now
+  svc="$1"
+  host="$2"
+  port="$3"
+  stable_seconds="$4"
+  deadline_ts="$5"
+  context="$6"
+  if [[ ! "$port" =~ ^[0-9]+$ ]]; then
+    echo "$context tcp-ready: invalid port svc=$svc port=$port"
+    return 1
+  fi
+  if [[ ! "$stable_seconds" =~ ^[0-9]+$ ]] || [ "$stable_seconds" -le 0 ]; then
+    echo "$context tcp-ready: invalid stable_seconds svc=$svc stable_seconds=$stable_seconds"
+    return 1
+  fi
+  poll_interval_seconds={{TCP_READY_POLL_INTERVAL_SECONDS}}
+  # The poll interval may be fractional, so the check count is computed in python3:
+  # ceil(stable_seconds / poll_interval_seconds), never less than 1.
+  # The assignment is kept separate from the local declaration so its exit status is not masked.
+  stable_checks=$(python3 - "$stable_seconds" "$poll_interval_seconds" <<'__FLUXON_TCP_READY_CHECKS__'
+import math
+import sys
+stable_seconds = float(sys.argv[1])
+poll_interval_seconds = float(sys.argv[2])
+print(max(1, int(math.ceil(stable_seconds / poll_interval_seconds))))
+__FLUXON_TCP_READY_CHECKS__
+)
+  if [[ ! "$stable_checks" =~ ^[0-9]+$ ]] || [ "$stable_checks" -le 0 ]; then
+    echo "$context tcp-ready: failed to compute stable_checks svc=$svc"
+    return 1
+  fi
+  ok_checks=0
+  while true; do
+    now=$(date +%s)
+    if [ "$now" -ge "$deadline_ts" ]; then
+      echo "$context tcp-ready: deadline exceeded svc=$svc host=$host port=$port"
+      return 1
+    fi
+    # The probe succeeds when a TCP connection can be opened within one second.
+    if python3 - "$host" "$port" <<'__FLUXON_TCP_READY_PROBE__'
+import socket
+import sys
+host = sys.argv[1]
+port = int(sys.argv[2])
+with socket.create_connection((host, port), timeout=1.0):
+    pass
+__FLUXON_TCP_READY_PROBE__
+    then
+      ok_checks=$((ok_checks+1))
+      if [ "$ok_checks" -ge "$stable_checks" ]; then
+        echo "$context tcp-ready: ok svc=$svc host=$host port=$port stable_checks=$stable_checks"
+        return 0
+      fi
+    else
+      # Only report a reset when a streak of successes was actually interrupted.
+      if [ "$ok_checks" -ne 0 ]; then
+        echo "$context tcp-ready: reset svc=$svc ok_checks=$ok_checks host=$host port=$port"
+      fi
+      ok_checks=0
+    fi
+    sleep "$poll_interval_seconds"
+  done
+}
+
+# wait_service_etcd_endpoint_healthy SVC ETCDCTL_BIN ENDPOINT STABLE_SECONDS DEADLINE_TS CONTEXT
+#
+# Polls "etcdctl endpoint health" against ENDPOINT until it has succeeded for enough
+# consecutive polls to cover STABLE_SECONDS, or until DEADLINE_TS (epoch seconds).
+#   - SVC is only used in log lines; CONTEXT is the log-line prefix.
+#   - Returns 0 once the required number of consecutive probes succeeded.
+#   - Returns 1 when ETCDCTL_BIN is not executable, ENDPOINT is empty, STABLE_SECONDS is not a
+#     positive integer, the stable-check computation fails, or the deadline passes first.
+# The output of the most recent failed probe is kept and printed (newlines flattened to
+# spaces) when the deadline is exceeded.
+# All working variables are function-local so the helper does not overwrite same-named
+# variables in the calling script.
+wait_service_etcd_endpoint_healthy() {
+  local svc etcdctl_bin endpoint stable_seconds deadline_ts context
+  local poll_interval_seconds stable_checks ok_checks last_output now probe_output
+  svc="$1"
+  etcdctl_bin="$2"
+  endpoint="$3"
+  stable_seconds="$4"
+  deadline_ts="$5"
+  context="$6"
+  if [ ! -x "$etcdctl_bin" ]; then
+    echo "$context etcd-health: missing etcdctl svc=$svc path=$etcdctl_bin"
+    return 1
+  fi
+  if [ -z "$endpoint" ]; then
+    echo "$context etcd-health: missing endpoint svc=$svc"
+    return 1
+  fi
+  if [[ ! "$stable_seconds" =~ ^[0-9]+$ ]] || [ "$stable_seconds" -le 0 ]; then
+    echo "$context etcd-health: invalid stable_seconds svc=$svc stable_seconds=$stable_seconds"
+    return 1
+  fi
+  poll_interval_seconds={{ETCD_HEALTH_POLL_INTERVAL_SECONDS}}
+  # The poll interval may be fractional, so the check count is computed in python3:
+  # ceil(stable_seconds / poll_interval_seconds), never less than 1.
+  # The assignment is kept separate from the local declaration so its exit status is not masked.
+  stable_checks=$(python3 - "$stable_seconds" "$poll_interval_seconds" <<'__FLUXON_ETCD_HEALTH_CHECKS__'
+import math
+import sys
+stable_seconds = float(sys.argv[1])
+poll_interval_seconds = float(sys.argv[2])
+print(max(1, int(math.ceil(stable_seconds / poll_interval_seconds))))
+__FLUXON_ETCD_HEALTH_CHECKS__
+)
+  if [[ ! "$stable_checks" =~ ^[0-9]+$ ]] || [ "$stable_checks" -le 0 ]; then
+    echo "$context etcd-health: failed to compute stable_checks svc=$svc"
+    return 1
+  fi
+  ok_checks=0
+  last_output=""
+  while true; do
+    now=$(date +%s)
+    if [ "$now" -ge "$deadline_ts" ]; then
+      if [ -n "$last_output" ]; then
+        # Flatten the captured probe output onto one line for the log message.
+        last_output="${last_output//$'\n'/ }"
+        echo "$context etcd-health: deadline exceeded svc=$svc endpoint=$endpoint last_output=$last_output"
+      else
+        echo "$context etcd-health: deadline exceeded svc=$svc endpoint=$endpoint"
+      fi
+      return 1
+    fi
+    if probe_output=$(ETCDCTL_API=3 "$etcdctl_bin" --endpoints "$endpoint" --dial-timeout "{{ETCD_HEALTH_PROBE_TIMEOUT_MS}}ms" --command-timeout "{{ETCD_HEALTH_PROBE_TIMEOUT_MS}}ms" endpoint health 2>&1); then
+      ok_checks=$((ok_checks+1))
+      if [ "$ok_checks" -ge "$stable_checks" ]; then
+        echo "$context etcd-health: ok svc=$svc endpoint=$endpoint stable_checks=$stable_checks"
+        return 0
+      fi
+    else
+      last_output="$probe_output"
+      # Only report a reset when a streak of successes was actually interrupted.
+      if [ "$ok_checks" -ne 0 ]; then
+        echo "$context etcd-health: reset svc=$svc ok_checks=$ok_checks endpoint=$endpoint"
+      fi
+      ok_checks=0
+    fi
+    sleep "$poll_interval_seconds"
+  done
+}
diff --git a/deployment/templates/gen_bare_deploy_bash/tcp_ready_wait_block.sh.tmpl b/deployment/templates/gen_bare_deploy_bash/tcp_ready_wait_block.sh.tmpl
new file mode 100644
index 0000000..bbf021b
--- /dev/null
+++ b/deployment/templates/gen_bare_deploy_bash/tcp_ready_wait_block.sh.tmpl
@@ -0,0 +1,6 @@
+# English note:
+# - The TCP readiness wait only runs when SERVICE_PORT is set to a purely numeric value;
+#   otherwise this block is a no-op.
+# - A failed wait is reported and terminates the script with exit 1.
+if [[ "${SERVICE_PORT:-}" =~ ^[0-9]+$ ]]; then
+ if ! wait_service_tcp_ready "$SERVICE" "$HOST_IP" "$SERVICE_PORT" {{TCP_READY_STABLE_SECONDS}} {{TCP_READY_DEADLINE_TS}} "{{CONTEXT}}"; then
+ echo "{{CONTEXT}} tcp-ready failed svc=$SERVICE host=$HOST_IP port=$SERVICE_PORT"
+ exit 1
+ fi
+fi
diff --git a/deployment/tests/test_gen_bare_deploy_bash.py b/deployment/tests/test_gen_bare_deploy_bash.py
index f51a923..f1645a3 100644
--- a/deployment/tests/test_gen_bare_deploy_bash.py
+++ b/deployment/tests/test_gen_bare_deploy_bash.py
@@ -13,6 +13,8 @@
from pathlib import Path
from typing import Callable, List, Optional, Tuple
+import yaml
+
SCRIPT_DIR = Path(__file__).resolve().parent
DEPLOYMENT_DIR = SCRIPT_DIR.parent
@@ -50,6 +52,12 @@ def _build_checks(selected_test_id: Optional[str]) -> List[Tuple[str, Callable[[
("preserves_hostworkdir_runtime_token", test_preserves_hostworkdir_runtime_token),
("generated_scripts_do_not_embed_pidfile_authority", test_generated_scripts_do_not_embed_pidfile_authority),
("ops_entrypoints_use_direct_scripts", test_ops_entrypoints_use_direct_scripts),
+ ("bare_start_uses_no_exit_startup_gate", test_bare_start_uses_no_exit_startup_gate),
+ (
+ "normalized_testbed_master_exports_service_port_for_atomic_group",
+ test_normalized_testbed_master_exports_service_port_for_atomic_group,
+ ),
+ ("normalized_testbed_owner_emits_large_file_paths", test_normalized_testbed_owner_emits_large_file_paths),
("bare_child_command_preserves_runtime_hostworkdir_expansion", test_bare_child_command_preserves_runtime_hostworkdir_expansion),
("supervisor_label_uses_stable_selection_suffix", test_supervisor_label_uses_stable_selection_suffix),
("bootstrap_start_reuses_already_present_selection", test_bootstrap_start_reuses_already_present_selection),
@@ -93,6 +101,7 @@ def test_preserves_hostworkdir_runtime_token() -> None:
FLUXON_SHARED_MEM: "${HOSTWORKDIR}/shm1"
service:
svc_plain:
+ port: 12345
entrypoint: |
WORKDIR="${HOSTWORKDIR}/svc_${NODE_ID}"
EXPORT_TABLE=$(cat < None:
assert "wait-present" not in script, script
assert "launch_only_start_gate" not in script, script
assert 'wait_service_probably_ready_pid_tree "$SERVICE" "$SUPERVISOR_PID"' in script, script
- assert 'wait_service_tcp_ready "$SERVICE" "$HOST_IP" "$SERVICE_PORT"' in script, script
+ assert 'wait_service_probably_ready_pid_tree "$SERVICE" "$SUPERVISOR_PID" 10 "$STARTUP_DEADLINE_TS" "[bare]"' in script, script
+ assert "export SERVICE_PORT=12345" in script, script
+ assert 'STARTUP_DEADLINE_TS=$(( $(date +%s) + 10 ))' in script, script
+ assert "wait_service_tcp_ready" not in script, script
+ assert "wait_service_etcd_endpoint_healthy" not in script, script
assert 'SUPERVISOR_PID=$( setsid ' not in script, script
+ assert '>>"$LOGFILE" 2>&1' not in script, script
+ assert 'touch "$LOGFILE"' not in script, script
assert 'python3 "$SELECTION_SUPERVISOR" stop --label "$SUPERVISOR_LABEL" --scope-key "$HOSTWORKDIR" --missing-ok' in stop_script, stop_script
assert "retire-runtime" not in stop_script, stop_script
print("PASS: test_preserves_hostworkdir_runtime_token")
@@ -149,6 +164,7 @@ def test_atomic_group_start_does_not_auto_stop_on_failure() -> None:
hostworkdir: /tmp/hostworkdir
service:
svc_a:
+ port: 23456
entrypoint: |
echo svc_a
node_bind:
@@ -179,7 +195,12 @@ def test_atomic_group_start_does_not_auto_stop_on_failure() -> None:
assert 'SUPERVISOR_PID=$( setsid ' not in script, script
assert 'echo "[rollout] probable-ready failed svc=$SERVICE label=$SUPERVISOR_LABEL supervisor_pid=$SUPERVISOR_PID"' in script, script
assert 'wait_service_probably_ready_pid_tree "$SERVICE" "$SUPERVISOR_PID"' in script, script
- assert 'wait_service_tcp_ready "$SERVICE" "$HOST_IP" "$SERVICE_PORT"' in script, script
+ assert 'wait_service_probably_ready_pid_tree "$SERVICE" "$SUPERVISOR_PID" 10 "$GROUP_STARTUP_DEADLINE_TS" "[rollout]"' in script, script
+ assert 'GROUP_STARTUP_DEADLINE_TS=$(( $(date +%s) + 10 ))' in script, script
+ assert "export SERVICE_PORT=23456" in script, script
+ assert "unset SERVICE_PORT" in script, script
+ assert "wait_service_tcp_ready" not in script, script
+ assert "wait_service_etcd_endpoint_healthy" not in script, script
print("PASS: test_atomic_group_start_does_not_auto_stop_on_failure")
@@ -251,11 +272,129 @@ def test_ops_entrypoints_use_direct_scripts() -> None:
assert "-m fluxon_py.runtime.start_ops_controller" in controller_entrypoint, controller_entrypoint
assert "examples/fluxon_ops/start_controller.py" not in controller_entrypoint, controller_entrypoint
+ assert 'http_listen_addr: "0.0.0.0:19080"' in controller_entrypoint, controller_entrypoint
+ assert 'http_listen_addr: "0.0.0.0:${MASTER__PORT}"' not in controller_entrypoint, controller_entrypoint
assert "-m fluxon_py.runtime.start_ops_agent" in agent_entrypoint, agent_entrypoint
assert "examples/fluxon_ops/start_agent.py" not in agent_entrypoint, agent_entrypoint
print("PASS: test_ops_entrypoints_use_direct_scripts")
+def test_bare_start_uses_no_exit_startup_gate() -> None:
+ # Generates start scripts for three services (etcd, tikv, svc_plain) bound to one node and
+ # checks that every script uses the pid-tree startup gate with a 10-second deadline and
+ # contains neither the TCP-ready helper nor the etcd-health helper.
+ with tempfile.TemporaryDirectory(prefix="test_gen_bare_deploy_bash_no_exit_gate_") as td:
+ tmpdir = Path(td)
+ config_path = tmpdir / "deployconf.yaml"
+ outdir = tmpdir / "out"
+ # Minimal deployconf: one node, three services, each with an explicit port.
+ config_path.write_text(
+ textwrap.dedent(
+ """
+ name_prefix: fluxon-testbed
+ cluster_nodes:
+ - hostname: node-a
+ ip: 127.0.0.1
+ hostworkdir: /tmp/hostworkdir
+ service:
+ etcd:
+ port: 2379
+ entrypoint: |
+ echo etcd
+ node_bind:
+ node: ["node-a"]
+ tikv:
+ port: 20160
+ entrypoint: |
+ echo tikv
+ node_bind:
+ node: ["node-a"]
+ svc_plain:
+ port: 12345
+ entrypoint: |
+ echo plain
+ node_bind:
+ node: ["node-a"]
+ """
+ ).strip()
+ + "\n",
+ encoding="utf-8",
+ )
+
+ result = _run_generator(config_path=config_path, outdir=outdir)
+ assert result.returncode == 0, f"generator failed: stdout={result.stdout} stderr={result.stderr}"
+
+ etcd_script = (outdir / "start_etcd.sh").read_text(encoding="utf-8")
+ tikv_script = (outdir / "start_tikv.sh").read_text(encoding="utf-8")
+ plain_script = (outdir / "start_svc_plain.sh").read_text(encoding="utf-8")
+
+ # The same gate is expected regardless of service kind; the script text is the assert message.
+ for script in (etcd_script, tikv_script, plain_script):
+ assert 'STARTUP_DEADLINE_TS=$(( $(date +%s) + 10 ))' in script, script
+ assert 'wait_service_probably_ready_pid_tree "$SERVICE" "$SUPERVISOR_PID" 10 "$STARTUP_DEADLINE_TS" "[bare]"' in script, script
+ assert "wait_service_tcp_ready" not in script, script
+ assert "wait_service_etcd_endpoint_healthy" not in script, script
+ print("PASS: test_bare_start_uses_no_exit_startup_gate")
+
+
+def test_normalized_testbed_master_exports_service_port_for_atomic_group() -> None:
+    """Check the master section of the generated atomic-group start script.
+
+    The testbed deployconf is normalized through start_test_bed, fed to the generator, and the
+    text between ``export SERVICE=master`` and ``export SERVICE=owner`` is inspected: it must
+    export both port variables as 51051, keep SERVICE_PORT set, and use the pid-tree gate
+    rather than the TCP-ready helper.
+    """
+    with tempfile.TemporaryDirectory(prefix="test_gen_bare_deploy_bash_normalized_testbed_") as scratch:
+        workdir = Path(scratch)
+        config_path = workdir / "deployconf.normalized.yaml"
+        outdir = workdir / "out"
+
+        start_test_bed = _load_python_module(
+            module_name="start_test_bed_for_gen_bare_tests",
+            path=DEPLOYMENT_DIR.parent / "fluxon_test_stack" / "start_test_bed.py",
+        )
+        testbed_conf_path = DEPLOYMENT_DIR.parent / "fluxon_test_stack" / "deployconf_testbed.yml"
+        base_cfg = yaml.safe_load(testbed_conf_path.read_text(encoding="utf-8"))
+        normalized, _ = start_test_bed._normalize_bootstrap_deployconf(deployconf=base_cfg)
+        rendered_conf = yaml.safe_dump(normalized, sort_keys=False, allow_unicode=False)
+        config_path.write_text(rendered_conf, encoding="utf-8")
+
+        result = _run_generator(config_path=config_path, outdir=outdir)
+        assert result.returncode == 0, f"generator failed: stdout={result.stdout} stderr={result.stderr}"
+
+        script = (outdir / "start_fluxon_core_controller.sh").read_text(encoding="utf-8")
+        # Slice out the master section: everything up to where the owner section begins.
+        section_begin = script.index("export SERVICE=master")
+        section_end = script.index("export SERVICE=owner")
+        master_block = script[section_begin:section_end]
+
+        assert "export MASTER__PORT=51051" in master_block, master_block
+        assert "export SERVICE_PORT=51051" in master_block, master_block
+        assert "unset SERVICE_PORT" not in master_block, master_block
+        expected_gate = (
+            'wait_service_probably_ready_pid_tree "$SERVICE" "$SUPERVISOR_PID" 10 "$GROUP_STARTUP_DEADLINE_TS" "[rollout]"'
+        )
+        assert expected_gate in master_block, master_block
+        assert "wait_service_tcp_ready" not in master_block, master_block
+    print("PASS: test_normalized_testbed_master_exports_service_port_for_atomic_group")
+
+
+def test_normalized_testbed_owner_emits_large_file_paths() -> None:
+    """Check that the generated owner entrypoint carries the large-file path settings.
+
+    The testbed deployconf is normalized through start_test_bed and fed to the generator; the
+    owner entrypoint script must then contain the ``large_file_paths`` section with the log and
+    cache roots under ``${HOSTWORKDIR}/large``.
+    """
+    with tempfile.TemporaryDirectory(prefix="test_gen_bare_deploy_bash_testbed_owner_large_paths_") as scratch:
+        workdir = Path(scratch)
+        config_path = workdir / "deployconf.normalized.yaml"
+        outdir = workdir / "out"
+
+        start_test_bed = _load_python_module(
+            module_name="start_test_bed_for_owner_large_paths_tests",
+            path=DEPLOYMENT_DIR.parent / "fluxon_test_stack" / "start_test_bed.py",
+        )
+        testbed_conf_path = DEPLOYMENT_DIR.parent / "fluxon_test_stack" / "deployconf_testbed.yml"
+        base_cfg = yaml.safe_load(testbed_conf_path.read_text(encoding="utf-8"))
+        normalized, _ = start_test_bed._normalize_bootstrap_deployconf(deployconf=base_cfg)
+        rendered_conf = yaml.safe_dump(normalized, sort_keys=False, allow_unicode=False)
+        config_path.write_text(rendered_conf, encoding="utf-8")
+
+        result = _run_generator(config_path=config_path, outdir=outdir)
+        assert result.returncode == 0, f"generator failed: stdout={result.stdout} stderr={result.stderr}"
+
+        owner_entrypoint = outdir / "entrypoint__fluxon-self-host2-fluxon_core_controller__owner.sh"
+        script = owner_entrypoint.read_text(encoding="utf-8")
+        expected_fragments = (
+            'large_file_paths:',
+            'log_root_path: "${HOSTWORKDIR}/large/log/owner_${NODE_ID}"',
+            'cache_root_path: "${HOSTWORKDIR}/large/cache/owner_${NODE_ID}"',
+        )
+        for fragment in expected_fragments:
+            assert fragment in script, script
+    print("PASS: test_normalized_testbed_owner_emits_large_file_paths")
+
+
def test_bare_child_command_preserves_runtime_hostworkdir_expansion() -> None:
with tempfile.TemporaryDirectory(prefix="test_gen_bare_deploy_bash_runtime_expand_") as td:
tmpdir = Path(td)
@@ -600,6 +739,16 @@ def _load_generated_supervisor_module(supervisor_path: Path):
return module
+def _load_python_module(*, module_name: str, path: Path):
+    """Import the Python file at ``path`` under ``module_name`` and return the module.
+
+    The module is registered in ``sys.modules`` before its code runs.
+
+    Raises:
+        RuntimeError: if no import spec or loader can be built for ``path``.
+    """
+    spec = importlib.util.spec_from_file_location(module_name, path)
+    loader = spec.loader if spec is not None else None
+    if loader is None:
+        raise RuntimeError(f"failed to load module: {path}")
+    loaded = importlib.util.module_from_spec(spec)
+    sys.modules[module_name] = loaded
+    loader.exec_module(loaded)
+    return loaded
+
+
def _wait_until_selection_present(module, *, label: str, timeout_seconds: int = 15) -> None:
deadline = time.time() + timeout_seconds
while time.time() < deadline:
diff --git a/deployment/tests/test_gen_k8s_daemonset.py b/deployment/tests/test_gen_k8s_daemonset.py
index eff0aad..2cd769e 100644
--- a/deployment/tests/test_gen_k8s_daemonset.py
+++ b/deployment/tests/test_gen_k8s_daemonset.py
@@ -248,7 +248,7 @@ def test_ops_entrypoints_use_direct_scripts() -> None:
cluster_name: "${FLUXON_CLUSTER_NAME}"
member_kind: kv
output: web
- http_listen_addr: "0.0.0.0:${MASTER__PORT}"
+ http_listen_addr: "0.0.0.0:${OPS_CONTROLLER__PORT}"
YAML
${HOSTWORKDIR}/venv/bin/python -m fluxon_py.runtime.start_ops_controller -c "${WORKDIR}/ops_controller.yaml" -w "${WORKDIR}"
node_bind:
diff --git a/deployment/tests/test_log_shard.py b/deployment/tests/test_log_shard.py
new file mode 100644
index 0000000..642e718
--- /dev/null
+++ b/deployment/tests/test_log_shard.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import argparse
+import datetime
+import os
+import sys
+import tempfile
+import time
+from pathlib import Path
+from typing import Callable, List, Optional, Tuple
+
+SCRIPT_DIR = Path(__file__).resolve().parent
+DEPLOYMENT_DIR = SCRIPT_DIR.parent
+sys.path.insert(0, str(DEPLOYMENT_DIR))
+
+from utils import log_shard
+
+
+def main() -> int:
+    """Run the selected checks, print PASS/FAIL per check, and return a process exit code.
+
+    Returns:
+        0 when every check passed, 1 when at least one raised.
+    """
+    cli = argparse.ArgumentParser(description="log_shard util test runner")
+    cli.add_argument("--test-id", help="Run only the named test id")
+    selected_id = cli.parse_args().test_id
+
+    failed_names = []
+    for _check_id, run_check in _build_checks(selected_id):
+        try:
+            run_check()
+            print(f"PASS: {run_check.__name__}")
+        except Exception as exc:
+            # A failing check must not stop the remaining ones from running.
+            print(f"FAIL: {run_check.__name__}: {exc}")
+            failed_names.append(run_check.__name__)
+    return 1 if failed_names else 0
+
+
+def _build_checks(selected_test_id: Optional[str]) -> List[Tuple[str, Callable[[], None]]]:
+    """Return the (id, callable) pairs to run.
+
+    With no selection the full registry is returned in declaration order; otherwise only the
+    entry whose id equals ``selected_test_id``.
+
+    Raises:
+        ValueError: if ``selected_test_id`` names no registered check.
+    """
+    registry: List[Tuple[str, Callable[[], None]]] = [
+        ("daily_path_uses_utc_date_suffix", test_daily_path_uses_utc_date_suffix),
+        ("daily_path_uses_test_window_suffix_when_configured", test_daily_path_uses_test_window_suffix_when_configured),
+        ("resolve_readable_prefers_latest_existing_shard", test_resolve_readable_prefers_latest_existing_shard),
+        ("cleanup_keeps_only_retention_window", test_cleanup_keeps_only_retention_window),
+    ]
+    if selected_test_id is None:
+        return registry
+    matching = [entry for entry in registry if entry[0] == selected_test_id]
+    if matching:
+        return [matching[0]]
+    available = ", ".join(name for name, _ in registry)
+    raise ValueError(f"unknown --test-id: {selected_test_id}. Available: {available}")
+
+
+def test_daily_path_uses_utc_date_suffix() -> None:
+    """A fixed UTC instant must map to a shard named with that UTC calendar date."""
+    fixed_now = datetime.datetime(2026, 6, 21, 4, 0, 0, tzinfo=datetime.timezone.utc)
+    resolved = log_shard.daily_sharded_log_path(Path("/tmp/test_runner.log"), now=fixed_now)
+    assert resolved.name == "test_runner.2026-06-21.log", resolved
+
+
+def test_resolve_readable_prefers_latest_existing_shard() -> None:
+    """With two dated shards on disk, resolution must pick the later-dated one."""
+    with tempfile.TemporaryDirectory(prefix="test_log_shard_resolve_") as scratch:
+        log_dir = Path(scratch)
+        older_shard = log_dir / "service.2026-06-19.log"
+        newer_shard = log_dir / "service.2026-06-20.log"
+        older_shard.write_text("old\n", encoding="utf-8")
+        newer_shard.write_text("new\n", encoding="utf-8")
+        resolved = log_shard.resolve_readable_log_path(log_dir / "service.log")
+        assert resolved == newer_shard.resolve(), resolved
+
+
+def test_daily_path_uses_test_window_suffix_when_configured() -> None:
+    """With a 10-second test window anchored at 2026-06-21T00:00:00Z, instants 5 s and 15 s
+    after the anchor must land in consecutive shards named 2026-01-01 and 2026-01-02.
+
+    Both environment variables are restored to their previous state afterwards.
+    """
+    base = Path("/tmp/test_runner.log")
+    window_key = log_shard.TEST_LOG_SHARD_WINDOW_SECONDS_ENV
+    anchor_key = log_shard.TEST_LOG_SHARD_ANCHOR_UNIX_SECONDS_ENV
+    previous = {window_key: os.environ.get(window_key)}
+    previous[anchor_key] = os.environ.get(anchor_key)
+    try:
+        anchor_dt = datetime.datetime(2026, 6, 21, 0, 0, 0, tzinfo=datetime.timezone.utc)
+        os.environ[window_key] = "10"
+        os.environ[anchor_key] = str(int(anchor_dt.timestamp()))
+        in_first_window = datetime.datetime(2026, 6, 21, 0, 0, 5, tzinfo=datetime.timezone.utc)
+        in_second_window = datetime.datetime(2026, 6, 21, 0, 0, 15, tzinfo=datetime.timezone.utc)
+        resolved_0 = log_shard.daily_sharded_log_path(base, now=in_first_window)
+        resolved_1 = log_shard.daily_sharded_log_path(base, now=in_second_window)
+        assert resolved_0.name == "test_runner.2026-01-01.log", resolved_0
+        assert resolved_1.name == "test_runner.2026-01-02.log", resolved_1
+    finally:
+        # Put each variable back exactly as it was: remove it if it was unset before.
+        for key in (window_key, anchor_key):
+            old_value = previous[key]
+            if old_value is None:
+                os.environ.pop(key, None)
+            else:
+                os.environ[key] = old_value
+
+
+def test_cleanup_keeps_only_retention_window() -> None:
+    """With retention_days=31, a shard dated 31 days ago must be deleted while shards dated
+    30 days ago and today must survive."""
+    with tempfile.TemporaryDirectory(prefix="test_log_shard_cleanup_") as scratch:
+        log_dir = Path(scratch)
+        today = datetime.datetime.now(datetime.timezone.utc).date()
+
+        def shard_for(age_days: int) -> Path:
+            # Path of the shard dated `age_days` days before today (UTC).
+            shard_date = today - datetime.timedelta(days=age_days)
+            return log_dir / f"service.{shard_date.isoformat()}.log"
+
+        stale_path = shard_for(31)
+        recent_path = shard_for(30)
+        today_path = shard_for(0)
+        stale_path.write_text("stale\n", encoding="utf-8")
+        recent_path.write_text("recent\n", encoding="utf-8")
+        today_path.write_text("today\n", encoding="utf-8")
+
+        log_shard.cleanup_old_daily_sharded_logs(log_dir / "service.log", retention_days=31)
+
+        assert not stale_path.exists(), stale_path
+        assert recent_path.exists(), recent_path
+        assert today_path.exists(), today_path
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/deployment/tests/test_selection_supervisor_codegen.py b/deployment/tests/test_selection_supervisor_codegen.py
index 02ffa3b..a00caa9 100644
--- a/deployment/tests/test_selection_supervisor_codegen.py
+++ b/deployment/tests/test_selection_supervisor_codegen.py
@@ -19,6 +19,7 @@
UTILS_DIR = SCRIPT_DIR.parent / "utils"
sys.path.insert(0, str(UTILS_DIR))
+from log_shard import render_module_source as render_log_shard_module_source # type: ignore
from selection_supervisor_codegen import render_python_selection_supervisor_module # type: ignore
@@ -41,6 +42,9 @@ def _build_checks(selected_test_id: Optional[str]) -> List[Tuple[str, Callable[[
("install_subreaper_uses_prctl", test_install_subreaper_uses_prctl),
("spawn_child_sanitizes_rdma_driver_env", test_spawn_child_sanitizes_rdma_driver_env),
("selection_present_requires_live_child_process", test_selection_present_requires_live_child_process),
+ ("runtime_log_path_uses_daily_shard_files", test_runtime_log_path_uses_daily_shard_files),
+ ("runtime_log_path_expands_hostworkdir_env", test_runtime_log_path_expands_hostworkdir_env),
+ ("runtime_log_shards_roll_and_preserve_content_boundaries", test_runtime_log_shards_roll_and_preserve_content_boundaries),
("selection_present_checks_all_live_supervisors", test_selection_present_checks_all_live_supervisors),
("zombie_supervisor_is_treated_as_stopped", test_zombie_supervisor_is_treated_as_stopped),
("legacy_replace_process_is_observed_as_live_owner", test_legacy_replace_process_is_observed_as_live_owner),
@@ -99,6 +103,10 @@ def _write_runtime_script(root: Path, *, term_seconds: int = 5, kill_seconds: in
),
encoding="utf-8",
)
+ (root / "log_shard.py").write_text(
+ render_log_shard_module_source(),
+ encoding="utf-8",
+ )
return supervisor_path
@@ -561,6 +569,181 @@ def test_selection_present_requires_live_child_process() -> None:
_terminate_process(supervisor)
+def test_runtime_log_path_uses_daily_shard_files() -> None:
+    """A supervised child's output must land in today's UTC-dated shard, not the base log file."""
+    module = _load_runtime_module()
+    with tempfile.TemporaryDirectory(prefix="test_selection_supervisor_log_shard_") as scratch:
+        root = Path(scratch)
+        supervisor_path = _write_runtime_script(root)
+
+        # Child prints one marker line and then stays alive long enough to be observed.
+        child_source = (
+            "import sys, time\n"
+            "print('hello-log-shard', flush=True)\n"
+            "time.sleep(30)\n"
+        )
+        child_path = root / "child.py"
+        child_path.write_text(child_source, encoding="utf-8")
+
+        label = "DaemonSet/test-log-shard"
+        child_argv = [sys.executable, str(child_path)]
+        base_log_path = root / "test-log-shard.log"
+        runtime_state = {
+            "kind": "DaemonSet",
+            "name": "test-log-shard",
+            "service_name": "test-log-shard",
+            "argv": child_argv,
+            "cwd": str(root),
+            "log_path": str(base_log_path),
+        }
+        supervisor = _run_supervisor_command(
+            supervisor_path=supervisor_path,
+            label=label,
+            owner_ts_ms=1,
+            state_json=json.dumps(runtime_state, sort_keys=True),
+            child_argv=child_argv,
+            cwd=root,
+        )
+        try:
+            _wait_until_present(module, label)
+            give_up_at = time.time() + 5.0
+            utc_day = time.strftime('%Y-%m-%d', time.gmtime())
+            shard_path = root / f"test-log-shard.{utc_day}.log"
+            # Poll for up to five seconds until the shard file shows up.
+            while time.time() < give_up_at and not shard_path.exists():
+                time.sleep(0.1)
+            assert shard_path.exists(), shard_path
+            assert not base_log_path.exists(), base_log_path
+            assert "hello-log-shard" in shard_path.read_text(encoding="utf-8", errors="replace")
+        finally:
+            _terminate_process(supervisor)
+
+
+def test_runtime_log_path_expands_hostworkdir_env() -> None:
+    """A ``${HOSTWORKDIR}`` token in ``log_path`` must be expanded from the environment.
+
+    The supervisor is started with HOSTWORKDIR pointing at a temporary directory and a log
+    path written as ``${HOSTWORKDIR}/log/...``; the child's output must appear in today's
+    UTC-dated shard under that directory.
+
+    The environment change and the supervisor launch both happen inside the guarded region,
+    so HOSTWORKDIR is restored even when the launch itself fails, and the supervisor is only
+    terminated when it was actually started.
+    """
+    module = _load_runtime_module()
+    with tempfile.TemporaryDirectory(prefix="test_selection_supervisor_expand_hostworkdir_") as td:
+        root = Path(td)
+        hostworkdir = root / "hostworkdir"
+        hostworkdir.mkdir(parents=True, exist_ok=True)
+        supervisor_path = _write_runtime_script(root)
+        child_path = root / "child.py"
+        child_path.write_text(
+            "import time\n"
+            "print('expanded-hostworkdir-log', flush=True)\n"
+            "time.sleep(30)\n",
+            encoding="utf-8",
+        )
+        label = "DaemonSet/test-expand-hostworkdir"
+        child_argv = [sys.executable, str(child_path)]
+        saved_hostworkdir = os.environ.get("HOSTWORKDIR")
+        supervisor = None
+        try:
+            os.environ["HOSTWORKDIR"] = str(hostworkdir)
+            supervisor = _run_supervisor_command(
+                supervisor_path=supervisor_path,
+                label=label,
+                owner_ts_ms=1,
+                state_json=json.dumps(
+                    {
+                        "kind": "DaemonSet",
+                        "name": "test-expand-hostworkdir",
+                        "service_name": "test-expand-hostworkdir",
+                        "argv": child_argv,
+                        "cwd": str(root),
+                        "log_path": "${HOSTWORKDIR}/log/test-expand-hostworkdir.log",
+                    },
+                    sort_keys=True,
+                ),
+                child_argv=child_argv,
+                cwd=root,
+            )
+            _wait_until_present(module, label)
+            deadline = time.time() + 5.0
+            shard_path = hostworkdir / "log" / f"test-expand-hostworkdir.{time.strftime('%Y-%m-%d', time.gmtime())}.log"
+            # Poll for up to five seconds until the shard file shows up.
+            while time.time() < deadline and not shard_path.exists():
+                time.sleep(0.1)
+            assert shard_path.exists(), shard_path
+            assert "expanded-hostworkdir-log" in shard_path.read_text(encoding="utf-8", errors="replace")
+        finally:
+            try:
+                if supervisor is not None:
+                    _terminate_process(supervisor)
+            finally:
+                # Restore HOSTWORKDIR even if terminating the supervisor raised.
+                if saved_hostworkdir is None:
+                    os.environ.pop("HOSTWORKDIR", None)
+                else:
+                    os.environ["HOSTWORKDIR"] = saved_hostworkdir
+
+
+def test_runtime_log_shards_roll_and_preserve_content_boundaries() -> None:
+    """Output written before and after a shard-window boundary must land in separate shards.
+
+    The shard window is shortened to 10 seconds through the two FLUXON_TEST_LOG_SHARD_*
+    environment variables, anchored two seconds in the past. The child prints a "before"
+    marker, sleeps 11 seconds, then prints an "after" marker. The test expects exactly two
+    shards (2026-01-01 and 2026-01-02), each holding only its own marker, and expects the
+    pre-existing stale shard to be removed.
+
+    The environment changes and the supervisor launch both happen inside the guarded region,
+    so both variables are restored even when the launch itself fails, and the supervisor is
+    only terminated when it was actually started.
+    """
+    module = _load_runtime_module()
+    saved_window = os.environ.get("FLUXON_TEST_LOG_SHARD_WINDOW_SECONDS")
+    saved_anchor = os.environ.get("FLUXON_TEST_LOG_SHARD_ANCHOR_UNIX_SECONDS")
+    with tempfile.TemporaryDirectory(prefix="test_selection_supervisor_log_roll_") as td:
+        root = Path(td)
+        supervisor_path = _write_runtime_script(root)
+        child_path = root / "child.py"
+        child_path.write_text(
+            "import sys, time\n"
+            "print('[ops-log-mgmt][phase=before] ts=' + str(int(time.time())), flush=True)\n"
+            "time.sleep(11)\n"
+            "print('[ops-log-mgmt][phase=after] ts=' + str(int(time.time())), flush=True)\n"
+            "time.sleep(30)\n",
+            encoding="utf-8",
+        )
+        anchor = str(int(time.time()) - 2)
+        label = "DaemonSet/test-log-roll"
+        child_argv = [sys.executable, str(child_path)]
+        base_log_path = root / "test-log-roll.log"
+        stale_shard = root / "test-log-roll.2025-12-01.log"
+        supervisor = None
+        try:
+            os.environ["FLUXON_TEST_LOG_SHARD_WINDOW_SECONDS"] = "10"
+            os.environ["FLUXON_TEST_LOG_SHARD_ANCHOR_UNIX_SECONDS"] = anchor
+            stale_shard.write_text("stale\n", encoding="utf-8")
+            supervisor = _run_supervisor_command(
+                supervisor_path=supervisor_path,
+                label=label,
+                owner_ts_ms=1,
+                state_json=json.dumps(
+                    {
+                        "kind": "DaemonSet",
+                        "name": "test-log-roll",
+                        "service_name": "test-log-roll",
+                        "argv": child_argv,
+                        "cwd": str(root),
+                        "log_path": str(base_log_path),
+                    },
+                    sort_keys=True,
+                ),
+                child_argv=child_argv,
+                cwd=root,
+            )
+            _wait_until_present(module, label)
+            first_shard = root / "test-log-roll.2026-01-01.log"
+            second_shard = root / "test-log-roll.2026-01-02.log"
+            deadline = time.time() + 20.0
+            # Wait until both shards exist and each already holds its expected marker.
+            while time.time() < deadline:
+                if first_shard.exists() and second_shard.exists():
+                    first_text = first_shard.read_text(encoding="utf-8", errors="replace")
+                    second_text = second_shard.read_text(encoding="utf-8", errors="replace")
+                    if "[ops-log-mgmt][phase=before]" in first_text and "[ops-log-mgmt][phase=after]" in second_text:
+                        break
+                time.sleep(0.2)
+            assert first_shard.exists(), first_shard
+            assert second_shard.exists(), second_shard
+            assert not stale_shard.exists(), stale_shard
+            shard_names = sorted(path.name for path in root.glob("test-log-roll.*.log"))
+            assert shard_names == ["test-log-roll.2026-01-01.log", "test-log-roll.2026-01-02.log"], shard_names
+            first_text = first_shard.read_text(encoding="utf-8", errors="replace")
+            second_text = second_shard.read_text(encoding="utf-8", errors="replace")
+            assert "[ops-log-mgmt][phase=before]" in first_text, first_text
+            assert "[ops-log-mgmt][phase=after]" not in first_text, first_text
+            assert "[ops-log-mgmt][phase=after]" in second_text, second_text
+            assert "[ops-log-mgmt][phase=before]" not in second_text, second_text
+        finally:
+            try:
+                if supervisor is not None:
+                    _terminate_process(supervisor)
+            finally:
+                # Restore both variables even if terminating the supervisor raised.
+                if saved_window is None:
+                    os.environ.pop("FLUXON_TEST_LOG_SHARD_WINDOW_SECONDS", None)
+                else:
+                    os.environ["FLUXON_TEST_LOG_SHARD_WINDOW_SECONDS"] = saved_window
+                if saved_anchor is None:
+                    os.environ.pop("FLUXON_TEST_LOG_SHARD_ANCHOR_UNIX_SECONDS", None)
+                else:
+                    os.environ["FLUXON_TEST_LOG_SHARD_ANCHOR_UNIX_SECONDS"] = saved_anchor
+
+
def test_selection_present_checks_all_live_supervisors() -> None:
module = _load_runtime_module()
label = "DaemonSet/test-present-any-live-child"
@@ -569,7 +752,9 @@ def test_selection_present_checks_all_live_supervisors() -> None:
original_iter_live_supervisors = module._iter_live_supervisors
original_count_pid_tree_members = module._count_pid_tree_members
try:
- module._iter_live_supervisors = lambda current_label=None: [stale_new, old_live] if current_label == label else []
+ module._iter_live_supervisors = (
+ lambda current_label=None, scope_key=None: [stale_new, old_live] if current_label == label else []
+ )
module._count_pid_tree_members = lambda pid: {11: 1, 22: 2}[pid]
assert module._selection_present(label) is True
finally:
@@ -1135,7 +1320,7 @@ def test_retire_adopted_children_stops_live_roots() -> None:
calls: List[tuple[str, object]] = []
try:
module._direct_live_child_pids = lambda pid: [41, 42] if pid == module.os.getpid() else []
- module._iter_live_supervisors = lambda label=None: []
+ module._iter_live_supervisors = lambda label=None, scope_key=None: []
module._stop_pid_tree_batch = lambda roots, label: calls.append(("stop", (list(roots), label)))
module._reap_terminated_children = lambda: [(41, 0), (42, 0)]
module._log_reaped_children = lambda **kwargs: calls.append(("reap", kwargs))
@@ -1160,7 +1345,7 @@ def test_retire_adopted_children_preserves_live_supervisor_roots() -> None:
calls: List[tuple[str, object]] = []
try:
module._direct_live_child_pids = lambda pid: [41, 42] if pid == module.os.getpid() else []
- module._iter_live_supervisors = lambda label=None: [
+ module._iter_live_supervisors = lambda label=None, scope_key=None: [
module.LiveSupervisor(
process_info=module.ProcessInfo(pid=42, ppid=module.os.getpid(), pgid=42, state="S", start_time_ticks=1),
owner_ts_ms=7,
diff --git a/deployment/tests/test_start_test_bed_bootstrap_log.py b/deployment/tests/test_start_test_bed_bootstrap_log.py
index 312deea..9f5ef49 100644
--- a/deployment/tests/test_start_test_bed_bootstrap_log.py
+++ b/deployment/tests/test_start_test_bed_bootstrap_log.py
@@ -3,6 +3,7 @@
from __future__ import annotations
import argparse
+import copy
import importlib.util
import io
import sys
@@ -604,6 +605,7 @@ def test_normalize_bootstrap_deployconf_strips_legacy_master_p2p_listen_port() -
ops_agent_entrypoint = normalized["service"]["ops_agent"]["entrypoint"]
assert "p2p_listen_port: 31100" not in master_entrypoint, master_entrypoint
assert "p2p_listen_port: 12102" in ops_agent_entrypoint, ops_agent_entrypoint
+ assert normalized["service"]["master"]["port"] == 51051, normalized["service"]["master"]
assert notes == ["service.master.entrypoint: removed legacy master field p2p_listen_port"], notes
assert "p2p_listen_port: 31100" in deployconf["service"]["master"]["entrypoint"], deployconf
print("PASS: test_normalize_bootstrap_deployconf_strips_legacy_master_p2p_listen_port")
@@ -789,6 +791,7 @@ def test_normalize_bootstrap_deployconf_rewrites_same_host_local_multi_node_fixe
assert "--http-addr 0.0.0.0:19390" in normalized["service"]["greptime"]["entrypoint"], normalized["service"]["greptime"]["entrypoint"]
assert normalized["service"]["tikv_pd"]["port"] == 19400, normalized["service"]["tikv_pd"]
assert normalized["service"]["tikv"]["port"] == 19410, normalized["service"]["tikv"]
+ assert normalized["service"]["master"]["port"] == 19290, normalized["service"]["master"]
assert "port: 19290" in normalized["service"]["master"]["entrypoint"], normalized["service"]["master"]["entrypoint"]
assert "OPS_AGENT_P2P_LISTEN_PORT=19320" in normalized["service"]["ops_agent"]["entrypoint"], normalized["service"]["ops_agent"]["entrypoint"]
assert "OPS_AGENT_P2P_LISTEN_PORT=19321" in normalized["service"]["ops_agent"]["entrypoint"], normalized["service"]["ops_agent"]["entrypoint"]
@@ -845,11 +848,35 @@ def test_normalize_bootstrap_deployconf_keeps_non_local_or_single_node_ports_unc
},
}
normalized, notes = module._normalize_bootstrap_deployconf(deployconf=deployconf)
- assert normalized == deployconf, normalized
+ assert normalized["service"]["master"]["port"] == 51051, normalized["service"]["master"]
+ expected = copy.deepcopy(deployconf)
+ expected["service"]["master"]["port"] = 51051
+ assert normalized == expected, normalized
assert notes == [], notes
print("PASS: test_normalize_bootstrap_deployconf_keeps_non_local_or_single_node_ports_unchanged")
+def test_normalize_bootstrap_deployconf_promotes_master_port_from_entrypoint() -> None:
+ module = _load_start_test_bed_module()
+ deployconf = {
+ "service": {
+ "master": {
+ "entrypoint": (
+ 'cat > "${CONFIG_PATH}" < None:
module = _load_start_test_bed_module()
with tempfile.TemporaryDirectory(prefix="test_start_test_bed_refresh_bare_") as td:
@@ -1476,6 +1503,10 @@ def main() -> int:
"normalize_bootstrap_deployconf_keeps_non_local_or_single_node_ports_unchanged",
test_normalize_bootstrap_deployconf_keeps_non_local_or_single_node_ports_unchanged,
),
+ (
+ "normalize_bootstrap_deployconf_promotes_master_port_from_entrypoint",
+ test_normalize_bootstrap_deployconf_promotes_master_port_from_entrypoint,
+ ),
(
"refresh_cluster_bare_deploy_scripts_copies_local_and_remote_nodes",
test_refresh_cluster_bare_deploy_scripts_copies_local_and_remote_nodes,
diff --git a/deployment/utils/log_shard.py b/deployment/utils/log_shard.py
new file mode 100644
index 0000000..415d4ff
--- /dev/null
+++ b/deployment/utils/log_shard.py
@@ -0,0 +1,196 @@
+#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import datetime
+import os
+from pathlib import Path
+from typing import Optional
+
+
+DEFAULT_DAILY_LOG_RETENTION_DAYS = 31
+TEST_LOG_SHARD_WINDOW_SECONDS_ENV = "FLUXON_TEST_LOG_SHARD_WINDOW_SECONDS"
+TEST_LOG_SHARD_ANCHOR_UNIX_SECONDS_ENV = "FLUXON_TEST_LOG_SHARD_ANCHOR_UNIX_SECONDS"
+TEST_LOG_SHARD_BASE_DATE = datetime.date(2026, 1, 1)
+
+
+def _read_test_log_shard_window_seconds() -> Optional[int]:
+ raw_value = os.environ.get(TEST_LOG_SHARD_WINDOW_SECONDS_ENV)
+ if raw_value is None:
+ return None
+ text = raw_value.strip()
+ if not text:
+ return None
+ window_seconds = int(text)
+ if window_seconds <= 0:
+ raise ValueError(
+ f"{TEST_LOG_SHARD_WINDOW_SECONDS_ENV} must be a positive integer, got: {raw_value!r}"
+ )
+ return window_seconds
+
+
+def _read_test_log_shard_anchor_unix_seconds() -> int:
+ raw_value = os.environ.get(TEST_LOG_SHARD_ANCHOR_UNIX_SECONDS_ENV)
+ if raw_value is None or not raw_value.strip():
+ raise ValueError(
+ f"{TEST_LOG_SHARD_ANCHOR_UNIX_SECONDS_ENV} is required when "
+ f"{TEST_LOG_SHARD_WINDOW_SECONDS_ENV} is set"
+ )
+ return int(raw_value.strip())
+
+
+def _resolve_shard_date(ts: datetime.datetime) -> datetime.date:
+ window_seconds = _read_test_log_shard_window_seconds()
+ if window_seconds is None:
+ return ts.date()
+ anchor_unix_seconds = _read_test_log_shard_anchor_unix_seconds()
+ unix_seconds = int(ts.timestamp())
+ bucket_index = (unix_seconds - anchor_unix_seconds) // window_seconds
+ if bucket_index < 0:
+ raise ValueError(
+ "test log shard anchor must not be in the future: "
+ f"anchor={anchor_unix_seconds}, ts={unix_seconds}"
+ )
+ return TEST_LOG_SHARD_BASE_DATE + datetime.timedelta(days=bucket_index)
+
+
+def daily_sharded_log_path(
+ base_path: Path,
+ *,
+ now: Optional[datetime.datetime] = None,
+) -> Path:
+ ts = datetime.datetime.now(datetime.timezone.utc) if now is None else now.astimezone(datetime.timezone.utc)
+ name = base_path.name
+ if not name.endswith(".log"):
+ raise ValueError(f"log base path must end with .log: {base_path}")
+ stem = name[:-4]
+ shard_date = _resolve_shard_date(ts)
+ return (base_path.parent / f"{stem}.{shard_date.isoformat()}.log").resolve()
+
+
+def latest_existing_daily_sharded_log_path(base_path: Path) -> Optional[Path]:
+ name = base_path.name
+ if not name.endswith(".log"):
+ return base_path.resolve() if base_path.exists() else None
+ stem = name[:-4]
+ prefix = stem + "."
+ suffix = ".log"
+ latest: Optional[tuple[datetime.date, Path]] = None
+ parent = base_path.parent
+ if not parent.exists():
+ return base_path.resolve() if base_path.exists() else None
+ for path in parent.iterdir():
+ if not path.is_file():
+ continue
+ entry_name = path.name
+ if not entry_name.startswith(prefix) or not entry_name.endswith(suffix):
+ continue
+ date_text = entry_name[len(prefix):-len(suffix)]
+ try:
+ shard_date = datetime.date.fromisoformat(date_text)
+ except ValueError:
+ continue
+ if latest is None or shard_date > latest[0]:
+ latest = (shard_date, path.resolve())
+ if latest is not None:
+ return latest[1]
+ if base_path.exists():
+ return base_path.resolve()
+ return None
+
+
+def resolve_readable_log_path(base_path: Path) -> Optional[Path]:
+ current = daily_sharded_log_path(base_path)
+ if current.exists():
+ return current
+ return latest_existing_daily_sharded_log_path(base_path)
+
+
+def cleanup_old_daily_sharded_logs(
+ base_path: Path,
+ *,
+ retention_days: int = DEFAULT_DAILY_LOG_RETENTION_DAYS,
+) -> None:
+ name = base_path.name
+ if not name.endswith(".log"):
+ return
+ current_shard_date = _resolve_shard_date(datetime.datetime.now(datetime.timezone.utc))
+ keep_since = current_shard_date - datetime.timedelta(days=max(int(retention_days) - 1, 0))
+ stem = name[:-4]
+ prefix = stem + "."
+ suffix = ".log"
+ parent = base_path.parent
+ parent.mkdir(parents=True, exist_ok=True)
+ for path in parent.iterdir():
+ if not path.is_file():
+ continue
+ entry_name = path.name
+ if not entry_name.startswith(prefix) or not entry_name.endswith(suffix):
+ continue
+ date_text = entry_name[len(prefix):-len(suffix)]
+ try:
+ shard_date = datetime.date.fromisoformat(date_text)
+ except ValueError:
+ continue
+ if shard_date < keep_since:
+ try:
+ path.unlink()
+ except FileNotFoundError:
+ pass
+
+
+def render_module_source() -> str:
+ module_path = Path(__file__).resolve()
+ return module_path.read_text(encoding="utf-8")
+
+
+def import_sibling_log_shard():
+ import importlib.util
+ import sys
+
+ helper_path = Path(__file__).resolve().with_name("log_shard.py")
+ module_name = "_fluxon_log_shard_runtime"
+ loaded = sys.modules.get(module_name)
+ if loaded is not None:
+ return loaded
+ spec = importlib.util.spec_from_file_location(module_name, helper_path)
+ if spec is None or spec.loader is None:
+ raise RuntimeError(f"failed to load log shard helper: {helper_path}")
+ module = importlib.util.module_from_spec(spec)
+ sys.modules[module_name] = module
+ spec.loader.exec_module(module)
+ return module
+
+
+def relay_fd_to_daily_sharded_logs(
+ *,
+ base_log_path: str,
+ read_fd: int,
+ retention_days: int = DEFAULT_DAILY_LOG_RETENTION_DAYS,
+) -> None:
+ base_path = Path(os.path.abspath(base_log_path))
+ current_path: Optional[Path] = None
+ current_fp = None
+ try:
+ while True:
+ try:
+ chunk = os.read(read_fd, 65536)
+ except OSError:
+ break
+ if not chunk:
+ break
+ next_path = daily_sharded_log_path(base_path)
+ if current_path != next_path:
+ if current_fp is not None:
+ current_fp.flush()
+ current_fp.close()
+ cleanup_old_daily_sharded_logs(base_path, retention_days=retention_days)
+ next_path.parent.mkdir(parents=True, exist_ok=True)
+ current_fp = next_path.open("ab", buffering=0)
+ current_path = next_path
+ current_fp.write(chunk)
+ finally:
+ if current_fp is not None:
+ current_fp.flush()
+ current_fp.close()
+ os.close(read_fd)
diff --git a/deployment/utils/proc_lifecycle_codegen.py b/deployment/utils/proc_lifecycle_codegen.py
index 31ef2b0..116b0c4 100644
--- a/deployment/utils/proc_lifecycle_codegen.py
+++ b/deployment/utils/proc_lifecycle_codegen.py
@@ -150,22 +150,19 @@ def render_bash_proc_lifecycle_funcs_pid_tree(*, timeouts: StopTimeouts) -> str:
}}
wait_service_probably_ready_pid_tree() {{
- # "Probably ready" contract:
- # - A service is considered probably-ready iff for N consecutive seconds:
- # - the supervisor PID exists, and
- # - the supervisor PID subtree has at least one other PID besides the supervisor.
- # - If the child process restarts during the window, we reset the counter and keep waiting,
- # until the provided deadline is reached.
- #
- # This is used by atomic-group runners to enforce strict start ordering.
+ # Startup gate contract:
+ # - Success means the supervisor PID stays alive across the fixed startup window.
+ # - During this startup window we do not probe service ports or readiness endpoints.
+ # - We intentionally do not require the child to expose ports, endpoints, or even finish
+ # spawning before the window ends.
svc="$1"
root_pid="$2"
- stable_seconds="$3"
+ startup_window_seconds="$3"
deadline_ts="$4"
context="$5"
- if [[ ! "$stable_seconds" =~ ^[0-9]+$ ]] || [ "$stable_seconds" -le 0 ]; then
- echo "$context probable-ready: invalid stable_seconds=$stable_seconds svc=$svc"
+ if [[ ! "$startup_window_seconds" =~ ^[0-9]+$ ]] || [ "$startup_window_seconds" -le 0 ]; then
+ echo "$context probable-ready: invalid startup_window_seconds=$startup_window_seconds svc=$svc"
return 1
fi
if [[ ! "$deadline_ts" =~ ^[0-9]+$ ]] || [ "$deadline_ts" -le 0 ]; then
@@ -173,30 +170,16 @@ def render_bash_proc_lifecycle_funcs_pid_tree(*, timeouts: StopTimeouts) -> str:
return 1
fi
- ok_s=0
while true; do
- now=$(date +%s)
- if [ "$now" -ge "$deadline_ts" ]; then
- echo "$context probable-ready: deadline exceeded svc=$svc stable_seconds=$stable_seconds pid=$root_pid"
- return 1
- fi
-
if ! _pid_exists "$root_pid"; then
echo "$context probable-ready: supervisor pid exited svc=$svc pid=$root_pid"
return 1
fi
- if _pid_tree_has_child_process "$root_pid"; then
- ok_s=$((ok_s+1))
- if [ "$ok_s" -ge "$stable_seconds" ]; then
- echo "$context probable-ready: ok svc=$svc stable_seconds=$stable_seconds pid=$root_pid"
- return 0
- fi
- else
- if [ "$ok_s" -ne 0 ]; then
- echo "$context probable-ready: reset svc=$svc ok_s=$ok_s missing_child=true"
- fi
- ok_s=0
+ now=$(date +%s)
+ if [ "$now" -ge "$deadline_ts" ]; then
+ echo "$context probable-ready: ok svc=$svc startup_window_seconds=$startup_window_seconds pid=$root_pid"
+ return 0
fi
sleep 1
diff --git a/deployment/utils/selection_supervisor_codegen.py b/deployment/utils/selection_supervisor_codegen.py
index 2945ff5..ab76dfc 100644
--- a/deployment/utils/selection_supervisor_codegen.py
+++ b/deployment/utils/selection_supervisor_codegen.py
@@ -13,6 +13,7 @@
PYTHON_SELECTION_SUPERVISOR_FILENAME = "selection_supervisor.py"
+LOG_SHARD_HELPER_FILENAME = "log_shard.py"
def render_python_selection_supervisor_module(*, timeouts) -> str:
@@ -42,11 +43,13 @@ def render_python_selection_supervisor_module(*, timeouts) -> str:
import enum
import fcntl
import hashlib
+import importlib.util
import json
import os
import signal
import subprocess
import sys
+import threading
import time
from dataclasses import dataclass
from pathlib import Path
@@ -62,6 +65,37 @@ def render_python_selection_supervisor_module(*, timeouts) -> str:
SANITIZED_CHILD_ENV_KEYS = ("RDMAV_DRIVERS", "IBV_DRIVERS")
_shutdown_requested = False
+_STDIO_ROUTER_THREAD = None
+_STDIO_ROUTER_KEEPALIVE_FP = None
+
+
+def _load_log_shard_helper():
+ candidates = []
+ raw_file = globals().get("__file__")
+ if isinstance(raw_file, str) and raw_file:
+ candidates.append(Path(raw_file).resolve().with_name("__LOG_SHARD_HELPER_FILENAME__"))
+ cwd = Path.cwd().resolve()
+ candidates.append(cwd / "__LOG_SHARD_HELPER_FILENAME__")
+ candidates.append(cwd / "deployment" / "utils" / "__LOG_SHARD_HELPER_FILENAME__")
+ for entry in sys.path:
+ if not isinstance(entry, str) or not entry:
+ continue
+ candidates.append(Path(entry).resolve() / "__LOG_SHARD_HELPER_FILENAME__")
+ helper_path = candidates[0]
+ for candidate in candidates:
+ if candidate.is_file():
+ helper_path = candidate
+ break
+ spec = importlib.util.spec_from_file_location("_fluxon_selection_log_shard", helper_path)
+ if spec is None or spec.loader is None:
+ raise RuntimeError(f"failed to load log shard helper: {helper_path}")
+ module = importlib.util.module_from_spec(spec)
+ sys.modules[spec.name] = module
+ spec.loader.exec_module(module)
+ return module
+
+
+_LOG_SHARD = _load_log_shard_helper()
def main() -> int:
@@ -96,6 +130,8 @@ def main() -> int:
stop_parser.add_argument("--missing-ok", action="store_true")
args = parser.parse_args()
+ runtime_state_for_stdio = _runtime_state_for_startup_stdio(args)
+ _redirect_process_stdio_to_runtime_log(runtime_state_for_stdio)
# English note:
# - The supervisor module is invoked both as a long-running `run` daemon and as a short-lived
# `stop` helper from ops-managed reconcile loops.
@@ -356,6 +392,16 @@ def _parse_run_command_spec(args: argparse.Namespace) -> RunCommandSpec:
)
+def _runtime_state_for_startup_stdio(args: argparse.Namespace) -> Optional[SelectionRuntimeState]:
+ if str(args.command) != "run":
+ return None
+ label = _require_non_empty_str(args.label, "label")
+ state_json = args.state_json
+ if state_json is None:
+ return None
+ return _build_runtime_state(label=label, state_json=state_json)
+
+
def _requested_phase1_overlap_with_applyless_owner(
current_owner: Optional[LiveSupervisor],
requested_runtime_state: Optional[SelectionRuntimeState],
@@ -438,6 +484,7 @@ def _run_supervisor(spec: RunCommandSpec, selection_lock_fp=None) -> int:
restart_timestamps: List[float] = []
backoff_seconds = spec.restart_delay_seconds
+ _redirect_process_stdio_to_runtime_log(runtime_state)
while True:
_log_reaped_children(
@@ -661,6 +708,10 @@ def _sanitize_child_ld_library_path(raw_value: Optional[str]) -> Optional[str]:
return ":".join(sanitized_entries)
+def _expand_runtime_state_path(value: str) -> str:
+ return os.path.expandvars(value)
+
+
def _spawn_child(command: List[str], workdir: Optional[Path]) -> subprocess.Popen[bytes]:
def _set_pdeathsig_sigterm() -> None:
libc = ctypes.CDLL("libc.so.6", use_errno=True)
@@ -687,6 +738,40 @@ def _set_pdeathsig_sigterm() -> None:
)
+def _redirect_process_stdio_to_runtime_log(runtime_state: Optional[SelectionRuntimeState]) -> None:
+ global _STDIO_ROUTER_THREAD
+ global _STDIO_ROUTER_KEEPALIVE_FP
+ if runtime_state is None:
+ return
+ if _STDIO_ROUTER_THREAD is not None:
+ return
+ base_log_path = _require_non_empty_str(runtime_state.log_path, "state.log_path")
+ read_fd, write_fd = os.pipe()
+ router_keepalive = os.dup(write_fd)
+
+ def _router_loop() -> None:
+ _LOG_SHARD.relay_fd_to_daily_sharded_logs(
+ base_log_path=base_log_path,
+ read_fd=read_fd,
+ retention_days=_LOG_SHARD.DEFAULT_DAILY_LOG_RETENTION_DAYS,
+ )
+
+ router = threading.Thread(
+ target=_router_loop,
+ name="selection-supervisor-stdio-log-router",
+ daemon=True,
+ )
+ router.start()
+ os.dup2(write_fd, sys.stdout.fileno())
+ os.dup2(write_fd, sys.stderr.fileno())
+ sys.stdout = os.fdopen(sys.stdout.fileno(), "w", encoding="utf-8", buffering=1, closefd=False)
+ sys.stderr = os.fdopen(sys.stderr.fileno(), "w", encoding="utf-8", buffering=1, closefd=False)
+ try:
+ os.close(write_fd)
+ except OSError:
+ pass
+ _STDIO_ROUTER_KEEPALIVE_FP = os.fdopen(router_keepalive, "w", encoding="utf-8", buffering=1)
+ _STDIO_ROUTER_THREAD = router
def _retired_and_preserved_adopted_roots(root_pid: int) -> Tuple[List[int], List[int]]:
adopted_roots = _direct_live_child_pids(root_pid)
if not adopted_roots:
@@ -788,7 +873,9 @@ def _selection_runtime_state_from_raw(
apply_id=_require_optional_non_empty_str(raw.get("apply_id"), "state.apply_id"),
argv=_require_non_empty_str_list(raw.get("argv"), "state.argv"),
cwd=_require_optional_non_empty_str(raw.get("cwd"), "state.cwd"),
- log_path=_require_non_empty_str(raw.get("log_path"), "state.log_path"),
+ log_path=_expand_runtime_state_path(
+ _require_non_empty_str(raw.get("log_path"), "state.log_path")
+ ),
owner_ts_ms=owner_ts_ms,
started_ts_ms=started_ts_ms,
)
@@ -1337,6 +1424,7 @@ def _signal_pid_tree(root_pid: int, sig: signal.Signals, label: str) -> None:
"""
return (
textwrap.dedent(template)
+ .replace("__LOG_SHARD_HELPER_FILENAME__", LOG_SHARD_HELPER_FILENAME)
.replace("__TERM_S__", str(term_s))
.replace("__KILL_S__", str(kill_s))
.replace("__SUPERSEDE_S__", str(supersede_s))
diff --git "a/fluxon_doc_cn/design/fluxon_0_\351\205\215\347\275\256\346\200\273\350\247\210.md" "b/fluxon_doc_cn/design/fluxon_0_\351\205\215\347\275\256\346\200\273\350\247\210.md"
new file mode 100644
index 0000000..852b73f
--- /dev/null
+++ "b/fluxon_doc_cn/design/fluxon_0_\351\205\215\347\275\256\346\200\273\350\247\210.md"
@@ -0,0 +1,217 @@
+# Fluxon 配置总览
+
+## 1. 结论
+
+本文只回答一件事:Fluxon 仓库里有哪些稳定配置入口,它们各自负责什么,校验后会变成什么运行时结构。
+
+**稳定结论:**
+
+- 配置输入和运行时结构是分开的,YAML 只负责声明意图,`verify()` / `parse_*()` 负责收敛成唯一可执行结果。
+- 共享契约优先放在 `fluxon_commu_contract` 和 `fluxon_cli::config` 这类公共模块里,业务包更多是复用或重导出。
+- `host:port`、`http(s)://...`、`cluster-scoped path` 这几类格式都被严格区分,不靠探测或模糊回退。
+- 仓库里的 checked-in YAML 分两类:运行时契约和环境/测试契约。前者要强校验,后者主要用于把开发、部署、测试流水线接起来。
+
+```mermaid
+flowchart TD
+    A[build_config_ext.yml<br/>build_config_ext_static.yml] --> B[setup_and_pack / repo_config_utils]
+ C[deployment/deployconf.yaml] --> D[deployment utils / fluxon_py tests]
+ E[fluxon_py/tests/test_config.yaml] --> D
+ F[fluxon_test_stack/*.yaml] --> G[teststack runner / start_test_bed]
+ H[fluxon_cli/src/config.rs] --> I[monitor / UI]
+ J[fluxon_kv/src/config.rs] --> K[KV runtime]
+ L[fluxon_fs_core/src/config.rs] --> M[FS runtime]
+ N[fluxon_commu_contract/src/config.rs] --> K
+ N --> M
+```
+
+## 2. 配置地图
+
+| 配置家族 | 入口文件 / 模块 | 主要消费者 | 作用 |
+| --- | --- | --- | --- |
+| 仓库环境配置 | `build_config_ext.yml` | Rust KV 测试族、`fluxon_py/tests/test_lib.py`、`setup_and_pack` 打包/校验脚本、TestStack 的 `bin_kvtest` 用例 staging | 提供 etcd、Prometheus、remote write 等开发/测试基线 |
+| 静态构建配置 | `build_config_ext_static.yml` | `setup_and_pack/pack_release.py`、`build_pack_fluxonkv_pylib_img.py`、Nix 打包链路 | 固定 wheel / manylinux 版本 |
+| 部署配置 | `deployment/deployconf.yaml` | 部署脚本、`fluxon_py` 测试入口、TestStack 生成/消费链路 | 提供集群节点、服务地址和全局环境变量 |
+| Python 测试配置 | `fluxon_py/tests/test_config.yaml` | `fluxon_py` 测试入口、测试辅助库、deployconf 解析链路 | 连接 deployconf,选择 KV backend 类型 |
+| 开发/打包环境配置 | `setup_and_pack/setup_dev_env/*.yaml`、`setup_and_pack/build_pack_fluxonkv_pylib_img/*.yaml`、`setup_and_pack/nix/*.yaml`、`pub_prepare_build.yaml` | `setup_and_pack` 脚本 | 提供开发机和打包流水线的环境输入 |
+| TestStack 配置 | `fluxon_test_stack/ci_test_list.yaml`、`start_test_bed.yaml`、`gitops.yaml` | `test_runner.py`、`start_test_bed.py` | 定义 suite、testbed、GitOps 和 UI 入口 |
+| CLI 监控配置 | `fluxon_cli/src/config.rs` | `master_ui_monitor`、`test_runner_ui` | 提供监控页和查询页配置 |
+| KV 配置 | `fluxon_kv/src/config.rs` | KV master / owner / external | 定义 KV 运行时角色和校验规则 |
+| FS 配置 | `fluxon_fs_core/src/config.rs` | FS master / agent / panel | 定义 FS cache、master、panel、权限和转移态 |
+| 共享传输配置 | `fluxon_commu_contract/src/config.rs`、`transfer_engine/surface.rs` | KV / FS / commu | 提供 `NetworkConfig`、`ProtocolType`、`TransferEngineType` |
+
+## 3. 通用规则
+
+| 规则 | 含义 |
+| --- | --- |
+| `serde(deny_unknown_fields)` | 运行时 YAML 默认拒绝未知字段 |
+| `from_file` / `from_str` + `verify` | 先解析,再收敛成强类型运行时配置 |
+| `YamlNullable` | 只在需要区分“缺失 / null / value”时使用 |
+| `host:port` 与 `http(s)://...` 分离 | etcd / deployconf 常用前者,监控 / Prometheus 常用后者 |
+| 派生值要显式写回 | 例如 cluster-scoped 路径、默认表名、默认 transport_mode |
+
+## 4. 环境与部署配置
+
+### 4.1 `build_config_ext.yml`
+
+这是仓库级开发环境配置,不是业务 runtime config。
+
+| 字段 | 规则 | 主要用途 |
+| --- | --- | --- |
+| `etcd` | 必填,`host:port` | 供 Rust / Python / 测试工具读取 etcd 地址 |
+| `prom` | 必填,`http(s)://.../v1` 或 `.../api/v1` | 供 Grafana / TSDB 查询 URL 使用 |
+| `prom_remote_write_url` | 必填,`http(s)://...` | 供 remote write 使用 |
+
+`setup_and_pack/utils/repo_config_utils.py` 里保留了 `prometheus_remote_write_url` 的旧名兼容读取,但这是 build tooling 的过渡路径,不是推荐的新契约。
+
+### 4.2 `build_config_ext_static.yml`
+
+当前只固定一个值:
+
+| 字段 | 规则 |
+| --- | --- |
+| `manylinux_version` | 必填,当前只允许 `2_28` |
+
+### 4.3 `deployment/deployconf.yaml`
+
+这是部署和打包流水线的核心配置。当前稳定消费面主要有三块:
+
+| 区块 | 关键字段 | 作用 |
+| --- | --- | --- |
+| `cluster_nodes` | 节点列表 | 作为 placeholder 解析的基础 |
+| `service` | 服务节点映射 | 供部署脚本和测试脚本查 service ip:port |
+| `global_envs` | `ETCD_FULL_ADDRESS`、`FLUXON_PROMETHEUS_BASE_URL`、`MONITOR_GREPTIMEDB_WRITE_URL`、`FLUXON_CLUSTER_NAME`、`FLUXON_SHARED_MEM`、`FLUXON_SHARED_FILE` | 供部署/测试代码读取集群级 authority |
+
+`global_envs` 允许占位符解析,先由 `cluster_nodes` + `service` 构造映射,再把变量落成最终值。
+
+### 4.4 `fluxon_py/tests/test_config.yaml`
+
+这是一层测试入口配置,不是 runtime 部署配置。
+
+| 字段 | 规则 |
+| --- | --- |
+| `deployconf_path` | 必填,指向共享 deployconf |
+| `kv_svc_type` | 必填,当前测试助手只接受已知 backend 类型 |
+
+测试代码里还保留了 mooncake 相关读取函数,但 checked-in 的最小样例只使用上面两个字段。
+
+### 4.5 `fluxon_test_stack/*`
+
+TestStack 的配置已经单独有设计文档,这里只收口成一句话:
+
+- `ci_test_list.yaml` 定义 suite 空间。
+- `start_test_bed.yaml` 定义共享 testbed 和 UI。
+- `gitops.yaml` 定义 GitOps 轮询和记录。
+- 生成的 `deployconf_testbed.yml` 是派生产物,不是手工主配置。
+
+## 5. 运行时配置
+
+### 5.1 KV
+
+KV 的入口在 `fluxon_kv/src/config.rs`,对外分成 master 和 client 两个稳定 YAML:
+
+| 类型 | 作用 |
+| --- | --- |
+| `MasterConfigYaml` | master 节点输入 |
+| `ClientConfigYaml` | owner / external 输入 |
+| `TestSpecConfig` | 测试和实验分支开关 |
+| `MonitoringConfigYaml` | master 监控块 |
+| `NetworkConfig` | 网络白名单和 IP 映射,共享自 `fluxon_commu_contract` |
+
+核心分流规则:
+
+- `contribute_to_cluster_pool_size` 缺失或全零时,进入 external。
+- `contribute_to_cluster_pool_size.dram > 0` 时,进入 owner。
+- `test_spec_config.side_transfer_role = worker` 时,走 side-transfer worker 分支,强制 `TransferEngineType::P2p`。
+
+主要约束:
+
+- `monitoring` 在 master 上必填。
+- `master_ui` 依赖 `monitoring`,并作为嵌入式 monitor HTTP 服务启动。
+- `shared_memory_path` / `shared_file_path` 会拼成 `cluster_name` 作用域路径。
+- `etcd_addresses` 在 client 侧保留 raw `host:port` 和归一化 `http://host:port` 两份视图。
+- zero-contribution `external` / side worker 的 `etcd_addresses`、`sub_cluster`、`large_file_paths` 由 owner 发布的 `shared.json` 继承;本地配置面只保留 attach owner 所需的共享 bundle 锚点和本进程参数。
+
+更细的调用时序、持有生命周期和并发规则分别在 `kv_1_概览与分层.md`、`kv_2_调用时序.md`、`kv_3_参数与并发.md`、`kv_4_allocation_segment_holder生命周期.md` 里展开。
+
+### 5.2 FS
+
+FS 的配置集中在 `fluxon_fs_core/src/config.rs`,上层 `fluxon_fs/src/config.rs` 只是重导出。
+
+| 配置块 | 入口 | 结果 |
+| --- | --- | --- |
+| cache | `fluxon_fs.cache` | `FluxonFsGlobalConfig` |
+| master | `fluxon_fs.master` | `FluxonFsMasterConfig` |
+| master_panel | `fluxon_fs.master_panel` | `FluxonFsMasterPanelConfig` |
+
+`fluxon_fs.cache` 的核心字段:
+
+- `stale_window_ms` 必须 `> 0`。
+- `write_session_target_inflight_bytes` 可缺省,默认 128 MiB。
+- `rules[*]` 需要绝对路径、合法 cache/write 模式、合法前缀和非零 cache 上限。
+- `exports[*]` 需要绝对路径;`nodes` 缺失时表示 `AgentRegistry`,给出时表示 `StaticNodes`。
+
+`fluxon_fs.master` 的核心字段:
+
+- `instance_key` 必填。
+- `pull_interval_ms` 可选,但如果给出必须 `> 0`。
+- 旧的 `fluxon_fs.rpc` 和 `rpc_timeout_ms` 已移除。
+
+`fluxon_fs.master_panel` 的核心字段:
+
+- `listen_addr`、`public_base_url`、`prometheus_base_url`、`access_db_path` 都是必需基线。
+- `bootstrap_access_model` 是面板的启动授权模型。
+- `transfer_state_store` 当前稳定实现是 `tikv`。
+- `s3_gateway` 负责对象请求和 KV miss 策略。
+
+FS 还把访问模型拆成两层:
+
+- `access_model` 是用户/权限的输入模型。
+- `runtime_access_model` 是 runtime 使用的派生模型,密码会被哈希,不再原样保留。
+
+### 5.3 CLI 监控
+
+`fluxon_cli/src/config.rs` 定义统一监控页配置,KV 的 `master_ui` 和 TestStack 的 UI 都复用它。
+
+| 类型 | 关键字段 |
+| --- | --- |
+| `MonitorConfigYaml` | `etcd_endpoints`、`prometheus_base_url`、`cluster_name`、`member_kind`、`output` |
+| 可选项 | `mq_unique_key_prefixes`、`http_listen_addr`、`greptime_sql` |
+
+主要约束:
+
+- `etcd_endpoints` 必须非空且带 scheme。
+- `prometheus_base_url` 必须带 scheme。
+- `mq_unique_key_prefixes` 给出时不能为空,也不能带前后空白。
+- `greptime_sql` 可以显式提供;如果 `prometheus_base_url` 指向 Greptime 的 `/v1/prometheus`,会自动派生默认 SQL 连接信息。
+
+### 5.4 共享传输契约
+
+`fluxon_commu_contract` 提供多个被 KV / FS 共同复用的基础类型:
+
+| 类型 | 取值 | 作用 |
+| --- | --- | --- |
+| `ProtocolType` | `Tcp` / `Rdma` | 输入协议选择 |
+| `TransferEngineType` | `Closed` / `P2p` | 传输引擎分支 |
+| `TransferBackendActivationMode` | 三个显式分支 | 控制 backend 激活方式 |
+| `NetworkConfig` | `subnet_whitelist`、`primary_ip_to_extended_ips` | 网络白名单和 IP 扩展映射 |
+
+这些类型是共享契约,不属于某一个子系统的私有配置。
+
+## 6. 配置之间的关系
+
+| 关系 | 说明 |
+| --- | --- |
+| build_config_ext -> deployment/test | 先确定环境基线,再给 runtime 配置提供 host、URL、路径 |
+| deployconf -> test_config | Python 测试配置通过 `deployconf_path` 指向共享部署配置 |
+| deployconf -> teststack | `start_test_bed` 和 `test_runner` 读取派生后的 testbed deployconf |
+| commu_contract -> KV / FS | `ProtocolType`、`TransferEngineType`、`NetworkConfig` 是共享底座 |
+| CLI config -> KV / TestStack UI | master UI、runner UI 复用同一个 monitor config 契约 |
+
+## 7. 读法建议
+
+如果你只想看某一块的细节,按这个顺序读:
+
+1. 环境/部署先看 `deployment/utils/deployconf_config_utils.py` 和 `fluxon_util/src/dev_config.rs`。
+2. KV 先看 `fluxon_kv/src/config.rs`,再接 `kv_1` 到 `kv_4`。
+3. FS 先看 `fluxon_fs_core/src/config.rs`,再看 `用户 - 5 - FS接口.md`。
+4. TestStack 直接看 `teststack_1_当前架构与CI测试流程.md`。
diff --git "a/fluxon_doc_cn/design/log_1_\346\234\254\345\234\260\346\226\207\344\273\266\346\227\245\345\277\227\344\270\216Greptime_OTLP\345\257\274\345\207\272\351\223\276\350\267\257.md" "b/fluxon_doc_cn/design/log_1_\346\234\254\345\234\260\346\226\207\344\273\266\346\227\245\345\277\227\344\270\216Greptime_OTLP\345\257\274\345\207\272\351\223\276\350\267\257.md"
new file mode 100644
index 0000000..fd81c45
--- /dev/null
+++ "b/fluxon_doc_cn/design/log_1_\346\234\254\345\234\260\346\226\207\344\273\266\346\227\245\345\277\227\344\270\216Greptime_OTLP\345\257\274\345\207\272\351\223\276\350\267\257.md"
@@ -0,0 +1,414 @@
+# Fluxon Log 设计 1 - 统一 log 标准与 Greptime OTLP 导出链路
+
+## 0. 总起
+本文定义 Fluxon 服务平面的统一日志标准。主线代码落在 `fluxon_rs/fluxon_kv/src/config.rs`、`fluxon_rs/fluxon_kv/src/lib.rs`、`fluxon_rs/fluxon_util/src/log.rs`、`fluxon_rs/fluxon_observability/src/greptime_otlp_tracing.rs`、`fluxon_rs/fluxon_observability/src/greptime_otlp_log_orchestrator.rs` 和 `fluxon_rs/fluxon_observability/src/greptime_otlp_log.rs`。
+
+稳定结论先说死:
+
+- 本地文件日志始终启用,作为可回放的安全网。
+- Greptime OTLP 导出由 `master.monitoring.otlp_log_api` 控制,`master` 负责配置源,`owner` / `external` 只消费广播。
+- `testbed` 是独立的 `log_service_kind`,启动器、runner、UI 和 workload 统一按同一套日志语义落盘。
+- 当前导出链路采用 best-effort 策略,不阻塞主业务路径。
+
+本文重点回答四个问题:
+
+1. 各条日志链路当前落在哪些目录边界里。
+2. 当前 canonical 文件名、按天分片和 31 天清理语义是什么。
+3. Rust / Python 之间哪些 contract 已经对齐,哪些还没有。
+4. 当前实现里哪些地方已经收口,哪些地方仍是未完全收口点。
+
+KV 里的 `external` 与 side worker 都只消费 owner 感知结果。当前稳定 contract 是:它们显式配置 `shared_memory_path` / `shared_file_path` 作为 attach owner 的共享 bundle 锚点,`large_file_paths` 则从 owner 发布的 `shared.json` 继承;日志和 cache 从启动起就直接落到 owner 派生出来的大文件目录,不再要求 zero-contribution 侧另配一份本地 large root。
+
+## 1. 目录边界
+目录边界只管物理隔离,不管统一 root。统一的是命名、元数据、归档窗口和清理语义。
+
+### 1.1 KV
+- `master` 以 `log_dir` 作为本地主日志根,并在其下派生 cluster-scoped runtime 日志目录。
+- `owner`、`external` 和 side worker 共享单一 `share_path` 作为 share 根,用来放 `mmap.file`、`shared.json`、peer metadata 和 side transfer 相关文件。
+- `owner` 的 `large_file_paths` 定义 runtime log、cache 等大文件资产的物理根目录。
+- `external` 和 side worker 不再单独声明自己的 `large_file_paths`。它们在 zero-contribution bootstrap 阶段从 owner `shared.json` 继承同一组大文件根目录,然后直接复用 owner 派生出来的 runtime log / cache 边界。
+
+### 1.2 ops / bare shared supervisor control plane
+这里不要把 `ops` 和 `bare` 理解成两套彼此独立的面。两者确实共用同一个 `selection_supervisor.py + log_shard.py` 实现源,但当前实际落盘边界不是一棵完全统一的目录树。
+
+先区分两个层次:
+
+| 层次 | 稳定根 | 主要内容 |
+| --- | --- | --- |
+| `deployconf -> gen_bare -> bare bootstrap` | `hostworkdir` | generated control scripts、bare 服务日志 |
+| `ops` runtime | `workdir` | runtime config、embedded supervisor runtime、ops-managed workload 日志 |
+
+其中:
+
+- `hostworkdir` 是节点级宿主根,用来承载 deployer 下发产物、bare 控制脚本和其他需要跨进程稳定复用的目录。
+- `workdir` 是某个具体进程实例自己的运行子目录,用来承载该实例的 runtime config、embedded supervisor runtime 和它托管出来的 workload 日志。
+- 位置关系上,当前 self-host deployconf 里 `workdir` 通常是 `hostworkdir` 的子目录;语义关系上,`workdir` 仍然只是“某个实例的运行子树”,不能反过来代表整个 `hostworkdir`。
+
+bare 稳定根当前可以直观看成:
+
+```text
+${HOSTWORKDIR}/
+ log/
+ ops_controller.<YYYY-MM-DD>.log
+ ops_agent.<YYYY-MM-DD>.log
+ <service_name>.<YYYY-MM-DD>.log
+ gen_bare_deploy_bash/
+ start_ops_controller.sh
+ start_ops_agent.sh
+ start_.sh
+ stop_ops_controller.sh
+ stop_ops_agent.sh
+ stop_.sh
+ start_.sh
+ stop_.sh
+ selection_supervisor.py
+ log_shard.py
+ entrypoint__.sh
+```
+
+当前 self-host deployconf 下,`hostworkdir` 与 `ops workdir` 的实际位置关系可以直观看成:
+
+```text
+${HOSTWORKDIR}/
+ gen_bare_deploy_bash/
+ ...
+ log/
+ ops_controller.<YYYY-MM-DD>.log
+ ops_agent.<YYYY-MM-DD>.log
+ <service_name>.<YYYY-MM-DD>.log
+ ops_controller/
+ ops_controller.yaml
+ selection_supervisor/
+ selection_supervisor.py
+ log_shard.py
+ log/
+ workload__<kind>__<name>.<YYYY-MM-DD>.log
+ ops_agent/
+ <node_id>/
+ ops_agent.yaml
+ selection_supervisor/
+ selection_supervisor.py
+ log_shard.py
+ log/
+ workload__<kind>__<name>.<YYYY-MM-DD>.log
+```
+
+这里再把 contract 说清楚:
+
+- `${HOSTWORKDIR}/gen_bare_deploy_bash/` 里的 `start_*.sh` / `stop_*.sh` 是 generated control scripts,是这套 shared supervisor 控制面的入口脚本,不是另一套独立 authority。
+- bare 这一层的稳定逻辑基名仍然是 `${HOSTWORKDIR}/log/<service_name>.log`,shared supervisor runtime 再把它收口为 `${HOSTWORKDIR}/log/<service_name>.<YYYY-MM-DD>.log`。
+- ops-managed workload 这一层的稳定逻辑基名则是 `${WORKDIR}/log/workload__<kind>__<name>.log`,shared supervisor runtime 再把它收口为 `${WORKDIR}/log/workload__<kind>__<name>.<YYYY-MM-DD>.log`。
+- 两层真正共享的是 `selection_supervisor.py + log_shard.py` 这组控制与滚动实现,不是“所有路径和文件名完全一样”。
+
+在当前 self-host deployconf 示例里:
+
+- `ops_controller` 的 workdir 是 `${HOSTWORKDIR}/ops_controller`
+- `ops_agent` 的 workdir 是 `${HOSTWORKDIR}/ops_agent/${NODE_ID}`
+
+### 1.3 testbed
+- `workdir`、`run_dir` 分别承担 launcher、runner、UI、workload 的 run-scoped 落盘边界。
+- `testbed` 必须显式作为 `log_service_kind` 出现,不再用泛化名称代替。
+- launcher 和 workload 的目录语义要和 ops 对齐。
+- 当前优先级不是先把 testbed 做到完美支持,而是先把 ops 长时服务日志 contract 讲清楚并收口;testbed 继续按“服务级日志”和“case artifact”分开讨论。
+
+### 1.4 FS
+- `shared_file_path` 与 `export.remote_root_dir_abs` 分开使用。
+- 前者负责共享 attachment 边界。
+- 后者负责业务数据边界。
+
+这里的目标很明确:目录可以不同,语义必须一致。`log`、`cache`、`shared attachment`、`workload data` 不能混在同一个边界里。
+
+## 2. 文件命名
+当前实现里的文件命名还没有完全统一,但已经可以明确分成下面几类。
+
+| 类别 | 当前逻辑基名 | 当前实际落盘 |
+| --- | --- | --- |
+| KV runtime | `fluxon-kv-.log` | `fluxon-kv-..log` |
+| bare 服务日志 | `<service_name>.log` | `<service_name>.<YYYY-MM-DD>.log` |
+| ops-managed workload | `workload__<kind>__<name>.log` | `workload__<kind>__<name>.<YYYY-MM-DD>.log` |
+| testbed 服务日志 | `test_runner.log` / `test_runner_ui.log` | `test_runner.<YYYY-MM-DD>.log` / `test_runner_ui.<YYYY-MM-DD>.log` |
+| KV side worker stdio | `side_worker_.stdout.log` / `side_worker_.stderr.log` | 当前还没补日期分片 |
+
+补充说明:
+
+- KV runtime 日志当前仍由 `fluxon_util::init_log(...)` 创建,`run_master_impl(...)` 和 `run_client_impl(...)` 都会初始化这套本地文件日志,所以 `master`、`owner`、`external` 这些 KV 运行时进程当前确实都会产生这类文件。
+- `ops` 里还保留一些特例命名,例如 `smoke.log`、`smoke_bare.log`、`smoke_workloads_bare.log`。这些都属于当前实现尚未收口的历史命名。
+- `testbed` 当前仍然没有单一 canonical log filename。服务级日志已经补上时间分片,但 `ci_runner` 等 case 级日志仍主要落在 `results//run_/logs/**` 与 `summary.yaml`、`exception.txt`、`ci.log` 这类 run artifact 里。
+
+清理只依据文件名里约定好的日期分片字段,不按目录数量、文件大小或历史批次做判断。这样本地清理和 Greptime retention 才能共享同一时间窗口。
+
+## 3. 元数据字段
+这一节描述的是当前 KV OTLP 导出链路已经实际写入 Greptime 的元数据字段。
+
+| 字段 | 含义 |
+| --- | --- |
+| `service.name` | 当前固定为 `fluxon` |
+| `fluxon_cluster_name` | 集群名 |
+| `fluxon_member_kind` | 当前业务类型标签,例如 `kv` |
+| `fluxon_role` | 当前进程角色标签,例如 `master`、`owner_client`、`external_client` |
+| `fluxon_member_id` | 当前实例标识 |
+
+当前实现里的日志元数据仍然是围绕 `cluster_name`、`member_kind`、`role`、`member_id` 这组字段组织的;`log_service_kind`、`log_kind`、`process_role`、`instance_key`、`workload_kind`、`workload_name` 这些更细的统一字段,目前还没有完整进入导出链路。
+
+## 4. 归档、超时与清理
+本地文件日志按天滚动归档,默认保留 31 天。清理时只扫描 canonical log file name,并按命名约定提取日期分片删除过期文件,不按文件数量或目录总量触发。
+
+流式备份和 OTLP 导出也服从同一套窗口:
+
+| 项目 | 规则 |
+| --- | --- |
+| 导出策略 | best-effort,不阻塞主业务路径 |
+| 队列满 | 允许丢弃,并保留可观测信号 |
+| 发送失败 | 允许跳过当前 batch,本地文件仍在 |
+| 停机行为 | shutdown 时执行 best-effort flush |
+| 超时语义 | 单次导出必须有硬上界,不能无限挂起 |
+
+Greptime 侧的 retention / TTL 也按同一日期窗口收口,保证本地与远端的保留语义一致。这里要把远端清理语义说死:写入 `fluxon_logs` 的日志记录默认只保留 31 天(约 1 个月,与本地窗口一致),超过窗口的数据必须由 Greptime 表级 TTL 或定时清理任务删除,不能只依赖查询层按时间过滤“看不见旧数据”。
+
+如果后续本地窗口仍保持 31 天,那么 Greptime 侧也应保持同一 31 天窗口;如果本地窗口改为新的 canonical 值,远端 TTL 也必须同步调整。`disable_observability=true` 只关闭 OTLP 层,不关闭本地文件日志。
+
+如果某条 stream 只是“备份副本”,它不能绕开本地日志的归档窗口单独永久存活。超时后应停止 tailing、释放资源,并交回本地文件归档策略处理历史文件。
+
+## 5. 当前实现里已经收口的点
+这一节只写已经可以当作当前事实使用的内容。
+
+### 5.1 本地文件按天分片与 31 天窗口
+- KV runtime 已具备稳定的按天滚动与保留窗口。
+- bare 服务日志已经接到 shared supervisor 的按天分片与同口径清理。
+- ops-managed workload 日志已经接到 shared supervisor 的按天分片与同口径清理。
+- `test_runner` / `test_runner_ui` 这类 testbed 服务级日志已补齐按天分片与本地 31 天保留窗口。
+
+### 5.2 shared supervisor 已经统一到一个实现源
+- bare bootstrap 与 ops-managed workload 现在都复用 `selection_supervisor.py + log_shard.py` 这组实现。
+- `gen_bare_deploy_bash.py` 会把同一个 `log_shard.py` helper 下发到生成目录。
+- bare 启动脚本层保留的是稳定逻辑基名,真正的 stdio 重定向和实际分片写入都在共享 `selection_supervisor.py` 运行时里生效。
+
+### 5.3 Rust / Python 已经有三类明确对齐
+- 按天分片与 31 天清理
+- 日志目录派生规则
+- OTLP 基础字段与 Greptime header
+
+## 6. 当前还没有完全收口的点
+这一节只写未完全收口点,避免把“当前事实”和“目标态”混在一起。
+
+### 6.1 KV 目录边界还没有完全收口到单一 `share_path`
+- 预期 KV 最终收口为单一 `share_path`,统一承载 `mmap.file`、`shared.json` 和 side transfer metadata。
+- 当前 Rust 实现仍保留 `shared_memory_path` 与 `shared_file_path` 两条配置,并分别用于 `mmap.file` 与 `shared.json` / `peer metadata` 的就绪探测和发布。
+
+### 6.2 side worker stdio 仍未收口到统一按天分片
+- zero-contribution bootstrap 已经在启动前继承 owner 的 `large_file_paths`,因此 KV runtime logger 不再依赖 attach 后热切换文件路径。
+- 但 side worker stdio 当前仍然直接写 `side_worker_.stdout.log` / `side_worker_.stderr.log`,还没有补到统一的按天分片命名。
+
+### 6.3 side worker stdio 与历史 `smoke` 文件还没纳入这轮收口
+- side worker stdio 当前仍是 `side_worker_.stdout.log` / `side_worker_.stderr.log`。
+- `smoke.log`、`smoke_bare.log`、`smoke_workloads_bare.log` 一类历史命名仍然存在。
+
+### 6.4 testbed 只有服务级日志收口到了同类语义
+- `test_runner`、`test_runner_ui` 已改为“稳定逻辑基名 + 按天分片落盘”。
+- case 级 `run_dir/logs/**`、`summary.yaml`、`resolved_case.yaml`、`benchmark_result.json` 等仍按 run artifact 生命周期消费。
+- `history_lookback_days` 仍只是控制 UI 回看哪些 workdir;`gitops retention.max_age_days` 仍然清理 gitops run 目录,不是 testbed 服务日志文件的统一 TTL。
+
+### 6.5 OTLP 统一字段和统一状态机还没有全部收口
+- 当前导出链路仍以 `cluster_name`、`member_kind`、`role`、`member_id` 为主。
+- `log_service_kind`、`log_kind`、`process_role`、`instance_key`、`workload_kind`、`workload_name` 这组更细的 canonical 字段还没有完整进入导出链路。
+- Rust 通用链路已经把 `disabled`、`direct`、`proxy`、失败分支显式枚举出来;Python benchmark exporter 仍是直连特化路径,还没有进入同一套通用发送状态机。
+
+## 7. rs / py 模块对齐与防漂移
+稳定结论先说死:
+
+- 共享 log contract 以 Rust canonical 模块为准,Python 优先复用 Rust 已经导出的结果。
+- 当前已经能从代码直接看出三类对齐:按天分片与 31 天清理、日志目录派生、OTLP 基础字段与 header。
+- 当前还没有完全收口的是通用 OTLP 发送状态机。Rust 已经显式枚举发送分支,Python 侧 benchmark exporter 仍是直连特化路径。
+
+### 7.1 按天分片与本地保留窗口
+Rust `fluxon_rs/fluxon_util/src/log.rs`:
+
+```rust
+const LOG_RETENTION_DAYS: usize = 31;
+
+pub fn current_daily_sharded_log_path(base_path: &Path) -> anyhow::Result<PathBuf> {
+ daily_sharded_log_path(base_path, current_shard_date()?)
+}
+
+fn cleanup_old_daily_sharded_logs(base_path: &Path, retention_days: usize) -> anyhow::Result<()> {
+ let keep_since = current_shard_date()? - chrono::Days::new(retention_days.saturating_sub(1) as u64);
+ ...
+ if shard_date < keep_since {
+ fs::remove_file(&path)?;
+ }
+}
+
+impl DailyShardedFileWriter {
+ fn rotate_if_needed(&self, state: &mut DailyShardedFileWriterState) -> io::Result<()> {
+ let next_path = self.current_path()?;
+ cleanup_old_daily_sharded_logs(&self.base_path, self.retention_days)?;
+ let file = fs::OpenOptions::new().create(true).append(true).open(&next_path)?;
+ state.current_path = Some(next_path);
+ state.current_file = Some(file);
+ Ok(())
+ }
+}
+```
+
+Python `deployment/utils/log_shard.py`:
+
+```python
+DEFAULT_DAILY_LOG_RETENTION_DAYS = 31
+
+def daily_sharded_log_path(base_path: Path, *, now: Optional[datetime.datetime] = None) -> Path:
+ shard_date = _resolve_shard_date(ts)
+ return (base_path.parent / f"{stem}.{shard_date.isoformat()}.log").resolve()
+
+def cleanup_old_daily_sharded_logs(base_path: Path, *, retention_days: int = DEFAULT_DAILY_LOG_RETENTION_DAYS) -> None:
+ current_shard_date = _resolve_shard_date(datetime.datetime.now(datetime.timezone.utc))
+ keep_since = current_shard_date - datetime.timedelta(
+ days=max(int(retention_days) - 1, 0)
+ )
+```
+
+这两段现在对齐的是同一个显式 contract:逻辑基名保持不变,日期字段统一落在 `<stem>.<YYYY-MM-DD>.log`,默认本地窗口都是 31 天,而且过期删除都显式按日期分片判断。这里不要机械要求两边 helper 名称完全一样;对齐的是“按天分片 + 31 天窗口 + 同口径清理”这条 contract。
+
+### 7.2 KV 主日志是 Rust;Python 侧要分 bare 服务日志和 ops-managed workload 日志两层
+先把边界说死:KV runtime 主日志当前基本都是 Rust 在输出。`master`、`owner`、`external` 这些 KV 进程走的是 `fluxon_util::init_log(...)` 这条链。Python 一侧真正需要单独检查的,当前已经分成两层:
+
+- `deployconf -> gen_bare -> bare bootstrap` 这一层,负责 `ops_controller`、`ops_agent` 和其他 bare service 自身的 stdout/stderr。
+- `ops_agent` 进入 desired-runtime 管理之后,再去托管 workload;这一层的日志 contract 不再沿用 bare `${service_name}.log`,而是 `workload__<kind>__<name>.log`。
+
+先看 bare 这一层:
+
+Python `deployment/gen_bare_deploy_bash.py`:
+
+```python
+from log_shard import render_module_source as render_log_shard_module_source
+
+(outdir / LOG_SHARD_HELPER_FILENAME).write_text(
+ render_log_shard_module_source(),
+ encoding="utf-8",
+)
+```
+
+```python
+runtime_state_json = _bare_runtime_state_json(
+ workload_name=workload_name,
+ authority_name=...,
+ service_name=service_name,
+ log_path=f"${{HOSTWORKDIR}}/log/{service_name}.log",
+)
+
+LOG_DIR="$HOSTWORKDIR/log"
+LOGFILE="$LOG_DIR/${SERVICE}.log"
+...
+SUPERVISOR_PID=$( ... < /dev/null & echo "$!" )
+```
+
+Python `deployment/utils/selection_supervisor_codegen.py`:
+
+```python
+def _redirect_process_stdio_to_runtime_log(runtime_state: Optional[SelectionRuntimeState]) -> None:
+ base_log_path = _require_non_empty_str(runtime_state.log_path, "state.log_path")
+
+ def _router_loop() -> None:
+ _LOG_SHARD.relay_fd_to_daily_sharded_logs(
+ base_log_path=base_log_path,
+ read_fd=read_fd,
+ retention_days=_LOG_SHARD.DEFAULT_DAILY_LOG_RETENTION_DAYS,
+ )
+
+ os.dup2(write_fd, sys.stdout.fileno())
+ os.dup2(write_fd, sys.stderr.fileno())
+
+...
+
+_redirect_process_stdio_to_runtime_log(runtime_state)
+```
+
+再看 ops-managed workload 这一层:
+
+Rust `fluxon_rs/fluxon_ops/src/lib.rs`:
+
+```rust
+fn workload_log_filename(kind: WorkloadKind, name: &str) -> anyhow::Result<String> {
+ Ok(format!("workload__{}__{}.log", kind.as_str(), name))
+}
+
+let runtime_dir = workdir.join(OPS_SELECTION_SUPERVISOR_DIR_NAME);
+let log_dir = workdir.join(OPS_LOG_DIR_NAME);
+let log_path = self.log_dir.join(log_filename);
+```
+
+这组代码说明当前现状是:
+
+- bare bootstrap 与 ops-managed workload 确实已经复用了同一个 `selection_supervisor.py + log_shard.py` 实现源。
+- bare 服务日志与 ops-managed workload 日志也都已经真正接到这套滚动管理 helper 上。
+- 但两层当前并不是同一个 path contract:
+ - bare 服务日志保留的是 `${HOSTWORKDIR}/log/${service_name}.log`
+ - ops-managed workload 保留的是 `${WORKDIR}/log/workload__<kind>__<name>.log`
+
+### 7.3 OTLP 基础字段与 header 已经同名对齐
+Rust `fluxon_rs/fluxon_observability/src/greptime_otlp_log.rs`:
+
+```rust
+let kvs = vec![
+ KeyValue { key: KEY_CLUSTER_NAME.to_string(), value: Some(...) },
+ KeyValue { key: KEY_MEMBER_KIND.to_string(), value: Some(...) },
+ KeyValue { key: KEY_ROLE.to_string(), value: Some(...) },
+ KeyValue { key: KEY_MEMBER_ID.to_string(), value: Some(...) },
+];
+
+let mut reqb = self
+ .http
+ .post(&self.endpoint)
+ .header("X-Greptime-DB-Name", &self.db_name)
+ .header("X-Greptime-Log-Extract-Keys", GREPTIME_LOG_EXTRACT_KEYS_HEADER_VALUE);
+```
+
+Python `fluxon_test_stack/distributed_benchmark_node.py`:
+
+```python
+log_attrs: Dict[str, Any] = {
+ "fluxon_cluster_name": self._cfg.cluster_name,
+ "fluxon_member_kind": self._cfg.member_kind,
+ "fluxon_role": self._cfg.role,
+ "fluxon_member_id": self._cfg.member_id,
+}
+
+headers = {
+ "Content-Type": "application/x-protobuf",
+ "X-Greptime-DB-Name": self._cfg.db_name,
+ "X-Greptime-Log-Extract-Keys": ",".join(extract_keys),
+}
+```
+
+这两边已经对齐到同一个最小公共集合:`fluxon_cluster_name`、`fluxon_member_kind`、`fluxon_role`、`fluxon_member_id` 这组基础属性同名同义,Greptime header 也保持同一协议面。Python benchmark exporter 可以补 phase summary 字段,但不能改写这组基础字段的含义。
+
+### 7.4 发送状态机还没有完全收口
+Rust `fluxon_rs/fluxon_observability/src/greptime_otlp_log_orchestrator.rs`:
+
+```rust
+pub enum GreptimeOtlpLogAttemptResult<N> {
+ Disabled,
+ Sent { path: GreptimeOtlpLogSendPath, proxy_node: Option<N> },
+ SkippedNoProxy { detail: String },
+ ProxyFailed { proxy_node: N, detail: String },
+}
+```
+
+Python `fluxon_test_stack/distributed_benchmark_node.py`:
+
+```python
+with urllib.request.urlopen(req, timeout=GREPTIME_OTLP_LOG_TIMEOUT_SECONDS) as resp:
+ status = getattr(resp, "status", 200)
+ if int(status) < 200 or int(status) >= 300:
+ body_text = resp.read().decode("utf-8", errors="replace")
+ raise RuntimeError(f"greptime otlp http {status}: {body_text}")
+```
+
+这组对照反映的是当前边界:Rust 通用链路已经把 `disabled`、`direct`、`proxy`、失败分支显式枚举出来;Python 这里只是 benchmark phase summary 的直连特化路径,还没有进入同一套通用发送状态机。后续如果 Python 需要承担通用 service-plane 导出,应该复用 Rust 这组有限分支,而不是再发明一套平行状态模型。
+
+### 7.5 防止未来漂移
+只保留四条工程规则:
+
+1. 共享 contract 只保留一个真相源。目录派生、canonical 字段、发送状态、TTL 这类会跨语言消费的语义,优先由 Rust 定义,Python 复用导出结果或逐项镜像实现。
+2. 任何改动如果影响 canonical 文件名、OTLP 字段、Greptime header、发送分支或 retention,必须同一个 PR 同时更新 Rust 代码、Python 代码、设计文档和至少一层 contract test。
+3. Python 特化路径必须显式标出作用域。`test_runner` 服务日志和 benchmark phase summary 可以保留自己的实现,但不能反向成为公共 contract 的定义源。
+4. 多语言边界坚持一个概念一个名字。不要在 rs / py 两边分别引入近义字段、别名参数或平行配置面,否则文档、查询、清理和告警都会漂移。
diff --git a/fluxon_py/config.py b/fluxon_py/config.py
index 9b7b447..51e0d7d 100644
--- a/fluxon_py/config.py
+++ b/fluxon_py/config.py
@@ -110,6 +110,9 @@ def _yaml_template():
cluster_name: # Cluster name (str)
shared_memory_path: # Shared memory path (str)
shared_file_path: # Shared file path for shared.json/logs/profiles (str)
+ large_file_paths: # Owner-mode large file roots (dict(optional))
+ log_root_path: # Log root path for owner/client large-file outputs (str)
+ cache_root_path: # Cache root path for owner/client large-file outputs (str)
p2p_listen_port: # P2P QUIC listen port override (int(optional))
redis_compat: # Enable Redis protocol shim (dict(optional))
listen_addr: # TCP listen addr, e.g. "127.0.0.1:16379" (str)
@@ -584,6 +587,18 @@ def to_fluxon_kv_client_config_yaml_str(self) -> str:
return yaml.safe_dump(cfg, sort_keys=False)
+ if "large_file_paths" not in spec:
+ raise ValueError("fluxonkv_spec.large_file_paths is required for owner mode")
+ large_file_paths = spec.get("large_file_paths")
+ if not isinstance(large_file_paths, dict):
+ raise ValueError("fluxonkv_spec.large_file_paths must be a mapping in owner mode")
+ for field_name in ("log_root_path", "cache_root_path"):
+ field_value = large_file_paths.get(field_name)
+ if not isinstance(field_value, str) or not field_value.strip():
+ raise ValueError(
+ f"fluxonkv_spec.large_file_paths.{field_name} must be a non-empty string in owner mode"
+ )
+
return yaml.safe_dump(cfg, sort_keys=False)
diff --git a/fluxon_py/tests/test_config.py b/fluxon_py/tests/test_config.py
index 379e3e0..2979d8e 100644
--- a/fluxon_py/tests/test_config.py
+++ b/fluxon_py/tests/test_config.py
@@ -47,6 +47,7 @@ def _build_checks(selected_test_id: Optional[str]) -> List[Tuple[str, Callable[[
("to_yaml_str_roundtrip", _run_test_to_yaml_str_roundtrip),
("fluxonkv_sub_cluster_config", test_fluxonkv_sub_cluster_config),
("fluxonkv_owner_requires_sub_cluster", test_fluxonkv_owner_requires_sub_cluster),
+ ("fluxonkv_owner_requires_large_file_paths", test_fluxonkv_owner_requires_large_file_paths),
("fluxonkv_p2p_relay_removed", test_fluxonkv_p2p_relay_removed),
("fluxon_client_config_yaml_shape", test_fluxon_client_config_yaml_shape),
("fluxonkv_protocol_field", test_fluxonkv_protocol_field),
@@ -270,6 +271,54 @@ def test_fluxonkv_owner_requires_sub_cluster():
print(f"❌ FAIL: test_fluxonkv_owner_requires_sub_cluster - {e}")
+def test_fluxonkv_owner_requires_large_file_paths():
+ """Ensure owner mode requires explicit large_file_paths roots."""
+ try:
+ base = {
+ "instance_key": "test_instance",
+ "contribute_to_cluster_pool_size": {"dram": 16777216, "vram": {}},
+ "fluxonkv_spec": {
+ "etcd_addresses": ["localhost:2379"],
+ "cluster_name": "test_cluster",
+ "shared_memory_path": "/tmp/kvcache_shared_memory/test",
+ "shared_file_path": "/tmp/kvcache_shared_files/test",
+ "sub_cluster": "rack-a",
+ },
+ }
+
+ try:
+ FluxonKvClientConfig(copy.deepcopy(base)).to_fluxon_kv_client_config_yaml_str()
+ print("❌ FAIL: test_fluxonkv_owner_requires_large_file_paths - missing large_file_paths should be rejected")
+ return
+ except ValueError:
+ pass
+
+ invalid_blank = copy.deepcopy(base)
+ invalid_blank["fluxonkv_spec"]["large_file_paths"] = {
+ "log_root_path": " ",
+ "cache_root_path": "/tmp/kvcache_large_cache/test",
+ }
+ try:
+ FluxonKvClientConfig(invalid_blank).to_fluxon_kv_client_config_yaml_str()
+ print("❌ FAIL: test_fluxonkv_owner_requires_large_file_paths - blank log_root_path should be rejected")
+ return
+ except ValueError:
+ pass
+
+ valid = copy.deepcopy(base)
+ valid["fluxonkv_spec"]["large_file_paths"] = {
+ "log_root_path": "/tmp/kvcache_large_logs/test",
+ "cache_root_path": "/tmp/kvcache_large_cache/test",
+ }
+ rendered = FluxonKvClientConfig(valid).to_fluxon_kv_client_config_yaml_str()
+ assert "large_file_paths:" in rendered
+ assert "log_root_path: /tmp/kvcache_large_logs/test" in rendered
+ assert "cache_root_path: /tmp/kvcache_large_cache/test" in rendered
+ print("✅ PASS: test_fluxonkv_owner_requires_large_file_paths")
+ except Exception as e:
+ print(f"❌ FAIL: test_fluxonkv_owner_requires_large_file_paths - {e}")
+
+
def test_fluxonkv_p2p_relay_removed():
"""Ensure removed fluxonkv_spec.p2p_relay is rejected as an unknown key."""
try:
diff --git a/fluxon_rs/Cargo.lock b/fluxon_rs/Cargo.lock
index 4ddcf9b..a4b0ecd 100644
--- a/fluxon_rs/Cargo.lock
+++ b/fluxon_rs/Cargo.lock
@@ -1320,6 +1320,7 @@ dependencies = [
"anyhow",
"askama",
"base64 0.21.7",
+ "chrono",
"clap",
"etcd-client",
"fluxon_cli",
@@ -1336,6 +1337,7 @@ dependencies = [
"serde_json",
"serde_yaml",
"sha2",
+ "tempfile",
"thiserror 1.0.69",
"tokio",
"tracing",
diff --git a/fluxon_rs/fluxon_fs/src/agent.rs b/fluxon_rs/fluxon_fs/src/agent.rs
index eca583e..03a3dd0 100644
--- a/fluxon_rs/fluxon_fs/src/agent.rs
+++ b/fluxon_rs/fluxon_fs/src/agent.rs
@@ -1407,20 +1407,20 @@ impl FluxonFsAgent {
.get_self_info()
.id
.to_string();
- let shared_file_path = if self.kv_framework.is_external_mode() {
+ let cache_root_base = if self.kv_framework.is_external_mode() {
self.kv_framework
.external_client_api_view()
.external_client_api()
.inner()
- .shared_file_path()
+ .cache_root_path()
} else {
self.kv_framework
.client_seg_pool_view()
.client_seg_pool()
- .shared_file_path()
+ .cache_root_path()
.to_string()
};
- let cache_root = resolve_disk_cache_root(Path::new(&shared_file_path), &instance_key);
+ let cache_root = resolve_disk_cache_root(Path::new(&cache_root_base), &instance_key);
let cache =
RemoteDiskCacheManager::new(cache_root.clone(), disk_cache_max_bytes_from_env())
.map_err(|err| {
diff --git a/fluxon_rs/fluxon_kv/src/client_seg_pool/mod.rs b/fluxon_rs/fluxon_kv/src/client_seg_pool/mod.rs
index 7902beb..fb54c06 100644
--- a/fluxon_rs/fluxon_kv/src/client_seg_pool/mod.rs
+++ b/fluxon_rs/fluxon_kv/src/client_seg_pool/mod.rs
@@ -46,6 +46,8 @@ pub struct ClientSegPoolNewArg {
pub contribute_size: ContributeToClusterPoolSize,
pub shared_memory_path: String,
pub shared_file_path: String,
+ pub log_root_path: String,
+ pub cache_root_path: String,
pub cluster_name: String,
pub etcd_addresses: Vec,
pub attach_existing_meta: Option,
@@ -64,6 +66,7 @@ pub struct SharedJsonMeta {
pub etcd_addresses: Vec,
pub shared_memory_path: String,
pub shared_file_path: String,
+ pub large_file_paths: crate::config::LargeFilePaths,
pub protocol_version: String,
pub write_ts: Option,
}
@@ -203,6 +206,10 @@ pub struct ClientSegPoolInner {
shared_memory_path: String,
/// Directory path for regular files (shared.json, side-transfer metadata).
shared_file_path: String,
+ /// Base directory for runtime logs and profile outputs.
+ log_root_path: String,
+ /// Base directory for large cache files.
+ cache_root_path: String,
side_transfer_worker: bool,
attach_owner_ref: Option,
@@ -262,6 +269,8 @@ impl ClientSegPool {
let contribute_size = arg.contribute_size;
let shared_memory_path = arg.shared_memory_path;
let shared_file_path = arg.shared_file_path;
+ let log_root_path = arg.log_root_path;
+ let cache_root_path = arg.cache_root_path;
let cluster_name = arg.cluster_name;
let etcd_addresses = arg.etcd_addresses;
let attach_existing_meta = arg.attach_existing_meta;
@@ -356,6 +365,8 @@ impl ClientSegPool {
view: std::sync::OnceLock::new(),
shared_memory_path: shared_memory_path.clone(),
shared_file_path: shared_file_path.clone(),
+ log_root_path: log_root_path.clone(),
+ cache_root_path: cache_root_path.clone(),
side_transfer_worker,
attach_owner_ref,
cluster_name: cluster_name.clone(),
@@ -372,6 +383,8 @@ impl ClientSegPool {
view: std::sync::OnceLock::new(),
shared_memory_path: shared_memory_path.clone(),
shared_file_path: shared_file_path.clone(),
+ log_root_path: log_root_path.clone(),
+ cache_root_path: cache_root_path.clone(),
side_transfer_worker,
attach_owner_ref,
cluster_name: cluster_name.clone(),
@@ -535,6 +548,8 @@ impl ClientSegPool {
view: std::sync::OnceLock::new(),
shared_memory_path: base_path.to_string(),
shared_file_path: shared_file_path.clone(),
+ log_root_path,
+ cache_root_path,
side_transfer_worker,
attach_owner_ref,
cluster_name,
@@ -553,6 +568,10 @@ impl ClientSegPool {
&self.inner().shared_file_path
}
+ pub fn cache_root_path(&self) -> &str {
+ &self.inner().cache_root_path
+ }
+
fn transfer_rpc_fast_path_eligible_members(&self) -> Vec {
let inner = self.inner();
let self_info = inner.view().cluster_manager().get_self_info();
@@ -1161,6 +1180,10 @@ impl ClientSegPool {
etcd_addresses: inner.etcd_addresses.clone(),
shared_memory_path: shared_memory_canonical,
shared_file_path: shared_file_canonical,
+ large_file_paths: crate::config::LargeFilePaths {
+ log_root_path: inner.log_root_path.clone(),
+ cache_root_path: inner.cache_root_path.clone(),
+ },
protocol_version,
diff --git a/fluxon_rs/fluxon_kv/src/config.rs b/fluxon_rs/fluxon_kv/src/config.rs
index 218ef69..2df094c 100644
--- a/fluxon_rs/fluxon_kv/src/config.rs
+++ b/fluxon_rs/fluxon_kv/src/config.rs
@@ -379,6 +379,17 @@ fn cluster_scoped_shared_path(root: &str, cluster_name: &str) -> KvResult KvResult {
+ let trimmed = root.trim();
+ if trimmed.is_empty() {
+ return Err(ConfigError::InvalidClientConfig {
+ detail: format!("{field_name} cannot be empty"),
+ }
+ .into_kverror());
+ }
+ Ok(trimmed.to_string())
+}
+
fn resolve_compiled_rdma_transfer_engine() -> KvResult {
Ok(TransferEngineType::Closed)
}
@@ -552,6 +563,8 @@ pub struct FluxonKvSpecYaml {
pub shared_memory_path: String,
pub shared_file_path: String,
#[serde(skip_serializing_if = "Option::is_none")]
+ pub large_file_paths: Option,
+ #[serde(skip_serializing_if = "Option::is_none")]
pub p2p_listen_port: Option,
#[serde(skip_serializing_if = "Option::is_none")]
pub redis_compat: Option>,
@@ -559,6 +572,13 @@ pub struct FluxonKvSpecYaml {
pub sub_cluster: Option>,
}
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(deny_unknown_fields)]
+pub struct LargeFilePathsYaml {
+ pub log_root_path: String,
+ pub cache_root_path: String,
+}
+
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct RedisCompatConfigYaml {
@@ -608,6 +628,12 @@ pub struct FluxonKvSpec {
pub sub_cluster: Option,
}
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub struct LargeFilePaths {
+ pub log_root_path: String,
+ pub cache_root_path: String,
+}
+
/// KV client backend types supported by the system
#[derive(Debug, Clone, PartialEq)]
pub enum KvClientType {
@@ -627,6 +653,7 @@ pub struct ClientConfig {
pub fluxonkv_spec: FluxonKvSpec,
pub shared_memory_path: String, // Mandatory shared memory path
pub shared_file_path: String, // Mandatory shared file path
+ pub large_file_paths: LargeFilePaths, // Mandatory large-file roots for logs and caches
pub test_spec_config: TestSpecConfig,
}
@@ -893,7 +920,7 @@ impl ClientConfigYaml {
.into_kverror());
}
- // External (zero-contribution) mode forbids additional knobs to keep the schema minimal.
+ // External (zero-contribution) mode forbids additional owner-derived knobs to keep the schema minimal.
if is_external {
if self.fluxonkv_spec.redis_compat.is_some() {
return Err(ConfigError::InvalidClientConfig {
@@ -914,6 +941,12 @@ impl ClientConfigYaml {
}
.into_kverror());
}
+ if self.fluxonkv_spec.large_file_paths.is_some() {
+ return Err(ConfigError::InvalidClientConfig {
+ detail: "fluxonkv_spec.large_file_paths is forbidden in zero-contribution mode (it is inherited from owner shared.json)".to_string(),
+ }
+ .into_kverror());
+ }
}
// Preserve historical behavior for configs that omit `protocol`, but allow
@@ -1053,6 +1086,32 @@ impl ClientConfigYaml {
}
.into_kverror());
}
+ let large_file_paths = if is_external {
+ LargeFilePaths {
+ log_root_path: String::new(),
+ cache_root_path: String::new(),
+ }
+ } else {
+ let Some(large_file_paths_yaml) = self.fluxonkv_spec.large_file_paths.as_ref() else {
+ return Err(ConfigError::InvalidClientConfig {
+ detail: "fluxonkv_spec.large_file_paths is required for owner mode"
+ .to_string(),
+ }
+ .into_kverror());
+ };
+ let log_root_path = verify_non_empty_root_path(
+ &large_file_paths_yaml.log_root_path,
+ "large_file_paths.log_root_path",
+ )?;
+ let cache_root_path = verify_non_empty_root_path(
+ &large_file_paths_yaml.cache_root_path,
+ "large_file_paths.cache_root_path",
+ )?;
+ LargeFilePaths {
+ log_root_path,
+ cache_root_path,
+ }
+ };
let shared_memory_path = cluster_scoped_shared_path(
&self.fluxonkv_spec.shared_memory_path,
@@ -1062,7 +1121,6 @@ impl ClientConfigYaml {
&self.fluxonkv_spec.shared_file_path,
&fluxonkv_spec.cluster_name,
)?;
-
let redis_compat_listen_addr = match self.fluxonkv_spec.redis_compat.as_ref() {
None | Some(YamlNullable::Null) => None,
Some(YamlNullable::Value(rc)) => {
@@ -1094,6 +1152,7 @@ impl ClientConfigYaml {
fluxonkv_spec,
shared_memory_path,
shared_file_path,
+ large_file_paths,
test_spec_config,
})
}
@@ -1434,6 +1493,9 @@ fluxonkv_spec:
cluster_name: test_cluster
shared_memory_path: /tmp/test_owner
shared_file_path: /tmp/test_owner_files
+ large_file_paths:
+ log_root_path: /tmp/test_owner_logs
+ cache_root_path: /tmp/test_owner_cache
sub_cluster: rack-a
test_spec_config:
disable_observability: true
@@ -1480,6 +1542,9 @@ fluxonkv_spec:
cluster_name: test_cluster
shared_memory_path: /tmp/test_owner
shared_file_path: /tmp/test_owner_files
+ large_file_paths:
+ log_root_path: /tmp/test_owner_logs
+ cache_root_path: /tmp/test_owner_cache
sub_cluster: rack-a
"#,
)
@@ -1492,6 +1557,45 @@ fluxonkv_spec:
assert!(verified.fluxonkv_spec.enable_transfer_rpc_fast_path);
}
+ #[test]
+ fn client_config_zero_contribution_allows_owner_bootstrapped_large_file_paths() {
+ let cfg = ClientConfigYaml::from_str(
+ r#"
+instance_key: test_external
+fluxonkv_spec:
+ cluster_name: test_cluster
+ shared_memory_path: /tmp/test_external
+ shared_file_path: /tmp/test_external_files
+"#,
+ )
+ .unwrap();
+ let verified = cfg.verify().unwrap();
+ assert_eq!(verified.large_file_paths.log_root_path, "");
+ assert_eq!(verified.large_file_paths.cache_root_path, "");
+ assert_eq!(verified.fluxonkv_spec.etcd_addresses, Vec::::new());
+ assert_eq!(verified.fluxonkv_spec.sub_cluster, None);
+ }
+
+ #[test]
+ fn client_config_zero_contribution_rejects_large_file_paths_in_yaml() {
+ let cfg = ClientConfigYaml::from_str(
+ r#"
+instance_key: test_external
+fluxonkv_spec:
+ cluster_name: test_cluster
+ shared_memory_path: /tmp/test_external
+ shared_file_path: /tmp/test_external_files
+ large_file_paths:
+ log_root_path: /tmp/test_external_logs
+ cache_root_path: /tmp/test_external_cache
+"#,
+ )
+ .unwrap();
+ let err = cfg.verify().unwrap_err();
+ let text = format!("{err}");
+ assert!(text.contains("fluxonkv_spec.large_file_paths is forbidden in zero-contribution mode"));
+ }
+
#[test]
fn client_test_spec_config_accepts_explicit_rdma_device_names() {
let cfg = ClientConfigYaml::from_str(
@@ -1505,6 +1609,9 @@ fluxonkv_spec:
cluster_name: test_cluster
shared_memory_path: /tmp/test_owner
shared_file_path: /tmp/test_owner_files
+ large_file_paths:
+ log_root_path: /tmp/test_owner_logs
+ cache_root_path: /tmp/test_owner_cache
sub_cluster: rack-a
test_spec_config:
transport_mode: transfer_with_rpc
@@ -1558,6 +1665,9 @@ fluxonkv_spec:
cluster_name: test_cluster
shared_memory_path: /tmp/test_owner
shared_file_path: /tmp/test_owner_files
+ large_file_paths:
+ log_root_path: /tmp/test_owner_logs
+ cache_root_path: /tmp/test_owner_cache
sub_cluster: rack-a
test_spec_config:
rdma_device_names: ["mlx5_0"]
@@ -1593,6 +1703,9 @@ fluxonkv_spec:
cluster_name: test_cluster
shared_memory_path: /tmp/test_owner
shared_file_path: /tmp/test_owner_files
+ large_file_paths:
+ log_root_path: /tmp/test_owner_logs
+ cache_root_path: /tmp/test_owner_cache
sub_cluster: rack-a
test_spec_config:
transport_mode: transfer_with_rpc
@@ -1624,6 +1737,9 @@ fluxonkv_spec:
cluster_name: test_cluster
shared_memory_path: /tmp/test_owner
shared_file_path: /tmp/test_owner_files
+ large_file_paths:
+ log_root_path: /tmp/test_owner_logs
+ cache_root_path: /tmp/test_owner_cache
sub_cluster: rack-a
test_spec_config:
require_transfer_rpc_fast_path_ready_timeout_seconds: 45
@@ -1649,6 +1765,9 @@ fluxonkv_spec:
cluster_name: test_cluster
shared_memory_path: /tmp/test_owner
shared_file_path: /tmp/test_owner_files
+ large_file_paths:
+ log_root_path: /tmp/test_owner_logs
+ cache_root_path: /tmp/test_owner_cache
sub_cluster: rack-a
test_spec_config:
tcp_thread_control_lane_count: 0
@@ -1675,6 +1794,9 @@ fluxonkv_spec:
cluster_name: test_cluster
shared_memory_path: /tmp/test_owner
shared_file_path: /tmp/test_owner_files
+ large_file_paths:
+ log_root_path: /tmp/test_owner_logs
+ cache_root_path: /tmp/test_owner_cache
sub_cluster: rack-a
test_spec_config:
transport_mode: transfer_with_rpc
@@ -1706,6 +1828,9 @@ fluxonkv_spec:
cluster_name: test_cluster
shared_memory_path: /tmp/test_owner
shared_file_path: /tmp/test_owner_files
+ large_file_paths:
+ log_root_path: /tmp/test_owner_logs
+ cache_root_path: /tmp/test_owner_cache
sub_cluster: rack-a
test_spec_config:
transport_mode: transfer_with_rpc
@@ -1730,6 +1855,9 @@ fluxonkv_spec:
cluster_name: test_cluster
shared_memory_path: /tmp/test_owner
shared_file_path: /tmp/test_owner_files
+ large_file_paths:
+ log_root_path: /tmp/test_owner_logs
+ cache_root_path: /tmp/test_owner_cache
sub_cluster: rack-a
test_spec_config:
rdma_device_names: ["mlx5_0"]
@@ -1784,6 +1912,9 @@ fluxonkv_spec:
cluster_name: test_cluster
shared_memory_path: /tmp/test_side_worker
shared_file_path: /tmp/test_side_worker_files
+ large_file_paths:
+ log_root_path: /tmp/test_side_worker_logs
+ cache_root_path: /tmp/test_side_worker_cache
p2p_listen_port: 18081
test_spec_config:
enable_side_transfer: true
@@ -1823,6 +1954,9 @@ fluxonkv_spec:
cluster_name: test_cluster
shared_memory_path: /tmp/test_side_worker
shared_file_path: /tmp/test_side_worker_files
+ large_file_paths:
+ log_root_path: /tmp/test_side_worker_logs
+ cache_root_path: /tmp/test_side_worker_cache
test_spec_config:
enable_side_transfer: true
side_transfer_role: worker
@@ -1854,6 +1988,9 @@ fluxonkv_spec:
cluster_name: test_cluster
shared_memory_path: /tmp/test_side_worker
shared_file_path: /tmp/test_side_worker_files
+ large_file_paths:
+ log_root_path: /tmp/test_side_worker_logs
+ cache_root_path: /tmp/test_side_worker_cache
test_spec_config:
enable_side_transfer: true
side_transfer_role: worker
@@ -1883,6 +2020,9 @@ fluxonkv_spec:
cluster_name: test_cluster
shared_memory_path: /tmp/test_owner
shared_file_path: /tmp/test_owner_files
+ large_file_paths:
+ log_root_path: /tmp/test_owner_logs
+ cache_root_path: /tmp/test_owner_cache
p2p_listen_port: 18081
sub_cluster: rack-a
test_spec_config:
@@ -1915,6 +2055,9 @@ fluxonkv_spec:
cluster_name: test_cluster
shared_memory_path: /tmp/test_owner
shared_file_path: /tmp/test_owner_files
+ large_file_paths:
+ log_root_path: /tmp/test_owner_logs
+ cache_root_path: /tmp/test_owner_cache
sub_cluster: rack-a
"#,
)
@@ -1940,6 +2083,9 @@ fluxonkv_spec:
cluster_name: test_cluster
shared_memory_path: /tmp/test_owner
shared_file_path: /tmp/test_owner_files
+ large_file_paths:
+ log_root_path: /tmp/test_owner_logs
+ cache_root_path: /tmp/test_owner_cache
sub_cluster: rack-a
test_spec_config:
transport_mode: transfer_with_rpc
diff --git a/fluxon_rs/fluxon_kv/src/external_client_api/external_client_test.rs b/fluxon_rs/fluxon_kv/src/external_client_api/external_client_test.rs
index f811424..6a36ca7 100644
--- a/fluxon_rs/fluxon_kv/src/external_client_api/external_client_test.rs
+++ b/fluxon_rs/fluxon_kv/src/external_client_api/external_client_test.rs
@@ -2,8 +2,8 @@ use std::collections::HashMap;
use crate::cluster_manager::NodeID;
use crate::config::{
- ClientConfig, ContributeToClusterPoolSize, FluxonKvSpec, MasterConfig, MonitoringConfig,
- ProtocolConfig, ProtocolType, TestSpecConfig, TransferEngineType,
+ ClientConfig, ContributeToClusterPoolSize, FluxonKvSpec, LargeFilePaths, MasterConfig,
+ MonitoringConfig, ProtocolConfig, ProtocolType, TestSpecConfig, TransferEngineType,
};
use crate::master_kv_router::MasterKvRouterView;
use crate::{ConfigArg, run_client, run_master};
@@ -82,6 +82,10 @@ fn new_client_config(
},
shared_memory_path: shm_path.to_string(),
shared_file_path: format!("{}_files", shm_path),
+ large_file_paths: LargeFilePaths {
+ log_root_path: format!("{}_logs", shm_path),
+ cache_root_path: format!("{}_cache", shm_path),
+ },
test_spec_config: TestSpecConfig::default(),
}
}
@@ -124,6 +128,10 @@ fn new_zero_contribution_client_config(
},
shared_memory_path: shm_path.to_string(),
shared_file_path: format!("{}_files", shm_path),
+ large_file_paths: LargeFilePaths {
+ log_root_path: String::new(),
+ cache_root_path: String::new(),
+ },
test_spec_config: TestSpecConfig::default(),
}
}
diff --git a/fluxon_rs/fluxon_kv/src/external_client_api/mod.rs b/fluxon_rs/fluxon_kv/src/external_client_api/mod.rs
index f2634be..0758ab5 100644
--- a/fluxon_rs/fluxon_kv/src/external_client_api/mod.rs
+++ b/fluxon_rs/fluxon_kv/src/external_client_api/mod.rs
@@ -253,6 +253,7 @@ define_module!(
pub struct ExternalClientApiNewArg {
pub shared_memory_path: String,
pub shared_file_path: String,
+ pub cache_root_path: String,
pub expected_cluster_name: String,
pub expected_protocol_version: String,
pub enable_side_transfer: bool,
@@ -312,6 +313,7 @@ pub struct ExternalInner {
expected_protocol_version: String,
external_shared_memory_path: String,
external_shared_file_path: String,
+ external_cache_root_path: String,
_enable_side_transfer: bool,
short_circuit_put_payload_path: bool,
side_rr_next: AtomicUsize,
@@ -363,6 +365,7 @@ impl ExternalClientApi {
expected_protocol_version: arg.expected_protocol_version,
external_shared_memory_path: arg.shared_memory_path,
external_shared_file_path: arg.shared_file_path,
+ external_cache_root_path: arg.cache_root_path,
_enable_side_transfer: arg.enable_side_transfer,
short_circuit_put_payload_path: arg.short_circuit_put_payload_path,
side_rr_next: AtomicUsize::new(0),
@@ -1237,6 +1240,10 @@ impl ExternalInner {
self.external_shared_file_path.clone()
}
+    /// Returns an owned copy of the cache root path stored on this instance
+    /// (the `external_cache_root_path` field).
+    pub fn cache_root_path(&self) -> String {
+        self.external_cache_root_path.to_owned()
+    }
+
fn should_fallback_side_p2p_error(err: &crate::p2p::P2PError) -> bool {
matches!(
err,
diff --git a/fluxon_rs/fluxon_kv/src/kvcore_test_lib.rs b/fluxon_rs/fluxon_kv/src/kvcore_test_lib.rs
index 355ca6e..1b5754d 100644
--- a/fluxon_rs/fluxon_kv/src/kvcore_test_lib.rs
+++ b/fluxon_rs/fluxon_kv/src/kvcore_test_lib.rs
@@ -147,6 +147,10 @@ fn new_client_config_with_cluster_and_dram(
},
shared_memory_path,
shared_file_path,
+ large_file_paths: crate::config::LargeFilePaths {
+ log_root_path: format!("{}/large_logs/{}", base, instance_key),
+ cache_root_path: format!("{}/large_cache/{}", base, instance_key),
+ },
test_spec_config: TestSpecConfig::default(),
};
println!("fluxonkv core created client config for test: {:?}", conf);
diff --git a/fluxon_rs/fluxon_kv/src/lib.rs b/fluxon_rs/fluxon_kv/src/lib.rs
index b46fd85..96e9b28 100644
--- a/fluxon_rs/fluxon_kv/src/lib.rs
+++ b/fluxon_rs/fluxon_kv/src/lib.rs
@@ -105,6 +105,13 @@ use std::sync::Arc;
use std::time::{Duration, Instant};
use tracing::{info, warn};
+/// Result of waiting for the owner's shared bundle: the parsed `shared.json`
+/// plus the values derived from it while it was being checked.
+struct ExternalBootstrapBundle {
+    /// Deserialized content of the owner's `shared.json`.
+    meta: SharedJsonMeta,
+    /// Canonicalized form of the `shared_memory_path` recorded in `shared.json`.
+    shared_memory_path: String,
+    /// Canonicalized form of the `shared_file_path` recorded in `shared.json`.
+    shared_file_path: String,
+    /// `meta.etcd_addresses` after `normalize_etcd_addresses`.
+    etcd_endpoints: Vec<String>,
+}
+
fn cluster_manager_rdma_control_init_from_transfer_config(
_transfer_engine: TransferEngineType,
_protocol: &ProtocolConfig,
@@ -585,7 +592,7 @@ fn tcp_thread_transport_tuning_from_test_spec_config(
}
pub async fn load_client_config(config_arg: ConfigArg) -> KvResult {
- match config_arg {
+ let config = match config_arg {
ConfigArg::None => {
// Try to find default config file
match find_default_config_file() {
@@ -594,13 +601,13 @@ pub async fn load_client_config(config_arg: ConfigArg) -> KvResult
let config_yaml = ClientConfigYaml::from_file(&path)?;
let config = config_yaml.verify()?;
println!("Client configuration loaded and validated successfully");
- Ok(config)
+ config
}
None => Err(ConfigError::FileReadError {
detail: "No config file found. Please provide a config file with -f option"
.to_string(),
}
- .into_kverror()),
+ .into_kverror())?,
}
}
ConfigArg::File(config_path) => {
@@ -608,13 +615,15 @@ pub async fn load_client_config(config_arg: ConfigArg) -> KvResult
let config_yaml = ClientConfigYaml::from_file(&config_path)?;
let config = config_yaml.verify()?;
println!("Client configuration loaded and validated successfully");
- Ok(config)
+ config
}
ConfigArg::Config(config) => {
println!("Using provided client configuration");
- Ok(config)
+ config
}
- }
+ };
+
+ bootstrap_zero_contribution_client_config(config).await
}
pub async fn load_master_config(config_arg: ConfigArg) -> KvResult {
@@ -785,6 +794,7 @@ fn build_side_transfer_worker_config(
},
shared_memory_path: owner_config.shared_memory_path.clone(),
shared_file_path: owner_config.shared_file_path.clone(),
+ large_file_paths: owner_config.large_file_paths.clone(),
test_spec_config,
})
}
@@ -829,6 +839,7 @@ fn build_side_transfer_worker_config_yaml(
cluster_name: side_config.cluster_name,
shared_memory_path: side_config.shared_memory_path,
shared_file_path: side_config.shared_file_path,
+ large_file_paths: None,
p2p_listen_port: side_config.fluxonkv_spec.p2p_listen_port,
redis_compat: None,
sub_cluster: None,
@@ -838,7 +849,7 @@ fn build_side_transfer_worker_config_yaml(
}
fn side_transfer_runtime_dir(owner_config: &ClientConfig) -> PathBuf {
- Path::new(&owner_config.shared_file_path)
+ Path::new(&owner_config.large_file_paths.log_root_path)
.join(format!("{}_cluster_kv_logs", owner_config.cluster_name))
.join("side_transfer_runtime")
.join(&owner_config.instance_key)
@@ -1569,6 +1580,265 @@ fn merge_startup_member_metadata(
Ok(())
}
+/// Completes the configuration of a zero-contribution client from the owner's shared bundle.
+///
+/// A config is treated as zero-contribution when its DRAM contribution is 0 and every
+/// VRAM entry is 0 (an empty VRAM map qualifies). Any other config is returned unchanged.
+///
+/// For a zero-contribution config this awaits `wait_for_external_bootstrap_bundle` and then
+/// overwrites the raw and normalized etcd addresses, `sub_cluster`, both shared paths
+/// (with their canonical form) and `large_file_paths` with the values taken from the
+/// owner's `shared.json`.
+///
+/// # Errors
+/// Propagates any error returned by `wait_for_external_bootstrap_bundle`.
+async fn bootstrap_zero_contribution_client_config(
+    config: ClientConfig,
+) -> KvResult<ClientConfig> {
+    let pool = &config.contribute_to_cluster_pool_size;
+    let contributes_memory = pool.dram != 0 || pool.vram.values().any(|&v| v != 0);
+    if contributes_memory {
+        return Ok(config);
+    }
+
+    let bundle = wait_for_external_bootstrap_bundle(&config).await?;
+    // The bundle is owned, so its fields are moved into the config instead of cloned.
+    let owner_meta = bundle.meta;
+    let mut final_config = config;
+    final_config.etcd_addresses_raw = owner_meta.etcd_addresses;
+    final_config.fluxonkv_spec.etcd_addresses = bundle.etcd_endpoints;
+    final_config.fluxonkv_spec.sub_cluster = owner_meta.sub_cluster;
+    final_config.shared_memory_path = bundle.shared_memory_path;
+    final_config.shared_file_path = bundle.shared_file_path;
+    final_config.large_file_paths = owner_meta.large_file_paths;
+    Ok(final_config)
+}
+
+/// Waits until the owner process has published a shared bundle this client can attach to.
+///
+/// Polls every 200 ms, with no upper bound, until all of the following hold:
+/// 1. `<shared_file_path>/shared.json` and `<shared_memory_path>/mmap.file` exist;
+/// 2. `shared.json` can be read and deserialized into `SharedJsonMeta`;
+/// 3. its `protocol_version` equals the current build's git commit id;
+/// 4. its `cluster_name` equals `config.cluster_name`;
+/// 5. the configured and the recorded shared memory / shared file paths can be
+///    canonicalized and canonicalize to the same strings;
+/// 6. its `etcd_addresses` are non-empty and accepted by `normalize_etcd_addresses`.
+///
+/// Every 25th failed poll (about every 5 seconds) the condition still pending is logged.
+///
+/// Returns the parsed metadata, the canonical paths recorded in `shared.json` and the
+/// normalized etcd endpoints. As written, the function only ever returns `Ok`; every
+/// failed check is retried.
+///
+/// # Panics
+/// Panics if `get_current_git_commitid()` fails.
+async fn wait_for_external_bootstrap_bundle(
+    config: &ClientConfig,
+) -> KvResult<ExternalBootstrapBundle> {
+    let build_version = fluxon_util::git_version_build_record::get_current_git_commitid().unwrap();
+    let shared_memory_dir = Path::new(&config.shared_memory_path);
+    let shared_file_dir = Path::new(&config.shared_file_path);
+    let shared_json_path = shared_file_dir.join("shared.json");
+    let mmap_file_path = shared_memory_dir.join("mmap.file");
+
+    let mut waited_ticks: u64 = 0;
+
+    // One failed poll: sleep 200 ms, count the tick, log with the given tracing macro on
+    // every 25th tick, then restart the loop. This is a macro rather than a function
+    // because it has to `continue` the enclosing loop and forward format arguments
+    // unevaluated, so they are only computed when the message is actually logged.
+    macro_rules! retry_after_tick {
+        ($log:ident, $($args:tt)+) => {{
+            limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200)).await;
+            waited_ticks += 1;
+            if waited_ticks % 25 == 0 {
+                $log!($($args)+);
+            }
+            continue
+        }};
+    }
+
+    // NOTE(review): the filesystem probes below are blocking `std::fs` calls made from an
+    // async fn; left unchanged here, but confirm that is acceptable for the runtime in use.
+    loop {
+        if !shared_json_path.exists() || !mmap_file_path.exists() {
+            retry_after_tick!(
+                info,
+                "Waiting owner shared bundle to be ready... ({}s), shm_dir={} file_dir={} (shared.json={}, mmap.file={})",
+                waited_ticks / 5,
+                shared_memory_dir.to_string_lossy(),
+                shared_file_dir.to_string_lossy(),
+                shared_json_path.exists(),
+                mmap_file_path.exists()
+            );
+        }
+
+        let shared_json_buf = match std::fs::read_to_string(&shared_json_path) {
+            Ok(v) => v,
+            Err(e) => retry_after_tick!(
+                warn,
+                "Waiting owner shared.json readable... ({}s), path={}, err={}",
+                waited_ticks / 5,
+                shared_json_path.to_string_lossy(),
+                e
+            ),
+        };
+
+        let meta: crate::SharedJsonMeta = match serde_json::from_str(&shared_json_buf) {
+            Ok(v) => v,
+            Err(e) => retry_after_tick!(
+                warn,
+                "Waiting owner shared.json schema ready... ({}s), path={}, err={}",
+                waited_ticks / 5,
+                shared_json_path.to_string_lossy(),
+                e
+            ),
+        };
+
+        if meta.protocol_version != build_version {
+            retry_after_tick!(
+                warn,
+                "Waiting protocol_version match... ({}s), shm_dir='{}' file_dir='{}', shared='{}', local='{}'",
+                waited_ticks / 5,
+                shared_memory_dir.to_string_lossy(),
+                shared_file_dir.to_string_lossy(),
+                meta.protocol_version,
+                build_version
+            );
+        }
+
+        if meta.cluster_name != config.cluster_name {
+            retry_after_tick!(
+                warn,
+                "Waiting cluster_name match... ({}s), shm_dir='{}' file_dir='{}', config='{}', shared.json='{}'",
+                waited_ticks / 5,
+                shared_memory_dir.to_string_lossy(),
+                shared_file_dir.to_string_lossy(),
+                config.cluster_name,
+                meta.cluster_name
+            );
+        }
+
+        let shared_memory_path_canonical = match std::fs::canonicalize(&config.shared_memory_path) {
+            Ok(v) => v.to_string_lossy().into_owned(),
+            Err(e) => retry_after_tick!(
+                warn,
+                "Waiting shared_memory_path canonicalizable... ({}s), shm_dir='{}', path='{}', err={}",
+                waited_ticks / 5,
+                shared_memory_dir.to_string_lossy(),
+                config.shared_memory_path,
+                e
+            ),
+        };
+
+        let meta_shm_canonical = match std::fs::canonicalize(&meta.shared_memory_path) {
+            Ok(v) => v.to_string_lossy().into_owned(),
+            Err(e) => retry_after_tick!(
+                warn,
+                "Waiting shared.json shared_memory_path canonicalizable... ({}s), shm_dir='{}', path='{}', err={}",
+                waited_ticks / 5,
+                shared_memory_dir.to_string_lossy(),
+                meta.shared_memory_path,
+                e
+            ),
+        };
+
+        let shared_file_path_canonical = match std::fs::canonicalize(&config.shared_file_path) {
+            Ok(v) => v.to_string_lossy().into_owned(),
+            Err(e) => retry_after_tick!(
+                warn,
+                "Waiting shared_file_path canonicalizable... ({}s), file_dir='{}', path='{}', err={}",
+                waited_ticks / 5,
+                shared_file_dir.to_string_lossy(),
+                config.shared_file_path,
+                e
+            ),
+        };
+
+        let meta_file_canonical = match std::fs::canonicalize(&meta.shared_file_path) {
+            Ok(v) => v.to_string_lossy().into_owned(),
+            Err(e) => retry_after_tick!(
+                warn,
+                "Waiting shared.json shared_file_path canonicalizable... ({}s), file_dir='{}', path='{}', err={}",
+                waited_ticks / 5,
+                shared_file_dir.to_string_lossy(),
+                meta.shared_file_path,
+                e
+            ),
+        };
+
+        if meta_shm_canonical != shared_memory_path_canonical {
+            retry_after_tick!(
+                warn,
+                "Waiting shared_memory_path match... ({}s), shm_dir='{}', config='{}', shared.json='{}'",
+                waited_ticks / 5,
+                shared_memory_dir.to_string_lossy(),
+                shared_memory_path_canonical,
+                meta_shm_canonical
+            );
+        }
+
+        if meta_file_canonical != shared_file_path_canonical {
+            retry_after_tick!(
+                warn,
+                "Waiting shared_file_path match... ({}s), file_dir='{}', config='{}', shared.json='{}'",
+                waited_ticks / 5,
+                shared_file_dir.to_string_lossy(),
+                shared_file_path_canonical,
+                meta_file_canonical
+            );
+        }
+
+        if meta.etcd_addresses.is_empty() {
+            retry_after_tick!(
+                warn,
+                "Waiting shared.json etcd_addresses non-empty... ({}s), shm_dir='{}' file_dir='{}', shared_memory_path='{}'",
+                waited_ticks / 5,
+                shared_memory_dir.to_string_lossy(),
+                shared_file_dir.to_string_lossy(),
+                meta_shm_canonical
+            );
+        }
+
+        let etcd_endpoints = match normalize_etcd_addresses(&meta.etcd_addresses) {
+            Ok(v) => v,
+            Err(e) => retry_after_tick!(
+                warn,
+                "Waiting shared.json etcd_addresses valid... ({}s), shm_dir='{}' file_dir='{}', raw={:?}, err={}",
+                waited_ticks / 5,
+                shared_memory_dir.to_string_lossy(),
+                shared_file_dir.to_string_lossy(),
+                meta.etcd_addresses,
+                e
+            ),
+        };
+
+        return Ok(ExternalBootstrapBundle {
+            meta,
+            shared_memory_path: meta_shm_canonical,
+            shared_file_path: meta_file_canonical,
+            etcd_endpoints,
+        });
+    }
+}
+
async fn run_client_impl(
config_arg: ConfigArg,
test_overrides: Option,
@@ -1598,9 +1868,8 @@ async fn run_client_impl(
let build_version = fluxon_util::git_version_build_record::get_current_git_commitid().unwrap();
let source_sha256 = fluxon_util::build_info::SOURCE_SHA256;
- // 初始化日志系统:将日志放到共享文件根目录
- // 下的 {cluster_name}_cluster_kv_logs 子目录,避免在 shm 根目录下展开普通文件。
- let kv_logs_dir = Path::new(&config.shared_file_path)
+ // Logs and other large files are isolated from shared.json/peer metadata.
+ let kv_logs_dir = Path::new(&config.large_file_paths.log_root_path)
.join(format!("{}_cluster_kv_logs", config.cluster_name));
let observability_disabled = config.test_spec_config.disable_observability;
let greptime_tracing_rx = if observability_disabled {
@@ -1651,263 +1920,10 @@ async fn run_client_impl(
config.test_spec_config.side_transfer_role,
Some(SideTransferRole::Worker)
);
- let mut bootstrapped_shared_meta: Option = None;
-
- let config = if is_external {
- let shared_memory_dir = Path::new(&config.shared_memory_path);
- let shared_file_dir = Path::new(&config.shared_file_path);
- let shared_json_path = shared_file_dir.join("shared.json");
- let mmap_file_path = shared_memory_dir.join("mmap.file");
-
- let mut waited_ticks: u64 = 0;
- let (meta, meta_shm_canonical, meta_file_canonical, etcd_endpoints) = loop {
- if !shared_json_path.exists() || !mmap_file_path.exists() {
- limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200)).await;
- waited_ticks += 1;
- if waited_ticks % 25 == 0 {
- info!(
- "Waiting owner shared bundle to be ready... ({}s), shm_dir={} file_dir={} (shared.json={}, mmap.file={})",
- waited_ticks / 5,
- shared_memory_dir.to_string_lossy(),
- shared_file_dir.to_string_lossy(),
- shared_json_path.exists(),
- mmap_file_path.exists()
- );
- }
- continue;
- }
-
- let shared_json_buf = match std::fs::read_to_string(&shared_json_path) {
- Ok(v) => v,
- Err(e) => {
- limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200))
- .await;
- waited_ticks += 1;
- if waited_ticks % 25 == 0 {
- warn!(
- "Waiting owner shared.json readable... ({}s), path={}, err={}",
- waited_ticks / 5,
- shared_json_path.to_string_lossy(),
- e
- );
- }
- continue;
- }
- };
-
- let meta: crate::SharedJsonMeta = match serde_json::from_str(&shared_json_buf) {
- Ok(v) => v,
- Err(e) => {
- limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200))
- .await;
- waited_ticks += 1;
- if waited_ticks % 25 == 0 {
- warn!(
- "Waiting owner shared.json schema ready... ({}s), path={}, err={}",
- waited_ticks / 5,
- shared_json_path.to_string_lossy(),
- e
- );
- }
- continue;
- }
- };
-
- if meta.protocol_version != build_version {
- limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200)).await;
- waited_ticks += 1;
- if waited_ticks % 25 == 0 {
- warn!(
- "Waiting protocol_version match... ({}s), shm_dir='{}' file_dir='{}', shared='{}', local='{}'",
- waited_ticks / 5,
- shared_memory_dir.to_string_lossy(),
- shared_file_dir.to_string_lossy(),
- meta.protocol_version,
- build_version
- );
- }
- continue;
- }
-
- if meta.cluster_name != config.cluster_name {
- limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200)).await;
- waited_ticks += 1;
- if waited_ticks % 25 == 0 {
- warn!(
- "Waiting cluster_name match... ({}s), shm_dir='{}' file_dir='{}', config='{}', shared.json='{}'",
- waited_ticks / 5,
- shared_memory_dir.to_string_lossy(),
- shared_file_dir.to_string_lossy(),
- config.cluster_name,
- meta.cluster_name
- );
- }
- continue;
- }
-
- let shared_memory_path_canonical = match std::fs::canonicalize(
- &config.shared_memory_path,
- ) {
- Ok(v) => v.to_string_lossy().into_owned(),
- Err(e) => {
- limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200))
- .await;
- waited_ticks += 1;
- if waited_ticks % 25 == 0 {
- warn!(
- "Waiting shared_memory_path canonicalizable... ({}s), shm_dir='{}', path='{}', err={}",
- waited_ticks / 5,
- shared_memory_dir.to_string_lossy(),
- config.shared_memory_path,
- e
- );
- }
- continue;
- }
- };
-
- let meta_shm_canonical = match std::fs::canonicalize(&meta.shared_memory_path) {
- Ok(v) => v.to_string_lossy().into_owned(),
- Err(e) => {
- limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200))
- .await;
- waited_ticks += 1;
- if waited_ticks % 25 == 0 {
- warn!(
- "Waiting shared.json shared_memory_path canonicalizable... ({}s), shm_dir='{}', path='{}', err={}",
- waited_ticks / 5,
- shared_memory_dir.to_string_lossy(),
- meta.shared_memory_path,
- e
- );
- }
- continue;
- }
- };
- let shared_file_path_canonical = match std::fs::canonicalize(&config.shared_file_path) {
- Ok(v) => v.to_string_lossy().into_owned(),
- Err(e) => {
- limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200))
- .await;
- waited_ticks += 1;
- if waited_ticks % 25 == 0 {
- warn!(
- "Waiting shared_file_path canonicalizable... ({}s), file_dir='{}', path='{}', err={}",
- waited_ticks / 5,
- shared_file_dir.to_string_lossy(),
- config.shared_file_path,
- e
- );
- }
- continue;
- }
- };
- let meta_file_canonical = match std::fs::canonicalize(&meta.shared_file_path) {
- Ok(v) => v.to_string_lossy().into_owned(),
- Err(e) => {
- limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200))
- .await;
- waited_ticks += 1;
- if waited_ticks % 25 == 0 {
- warn!(
- "Waiting shared.json shared_file_path canonicalizable... ({}s), file_dir='{}', path='{}', err={}",
- waited_ticks / 5,
- shared_file_dir.to_string_lossy(),
- meta.shared_file_path,
- e
- );
- }
- continue;
- }
- };
-
- if meta_shm_canonical != shared_memory_path_canonical {
- limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200)).await;
- waited_ticks += 1;
- if waited_ticks % 25 == 0 {
- warn!(
- "Waiting shared_memory_path match... ({}s), shm_dir='{}', config='{}', shared.json='{}'",
- waited_ticks / 5,
- shared_memory_dir.to_string_lossy(),
- shared_memory_path_canonical,
- meta_shm_canonical
- );
- }
- continue;
- }
- if meta_file_canonical != shared_file_path_canonical {
- limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200)).await;
- waited_ticks += 1;
- if waited_ticks % 25 == 0 {
- warn!(
- "Waiting shared_file_path match... ({}s), file_dir='{}', config='{}', shared.json='{}'",
- waited_ticks / 5,
- shared_file_dir.to_string_lossy(),
- shared_file_path_canonical,
- meta_file_canonical
- );
- }
- continue;
- }
-
- if meta.etcd_addresses.is_empty() {
- limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200)).await;
- waited_ticks += 1;
- if waited_ticks % 25 == 0 {
- warn!(
- "Waiting shared.json etcd_addresses non-empty... ({}s), shm_dir='{}' file_dir='{}', shared_memory_path='{}'",
- waited_ticks / 5,
- shared_memory_dir.to_string_lossy(),
- shared_file_dir.to_string_lossy(),
- meta_shm_canonical
- );
- }
- continue;
- }
-
- let etcd_endpoints = match normalize_etcd_addresses(&meta.etcd_addresses) {
- Ok(v) => v,
- Err(e) => {
- limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200))
- .await;
- waited_ticks += 1;
- if waited_ticks % 25 == 0 {
- warn!(
- "Waiting shared.json etcd_addresses valid... ({}s), shm_dir='{}' file_dir='{}', raw={:?}, err={}",
- waited_ticks / 5,
- shared_memory_dir.to_string_lossy(),
- shared_file_dir.to_string_lossy(),
- meta.etcd_addresses,
- e
- );
- }
- continue;
- }
- };
-
- break (
- meta,
- meta_shm_canonical,
- meta_file_canonical,
- etcd_endpoints,
- );
- };
- bootstrapped_shared_meta = Some(meta.clone());
- // External bootstrap contract:
- // - Config provides: instance_key, fluxonkv_spec.cluster_name, fluxonkv_spec.shared_memory_path,
- // fluxonkv_spec.shared_file_path, fluxonkv_spec.p2p_listen_port.
- // - shared.json provides: cluster_name, etcd_addresses (raw), shared_memory_path (canonical),
- // shared_file_path (canonical), protocol_version, sub_cluster.
- // - pprof_duration_seconds is not inherited; it is controlled solely by config.
- let mut final_config = config.clone();
- final_config.etcd_addresses_raw = meta.etcd_addresses.clone();
- final_config.fluxonkv_spec.etcd_addresses = etcd_endpoints;
- final_config.fluxonkv_spec.sub_cluster = meta.sub_cluster;
- final_config.shared_memory_path = meta_shm_canonical;
- final_config.shared_file_path = meta_file_canonical;
- final_config
+ let bootstrapped_shared_meta = if is_external {
+ Some(wait_for_external_bootstrap_bundle(&config).await?.meta)
} else {
- config
+ None
};
if !is_external && config.test_spec_config.side_transfer_worker_count > 0 {
@@ -2012,6 +2028,7 @@ async fn run_client_impl(
external_client_api_arg: ExternalClientApiNewArg {
shared_memory_path: config.shared_memory_path.clone(),
shared_file_path: config.shared_file_path.clone(),
+ cache_root_path: config.large_file_paths.cache_root_path.clone(),
expected_cluster_name: config.cluster_name.clone(),
expected_protocol_version: build_version.clone(),
enable_side_transfer: config.test_spec_config.enable_side_transfer,
@@ -2063,6 +2080,8 @@ async fn run_client_impl(
// Read shared memory path from config (must not be empty).
shared_memory_path: config.shared_memory_path.clone(),
shared_file_path: config.shared_file_path.clone(),
+ log_root_path: config.large_file_paths.log_root_path.clone(),
+ cache_root_path: config.large_file_paths.cache_root_path.clone(),
cluster_name: config.cluster_name.clone(),
etcd_addresses: config.etcd_addresses_raw.clone(),
attach_existing_meta: if is_side_transfer_worker {
@@ -2405,7 +2424,7 @@ async fn run_client_impl(
}
let shutdown_waiter = framework.cluster_manager_view().register_shutdown_waiter();
- let kv_profiles_dir = Path::new(&config.shared_file_path)
+ let kv_profiles_dir = Path::new(&config.large_file_paths.log_root_path)
.join(format!("{}_cluster_kv_profiles", config.cluster_name));
profile::spawn_pprof_flamegraph_on_timeout_or_shutdown(
config.pprof_duration_seconds,
@@ -2485,6 +2504,10 @@ mod tests {
},
shared_memory_path: "/tmp/fluxon_side_transfer_test".to_string(),
shared_file_path: "/tmp/fluxon_side_transfer_test_files".to_string(),
+ large_file_paths: crate::config::LargeFilePaths {
+ log_root_path: "/tmp/fluxon_side_transfer_test_large/log".to_string(),
+ cache_root_path: "/tmp/fluxon_side_transfer_test_large/cache".to_string(),
+ },
test_spec_config: TestSpecConfig {
enable_side_transfer: true,
side_transfer_worker_count: 4,
@@ -2720,6 +2743,7 @@ mod tests {
);
assert!(side_cfg_yaml.contribute_to_cluster_pool_size.is_none());
assert!(side_cfg_yaml.fluxonkv_spec.etcd_addresses.is_none());
+ assert!(side_cfg_yaml.fluxonkv_spec.large_file_paths.is_none());
assert!(side_cfg_yaml.fluxonkv_spec.sub_cluster.is_none());
assert_eq!(side_cfg_yaml.fluxonkv_spec.p2p_listen_port, Some(42001));
assert_eq!(
@@ -2728,6 +2752,101 @@ mod tests {
);
}
+    /// A zero-contribution client config whose `large_file_paths` are empty must come back
+    /// from bootstrap carrying the log/cache roots, `sub_cluster` and normalized etcd
+    /// endpoints recorded in the owner's `shared.json`.
+    #[tokio::test]
+    async fn zero_contribution_bootstrap_inherits_large_file_paths_from_owner_shared_json() {
+        // Lay out the owner's directories and the mmap file the bootstrap wait looks for.
+        let root = new_test_dir("fluxon_external_bootstrap_large_paths");
+        let shm_dir = root.join("shared_mem");
+        let file_dir = root.join("shared_file");
+        let log_dir = root.join("owner_logs");
+        let cache_dir = root.join("owner_cache");
+        for dir in [&shm_dir, &file_dir, &log_dir, &cache_dir] {
+            std::fs::create_dir_all(dir).unwrap();
+        }
+        std::fs::write(shm_dir.join("mmap.file"), vec![0u8; 4096]).unwrap();
+
+        let shm_canonical = std::fs::canonicalize(&shm_dir)
+            .unwrap()
+            .to_string_lossy()
+            .into_owned();
+        let file_canonical = std::fs::canonicalize(&file_dir)
+            .unwrap()
+            .to_string_lossy()
+            .into_owned();
+        let owner_log_str = log_dir.to_string_lossy().into_owned();
+        let owner_cache_str = cache_dir.to_string_lossy().into_owned();
+
+        // Publish the owner's shared.json with non-empty large-file roots.
+        let owner_meta = SharedJsonMeta {
+            owner_id: "owner-a".to_string(),
+            node_start_time: 123,
+            segment_len: 4096,
+            segment_label: Some("cpu:0".to_string()),
+            sub_cluster: Some("owner-sub".to_string()),
+            cluster_name: "test_cluster".to_string(),
+            etcd_addresses: vec!["127.0.0.1:2379".to_string()],
+            shared_memory_path: shm_canonical,
+            shared_file_path: file_canonical,
+            large_file_paths: crate::config::LargeFilePaths {
+                log_root_path: owner_log_str.clone(),
+                cache_root_path: owner_cache_str.clone(),
+            },
+            protocol_version: fluxon_util::git_version_build_record::get_current_git_commitid()
+                .unwrap(),
+            write_ts: Some(chrono::Utc::now().timestamp_micros()),
+        };
+        let owner_meta_json = serde_json::to_vec(&owner_meta).unwrap();
+        std::fs::write(file_dir.join("shared.json"), owner_meta_json).unwrap();
+
+        // The external client contributes nothing and leaves its large-file roots empty.
+        let external_config = ClientConfig {
+            cluster_name: "test_cluster".to_string(),
+            etcd_addresses_raw: Vec::new(),
+            instance_key: "external-a".to_string(),
+            contribute_to_cluster_pool_size: ContributeToClusterPoolSize {
+                dram: 0,
+                vram: HashMap::new(),
+            },
+            protocol: ProtocolConfig {
+                protocol_type: ProtocolType::Tcp,
+                rdma_device_names: None,
+            },
+            pprof_duration_seconds: None,
+            redis_compat_listen_addr: None,
+            fluxonkv_spec: FluxonKvSpec {
+                etcd_addresses: Vec::new(),
+                cluster_name: "test_cluster".to_string(),
+                p2p_listen_port: Some(41001),
+                transfer_engine: TransferEngineType::P2p,
+                enable_transfer_rpc_fast_path: false,
+                sub_cluster: None,
+            },
+            shared_memory_path: shm_dir.to_string_lossy().into_owned(),
+            shared_file_path: file_dir.to_string_lossy().into_owned(),
+            large_file_paths: crate::config::LargeFilePaths {
+                log_root_path: String::new(),
+                cache_root_path: String::new(),
+            },
+            test_spec_config: TestSpecConfig::default(),
+        };
+
+        let bootstrapped = bootstrap_zero_contribution_client_config(external_config)
+            .await
+            .expect("bootstrap zero-contribution config");
+
+        assert_eq!(bootstrapped.large_file_paths.log_root_path, owner_log_str);
+        assert_eq!(bootstrapped.large_file_paths.cache_root_path, owner_cache_str);
+        assert_eq!(
+            bootstrapped.fluxonkv_spec.sub_cluster,
+            Some("owner-sub".to_string())
+        );
+        assert_eq!(
+            bootstrapped.fluxonkv_spec.etcd_addresses,
+            vec!["http://127.0.0.1:2379".to_string()]
+        );
+    }
+
#[test]
fn current_exe_name_helpers_detect_python_and_fluxon_kv() {
assert!(current_exe_looks_like_python(Path::new(
diff --git a/fluxon_rs/fluxon_kv/src/memholder/memholder_test.rs b/fluxon_rs/fluxon_kv/src/memholder/memholder_test.rs
index 377a1c2..5b260c3 100644
--- a/fluxon_rs/fluxon_kv/src/memholder/memholder_test.rs
+++ b/fluxon_rs/fluxon_kv/src/memholder/memholder_test.rs
@@ -94,6 +94,10 @@ fn new_client_config_with_size(
},
shared_memory_path: format!("/tmp/kvcache_shared_memory/{}", instance_key),
shared_file_path: format!("/tmp/kvcache_shared_files/{}", instance_key),
+ large_file_paths: crate::config::LargeFilePaths {
+ log_root_path: format!("/tmp/kvcache_large_logs/{}", instance_key),
+ cache_root_path: format!("/tmp/kvcache_large_cache/{}", instance_key),
+ },
test_spec_config: TestSpecConfig::default(),
}
}
@@ -127,6 +131,10 @@ fn new_zero_contribution_client_config(
},
shared_memory_path: format!("/tmp/kvcache_shared_memory/{}", owner_instance_key),
shared_file_path: format!("/tmp/kvcache_shared_files/{}", owner_instance_key),
+ large_file_paths: crate::config::LargeFilePaths {
+ log_root_path: String::new(),
+ cache_root_path: String::new(),
+ },
test_spec_config: TestSpecConfig::default(),
}
}
diff --git a/fluxon_rs/fluxon_ops/Cargo.toml b/fluxon_rs/fluxon_ops/Cargo.toml
index 0d54fc5..f4f772a 100644
--- a/fluxon_rs/fluxon_ops/Cargo.toml
+++ b/fluxon_rs/fluxon_ops/Cargo.toml
@@ -5,6 +5,7 @@ edition = "2024"
[dependencies]
anyhow = { workspace = true }
+chrono = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
serde_yaml = { workspace = true }
@@ -28,3 +29,6 @@ fluxon_framework = { path = "../fluxon_framework" }
fluxon_util = { path = "../fluxon_util" }
fluxon_cli = { path = "../fluxon_cli" }
fluxon_proxy = { path = "../fluxon_proxy" }
+
+[dev-dependencies]
+tempfile = { workspace = true }
diff --git a/fluxon_rs/fluxon_ops/build.rs b/fluxon_rs/fluxon_ops/build.rs
index ae424ef..585fbfc 100644
--- a/fluxon_rs/fluxon_ops/build.rs
+++ b/fluxon_rs/fluxon_ops/build.rs
@@ -58,14 +58,23 @@ print(
String::from_utf8(output.stdout).expect("selection supervisor output must be utf-8")
}
+/// Loads the source text of `deployment/utils/log_shard.py` located under `repo_root`.
+///
+/// # Panics
+/// Panics with the helper path and the underlying I/O error when the file cannot be read.
+fn render_log_shard_helper(repo_root: &Path) -> String {
+    let mut helper_path = repo_root.to_path_buf();
+    helper_path.extend(["deployment", "utils", "log_shard.py"]);
+    match fs::read_to_string(&helper_path) {
+        Ok(source) => source,
+        Err(err) => panic!(
+            "read log shard helper failed: {} ({})",
+            helper_path.display(),
+            err
+        ),
+    }
+}
+
fn main() {
let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").expect("CARGO_MANIFEST_DIR"));
let repo_root = repo_root(&manifest_dir);
let source = render_selection_supervisor(&repo_root);
+ let log_shard_source = render_log_shard_helper(&repo_root);
let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR"));
let out_path = out_dir.join("selection_supervisor.py");
fs::write(&out_path, source).expect("write embedded selection supervisor source");
+ let helper_out_path = out_dir.join("log_shard.py");
+ fs::write(&helper_out_path, log_shard_source).expect("write embedded log shard helper");
println!("cargo:rerun-if-changed=build.rs");
println!(
@@ -76,4 +85,8 @@ fn main() {
.join("selection_supervisor_codegen.py")
.display()
);
+ println!(
+ "cargo:rerun-if-changed={}",
+ repo_root.join("deployment").join("utils").join("log_shard.py").display()
+ );
}
diff --git a/fluxon_rs/fluxon_ops/src/lib.rs b/fluxon_rs/fluxon_ops/src/lib.rs
index 40f646a..b27420b 100644
--- a/fluxon_rs/fluxon_ops/src/lib.rs
+++ b/fluxon_rs/fluxon_ops/src/lib.rs
@@ -28,7 +28,8 @@ use fluxon_kv::{ConfigArg, Framework, run_client};
use fluxon_proxy::{HeaderKv, PanelProxyMethod, PanelProxyResp};
use fluxon_util::{
- FluxonCliProxyDescriptorV2, FluxonCliProxyTransportV2, fluxon_cli_proxy_desc_etcd_key_v2,
+ FluxonCliProxyDescriptorV2, FluxonCliProxyTransportV2, display_runtime_log_path,
+ fluxon_cli_proxy_desc_etcd_key_v2, resolve_readable_log_path,
};
pub const OPS_SERVICE_NAME: &str = "ops";
@@ -57,6 +58,7 @@ const OPS_ATOMIC_GROUP_ANNOTATION_KEY: &str = "fluxon.io/atomic_group";
const OPS_ATOMIC_GROUP_PHASE_ANNOTATION_KEY: &str = "fluxon.io/atomic_group_phase";
const OPS_ATOMIC_GROUP_ORDER_ANNOTATION_KEY: &str = "fluxon.io/atomic_group_order";
const OPS_SELECTION_SUPERVISOR_FILENAME: &str = "selection_supervisor.py";
+const OPS_LOG_SHARD_HELPER_FILENAME: &str = "log_shard.py";
const OPS_SELECTION_SUPERVISOR_DIR_NAME: &str = "selection_supervisor";
const OPS_SELECTION_SUPERVISOR_RUN_RESTART_DELAY_SECONDS: u64 = 5;
const OPS_SELECTION_SUPERVISOR_RUN_MAX_BACKOFF_SECONDS: u64 = 30;
@@ -78,6 +80,7 @@ const DELETE_APPLY_NO_WAIT_DELAY_SECONDS: u64 = 30;
const EMBEDDED_SELECTION_SUPERVISOR_SOURCE: &str =
include_str!(concat!(env!("OUT_DIR"), "/selection_supervisor.py"));
+const EMBEDDED_LOG_SHARD_HELPER_SOURCE: &str = include_str!(concat!(env!("OUT_DIR"), "/log_shard.py"));
// Ops controller uses Fluxon user-RPC to talk to ops agents.
// Keep the timeout as a fixed constant to avoid config surface area.
@@ -970,7 +973,7 @@ fn resolve_python_host_executable(python_exe: &Path) -> anyhow::Result
Ok(resolved)
}
-fn ensure_embedded_selection_supervisor(workdir: &Path) -> anyhow::Result {
+fn ensure_embedded_selection_supervisor_runtime(workdir: &Path) -> anyhow::Result<(PathBuf, PathBuf)> {
let runtime_dir = workdir.join(OPS_SELECTION_SUPERVISOR_DIR_NAME);
std::fs::create_dir_all(&runtime_dir).with_context(|| {
format!(
@@ -979,6 +982,7 @@ fn ensure_embedded_selection_supervisor(workdir: &Path) -> anyhow::Result existing != EMBEDDED_SELECTION_SUPERVISOR_SOURCE,
Err(e) => {
@@ -992,6 +996,19 @@ fn ensure_embedded_selection_supervisor(workdir: &Path) -> anyhow::Result existing != EMBEDDED_LOG_SHARD_HELPER_SOURCE,
+ Err(e) => {
+ if e.kind() == std::io::ErrorKind::NotFound {
+ true
+ } else {
+ return Err(anyhow::Error::new(e).context(format!(
+ "read embedded log shard helper failed: {}",
+ helper_path.display()
+ )));
+ }
+ }
+ };
if should_write {
std::fs::write(&script_path, EMBEDDED_SELECTION_SUPERVISOR_SOURCE).with_context(|| {
format!(
@@ -1019,13 +1036,21 @@ fn ensure_embedded_selection_supervisor(workdir: &Path) -> anyhow::Result anyhow::Result {
let python_exe = resolve_python_host_executable(python_exe)?;
- let script_path = ensure_embedded_selection_supervisor(workdir)?;
+ let (script_path, _helper_path) = ensure_embedded_selection_supervisor_runtime(workdir)?;
if !hostworkdir.is_absolute() {
anyhow::bail!(
"hostworkdir must be absolute for shared selection supervisor runtime: {}",
@@ -1647,7 +1672,9 @@ fn selection_status_from_live_supervisor(
apply_id: runtime_state.as_ref().and_then(|v| v.apply_id.clone()),
argv: runtime_state.as_ref().map(|v| v.argv.clone()),
cwd: runtime_state.as_ref().and_then(|v| v.cwd.clone()),
- log_path: runtime_state.as_ref().map(|v| v.log_path.clone()),
+ log_path: runtime_state
+ .as_ref()
+ .map(|v| display_runtime_log_path(v.log_path.as_str())),
started_ts_ms: None,
owner_ts_ms: Some(supervisor.owner_ts_ms),
supervisor_start_time_ticks: Some(supervisor.start_time_ticks()),
@@ -2970,7 +2997,8 @@ impl UserRpcHandler for ReadWorkloadLogChunkHandler {
}
};
- let path = self.log_dir.join(log_filename);
+ let logical_path = self.log_dir.join(log_filename);
+ let path = resolve_readable_log_path(&logical_path).unwrap_or(logical_path.clone());
let meta = match std::fs::metadata(&path) {
Ok(v) => v,
Err(e) => {
@@ -3773,8 +3801,12 @@ fn desired_workload_matches_running(
workloads: &SupervisorBackedWorkloads,
desired: &AgentDesiredWorkload,
) -> bool {
- let _ = workloads;
- let Ok(status) = observe_selection_status(desired.kind, &desired.name, &desired.authority)
+ let Ok(status) = observe_selection_status_for_scope(
+ desired.kind,
+ &desired.name,
+ &desired.authority,
+ Some(workloads.scope_key.as_str()),
+ )
else {
return false;
};
@@ -3854,7 +3886,6 @@ fn desired_workload_recovery_superseded(
workloads: &SupervisorBackedWorkloads,
desired: &AgentDesiredWorkload,
) -> anyhow::Result {
- let _ = workloads;
// English note:
// - A newer apply-owned generation overlapping an older applyless bare owner is the expected
// phase-1 state of the self-host two-phase handover.
@@ -3863,7 +3894,12 @@ fn desired_workload_recovery_superseded(
// phase 2 has a chance to cut over.
// - Only an owner_ts that is newer than the requested workload and is not this intentional
// phase-1 overlap is treated as a hard superseding fact.
- let status = observe_selection_status(desired.kind, &desired.name, &desired.authority)?;
+ let status = observe_selection_status_for_scope(
+ desired.kind,
+ &desired.name,
+ &desired.authority,
+ Some(workloads.scope_key.as_str()),
+ )?;
if phase1_overlap_with_applyless_owner(&status, desired) {
return Ok(false);
}
@@ -13938,6 +13974,90 @@ mod tests {
assert!(err_text.contains("owner_ts_ms collision"), "{err_text}");
}
+    /// Two live supervisors that share a label and `--owner-ts-ms` value must be told apart
+    /// by their `--scope-key` in both the scoped lookup and the status listing.
+    #[test]
+    fn live_selection_supervisors_isolate_same_label_collision_by_scope_key() {
+        // The two command lines are identical except for the scope key argument.
+        let supervisor_argv = |scope_key: &str| -> Vec<String> {
+            [
+                "/usr/bin/python3",
+                "selection_supervisor.py",
+                "run",
+                "--label",
+                "DaemonSet/target",
+                "--scope-key",
+                scope_key,
+                "--owner-ts-ms",
+                "2",
+            ]
+            .into_iter()
+            .map(String::from)
+            .collect()
+        };
+        let mut infos_by_pid = std::collections::HashMap::new();
+        infos_by_pid.insert(
+            11,
+            ProcessInfoObservation {
+                pid: 11,
+                ppid: 1,
+                pgid: 11,
+                state: 'S',
+                start_time_ticks: 100,
+            },
+        );
+        infos_by_pid.insert(
+            22,
+            ProcessInfoObservation {
+                pid: 22,
+                ppid: 1,
+                pgid: 22,
+                state: 'S',
+                start_time_ticks: 200,
+            },
+        );
+        let snapshot = SelectionSupervisorProcSnapshot {
+            infos_by_pid,
+            children_by_ppid: std::collections::HashMap::new(),
+            cmdlines: vec![
+                (11, supervisor_argv("/tmp/scope-a")),
+                (22, supervisor_argv("/tmp/scope-b")),
+            ],
+            zombie_infos: Vec::new(),
+        };
+
+        // Scoped lookup by label: each scope key must resolve to exactly its own supervisor.
+        for (scope_key, expected_pid) in [("/tmp/scope-a", 11), ("/tmp/scope-b", 22)] {
+            let scoped =
+                live_selection_supervisors(&snapshot, Some("DaemonSet/target"), Some(scope_key))
+                    .unwrap();
+            assert_eq!(scoped.len(), 1);
+            assert_eq!(scoped[0].pid(), expected_pid);
+        }
+
+        // Status listing: each scope key must report one entry carrying the shared label.
+        for (scope_key, expected_pid) in [("/tmp/scope-a", 11), ("/tmp/scope-b", 22)] {
+            let listed =
+                observe_all_selection_statuses_for_snapshot(&snapshot, Some(scope_key)).unwrap();
+            assert_eq!(listed.len(), 1);
+            assert_eq!(listed[0].label, "DaemonSet/target");
+            assert_eq!(listed[0].pid, Some(expected_pid));
+        }
+    }
+
#[test]
fn live_selection_supervisors_reject_matching_legacy_entry_without_owner_ts_ms() {
let snapshot = SelectionSupervisorProcSnapshot {
@@ -14405,6 +14525,95 @@ mod tests {
.unwrap();
}
+    /// Materializing the supervisor runtime must leave the log shard helper file in the same
+    /// directory as the supervisor script.
+    #[test]
+    fn materialize_selection_supervisor_runtime_writes_log_shard_helper() {
+        let python_exe = PathBuf::from("/usr/bin/python3");
+        assert!(
+            python_exe.is_file(),
+            "python executable does not exist: {}",
+            python_exe.display()
+        );
+        let workdir = tempfile::tempdir().unwrap();
+        let root = workdir.path();
+        let runtime =
+            SelectionSupervisorRuntime::materialize(root, root, python_exe.as_path()).unwrap();
+        assert!(runtime.script_path.exists());
+        // The helper is expected as a sibling of the materialized script.
+        let script_dir = runtime.script_path.parent().unwrap();
+        let helper_path = script_dir.join(OPS_LOG_SHARD_HELPER_FILENAME);
+        assert!(helper_path.is_file());
+    }
+
+    /// Spawns a detached supervisor with `--owner-ts-ms 0` and checks that the message
+    /// "owner-ts-ms must be positive" reaches the runtime log within ten seconds.
+    ///
+    /// The spawned process is sent `SIGTERM` before any assertion runs, so a failing
+    /// assertion cannot leave a detached supervisor (started with `--restart-policy always`)
+    /// behind on the host.
+    #[test]
+    fn detached_selection_supervisor_preserves_early_startup_logs() {
+        let python_exe = PathBuf::from("/usr/bin/python3");
+        assert!(
+            python_exe.is_file(),
+            "python executable does not exist: {}",
+            python_exe.display()
+        );
+        let workdir = tempfile::tempdir().unwrap();
+        let runtime =
+            SelectionSupervisorRuntime::materialize(workdir.path(), workdir.path(), python_exe.as_path())
+                .unwrap();
+        let log_path = workdir.path().join("startup.log");
+        let python_arg = python_exe.display().to_string();
+        let script_arg = runtime.script_path.display().to_string();
+        let scope_key = workdir.path().display().to_string();
+        let command: Vec<String> = [
+            python_arg.as_str(),
+            script_arg.as_str(),
+            "run",
+            "--label",
+            "Deployment/startup_demo",
+            "--scope-key",
+            scope_key.as_str(),
+            "--owner-ts-ms",
+            "0",
+            "--restart-policy",
+            "always",
+            "--restart-delay-seconds",
+            "5",
+            "--max-backoff-seconds",
+            "30",
+            "--crashloop-consecutive-restarts",
+            "0",
+            "--crashloop-interval-lt-seconds",
+            "0",
+            "--",
+            "/bin/true",
+        ]
+        .into_iter()
+        .map(String::from)
+        .collect();
+        let pid = runtime.spawn_detached_command(&log_path, command.as_slice()).unwrap();
+        let expected = "owner-ts-ms must be positive";
+        // Reads whichever log file currently resolves for `log_path`; `None` means no
+        // readable log exists yet, and an unreadable file is treated as empty text.
+        let read_log = || {
+            resolve_readable_log_path(&log_path)
+                .map(|path| std::fs::read_to_string(path).unwrap_or_default())
+        };
+        let deadline = Instant::now() + Duration::from_secs(10);
+        let mut saw_expected = false;
+        while Instant::now() < deadline {
+            if read_log().is_some_and(|text| text.contains(expected)) {
+                saw_expected = true;
+                break;
+            }
+            std::thread::sleep(Duration::from_millis(100));
+        }
+        // Snapshot the final log state, then terminate the supervisor BEFORE asserting so
+        // that a failed assertion does not leak the detached process.
+        let final_text = read_log();
+        let _ = std::process::Command::new("kill")
+            .arg("-TERM")
+            .arg(pid.to_string())
+            .status();
+        let Some(text) = final_text else {
+            panic!("runtime log path did not materialize");
+        };
+        assert!(
+            text.contains(expected),
+            "expected detached supervisor startup logs to reach runtime log, got: {text:?}"
+        );
+        assert!(saw_expected, "startup log was not observed before timeout");
+    }
+
#[test]
fn atomic_group_non_agent_requires_present_before_running_match() {
let desired = AgentDesiredWorkload {
@@ -14616,4 +14825,25 @@ mod tests {
};
assert!(!phase1_overlap_with_applyless_owner(&status, &desired));
}
+
+    /// With two dated shards next to the base log path, resolution must return the shard
+    /// carrying the later date.
+    #[test]
+    fn resolve_readable_log_path_prefers_latest_daily_shard() {
+        let td = tempfile::tempdir().unwrap();
+        let base_path = td.path().join("workload__Deployment__demo.log");
+        // Only dated shards are written; the undated base file is never created.
+        for (shard_name, contents) in [
+            ("workload__Deployment__demo.2026-06-19.log", "old\n"),
+            ("workload__Deployment__demo.2026-06-20.log", "new\n"),
+        ] {
+            std::fs::write(td.path().join(shard_name), contents).unwrap();
+        }
+        let resolved = resolve_readable_log_path(&base_path).unwrap();
+        let resolved_name = resolved.file_name().and_then(|name| name.to_str());
+        assert_eq!(
+            resolved_name,
+            Some("workload__Deployment__demo.2026-06-20.log")
+        );
+    }
}
diff --git a/fluxon_rs/fluxon_util/build.rs b/fluxon_rs/fluxon_util/build.rs
index 0f586d3..2bf7b87 100644
--- a/fluxon_rs/fluxon_util/build.rs
+++ b/fluxon_rs/fluxon_util/build.rs
@@ -88,12 +88,15 @@ fn collect_crates_for_runtime(ws: &CargoWorkspace) {
println!("cargo:rerun-if-changed=Cargo.toml");
}
-fn try_discover_git_dir(manifest_dir: &Path) -> Option {
+fn try_discover_git_dir(manifest_dir: &Path, workspace_root: &Path) -> Option {
+ let workspace_search_ceiling = workspace_root.parent().unwrap_or(workspace_root);
let mut cur = Some(manifest_dir);
while let Some(dir) = cur {
let candidate = dir.join(".git");
if candidate.is_dir() {
- return Some(candidate);
+ if candidate.join("HEAD").is_file() {
+ return Some(candidate);
+ }
}
if candidate.is_file() {
// Worktree/submodule style: .git is a file containing `gitdir: `
@@ -106,11 +109,17 @@ fn try_discover_git_dir(manifest_dir: &Path) -> Option {
.unwrap_or_else(|| panic!("invalid .git file format: {}", candidate.display()))
.trim();
let gitdir_path = Path::new(gitdir);
- return Some(if gitdir_path.is_absolute() {
+ let resolved = if gitdir_path.is_absolute() {
gitdir_path.to_path_buf()
} else {
dir.join(gitdir_path)
- });
+ };
+ if resolved.join("HEAD").is_file() {
+ return Some(resolved);
+ }
+ }
+ if dir == workspace_search_ceiling {
+ break;
}
cur = dir.parent();
}
@@ -309,7 +318,7 @@ fn main() {
v
}
Err(_) => {
- match try_discover_git_dir(&manifest_dir) {
+ match try_discover_git_dir(&manifest_dir, &ws.workspace_root) {
Some(git_dir) => {
emit_rerun_hints(&git_dir);
resolve_head_commit_id(&git_dir)
diff --git a/fluxon_rs/fluxon_util/src/lib.rs b/fluxon_rs/fluxon_util/src/lib.rs
index 2f4f9fa..e575a75 100644
--- a/fluxon_rs/fluxon_util/src/lib.rs
+++ b/fluxon_rs/fluxon_util/src/lib.rs
@@ -36,7 +36,12 @@ pub mod limitrate;
// PyO3 helpers: run long-time Python call without holding GIL in caller thread.
pub mod pyo3;
// Re-export for stable public API: existing call sites can keep using `fluxon_util::init_log`.
-pub use log::{current_log_file_path, init_log, init_log_test, init_log_with_extra_layer};
+pub use log::{
+ current_daily_sharded_log_path, current_log_file_path, daily_sharded_log_path,
+ display_runtime_log_path, init_log, init_log_test, init_log_with_extra_layer,
+ latest_existing_daily_sharded_log_path, resolve_readable_log_path,
+ DEFAULT_DAILY_LOG_RETENTION_DAYS,
+};
#[cfg(test)]
mod test_util_test;
diff --git a/fluxon_rs/fluxon_util/src/log.rs b/fluxon_rs/fluxon_util/src/log.rs
index db3d88f..648650f 100644
--- a/fluxon_rs/fluxon_util/src/log.rs
+++ b/fluxon_rs/fluxon_util/src/log.rs
@@ -3,6 +3,7 @@ use std::io;
use std::path::{Path, PathBuf};
use std::sync::OnceLock;
+use parking_lot::Mutex;
use tracing_appender::non_blocking;
use tracing_appender::non_blocking::WorkerGuard;
use tracing_subscriber::EnvFilter;
@@ -20,6 +21,9 @@ mod generated_crates {
// RPC fast-path traffic actually entered the closed transfer / verbs backend. Keep the scope explicit:
// only these dependency targets are promoted to DEBUG alongside workspace crates.
const RDMA_DEBUG_TARGETS: &[&str] = &["fabric_lib", "libfabric_sys", "libibverbs_sys"];
+const LOG_RETENTION_DAYS: usize = 31;
+const TEST_LOG_SHARD_WINDOW_SECONDS_ENV: &str = "FLUXON_TEST_LOG_SHARD_WINDOW_SECONDS";
+const TEST_LOG_SHARD_ANCHOR_UNIX_SECONDS_ENV: &str = "FLUXON_TEST_LOG_SHARD_ANCHOR_UNIX_SECONDS";
// Simple UTC timer in RFC3339 seconds (no subsecond precision)
struct UtcSecondTimer;
@@ -37,6 +41,191 @@ static GLOBAL_CONSOLE_LOG_GUARD: OnceLock = OnceLock::new();
// Expose the current process log file path for sidecar collectors (e.g. OTLP tailer).
static GLOBAL_LOG_FILE_PATH: OnceLock = OnceLock::new();
+pub const DEFAULT_DAILY_LOG_RETENTION_DAYS: usize = LOG_RETENTION_DAYS;
+
+/// Pair of integer settings describing a log shard window.
+///
+/// NOTE(review): presumably filled from the `FLUXON_TEST_LOG_SHARD_WINDOW_SECONDS` and
+/// `FLUXON_TEST_LOG_SHARD_ANCHOR_UNIX_SECONDS` environment variables by
+/// `read_test_log_shard_window_config` -- confirm against that function's body.
+#[derive(Clone, Copy, Debug)]
+struct LogShardWindowConfig {
+    /// Window length in seconds (unit taken from the field name; valid range not shown here).
+ window_seconds: i64,
+    /// Anchor instant as Unix seconds (meaning taken from the field name -- TODO confirm).
+ anchor_unix_seconds: i64,
+}
+
+fn read_test_log_shard_window_config() -> anyhow::Result