diff --git a/.github/workflows/all_test.yml b/.github/workflows/all_test.yml index 9e729ef..7c001c4 100644 --- a/.github/workflows/all_test.yml +++ b/.github/workflows/all_test.yml @@ -65,6 +65,7 @@ jobs: # Scene selection: # - ci_top_attention_doc_page_build validates doc build through the prebuilt Docker image. # - ci_top_attention_bin_kvtest keeps the Rust kv_test entry under the testbed scene contract. + # - ci_top_attention_log_mgmt keeps log rolling/sharding coverage under the same CI testbed contract. # - ci_top_attention_mq_core keeps MQ correctness coverage inside the same CI testbed contract. suite["scenes"] = { key: value @@ -72,7 +73,9 @@ jobs: if key in ( "ci_top_attention_doc_page_build", "ci_top_attention_bin_kvtest", + "ci_top_attention_log_mgmt", "ci_top_attention_mq_core", + "ci_top_attention_log_mgmt", ) } @@ -91,11 +94,14 @@ jobs: suite["profiles"]["fluxon_tcp"]["runtime"]["ci"]["scene_configs"]["ci_top_attention_doc_page_build"]["doc_site_base_url"] = ( "${{ github.repository_owner }}.github.io/${{ github.event.repository.name }}" ) + suite["profiles"]["fluxon_tcp"]["runtime"]["ci"]["scene_configs"]["ci_top_attention_log_mgmt"]["enabled"] = True + suite["profiles"]["fluxon_tcp"]["runtime"]["ci"]["scene_configs"]["ci_top_attention_mq_core"] = {} # Scale selection: # - Keep the original per-scene scales from ci_test_list.yaml. # - ci_top_attention_doc_page_build stays on n1_kvowner_dram_3gib. # - ci_top_attention_bin_kvtest stays on n1_kvowner_dram_20gib. + # - ci_top_attention_log_mgmt stays on n1_kvowner_dram_20gib. # - ci_top_attention_mq_core stays on n1_kvowner_dram_20gib. 
out_path.write_text( diff --git a/deployment/deployconf.yaml b/deployment/deployconf.yaml index b0c67b2..3000ffe 100644 --- a/deployment/deployconf.yaml +++ b/deployment/deployconf.yaml @@ -81,13 +81,9 @@ global_envs: # printf "[global]\nindex-url = https://pypi.tuna.tsinghua.edu.cn/simple\n" > ~/.pip/pip.conf; FLUXON_CLUSTER_NAME: "fluxon-example-cluster" FLUXON_SHARED_MEM: "${HOSTWORKDIR}/shm1" - FLUXON_SHARED_FILE: "${HOSTWORKDIR}/shm1_files" FLUXON_SHARED_MEM2: "${HOSTWORKDIR}/shm2" - FLUXON_SHARED_FILE2: "${HOSTWORKDIR}/shm2_files" FLUXON_SHARED_MEM_RELAY: "${HOSTWORKDIR}/shm3" - FLUXON_SHARED_FILE_RELAY: "${HOSTWORKDIR}/shm3_files" FLUXON_SHARED_MEM_RELAY2: "${HOSTWORKDIR}/shm4" - FLUXON_SHARED_FILE_RELAY2: "${HOSTWORKDIR}/shm4_files" # Enable pprof by setting seconds > 0; empty means disabled. FLUXON_PPROF_DURATION_SECONDS: "" @@ -328,8 +324,7 @@ service: etcd_addresses: - "${ETCD_FULL_ADDRESS}" cluster_name: "${FLUXON_CLUSTER_NAME}" - shared_memory_path: "${FLUXON_SHARED_MEM}" - shared_file_path: "${FLUXON_SHARED_FILE}" + share_mem_path: "${FLUXON_SHARED_MEM}" # redis_compat: # listen_addr: "0.0.0.0:16379" sub_cluster: "producer_side" @@ -375,8 +370,7 @@ service: etcd_addresses: - "${ETCD_FULL_ADDRESS}" cluster_name: "${FLUXON_CLUSTER_NAME}" - shared_memory_path: "${FLUXON_SHARED_MEM2}" - shared_file_path: "${FLUXON_SHARED_FILE2}" + share_mem_path: "${FLUXON_SHARED_MEM2}" sub_cluster: "consumer_side" YAML # export UCX_LOG_LEVEL=info @@ -397,15 +391,14 @@ service: cd "${FLUXON_SRC_ROOT}/examples" WORKDIR="./fluxon_work/fluxon_fs_master_work" - mkdir -p "${WORKDIR}" "${FLUXON_SHARED_MEM}" "${FLUXON_SHARED_FILE}" + mkdir -p "${WORKDIR}" "${FLUXON_SHARED_MEM}" cat > "all_config.yaml" < "all_config.yaml" < "all_config.yaml" < str: + template_path = BARE_TEMPLATE_DIR / template_name + if not template_path.is_file(): + raise RuntimeError(f"missing bare deploy template: {template_path}") + return template_path.read_text(encoding="utf-8") + + +def 
_render_bare_template(*, template_name: str, values: Dict[str, str]) -> str: + template = _load_bare_template(template_name=template_name) + + def _replace(match: re.Match[str]) -> str: + key = match.group(1) + if key not in values: + raise RuntimeError(f"missing bare deploy template value: template={template_name} key={key}") + value = values[key] + if not isinstance(value, str): + raise ValueError(f"bare deploy template value must be a string: template={template_name} key={key}") + return value + + return _TEMPLATE_TOKEN_RE.sub(_replace, template) def _resolve_repo_root_cli_path(*, raw_path: Path, field_name: str) -> Path: @@ -89,6 +116,10 @@ def main() -> None: outdir / PYTHON_SELECTION_SUPERVISOR_FILENAME, render_python_selection_supervisor_module(timeouts=STOP_TIMEOUTS), ) + (outdir / LOG_SHARD_HELPER_FILENAME).write_text( + render_log_shard_module_source(), + encoding="utf-8", + ) name_prefix = _require_str(cfg.get("name_prefix"), "name_prefix") cluster_nodes_raw = _require_list(cfg.get("cluster_nodes"), "cluster_nodes") @@ -306,12 +337,12 @@ def _bare_entrypoint_script_name(*, workload_name: str) -> str: def _render_bare_entrypoint_script(*, service_name: str, entrypoint: str) -> str: - return ( - "#!/usr/bin/env bash\n" - "set -euo pipefail\n\n" - f"export SERVICE={_sh_quote(service_name)}\n" - + entrypoint.strip() - + "\n" + return _render_bare_template( + template_name="bare_entrypoint.sh.tmpl", + values={ + "SERVICE_EXPORT": _sh_quote(service_name), + "ENTRYPOINT": entrypoint.strip(), + }, ) @@ -353,29 +384,25 @@ def _render_standalone_start_script( service_cfg: Dict[str, Any], ) -> str: allowed_nodes = _extract_nodes(service_cfg) - service_port = _extract_port(service_cfg) - port_export = "" - if service_port is not None: - port_export = f"export {service_name.upper()}__PORT={_sh_quote(str(service_port))}\n" - return ( - "#!/usr/bin/env bash\n" - "set -euo pipefail\n\n" - f"SERVICE={_sh_quote(service_name)}\n" - f"NAME_PREFIX={_sh_quote(name_prefix)}\n" 
- + _render_nodes_bash(name="ALLOWED_NODES", nodes=allowed_nodes) - + _render_host_prelude(cluster_nodes=cluster_nodes) - + _render_common_node_resolution_tail(service_name=service_name) - + _render_selection_supervisor_path_from_script_dir() - + _render_proc_lifecycle_pid_tree_helpers() - + _render_tcp_ready_helpers() - + _render_selection_present_probe_fn() - + _render_start_lock_block() - + _render_global_env_exports(global_envs) - + port_export - + _render_standalone_start_body( - name_prefix=name_prefix, - service_name=service_name, - ) + return _render_bare_template( + template_name="standalone_start.sh.tmpl", + values={ + "SERVICE_ASSIGN": _sh_quote(service_name), + "NAME_PREFIX_ASSIGN": _sh_quote(name_prefix), + "ALLOWED_NODES_BLOCK": _render_nodes_bash(name="ALLOWED_NODES", nodes=allowed_nodes), + "HOST_PRELUDE": _render_host_prelude(cluster_nodes=cluster_nodes), + "COMMON_NODE_RESOLUTION_TAIL": _render_common_node_resolution_tail(service_name=service_name), + "SELECTION_SUPERVISOR_PATH_BLOCK": _render_selection_supervisor_path_from_script_dir(), + "PROC_LIFECYCLE_HELPERS": _render_proc_lifecycle_pid_tree_helpers(), + "SELECTION_PRESENT_PROBE_FN": _render_selection_present_probe_fn(), + "START_LOCK_BLOCK": _render_start_lock_block(), + "GLOBAL_ENV_EXPORTS": _render_global_env_exports(global_envs), + "PORT_EXPORT": _render_service_port_export(service_name=service_name, service_cfg=service_cfg), + "START_BODY": _render_standalone_start_body( + name_prefix=name_prefix, + service_name=service_name, + ), + }, ) @@ -387,25 +414,19 @@ def _render_standalone_stop_script( service_cfg: Dict[str, Any], ) -> str: allowed_nodes = _extract_nodes(service_cfg) - return ( - "#!/usr/bin/env bash\n" - "set -euo pipefail\n\n" - f"SERVICE={_sh_quote(service_name)}\n" - f"NAME_PREFIX={_sh_quote(name_prefix)}\n" - + _render_nodes_bash(name="ALLOWED_NODES", nodes=allowed_nodes) - + _render_host_prelude(cluster_nodes=cluster_nodes) - + 
_render_common_node_resolution_tail(service_name=service_name) - + _render_selection_supervisor_path_from_script_dir() - + f'SUPERVISOR_LABEL={_sh_quote(_bare_plain_selection_supervisor_label(name_prefix=name_prefix, service_name=service_name))}\n' - + "# English note:\n" - + "# - Generated bare stop is retained as a manual operator tool.\n" - + "# - Automation must not depend on this path for handover or rollout convergence.\n" - + "# - The command only asks the shared selection supervisor to retire the concrete selection\n" - + "# identity identified by label on this node.\n" - + 'if ! python3 "$SELECTION_SUPERVISOR" stop --label "$SUPERVISOR_LABEL" --scope-key "$HOSTWORKDIR" --missing-ok >/dev/null; then\n' - + ' echo "[bare] stop failed svc=$SERVICE label=$SUPERVISOR_LABEL hostworkdir=$HOSTWORKDIR"\n' - + " exit 1\n" - + "fi\n" + return _render_bare_template( + template_name="standalone_stop.sh.tmpl", + values={ + "SERVICE_ASSIGN": _sh_quote(service_name), + "NAME_PREFIX_ASSIGN": _sh_quote(name_prefix), + "ALLOWED_NODES_BLOCK": _render_nodes_bash(name="ALLOWED_NODES", nodes=allowed_nodes), + "HOST_PRELUDE": _render_host_prelude(cluster_nodes=cluster_nodes), + "COMMON_NODE_RESOLUTION_TAIL": _render_common_node_resolution_tail(service_name=service_name), + "SELECTION_SUPERVISOR_PATH_BLOCK": _render_selection_supervisor_path_from_script_dir(), + "SUPERVISOR_LABEL_ASSIGN": _sh_quote( + _bare_plain_selection_supervisor_label(name_prefix=name_prefix, service_name=service_name) + ), + }, ) @@ -429,20 +450,19 @@ def _render_atomic_group_start_script( service_cfg=service_cfg, ) ) - return ( - "#!/usr/bin/env bash\n" - "set -euo pipefail\n\n" - f"GROUP={_sh_quote(group_name)}\n" - f"NAME_PREFIX={_sh_quote(name_prefix)}\n" - + _render_host_prelude(cluster_nodes=cluster_nodes) - + _render_atomic_group_node_resolution_tail(group_cfg["nodes"]) - + _render_selection_supervisor_path_from_script_dir() - + _render_proc_lifecycle_pid_tree_helpers() - + _render_tcp_ready_helpers() 
- + _render_global_env_exports(global_envs) - + f"GROUP_STARTUP_DEADLINE_TS=$(( $(date +%s) + {ATOMIC_GROUP_STARTUP_DEADLINE_SECONDS} ))\n" - + "".join(service_blocks) - + 'echo "[atomic-group] ready group=$GROUP node=$NODE_ID"\n' + return _render_bare_template( + template_name="atomic_group_start.sh.tmpl", + values={ + "GROUP_ASSIGN": _sh_quote(group_name), + "NAME_PREFIX_ASSIGN": _sh_quote(name_prefix), + "HOST_PRELUDE": _render_host_prelude(cluster_nodes=cluster_nodes), + "ATOMIC_GROUP_NODE_RESOLUTION_TAIL": _render_atomic_group_node_resolution_tail(group_cfg["nodes"]), + "SELECTION_SUPERVISOR_PATH_BLOCK": _render_selection_supervisor_path_from_script_dir(), + "PROC_LIFECYCLE_HELPERS": _render_proc_lifecycle_pid_tree_helpers(), + "GLOBAL_ENV_EXPORTS": _render_global_env_exports(global_envs), + "GROUP_STARTUP_DEADLINE_ASSIGN": str(ATOMIC_GROUP_STARTUP_DEADLINE_SECONDS), + "SERVICE_BLOCKS": "".join(service_blocks), + }, ) @@ -454,276 +474,105 @@ def _render_atomic_group_stop_script( group_cfg: Dict[str, Any], ) -> str: stop_services = list(reversed(group_cfg["services"])) - return ( - "#!/usr/bin/env bash\n" - "set -u -o pipefail\n\n" - f"GROUP={_sh_quote(group_name)}\n" - f"NAME_PREFIX={_sh_quote(name_prefix)}\n" - + _render_host_prelude(cluster_nodes=cluster_nodes) - + _render_atomic_group_node_resolution_tail(group_cfg["nodes"]) - + _render_selection_supervisor_path_from_script_dir() - + _render_atomic_group_stop_fn( - runtime_specs=[ - { - "service_name": service_name, - "supervisor_label": _bare_atomic_group_member_selection_supervisor_label( - name_prefix=name_prefix, - group_name=group_name, - service_name=service_name, - ), - } - for service_name in stop_services - ], - ) - + "stop_group\n" + return _render_bare_template( + template_name="atomic_group_stop.sh.tmpl", + values={ + "GROUP_ASSIGN": _sh_quote(group_name), + "NAME_PREFIX_ASSIGN": _sh_quote(name_prefix), + "HOST_PRELUDE": _render_host_prelude(cluster_nodes=cluster_nodes), + 
"ATOMIC_GROUP_NODE_RESOLUTION_TAIL": _render_atomic_group_node_resolution_tail(group_cfg["nodes"]), + "SELECTION_SUPERVISOR_PATH_BLOCK": _render_selection_supervisor_path_from_script_dir(), + "ATOMIC_GROUP_STOP_FN": _render_atomic_group_stop_fn( + runtime_specs=[ + { + "service_name": service_name, + "supervisor_label": _bare_atomic_group_member_selection_supervisor_label( + name_prefix=name_prefix, + group_name=group_name, + service_name=service_name, + ), + } + for service_name in stop_services + ], + ), + }, ) def _render_host_prelude(*, cluster_nodes: List[Dict[str, Any]]) -> str: all_nodes = [_require_str(node.get("hostname"), "cluster_nodes[].hostname") for node in cluster_nodes] - out = _render_nodes_bash(name="ALL_NODES", nodes=all_nodes) - out += "\nLOCAL_HOSTNAME=$(hostname -s 2>/dev/null || hostname 2>/dev/null || echo unknown)\n" - out += 'LOCAL_FQDN=$(hostname -f 2>/dev/null || echo "$LOCAL_HOSTNAME")\n' - out += 'NODE_ID="${NODE_ID:-}"\n' - out += 'if [ -n "$NODE_ID" ]; then\n' - out += ' _node_id_known=false\n' - out += ' for n in "${ALL_NODES[@]}"; do\n' - out += ' if [ "$n" = "$NODE_ID" ]; then\n' - out += ' _node_id_known=true\n' - out += " break\n" - out += " fi\n" - out += " done\n" - out += ' if [ "$_node_id_known" != true ]; then\n' - out += ' echo "Unknown preset NODE_ID: $NODE_ID"\n' - out += f' echo "Known nodes: {" ".join(all_nodes)}"\n' - out += " exit 1\n" - out += " fi\n" - out += "fi\n" - out += 'if [ -z "$NODE_ID" ]; then\n' - out += 'for n in "${ALL_NODES[@]}"; do\n' - out += ' if [ "$n" = "$LOCAL_HOSTNAME" ] || [ "$n" = "$LOCAL_FQDN" ]; then\n' - out += ' NODE_ID="$n"\n' - out += " break\n" - out += " fi\n" - out += "done\n" - out += "fi\n" - out += 'if [ -z "$NODE_ID" ] && [ ${#ALL_NODES[@]} -eq 1 ]; then\n' - out += ' NODE_ID="${ALL_NODES[0]}"\n' - out += "fi\n" - out += 'if [ -z "$NODE_ID" ]; then\n' - out += ' for ip in $(hostname -I 2>/dev/null); do\n' - out += ' for n in "${ALL_NODES[@]}"; do\n' - out += ' _ip_n=""\n' - out += 
' case "$n" in\n' - for node in cluster_nodes: - node_name = _require_str(node.get("hostname"), "cluster_nodes[].hostname") - node_ip = _require_str(node.get("ip"), f"cluster_nodes[{node_name}].ip") - out += f" {_sh_quote(node_name)}) _ip_n={_sh_quote(node_ip)};;\n" - out += ' *) _ip_n="";;\n' - out += " esac\n" - out += ' if [ "$_ip_n" = "$ip" ]; then\n' - out += ' NODE_ID="$n"\n' - out += " break\n" - out += " fi\n" - out += " done\n" - out += ' [ -n "$NODE_ID" ] && break\n' - out += " done\n" - out += "fi\n" - out += 'if [ -z "$NODE_ID" ]; then\n' - out += ' echo "Cannot map host to a configured node. Hostname=$LOCAL_HOSTNAME FQDN=$LOCAL_FQDN IPs=$(hostname -I 2>/dev/null)"\n' - out += f' echo "Known nodes: {" ".join(all_nodes)}"\n' - out += " exit 1\n" - out += "fi\n\n" - out += 'HOST_IP=""\nHOSTWORKDIR=""\ncase "$NODE_ID" in\n' + ip_case_lines: list[str] = [] + host_case_lines: list[str] = [] for node in cluster_nodes: node_name = _require_str(node.get("hostname"), "cluster_nodes[].hostname") node_ip = _require_str(node.get("ip"), f"cluster_nodes[{node_name}].ip") hostworkdir = _require_str(node.get("hostworkdir"), f"cluster_nodes[{node_name}].hostworkdir") - out += f" {_sh_quote(node_name)}) HOST_IP={_sh_quote(node_ip)}; HOSTWORKDIR={_sh_quote(hostworkdir)};;\n" - out += ' *) echo "Unknown NODE_ID: $NODE_ID"; exit 1;;\n' - out += "esac\n" - return out + ip_case_lines.append(f" {_sh_quote(node_name)}) _ip_n={_sh_quote(node_ip)};;") + host_case_lines.append( + f" {_sh_quote(node_name)}) HOST_IP={_sh_quote(node_ip)}; HOSTWORKDIR={_sh_quote(hostworkdir)};;" + ) + return _render_bare_template( + template_name="host_prelude.sh.tmpl", + values={ + "ALL_NODES_BLOCK": _render_nodes_bash(name="ALL_NODES", nodes=all_nodes), + "KNOWN_NODES": " ".join(all_nodes), + "IP_CASE_LINES": "\n".join(ip_case_lines), + "HOST_CASE_LINES": "\n".join(host_case_lines), + }, + ) def _render_common_node_resolution_tail(*, service_name: str) -> str: - return ( - 'if [ ${#ALLOWED_NODES[@]} 
-gt 0 ]; then\n' - + ' _ok=false\n' - + ' for n in "${ALLOWED_NODES[@]}"; do\n' - + ' if [ "$n" = "$NODE_ID" ]; then _ok=true; fi\n' - + " done\n" - + ' if [ "$_ok" != true ]; then\n' - + f' echo "Service {service_name} not scheduled on this node ($NODE_ID). Allowed: ${{ALLOWED_NODES[*]}}"\n' - + " exit 0\n" - + " fi\n" - + "fi\n\n" - + 'export NODE_ID="$NODE_ID"\n' - + 'export HOST_IP="$HOST_IP"\n' - + 'export HOSTWORKDIR="$HOSTWORKDIR"\n\n' + return _render_bare_template( + template_name="common_node_resolution_tail.sh.tmpl", + values={"SERVICE_NAME": service_name}, ) def _render_atomic_group_node_resolution_tail(allowed_nodes: List[str]) -> str: - return ( - _render_nodes_bash(name="GROUP_NODES", nodes=allowed_nodes) - + 'scheduled=false\n' - + 'for n in "${GROUP_NODES[@]}"; do\n' - + ' if [ "$n" = "$NODE_ID" ]; then scheduled=true; fi\n' - + "done\n" - + 'if [ "$scheduled" != true ]; then\n' - + ' echo "[atomic-group] skip group=$GROUP node=$NODE_ID allowed=${GROUP_NODES[*]}"\n' - + " exit 0\n" - + "fi\n\n" - + 'export NODE_ID="$NODE_ID"\n' - + 'export HOST_IP="$HOST_IP"\n' - + 'export HOSTWORKDIR="$HOSTWORKDIR"\n' - + 'echo "[atomic-group] group=$GROUP node=$NODE_ID hostworkdir=$HOSTWORKDIR"\n\n' + return _render_bare_template( + template_name="atomic_group_node_resolution_tail.sh.tmpl", + values={"GROUP_NODES_BLOCK": _render_nodes_bash(name="GROUP_NODES", nodes=allowed_nodes)}, ) def _render_start_lock_block() -> str: - return ( - 'PID_DIR="$HOSTWORKDIR/run"\n' - + 'mkdir -p "$PID_DIR"\n' - + 'START_LOCKFILE="$PID_DIR/${SERVICE}.start.lock"\n' - + 'if ! command -v flock >/dev/null 2>&1; then\n' - + ' echo "Missing required command: flock"\n' - + " exit 1\n" - + "fi\n" - + 'exec 9>"$START_LOCKFILE"\n' - + 'if ! 
flock -xn 9; then\n' - + ' echo "[bare] start skipped svc=$SERVICE reason=another start is already running lockfile=$START_LOCKFILE"\n' - + " exit 0\n" - + "fi\n" - + 'exec 9>&-\n\n' - ) + return _load_bare_template(template_name="start_lock_block.sh.tmpl") def _render_proc_lifecycle_pid_tree_helpers() -> str: return render_bash_proc_lifecycle_funcs_pid_tree(timeouts=STOP_TIMEOUTS) + "\n\n" -def _render_tcp_ready_helpers() -> str: - return ( - "wait_service_tcp_ready() {\n" - + ' svc="$1"\n' - + ' host="$2"\n' - + ' port="$3"\n' - + ' stable_seconds="$4"\n' - + ' deadline_ts="$5"\n' - + ' context="$6"\n' - + ' if [[ ! "$port" =~ ^[0-9]+$ ]]; then\n' - + ' echo "$context tcp-ready: invalid port svc=$svc port=$port"\n' - + " return 1\n" - + " fi\n" - + ' if [[ ! "$stable_seconds" =~ ^[0-9]+$ ]] || [ "$stable_seconds" -le 0 ]; then\n' - + ' echo "$context tcp-ready: invalid stable_seconds svc=$svc stable_seconds=$stable_seconds"\n' - + " return 1\n" - + " fi\n" - + f" poll_interval_seconds={TCP_READY_POLL_INTERVAL_SECONDS}\n" - + ' stable_checks=$(python3 - "$stable_seconds" "$poll_interval_seconds" <<\'__FLUXON_TCP_READY_CHECKS__\'\n' - + "import math\n" - + "import sys\n" - + "stable_seconds = float(sys.argv[1])\n" - + "poll_interval_seconds = float(sys.argv[2])\n" - + "print(max(1, int(math.ceil(stable_seconds / poll_interval_seconds))))\n" - + "__FLUXON_TCP_READY_CHECKS__\n" - + ")\n" - + ' if [[ ! 
"$stable_checks" =~ ^[0-9]+$ ]] || [ "$stable_checks" -le 0 ]; then\n' - + ' echo "$context tcp-ready: failed to compute stable_checks svc=$svc"\n' - + " return 1\n" - + " fi\n" - + " ok_checks=0\n" - + " while true; do\n" - + ' now=$(date +%s)\n' - + ' if [ "$now" -ge "$deadline_ts" ]; then\n' - + ' echo "$context tcp-ready: deadline exceeded svc=$svc host=$host port=$port"\n' - + " return 1\n" - + " fi\n" - + ' if python3 - "$host" "$port" <<\'__FLUXON_TCP_READY_PROBE__\'\n' - + "import socket\n" - + "import sys\n" - + "host = sys.argv[1]\n" - + "port = int(sys.argv[2])\n" - + "with socket.create_connection((host, port), timeout=1.0):\n" - + " pass\n" - + "__FLUXON_TCP_READY_PROBE__\n" - + " then\n" - + " ok_checks=$((ok_checks+1))\n" - + ' if [ "$ok_checks" -ge "$stable_checks" ]; then\n' - + ' echo "$context tcp-ready: ok svc=$svc host=$host port=$port stable_checks=$stable_checks"\n' - + " return 0\n" - + " fi\n" - + " else\n" - + ' if [ "$ok_checks" -ne 0 ]; then\n' - + ' echo "$context tcp-ready: reset svc=$svc ok_checks=$ok_checks host=$host port=$port"\n' - + " fi\n" - + " ok_checks=0\n" - + " fi\n" - + ' sleep "$poll_interval_seconds"\n' - + " done\n" - + "}\n\n" - ) - - def _render_selection_present_probe_fn() -> str: - return ( - "selection_present() {\n" - + " python3 - \"$SELECTION_SUPERVISOR\" \"$SUPERVISOR_LABEL\" \"$HOSTWORKDIR\" <<'__FLUXON_SELECTION_PRESENT__'\n" - + "import importlib.util\n" - + "import sys\n" - + "from pathlib import Path\n" - + "\n" - + "supervisor_path = Path(sys.argv[1])\n" - + "label = sys.argv[2]\n" - + "scope_key = sys.argv[3]\n" - + 'spec = importlib.util.spec_from_file_location("fluxon_selection_supervisor_probe", supervisor_path)\n' - + "if spec is None or spec.loader is None:\n" - + ' raise RuntimeError(f"failed to load selection supervisor module: {supervisor_path}")\n' - + "module = importlib.util.module_from_spec(spec)\n" - + "sys.modules[spec.name] = module\n" - + "spec.loader.exec_module(module)\n" - + "raise 
SystemExit(0 if module._selection_present(label, scope_key=scope_key) else 1)\n" - + "__FLUXON_SELECTION_PRESENT__\n" - + "}\n\n" - ) + return _load_bare_template(template_name="selection_present_probe_fn.sh.tmpl") def _render_selection_supervisor_launch_wait_block( *, run_cmd: str, - logfile_expr: str, stable_seconds_expr: str, - deadline_ts_expr: str, + deadline_seconds_expr: str, context: str, ) -> str: - return ( - 'SUPERVISOR_PID=$( ' - + run_cmd - + f' >>{logfile_expr} 2>&1 < /dev/null & echo "$!" )\n' - + 'if [[ ! "$SUPERVISOR_PID" =~ ^[0-9]+$ ]]; then\n' - + f' echo "{context} launch failed svc=$SERVICE label=$SUPERVISOR_LABEL supervisor_pid=$SUPERVISOR_PID"\n' - + " exit 1\n" - + "fi\n" - + 'if ! wait_service_probably_ready_pid_tree "$SERVICE" "$SUPERVISOR_PID" ' - + stable_seconds_expr - + " " - + deadline_ts_expr - + f' "{context}"; then\n' - + f' echo "{context} probable-ready failed svc=$SERVICE label=$SUPERVISOR_LABEL supervisor_pid=$SUPERVISOR_PID"\n' - + " exit 1\n" - + "fi\n" + return _render_bare_template( + template_name="selection_supervisor_launch_wait_block.sh.tmpl", + values={ + "RUN_CMD": run_cmd, + "STABLE_SECONDS_EXPR": stable_seconds_expr, + "DEADLINE_SECONDS_EXPR": deadline_seconds_expr, + "CONTEXT": context, + }, ) -def _render_tcp_ready_wait_block(*, context: str) -> str: +def _render_service_port_export(*, service_name: str, service_cfg: Dict[str, Any], indent: str = "") -> str: + service_port = _extract_port(service_cfg) + if service_port is None: + return indent + "unset SERVICE_PORT\n" return ( - 'if [[ "${SERVICE_PORT:-}" =~ ^[0-9]+$ ]]; then\n' - + f' if ! 
wait_service_tcp_ready "$SERVICE" "$HOST_IP" "$SERVICE_PORT" {TCP_READY_STABLE_SECONDS} "$STARTUP_DEADLINE_TS" "{context}"; then\n' - + f' echo "{context} tcp-ready failed svc=$SERVICE host=$HOST_IP port=$SERVICE_PORT"\n' - + " exit 1\n" - + " fi\n" - + "fi\n" + indent + f"export {service_name.upper()}__PORT={_sh_quote(str(service_port))}\n" + + indent + f"export SERVICE_PORT={_sh_quote(str(service_port))}\n" ) @@ -759,54 +608,28 @@ def _render_standalone_start_body(*, name_prefix: str, service_name: str) -> str crashloop_interval_lt_seconds=0, child_command=child_command, ) - return ( - f'SUPERVISOR_LABEL={_sh_quote(_bare_plain_selection_supervisor_label(name_prefix=name_prefix, service_name=service_name))}\n' - + f'RUNTIME_STATE_JSON={_sh_quote(runtime_state_json)}\n' - + 'OWNER_TS_MS=$(python3 -c \'import time; print(int(time.time() * 1000))\')\n' - + f"STARTUP_DEADLINE_TS=$(( $(date +%s) + {STANDALONE_STARTUP_DEADLINE_SECONDS} ))\n" - + 'LOG_DIR="$HOSTWORKDIR/log"\n' - + 'LOGFILE="$LOG_DIR/${SERVICE}.log"\n' - + 'mkdir -p "$LOG_DIR"\n' - + 'touch "$LOGFILE"\n' - + 'echo "Starting $SERVICE on $NODE_ID (IP: $HOST_IP, workdir: $HOSTWORKDIR)"\n' - + "# English note:\n" - + "# - bootstrap bare start must be idempotent when the shared selection supervisor already owns\n" - + "# a live child for the same label.\n" - + "# - start_test_bed enables this path only for deployconf.bootstrap_bare_services.\n" - + 'if [ "${FLUXON_BARE_ALLOW_ALREADY_PRESENT:-false}" = "true" ]; then\n' - + " if selection_present; then\n" - + ' echo "[bare] already present svc=$SERVICE label=$SUPERVISOR_LABEL"\n' - + ' echo "Started $SERVICE (label: $SUPERVISOR_LABEL)"\n' - + ' echo "Logs: $LOGFILE"\n' - + " exit 0\n" - + " fi\n" - + "fi\n" - + "# English note:\n" - + "# - Bare start must not depend on extra supervisor observation subcommands because the shared\n" - + "# runtime surface is intentionally reduced to run/stop.\n" - + "# - We therefore launch the detached supervisor and wait until 
its pid subtree keeps a live child\n" - + "# process for a short stable window.\n" - + _render_selection_supervisor_launch_wait_block( - run_cmd=run_cmd, - logfile_expr='"$LOGFILE"', - stable_seconds_expr=str(STANDALONE_PROBABLE_READY_SECONDS), - deadline_ts_expr='"$STARTUP_DEADLINE_TS"', - context="[bare]", - ) - + _render_tcp_ready_wait_block(context="[bare]") - + 'echo "Started $SERVICE (label: $SUPERVISOR_LABEL)"\n' - + 'echo "Logs: $LOGFILE"\n' + return _render_bare_template( + template_name="standalone_start_body.sh.tmpl", + values={ + "SUPERVISOR_LABEL_ASSIGN": _sh_quote( + _bare_plain_selection_supervisor_label(name_prefix=name_prefix, service_name=service_name) + ), + "RUNTIME_STATE_JSON_ASSIGN": _sh_quote(runtime_state_json), + "STARTUP_DEADLINE_SECONDS": str(STANDALONE_STARTUP_DEADLINE_SECONDS), + "SELECTION_SUPERVISOR_LAUNCH_WAIT_BLOCK": _render_selection_supervisor_launch_wait_block( + run_cmd=run_cmd, + stable_seconds_expr=str(STANDALONE_PROBABLE_READY_SECONDS), + deadline_seconds_expr=str(STANDALONE_STARTUP_DEADLINE_SECONDS), + context="[bare]", + ), + }, ) def _render_selection_supervisor_path_from_script_dir() -> str: - return ( - 'DIR=$(cd "$(dirname "$0")" && pwd)\n' - + f'SELECTION_SUPERVISOR="$DIR/{PYTHON_SELECTION_SUPERVISOR_FILENAME}"\n' - + 'if [ ! 
-f "$SELECTION_SUPERVISOR" ]; then\n' - + ' echo "Missing selection supervisor: $SELECTION_SUPERVISOR"\n' - + " exit 1\n" - + "fi\n\n" + return _render_bare_template( + template_name="selection_supervisor_path_from_script_dir.sh.tmpl", + values={"SELECTION_SUPERVISOR_FILENAME": PYTHON_SELECTION_SUPERVISOR_FILENAME}, ) @@ -833,10 +656,6 @@ def _render_atomic_group_service_block( log_path=f"${{HOSTWORKDIR}}/log/{service_name}.log", ) allowed_nodes = _extract_nodes(service_cfg) - service_port = _extract_port(service_cfg) - port_export = "" - if service_port is not None: - port_export = f" export {service_name.upper()}__PORT={_sh_quote(str(service_port))}\n" run_cmd = _render_selection_supervisor_run_shell( subcommand="run", supervisor_expr='"$SELECTION_SUPERVISOR"', @@ -850,54 +669,37 @@ def _render_atomic_group_service_block( crashloop_interval_lt_seconds=ATOMIC_GROUP_CRASHLOOP_INTERVAL_LT_SECONDS, child_command=child_command, ) - return ( - f"\n# rollout: {service_name}\n" - + _render_nodes_bash(name="ALLOWED_NODES", nodes=allowed_nodes) - + "scheduled=false\n" - + 'for n in "${ALLOWED_NODES[@]}"; do\n' - + ' if [ "$n" = "$NODE_ID" ]; then scheduled=true; fi\n' - + "done\n" - + 'if [ "$scheduled" != true ]; then\n' - + f' echo "[rollout] skip {service_name}: not scheduled on node $NODE_ID"\n' - + "else\n" - + f" export SERVICE={_sh_quote(service_name)}\n" - + port_export - + ' LOG_DIR="$HOSTWORKDIR/log"\n' - + ' mkdir -p "$LOG_DIR"\n' - + f' SUPERVISOR_LABEL={_sh_quote(_bare_atomic_group_member_selection_supervisor_label(name_prefix=name_prefix, group_name=group_name, service_name=service_name))}\n' - + f' RUNTIME_STATE_JSON={_sh_quote(runtime_state_json)}\n' - + ' OWNER_TS_MS=$(python3 -c \'import time; print(int(time.time() * 1000))\')\n' - + f' LOGFILE="$HOSTWORKDIR/log/{service_name}.log"\n' - + ' touch "$LOGFILE"\n' - + f' echo "[rollout] start {service_name} node=$NODE_ID hostworkdir=$HOSTWORKDIR"\n' - + " # English note:\n" - + " # - Atomic-group order still 
depends on a readiness gate, but that gate now observes only the\n" - + " # detached supervisor process subtree on this host.\n" - + " # - Ownership stays inside the shared selection supervisor big loop; the group runner only waits\n" - + " # until that loop has a stable live child before advancing to the next service.\n" - # English note: - # - The embedded `run_cmd` contains a nested `bash -lc` payload, and that payload may contain - # heredocs used by real service entrypoints. - # - A blind newline replacement would shift heredoc terminators away from column 0 inside the - # child shell and silently turn valid entrypoints into immediate no-op exits. - # - Indent only the outer block lines while preserving each inner line start exactly. - + _indent_script_block( - script=_render_selection_supervisor_launch_wait_block( - run_cmd=run_cmd, - logfile_expr='"$LOGFILE"', - stable_seconds_expr=str(ATOMIC_GROUP_PROBABLE_READY_SECONDS), - deadline_ts_expr='"$GROUP_STARTUP_DEADLINE_TS"', - context="[rollout]", - ).rstrip() + "\n", - prefix=" ", - ).rstrip() - + "\n" - + _indent_script_block( - script=_render_tcp_ready_wait_block(context="[rollout]"), - prefix=" ", - ).rstrip() - + "\n" - + "fi\n" + return _render_bare_template( + template_name="atomic_group_service_block.sh.tmpl", + values={ + "SERVICE_NAME": service_name, + "ALLOWED_NODES_BLOCK": _render_nodes_bash(name="ALLOWED_NODES", nodes=allowed_nodes), + "SERVICE_EXPORT": _sh_quote(service_name), + "PORT_EXPORT": _render_service_port_export( + service_name=service_name, + service_cfg=service_cfg, + indent=" ", + ), + "SUPERVISOR_LABEL_ASSIGN": _sh_quote( + _bare_atomic_group_member_selection_supervisor_label( + name_prefix=name_prefix, + group_name=group_name, + service_name=service_name, + ) + ), + "RUNTIME_STATE_JSON_ASSIGN": _sh_quote(runtime_state_json), + "LOGFILE_PATH": f"$HOSTWORKDIR/log/{service_name}.log", + "INDENTED_SELECTION_SUPERVISOR_LAUNCH_WAIT_BLOCK": _indent_script_block( + 
script=_render_selection_supervisor_launch_wait_block( + run_cmd=run_cmd, + stable_seconds_expr=str(ATOMIC_GROUP_PROBABLE_READY_SECONDS), + deadline_seconds_expr=str(ATOMIC_GROUP_STARTUP_DEADLINE_SECONDS), + context="[rollout]", + ).rstrip() + + "\n", + prefix=" ", + ).rstrip(), + }, ) diff --git a/deployment/templates/gen_bare_deploy_bash/atomic_group_node_resolution_tail.sh.tmpl b/deployment/templates/gen_bare_deploy_bash/atomic_group_node_resolution_tail.sh.tmpl new file mode 100644 index 0000000..d385995 --- /dev/null +++ b/deployment/templates/gen_bare_deploy_bash/atomic_group_node_resolution_tail.sh.tmpl @@ -0,0 +1,14 @@ +{{GROUP_NODES_BLOCK}}scheduled=false +for n in "${GROUP_NODES[@]}"; do + if [ "$n" = "$NODE_ID" ]; then scheduled=true; fi +done +if [ "$scheduled" != true ]; then + echo "[atomic-group] skip group=$GROUP node=$NODE_ID allowed=${GROUP_NODES[*]}" + exit 0 +fi + +export NODE_ID="$NODE_ID" +export HOST_IP="$HOST_IP" +export HOSTWORKDIR="$HOSTWORKDIR" +echo "[atomic-group] group=$GROUP node=$NODE_ID hostworkdir=$HOSTWORKDIR" + diff --git a/deployment/templates/gen_bare_deploy_bash/atomic_group_service_block.sh.tmpl b/deployment/templates/gen_bare_deploy_bash/atomic_group_service_block.sh.tmpl new file mode 100644 index 0000000..25da0dd --- /dev/null +++ b/deployment/templates/gen_bare_deploy_bash/atomic_group_service_block.sh.tmpl @@ -0,0 +1,24 @@ + +# rollout: {{SERVICE_NAME}} +{{ALLOWED_NODES_BLOCK}}scheduled=false +for n in "${ALLOWED_NODES[@]}"; do + if [ "$n" = "$NODE_ID" ]; then scheduled=true; fi +done +if [ "$scheduled" != true ]; then + echo "[rollout] skip {{SERVICE_NAME}}: not scheduled on node $NODE_ID" +else + export SERVICE={{SERVICE_EXPORT}} +{{PORT_EXPORT}} LOG_DIR="$HOSTWORKDIR/log" + mkdir -p "$LOG_DIR" + SUPERVISOR_LABEL={{SUPERVISOR_LABEL_ASSIGN}} + RUNTIME_STATE_JSON={{RUNTIME_STATE_JSON_ASSIGN}} + OWNER_TS_MS=$(python3 -c 'import time; print(int(time.time() * 1000))') + LOGFILE="{{LOGFILE_PATH}}" + echo "[rollout] start 
{{SERVICE_NAME}} node=$NODE_ID hostworkdir=$HOSTWORKDIR" + # English note: + # - Atomic-group order still depends on a startup gate, but that gate now checks only whether + # one supervised child PID stays alive without restart on this host. + # - Ownership stays inside the shared selection supervisor big loop; the group runner only waits + # through the fixed startup observation window before advancing to the next service. +{{INDENTED_SELECTION_SUPERVISOR_LAUNCH_WAIT_BLOCK}} +fi diff --git a/deployment/templates/gen_bare_deploy_bash/atomic_group_start.sh.tmpl b/deployment/templates/gen_bare_deploy_bash/atomic_group_start.sh.tmpl new file mode 100644 index 0000000..7dca2b5 --- /dev/null +++ b/deployment/templates/gen_bare_deploy_bash/atomic_group_start.sh.tmpl @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -euo pipefail + +GROUP={{GROUP_ASSIGN}} +NAME_PREFIX={{NAME_PREFIX_ASSIGN}} +{{HOST_PRELUDE}}{{ATOMIC_GROUP_NODE_RESOLUTION_TAIL}}{{SELECTION_SUPERVISOR_PATH_BLOCK}}{{PROC_LIFECYCLE_HELPERS}}{{GLOBAL_ENV_EXPORTS}} +{{SERVICE_BLOCKS}}echo "[atomic-group] ready group=$GROUP node=$NODE_ID" diff --git a/deployment/templates/gen_bare_deploy_bash/atomic_group_stop.sh.tmpl b/deployment/templates/gen_bare_deploy_bash/atomic_group_stop.sh.tmpl new file mode 100644 index 0000000..5501b8f --- /dev/null +++ b/deployment/templates/gen_bare_deploy_bash/atomic_group_stop.sh.tmpl @@ -0,0 +1,6 @@ +#!/usr/bin/env bash +set -u -o pipefail + +GROUP={{GROUP_ASSIGN}} +NAME_PREFIX={{NAME_PREFIX_ASSIGN}} +{{HOST_PRELUDE}}{{ATOMIC_GROUP_NODE_RESOLUTION_TAIL}}{{SELECTION_SUPERVISOR_PATH_BLOCK}}{{ATOMIC_GROUP_STOP_FN}}stop_group diff --git a/deployment/templates/gen_bare_deploy_bash/bare_entrypoint.sh.tmpl b/deployment/templates/gen_bare_deploy_bash/bare_entrypoint.sh.tmpl new file mode 100644 index 0000000..39db682 --- /dev/null +++ b/deployment/templates/gen_bare_deploy_bash/bare_entrypoint.sh.tmpl @@ -0,0 +1,5 @@ +#!/usr/bin/env bash +set -euo pipefail + +export SERVICE={{SERVICE_EXPORT}} 
+{{ENTRYPOINT}} diff --git a/deployment/templates/gen_bare_deploy_bash/common_node_resolution_tail.sh.tmpl b/deployment/templates/gen_bare_deploy_bash/common_node_resolution_tail.sh.tmpl new file mode 100644 index 0000000..e0cb433 --- /dev/null +++ b/deployment/templates/gen_bare_deploy_bash/common_node_resolution_tail.sh.tmpl @@ -0,0 +1,15 @@ +if [ ${#ALLOWED_NODES[@]} -gt 0 ]; then + _ok=false + for n in "${ALLOWED_NODES[@]}"; do + if [ "$n" = "$NODE_ID" ]; then _ok=true; fi + done + if [ "$_ok" != true ]; then + echo "Service {{SERVICE_NAME}} not scheduled on this node ($NODE_ID). Allowed: ${ALLOWED_NODES[*]}" + exit 0 + fi +fi + +export NODE_ID="$NODE_ID" +export HOST_IP="$HOST_IP" +export HOSTWORKDIR="$HOSTWORKDIR" + diff --git a/deployment/templates/gen_bare_deploy_bash/host_prelude.sh.tmpl b/deployment/templates/gen_bare_deploy_bash/host_prelude.sh.tmpl new file mode 100644 index 0000000..6075106 --- /dev/null +++ b/deployment/templates/gen_bare_deploy_bash/host_prelude.sh.tmpl @@ -0,0 +1,57 @@ +{{ALL_NODES_BLOCK}} +LOCAL_HOSTNAME=$(hostname -s 2>/dev/null || hostname 2>/dev/null || echo unknown) +LOCAL_FQDN=$(hostname -f 2>/dev/null || echo "$LOCAL_HOSTNAME") +NODE_ID="${NODE_ID:-}" +if [ -n "$NODE_ID" ]; then + _node_id_known=false + for n in "${ALL_NODES[@]}"; do + if [ "$n" = "$NODE_ID" ]; then + _node_id_known=true + break + fi + done + if [ "$_node_id_known" != true ]; then + echo "Unknown preset NODE_ID: $NODE_ID" + echo "Known nodes: {{KNOWN_NODES}}" + exit 1 + fi +fi +if [ -z "$NODE_ID" ]; then +for n in "${ALL_NODES[@]}"; do + if [ "$n" = "$LOCAL_HOSTNAME" ] || [ "$n" = "$LOCAL_FQDN" ]; then + NODE_ID="$n" + break + fi +done +fi +if [ -z "$NODE_ID" ] && [ ${#ALL_NODES[@]} -eq 1 ]; then + NODE_ID="${ALL_NODES[0]}" +fi +if [ -z "$NODE_ID" ]; then + for ip in $(hostname -I 2>/dev/null); do + for n in "${ALL_NODES[@]}"; do + _ip_n="" + case "$n" in +{{IP_CASE_LINES}} + *) _ip_n="";; + esac + if [ "$_ip_n" = "$ip" ]; then + NODE_ID="$n" + break + fi + 
done + [ -n "$NODE_ID" ] && break + done +fi +if [ -z "$NODE_ID" ]; then + echo "Cannot map host to a configured node. Hostname=$LOCAL_HOSTNAME FQDN=$LOCAL_FQDN IPs=$(hostname -I 2>/dev/null)" + echo "Known nodes: {{KNOWN_NODES}}" + exit 1 +fi + +HOST_IP="" +HOSTWORKDIR="" +case "$NODE_ID" in +{{HOST_CASE_LINES}} + *) echo "Unknown NODE_ID: $NODE_ID"; exit 1;; +esac diff --git a/deployment/templates/gen_bare_deploy_bash/selection_present_probe_fn.sh.tmpl b/deployment/templates/gen_bare_deploy_bash/selection_present_probe_fn.sh.tmpl new file mode 100644 index 0000000..0a7282b --- /dev/null +++ b/deployment/templates/gen_bare_deploy_bash/selection_present_probe_fn.sh.tmpl @@ -0,0 +1,19 @@ +selection_present() { + python3 - "$SELECTION_SUPERVISOR" "$SUPERVISOR_LABEL" "$HOSTWORKDIR" <<'__FLUXON_SELECTION_PRESENT__' +import importlib.util +import sys +from pathlib import Path + +supervisor_path = Path(sys.argv[1]) +label = sys.argv[2] +scope_key = sys.argv[3] +spec = importlib.util.spec_from_file_location("fluxon_selection_supervisor_probe", supervisor_path) +if spec is None or spec.loader is None: + raise RuntimeError(f"failed to load selection supervisor module: {supervisor_path}") +module = importlib.util.module_from_spec(spec) +sys.modules[spec.name] = module +spec.loader.exec_module(module) +raise SystemExit(0 if module._selection_present(label, scope_key=scope_key) else 1) +__FLUXON_SELECTION_PRESENT__ +} + diff --git a/deployment/templates/gen_bare_deploy_bash/selection_supervisor_launch_wait_block.sh.tmpl b/deployment/templates/gen_bare_deploy_bash/selection_supervisor_launch_wait_block.sh.tmpl new file mode 100644 index 0000000..702e5bd --- /dev/null +++ b/deployment/templates/gen_bare_deploy_bash/selection_supervisor_launch_wait_block.sh.tmpl @@ -0,0 +1,10 @@ +SUPERVISOR_PID=$( {{RUN_CMD}} < /dev/null & echo "$!" ) +if [[ ! 
"$SUPERVISOR_PID" =~ ^[0-9]+$ ]]; then + echo "{{CONTEXT}} launch failed svc=$SERVICE label=$SUPERVISOR_LABEL supervisor_pid=$SUPERVISOR_PID" + exit 1 +fi +STARTUP_DEADLINE_SECONDS={{DEADLINE_SECONDS_EXPR}} +if ! wait_service_probably_ready_pid_tree "$SERVICE" "$SUPERVISOR_PID" {{STABLE_SECONDS_EXPR}} "$STARTUP_DEADLINE_SECONDS" "{{CONTEXT}}"; then + echo "{{CONTEXT}} probable-ready failed svc=$SERVICE label=$SUPERVISOR_LABEL supervisor_pid=$SUPERVISOR_PID" + exit 1 +fi diff --git a/deployment/templates/gen_bare_deploy_bash/selection_supervisor_path_from_script_dir.sh.tmpl b/deployment/templates/gen_bare_deploy_bash/selection_supervisor_path_from_script_dir.sh.tmpl new file mode 100644 index 0000000..dac7dff --- /dev/null +++ b/deployment/templates/gen_bare_deploy_bash/selection_supervisor_path_from_script_dir.sh.tmpl @@ -0,0 +1,7 @@ +DIR=$(cd "$(dirname "$0")" && pwd) +SELECTION_SUPERVISOR="$DIR/{{SELECTION_SUPERVISOR_FILENAME}}" +if [ ! -f "$SELECTION_SUPERVISOR" ]; then + echo "Missing selection supervisor: $SELECTION_SUPERVISOR" + exit 1 +fi + diff --git a/deployment/templates/gen_bare_deploy_bash/standalone_start.sh.tmpl b/deployment/templates/gen_bare_deploy_bash/standalone_start.sh.tmpl new file mode 100644 index 0000000..5a565f1 --- /dev/null +++ b/deployment/templates/gen_bare_deploy_bash/standalone_start.sh.tmpl @@ -0,0 +1,6 @@ +#!/usr/bin/env bash +set -euo pipefail + +SERVICE={{SERVICE_ASSIGN}} +NAME_PREFIX={{NAME_PREFIX_ASSIGN}} +{{ALLOWED_NODES_BLOCK}}{{HOST_PRELUDE}}{{COMMON_NODE_RESOLUTION_TAIL}}{{SELECTION_SUPERVISOR_PATH_BLOCK}}{{PROC_LIFECYCLE_HELPERS}}{{SELECTION_PRESENT_PROBE_FN}}{{START_LOCK_BLOCK}}{{GLOBAL_ENV_EXPORTS}}{{PORT_EXPORT}}{{START_BODY}} diff --git a/deployment/templates/gen_bare_deploy_bash/standalone_start_body.sh.tmpl b/deployment/templates/gen_bare_deploy_bash/standalone_start_body.sh.tmpl new file mode 100644 index 0000000..0758876 --- /dev/null +++ b/deployment/templates/gen_bare_deploy_bash/standalone_start_body.sh.tmpl @@ 
-0,0 +1,26 @@ +SUPERVISOR_LABEL={{SUPERVISOR_LABEL_ASSIGN}} +RUNTIME_STATE_JSON={{RUNTIME_STATE_JSON_ASSIGN}} +OWNER_TS_MS=$(python3 -c 'import time; print(int(time.time() * 1000))') +LOG_DIR="$HOSTWORKDIR/log" +LOGFILE="$LOG_DIR/${SERVICE}.log" +mkdir -p "$LOG_DIR" +echo "Starting $SERVICE on $NODE_ID (IP: $HOST_IP, workdir: $HOSTWORKDIR)" +# English note: +# - bootstrap bare start must be idempotent when the shared selection supervisor already owns +# a live child for the same label. +# - start_test_bed enables this path only for deployconf.bootstrap_bare_services. +if [ "${FLUXON_BARE_ALLOW_ALREADY_PRESENT:-false}" = "true" ]; then + if selection_present; then + echo "[bare] already present svc=$SERVICE label=$SUPERVISOR_LABEL" + echo "Started $SERVICE (label: $SUPERVISOR_LABEL)" + echo "Logs: $LOGFILE" + exit 0 + fi +fi +# English note: +# - Bare start must not depend on extra supervisor observation subcommands because the shared +# runtime surface is intentionally reduced to run/stop. +# - We therefore launch the detached supervisor and wait until one supervised child PID stays +# alive without restart across the fixed startup observation window. +{{SELECTION_SUPERVISOR_LAUNCH_WAIT_BLOCK}}echo "Started $SERVICE (label: $SUPERVISOR_LABEL)" +echo "Logs: $LOGFILE" diff --git a/deployment/templates/gen_bare_deploy_bash/standalone_stop.sh.tmpl b/deployment/templates/gen_bare_deploy_bash/standalone_stop.sh.tmpl new file mode 100644 index 0000000..4f7dc37 --- /dev/null +++ b/deployment/templates/gen_bare_deploy_bash/standalone_stop.sh.tmpl @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +set -euo pipefail + +SERVICE={{SERVICE_ASSIGN}} +NAME_PREFIX={{NAME_PREFIX_ASSIGN}} +{{ALLOWED_NODES_BLOCK}}{{HOST_PRELUDE}}{{COMMON_NODE_RESOLUTION_TAIL}}{{SELECTION_SUPERVISOR_PATH_BLOCK}}SUPERVISOR_LABEL={{SUPERVISOR_LABEL_ASSIGN}} +# English note: +# - Generated bare stop is retained as a manual operator tool. 
+# - Automation must not depend on this path for handover or rollout convergence. +# - The command only asks the shared selection supervisor to retire the concrete selection +# identity identified by label on this node. +if ! python3 "$SELECTION_SUPERVISOR" stop --label "$SUPERVISOR_LABEL" --scope-key "$HOSTWORKDIR" --missing-ok >/dev/null; then + echo "[bare] stop failed svc=$SERVICE label=$SUPERVISOR_LABEL hostworkdir=$HOSTWORKDIR" + exit 1 +fi diff --git a/deployment/templates/gen_bare_deploy_bash/start_lock_block.sh.tmpl b/deployment/templates/gen_bare_deploy_bash/start_lock_block.sh.tmpl new file mode 100644 index 0000000..47ec770 --- /dev/null +++ b/deployment/templates/gen_bare_deploy_bash/start_lock_block.sh.tmpl @@ -0,0 +1,14 @@ +PID_DIR="$HOSTWORKDIR/run" +mkdir -p "$PID_DIR" +START_LOCKFILE="$PID_DIR/${SERVICE}.start.lock" +if ! command -v flock >/dev/null 2>&1; then + echo "Missing required command: flock" + exit 1 +fi +exec 9>"$START_LOCKFILE" +if ! flock -xn 9; then + echo "[bare] start skipped svc=$SERVICE reason=another start is already running lockfile=$START_LOCKFILE" + exit 0 +fi +exec 9>&- + diff --git a/deployment/tests/test_gen_bare_deploy_bash.py b/deployment/tests/test_gen_bare_deploy_bash.py index f51a923..21f11a6 100644 --- a/deployment/tests/test_gen_bare_deploy_bash.py +++ b/deployment/tests/test_gen_bare_deploy_bash.py @@ -5,6 +5,7 @@ import argparse import importlib.util import os +import shlex import subprocess import sys import tempfile @@ -13,6 +14,8 @@ from pathlib import Path from typing import Callable, List, Optional, Tuple +import yaml + SCRIPT_DIR = Path(__file__).resolve().parent DEPLOYMENT_DIR = SCRIPT_DIR.parent @@ -50,9 +53,17 @@ def _build_checks(selected_test_id: Optional[str]) -> List[Tuple[str, Callable[[ ("preserves_hostworkdir_runtime_token", test_preserves_hostworkdir_runtime_token), ("generated_scripts_do_not_embed_pidfile_authority", test_generated_scripts_do_not_embed_pidfile_authority), 
("ops_entrypoints_use_direct_scripts", test_ops_entrypoints_use_direct_scripts), + ("bare_start_uses_no_exit_startup_gate", test_bare_start_uses_no_exit_startup_gate), + ( + "normalized_testbed_master_exports_service_port_for_atomic_group", + test_normalized_testbed_master_exports_service_port_for_atomic_group, + ), + ("normalized_testbed_owner_emits_large_file_paths", test_normalized_testbed_owner_emits_large_file_paths), ("bare_child_command_preserves_runtime_hostworkdir_expansion", test_bare_child_command_preserves_runtime_hostworkdir_expansion), ("supervisor_label_uses_stable_selection_suffix", test_supervisor_label_uses_stable_selection_suffix), ("bootstrap_start_reuses_already_present_selection", test_bootstrap_start_reuses_already_present_selection), + ("bare_start_fails_when_child_exits_within_startup_window", test_bare_start_fails_when_child_exits_within_startup_window), + ("pid_ready_check_requires_full_stable_window_after_first_child_observation", test_pid_ready_check_requires_full_stable_window_after_first_child_observation), ("atomic_group_start_does_not_auto_stop_on_failure", test_atomic_group_start_does_not_auto_stop_on_failure), ("atomic_group_preserves_nested_heredoc_terminator", test_atomic_group_preserves_nested_heredoc_terminator), ("atomic_group_stop_script_is_shell_valid", test_atomic_group_stop_script_is_shell_valid), @@ -93,6 +104,7 @@ def test_preserves_hostworkdir_runtime_token() -> None: FLUXON_SHARED_MEM: "${HOSTWORKDIR}/shm1" service: svc_plain: + port: 12345 entrypoint: | WORKDIR="${HOSTWORKDIR}/svc_${NODE_ID}" EXPORT_TABLE=$(cat < None: assert "/hostworkdir/svc_" not in script, script assert "wait-present" not in script, script assert "launch_only_start_gate" not in script, script - assert 'wait_service_probably_ready_pid_tree "$SERVICE" "$SUPERVISOR_PID"' in script, script - assert 'wait_service_tcp_ready "$SERVICE" "$HOST_IP" "$SERVICE_PORT"' in script, script + _assert_standalone_deadline_after_launch(script) + assert 
'wait_service_probably_ready_pid_tree "$SERVICE" "$SUPERVISOR_PID" 10 "$STARTUP_DEADLINE_SECONDS" "[bare]"' in script, script + assert "export SERVICE_PORT=12345" in script, script + assert "wait_service_tcp_ready" not in script, script + assert "wait_service_etcd_endpoint_healthy" not in script, script assert 'SUPERVISOR_PID=$( setsid ' not in script, script + assert '>>"$LOGFILE" 2>&1' not in script, script + assert 'touch "$LOGFILE"' not in script, script assert 'python3 "$SELECTION_SUPERVISOR" stop --label "$SUPERVISOR_LABEL" --scope-key "$HOSTWORKDIR" --missing-ok' in stop_script, stop_script assert "retire-runtime" not in stop_script, stop_script print("PASS: test_preserves_hostworkdir_runtime_token") @@ -149,6 +166,7 @@ def test_atomic_group_start_does_not_auto_stop_on_failure() -> None: hostworkdir: /tmp/hostworkdir service: svc_a: + port: 23456 entrypoint: | echo svc_a node_bind: @@ -179,7 +197,16 @@ def test_atomic_group_start_does_not_auto_stop_on_failure() -> None: assert 'SUPERVISOR_PID=$( setsid ' not in script, script assert 'echo "[rollout] probable-ready failed svc=$SERVICE label=$SUPERVISOR_LABEL supervisor_pid=$SUPERVISOR_PID"' in script, script assert 'wait_service_probably_ready_pid_tree "$SERVICE" "$SUPERVISOR_PID"' in script, script - assert 'wait_service_tcp_ready "$SERVICE" "$HOST_IP" "$SERVICE_PORT"' in script, script + assert 'GROUP_STARTUP_DEADLINE_SECONDS=' not in script, script + assert script.count('STARTUP_DEADLINE_SECONDS=20') == 2, script + _assert_deadline_after_launch( + script=script, + wait_call='wait_service_probably_ready_pid_tree "$SERVICE" "$SUPERVISOR_PID" 10 "$STARTUP_DEADLINE_SECONDS" "[rollout]"', + ) + assert "export SERVICE_PORT=23456" in script, script + assert "unset SERVICE_PORT" in script, script + assert "wait_service_tcp_ready" not in script, script + assert "wait_service_etcd_endpoint_healthy" not in script, script print("PASS: test_atomic_group_start_does_not_auto_stop_on_failure") @@ -251,11 +278,132 @@ def 
test_ops_entrypoints_use_direct_scripts() -> None: assert "-m fluxon_py.runtime.start_ops_controller" in controller_entrypoint, controller_entrypoint assert "examples/fluxon_ops/start_controller.py" not in controller_entrypoint, controller_entrypoint + assert 'http_listen_addr: "0.0.0.0:19080"' in controller_entrypoint, controller_entrypoint + assert 'http_listen_addr: "0.0.0.0:${MASTER__PORT}"' not in controller_entrypoint, controller_entrypoint assert "-m fluxon_py.runtime.start_ops_agent" in agent_entrypoint, agent_entrypoint assert "examples/fluxon_ops/start_agent.py" not in agent_entrypoint, agent_entrypoint print("PASS: test_ops_entrypoints_use_direct_scripts") +def test_bare_start_uses_no_exit_startup_gate() -> None: + with tempfile.TemporaryDirectory(prefix="test_gen_bare_deploy_bash_no_exit_gate_") as td: + tmpdir = Path(td) + config_path = tmpdir / "deployconf.yaml" + outdir = tmpdir / "out" + config_path.write_text( + textwrap.dedent( + """ + name_prefix: fluxon-testbed + cluster_nodes: + - hostname: node-a + ip: 127.0.0.1 + hostworkdir: /tmp/hostworkdir + service: + etcd: + port: 2379 + entrypoint: | + echo etcd + node_bind: + node: ["node-a"] + tikv: + port: 20160 + entrypoint: | + echo tikv + node_bind: + node: ["node-a"] + svc_plain: + port: 12345 + entrypoint: | + echo plain + node_bind: + node: ["node-a"] + """ + ).strip() + + "\n", + encoding="utf-8", + ) + + result = _run_generator(config_path=config_path, outdir=outdir) + assert result.returncode == 0, f"generator failed: stdout={result.stdout} stderr={result.stderr}" + + etcd_script = (outdir / "start_etcd.sh").read_text(encoding="utf-8") + tikv_script = (outdir / "start_tikv.sh").read_text(encoding="utf-8") + plain_script = (outdir / "start_svc_plain.sh").read_text(encoding="utf-8") + + for script in (etcd_script, tikv_script, plain_script): + _assert_standalone_deadline_after_launch(script) + assert 'wait_service_probably_ready_pid_tree "$SERVICE" "$SUPERVISOR_PID" 10 
"$STARTUP_DEADLINE_SECONDS" "[bare]"' in script, script + assert "wait_service_tcp_ready" not in script, script + assert "wait_service_etcd_endpoint_healthy" not in script, script + print("PASS: test_bare_start_uses_no_exit_startup_gate") + + +def test_normalized_testbed_master_exports_service_port_for_atomic_group() -> None: + with tempfile.TemporaryDirectory(prefix="test_gen_bare_deploy_bash_normalized_testbed_") as td: + tmpdir = Path(td) + config_path = tmpdir / "deployconf.normalized.yaml" + outdir = tmpdir / "out" + + start_test_bed = _load_python_module( + module_name="start_test_bed_for_gen_bare_tests", + path=DEPLOYMENT_DIR.parent / "fluxon_test_stack" / "start_test_bed.py", + ) + base_cfg = yaml.safe_load( + (DEPLOYMENT_DIR.parent / "fluxon_test_stack" / "deployconf_testbed.yml").read_text(encoding="utf-8") + ) + normalized, _ = start_test_bed._normalize_bootstrap_deployconf(deployconf=base_cfg) + config_path.write_text( + yaml.safe_dump(normalized, sort_keys=False, allow_unicode=False), + encoding="utf-8", + ) + + result = _run_generator(config_path=config_path, outdir=outdir) + assert result.returncode == 0, f"generator failed: stdout={result.stdout} stderr={result.stderr}" + + script = (outdir / "start_fluxon_core_controller.sh").read_text(encoding="utf-8") + master_block_start = script.index("export SERVICE=master") + owner_block_start = script.index("export SERVICE=owner") + master_block = script[master_block_start:owner_block_start] + assert "export MASTER__PORT=51051" in master_block, master_block + assert "export SERVICE_PORT=51051" in master_block, master_block + assert "unset SERVICE_PORT" not in master_block, master_block + assert 'GROUP_STARTUP_DEADLINE_SECONDS=' not in master_block, master_block + _assert_deadline_after_launch( + script=master_block, + wait_call='wait_service_probably_ready_pid_tree "$SERVICE" "$SUPERVISOR_PID" 10 "$STARTUP_DEADLINE_SECONDS" "[rollout]"', + ) + assert "wait_service_tcp_ready" not in master_block, master_block 
+ print("PASS: test_normalized_testbed_master_exports_service_port_for_atomic_group") + + +def test_normalized_testbed_owner_emits_large_file_paths() -> None: + with tempfile.TemporaryDirectory(prefix="test_gen_bare_deploy_bash_testbed_owner_large_paths_") as td: + tmpdir = Path(td) + config_path = tmpdir / "deployconf.normalized.yaml" + outdir = tmpdir / "out" + + start_test_bed = _load_python_module( + module_name="start_test_bed_for_owner_large_paths_tests", + path=DEPLOYMENT_DIR.parent / "fluxon_test_stack" / "start_test_bed.py", + ) + base_cfg = yaml.safe_load( + (DEPLOYMENT_DIR.parent / "fluxon_test_stack" / "deployconf_testbed.yml").read_text(encoding="utf-8") + ) + normalized, _ = start_test_bed._normalize_bootstrap_deployconf(deployconf=base_cfg) + config_path.write_text( + yaml.safe_dump(normalized, sort_keys=False, allow_unicode=False), + encoding="utf-8", + ) + + result = _run_generator(config_path=config_path, outdir=outdir) + assert result.returncode == 0, f"generator failed: stdout={result.stdout} stderr={result.stderr}" + + script = (outdir / "entrypoint__fluxon-self-host2-fluxon_core_controller__owner.sh").read_text(encoding="utf-8") + assert 'large_file_paths:' in script, script + assert '- "${HOSTWORKDIR}/large/owner_${NODE_ID}"' in script, script + print("PASS: test_normalized_testbed_owner_emits_large_file_paths") + + def test_bare_child_command_preserves_runtime_hostworkdir_expansion() -> None: with tempfile.TemporaryDirectory(prefix="test_gen_bare_deploy_bash_runtime_expand_") as td: tmpdir = Path(td) @@ -438,7 +586,7 @@ def _handle_signal(_signum, _frame): assert first.returncode == 0, ( f"first start failed rc={first.returncode} stdout={first.stdout!r} stderr={first.stderr!r}" ) - _wait_until_selection_present(supervisor_module, label=label) + _wait_until_selection_present(supervisor_module, label=label, scope_key=str(hostworkdir)) second_env = base_env.copy() second_env["FLUXON_BARE_ALLOW_ALREADY_PRESENT"] = "true" @@ -455,7 +603,7 @@ def 
_handle_signal(_signum, _frame): f"reuse start failed rc={second.returncode} stdout={second.stdout!r} stderr={second.stderr!r}" ) assert "[bare] already present svc=svc_plain" in second.stdout, second.stdout - live_supervisors = supervisor_module._iter_live_supervisors(label) + live_supervisors = supervisor_module._iter_live_supervisors(label, scope_key=str(hostworkdir)) assert len(live_supervisors) == 1, live_supervisors finally: subprocess.run( @@ -467,10 +615,203 @@ def _handle_signal(_signum, _frame): env=base_env, timeout=20, ) - _wait_until_selection_absent(supervisor_module, label=label) + _wait_until_selection_absent(supervisor_module, label=label, scope_key=str(hostworkdir)) print("PASS: test_bootstrap_start_reuses_already_present_selection") +def test_bare_start_fails_when_child_exits_within_startup_window() -> None: + with tempfile.TemporaryDirectory(prefix="test_gen_bare_deploy_bash_child_exit_") as td: + tmpdir = Path(td) + hostworkdir = tmpdir / "hostworkdir" + outdir = hostworkdir / "gen_bare_deploy_bash" + config_path = tmpdir / "deployconf.yaml" + hostworkdir.mkdir(parents=True, exist_ok=True) + (hostworkdir / "exit_after_delay.py").write_text( + textwrap.dedent( + """ + #!/usr/bin/env python3 + import time + + time.sleep(1.5) + raise SystemExit(17) + """ + ).strip() + + "\n", + encoding="utf-8", + ) + config_path.write_text( + textwrap.dedent( + f""" + name_prefix: fluxon-testbed + cluster_nodes: + - hostname: node-a + ip: 127.0.0.1 + hostworkdir: {hostworkdir} + service: + svc_plain: + entrypoint: | + exec python3 "${{HOSTWORKDIR}}/exit_after_delay.py" + node_bind: + node: ["node-a"] + """ + ).strip() + + "\n", + encoding="utf-8", + ) + + result = _run_generator(config_path=config_path, outdir=outdir) + assert result.returncode == 0, f"generator failed: stdout={result.stdout} stderr={result.stderr}" + + start_script = outdir / "start_svc_plain.sh" + stop_script = outdir / "stop_svc_plain.sh" + supervisor_module = 
_load_generated_supervisor_module(outdir / "selection_supervisor.py") + label = "DaemonSet/fluxon-testbed-svc_plain" + repo_root = DEPLOYMENT_DIR.parent + env = os.environ.copy() + env["NODE_ID"] = "node-a" + + try: + start = subprocess.run( + [str(start_script)], + check=False, + capture_output=True, + text=True, + cwd=str(repo_root), + env=env, + timeout=20, + ) + assert start.returncode != 0, ( + f"expected startup gate failure rc={start.returncode} stdout={start.stdout!r} stderr={start.stderr!r}" + ) + assert "[bare] probable-ready failed svc=svc_plain" in start.stdout, start.stdout + assert "child pid exited" in start.stdout or "child pid changed" in start.stdout, start.stdout + finally: + subprocess.run( + [str(stop_script)], + check=False, + capture_output=True, + text=True, + cwd=str(repo_root), + env=env, + timeout=20, + ) + _wait_until_selection_absent(supervisor_module, label=label, scope_key=str(hostworkdir)) + print("PASS: test_bare_start_fails_when_child_exits_within_startup_window") + + +def test_pid_ready_check_requires_full_stable_window_after_first_child_observation() -> None: + proc_lifecycle = _load_python_module( + module_name="test_proc_lifecycle_codegen_runtime", + path=DEPLOYMENT_DIR / "utils" / "proc_lifecycle_codegen.py", + ) + helpers = proc_lifecycle.render_bash_proc_lifecycle_funcs_pid_tree( + timeouts=proc_lifecycle.StopTimeouts(term_seconds=60, kill_seconds=10, supersede_seconds=30) + ) + with tempfile.TemporaryDirectory(prefix="test_proc_lifecycle_late_child_") as td: + tmpdir = Path(td) + shell_script = tmpdir / "probe.sh" + supervisor_script = tmpdir / "delayed_child_supervisor.py" + child_script = tmpdir / "sleep_child.py" + + child_script.write_text( + textwrap.dedent( + """ + #!/usr/bin/env python3 + import signal + import time + + def _handle_signal(_signum, _frame): + raise SystemExit(0) + + signal.signal(signal.SIGTERM, _handle_signal) + signal.signal(signal.SIGINT, _handle_signal) + + while True: + time.sleep(0.2) + """ + 
).strip() + + "\n", + encoding="utf-8", + ) + supervisor_script.write_text( + textwrap.dedent( + f""" + #!/usr/bin/env python3 + import signal + import subprocess + import sys + import time + from pathlib import Path + + child = None + + def _shutdown(_signum, _frame): + global child + if child is not None and child.poll() is None: + child.terminate() + try: + child.wait(timeout=5) + except subprocess.TimeoutExpired: + child.kill() + raise SystemExit(0) + + signal.signal(signal.SIGTERM, _shutdown) + signal.signal(signal.SIGINT, _shutdown) + + time.sleep(4) + child = subprocess.Popen([sys.executable, str(Path({str(child_script)!r}))]) + while True: + if child.poll() is not None: + raise SystemExit(child.returncode or 0) + time.sleep(0.2) + """ + ).strip() + + "\n", + encoding="utf-8", + ) + + shell_script.write_text( + textwrap.dedent( + f"""\ + #!/usr/bin/env bash + set -euo pipefail + {helpers} + python3 {shlex.quote(str(supervisor_script))} & + root_pid="$!" + startup_deadline_seconds=6 + if wait_service_probably_ready_pid_tree "svc_plain" "$root_pid" 4 "$startup_deadline_seconds" "[test]"; then + echo "unexpected success" + kill "$root_pid" >/dev/null 2>&1 || true + wait "$root_pid" >/dev/null 2>&1 || true + exit 99 + else + wait_rc="$?" 
+ fi + kill "$root_pid" >/dev/null 2>&1 || true + wait "$root_pid" >/dev/null 2>&1 || true + exit "$wait_rc" + """ + ), + encoding="utf-8", + ) + shell_script.chmod(0o755) + + result = subprocess.run( + ["bash", str(shell_script)], + check=False, + capture_output=True, + text=True, + cwd=str(DEPLOYMENT_DIR.parent), + timeout=20, + ) + assert result.returncode != 0, ( + f"expected startup gate failure rc={result.returncode} stdout={result.stdout!r} stderr={result.stderr!r}" + ) + assert "unexpected success" not in result.stdout, result.stdout + assert "child pid not stable long enough" in result.stdout, result.stdout + print("PASS: test_pid_ready_check_requires_full_stable_window_after_first_child_observation") + + def test_atomic_group_preserves_nested_heredoc_terminator() -> None: with tempfile.TemporaryDirectory(prefix="test_gen_bare_deploy_bash_atomic_heredoc_") as td: tmpdir = Path(td) @@ -600,22 +941,64 @@ def _load_generated_supervisor_module(supervisor_path: Path): return module -def _wait_until_selection_present(module, *, label: str, timeout_seconds: int = 15) -> None: +def _load_python_module(*, module_name: str, path: Path): + spec = importlib.util.spec_from_file_location(module_name, path) + if spec is None or spec.loader is None: + raise RuntimeError(f"failed to load module: {path}") + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + spec.loader.exec_module(module) + return module + + +def _wait_until_selection_present( + module, + *, + label: str, + scope_key: Optional[str] = None, + timeout_seconds: int = 15, +) -> None: deadline = time.time() + timeout_seconds while time.time() < deadline: - if module._selection_present(label): + if module._selection_present(label, scope_key=scope_key): return time.sleep(0.2) - raise RuntimeError(f"timeout waiting selection present: label={label}") + raise RuntimeError(f"timeout waiting selection present: label={label} scope_key={scope_key}") -def 
_wait_until_selection_absent(module, *, label: str, timeout_seconds: int = 15) -> None: +def _wait_until_selection_absent( + module, + *, + label: str, + scope_key: Optional[str] = None, + timeout_seconds: int = 15, +) -> None: deadline = time.time() + timeout_seconds while time.time() < deadline: - if not module._iter_live_supervisors(label): + if not module._iter_live_supervisors(label, scope_key=scope_key): return time.sleep(0.2) - raise RuntimeError(f"timeout waiting selection absent: label={label}") + raise RuntimeError(f"timeout waiting selection absent: label={label} scope_key={scope_key}") + + +def _assert_deadline_after_launch(*, script: str, wait_call: str) -> None: + launch_check = 'if [[ ! "$SUPERVISOR_PID" =~ ^[0-9]+$ ]]; then' + deadline_assign = 'STARTUP_DEADLINE_SECONDS=20' + assert launch_check in script, script + assert deadline_assign in script, script + assert wait_call in script, script + + launch_check_idx = script.index(launch_check) + deadline_idx = script.index(deadline_assign) + wait_idx = script.index(wait_call) + assert launch_check_idx < deadline_idx < wait_idx, script + + +def _assert_standalone_deadline_after_launch(script: str) -> None: + _assert_deadline_after_launch( + script=script, + wait_call='wait_service_probably_ready_pid_tree "$SERVICE" "$SUPERVISOR_PID" 10 "$STARTUP_DEADLINE_SECONDS" "[bare]"', + ) if __name__ == "__main__": diff --git a/deployment/tests/test_gen_k8s_daemonset.py b/deployment/tests/test_gen_k8s_daemonset.py index eff0aad..35fae5a 100644 --- a/deployment/tests/test_gen_k8s_daemonset.py +++ b/deployment/tests/test_gen_k8s_daemonset.py @@ -190,7 +190,6 @@ def test_ops_entrypoints_use_direct_scripts() -> None: FLUXON_PIP_CONF_CMD: "true" FLUXON_RELEASE_WHEEL_FETCH_CMD: "true" FLUXON_SHARED_MEM: "${HOSTWORKDIR}/shm1" - FLUXON_SHARED_FILE: "${HOSTWORKDIR}/shm1_files" ETCD_FULL_ADDRESS: "127.0.0.1:33579" FLUXON_CLUSTER_NAME: "fluxon_testbed" FLUXON_OPS_CONTROLLER_INSTANCE_KEY: "ops_controller_node-a" @@ -201,15 
+200,14 @@ def test_ops_entrypoints_use_direct_scripts() -> None: ops_agent: entrypoint: | WORKDIR="${HOSTWORKDIR}/ops_agent/${NODE_ID}" - mkdir -p "${WORKDIR}" "${FLUXON_SHARED_MEM}" "${FLUXON_SHARED_FILE}" + mkdir -p "${WORKDIR}" "${FLUXON_SHARED_MEM}" cat > "${WORKDIR}/ops_agent.yaml" < None: ops_controller: entrypoint: | WORKDIR="${HOSTWORKDIR}/ops_controller" - mkdir -p "${WORKDIR}" "${FLUXON_SHARED_MEM}" "${FLUXON_SHARED_FILE}" + mkdir -p "${WORKDIR}" "${FLUXON_SHARED_MEM}" cat > "${WORKDIR}/ops_controller.yaml" < None: pprof_duration_seconds: 60 fluxonkv_spec: cluster_name: "${FLUXON_CLUSTER_NAME}" - shared_memory_path: "${FLUXON_SHARED_MEM}" - shared_file_path: "${FLUXON_SHARED_FILE}" + share_mem_path: "${FLUXON_SHARED_MEM}" p2p_listen_port: 12102 panel: max_body_bytes: 1073741824 @@ -248,7 +245,7 @@ def test_ops_entrypoints_use_direct_scripts() -> None: cluster_name: "${FLUXON_CLUSTER_NAME}" member_kind: kv output: web - http_listen_addr: "0.0.0.0:${MASTER__PORT}" + http_listen_addr: "0.0.0.0:${OPS_CONTROLLER__PORT}" YAML ${HOSTWORKDIR}/venv/bin/python -m fluxon_py.runtime.start_ops_controller -c "${WORKDIR}/ops_controller.yaml" -w "${WORKDIR}" node_bind: diff --git a/deployment/tests/test_log_shard.py b/deployment/tests/test_log_shard.py new file mode 100644 index 0000000..642e718 --- /dev/null +++ b/deployment/tests/test_log_shard.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import argparse +import datetime +import os +import sys +import tempfile +import time +from pathlib import Path +from typing import Callable, List, Optional, Tuple + +SCRIPT_DIR = Path(__file__).resolve().parent +DEPLOYMENT_DIR = SCRIPT_DIR.parent +sys.path.insert(0, str(DEPLOYMENT_DIR)) + +from utils import log_shard + + +def main() -> int: + parser = argparse.ArgumentParser(description="log_shard util test runner") + parser.add_argument("--test-id", help="Run only the named test id") + args = parser.parse_args() + + checks = 
def _build_checks(selected_test_id: Optional[str]) -> List[Tuple[str, Callable[[], None]]]:
    """Return the registered checks, optionally narrowed to a single check id.

    Args:
        selected_test_id: Check id to select, or None to return every check.

    Returns:
        A list of ``(check_id, callable)`` pairs in registration order.

    Raises:
        ValueError: If ``selected_test_id`` is not a registered check id.
    """
    checks: List[Tuple[str, Callable[[], None]]] = [
        ("daily_path_uses_utc_date_suffix", test_daily_path_uses_utc_date_suffix),
        ("daily_path_uses_test_window_suffix_when_configured", test_daily_path_uses_test_window_suffix_when_configured),
        ("resolve_readable_prefers_latest_existing_shard", test_resolve_readable_prefers_latest_existing_shard),
        ("cleanup_keeps_only_retention_window", test_cleanup_keeps_only_retention_window),
    ]
    if selected_test_id is None:
        return checks
    for check_id, check in checks:
        if check_id == selected_test_id:
            return [(check_id, check)]
    available = ", ".join(check_id for check_id, _ in checks)
    raise ValueError(f"unknown --test-id: {selected_test_id}. Available: {available}")


def _swap_shard_env(window: Optional[str], anchor: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
    """Install the test shard overrides (None clears a variable) and return the previous pair.

    Calling it again with the returned pair restores the environment, so every
    check can pin the overrides it needs without leaking them to later checks.
    """
    keys = (
        log_shard.TEST_LOG_SHARD_WINDOW_SECONDS_ENV,
        log_shard.TEST_LOG_SHARD_ANCHOR_UNIX_SECONDS_ENV,
    )
    previous = (os.environ.get(keys[0]), os.environ.get(keys[1]))
    for key, value in zip(keys, (window, anchor)):
        if value is None:
            os.environ.pop(key, None)
        else:
            os.environ[key] = value
    return previous


def test_daily_path_uses_utc_date_suffix() -> None:
    """Without test overrides the shard suffix is the UTC calendar date of ``now``."""
    base = Path("/tmp/test_runner.log")
    now = datetime.datetime(2026, 6, 21, 4, 0, 0, tzinfo=datetime.timezone.utc)
    # The helper reads the override variables from the process environment;
    # clear them so an exported override cannot change the expected suffix.
    saved = _swap_shard_env(None, None)
    try:
        resolved = log_shard.daily_sharded_log_path(base, now=now)
        assert resolved.name == "test_runner.2026-06-21.log", resolved
    finally:
        _swap_shard_env(*saved)


def test_resolve_readable_prefers_latest_existing_shard() -> None:
    """With no shard for today, the newest dated shard on disk is returned."""
    saved = _swap_shard_env(None, None)
    try:
        with tempfile.TemporaryDirectory(prefix="test_log_shard_resolve_") as td:
            root = Path(td)
            base = root / "service.log"
            (root / "service.2026-06-19.log").write_text("old\n", encoding="utf-8")
            (root / "service.2026-06-20.log").write_text("new\n", encoding="utf-8")
            resolved = log_shard.resolve_readable_log_path(base)
            assert resolved == (root / "service.2026-06-20.log").resolve(), resolved
    finally:
        _swap_shard_env(*saved)


def test_daily_path_uses_test_window_suffix_when_configured() -> None:
    """With a 10 second test window, consecutive windows map to consecutive synthetic dates."""
    base = Path("/tmp/test_runner.log")
    anchor = datetime.datetime(2026, 6, 21, 0, 0, 0, tzinfo=datetime.timezone.utc)
    saved = _swap_shard_env("10", str(int(anchor.timestamp())))
    try:
        now_0 = datetime.datetime(2026, 6, 21, 0, 0, 5, tzinfo=datetime.timezone.utc)
        now_1 = datetime.datetime(2026, 6, 21, 0, 0, 15, tzinfo=datetime.timezone.utc)
        resolved_0 = log_shard.daily_sharded_log_path(base, now=now_0)
        resolved_1 = log_shard.daily_sharded_log_path(base, now=now_1)
        assert resolved_0.name == "test_runner.2026-01-01.log", resolved_0
        assert resolved_1.name == "test_runner.2026-01-02.log", resolved_1
    finally:
        _swap_shard_env(*saved)


def test_cleanup_keeps_only_retention_window() -> None:
    """Shards older than the retention window are deleted; newer ones survive."""
    # Cleanup derives "today" through the same override-aware path, so the
    # overrides must be cleared for the real calendar dates below to apply.
    saved = _swap_shard_env(None, None)
    try:
        with tempfile.TemporaryDirectory(prefix="test_log_shard_cleanup_") as td:
            root = Path(td)
            base = root / "service.log"
            keep_date = datetime.datetime.now(datetime.timezone.utc).date()
            old_date = keep_date - datetime.timedelta(days=31)
            recent_date = keep_date - datetime.timedelta(days=30)
            stale_path = root / f"service.{old_date.isoformat()}.log"
            recent_path = root / f"service.{recent_date.isoformat()}.log"
            today_path = root / f"service.{keep_date.isoformat()}.log"
            stale_path.write_text("stale\n", encoding="utf-8")
            recent_path.write_text("recent\n", encoding="utf-8")
            today_path.write_text("today\n", encoding="utf-8")
            log_shard.cleanup_old_daily_sharded_logs(base, retention_days=31)
            assert not stale_path.exists(), stale_path
            assert recent_path.exists(), recent_path
            assert today_path.exists(), today_path
    finally:
        _swap_shard_env(*saved)
def _load_runtime_module():
    """Render the supervisor runtime into a fresh temp directory and import it from disk.

    The module is registered in ``sys.modules`` only while it executes and is
    removed again before returning. On success the temp directory is kept and
    recorded on the module as ``_test_runtime_root``; on failure it is removed.

    Raises:
        RuntimeError: If no import spec/loader can be built for the rendered script.
    """
    import shutil

    root = Path(tempfile.mkdtemp(prefix="test_selection_supervisor_runtime_module_"))
    # Bind the name before entering ``try`` so the ``finally`` clause can never
    # hit an unbound local and mask the original failure.
    module_name = f"test_selection_supervisor_runtime_{os.getpid()}_{time.time_ns()}"
    try:
        supervisor_path = _write_runtime_script(root)
        spec = importlib.util.spec_from_file_location(module_name, supervisor_path)
        if spec is None or spec.loader is None:
            raise RuntimeError(f"failed to load runtime module spec: {supervisor_path}")
        module = importlib.util.module_from_spec(spec)
        module._test_runtime_root = root
        sys.modules[module_name] = module
        spec.loader.exec_module(module)
    except BaseException:
        # Nothing will reference the directory if loading failed; do not leak it.
        shutil.rmtree(root, ignore_errors=True)
        raise
    finally:
        sys.modules.pop(module_name, None)
    return module
"DaemonSet/test-missing-helper", "--missing-ok"], + cwd=str(root), + capture_output=True, + text=True, + timeout=10, + check=False, + ) + assert proc.returncode != 0, proc + assert "missing log shard helper next to selection_supervisor.py" in proc.stderr, proc.stderr + + def test_install_subreaper_uses_prctl() -> None: module = _load_runtime_module() @@ -561,6 +609,186 @@ def test_selection_present_requires_live_child_process() -> None: _terminate_process(supervisor) +def test_runtime_log_path_uses_daily_shard_files() -> None: + module = _load_runtime_module() + with tempfile.TemporaryDirectory(prefix="test_selection_supervisor_log_shard_") as td: + root = Path(td) + supervisor_path = _write_runtime_script(root) + child_path = root / "child.py" + child_path.write_text( + "import sys, time\n" + "print('hello-log-shard', flush=True)\n" + "time.sleep(30)\n", + encoding="utf-8", + ) + label = "DaemonSet/test-log-shard" + child_argv = [sys.executable, str(child_path)] + base_log_path = root / "test-log-shard.log" + supervisor = _run_supervisor_command( + supervisor_path=supervisor_path, + label=label, + owner_ts_ms=1, + state_json=json.dumps( + { + "kind": "DaemonSet", + "name": "test-log-shard", + "service_name": "test-log-shard", + "argv": child_argv, + "cwd": str(root), + "log_path": str(base_log_path), + }, + sort_keys=True, + ), + child_argv=child_argv, + cwd=root, + ) + try: + _wait_until_present(module, label) + deadline = time.time() + 5.0 + shard_path = root / f"test-log-shard.{time.strftime('%Y-%m-%d', time.gmtime())}.log" + while time.time() < deadline and not shard_path.exists(): + time.sleep(0.1) + assert shard_path.exists(), shard_path + assert not base_log_path.exists(), base_log_path + assert "hello-log-shard" in shard_path.read_text(encoding="utf-8", errors="replace") + finally: + _terminate_process(supervisor) + + +def test_runtime_log_path_expands_hostworkdir_env() -> None: + module = _load_runtime_module() + with 
def test_runtime_log_shards_roll_and_preserve_content_boundaries() -> None:
    """A supervised child's output rolls into a second shard when the test window elapses.

    The child prints a "before" marker, sleeps 11 seconds (longer than the
    10 second test window) and prints an "after" marker. The check expects the
    markers in two different shard files, the first shard named for the
    synthetic base date, and the pre-existing stale shard removed.
    """
    module = _load_runtime_module()
    env_keys = (
        "FLUXON_TEST_LOG_SHARD_WINDOW_SECONDS",
        "FLUXON_TEST_LOG_SHARD_ANCHOR_UNIX_SECONDS",
    )
    saved_env = {key: os.environ.get(key) for key in env_keys}
    with tempfile.TemporaryDirectory(prefix="test_selection_supervisor_log_roll_") as td:
        root = Path(td)
        supervisor_path = _write_runtime_script(root)
        child_path = root / "child.py"
        child_path.write_text(
            "import sys, time\n"
            "print('[ops-log-mgmt][phase=before] ts=' + str(int(time.time())), flush=True)\n"
            "time.sleep(11)\n"
            "print('[ops-log-mgmt][phase=after] ts=' + str(int(time.time())), flush=True)\n"
            "time.sleep(30)\n",
            encoding="utf-8",
        )
        label = "DaemonSet/test-log-roll"
        child_argv = [sys.executable, str(child_path)]
        base_log_path = root / "test-log-roll.log"
        stale_shard = root / "test-log-roll.2025-12-01.log"
        stale_shard.write_text("stale\n", encoding="utf-8")
        supervisor = None
        try:
            # Mutate the environment only inside ``try`` so a failed spawn
            # cannot leak the overrides into later checks.
            anchor = str(int(time.time()) - 2)
            os.environ["FLUXON_TEST_LOG_SHARD_WINDOW_SECONDS"] = "10"
            os.environ["FLUXON_TEST_LOG_SHARD_ANCHOR_UNIX_SECONDS"] = anchor
            supervisor = _run_supervisor_command(
                supervisor_path=supervisor_path,
                label=label,
                owner_ts_ms=1,
                state_json=json.dumps(
                    {
                        "kind": "DaemonSet",
                        "name": "test-log-roll",
                        "service_name": "test-log-roll",
                        "argv": child_argv,
                        "cwd": str(root),
                        "log_path": str(base_log_path),
                    },
                    sort_keys=True,
                ),
                child_argv=child_argv,
                cwd=root,
            )
            _wait_until_present(module, label)
            first_shard = root / "test-log-roll.2026-01-01.log"
            second_shard = None
            deadline = time.time() + 20.0
            while time.time() < deadline:
                shard_paths = sorted(root.glob("test-log-roll.*.log"))
                if len(shard_paths) >= 2:
                    second_shard = shard_paths[-1]
                if first_shard.exists() and second_shard is not None and second_shard.exists():
                    first_text = first_shard.read_text(encoding="utf-8", errors="replace")
                    second_text = second_shard.read_text(encoding="utf-8", errors="replace")
                    if "[ops-log-mgmt][phase=before]" in first_text and "[ops-log-mgmt][phase=after]" in second_text:
                        break
                time.sleep(0.2)
            assert first_shard.exists(), first_shard
            assert second_shard is not None, sorted(path.name for path in root.glob("test-log-roll.*.log"))
            assert second_shard.exists(), second_shard
            assert not stale_shard.exists(), stale_shard
            shard_names = sorted(path.name for path in root.glob("test-log-roll.*.log"))
            assert shard_names[0] == "test-log-roll.2026-01-01.log", shard_names
            assert len(shard_names) == 2, shard_names
            first_text = first_shard.read_text(encoding="utf-8", errors="replace")
            second_text = second_shard.read_text(encoding="utf-8", errors="replace")
            assert "[ops-log-mgmt][phase=before]" in first_text, first_text
            assert "[ops-log-mgmt][phase=after]" not in first_text, first_text
            assert "[ops-log-mgmt][phase=after]" in second_text, second_text
            assert "[ops-log-mgmt][phase=before]" not in second_text, second_text
        finally:
            try:
                if supervisor is not None:
                    _terminate_process(supervisor)
            finally:
                # Restore the overrides even when terminating the supervisor fails.
                for key, saved in saved_env.items():
                    if saved is None:
                        os.environ.pop(key, None)
                    else:
                        os.environ[key] = saved
after replace, got {status!r}" old_supervisor.wait(timeout=10) - old_stderr = old_supervisor.stderr.read() if old_supervisor.stderr is not None else "" + runtime_log = _read_runtime_log(root, "test-supersede") assert ( - "running generation superseded" in old_stderr - or "superseded child exited without restart" in old_stderr + "running generation superseded" in runtime_log + or "superseded child exited without restart" in runtime_log ), ( - f"expected old supervisor supersede log, stderr={old_stderr!r}" + f"expected old supervisor supersede log, runtime_log={runtime_log!r}" ) finally: _terminate_process(new_supervisor) @@ -1010,11 +1240,11 @@ def test_newer_apply_owned_overlap_with_applyless_owner_defers_retire() -> None: assert bare_supervisor.poll() is None, "old bare supervisor retired before phase-2 cutover or fallback" bare_supervisor.wait(timeout=20) - old_stderr = bare_supervisor.stderr.read() if bare_supervisor.stderr is not None else "" + runtime_log = _read_runtime_log(root, "test-phase1-overlap-applyless") assert ( - "running generation superseded" in old_stderr - or "superseded child exited without restart" in old_stderr - ), old_stderr + "running generation superseded" in runtime_log + or "superseded child exited without restart" in runtime_log + ), runtime_log finally: _terminate_process(takeover_supervisor) _terminate_process(bare_supervisor) @@ -1135,7 +1365,7 @@ def test_retire_adopted_children_stops_live_roots() -> None: calls: List[tuple[str, object]] = [] try: module._direct_live_child_pids = lambda pid: [41, 42] if pid == module.os.getpid() else [] - module._iter_live_supervisors = lambda label=None: [] + module._iter_live_supervisors = lambda label=None, scope_key=None: [] module._stop_pid_tree_batch = lambda roots, label: calls.append(("stop", (list(roots), label))) module._reap_terminated_children = lambda: [(41, 0), (42, 0)] module._log_reaped_children = lambda **kwargs: calls.append(("reap", kwargs)) @@ -1160,7 +1390,7 @@ def 
def test_failed_status_resolves_daily_sharded_service_log_tail() -> None:
    """A failed bare launch reports the dated service shard and includes its tail."""
    module = _load_start_test_bed_module()
    with tempfile.TemporaryDirectory(prefix="test_start_test_bed_sharded_failure_tails_") as td:
        root = Path(td)
        log_dir = root / "log"

        bootstrap_log = root / "fluxon_core_controller.bootstrap.log"
        bootstrap_log.write_text("[rollout] probable-ready failed svc=owner\n", encoding="utf-8")

        # Only the dated shard exists on disk; the un-dated base log is never written.
        base_service_log = log_dir / "master.log"
        base_service_log.parent.mkdir(parents=True, exist_ok=True)
        sharded_service_log = log_dir / "master.2026-06-23.log"
        sharded_service_log.write_text("FATAL: owner bootstrap dependency failed\n", encoding="utf-8")

        result = _build_result(
            bootstrap_log_path=bootstrap_log,
            launcher_rc=1,
            selection_name="fluxon_core_controller",
            bare_script_name="fluxon_core_controller",
            node_name="node-a",
            expected_service_names=["master"],
        )
        statuses = module._collect_bare_runtime_statuses(
            deployconf={},
            cluster_nodes={},
            local_node_cfg={"hostname": "node-a", "hostworkdir": str(root)},
            result=result,
        )

        assert len(statuses) == 1, statuses
        status = statuses[0]
        assert status["present"] is False, status
        assert status["running"] is False, status
        assert status["log_path"] == str(sharded_service_log.resolve()), status

        err = status["status_error"]
        assert isinstance(err, str) and "bootstrap_log_tail=" in err, err
        assert "service_log_tail=" in err, err
        assert "owner bootstrap dependency failed" in err, err
    print("PASS: test_failed_status_resolves_daily_sharded_service_log_tail")
normalized["service"]["tikv"] + assert normalized["service"]["master"]["port"] == 19290, normalized["service"]["master"] assert "port: 19290" in normalized["service"]["master"]["entrypoint"], normalized["service"]["master"]["entrypoint"] assert "OPS_AGENT_P2P_LISTEN_PORT=19320" in normalized["service"]["ops_agent"]["entrypoint"], normalized["service"]["ops_agent"]["entrypoint"] assert "OPS_AGENT_P2P_LISTEN_PORT=19321" in normalized["service"]["ops_agent"]["entrypoint"], normalized["service"]["ops_agent"]["entrypoint"] @@ -845,11 +888,35 @@ def test_normalize_bootstrap_deployconf_keeps_non_local_or_single_node_ports_unc }, } normalized, notes = module._normalize_bootstrap_deployconf(deployconf=deployconf) - assert normalized == deployconf, normalized + assert normalized["service"]["master"]["port"] == 51051, normalized["service"]["master"] + expected = copy.deepcopy(deployconf) + expected["service"]["master"]["port"] = 51051 + assert normalized == expected, normalized assert notes == [], notes print("PASS: test_normalize_bootstrap_deployconf_keeps_non_local_or_single_node_ports_unchanged") +def test_normalize_bootstrap_deployconf_promotes_master_port_from_entrypoint() -> None: + module = _load_start_test_bed_module() + deployconf = { + "service": { + "master": { + "entrypoint": ( + 'cat > "${CONFIG_PATH}" < None: module = _load_start_test_bed_module() with tempfile.TemporaryDirectory(prefix="test_start_test_bed_refresh_bare_") as td: @@ -1476,6 +1543,10 @@ def main() -> int: "normalize_bootstrap_deployconf_keeps_non_local_or_single_node_ports_unchanged", test_normalize_bootstrap_deployconf_keeps_non_local_or_single_node_ports_unchanged, ), + ( + "normalize_bootstrap_deployconf_promotes_master_port_from_entrypoint", + test_normalize_bootstrap_deployconf_promotes_master_port_from_entrypoint, + ), ( "refresh_cluster_bare_deploy_scripts_copies_local_and_remote_nodes", test_refresh_cluster_bare_deploy_scripts_copies_local_and_remote_nodes, diff --git 
a/deployment/utils/deployconf_config_utils.py b/deployment/utils/deployconf_config_utils.py index 4244526..b312c6d 100644 --- a/deployment/utils/deployconf_config_utils.py +++ b/deployment/utils/deployconf_config_utils.py @@ -18,8 +18,7 @@ "load_deployconf_prometheus_base_url", "load_deployconf_prom_remote_write_url", "load_deployconf_fluxon_cluster_name", - "load_deployconf_fluxon_shared_memory_path", - "load_deployconf_fluxon_shared_file_path", + "load_deployconf_fluxon_share_mem_path", "load_deployconf_service_ip_port", ] @@ -140,7 +139,7 @@ def load_deployconf_fluxon_cluster_name(*, config_path: Path) -> str: return raw.strip() -def load_deployconf_fluxon_shared_memory_path(*, config_path: Path) -> str: +def load_deployconf_fluxon_share_mem_path(*, config_path: Path) -> str: global_envs = load_deployconf_resolved_global_envs(config_path=config_path) raw = global_envs.get("FLUXON_SHARED_MEM") if not isinstance(raw, str) or not raw.strip(): @@ -148,14 +147,6 @@ def load_deployconf_fluxon_shared_memory_path(*, config_path: Path) -> str: return raw.strip() -def load_deployconf_fluxon_shared_file_path(*, config_path: Path) -> str: - global_envs = load_deployconf_resolved_global_envs(config_path=config_path) - raw = global_envs.get("FLUXON_SHARED_FILE") - if not isinstance(raw, str) or not raw.strip(): - raise ValueError("deployconf.global_envs.FLUXON_SHARED_FILE must resolve to a non-empty string") - return raw.strip() - - def load_deployconf_service_ip_port(*, config_path: Path, service_name: str) -> Tuple[str, int]: cfg = load_deployconf_mapping(config_path=config_path) cluster_nodes = cfg.get("cluster_nodes") diff --git a/deployment/utils/log_shard.py b/deployment/utils/log_shard.py new file mode 100644 index 0000000..415d4ff --- /dev/null +++ b/deployment/utils/log_shard.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import datetime +import os +from pathlib import Path +from typing import Optional + + 
# Number of daily shards kept by default, counting the current one.
DEFAULT_DAILY_LOG_RETENTION_DAYS = 31
# Test-only overrides: when the window variable is set, a shard "day" lasts that
# many seconds, counted from the anchor timestamp.
TEST_LOG_SHARD_WINDOW_SECONDS_ENV = "FLUXON_TEST_LOG_SHARD_WINDOW_SECONDS"
TEST_LOG_SHARD_ANCHOR_UNIX_SECONDS_ENV = "FLUXON_TEST_LOG_SHARD_ANCHOR_UNIX_SECONDS"
# Synthetic date assigned to the first test window.
TEST_LOG_SHARD_BASE_DATE = datetime.date(2026, 1, 1)


def _read_test_log_shard_window_seconds() -> Optional[int]:
    """Return the test window length in seconds, or None when the override is unset or blank.

    Raises:
        ValueError: If the variable is not an integer or is not positive.
    """
    raw_value = os.environ.get(TEST_LOG_SHARD_WINDOW_SECONDS_ENV)
    if raw_value is None:
        return None
    text = raw_value.strip()
    if not text:
        return None
    window_seconds = int(text)
    if window_seconds <= 0:
        raise ValueError(
            f"{TEST_LOG_SHARD_WINDOW_SECONDS_ENV} must be a positive integer, got: {raw_value!r}"
        )
    return window_seconds


def _read_test_log_shard_anchor_unix_seconds() -> int:
    """Return the test anchor timestamp; it is mandatory once the window override is set.

    Raises:
        ValueError: If the variable is unset, blank, or not an integer.
    """
    raw_value = os.environ.get(TEST_LOG_SHARD_ANCHOR_UNIX_SECONDS_ENV)
    if raw_value is None or not raw_value.strip():
        raise ValueError(
            f"{TEST_LOG_SHARD_ANCHOR_UNIX_SECONDS_ENV} is required when "
            f"{TEST_LOG_SHARD_WINDOW_SECONDS_ENV} is set"
        )
    return int(raw_value.strip())


def _resolve_shard_date(ts: datetime.datetime) -> datetime.date:
    """Map a timestamp to the date used in the shard file name.

    Normally this is ``ts.date()``. With the test window override, each window
    after the anchor maps to one synthetic day after ``TEST_LOG_SHARD_BASE_DATE``.

    Raises:
        ValueError: If ``ts`` lies before the configured test anchor.
    """
    window_seconds = _read_test_log_shard_window_seconds()
    if window_seconds is None:
        return ts.date()
    anchor_unix_seconds = _read_test_log_shard_anchor_unix_seconds()
    unix_seconds = int(ts.timestamp())
    bucket_index = (unix_seconds - anchor_unix_seconds) // window_seconds
    if bucket_index < 0:
        raise ValueError(
            "test log shard anchor must not be in the future: "
            f"anchor={anchor_unix_seconds}, ts={unix_seconds}"
        )
    return TEST_LOG_SHARD_BASE_DATE + datetime.timedelta(days=bucket_index)


def daily_sharded_log_path(
    base_path: Path,
    *,
    now: Optional[datetime.datetime] = None,
) -> Path:
    """Return the resolved shard path ``<stem>.<YYYY-MM-DD>.log`` for ``now`` (default: current UTC time).

    Raises:
        ValueError: If ``base_path`` does not end with ``.log``.
    """
    ts = datetime.datetime.now(datetime.timezone.utc) if now is None else now.astimezone(datetime.timezone.utc)
    name = base_path.name
    if not name.endswith(".log"):
        raise ValueError(f"log base path must end with .log: {base_path}")
    stem = name[:-4]
    shard_date = _resolve_shard_date(ts)
    return (base_path.parent / f"{stem}.{shard_date.isoformat()}.log").resolve()


def _iter_dated_shards(base_path: Path):
    """Yield ``(shard_date, path)`` for every regular file named ``<stem>.<ISO date>.log`` beside ``base_path``.

    ``base_path`` must end with ``.log`` and its parent directory must exist.
    """
    prefix = base_path.name[:-4] + "."
    suffix = ".log"
    for path in base_path.parent.iterdir():
        if not path.is_file():
            continue
        entry_name = path.name
        if not entry_name.startswith(prefix) or not entry_name.endswith(suffix):
            continue
        try:
            shard_date = datetime.date.fromisoformat(entry_name[len(prefix):-len(suffix)])
        except ValueError:
            # Not a dated shard (for example the un-dated base log itself).
            continue
        yield shard_date, path


def latest_existing_daily_sharded_log_path(base_path: Path) -> Optional[Path]:
    """Return the newest dated shard on disk, else the base file if it exists, else None."""
    if base_path.name.endswith(".log") and base_path.parent.exists():
        shards = list(_iter_dated_shards(base_path))
        if shards:
            return max(shards, key=lambda item: item[0])[1].resolve()
    return base_path.resolve() if base_path.exists() else None


def resolve_readable_log_path(base_path: Path) -> Optional[Path]:
    """Return the best file to read for ``base_path``.

    Prefers the current shard, then the newest existing shard, then the base
    file. A base path that does not end with ``.log`` is never sharded, so it
    goes straight to the fallback instead of raising.
    """
    if base_path.name.endswith(".log"):
        current = daily_sharded_log_path(base_path)
        if current.exists():
            return current
    return latest_existing_daily_sharded_log_path(base_path)


def cleanup_old_daily_sharded_logs(
    base_path: Path,
    *,
    retention_days: int = DEFAULT_DAILY_LOG_RETENTION_DAYS,
) -> None:
    """Delete dated shards older than the retention window; the current shard is always kept.

    Creates the parent directory when missing. Does nothing for a base path
    that does not end with ``.log``.
    """
    if not base_path.name.endswith(".log"):
        return
    current_shard_date = _resolve_shard_date(datetime.datetime.now(datetime.timezone.utc))
    keep_since = current_shard_date - datetime.timedelta(days=max(int(retention_days) - 1, 0))
    base_path.parent.mkdir(parents=True, exist_ok=True)
    # Materialize the listing before deleting so no entry is removed mid-iteration.
    for shard_date, path in list(_iter_dated_shards(base_path)):
        if shard_date < keep_since:
            try:
                path.unlink()
            except FileNotFoundError:
                # Another process removed it first; the goal is already met.
                pass


def render_module_source() -> str:
    """Return this module's own source text, read from its file on disk."""
    module_path = Path(__file__).resolve()
    return module_path.read_text(encoding="utf-8")


def import_sibling_log_shard():
    """Load ``log_shard.py`` from this file's directory under a private module name, reusing a cached load."""
    import importlib.util
    import sys

    helper_path = Path(__file__).resolve().with_name("log_shard.py")
    module_name = "_fluxon_log_shard_runtime"
    loaded = sys.modules.get(module_name)
    if loaded is not None:
        return loaded
    spec = importlib.util.spec_from_file_location(module_name, helper_path)
    if spec is None or spec.loader is None:
        raise RuntimeError(f"failed to load log shard helper: {helper_path}")
    module = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = module
    spec.loader.exec_module(module)
    return module


def _write_all(fp, data: bytes) -> None:
    """Write all of ``data`` to an unbuffered binary file, retrying after short writes."""
    view = memoryview(data)
    while view:
        written = fp.write(view)
        if not written:
            # No progress and no exception: stop instead of spinning forever.
            break
        view = view[written:]


def relay_fd_to_daily_sharded_logs(
    *,
    base_log_path: str,
    read_fd: int,
    retention_days: int = DEFAULT_DAILY_LOG_RETENTION_DAYS,
) -> None:
    """Copy bytes from ``read_fd`` into the current shard until EOF or a read error.

    The target shard is re-evaluated for every chunk. When it changes, the
    previous file is closed and old shards are pruned before the new one is
    opened in append mode. ``read_fd`` is always closed on exit.
    """
    base_path = Path(os.path.abspath(base_log_path))
    current_path: Optional[Path] = None
    current_fp = None
    try:
        while True:
            try:
                chunk = os.read(read_fd, 65536)
            except OSError:
                break
            if not chunk:
                break
            next_path = daily_sharded_log_path(base_path)
            if current_path != next_path:
                if current_fp is not None:
                    current_fp.flush()
                    current_fp.close()
                cleanup_old_daily_sharded_logs(base_path, retention_days=retention_days)
                next_path.parent.mkdir(parents=True, exist_ok=True)
                current_fp = next_path.open("ab", buffering=0)
                current_path = next_path
            # The file is opened unbuffered, so one write() may accept only
            # part of the chunk; loop until everything is written.
            _write_all(current_fp, chunk)
    finally:
        if current_fp is not None:
            current_fp.flush()
            current_fp.close()
        os.close(read_fd)
+ # - During this startup window we do not probe service ports or readiness endpoints. + # - A child exit or restart inside the window is treated as startup failure even if the + # supervisor process itself stays alive and restarts again later. svc="$1" root_pid="$2" - stable_seconds="$3" - deadline_ts="$4" + startup_window_seconds="$3" + startup_deadline_seconds="$4" context="$5" - if [[ ! "$stable_seconds" =~ ^[0-9]+$ ]] || [ "$stable_seconds" -le 0 ]; then - echo "$context probable-ready: invalid stable_seconds=$stable_seconds svc=$svc" + if [[ ! "$startup_window_seconds" =~ ^[0-9]+$ ]] || [ "$startup_window_seconds" -le 0 ]; then + echo "$context probable-ready: invalid startup_window_seconds=$startup_window_seconds svc=$svc" return 1 fi - if [[ ! "$deadline_ts" =~ ^[0-9]+$ ]] || [ "$deadline_ts" -le 0 ]; then - echo "$context probable-ready: invalid deadline_ts=$deadline_ts svc=$svc" + if [[ ! "$startup_deadline_seconds" =~ ^[0-9]+$ ]] || [ "$startup_deadline_seconds" -le 0 ]; then + echo "$context probable-ready: invalid startup_deadline_seconds=$startup_deadline_seconds svc=$svc" return 1 fi - ok_s=0 + startup_window_ms=$(( startup_window_seconds * 1000 )) + startup_deadline_ms=$(( startup_deadline_seconds * 1000 )) + started_at_monotonic_ms="$(_now_monotonic_ms)" + deadline_monotonic_ms=$(( started_at_monotonic_ms + startup_deadline_ms )) + observed_child_pid="" + observed_child_since_monotonic_ms="" while true; do - now=$(date +%s) - if [ "$now" -ge "$deadline_ts" ]; then - echo "$context probable-ready: deadline exceeded svc=$svc stable_seconds=$stable_seconds pid=$root_pid" - return 1 - fi - if ! 
_pid_exists "$root_pid"; then echo "$context probable-ready: supervisor pid exited svc=$svc pid=$root_pid" return 1 fi - if _pid_tree_has_child_process "$root_pid"; then - ok_s=$((ok_s+1)) - if [ "$ok_s" -ge "$stable_seconds" ]; then - echo "$context probable-ready: ok svc=$svc stable_seconds=$stable_seconds pid=$root_pid" - return 0 + current_child_pids="$(_pid_tree_direct_child_pids "$root_pid" 2>/dev/null || true)" + current_child_pid="" + if [ -n "$current_child_pids" ]; then + set -- $current_child_pids + if [ "$#" -ne 1 ]; then + echo "$context probable-ready: multiple direct child pids svc=$svc supervisor_pid=$root_pid child_pids=$current_child_pids" + return 1 fi - else - if [ "$ok_s" -ne 0 ]; then - echo "$context probable-ready: reset svc=$svc ok_s=$ok_s missing_child=true" + current_child_pid="$1" + fi + + now_monotonic_ms="$(_now_monotonic_ms)" + if [ -z "$current_child_pid" ]; then + if [ -n "$observed_child_pid" ]; then + echo "$context probable-ready: child pid exited svc=$svc supervisor_pid=$root_pid child_pid=$observed_child_pid" + return 1 fi - ok_s=0 + elif [ -z "$observed_child_pid" ]; then + observed_child_pid="$current_child_pid" + observed_child_since_monotonic_ms="$now_monotonic_ms" + elif [ "$current_child_pid" != "$observed_child_pid" ]; then + echo "$context probable-ready: child pid changed svc=$svc supervisor_pid=$root_pid child_pid=$observed_child_pid replacement_child_pid=$current_child_pid" + return 1 + fi + + if [ -n "$observed_child_since_monotonic_ms" ] && [ $(( now_monotonic_ms - observed_child_since_monotonic_ms )) -ge "$startup_window_ms" ]; then + echo "$context probable-ready: ok svc=$svc startup_window_seconds=$startup_window_seconds supervisor_pid=$root_pid child_pid=$observed_child_pid" + return 0 + fi + + if [ "$now_monotonic_ms" -ge "$deadline_monotonic_ms" ]; then + if [ -z "$observed_child_pid" ]; then + echo "$context probable-ready: no child pid observed svc=$svc supervisor_pid=$root_pid 
startup_window_seconds=$startup_window_seconds startup_deadline_seconds=$startup_deadline_seconds" + return 1 + fi + echo "$context probable-ready: child pid not stable long enough svc=$svc supervisor_pid=$root_pid child_pid=$observed_child_pid observed_for_ms=$(( now_monotonic_ms - observed_child_since_monotonic_ms )) startup_window_seconds=$startup_window_seconds startup_deadline_seconds=$startup_deadline_seconds" + return 1 fi - sleep 1 + sleep 0.2 done }} diff --git a/deployment/utils/selection_supervisor_codegen.py b/deployment/utils/selection_supervisor_codegen.py index 2945ff5..cc27872 100644 --- a/deployment/utils/selection_supervisor_codegen.py +++ b/deployment/utils/selection_supervisor_codegen.py @@ -13,6 +13,7 @@ PYTHON_SELECTION_SUPERVISOR_FILENAME = "selection_supervisor.py" +LOG_SHARD_HELPER_FILENAME = "log_shard.py" def render_python_selection_supervisor_module(*, timeouts) -> str: @@ -42,11 +43,13 @@ def render_python_selection_supervisor_module(*, timeouts) -> str: import enum import fcntl import hashlib +import importlib.util import json import os import signal import subprocess import sys +import threading import time from dataclasses import dataclass from pathlib import Path @@ -62,6 +65,27 @@ def render_python_selection_supervisor_module(*, timeouts) -> str: SANITIZED_CHILD_ENV_KEYS = ("RDMAV_DRIVERS", "IBV_DRIVERS") _shutdown_requested = False +_STDIO_ROUTER_THREAD = None +_STDIO_ROUTER_KEEPALIVE_FP = None + + +def _load_log_shard_helper(): + raw_file = globals().get("__file__") + if not isinstance(raw_file, str) or not raw_file: + raise RuntimeError("selection_supervisor.py requires __file__ to resolve log shard helper") + helper_path = Path(raw_file).resolve().with_name("__LOG_SHARD_HELPER_FILENAME__") + if not helper_path.is_file(): + raise RuntimeError(f"missing log shard helper next to selection_supervisor.py: {helper_path}") + spec = importlib.util.spec_from_file_location("_fluxon_selection_log_shard", helper_path) + if spec is None or 
spec.loader is None: + raise RuntimeError(f"failed to load log shard helper: {helper_path}") + module = importlib.util.module_from_spec(spec) + sys.modules[spec.name] = module + spec.loader.exec_module(module) + return module + + +_LOG_SHARD = _load_log_shard_helper() def main() -> int: @@ -96,6 +120,8 @@ def main() -> int: stop_parser.add_argument("--missing-ok", action="store_true") args = parser.parse_args() + runtime_state_for_stdio = _runtime_state_for_startup_stdio(args) + _redirect_process_stdio_to_runtime_log(runtime_state_for_stdio) # English note: # - The supervisor module is invoked both as a long-running `run` daemon and as a short-lived # `stop` helper from ops-managed reconcile loops. @@ -356,6 +382,16 @@ def _parse_run_command_spec(args: argparse.Namespace) -> RunCommandSpec: ) +def _runtime_state_for_startup_stdio(args: argparse.Namespace) -> Optional[SelectionRuntimeState]: + if str(args.command) != "run": + return None + label = _require_non_empty_str(args.label, "label") + state_json = args.state_json + if state_json is None: + return None + return _build_runtime_state(label=label, state_json=state_json) + + def _requested_phase1_overlap_with_applyless_owner( current_owner: Optional[LiveSupervisor], requested_runtime_state: Optional[SelectionRuntimeState], @@ -438,6 +474,7 @@ def _run_supervisor(spec: RunCommandSpec, selection_lock_fp=None) -> int: restart_timestamps: List[float] = [] backoff_seconds = spec.restart_delay_seconds + _redirect_process_stdio_to_runtime_log(runtime_state) while True: _log_reaped_children( @@ -661,6 +698,10 @@ def _sanitize_child_ld_library_path(raw_value: Optional[str]) -> Optional[str]: return ":".join(sanitized_entries) +def _expand_runtime_state_path(value: str) -> str: + return os.path.expandvars(value) + + def _spawn_child(command: List[str], workdir: Optional[Path]) -> subprocess.Popen[bytes]: def _set_pdeathsig_sigterm() -> None: libc = ctypes.CDLL("libc.so.6", use_errno=True) @@ -687,6 +728,40 @@ def 
_set_pdeathsig_sigterm() -> None: ) +def _redirect_process_stdio_to_runtime_log(runtime_state: Optional[SelectionRuntimeState]) -> None: + global _STDIO_ROUTER_THREAD + global _STDIO_ROUTER_KEEPALIVE_FP + if runtime_state is None: + return + if _STDIO_ROUTER_THREAD is not None: + return + base_log_path = _require_non_empty_str(runtime_state.log_path, "state.log_path") + read_fd, write_fd = os.pipe() + router_keepalive = os.dup(write_fd) + + def _router_loop() -> None: + _LOG_SHARD.relay_fd_to_daily_sharded_logs( + base_log_path=base_log_path, + read_fd=read_fd, + retention_days=_LOG_SHARD.DEFAULT_DAILY_LOG_RETENTION_DAYS, + ) + + router = threading.Thread( + target=_router_loop, + name="selection-supervisor-stdio-log-router", + daemon=True, + ) + router.start() + os.dup2(write_fd, sys.stdout.fileno()) + os.dup2(write_fd, sys.stderr.fileno()) + sys.stdout = os.fdopen(sys.stdout.fileno(), "w", encoding="utf-8", buffering=1, closefd=False) + sys.stderr = os.fdopen(sys.stderr.fileno(), "w", encoding="utf-8", buffering=1, closefd=False) + try: + os.close(write_fd) + except OSError: + pass + _STDIO_ROUTER_KEEPALIVE_FP = os.fdopen(router_keepalive, "w", encoding="utf-8", buffering=1) + _STDIO_ROUTER_THREAD = router def _retired_and_preserved_adopted_roots(root_pid: int) -> Tuple[List[int], List[int]]: adopted_roots = _direct_live_child_pids(root_pid) if not adopted_roots: @@ -788,7 +863,9 @@ def _selection_runtime_state_from_raw( apply_id=_require_optional_non_empty_str(raw.get("apply_id"), "state.apply_id"), argv=_require_non_empty_str_list(raw.get("argv"), "state.argv"), cwd=_require_optional_non_empty_str(raw.get("cwd"), "state.cwd"), - log_path=_require_non_empty_str(raw.get("log_path"), "state.log_path"), + log_path=_expand_runtime_state_path( + _require_non_empty_str(raw.get("log_path"), "state.log_path") + ), owner_ts_ms=owner_ts_ms, started_ts_ms=started_ts_ms, ) @@ -924,6 +1001,17 @@ def _iter_process_cmdlines() -> List[tuple[int, List[str]]]: return out +def 
_iter_process_snapshots() -> List[tuple[ProcessInfo, List[str]]]: + infos_by_pid = {info.pid: info for info in _iter_process_infos()} + out: List[tuple[ProcessInfo, List[str]]] = [] + for pid, args in _iter_process_cmdlines(): + process_info = infos_by_pid.get(pid) + if process_info is None or process_info.is_zombie: + continue + out.append((process_info, args)) + return out + + def _arg_value(args: List[str], flag: str) -> Optional[str]: for idx, arg in enumerate(args[:-1]): if arg == flag: @@ -1068,13 +1156,11 @@ def _iter_process_infos() -> List[ProcessInfo]: def _iter_live_supervisors(label: Optional[str] = None, *, scope_key: Optional[str] = None) -> List[LiveSupervisor]: out: List[LiveSupervisor] = [] - for pid, args in _iter_process_cmdlines(): + for process_info, args in _iter_process_snapshots(): supervisor_command = _find_selection_supervisor_command(args) if supervisor_command is None: continue - process_info = _find_process_info(pid) - if process_info is None or process_info.is_zombie: - continue + pid = process_info.pid runtime_label = _arg_value(args, "--label") if runtime_label is None: raise RuntimeError(f"running selection supervisor is missing --label pid={pid}") @@ -1337,6 +1423,7 @@ def _signal_pid_tree(root_pid: int, sig: signal.Signals, label: str) -> None: """ return ( textwrap.dedent(template) + .replace("__LOG_SHARD_HELPER_FILENAME__", LOG_SHARD_HELPER_FILENAME) .replace("__TERM_S__", str(term_s)) .replace("__KILL_S__", str(kill_s)) .replace("__SUPERSEDE_S__", str(supersede_s)) diff --git a/examples/external_put_get_del.py b/examples/external_put_get_del.py index c1ee8c3..834425d 100644 --- a/examples/external_put_get_del.py +++ b/examples/external_put_get_del.py @@ -4,8 +4,7 @@ INSTANCE_KEY = "demo_kv_external" CLUSTER_NAME = "demo-kv-cluster" -SHARED_MEMORY_PATH = "/dev/shm/fluxon_kv_demo" -SHARED_FILE_PATH = "/tmp/fluxon_kv_demo/shared" +SHARE_MEM_PATH = "/dev/shm/fluxon_kv_demo" def main() -> None: @@ -14,8 +13,7 @@ def main() -> None: 
"instance_key": INSTANCE_KEY, "fluxonkv_spec": { "cluster_name": CLUSTER_NAME, - "shared_memory_path": SHARED_MEMORY_PATH, - "shared_file_path": SHARED_FILE_PATH, + "share_mem_path": SHARE_MEM_PATH, }, "test_spec_config": { "disable_observability": True, diff --git a/examples/fluxon_quick_start/start.py b/examples/fluxon_quick_start/start.py index 17fd8c2..32a1639 100644 --- a/examples/fluxon_quick_start/start.py +++ b/examples/fluxon_quick_start/start.py @@ -473,10 +473,9 @@ def _load_config_from_b64(config_b64: str) -> Dict[str, Any]: def _resolve_fluxonkv_spec_paths(*, spec: Dict[str, Any], workdir: Path) -> Dict[str, Any]: resolved = dict(spec) - for field_name in ("shared_memory_path", "shared_file_path"): - raw_path = resolved.get(field_name) - if isinstance(raw_path, str) and raw_path and not Path(raw_path).is_absolute(): - resolved[field_name] = str((workdir / raw_path).resolve()) + raw_path = resolved.get("share_mem_path") + if isinstance(raw_path, str) and raw_path and not Path(raw_path).is_absolute(): + resolved["share_mem_path"] = str((workdir / raw_path).resolve()) return resolved @@ -528,10 +527,13 @@ def _monitoring_block(greptime_http_port: int) -> Dict[str, Any]: } +def _owner_large_file_paths(workdir: Path) -> List[str]: + return [str(workdir / "large" / "owner")] + + def _gen_kv_config(etcd_ep: str, cluster: str, master_port: int, kv_http_port: int, panel_port: int, greptime_http_port: int, workdir: Path) -> Dict[str, Any]: shm = str(workdir / "sharemem") - shared_file_path = str(workdir / "sharefile") log_dir = str(workdir / "log" / "master") master_cfg: Dict[str, Any] = { "etcd_endpoints": [etcd_ep], @@ -551,9 +553,9 @@ def _gen_kv_config(etcd_ep: str, cluster: str, master_port: int, kv_http_port: i "fluxonkv_spec": { "etcd_addresses": [etcd_ep], "cluster_name": cluster, - "shared_memory_path": shm, - "shared_file_path": shared_file_path, + "share_mem_path": shm, "sub_cluster": "default", + "large_file_paths": _owner_large_file_paths(workdir), 
}, }, "kvexternal_rexport_httpserver_http": { @@ -565,8 +567,7 @@ def _gen_kv_config(etcd_ep: str, cluster: str, master_port: int, kv_http_port: i "instance_key": "qs_http_accessor", "fluxonkv_spec": { "cluster_name": cluster, - "shared_memory_path": shm, - "shared_file_path": shared_file_path, + "share_mem_path": shm, }, }, } @@ -576,7 +577,6 @@ def _gen_kv_config(etcd_ep: str, cluster: str, master_port: int, kv_http_port: i def _gen_mq_config(etcd_ep: str, cluster: str, master_port: int, greptime_http_port: int, workdir: Path, panel_port: int = 0) -> Dict[str, Any]: shm = str(workdir / "sharemem") - shared_file_path = str(workdir / "sharefile") log_dir = str(workdir / "log" / "master") master_cfg: Dict[str, Any] = { "etcd_endpoints": [etcd_ep], @@ -596,17 +596,16 @@ def _gen_mq_config(etcd_ep: str, cluster: str, master_port: int, greptime_http_p "fluxonkv_spec": { "etcd_addresses": [etcd_ep], "cluster_name": cluster, - "shared_memory_path": shm, - "shared_file_path": shared_file_path, + "share_mem_path": shm, "sub_cluster": "default", + "large_file_paths": _owner_large_file_paths(workdir), }, }, "kvexternal": { "instance_key": "qs_mq_external", "fluxonkv_spec": { "cluster_name": cluster, - "shared_memory_path": shm, - "shared_file_path": shared_file_path, + "share_mem_path": shm, }, }, "mpmc_demo": { @@ -630,7 +629,6 @@ def _gen_mq_config(etcd_ep: str, cluster: str, master_port: int, greptime_http_p def _gen_fs_config(etcd_ep: str, cluster: str, master_port: int, panel_port: int, greptime_http_port: int, workdir: Path) -> Dict[str, Any]: shm = str(workdir / "sharemem") - shared_file_path = str(workdir / "sharefile") log_dir = str(workdir / "log" / "master") remote_root_dir = str(workdir / "fs_remote_root") access_db_path = str(workdir / "fs_master" / "access.db") @@ -654,9 +652,9 @@ def _gen_fs_config(etcd_ep: str, cluster: str, master_port: int, panel_port: int "fluxonkv_spec": { "etcd_addresses": [etcd_ep], "cluster_name": cluster, - "shared_memory_path": shm, 
- "shared_file_path": shared_file_path, + "share_mem_path": shm, "sub_cluster": "default", + "large_file_paths": _owner_large_file_paths(workdir), }, }, "fs_master": { @@ -664,8 +662,7 @@ def _gen_fs_config(etcd_ep: str, cluster: str, master_port: int, panel_port: int "instance_key": "qs_fs_master", "fluxonkv_spec": { "cluster_name": cluster, - "shared_memory_path": shm, - "shared_file_path": shared_file_path, + "share_mem_path": shm, }, }, "fluxon_fs": { @@ -711,8 +708,7 @@ def _gen_fs_config(etcd_ep: str, cluster: str, master_port: int, panel_port: int "instance_key": "qs_fs_agent", "fluxonkv_spec": { "cluster_name": cluster, - "shared_memory_path": shm, - "shared_file_path": shared_file_path, + "share_mem_path": shm, }, }, "fluxon_fs": { @@ -980,19 +976,19 @@ def _wait_for_process_tcp_ready_best_effort( return False -def _kvclient_shared_json_target(shared_file_path: Path, cluster_name: str) -> Path: - return shared_file_path / cluster_name / "shared.json" +def _kvclient_shared_json_target(share_mem_path: Path, cluster_name: str) -> Path: + return share_mem_path / cluster_name / "shared.json" -def _clear_stale_shared_json(shared_file_path: Path, cluster_name: str) -> None: - target = _kvclient_shared_json_target(shared_file_path, cluster_name) +def _clear_stale_shared_json(share_mem_path: Path, cluster_name: str) -> None: + target = _kvclient_shared_json_target(share_mem_path, cluster_name) if target.exists(): print(f"[quick_start] removing stale shared.json: {target}") target.unlink() def _wait_for_shared_json( - shared_file_path: Path, + share_mem_path: Path, cluster_name: str, timeout: int = 180, *, @@ -1001,7 +997,7 @@ def _wait_for_shared_json( log_path: Optional[Path] = None, ) -> None: """Block until shared.json appears (owner kvclient ready).""" - target = _kvclient_shared_json_target(shared_file_path, cluster_name) + target = _kvclient_shared_json_target(share_mem_path, cluster_name) target_dir = target.parent deadline = time.time() + timeout elapsed = 
0 @@ -1036,7 +1032,7 @@ def _start_cluster_infra( etcd_log_path = workdir / "log" / "etcd.log" master_log_path = workdir / "log" / "master.log" kvclient_log_path = workdir / "log" / "kvclient.log" - shared_file_path = _kvclient_shared_file_path_from_cfg(cfg) + share_mem_path = _kvclient_share_mem_path_from_cfg(cfg) cluster_name = _kvclient_cluster_name_from_cfg(cfg) log_dir = workdir / "log" @@ -1102,10 +1098,10 @@ def _start_cluster_infra( ) print("[quick_start] starting kvclient...") - _clear_stale_shared_json(shared_file_path, cluster_name) + _clear_stale_shared_json(share_mem_path, cluster_name) kvclient_proc = _start_kvclient(cfg["kvclient"], workdir) _wait_for_shared_json( - shared_file_path, + share_mem_path, cluster_name, proc=kvclient_proc, label="kvclient", @@ -1113,16 +1109,16 @@ def _start_cluster_infra( ) -def _kvclient_shared_file_path_from_cfg(cfg: Dict[str, Any]) -> Path: +def _kvclient_share_mem_path_from_cfg(cfg: Dict[str, Any]) -> Path: kvclient_cfg = cfg.get("kvclient") if not isinstance(kvclient_cfg, dict): raise ValueError("missing kvclient config") spec = kvclient_cfg.get("fluxonkv_spec") if not isinstance(spec, dict): raise ValueError("missing kvclient.fluxonkv_spec config") - raw_path = spec.get("shared_file_path") + raw_path = spec.get("share_mem_path") if not isinstance(raw_path, str) or not raw_path: - raise ValueError("kvclient.fluxonkv_spec.shared_file_path must be a non-empty string") + raise ValueError("kvclient.fluxonkv_spec.share_mem_path must be a non-empty string") return Path(raw_path) diff --git a/examples/rpc_call.py b/examples/rpc_call.py index 250f2fa..4afd331 100644 --- a/examples/rpc_call.py +++ b/examples/rpc_call.py @@ -8,8 +8,7 @@ RPC_SERVER_INSTANCE_KEY = "demo_rpc_server" RPC_CLIENT_INSTANCE_KEY = "demo_rpc_client" CLUSTER_NAME = "demo-kv-cluster" -SHARED_MEMORY_PATH = "/dev/shm/fluxon_kv_demo" -SHARED_FILE_PATH = "/tmp/fluxon_kv_demo/shared" +SHARE_MEM_PATH = "/dev/shm/fluxon_kv_demo" def main() -> None: @@ -43,8 
+42,7 @@ def _build_config(*, instance_key: str) -> FluxonKvClientConfig: "instance_key": instance_key, "fluxonkv_spec": { "cluster_name": CLUSTER_NAME, - "shared_memory_path": SHARED_MEMORY_PATH, - "shared_file_path": SHARED_FILE_PATH, + "share_mem_path": SHARE_MEM_PATH, }, "test_spec_config": { "disable_observability": True, diff --git a/examples/start_kv_and_fs_svc.py b/examples/start_kv_and_fs_svc.py index db7e4f6..84f65a8 100644 --- a/examples/start_kv_and_fs_svc.py +++ b/examples/start_kv_and_fs_svc.py @@ -16,8 +16,7 @@ GREPTIME_HTTP_PORT = 34030 GREPTIME_BASE_URL = f"http://127.0.0.1:{GREPTIME_HTTP_PORT}" CLUSTER_NAME = "demo-fs-cluster" -SHARED_MEMORY_PATH = Path("/dev/shm/fluxon_fs_demo").resolve() -SHARED_FILE_PATH = Path("/tmp/fluxon_fs_demo/shared").resolve() +SHARE_MEM_PATH = Path("/dev/shm/fluxon_fs_demo").resolve() WORKDIR = Path("/tmp/fluxon_fs_demo/runtime").resolve() REMOTE_ROOT_DIR = Path("/tmp/fluxon_fs_demo/remote_root").resolve() KV_MASTER_PORT = 34100 @@ -38,11 +37,14 @@ FS_MASTER_ACCESS_DB_PATH = (WORKDIR / "fs_master" / "access.db").resolve() +def build_owner_large_file_paths() -> list[str]: + return [str((WORKDIR / "large" / "owner").resolve())] + + def main() -> None: args = parse_args() WORKDIR.mkdir(parents=True, exist_ok=True) REMOTE_ROOT_DIR.mkdir(parents=True, exist_ok=True) - SHARED_FILE_PATH.mkdir(parents=True, exist_ok=True) log_dir = (WORKDIR / "log").resolve() log_dir.mkdir(parents=True, exist_ok=True) @@ -113,8 +115,7 @@ def main() -> None: ) print(f"[fluxon_fs] cluster name: {CLUSTER_NAME}") - print(f"[fluxon_fs] shared memory path: {SHARED_MEMORY_PATH}") - print(f"[fluxon_fs] shared file path: {SHARED_FILE_PATH}") + print(f"[fluxon_fs] share_mem_path: {SHARE_MEM_PATH}") print(f"[fluxon_fs] remote root dir: {REMOTE_ROOT_DIR}") print(f"[fluxon_fs] export name: {EXPORT_NAME}") print(f"[fluxon_fs] owner instance key: {OWNER_INSTANCE_KEY}") @@ -195,9 +196,9 @@ def build_owner_config() -> dict: "fluxonkv_spec": { "etcd_addresses": 
[ETCD_ENDPOINT], "cluster_name": CLUSTER_NAME, - "shared_memory_path": str(SHARED_MEMORY_PATH), - "shared_file_path": str(SHARED_FILE_PATH), + "share_mem_path": str(SHARE_MEM_PATH), "sub_cluster": "default", + "large_file_paths": build_owner_large_file_paths(), }, } @@ -208,8 +209,7 @@ def build_fs_master_config() -> dict: "instance_key": FS_MASTER_INSTANCE_KEY, "fluxonkv_spec": { "cluster_name": CLUSTER_NAME, - "shared_memory_path": str(SHARED_MEMORY_PATH), - "shared_file_path": str(SHARED_FILE_PATH), + "share_mem_path": str(SHARE_MEM_PATH), }, }, "fluxon_fs": { @@ -268,8 +268,7 @@ def build_fs_agent_config() -> dict: "instance_key": FS_AGENT_INSTANCE_KEY, "fluxonkv_spec": { "cluster_name": CLUSTER_NAME, - "shared_memory_path": str(SHARED_MEMORY_PATH), - "shared_file_path": str(SHARED_FILE_PATH), + "share_mem_path": str(SHARE_MEM_PATH), }, }, "fluxon_fs": { diff --git a/examples/start_master_owner.py b/examples/start_master_owner.py index 964cf87..7bfb770 100644 --- a/examples/start_master_owner.py +++ b/examples/start_master_owner.py @@ -14,8 +14,7 @@ GREPTIME_HTTP_PORT = 34030 GREPTIME_BASE_URL = f"http://127.0.0.1:{GREPTIME_HTTP_PORT}" CLUSTER_NAME = "demo-kv-cluster" -SHARED_MEMORY_PATH = Path("/dev/shm/fluxon_kv_demo").resolve() -SHARED_FILE_PATH = Path("/tmp/fluxon_kv_demo/shared").resolve() +SHARE_MEM_PATH = Path("/dev/shm/fluxon_kv_demo").resolve() WORKDIR = Path("/tmp/fluxon_kv_demo/runtime").resolve() MASTER_PORT = 31000 MASTER_INSTANCE_KEY = "demo_kv_master" @@ -23,9 +22,12 @@ OWNER_DRAM_BYTES = 1073741824 +def build_owner_large_file_paths() -> list[str]: + return [str((WORKDIR / "large" / "owner").resolve())] + + def main() -> None: args = parse_args() - SHARED_FILE_PATH.mkdir(parents=True, exist_ok=True) log_dir = (WORKDIR / "log").resolve() if args.with_master: @@ -60,8 +62,7 @@ def main() -> None: ) ) - print(f"[fluxon_kv] shared memory path: {SHARED_MEMORY_PATH}") - print(f"[fluxon_kv] shared file path: {SHARED_FILE_PATH}") + print(f"[fluxon_kv] 
share_mem_path: {SHARE_MEM_PATH}") print(f"[fluxon_kv] etcd endpoint: {ETCD_ENDPOINT}") print(f"[fluxon_kv] greptime base url: {GREPTIME_BASE_URL}") print(f"[fluxon_kv] start master in this script: {args.with_master}") @@ -124,9 +125,9 @@ def build_owner_config() -> dict: "fluxonkv_spec": { "etcd_addresses": [ETCD_ENDPOINT], "cluster_name": CLUSTER_NAME, - "shared_memory_path": str(SHARED_MEMORY_PATH), - "shared_file_path": str(SHARED_FILE_PATH), + "share_mem_path": str(SHARE_MEM_PATH), "sub_cluster": "default", + "large_file_paths": build_owner_large_file_paths(), }, } diff --git a/examples/start_mpmc_demo.py b/examples/start_mpmc_demo.py index 2988674..5ad9f64 100644 --- a/examples/start_mpmc_demo.py +++ b/examples/start_mpmc_demo.py @@ -17,8 +17,7 @@ # These constants are the only user-facing knobs in the minimal example. CLUSTER_NAME = "demo-kv-cluster" -SHARED_MEMORY_PATH = "/dev/shm/fluxon_kv_demo" -SHARED_FILE_PATH = "/tmp/fluxon_kv_demo/shared" +SHARE_MEM_PATH = "/dev/shm/fluxon_kv_demo" CHANNEL_KEY = "demo_mq_channel_doc" CHANNEL_CAPACITY = 128 CHANNEL_TTL_SECONDS = 300 @@ -53,8 +52,7 @@ def _build_store_config(*, role: str) -> FluxonKvClientConfig: "instance_key": f"demo_mq_{role}", "fluxonkv_spec": { "cluster_name": CLUSTER_NAME, - "shared_memory_path": SHARED_MEMORY_PATH, - "shared_file_path": SHARED_FILE_PATH, + "share_mem_path": SHARE_MEM_PATH, }, } ) @@ -198,9 +196,6 @@ def main() -> None: parser.add_argument("--role", choices=["producer", "consumer"], required=True) args = parser.parse_args() - # The minimal example keeps shared file authority explicit and local. - Path(SHARED_FILE_PATH).mkdir(parents=True, exist_ok=True) - # init_logger() reads FLUXON_LOG and sets the user-process console log level. 
logger = init_logger(f"mpmc_demo_{args.role}") shutdown_requested = threading.Event() diff --git "a/fluxon_doc_cn/design/fluxon_0_\351\205\215\347\275\256\346\200\273\350\247\210.md" "b/fluxon_doc_cn/design/fluxon_0_\351\205\215\347\275\256\346\200\273\350\247\210.md" new file mode 100644 index 0000000..a4c5865 --- /dev/null +++ "b/fluxon_doc_cn/design/fluxon_0_\351\205\215\347\275\256\346\200\273\350\247\210.md" @@ -0,0 +1,491 @@ +# Fluxon 配置总览 + +## 1. 结论 + +本文只回答一件事:Fluxon 仓库里有哪些稳定配置入口,它们各自负责什么,校验后会变成什么运行时结构。 + +**稳定结论:** + +- 配置输入和运行时结构是分开的,YAML 只负责声明意图,`verify()` / `parse_*()` 负责收敛成唯一可执行结果。 +- 共享契约优先放在 `fluxon_commu_contract` 和 `fluxon_cli::config` 这类公共模块里,业务包更多是复用或重导出。 +- `host:port`、`http(s)://...`、`cluster-scoped path` 这几类格式都被严格区分,不靠探测或模糊回退。 +- 仓库里的 checked-in YAML 分两类:运行时契约和环境/测试契约。前者要强校验,后者主要用于把开发、部署、测试流水线接起来。 + +```mermaid +flowchart TD + A[build_config_ext.yml
build_config_ext_static.yml] --> B[setup_and_pack / repo_config_utils] + C[deployment/deployconf.yaml] --> D[deployment utils / fluxon_py tests] + E[fluxon_py/tests/test_config.yaml] --> D + F[fluxon_test_stack/*.yaml] --> G[teststack runner / start_test_bed] + H[fluxon_cli/src/config.rs] --> I[monitor / UI] + J[fluxon_kv/src/config.rs] --> K[KV runtime] + L[fluxon_fs_core/src/config.rs] --> M[FS runtime] + N[fluxon_commu_contract/src/config.rs] --> K + N --> M +``` + +## 2. 配置地图 + +| 配置家族 | 入口文件 / 模块 | 主要消费者 | 作用 | +| --- | --- | --- | --- | +| 仓库环境配置 | `build_config_ext.yml` | Rust KV 测试族、`fluxon_py/tests/test_lib.py`、`setup_and_pack` 打包/校验脚本、TestStack 的 `bin_kvtest` 用例 staging | 提供 etcd、Prometheus、remote write 等开发/测试基线 | +| 静态构建配置 | `build_config_ext_static.yml` | `setup_and_pack/pack_release.py`、`build_pack_fluxonkv_pylib_img.py`、Nix 打包链路 | 固定 wheel / manylinux 版本 | +| 部署配置 | `deployment/deployconf.yaml` | 部署脚本、`fluxon_py` 测试入口、TestStack 生成/消费链路 | 提供集群节点、服务地址和全局环境变量 | +| Python 测试配置 | `fluxon_py/tests/test_config.yaml` | `fluxon_py` 测试入口、测试辅助库、deployconf 解析链路 | 连接 deployconf,选择 KV backend 类型 | +| 开发/打包环境配置 | `setup_and_pack/setup_dev_env/*.yaml`、`setup_and_pack/build_pack_fluxonkv_pylib_img/*.yaml`、`setup_and_pack/nix/*.yaml`、`pub_prepare_build.yaml` | `setup_and_pack` 脚本 | 提供开发机和打包流水线的环境输入 | +| TestStack 配置 | `fluxon_test_stack/ci_test_list.yaml`、`start_test_bed.yaml`、`gitops.yaml` | `test_runner.py`、`start_test_bed.py` | 定义 suite、testbed、GitOps 和 UI 入口 | +| CLI 监控配置 | `fluxon_cli/src/config.rs` | `master_ui_monitor`、`test_runner_ui` | 提供监控页和查询页配置 | +| KV 配置 | `fluxon_kv/src/config.rs` | KV master / owner / external | 定义 KV 运行时角色和校验规则 | +| FS 配置 | `fluxon_fs_core/src/config.rs` | FS master / agent / panel | 定义 FS cache、master、panel、权限和转移态 | +| 共享传输配置 | `fluxon_commu_contract/src/config.rs`、`transfer_engine/surface.rs` | KV / FS / commu | 提供 `NetworkConfig`、`ProtocolType`、`TransferEngineType` | + +## 3. 
通用规则 + +| 规则 | 含义 | +| --- | --- | +| `serde(deny_unknown_fields)` | 运行时 YAML 默认拒绝未知字段 | +| `from_file` / `from_str` + `verify` | 先解析,再收敛成强类型运行时配置 | +| `YamlNullable` | 只在需要区分“缺失 / null / value”时使用 | +| `host:port` 与 `http(s)://...` 分离 | etcd / deployconf 常用前者,监控 / Prometheus 常用后者 | +| 派生值要显式写回 | 例如 cluster-scoped 路径、默认表名、默认 transport_mode | + +## 4. 环境与部署配置 + +### 4.1 `build_config_ext.yml` + +这是仓库级开发环境配置,不是业务 runtime config。 + +最小骨架: + +```yaml +# Rust / Python / 测试工具共用的 etcd 地址 +# 输入要求 raw host:port +etcd: 127.0.0.1:43579 + +# Prometheus-compatible 查询入口 +prom: http://127.0.0.1:44030/v1/prometheus + +# remote write 入口 +prom_remote_write_url: http://127.0.0.1:44030/v1/prometheus/write +``` + +这里的重点不是字段多,而是格式严格分层: + +- `etcd` 用 raw `host:port`。 +- `prom` 用带 scheme 的 HTTP URL,并且路径通常是 `/v1/prometheus` 或 `/api/v1`。 +- `prom_remote_write_url` 也是完整 URL。 + +`setup_and_pack/utils/repo_config_utils.py` 里保留了 `prometheus_remote_write_url` 的旧名兼容读取,但这是 build tooling 的过渡路径,不是推荐的新契约。 + +### 4.2 `build_config_ext_static.yml` + +当前最小骨架只有一个稳定字段: + +```yaml +manylinux_version: "2_28" +``` + +当前实现只接受 `2_28`。 + +### 4.3 `deployment/deployconf.yaml` + +这是部署和打包流水线的核心配置。先看最重要的骨架: + +```yaml +namespace: fluxon-example +name_prefix: fluxon-example +image: fluxon_quick_start:0.2.1 + +cluster_nodes: + - hostname: example-node-a + ip: 192.0.2.10 + hostworkdir: /opt/example/fluxon/deployment/example_deploy + mounts: + - /opt/example/fluxon: /fluxon_mount + - /var/run/docker.sock: /var/run/docker.sock + +global_envs: + FLUXON_CLUSTER_NAME: "fluxon-example-cluster" + FLUXON_SHARED_MEM: "${HOSTWORKDIR}/shm1" + ETCD_FULL_ADDRESS: "${${ETCD__NODE_ID}__IP}:${ETCD__PORT}" + FLUXON_PROMETHEUS_BASE_URL: "http://${${GREPTIME__NODE_ID}__IP}:${GREPTIME__PORT}/v1/prometheus" + MONITOR_GREPTIMEDB_WRITE_URL: "http://${${GREPTIME__NODE_ID}__IP}:${GREPTIME__PORT}/v1/prometheus/write" + +release_ext_images: + etcd: + image: quay.io/coreos/etcd:v3.5.0 + greptime: + image: greptime/greptimedb:v0.15.1 +``` + 
+读这份配置时,先抓住三层: + +- `cluster_nodes` 提供节点清单,是 placeholder 解析的基础。 +- `global_envs` 提供集群级 authority,比如 etcd、Prometheus、cluster name、shared roots。 +- `release_ext_images` 和后续 service/workload 块把这些 authority 接进具体部署动作。 + +`global_envs` 允许占位符解析,先由 `cluster_nodes` + `service` 构造映射,再把变量落成最终值。 + +### 4.4 `fluxon_py/tests/test_config.yaml` + +这是一层测试入口配置,不是 runtime 部署配置。 + +最小骨架: + +```yaml +deployconf_path: ../../deployment/deployconf.yaml +kv_svc_type: fluxon +``` + +这里没有复杂分支: + +- `deployconf_path` 指向共享 deployconf。 +- `kv_svc_type` 选择测试要接的 KV backend;当前 checked-in 样例用的是 `fluxon`。 + +测试代码里还保留了 mooncake 相关读取函数,但 checked-in 的最小样例只使用上面两个字段。 + +### 4.5 `fluxon_test_stack/*` + +TestStack 有三份主配置,建议直接从 YAML 骨架理解: + +`ci_test_list.yaml` 定义 suite / scene 空间: + +```yaml +schema_version: 9 + +run: + mode: full_once + selectors: + case_ids: ALL + profile_ids: [fluxon_fastws, fluxon_tquic, fluxon_sockudo_ws, fluxon_tcp] + command_ids: ALL + test_ids: ALL + +scenes: + kv_read_heavy_zipf: + test_stack: + mode: KVSTORE + read_ratio: 0.9 + write_ratio: 0.1 + request_distribution: zipfian + select: + scales: [n1_kvowner_dram_20gib] + profiles: [fluxon_tcp] +``` + +`start_test_bed.yaml` 定义 testbed authority 和 UI: + +```yaml +schema_version: 6 + +deployconf_path: ./deployconf_testbed.yml +controller_url: http://192.0.2.10:19080/r/ops/fluxon_testbed +controller_basic_auth: + username: example_admin + password: example_password + +test_runner_ui: + enabled: true + host: 0.0.0.0 + port: 18080 + workdir: ./test_runner_ui_runtime + gitops_config_path: ./gitops/gitops.yaml + +bootstrap_phases: + - mode: fixed_bare + node: infra44-ThinkStation-PX + services: [etcd, greptime, tikv_pd, tikv] +``` + +`gitops/gitops.yaml` 定义 GitOps 轮询和触发命令: + +```yaml +interval: 60 + +retention: + max_age_days: 7 + +repos: + - addr: git@github.com:Tele-AI/fluxon.git + follow: + - branch: big_step2 + run: + name_prefix: fluxon_ci + commands: + - python3 fluxon_test_stack/pack_test_stack_rsc.py --all-profiles -c 
fluxon_test_stack/ci_test_list.yaml + - python3 fluxon_test_stack/test_runner.py -c fluxon_test_stack/ci_test_list.yaml -w . +``` + +生成的 `deployconf_testbed.yml` 是派生产物,不是手工主配置。 + +## 5. 运行时配置 + +### 5.1 KV + +KV 的入口在 `fluxon_kv/src/config.rs`。先记结论:`master` 单独使用 `MasterConfigYaml`;`owner` 和 `external` 共用 `ClientConfigYaml`;`verify()` 再按内存贡献把 client 配置收敛成 owner / external / side-transfer worker 三个运行时分支。 + +`master` 的最小骨架: + +```yaml +instance_key: my-master-1 +cluster_name: demo-kv-cluster + +# master 控制面 etcd 地址;输入要求 raw host:port +etcd_endpoints: + - 127.0.0.1:2379 + +# master 自己的日志 / profile 根目录 +log_dir: /var/lib/fluxon/master_logs + +# 可选;给出时必须 > 0 +port: 31000 + +# 可选;当前 monitor 配置在 master 上是必填的 +monitoring: + prometheus_base_url: http://127.0.0.1:4000/v1/prometheus + prom_remote_write_url: + - http://127.0.0.1:4000/v1/prometheus/write + otlp_log_api: + otlp_endpoint: http://127.0.0.1:4000/v1/otlp/v1/logs + +# 可选;配置后 KV Web UI 会作为 master 内嵌 HTTP 服务启动 +master_ui: + http_listen_addr: 0.0.0.0:31100 +``` + +`owner` 和 `external` 共用同一个 `ClientConfigYaml` 外壳,先看 `owner`: + +```yaml +instance_key: my-owner-1 + +# 只要 dram > 0,就进入 owner 分支 +contribute_to_cluster_pool_size: + # 容量按 16 MiB 对齐 + dram: 1677721600 + vram: {} + +fluxonkv_spec: + cluster_name: demo-kv-cluster + + # 共享 bundle 根目录;运行时会拼成 cluster_name 作用域路径 + share_mem_path: /dev/shm/fluxon + + # owner 必须自己连接 etcd;输入要求 raw host:port + etcd_addresses: + - 127.0.0.1:2379 + + # owner 必须声明自己属于哪个 sub-cluster + sub_cluster: default + + # owner 必须声明大文件根目录列表;运行时按数组顺序选择第一个可用 root, + # 日志和 cache 等子目录都从固定相对位置派生 + large_file_paths: + - /var/lib/fluxon/large + + # 可选 + p2p_listen_port: 31001 + + # 可选;Redis 兼容入口只允许 owner 配 + # redis_compat: + # listen_addr: 0.0.0.0:6379 +``` + +`external` 用的还是 `ClientConfigYaml`,但结构会更小: + +```yaml +instance_key: my-external-1 + +fluxonkv_spec: + cluster_name: demo-kv-cluster + + # external 只保留 attach owner 所需的共享 bundle 根目录 + share_mem_path: /dev/shm/fluxon + + # 可选 + p2p_listen_port: 31002 +``` 
+ +这里最重要的差异不是“多几个字段”,而是配置责任不同: + +- `owner` 负责提供共享内存池、连接 etcd、声明 `sub_cluster`、发布 `shared.json`、给出日志和 cache 的大文件根目录。 +- `external` 不再声明 `etcd_addresses`、`sub_cluster`、`large_file_paths`、`redis_compat`;这些 owner 侧字段都从 owner 发布的 `shared.json` 继承。 +- `etcd_addresses` 在 owner 侧会同时保留两份视图:对外契约还是 raw `host:port`,运行时内部会归一化成 `http://host:port`。 + +主要约束: + +- `monitoring` 在 master 上必填。 +- `master_ui` 依赖 `monitoring`,并作为嵌入式 monitor HTTP 服务启动。 +- `contribute_to_cluster_pool_size` 里的容量都按 16 MiB 对齐;`dram = 0` 但 `vram` 非 0 会被拒绝,避免半 owner 半 external 的模糊状态。 +- owner 模式要求 `contribute_to_cluster_pool_size.dram > 0`,并且必须显式提供 `etcd_addresses`、`sub_cluster`、`large_file_paths`。 +- zero-contribution `external` 模式禁止再写 owner 专属字段;运行时会从 owner `shared.json` 补齐这部分信息。 +- `share_mem_path` 会拼成 `cluster_name` 作用域路径;`mmap.file`、`shared.json` 和 peer metadata 都位于这个 cluster-scoped 目录下。 +- `test_spec_config.side_transfer_role = worker` 不是第三套 YAML,而是 zero-contribution client 的子分支;它强制 `TransferEngineType::P2p`,并关闭 transfer RPC fast path。 +- `test_spec_config.side_transfer_worker_count` 只允许出现在 owner 配置里,用来控制 owner 拉起的 worker 数量。 + +更细的调用时序、持有生命周期和并发规则分别在 `kv_1_概览与分层.md`、`kv_2_调用时序.md`、`kv_3_参数与并发.md`、`kv_4_allocation_segment_holder生命周期.md` 里展开。 + +### 5.2 FS + +FS 的配置集中在 `fluxon_fs_core/src/config.rs`,上层 `fluxon_fs/src/config.rs` 只是重导出。 + +这块分成 `cache`、`master`、`master_panel` 三个稳定子块,直接看骨架更直观: + +```yaml +fluxon_fs: + master: + instance_key: fluxon_fs_master + pull_interval_ms: 1000 + + master_panel: + listen_addr: 0.0.0.0:8091 + public_base_url: http://127.0.0.1:8091 + prometheus_base_url: http://127.0.0.1:4000/v1/prometheus + auto_refresh_interval_secs: 10 + access_db_path: /var/lib/fluxon/fs_master_access.db + bootstrap_access_model: + users: + - username: admin + password: admin + can_manage_users: true + scope_access: [] + transfer_state_store: + kind: tikv + tikv: + pd_endpoints: + - 127.0.0.1:2379 + key_prefix: /fluxon_fs_transfer/ + s3_gateway: + get_object_inflight_pieces: 8 + kv_miss_policy: 
remote_read + + cache: + stale_window_ms: 5000 + write_session_target_inflight_bytes: 134217728 + rules: + - dir_abs: /var/lib/fluxon/local_shared + cache_mode: read_through + write_mode: write_through + kv_key_prefix: /fluxon_fs_cache/local_shared/ + bytes_field_key: bytes + max_cache_bytes: 1048576 + on_refresh_error: apply_stale_window + exports: + demo: + remote_root_dir_abs: /var/lib/fluxon/export_root + nodes: + - fluxon_fs_writer + cache_max_bytes: 1048576 +``` + +读这段时抓三个点: + +- `fluxon_fs.master` 很小,当前稳定字段只有 `instance_key` 和可选的 `pull_interval_ms`;旧的 `fluxon_fs.rpc` 和 `rpc_timeout_ms` 已移除。 +- `fluxon_fs.master_panel` 负责 UI/S3 授权和 transfer 状态;`listen_addr`、`public_base_url`、`prometheus_base_url`、`access_db_path`、`bootstrap_access_model`、`s3_gateway` 都是启动基线。 +- `fluxon_fs.cache` 负责目录级 cache / export 规则;`rules[*].dir_abs` 和 `exports[*].remote_root_dir_abs` 都必须是绝对路径。 + +还要记住两个分支规则: + +- `exports[*].nodes` 缺失时,路由模式是 `AgentRegistry`;给出时是 `StaticNodes`。 +- `write_session_target_inflight_bytes` 可缺省,默认 128 MiB;但给出时必须 `> 0`。 + +FS 还把访问模型拆成两层: + +- `access_model` 是用户/权限的输入模型。 +- `runtime_access_model` 是 runtime 使用的派生模型,密码会被哈希,不再原样保留。 + +### 5.3 CLI 监控 + +`fluxon_cli/src/config.rs` 定义统一监控页配置,KV 的 `master_ui` 和 TestStack 的 UI 都复用它。 + +最小骨架: + +```yaml +etcd_endpoints: + - http://127.0.0.1:2379 + +prometheus_base_url: http://127.0.0.1:4000/v1/prometheus +cluster_name: demo-kv-cluster + +# kv / mq / fs +member_kind: kv + +# cli / web +output: web + +# 可选;web 模式常用 +http_listen_addr: 0.0.0.0:18080 + +# 可选;只有 MQ 页面需要扫描 unique key 时再给 +# mq_unique_key_prefixes: +# - /fluxon_mq_unique/ + +# 可选;不写时,如果 prometheus_base_url 明确指向 Greptime /v1/prometheus, +# 会自动派生默认 SQL 连接信息 +# greptime_sql: +# base_url: http://127.0.0.1:4000 +# db: public +# log_table: fluxon_logs +``` + +主要约束: + +- `etcd_endpoints` 必须非空且带 scheme。 +- `prometheus_base_url` 必须带 scheme。 +- `mq_unique_key_prefixes` 给出时不能为空,也不能带前后空白。 +- `greptime_sql` 可以显式提供;如果 `prometheus_base_url` 指向 Greptime 的 `/v1/prometheus`,会自动派生默认
SQL 连接信息。 + +### 5.4 共享传输契约 + +`fluxon_commu_contract` 提供多个被 KV / FS 共同复用的基础类型: + +最常见的是 `NetworkConfig` 这块 YAML: + +```yaml +network: + subnet_whitelist: + - 127.0.0.0/8 + - 10.0.0.0/24 + primary_ip_to_extended_ips: + 10.0.0.10: + - 10.0.0.11 + - 10.0.0.12 +``` + +以及协议/传输分支这两个输入: + +```yaml +protocol: + protocol_type: rdma +``` + +```yaml +protocol: + protocol_type: tcp +``` + +这里对应的稳定枚举取值是: + +- `ProtocolType`: `tcp` / `rdma` +- `TransferEngineType`: `Closed` / `P2p` +- `TransferBackendActivationMode`: `RdmaControl` / `TcpTestBypassRdmaControl` / `TestForceEnableBypassRdmaControl` + +这些类型是共享契约,不属于某一个子系统的私有配置。 + +## 6. 配置之间的关系 + +| 关系 | 说明 | +| --- | --- | +| build_config_ext -> deployment/test | 先确定环境基线,再给 runtime 配置提供 host、URL、路径 | +| deployconf -> test_config | Python 测试配置通过 `deployconf_path` 指向共享部署配置 | +| deployconf -> teststack | `start_test_bed` 和 `test_runner` 读取派生后的 testbed deployconf | +| commu_contract -> KV / FS | `ProtocolType`、`TransferEngineType`、`NetworkConfig` 是共享底座 | +| CLI config -> KV / TestStack UI | master UI、runner UI 复用同一个 monitor config 契约 | + +## 7. 读法建议 + +如果你只想看某一块的细节,按这个顺序读: + +1. 环境/部署先看 `deployment/utils/deployconf_config_utils.py` 和 `fluxon_util/src/dev_config.rs`。 +2. KV 先看 `fluxon_kv/src/config.rs`,再接 `kv_1` 到 `kv_4`。 +3. FS 先看 `fluxon_fs_core/src/config.rs`,再看 `用户 - 5 - FS接口.md`。 +4. 
TestStack 直接看 `teststack_1_当前架构与CI测试流程.md`。 diff --git "a/fluxon_doc_cn/design/log_1_\346\234\254\345\234\260\346\226\207\344\273\266\346\227\245\345\277\227\344\270\216Greptime_OTLP\345\257\274\345\207\272\351\223\276\350\267\257.md" "b/fluxon_doc_cn/design/log_1_\346\234\254\345\234\260\346\226\207\344\273\266\346\227\245\345\277\227\344\270\216Greptime_OTLP\345\257\274\345\207\272\351\223\276\350\267\257.md" new file mode 100644 index 0000000..555672b --- /dev/null +++ "b/fluxon_doc_cn/design/log_1_\346\234\254\345\234\260\346\226\207\344\273\266\346\227\245\345\277\227\344\270\216Greptime_OTLP\345\257\274\345\207\272\351\223\276\350\267\257.md" @@ -0,0 +1,414 @@ +# Fluxon Log 设计 1 - 统一 log 标准与 Greptime OTLP 导出链路 + +## 0. 总起 +本文定义 Fluxon 服务平面的统一日志标准。主线代码落在 `fluxon_rs/fluxon_kv/src/config.rs`、`fluxon_rs/fluxon_kv/src/lib.rs`、`fluxon_rs/fluxon_util/src/log.rs`、`fluxon_rs/fluxon_observability/src/greptime_otlp_tracing.rs`、`fluxon_rs/fluxon_observability/src/greptime_otlp_log_orchestrator.rs` 和 `fluxon_rs/fluxon_observability/src/greptime_otlp_log.rs`。 + +稳定结论先说死: + +- 本地文件日志始终启用,作为可回放的安全网。 +- Greptime OTLP 导出由 `master.monitoring.otlp_log_api` 控制,`master` 负责配置源,`owner` / `external` 只消费广播。 +- `testbed` 是独立的 `log_service_kind`,启动器、runner、UI 和 workload 统一按同一套日志语义落盘。 +- 当前导出链路采用 best-effort 策略,不阻塞主业务路径。 + +本文重点回答四个问题: + +1. 各条日志链路当前落在哪些目录边界里。 +2. 当前 canonical 文件名、按天分片和 31 天清理语义是什么。 +3. Rust / Python 之间哪些 contract 已经对齐,哪些还没有。 +4. 当前实现里哪些地方已经收口,哪些地方仍是未完全收口点。 + +KV 里的 `external` 与 side worker 都只消费 owner 感知结果。当前稳定 contract 是:它们显式配置单一 `share_mem_path` 作为 attach owner 的共享 bundle 根目录,`mmap.file`、`shared.json` 和 peer metadata 都在运行时拼接出的 cluster-scoped 目录下;`large_file_paths` 则从 owner 发布的 `shared.json` 继承,日志和 cache 从启动起就直接落到 owner 派生出来的大文件目录。 + +## 1. 
目录边界 +目录边界只管物理隔离,不管统一 root。统一的是命名、元数据、归档窗口和清理语义。 + +### 1.1 KV +- `master` 以 `log_dir` 作为本地主日志根,并在其下派生 cluster-scoped runtime 日志目录。 +- `owner`、`external` 和 side worker 共享单一 `share_mem_path` 作为 share 根,用来放 `mmap.file`、`shared.json`、peer metadata 和 side transfer 相关文件。 +- `owner` 的 `large_file_paths` 定义 runtime log、cache 等大文件资产的物理根目录。 +- `external` 和 side worker 不再单独声明自己的 `large_file_paths`。它们在 zero-contribution bootstrap 阶段从 owner `shared.json` 继承同一组大文件根目录,然后直接复用 owner 派生出来的 runtime log / cache 边界。 + +### 1.2 ops / bare shared supervisor control plane +这里不要把 `ops` 和 `bare` 理解成两套彼此独立的面。两者确实共用同一个 `selection_supervisor.py + log_shard.py` 实现源,但当前实际落盘边界不是一棵完全统一的目录树。 + +先区分两个层次: + +| 层次 | 稳定根 | 主要内容 | +| --- | --- | --- | +| `deployconf -> gen_bare -> bare bootstrap` | `hostworkdir` | generated control scripts、bare 服务日志 | +| `ops` runtime | `workdir` | runtime config、embedded supervisor runtime、ops-managed workload 日志 | + +其中: + +- `hostworkdir` 是节点级宿主根,用来承载 deployer 下发产物、bare 控制脚本和其他需要跨进程稳定复用的目录。 +- `workdir` 是某个具体进程实例自己的运行子目录,用来承载该实例的 runtime config、embedded supervisor runtime 和它托管出来的 workload 日志。 +- 位置关系上,当前 self-host deployconf 里 `workdir` 通常是 `hostworkdir` 的子目录;语义关系上,`workdir` 仍然只是“某个实例的运行子树”,不能反过来代表整个 `hostworkdir`。 + +bare 稳定根当前可以直观看成: + +```text +${HOSTWORKDIR}/ + log/ + ops_controller..log + ops_agent..log + ..log + gen_bare_deploy_bash/ + start_ops_controller.sh + start_ops_agent.sh + start_.sh + stop_ops_controller.sh + stop_ops_agent.sh + stop_.sh + start_.sh + stop_.sh + selection_supervisor.py + log_shard.py + entrypoint__.sh +``` + +当前 self-host deployconf 下,`hostworkdir` 与 `ops workdir` 的实际位置关系可以直观看成: + +```text +${HOSTWORKDIR}/ + gen_bare_deploy_bash/ + ...
+ log/ + ops_controller..log + ops_agent..log + ..log + ops_controller/ + ops_controller.yaml + selection_supervisor/ + selection_supervisor.py + log_shard.py + log/ + workload____..log + ops_agent/ + / + ops_agent.yaml + selection_supervisor/ + selection_supervisor.py + log_shard.py + log/ + workload____..log +``` + +这里再把 contract 说清楚: + +- `${HOSTWORKDIR}/gen_bare_deploy_bash/` 里的 `start_*.sh` / `stop_*.sh` 是 generated control scripts,是这套 shared supervisor 控制面的入口脚本,不是另一套独立 authority。 +- bare 这一层的稳定逻辑基名仍然是 `${HOSTWORKDIR}/log/.log`,shared supervisor runtime 再把它收口为 `${HOSTWORKDIR}/log/..log`。 +- ops-managed workload 这一层的稳定逻辑基名则是 `${WORKDIR}/log/workload____.log`,shared supervisor runtime 再把它收口为 `${WORKDIR}/log/workload____..log`。 +- 两层真正共享的是 `selection_supervisor.py + log_shard.py` 这组控制与滚动实现,不是“所有路径和文件名完全一样”。 + +在当前 self-host deployconf 示例里: + +- `ops_controller` 的 workdir 是 `${HOSTWORKDIR}/ops_controller` +- `ops_agent` 的 workdir 是 `${HOSTWORKDIR}/ops_agent/${NODE_ID}` + +### 1.3 testbed +- `workdir`、`run_dir` 分别承担 launcher、runner、UI、workload 的 run-scoped 落盘边界。 +- `testbed` 必须显式作为 `log_service_kind` 出现,不再用泛化名称代替。 +- launcher 和 workload 的目录语义要和 ops 对齐。 +- 当前优先级不是先把 testbed 做到完美支持,而是先把 ops 长时服务日志 contract 讲清楚并收口;testbed 继续按“服务级日志”和“case artifact”分开讨论。 + +### 1.4 FS +- `share_mem_path` 与 `export.remote_root_dir_abs` 分开使用。 +- 前者负责 KV attachment 所需的共享 bundle 边界。 +- 后者负责 FS 业务数据边界。 + +这里的目标很明确:目录可以不同,语义必须一致。`log`、`cache`、`shared attachment`、`workload data` 不能混在同一个边界里。 + +## 2. 
文件命名 +当前实现里的文件命名还没有完全统一,但已经可以明确分成下面几类。 + +| 类别 | 当前逻辑基名 | 当前实际落盘 | +| --- | --- | --- | +| KV runtime | `fluxon-kv-.log` | `fluxon-kv-..log` | +| bare 服务日志 | `.log` | `..log` | +| ops-managed workload | `workload____.log` | `workload____..log` | +| testbed 服务日志 | `test_runner.log` / `test_runner_ui.log` | `test_runner..log` / `test_runner_ui..log` | +| KV side worker stdio | `side_worker_.stdout.log` / `side_worker_.stderr.log` | 当前还没补日期分片 | + +补充说明: + +- KV runtime 日志当前仍由 `fluxon_util::init_log(...)` 创建,`run_master_impl(...)` 和 `run_client_impl(...)` 都会初始化这套本地文件日志,所以 `master`、`owner`、`external` 这些 KV 运行时进程当前确实都会产生这类文件。 +- `ops` 里还保留一些特例命名,例如 `smoke.log`、`smoke_bare.log`、`smoke_workloads_bare.log`。这些都属于当前实现尚未收口的历史命名。 +- `testbed` 当前仍然没有单一 canonical log filename。服务级日志已经补上时间分片,但 `ci_runner` 等 case 级日志仍主要落在 `results//run_/logs/**` 与 `summary.yaml`、`exception.txt`、`ci.log` 这类 run artifact 里。 + +清理只依据文件名里约定好的日期分片字段,不按目录数量、文件大小或历史批次做判断。这样本地清理和 Greptime retention 才能共享同一时间窗口。 + +## 3. 元数据字段 +这一节描述的是当前 KV OTLP 导出链路已经实际写入 Greptime 的元数据字段。 + +| 字段 | 含义 | +| --- | --- | +| `service.name` | 当前固定为 `fluxon` | +| `fluxon_cluster_name` | 集群名 | +| `fluxon_member_kind` | 当前业务类型标签,例如 `kv` | +| `fluxon_role` | 当前进程角色标签,例如 `master`、`owner_client`、`external_client` | +| `fluxon_member_id` | 当前实例标识 | + +当前实现里的日志元数据仍然是围绕 `cluster_name`、`member_kind`、`role`、`member_id` 这组字段组织的;`log_service_kind`、`log_kind`、`process_role`、`instance_key`、`workload_kind`、`workload_name` 这些更细的统一字段,目前还没有完整进入导出链路。 + +## 4. 
归档、超时与清理 +本地文件日志按天滚动归档,默认保留 31 天。清理时只扫描 canonical log file name,并按命名约定提取日期分片删除过期文件,不按文件数量或目录总量触发。 + +流式备份和 OTLP 导出也服从同一套窗口: + +| 项目 | 规则 | +| --- | --- | +| 导出策略 | best-effort,不阻塞主业务路径 | +| 队列满 | 允许丢弃,并保留可观测信号 | +| 发送失败 | 允许跳过当前 batch,本地文件仍在 | +| 停机行为 | shutdown 时执行 best-effort flush | +| 超时语义 | 单次导出必须有硬上界,不能无限挂起 | + +Greptime 侧的 retention / TTL 也按同一日期窗口收口,保证本地与远端的保留语义一致。这里要把远端清理语义说死:写入 `fluxon_logs` 的日志记录默认只保留 31 天(与本地窗口一致),超过窗口的数据必须由 Greptime 表级 TTL 或定时清理任务删除,不能只依赖查询层按时间过滤“看不见旧数据”。 + +如果后续本地窗口仍保持 31 天,那么 Greptime 侧也应保持同一 31 天窗口;如果本地窗口改为新的 canonical 值,远端 TTL 也必须同步调整。`disable_observability=true` 只关闭 OTLP 层,不关闭本地文件日志。 + +如果某条 stream 只是“备份副本”,它不能绕开本地日志的归档窗口单独永久存活。超时后应停止 tailing、释放资源,并交回本地文件归档策略处理历史文件。 + +## 5. 当前实现里已经收口的点 +这一节只写已经可以当作当前事实使用的内容。 + +### 5.1 本地文件按天分片与 31 天窗口 +- KV runtime 已具备稳定的按天滚动与保留窗口。 +- bare 服务日志已经接到 shared supervisor 的按天分片与同口径清理。 +- ops-managed workload 日志已经接到 shared supervisor 的按天分片与同口径清理。 +- `test_runner` / `test_runner_ui` 这类 testbed 服务级日志已补齐按天分片与本地 31 天保留窗口。 + +### 5.2 shared supervisor 已经统一到一个实现源 +- bare bootstrap 与 ops-managed workload 现在都复用 `selection_supervisor.py + log_shard.py` 这组实现。 +- `gen_bare_deploy_bash.py` 会把同一个 `log_shard.py` helper 下发到生成目录。 +- bare 启动脚本层保留的是稳定逻辑基名,真正的 stdio 重定向和实际分片写入都在共享 `selection_supervisor.py` 运行时里生效。 + +### 5.3 Rust / Python 已经有三类明确对齐 +- 按天分片与 31 天清理 +- 日志目录派生规则 +- OTLP 基础字段与 Greptime header + +## 6.
当前还没有完全收口的点 +这一节只写未完全收口点,避免把“当前事实”和“目标态”混在一起。 + +### 6.1 KV 共享 bundle 已收口到单一 `share_mem_path` +- 当前 KV public contract 只保留 `share_mem_path`。 +- 运行时在 `share_mem_path` 下拼接 `cluster_name`,统一承载 `mmap.file`、`shared.json`、peer metadata 和 side transfer metadata。 + +### 6.2 side worker stdio 仍未收口到统一按天分片 +- zero-contribution bootstrap 已经在启动前继承 owner 的 `large_file_paths`,因此 KV runtime logger 不再依赖 attach 后热切换文件路径。 +- 但 side worker stdio 当前仍然直接写 `side_worker_.stdout.log` / `side_worker_.stderr.log`,还没有补到统一的按天分片命名。 + +### 6.3 side worker stdio 与历史 `smoke` 文件还没纳入这轮收口 +- side worker stdio 当前仍是 `side_worker_.stdout.log` / `side_worker_.stderr.log`。 +- `smoke.log`、`smoke_bare.log`、`smoke_workloads_bare.log` 一类历史命名仍然存在。 + +### 6.4 testbed 只有服务级日志收口到了同类语义 +- `test_runner`、`test_runner_ui` 已改为“稳定逻辑基名 + 按天分片落盘”。 +- case 级 `run_dir/logs/**`、`summary.yaml`、`resolved_case.yaml`、`benchmark_result.json` 等仍按 run artifact 生命周期消费。 +- `history_lookback_days` 仍只是控制 UI 回看哪些 workdir;`gitops retention.max_age_days` 仍然清理 gitops run 目录,不是 testbed 服务日志文件的统一 TTL。 + +### 6.5 OTLP 统一字段和统一状态机还没有全部收口 +- 当前导出链路仍以 `cluster_name`、`member_kind`、`role`、`member_id` 为主。 +- `log_service_kind`、`log_kind`、`process_role`、`instance_key`、`workload_kind`、`workload_name` 这组更细的 canonical 字段还没有完整进入导出链路。 +- Rust 通用链路已经把 `disabled`、`direct`、`proxy`、失败分支显式枚举出来;Python benchmark exporter 仍是直连特化路径,还没有进入同一套通用发送状态机。 + +## 7. rs / py 模块对齐与防漂移 +稳定结论先说死: + +- 共享 log contract 以 Rust canonical 模块为准,Python 优先复用 Rust 已经导出的结果。 +- 当前已经能从代码直接看出三类对齐:按天分片与 31 天清理、日志目录派生、OTLP 基础字段与 header。 +- 当前还没有完全收口的是通用 OTLP 发送状态机。Rust 已经显式枚举发送分支,Python 侧 benchmark exporter 仍是直连特化路径。 + +### 7.1 按天分片与本地保留窗口 +Rust `fluxon_rs/fluxon_util/src/log.rs`: + +```rust +const LOG_RETENTION_DAYS: usize = 31; + +pub fn current_daily_sharded_log_path(base_path: &Path) -> anyhow::Result { + daily_sharded_log_path(base_path, current_shard_date()?) 
+} + +fn cleanup_old_daily_sharded_logs(base_path: &Path, retention_days: usize) -> anyhow::Result<()> { + let keep_since = current_shard_date()? - chrono::Days::new(retention_days.saturating_sub(1) as u64); + ... + if shard_date < keep_since { + fs::remove_file(&path)?; + } +} + +impl DailyShardedFileWriter { + fn rotate_if_needed(&self, state: &mut DailyShardedFileWriterState) -> io::Result<()> { + let next_path = self.current_path()?; + cleanup_old_daily_sharded_logs(&self.base_path, self.retention_days)?; + let file = fs::OpenOptions::new().create(true).append(true).open(&next_path)?; + state.current_path = Some(next_path); + state.current_file = Some(file); + Ok(()) + } +} +``` + +Python `deployment/utils/log_shard.py`: + +```python +DEFAULT_DAILY_LOG_RETENTION_DAYS = 31 + +def daily_sharded_log_path(base_path: Path, *, now: Optional[datetime.datetime] = None) -> Path: + shard_date = _resolve_shard_date(ts) + return (base_path.parent / f"{stem}.{shard_date.isoformat()}.log").resolve() + +def cleanup_old_daily_sharded_logs(base_path: Path, *, retention_days: int = DEFAULT_DAILY_LOG_RETENTION_DAYS) -> None: + current_shard_date = _resolve_shard_date(datetime.datetime.now(datetime.timezone.utc)) + keep_since = current_shard_date - datetime.timedelta( + days=max(int(retention_days) - 1, 0) + ) +``` + +这两段现在对齐的是同一个显式 contract:逻辑基名保持不变,日期字段统一落在 `<stem>.<YYYY-MM-DD>.log`,默认本地窗口都是 31 天,而且过期删除都显式按日期分片判断。这里不要机械要求两边 helper 名称完全一样;对齐的是“按天分片 + 31 天窗口 + 同口径清理”这条 contract。 + +### 7.2 KV 主日志是 Rust;Python 侧要分 bare 服务日志和 ops-managed workload 日志两层 +先把边界说死:KV runtime 主日志当前基本都是 Rust 在输出。`master`、`owner`、`external` 这些 KV 进程走的是 `fluxon_util::init_log(...)` 这条链。Python 一侧真正需要单独检查的,当前已经分成两层: + +- `deployconf -> gen_bare -> bare bootstrap` 这一层,负责 `ops_controller`、`ops_agent` 和其他 bare service 自身的 stdout/stderr。 +- `ops_agent` 进入 desired-runtime 管理之后,再去托管 workload;这一层的日志 contract 不再沿用 bare `${service_name}.log`,而是 `workload__<kind>__<name>.log`。 + +先看 bare 这一层: + +Python `deployment/gen_bare_deploy_bash.py`: +
+```python +from log_shard import render_module_source as render_log_shard_module_source + +(outdir / LOG_SHARD_HELPER_FILENAME).write_text( + render_log_shard_module_source(), + encoding="utf-8", +) +``` + +```python +runtime_state_json = _bare_runtime_state_json( + workload_name=workload_name, + authority_name=..., + service_name=service_name, + log_path=f"${{HOSTWORKDIR}}/log/{service_name}.log", +) + +LOG_DIR="$HOSTWORKDIR/log" +LOGFILE="$LOG_DIR/${SERVICE}.log" +... +SUPERVISOR_PID=$( ... < /dev/null & echo "$!" ) +``` + +Python `deployment/utils/selection_supervisor_codegen.py`: + +```python +def _redirect_process_stdio_to_runtime_log(runtime_state: Optional[SelectionRuntimeState]) -> None: + base_log_path = _require_non_empty_str(runtime_state.log_path, "state.log_path") + + def _router_loop() -> None: + _LOG_SHARD.relay_fd_to_daily_sharded_logs( + base_log_path=base_log_path, + read_fd=read_fd, + retention_days=_LOG_SHARD.DEFAULT_DAILY_LOG_RETENTION_DAYS, + ) + + os.dup2(write_fd, sys.stdout.fileno()) + os.dup2(write_fd, sys.stderr.fileno()) + +... 
+ +_redirect_process_stdio_to_runtime_log(runtime_state) +``` + +再看 ops-managed workload 这一层: + +Rust `fluxon_rs/fluxon_ops/src/lib.rs`: + +```rust +fn workload_log_filename(kind: WorkloadKind, name: &str) -> anyhow::Result<String> { + Ok(format!("workload__{}__{}.log", kind.as_str(), name)) +} + +let runtime_dir = workdir.join(OPS_SELECTION_SUPERVISOR_DIR_NAME); +let log_dir = workdir.join(OPS_LOG_DIR_NAME); +let log_path = self.log_dir.join(log_filename); +``` + +这组代码说明当前现状是: + +- bare bootstrap 与 ops-managed workload 确实已经复用了同一个 `selection_supervisor.py + log_shard.py` 实现源。 +- bare 服务日志与 ops-managed workload 日志也都已经真正接到这套滚动管理 helper 上。 +- 但两层当前并不是同一个 path contract: + - bare 服务日志保留的是 `${HOSTWORKDIR}/log/${service_name}.log` + - ops-managed workload 保留的是 `${WORKDIR}/log/workload__<kind>__<name>.log` + +### 7.3 OTLP 基础字段与 header 已经同名对齐 +Rust `fluxon_rs/fluxon_observability/src/greptime_otlp_log.rs`: + +```rust +let kvs = vec![ + KeyValue { key: KEY_CLUSTER_NAME.to_string(), value: Some(...) }, + KeyValue { key: KEY_MEMBER_KIND.to_string(), value: Some(...) }, + KeyValue { key: KEY_ROLE.to_string(), value: Some(...) }, + KeyValue { key: KEY_MEMBER_ID.to_string(), value: Some(...)
}, +]; + +let mut reqb = self + .http + .post(&self.endpoint) + .header("X-Greptime-DB-Name", &self.db_name) + .header("X-Greptime-Log-Extract-Keys", GREPTIME_LOG_EXTRACT_KEYS_HEADER_VALUE); +``` + +Python `fluxon_test_stack/distributed_benchmark_node.py`: + +```python +log_attrs: Dict[str, Any] = { + "fluxon_cluster_name": self._cfg.cluster_name, + "fluxon_member_kind": self._cfg.member_kind, + "fluxon_role": self._cfg.role, + "fluxon_member_id": self._cfg.member_id, +} + +headers = { + "Content-Type": "application/x-protobuf", + "X-Greptime-DB-Name": self._cfg.db_name, + "X-Greptime-Log-Extract-Keys": ",".join(extract_keys), +} +``` + +这两边已经对齐到同一个最小公共集合:`fluxon_cluster_name`、`fluxon_member_kind`、`fluxon_role`、`fluxon_member_id` 这组基础属性同名同义,Greptime header 也保持同一协议面。Python benchmark exporter 可以补 phase summary 字段,但不能改写这组基础字段的含义。 + +### 7.4 发送状态机还没有完全收口 +Rust `fluxon_rs/fluxon_observability/src/greptime_otlp_log_orchestrator.rs`: + +```rust +pub enum GreptimeOtlpLogAttemptResult<N> { + Disabled, + Sent { path: GreptimeOtlpLogSendPath, proxy_node: Option<N> }, + SkippedNoProxy { detail: String }, + ProxyFailed { proxy_node: N, detail: String }, +} +``` + +Python `fluxon_test_stack/distributed_benchmark_node.py`: + +```python +with urllib.request.urlopen(req, timeout=GREPTIME_OTLP_LOG_TIMEOUT_SECONDS) as resp: + status = getattr(resp, "status", 200) + if int(status) < 200 or int(status) >= 300: + body_text = resp.read().decode("utf-8", errors="replace") + raise RuntimeError(f"greptime otlp http {status}: {body_text}") +``` + +这组对照反映的是当前边界:Rust 通用链路已经把 `disabled`、`direct`、`proxy`、失败分支显式枚举出来;Python 这里只是 benchmark phase summary 的直连特化路径,还没有进入同一套通用发送状态机。后续如果 Python 需要承担通用 service-plane 导出,应该复用 Rust 这组有限分支,而不是再发明一套平行状态模型。 + +### 7.5 防止未来漂移 +只保留四条工程规则: + +1. 共享 contract 只保留一个真相源。目录派生、canonical 字段、发送状态、TTL 这类会跨语言消费的语义,优先由 Rust 定义,Python 复用导出结果或逐项镜像实现。 +2. 任何改动如果影响 canonical 文件名、OTLP 字段、Greptime header、发送分支或 retention,必须同一个 PR 同时更新 Rust 代码、Python 代码、设计文档和至少一层 contract test。 +3.
Python 特化路径必须显式标出作用域。`test_runner` 服务日志和 benchmark phase summary 可以保留自己的实现,但不能反向成为公共 contract 的定义源。 +4. 多语言边界坚持一个概念一个名字。不要在 rs / py 两边分别引入近义字段、别名参数或平行配置面,否则文档、查询、清理和告警都会漂移。 diff --git "a/fluxon_doc_cn/design/teststack_1_\345\275\223\345\211\215\346\236\266\346\236\204\344\270\216CI\346\265\213\350\257\225\346\265\201\347\250\213.md" "b/fluxon_doc_cn/design/teststack_1_\345\275\223\345\211\215\346\236\266\346\236\204\344\270\216CI\346\265\213\350\257\225\346\265\201\347\250\213.md" index 823a4be..7134b00 100644 --- "a/fluxon_doc_cn/design/teststack_1_\345\275\223\345\211\215\346\236\266\346\236\204\344\270\216CI\346\265\213\350\257\225\346\265\201\347\250\213.md" +++ "b/fluxon_doc_cn/design/teststack_1_\345\275\223\345\211\215\346\236\266\346\236\204\344\270\216CI\346\265\213\350\257\225\346\265\201\347\250\213.md" @@ -438,6 +438,30 @@ deploy.instances 不写死在 suite 中。Runner 会结合 scale、profile 和 - `resolved_case` 会额外固化 `command_id`、`test_id` 等 CI 元数据; - 生成顺序是稳定的,后续 phase 规划依赖这个顺序。 +### 7.8 owner 模式配置契约 + +**稳定结论:** + +- owner 模式配置一律必须显式提供 `fluxonkv_spec.large_file_paths`,并按数组顺序表达大文件根目录优先级。 +- `fluxonkv_spec.p2p_listen_port` 不是 owner 模式的必填项;是否显式写入,取决于具体分支的运行契约。 +- 不要把 `TEST_STACK` case-local owner 的显式端口分配规则,复制到 shared testbed / CI owner 配置上。 + +这里需要明确区分两类 owner 配置生成面: + +| surface | `large_file_paths` | `p2p_listen_port` | 原因 | +| --- | --- | --- | --- | +| shared testbed / CI owner | 必填 | 默认省略,保持隐式 | 这类 owner 运行在共享环境里,宿主端口占用和 host 布局更易变化,保持由运行时自行绑定可用端口更稳妥 | +| `TEST_STACK` case-local owner | 必填 | 显式写入 | 同一 case 内的 node runtime 需要消费 runner 预编译的有限端口计划,owner peer 地址必须稳定 | + +这条边界对应两种不同责任: + +- `large_file_paths` 是 owner 模式本身的配置契约,缺失时应直接视为配置错误; +- `p2p_listen_port` 是否显式,则是某个运行 surface 的拓扑与端口规划策略,不应从一个 surface 横向推广到另一个 surface。 + +本次相关经验可以收敛成一句规则: + +- owner 模式要显式约束的是 large-file roots,不是“默认必须写死 p2p 端口”。 + ## 8. 
case 执行流程 ### 8.1 总体时序 diff --git "a/fluxon_doc_cn/user_doc/\347\224\250\346\210\267 - 1 - \346\236\266\346\236\204\345\222\214\346\246\202\345\277\265.md" "b/fluxon_doc_cn/user_doc/\347\224\250\346\210\267 - 1 - \346\236\266\346\236\204\345\222\214\346\246\202\345\277\265.md" index e83afe3..bbe5c27 100644 --- "a/fluxon_doc_cn/user_doc/\347\224\250\346\210\267 - 1 - \346\236\266\346\236\204\345\222\214\346\246\202\345\277\265.md" +++ "b/fluxon_doc_cn/user_doc/\347\224\250\346\210\267 - 1 - \346\236\266\346\236\204\345\222\214\346\246\202\345\277\265.md" @@ -60,9 +60,7 @@ **`prometheus_base_url`** — 面板的 metrics 数据源地址(Prometheus-compatible HTTP API)。面板只查询不采集,不可达时指标显示 N/A。 -**`shared_memory_path`** — 共享内存目录,同机进程通过它附着到同一内存池;这是 mmap / data plane 的本机 authority。 - -**`shared_file_path`** — 共享文件目录,`shared.json`、日志、profile 等本机共享文件位于这里;这是 file / metadata attachment 的本机 authority。 +**`share_mem_path`** — 共享 bundle 根目录。运行时会在其下拼接 `cluster_name`,同一个 cluster-scoped 目录同时承载 `mmap.file`、`shared.json` 和 peer metadata。 **`log_dir`** — master 自己的日志目录 authority。master 运行时会在这个目录下继续派生 cluster 级日志和 profile 子目录。 diff --git "a/fluxon_doc_cn/user_doc/\347\224\250\346\210\267 - 2 - \346\234\215\345\212\241\345\271\263\351\235\242.md" "b/fluxon_doc_cn/user_doc/\347\224\250\346\210\267 - 2 - \346\234\215\345\212\241\345\271\263\351\235\242.md" index 123c31f..2ffa6e5 100644 --- "a/fluxon_doc_cn/user_doc/\347\224\250\346\210\267 - 2 - \346\234\215\345\212\241\345\271\263\351\235\242.md" +++ "b/fluxon_doc_cn/user_doc/\347\224\250\346\210\267 - 2 - \346\234\215\345\212\241\345\271\263\351\235\242.md" @@ -199,8 +199,7 @@ ETCD_ENDPOINT = "127.0.0.1:2379" GREPTIME_HTTP_PORT = 34030 GREPTIME_BASE_URL = f"http://127.0.0.1:{GREPTIME_HTTP_PORT}" CLUSTER_NAME = "demo-kv-cluster" -SHARED_MEMORY_PATH = Path("/dev/shm/fluxon_kv_demo").resolve() -SHARED_FILE_PATH = Path("/tmp/fluxon_kv_demo/shared").resolve() +SHARE_MEM_PATH = Path("/dev/shm/fluxon_kv_demo").resolve() WORKDIR = 
Path("/tmp/fluxon_kv_demo/runtime").resolve() MASTER_PORT = 31000 MASTER_INSTANCE_KEY = "demo_kv_master" @@ -210,7 +209,6 @@ OWNER_DRAM_BYTES = 1073741824 def main() -> None: args = parse_args() - SHARED_FILE_PATH.mkdir(parents=True, exist_ok=True) log_dir = (WORKDIR / "log").resolve() if args.with_master: @@ -245,8 +243,7 @@ def main() -> None: ) ) - print(f"[fluxon_kv] shared memory path: {SHARED_MEMORY_PATH}") - print(f"[fluxon_kv] shared file path: {SHARED_FILE_PATH}") + print(f"[fluxon_kv] share_mem_path: {SHARE_MEM_PATH}") print(f"[fluxon_kv] etcd endpoint: {ETCD_ENDPOINT}") print(f"[fluxon_kv] greptime base url: {GREPTIME_BASE_URL}") print(f"[fluxon_kv] start master in this script: {args.with_master}") @@ -309,9 +306,9 @@ def build_owner_config() -> dict: "fluxonkv_spec": { "etcd_addresses": [ETCD_ENDPOINT], "cluster_name": CLUSTER_NAME, - "shared_memory_path": str(SHARED_MEMORY_PATH), - "shared_file_path": str(SHARED_FILE_PATH), + "share_mem_path": str(SHARE_MEM_PATH), "sub_cluster": "default", + "large_file_paths": [str((WORKDIR / "large" / "owner").resolve())], }, } diff --git "a/fluxon_doc_cn/user_doc/\347\224\250\346\210\267 - 3 - KV-RPC\346\216\245\345\217\243.md" "b/fluxon_doc_cn/user_doc/\347\224\250\346\210\267 - 3 - KV-RPC\346\216\245\345\217\243.md" index 6494221..9a8c8e1 100644 --- "a/fluxon_doc_cn/user_doc/\347\224\250\346\210\267 - 3 - KV-RPC\346\216\245\345\217\243.md" +++ "b/fluxon_doc_cn/user_doc/\347\224\250\346\210\267 - 3 - KV-RPC\346\216\245\345\217\243.md" @@ -52,8 +52,7 @@ ETCD_ENDPOINT = "127.0.0.1:2379" GREPTIME_HTTP_PORT = 34030 GREPTIME_BASE_URL = f"http://127.0.0.1:{GREPTIME_HTTP_PORT}" CLUSTER_NAME = "demo-kv-cluster" -SHARED_MEMORY_PATH = Path("/dev/shm/fluxon_kv_demo").resolve() -SHARED_FILE_PATH = Path("/tmp/fluxon_kv_demo/shared").resolve() +SHARE_MEM_PATH = Path("/dev/shm/fluxon_kv_demo").resolve() WORKDIR = Path("/tmp/fluxon_kv_demo/runtime").resolve() MASTER_PORT = 31000 MASTER_UI_PORT = 18080 @@ -64,7 +63,6 @@ 
OWNER_DRAM_BYTES = 1073741824 def main() -> None: args = parse_args() - SHARED_FILE_PATH.mkdir(parents=True, exist_ok=True) log_dir = (WORKDIR / "log").resolve() if args.with_master: @@ -99,8 +97,7 @@ def main() -> None: ) ) - print(f"[fluxon_kv] shared memory path: {SHARED_MEMORY_PATH}") - print(f"[fluxon_kv] shared file path: {SHARED_FILE_PATH}") + print(f"[fluxon_kv] share_mem_path: {SHARE_MEM_PATH}") print(f"[fluxon_kv] etcd endpoint: {ETCD_ENDPOINT}") print(f"[fluxon_kv] greptime base url: {GREPTIME_BASE_URL}") print(f"[fluxon_kv] start master in this script: {args.with_master}") @@ -170,9 +167,9 @@ def build_owner_config() -> dict: "fluxonkv_spec": { "etcd_addresses": [ETCD_ENDPOINT], "cluster_name": CLUSTER_NAME, - "shared_memory_path": str(SHARED_MEMORY_PATH), - "shared_file_path": str(SHARED_FILE_PATH), + "share_mem_path": str(SHARE_MEM_PATH), "sub_cluster": "default", + "large_file_paths": [str((WORKDIR / "large" / "owner").resolve())], }, } @@ -237,6 +234,7 @@ api.close() -> Result[OkNone, ApiError] - `FluxonKvClientConfig`:配置对象,优先直接从 Python dict 创建,也支持从 YAML 文件加载。 - `new_store(config: FluxonKvClientConfig) -> Result[KvClient, ApiError]`:创建 KV client 实例。 - `KvClient`:统一入口,同时提供 KV 读写与节点间调用。 +- `KvClient.third_party_logs_dir() -> Result[str, ApiError]`:返回 Fluxon 分配给第三方 Python 组件的日志根目录。组件应在这个根目录下继续派生自己的子目录,例如 `mq/`。 - `MemHolder`:`get_blocking(...)` 成功后的读取结果持有者,`access()` 取得 `FlatDict`。 - `PutOptionalArgs`:`put_blocking(...)` 的可选参数对象,当前常用字段是 `lease_id`。 - `test_spec_config.disable_observability`:最小 external client 示例里显式设为 `True`,避免把 OTLP / observe 后台任务引入“只验证 KV/RPC 基本链路”的示例生命周期。 @@ -264,8 +262,7 @@ from fluxon_py import FluxonKvClientConfig, new_store INSTANCE_KEY = "demo_kv_external" CLUSTER_NAME = "demo-kv-cluster" -SHARED_MEMORY_PATH = "/dev/shm/fluxon_kv_demo" -SHARED_FILE_PATH = "/tmp/fluxon_kv_demo/shared" +SHARE_MEM_PATH = "/dev/shm/fluxon_kv_demo" def main() -> None: @@ -274,8 +271,7 @@ def main() -> None: "instance_key": INSTANCE_KEY, 
"fluxonkv_spec": { "cluster_name": CLUSTER_NAME, - "shared_memory_path": SHARED_MEMORY_PATH, - "shared_file_path": SHARED_FILE_PATH, + "share_mem_path": SHARE_MEM_PATH, }, "test_spec_config": { "disable_observability": True, @@ -332,7 +328,9 @@ FLUXON_LOG=DEBUG python3 examples/external_put_get_del.py - `FLUXON_LOG`:控制当前 Python 业务进程 console logger 的输出门限 - Fluxon Python 侧 logger 会读取 `FLUXON_LOG`;合法值是 `DEBUG`、`INFO`、`WARNING`、`ERROR`、`CRITICAL`,默认 `INFO` - `log_dir`:`master` 本地日志 authority -- `shared_file_path`:本机共享文件 authority,`shared.json`、日志、profile 等文件位于这里 +- `share_mem_path`:KV 共享 bundle 根目录,只承载 `mmap.file`、`shared.json` 和 peer metadata +- `large_file_paths`:owner 侧大文件根目录,日志、profile、cache 等运行时资产都从这里派生 +- `store.third_party_logs_dir().unwrap(...)`:返回 `{large_file_paths[0]}/{cluster_name}_cluster_third_party_logs`。第三方 Python 组件应只在这个根目录下派生自己的子目录,这样目录使用更收束,Fluxon 观测平面也能统一感知和采集这些文件日志。 如果服务平面的 `master.monitoring.otlp_log_api` 已经配置,后台服务日志还会继续采集到 Greptime 的 `fluxon_logs` 表。 @@ -411,8 +409,7 @@ from fluxon_py import FluxonKvClientConfig, new_store RPC_SERVER_INSTANCE_KEY = "demo_rpc_server" RPC_CLIENT_INSTANCE_KEY = "demo_rpc_client" CLUSTER_NAME = "demo-kv-cluster" -SHARED_MEMORY_PATH = "/dev/shm/fluxon_kv_demo" -SHARED_FILE_PATH = "/tmp/fluxon_kv_demo/shared" +SHARE_MEM_PATH = "/dev/shm/fluxon_kv_demo" def main() -> None: @@ -446,8 +443,7 @@ def _build_config(*, instance_key: str) -> FluxonKvClientConfig: "instance_key": instance_key, "fluxonkv_spec": { "cluster_name": CLUSTER_NAME, - "shared_memory_path": SHARED_MEMORY_PATH, - "shared_file_path": SHARED_FILE_PATH, + "share_mem_path": SHARE_MEM_PATH, }, "test_spec_config": { "disable_observability": True, @@ -548,8 +544,7 @@ cfg = FluxonKvClientConfig( "instance_key": "my-kv-client-1", "fluxonkv_spec": { "cluster_name": "demo-kv-cluster", - "shared_memory_path": "/dev/shm/fluxon", - "shared_file_path": "/var/lib/fluxon/shared", + "share_mem_path": "/dev/shm/fluxon", }, } ) @@ -574,10 +569,8 @@ instance_key: 
my-kv-client-1 fluxonkv_spec: # 目标集群名;必须和 master / owner 保持一致 cluster_name: demo-kv-cluster - # 本机共享内存 authority;external 靠它附着到同机 owner 的内存池 - shared_memory_path: /dev/shm/fluxon - # 本机共享文件 authority;shared.json、日志、profile 等文件位于这里 - shared_file_path: /var/lib/fluxon/shared + # 共享 bundle 根目录;运行时会在其下拼接 cluster_name + share_mem_path: /dev/shm/fluxon # 可选:覆盖当前 client 的 P2P 监听端口 p2p_listen_port: 31001 ``` @@ -601,20 +594,18 @@ fluxonkv_spec: - 127.0.0.1:2379 # 目标集群名;必须和 master / external 保持一致 cluster_name: demo-kv-cluster - # 本机共享内存 authority;external 进程会附着到这里 - shared_memory_path: /dev/shm/fluxon - # 本机共享文件 authority;shared.json、日志、profile 等文件位于这里 - shared_file_path: /var/lib/fluxon/shared + # 共享 bundle 根目录;运行时会在其下拼接 cluster_name + share_mem_path: /dev/shm/fluxon # owner 自己的 P2P 监听端口 p2p_listen_port: 31000 # owner 所属子集群标签 sub_cluster: default ``` -这里需要把两个本机 authority 分清楚: +这里需要把共享 bundle 和大文件根目录分清楚: -- `shared_memory_path`:共享内存 / mmap authority,同机进程靠它附着到同一块内存池 -- `shared_file_path`:共享文件 authority,`shared.json`、日志、profile 等文件位于这里 +- `share_mem_path`:共享 bundle 根目录;运行时拼接 `cluster_name` 后,同时承载 `mmap.file`、`shared.json` 和 peer metadata +- `large_file_paths`:owner 独占的大文件 authority,日志、profile、cache 等运行时资产都从这里派生 - `FLUXON_LOG`:用户 Python 进程 console log 的门限,不写时默认 `INFO` -zero-contribution external 模式下有一个硬约束:`fluxonkv_spec.etcd_addresses`、`fluxonkv_spec.sub_cluster`、`fluxonkv_spec.redis_compat` 这类 owner 侧字段不应出现。 +zero-contribution external 模式下有一个硬约束:`fluxonkv_spec.etcd_addresses`、`fluxonkv_spec.sub_cluster`、`fluxonkv_spec.large_file_paths`、`fluxonkv_spec.redis_compat` 这类 owner 侧字段不应出现。 diff --git "a/fluxon_doc_cn/user_doc/\347\224\250\346\210\267 - 4 - MQ\346\216\245\345\217\243.md" "b/fluxon_doc_cn/user_doc/\347\224\250\346\210\267 - 4 - MQ\346\216\245\345\217\243.md" index 89744a2..a788b74 100644 --- "a/fluxon_doc_cn/user_doc/\347\224\250\346\210\267 - 4 - MQ\346\216\245\345\217\243.md" +++ "b/fluxon_doc_cn/user_doc/\347\224\250\346\210\267 - 4 - MQ\346\216\245\345\217\243.md" 
@@ -86,8 +86,7 @@ ETCD_ENDPOINT = "127.0.0.1:2379" GREPTIME_HTTP_PORT = 34030 GREPTIME_BASE_URL = f"http://127.0.0.1:{GREPTIME_HTTP_PORT}" CLUSTER_NAME = "demo-kv-cluster" -SHARED_MEMORY_PATH = Path("/dev/shm/fluxon_kv_demo").resolve() -SHARED_FILE_PATH = Path("/tmp/fluxon_kv_demo/shared").resolve() +SHARE_MEM_PATH = Path("/dev/shm/fluxon_kv_demo").resolve() WORKDIR = Path("/tmp/fluxon_kv_demo/runtime").resolve() MASTER_PORT = 31000 MASTER_INSTANCE_KEY = "demo_kv_master" @@ -97,7 +96,6 @@ OWNER_DRAM_BYTES = 1073741824 def main() -> None: args = parse_args() - SHARED_FILE_PATH.mkdir(parents=True, exist_ok=True) log_dir = (WORKDIR / "log").resolve() if args.with_master: @@ -132,8 +130,7 @@ def main() -> None: ) ) - print(f"[fluxon_kv] shared memory path: {SHARED_MEMORY_PATH}") - print(f"[fluxon_kv] shared file path: {SHARED_FILE_PATH}") + print(f"[fluxon_kv] share_mem_path: {SHARE_MEM_PATH}") print(f"[fluxon_kv] etcd endpoint: {ETCD_ENDPOINT}") print(f"[fluxon_kv] greptime base url: {GREPTIME_BASE_URL}") print(f"[fluxon_kv] start master in this script: {args.with_master}") @@ -196,9 +193,9 @@ def build_owner_config() -> dict: "fluxonkv_spec": { "etcd_addresses": [ETCD_ENDPOINT], "cluster_name": CLUSTER_NAME, - "shared_memory_path": str(SHARED_MEMORY_PATH), - "shared_file_path": str(SHARED_FILE_PATH), + "share_mem_path": str(SHARE_MEM_PATH), "sub_cluster": "default", + "large_file_paths": [str((WORKDIR / "large" / "owner").resolve())], }, } @@ -299,8 +296,7 @@ from fluxon_py.runtime import register_ctrlc_callback # These constants are the only user-facing knobs in the minimal example. 
CLUSTER_NAME = "demo-kv-cluster" -SHARED_MEMORY_PATH = "/dev/shm/fluxon_kv_demo" -SHARED_FILE_PATH = "/tmp/fluxon_kv_demo/shared" +SHARE_MEM_PATH = "/dev/shm/fluxon_kv_demo" CHANNEL_KEY = "demo_mq_channel_doc" CHANNEL_CAPACITY = 128 CHANNEL_TTL_SECONDS = 300 @@ -335,8 +331,7 @@ def _build_store_config(*, role: str) -> FluxonKvClientConfig: "instance_key": f"demo_mq_{role}", "fluxonkv_spec": { "cluster_name": CLUSTER_NAME, - "shared_memory_path": SHARED_MEMORY_PATH, - "shared_file_path": SHARED_FILE_PATH, + "share_mem_path": SHARE_MEM_PATH, }, } ) @@ -480,9 +475,7 @@ def main() -> None: parser.add_argument("--role", choices=["producer", "consumer"], required=True) args = parser.parse_args() - # The minimal example keeps shared file authority explicit and local. - Path(SHARED_FILE_PATH).mkdir(parents=True, exist_ok=True) - + # The minimal example keeps share_mem_path explicit and local. # init_logger() reads FLUXON_LOG and sets the user-process console log level. logger = init_logger(f"mpmc_demo_{args.role}") shutdown_requested = threading.Event() @@ -542,7 +535,7 @@ FLUXON_LOG=DEBUG python3 examples/start_mpmc_demo.py --role consumer ### 关键接口常见错误处理 -- `new_or_bind_with_unique_key(...)` 失败:直接把 `unwrap_error()` 打出来,先检查 cluster、shared memory/shared file 路径、`unique_id`、`chan_role` 是否和对端一致 +- `new_or_bind_with_unique_key(...)` 失败:直接把 `unwrap_error()` 打出来,先检查 `cluster_name`、`share_mem_path`、`unique_id`、`chan_role` 是否和对端一致 - `producer.put_data(...)` 返回 `ProducerClosedError`:按正常关闭路径处理,直接退出主循环 - `consumer.get_data(...)` 返回 `ChannelClosedError`:按正常关闭路径处理,直接退出主循环 @@ -552,7 +545,7 @@ FLUXON_LOG=DEBUG python3 examples/start_mpmc_demo.py --role consumer - MQ Python 部分:由 `init_logger(...)` 初始化,直接输出到当前终端,不默认落盘,门限由 `FLUXON_LOG` 控制 - MQ Rust / KV 后台部分:和 KV 一起走服务平面的后台日志链路;`master` 本地日志目录由 `master_cfg["log_dir"]` 指定 -- `shared_file_path`:本机共享文件 authority,用来承载 `shared.json` 等共享文件 +- `share_mem_path`:KV 共享 bundle 根目录,只承载 `mmap.file`、`shared.json` 和 peer metadata;后端日志、profile、cache 从 
owner 的 `large_file_paths` 派生 如果服务平面的 `master.monitoring.otlp_log_api` 已经配置,MQ Rust / KV 后台部分的日志还会继续采集到 Greptime 的 `fluxon_logs` 表。 diff --git "a/fluxon_doc_cn/user_doc/\347\224\250\346\210\267 - 5 - FS\346\216\245\345\217\243.md" "b/fluxon_doc_cn/user_doc/\347\224\250\346\210\267 - 5 - FS\346\216\245\345\217\243.md" index 68ec80b..cf9ab79 100644 --- "a/fluxon_doc_cn/user_doc/\347\224\250\346\210\267 - 5 - FS\346\216\245\345\217\243.md" +++ "b/fluxon_doc_cn/user_doc/\347\224\250\346\210\267 - 5 - FS\346\216\245\345\217\243.md" @@ -104,8 +104,7 @@ ETCD_ENDPOINT = "127.0.0.1:2379" GREPTIME_HTTP_PORT = 34030 GREPTIME_BASE_URL = f"http://127.0.0.1:{GREPTIME_HTTP_PORT}" CLUSTER_NAME = "demo-fs-cluster" -SHARED_MEMORY_PATH = Path("/dev/shm/fluxon_fs_demo").resolve() -SHARED_FILE_PATH = Path("/tmp/fluxon_fs_demo/shared").resolve() +SHARE_MEM_PATH = Path("/dev/shm/fluxon_fs_demo").resolve() WORKDIR = Path("/tmp/fluxon_fs_demo/runtime").resolve() REMOTE_ROOT_DIR = Path("/tmp/fluxon_fs_demo/remote_root").resolve() KV_MASTER_PORT = 34100 @@ -130,8 +129,6 @@ def main() -> None: args = parse_args() WORKDIR.mkdir(parents=True, exist_ok=True) REMOTE_ROOT_DIR.mkdir(parents=True, exist_ok=True) - SHARED_FILE_PATH.mkdir(parents=True, exist_ok=True) - log_dir = (WORKDIR / "log").resolve() log_dir.mkdir(parents=True, exist_ok=True) @@ -201,8 +198,7 @@ def main() -> None: ) print(f"[fluxon_fs] cluster name: {CLUSTER_NAME}") - print(f"[fluxon_fs] shared memory path: {SHARED_MEMORY_PATH}") - print(f"[fluxon_fs] shared file path: {SHARED_FILE_PATH}") + print(f"[fluxon_fs] share_mem_path: {SHARE_MEM_PATH}") print(f"[fluxon_fs] remote root dir: {REMOTE_ROOT_DIR}") print(f"[fluxon_fs] export name: {EXPORT_NAME}") print(f"[fluxon_fs] owner instance key: {OWNER_INSTANCE_KEY}") @@ -283,9 +279,9 @@ def build_owner_config() -> dict: "fluxonkv_spec": { "etcd_addresses": [ETCD_ENDPOINT], "cluster_name": CLUSTER_NAME, - "shared_memory_path": str(SHARED_MEMORY_PATH), - "shared_file_path": 
str(SHARED_FILE_PATH), + "share_mem_path": str(SHARE_MEM_PATH), "sub_cluster": "default", + "large_file_paths": [str((WORKDIR / "large" / "owner").resolve())], }, } @@ -296,8 +292,7 @@ def build_fs_master_config() -> dict: "instance_key": FS_MASTER_INSTANCE_KEY, "fluxonkv_spec": { "cluster_name": CLUSTER_NAME, - "shared_memory_path": str(SHARED_MEMORY_PATH), - "shared_file_path": str(SHARED_FILE_PATH), + "share_mem_path": str(SHARE_MEM_PATH), }, }, "fluxon_fs": { @@ -356,8 +351,7 @@ def build_fs_agent_config() -> dict: "instance_key": FS_AGENT_INSTANCE_KEY, "fluxonkv_spec": { "cluster_name": CLUSTER_NAME, - "shared_memory_path": str(SHARED_MEMORY_PATH), - "shared_file_path": str(SHARED_FILE_PATH), + "share_mem_path": str(SHARE_MEM_PATH), }, }, "fluxon_fs": { @@ -402,8 +396,7 @@ python3 examples/start_kv_and_fs_svc.py --without-master 脚本会持续运行,并打印: - `cluster name` -- `shared memory path` -- `shared file path` +- `share_mem_path` - `remote root dir` - `export name` - `owner instance key` @@ -446,8 +439,7 @@ python3 examples/start_kv_and_fs_svc.py --without-master 这条最小成功路径默认对应本页的本地完整示例,也就是不带 `--without-master` 的启动方式。`--without-master` 用于把当前机器接到已经存在的 KV / FS 集群;如果继续运行 `start_fluxon_fs_writer.py` / `start_fluxon_fs_reader.py`,配置里的这些对象必须和现有集群一致: - `cluster_name` -- `shared_memory_path` -- `shared_file_path` +- `share_mem_path` - `fluxon_fs.master.instance_key` - `export_name` - `remote_root_dir_abs` @@ -727,8 +719,7 @@ FLUXON_LOG=DEBUG python3 examples/start_fluxon_fs_reader.py -c None: args = parse_args() - SHARED_FILE_PATH.mkdir(parents=True, exist_ok=True) log_dir = (WORKDIR / "log").resolve() if args.with_master: @@ -84,8 +82,7 @@ def main() -> None: children.append(ManagedSubprocess(label="master", proc=master_proc)) children.append(ManagedSubprocess(label="owner", proc=owner_proc)) - print(f"[fluxon_kv] shared memory path: {SHARED_MEMORY_PATH}") - print(f"[fluxon_kv] shared file path: {SHARED_FILE_PATH}") + print(f"[fluxon_kv] share_mem_path: {SHARE_MEM_PATH}") 
print(f"[fluxon_kv] etcd endpoint: {ETCD_ENDPOINT}") print(f"[fluxon_kv] greptime base url: {GREPTIME_BASE_URL}") print(f"[fluxon_kv] start master in this script: {args.with_master}") @@ -145,9 +142,9 @@ def build_owner_config() -> dict: "fluxonkv_spec": { "etcd_addresses": [ETCD_ENDPOINT], "cluster_name": CLUSTER_NAME, - "shared_memory_path": str(SHARED_MEMORY_PATH), - "shared_file_path": str(SHARED_FILE_PATH), + "share_mem_path": str(SHARE_MEM_PATH), "sub_cluster": "default", + "large_file_paths": [str((WORKDIR / "large" / "owner").resolve())], }, } @@ -186,6 +183,7 @@ close() - `FluxonKvClientConfig`: config object, usually built from a Python dict - `new_store(config: FluxonKvClientConfig) -> Result[KvClient, ApiError]`: create one KV client - `KvClient`: single entrypoint for both KV and RPC +- `KvClient.third_party_logs_dir() -> Result[str, ApiError]`: return the Fluxon-assigned log root for third-party Python components. Components should derive their own subdirectories under this root, for example `mq/`. 
- `MemHolder`: successful result holder from `get_blocking(...)` - `PutOptionalArgs`: optional write controls, most commonly `lease_id` @@ -212,8 +210,7 @@ from fluxon_py import FluxonKvClientConfig, new_store INSTANCE_KEY = "demo_kv_external" CLUSTER_NAME = "demo-kv-cluster" -SHARED_MEMORY_PATH = "/dev/shm/fluxon_kv_demo" -SHARED_FILE_PATH = "/tmp/fluxon_kv_demo/shared" +SHARE_MEM_PATH = "/dev/shm/fluxon_kv_demo" def main() -> None: @@ -222,8 +219,7 @@ def main() -> None: "instance_key": INSTANCE_KEY, "fluxonkv_spec": { "cluster_name": CLUSTER_NAME, - "shared_memory_path": SHARED_MEMORY_PATH, - "shared_file_path": SHARED_FILE_PATH, + "share_mem_path": SHARE_MEM_PATH, }, "test_spec_config": { "disable_observability": True, @@ -265,6 +261,7 @@ Useful calls: - `get_size(key)`: query payload size without reading the whole object - `is_exist(key)`: existence check - `remove(key)`: delete a key +- `third_party_logs_dir()`: return `{large_file_paths[0]}/{cluster_name}_cluster_third_party_logs` as a `Result[str, ApiError]` To increase user-process logs: @@ -272,6 +269,8 @@ To increase user-process logs: FLUXON_LOG=DEBUG python3 examples/external_put_get_del.py ``` +Third-party Python components should place file logs under `store.third_party_logs_dir().unwrap(...)` and then append a component subdirectory such as `mq/`. This keeps log directory usage bounded and lets the Fluxon observability plane discover and collect those file logs through one owner-derived root. 
+ ### Minimal Node-to-Node RPC Example `examples/rpc_call.py`: @@ -287,8 +286,7 @@ from fluxon_py import FluxonKvClientConfig, new_store RPC_SERVER_INSTANCE_KEY = "demo_rpc_server" RPC_CLIENT_INSTANCE_KEY = "demo_rpc_client" CLUSTER_NAME = "demo-kv-cluster" -SHARED_MEMORY_PATH = "/dev/shm/fluxon_kv_demo" -SHARED_FILE_PATH = "/tmp/fluxon_kv_demo/shared" +SHARE_MEM_PATH = "/dev/shm/fluxon_kv_demo" def _build_config(*, instance_key: str) -> FluxonKvClientConfig: @@ -297,8 +295,7 @@ def _build_config(*, instance_key: str) -> FluxonKvClientConfig: "instance_key": instance_key, "fluxonkv_spec": { "cluster_name": CLUSTER_NAME, - "shared_memory_path": SHARED_MEMORY_PATH, - "shared_file_path": SHARED_FILE_PATH, + "share_mem_path": SHARE_MEM_PATH, }, "test_spec_config": { "disable_observability": True, @@ -358,8 +355,7 @@ instance_key: my-kv-client-1 fluxonkv_spec: cluster_name: demo-kv-cluster - shared_memory_path: /dev/shm/fluxon - shared_file_path: /var/lib/fluxon/shared + share_mem_path: /dev/shm/fluxon p2p_listen_port: 31001 ``` @@ -376,16 +372,15 @@ fluxonkv_spec: etcd_addresses: - 127.0.0.1:2379 cluster_name: demo-kv-cluster - shared_memory_path: /dev/shm/fluxon - shared_file_path: /var/lib/fluxon/shared + share_mem_path: /dev/shm/fluxon p2p_listen_port: 31000 sub_cluster: default ``` -Keep these authorities separate: +Keep these roots separate: -- `shared_memory_path`: shared-memory / mmap authority -- `shared_file_path`: shared-file authority for `shared.json`, logs, and profiles +- `share_mem_path`: shared bundle root. Runtime appends `cluster_name`, and that directory holds `mmap.file`, `shared.json`, and peer metadata. +- `large_file_paths`: owner-only large-file authority for logs, profiles, caches, and other derived runtime assets - `FLUXON_LOG`: console log threshold for the user process -In zero-contribution external mode, owner-only fields such as `fluxonkv_spec.etcd_addresses`, `fluxonkv_spec.sub_cluster`, and `fluxonkv_spec.redis_compat` should not appear. 
+In zero-contribution external mode, owner-only fields such as `fluxonkv_spec.etcd_addresses`, `fluxonkv_spec.sub_cluster`, `fluxonkv_spec.large_file_paths`, and `fluxonkv_spec.redis_compat` should not appear. diff --git a/fluxon_doc_en/user_doc/User - 4 - MQ Interface.md b/fluxon_doc_en/user_doc/User - 4 - MQ Interface.md index 3445aff..8f59529 100644 --- a/fluxon_doc_en/user_doc/User - 4 - MQ Interface.md +++ b/fluxon_doc_en/user_doc/User - 4 - MQ Interface.md @@ -116,7 +116,7 @@ Parameter constraints: ### Common Error Handling -- `new_or_bind_with_unique_key(...)` fails: first check cluster name, shared memory / shared file paths, `unique_id`, and that both ends use matching roles +- `new_or_bind_with_unique_key(...)` fails: first check `cluster_name`, `share_mem_path`, `unique_id`, and that both ends use matching roles - `producer.put_data(...)` returns `ProducerClosedError`: treat it as a normal shutdown signal and exit the main loop - `consumer.get_data(...)` returns `ChannelClosedError`: treat it as a normal shutdown signal and exit the main loop @@ -124,7 +124,8 @@ Parameter constraints: - Python-side MQ logs come from `init_logger(...)` and go to the current terminal by default; the threshold is controlled by `FLUXON_LOG` - Rust / KV background logs follow the shared service-plane pipeline, and the master's local log authority is `master_cfg["log_dir"]` -- `shared_file_path` remains the local shared-file authority for `shared.json` and related files +- `share_mem_path` is the shared bundle root for `mmap.file`, `shared.json`, and peer metadata +- `large_file_paths` is the owner-only large-file authority for backend logs, profiles, caches, and other derived runtime assets If `master.monitoring.otlp_log_api` is configured, backend logs continue to flow into the Greptime `fluxon_logs` table. 
diff --git a/fluxon_doc_en/user_doc/User - 5 - FS Interface.md b/fluxon_doc_en/user_doc/User - 5 - FS Interface.md index fdf2242..dcf3cd2 100644 --- a/fluxon_doc_en/user_doc/User - 5 - FS Interface.md +++ b/fluxon_doc_en/user_doc/User - 5 - FS Interface.md @@ -287,8 +287,7 @@ Usually means the external client did not attach to the local owner. Check: - whether `start_kv_and_fs_svc.py` is still running - `CLUSTER_NAME` -- `SHARED_MEMORY_PATH` -- `SHARED_FILE_PATH` +- `SHARE_MEM_PATH` ### `fluxon_fs cache config is not loaded yet` diff --git a/fluxon_py/_api_ext_chan/mpmc.py b/fluxon_py/_api_ext_chan/mpmc.py index 8ad4683..4ddbc1e 100644 --- a/fluxon_py/_api_ext_chan/mpmc.py +++ b/fluxon_py/_api_ext_chan/mpmc.py @@ -243,12 +243,12 @@ def stable_delete_ready_keys_for_member( def _local_member_id_cache_path(kv_api: KvClient, mpmc_id: str, role: ChanRole) -> str: cfg = kv_api.config() - shared_memory_path = cfg.fluxonkv_spec_shared_memory_path - if not isinstance(shared_memory_path, str) or not shared_memory_path.strip(): - raise ValueError("fluxonkv_spec.shared_memory_path must be a non-empty string for local member-id cache") + share_mem_path = cfg.fluxonkv_spec_share_mem_path + if not isinstance(share_mem_path, str) or not share_mem_path.strip(): + raise ValueError("fluxonkv_spec.share_mem_path must be a non-empty string for local member-id cache") cluster_name = kv_api.get_cluster_name() role_name = role.value - cache_dir = os.path.join(shared_memory_path, cluster_name, "mq_member_id_cache") + cache_dir = os.path.join(share_mem_path, cluster_name, "mq_member_id_cache") os.makedirs(cache_dir, exist_ok=True) return os.path.join(cache_dir, f"mpmc_{mpmc_id}_{role_name}.json") diff --git a/fluxon_py/config.py b/fluxon_py/config.py index 9b7b447..5861f64 100644 --- a/fluxon_py/config.py +++ b/fluxon_py/config.py @@ -108,8 +108,8 @@ def _yaml_template(): fluxonkv_spec: # fluxon kv specific config (dict(optional)) etcd_addresses: # Etcd address list ((None|['{str}:{int}'])) 
cluster_name: # Cluster name (str) - shared_memory_path: # Shared memory path (str) - shared_file_path: # Shared file path for shared.json/logs/profiles (str) + share_mem_path: # Shared bundle path for mmap.file/shared.json/peer metadata (str) + large_file_paths: # Owner-mode ordered large-file roots (['{str}'](optional)) p2p_listen_port: # P2P QUIC listen port override (int(optional)) redis_compat: # Enable Redis protocol shim (dict(optional)) listen_addr: # TCP listen addr, e.g. "127.0.0.1:16379" (str) @@ -295,6 +295,116 @@ def _normalize_test_spec_config(raw: Any, ctx: str) -> Dict[str, Any]: return out +def _is_zero_contribution_fluxonkv_config(cfg: Dict[str, Any]) -> bool: + """ + Determine whether one Fluxon KV config uses zero-contribution mode. + + Contract: + - Missing contribute_to_cluster_pool_size means zero-contribution. + - Explicit dram=0 with all vram entries=0 also means zero-contribution. + - Partial-zero configurations are rejected to keep the role contract explicit. + """ + contrib_present = "contribute_to_cluster_pool_size" in cfg + contrib = cfg.get("contribute_to_cluster_pool_size") + if not contrib_present or contrib is None: + return True + if not isinstance(contrib, dict): + raise ValueError("contribute_to_cluster_pool_size must be a mapping when provided") + + dram = int(contrib["dram"]) + vram_raw = contrib.get("vram") + # Missing vram is normalized to "no GPU contribution". 
+ if vram_raw is None: + vram: Dict[str, Any] = {} + elif not isinstance(vram_raw, dict): + raise ValueError("contribute_to_cluster_pool_size.vram must be a mapping") + else: + vram = vram_raw + + vram_is_zero = True + for _, value in vram.items(): + if int(value) != 0: + vram_is_zero = False + break + if dram == 0 and not vram_is_zero: + raise ValueError( + "contribute_to_cluster_pool_size is partially zero: dram=0 but vram has non-zero values" + ) + return dram == 0 and vram_is_zero + + +def _validate_fluxonkv_contract(cfg: Dict[str, Any]) -> None: + """ + Validate the shared Fluxon KV contract and then apply role-specific checks. + + The contract must stay canonical across Python construction, YAML export, and + the Rust bridge, so owner/external differences live inside this one path. + """ + if "fluxonkv_spec" not in cfg: + return + + spec = cfg.get("fluxonkv_spec") + if not isinstance(spec, dict): + raise ValueError("fluxonkv_spec must be a mapping") + + is_zero_contribution = _is_zero_contribution_fluxonkv_config(cfg) + + share_mem_path = spec.get("share_mem_path") + if not isinstance(share_mem_path, str) or not share_mem_path.strip(): + raise ValueError("fluxonkv_spec.share_mem_path must be a non-empty string") + + if "rdma_device_names" in cfg: + raise ValueError("rdma_device_names has been removed from Fluxon KV config") + + if "transfer_engine" in spec: + raise ValueError("fluxonkv_spec.transfer_engine has been removed from Fluxon KV config") + + if is_zero_contribution: + forbidden_spec_keys = [ + "etcd_addresses", + "redis_compat", + "sub_cluster", + "large_file_paths", + ] + for key in forbidden_spec_keys: + if key in spec: + raise ValueError(f"fluxonkv_spec.{key} is forbidden in zero-contribution mode") + return + + contrib = cfg.get("contribute_to_cluster_pool_size") + if not isinstance(contrib, dict): + raise ValueError( + "contribute_to_cluster_pool_size is required for owner mode (non-zero contribution)" + ) + if int(contrib["dram"]) == 0: + raise 
ValueError("owner mode requires non-zero contribute_to_cluster_pool_size.dram") + + if "etcd_addresses" not in spec: + raise ValueError("fluxonkv_spec.etcd_addresses is required for owner mode") + etcd_addresses = spec.get("etcd_addresses") + if not isinstance(etcd_addresses, list) or len(etcd_addresses) == 0: + raise ValueError("fluxonkv_spec.etcd_addresses must be a non-empty list") + + if "sub_cluster" not in spec: + raise ValueError("fluxonkv_spec.sub_cluster is required for owner mode") + sub_cluster = spec.get("sub_cluster") + if not isinstance(sub_cluster, str) or not sub_cluster.strip(): + raise ValueError("fluxonkv_spec.sub_cluster must be a non-empty string in owner mode") + if sub_cluster != sub_cluster.strip(): + raise ValueError("fluxonkv_spec.sub_cluster must not have leading/trailing whitespace") + + if "large_file_paths" not in spec: + raise ValueError("fluxonkv_spec.large_file_paths is required for owner mode") + large_file_paths = spec.get("large_file_paths") + if not isinstance(large_file_paths, list) or len(large_file_paths) == 0: + raise ValueError("fluxonkv_spec.large_file_paths must be a non-empty list in owner mode") + for idx, field_value in enumerate(large_file_paths): + if not isinstance(field_value, str) or not field_value.strip(): + raise ValueError( + f"fluxonkv_spec.large_file_paths[{idx}] must be a non-empty string in owner mode" + ) + + class FluxonKvClientConfig(): """Configuration class for KV Cache stores that reads from YAML config files.""" @@ -334,80 +444,8 @@ def __init__(self, config_dict: Dict[str, Any]): raise ValueError("pprof_duration_seconds must be > 0") plain["pprof_duration_seconds"] = pprof_duration_seconds - # FluxonKV role selection contract: - # - Missing contribute_to_cluster_pool_size means "zero-contribution" mode. - # - Explicit contribute_to_cluster_pool_size with all zeros also means "zero-contribution" mode. - # - Any partial-zero contribution is rejected to avoid ambiguous behavior. 
if "fluxonkv_spec" in plain: - spec = plain.get("fluxonkv_spec") - if not isinstance(spec, dict): - raise ValueError("fluxonkv_spec must be a mapping") - - contrib_present = "contribute_to_cluster_pool_size" in plain - contrib = plain.get("contribute_to_cluster_pool_size") - - is_zero_contribution = False - if not contrib_present or contrib is None: - is_zero_contribution = True - elif isinstance(contrib, dict): - dram = int(contrib["dram"]) - vram_raw = contrib.get("vram") - # English note: - # - Owner-mode often contributes DRAM only; forcing `vram: {}` everywhere is noise. - # - Missing vram means "no GPU contribution", which is equivalent to an empty dict. - # - This is a schema normalization rule (not a fallback): if callers want VRAM, they - # must provide an explicit mapping with non-zero values. - if vram_raw is None: - vram: Dict[str, Any] = {} - elif not isinstance(vram_raw, dict): - raise ValueError("contribute_to_cluster_pool_size.vram must be a mapping") - else: - vram = vram_raw - vram_is_zero = True - for _, v in vram.items(): - if int(v) != 0: - vram_is_zero = False - break - if dram == 0 and not vram_is_zero: - raise ValueError( - "contribute_to_cluster_pool_size is partially zero: dram=0 but vram has non-zero values" - ) - is_zero_contribution = dram == 0 and vram_is_zero - else: - raise ValueError("contribute_to_cluster_pool_size must be a mapping when provided") - - if is_zero_contribution: - forbidden_spec_keys = [ - "etcd_addresses", - "redis_compat", - "sub_cluster", - ] - for k in forbidden_spec_keys: - if k in spec: - raise ValueError(f"fluxonkv_spec.{k} is forbidden in zero-contribution mode") - else: - if not contrib_present or not isinstance(contrib, dict): - raise ValueError( - "contribute_to_cluster_pool_size is required for owner mode (non-zero contribution)" - ) - if int(contrib["dram"]) == 0: - raise ValueError("owner mode requires non-zero contribute_to_cluster_pool_size.dram") - if "etcd_addresses" not in spec: - raise 
ValueError("fluxonkv_spec.etcd_addresses is required for owner mode") - etcd_addresses = spec.get("etcd_addresses") - if not isinstance(etcd_addresses, list) or len(etcd_addresses) == 0: - raise ValueError("fluxonkv_spec.etcd_addresses must be a non-empty list") - if "sub_cluster" not in spec: - raise ValueError("fluxonkv_spec.sub_cluster is required for owner mode") - sub_cluster = spec.get("sub_cluster") - if not isinstance(sub_cluster, str) or not sub_cluster.strip(): - raise ValueError( - "fluxonkv_spec.sub_cluster must be a non-empty string in owner mode" - ) - if sub_cluster != sub_cluster.strip(): - raise ValueError( - "fluxonkv_spec.sub_cluster must not have leading/trailing whitespace" - ) + _validate_fluxonkv_contract(plain) self.config_dict = plain @@ -488,10 +526,10 @@ def fluxonkv_spec_cluster_name(self): return self.config_dict["fluxonkv_spec"]["cluster_name"] @property - def fluxonkv_spec_shared_memory_path(self): + def fluxonkv_spec_share_mem_path(self): if "fluxonkv_spec" not in self.config_dict: return None - return self.config_dict["fluxonkv_spec"]["shared_memory_path"] + return self.config_dict["fluxonkv_spec"]["share_mem_path"] @property def fluxonkv_spec_transfer_engine(self): @@ -518,7 +556,9 @@ def __str__(self): def to_yaml_str(self) -> str: """Serialize the config dict into a YAML document string.""" - return yaml.safe_dump(self.config_dict, sort_keys=False) + cfg = self.to_dict() + _validate_fluxonkv_contract(cfg) + return yaml.safe_dump(cfg, sort_keys=False) def to_fluxon_kv_client_config_yaml_str(self) -> str: """Build the YAML string expected by the Rust `ClientConfigYaml` schema.""" @@ -531,59 +571,7 @@ def to_fluxon_kv_client_config_yaml_str(self) -> str: spec = cfg.get("fluxonkv_spec") if not isinstance(spec, dict): raise ValueError("fluxonkv_spec is required for Fluxon KV client") - - contrib_present = "contribute_to_cluster_pool_size" in cfg - contrib = cfg.get("contribute_to_cluster_pool_size") - is_zero_contribution = False - if 
not contrib_present or contrib is None: - is_zero_contribution = True - elif isinstance(contrib, dict): - dram = int(contrib["dram"]) - vram_raw = contrib.get("vram") - if vram_raw is None: - vram = {} - elif not isinstance(vram_raw, dict): - raise ValueError("contribute_to_cluster_pool_size.vram must be a mapping") - else: - vram = vram_raw - vram_is_zero = True - for _, v in vram.items(): - if int(v) != 0: - vram_is_zero = False - break - if dram == 0 and not vram_is_zero: - raise ValueError( - "contribute_to_cluster_pool_size is partially zero: dram=0 but vram has non-zero values" - ) - is_zero_contribution = dram == 0 and vram_is_zero - else: - raise ValueError("contribute_to_cluster_pool_size must be a mapping when provided") - - shared_memory_path = spec.get("shared_memory_path") - if not isinstance(shared_memory_path, str) or not shared_memory_path.strip(): - raise ValueError("fluxonkv_spec.shared_memory_path must be a non-empty string") - shared_file_path = spec.get("shared_file_path") - if not isinstance(shared_file_path, str) or not shared_file_path.strip(): - raise ValueError("fluxonkv_spec.shared_file_path must be a non-empty string") - - if "rdma_device_names" in cfg: - raise ValueError("rdma_device_names has been removed from Fluxon KV config") - - if "transfer_engine" in spec: - raise ValueError("fluxonkv_spec.transfer_engine has been removed from Fluxon KV config") - - if is_zero_contribution: - forbidden_spec_keys = [ - "etcd_addresses", - "redis_compat", - "sub_cluster", - ] - for k in forbidden_spec_keys: - if k in spec: - raise ValueError(f"fluxonkv_spec.{k} is forbidden in zero-contribution mode") - - return yaml.safe_dump(cfg, sort_keys=False) - + _validate_fluxonkv_contract(cfg) return yaml.safe_dump(cfg, sort_keys=False) diff --git a/fluxon_py/kvclient/fluxon.py b/fluxon_py/kvclient/fluxon.py index 3578405..1325e3d 100644 --- a/fluxon_py/kvclient/fluxon.py +++ b/fluxon_py/kvclient/fluxon.py @@ -821,6 +821,23 @@ def get_etcd_config(self) -> 
List[str]: out.append(addr) return out + def third_party_logs_dir(self) -> Result[str, ApiError]: + if self._client is None: + return Result.new_error(GeneralError(message="Store not initialized")) + try: + res = self._client.third_party_logs_dir() + if not res.is_ok(): + return Result.new_error(res.unwrap_error()) + logs_dir = res.unwrap() + if not isinstance(logs_dir, str) or not logs_dir: + return Result.new_error( + GeneralError(message=f"third_party_logs_dir must be non-empty str; got {logs_dir!r}") + ) + return Result.new_ok(logs_dir) + except ApiError as e: + return Result.new_error(e) + except Exception as e: + return Result.new_error(GeneralError(message=f"third_party_logs_dir failed: {e}")) def ensure_zero_contribution_for_channel(self) -> None: self._config.ensure_zero_contribution_for_channel() diff --git a/fluxon_py/kvclient/kvclient_interface.py b/fluxon_py/kvclient/kvclient_interface.py index 3170628..f50db0f 100644 --- a/fluxon_py/kvclient/kvclient_interface.py +++ b/fluxon_py/kvclient/kvclient_interface.py @@ -207,6 +207,9 @@ def get_cluster_name(self) -> str: def get_etcd_config(self) -> List[str]: """Return etcd endpoint list as raw host:port strings (no scheme).""" + @abstractmethod + def third_party_logs_dir(self) -> Result[str, ApiError]: + """Return the owner-derived log root for third-party Python components.""" @abstractmethod def ensure_zero_contribution_for_channel(self) -> None: diff --git a/fluxon_py/kvclient/mooncake.py b/fluxon_py/kvclient/mooncake.py index becd08d..4457cc3 100644 --- a/fluxon_py/kvclient/mooncake.py +++ b/fluxon_py/kvclient/mooncake.py @@ -836,6 +836,12 @@ def get_etcd_config(self) -> List[str]: raise InvalidConfigurationError(message=f"etcd endpoint must be raw host:port (no scheme), got: {addr!r}") return endpoints + def third_party_logs_dir(self) -> Result[str, ApiError]: + return Result.new_error( + GeneralError( + message="third_party_logs_dir is only supported by the Fluxon backend" + ) + ) def 
ensure_zero_contribution_for_channel(self) -> None: self._config.ensure_zero_contribution_for_channel() diff --git a/fluxon_py/logging.py b/fluxon_py/logging.py index 33fb2ae..bf67c3d 100755 --- a/fluxon_py/logging.py +++ b/fluxon_py/logging.py @@ -94,11 +94,7 @@ def init_logger(name: str = "fluxon") -> Logger: def init_mq_file_logger(name: str = "fluxon_mq") -> Logger: """Initialize an MQ-specific logger with an optional file handler. - Path rule aligned with Rust: - shared_file_path/{cluster_name}_cluster_mq_logs/ - - shared_file_path and cluster_name are provided by fluxon_pyo3.KvClient.logs_dir(), - to avoid scattering files under the shared-memory root directory. + Path rule aligned with Rust: third_party_logs_dir() is derived from owner large_file_paths. If fluxon_pyo3 is unavailable, falls back to console-only logging. """ @@ -112,20 +108,22 @@ def init_mq_file_logger(name: str = "fluxon_mq") -> Logger: ch.setFormatter(CustomFormatter()) logger.addHandler(ch) - # If fluxon_pyo3 is available, try using KvClient.logs_dir() as file log directory. + # Keep third-party file logs under one Fluxon-owned root so observability can discover them. log_dir = None try: from .tool import import_fluxon_pyo3_local fp = import_fluxon_pyo3_local() client = fp.KvClient() - log_dir = client.logs_dir() + third_party_log_dir = client.third_party_logs_dir().unwrap("third_party_logs_dir failed") + if isinstance(third_party_log_dir, str) and third_party_log_dir: + log_dir = os.path.join(third_party_log_dir, "mq") except ImportError as exc: logger.warning("init_mq_file_logger: fluxon_pyo3 not available; MQ file logs disabled: %s", exc) log_dir = None except Exception as exc: # noqa: BLE001 # Keep usable in cases like invalid config or client init failure; use console-only logging. 
- logger.warning("init_mq_file_logger: KvClient/logs_dir failed: %s", exc) + logger.warning("init_mq_file_logger: KvClient/third_party_logs_dir failed: %s", exc) log_dir = None if isinstance(log_dir, str) and log_dir: diff --git a/fluxon_py/tests/fluxon_fs_transfer_tikv_support.py b/fluxon_py/tests/fluxon_fs_transfer_tikv_support.py index 38d98c4..7e38d04 100644 --- a/fluxon_py/tests/fluxon_fs_transfer_tikv_support.py +++ b/fluxon_py/tests/fluxon_fs_transfer_tikv_support.py @@ -1546,10 +1546,8 @@ def __init__( self._kv_master_port = _pick_free_port() self._ui_base_url = f"http://127.0.0.1:{self._ui_port}" self._fs_s3_base_url = f"{self._ui_base_url}/fs_s3" - self._shared_memory_root = self._work_root / "sm" - self._shared_file_root = self._work_root / "sf" - self._shared_memory_root.mkdir(parents=True, exist_ok=True) - self._shared_file_root.mkdir(parents=True, exist_ok=True) + self._share_mem_root = self._work_root / "sm" + self._share_mem_root.mkdir(parents=True, exist_ok=True) self._etcd: EtcdHarness | None = None self._tikv: TiKvHarness | None = None self._monitor: DummyMonitoringHarness | None = None @@ -1573,8 +1571,8 @@ def store_config(self) -> FluxonFsTransferStateStoreConfig: raise RuntimeError("store_config is unavailable before harness init") return self._store_config - def _cluster_scoped_shared_file_dir(self) -> Path: - return self._shared_file_root / self._cluster_name + def _cluster_scoped_share_mem_dir(self) -> Path: + return self._share_mem_root / self._cluster_name def _monitoring_block(self) -> dict[str, Any]: if self._monitor is None: @@ -1595,9 +1593,9 @@ def _owner_kvclient_config(self) -> dict[str, Any]: "fluxonkv_spec": { "etcd_addresses": [self._etcd.endpoint], "cluster_name": self._cluster_name, - "shared_memory_path": str(self._shared_memory_root), - "shared_file_path": str(self._shared_file_root), + "share_mem_path": str(self._share_mem_root), "sub_cluster": "transfer_owner", + "large_file_paths": [str(self._work_root / "large" / 
"owner")], }, "test_spec_config": { "disable_observability": True, @@ -1609,8 +1607,7 @@ def _external_kvclient_config(self, *, instance_key: str) -> dict[str, Any]: "instance_key": instance_key, "fluxonkv_spec": { "cluster_name": self._cluster_name, - "shared_memory_path": str(self._shared_memory_root), - "shared_file_path": str(self._shared_file_root), + "share_mem_path": str(self._share_mem_root), }, "test_spec_config": { "disable_observability": True, @@ -1751,7 +1748,7 @@ def _prepare_configs(self) -> None: }, }, ) - self._owner_shared_json_path = self._cluster_scoped_shared_file_dir() / "shared.json" + self._owner_shared_json_path = self._cluster_scoped_share_mem_dir() / "shared.json" def _start_logged_process( self, diff --git a/fluxon_py/tests/test_api_chan_mpmc/test_mpmc_simple_bench.py b/fluxon_py/tests/test_api_chan_mpmc/test_mpmc_simple_bench.py index 5f45336..a29c46f 100644 --- a/fluxon_py/tests/test_api_chan_mpmc/test_mpmc_simple_bench.py +++ b/fluxon_py/tests/test_api_chan_mpmc/test_mpmc_simple_bench.py @@ -69,7 +69,6 @@ def _find_project_root(start: Path) -> Path: MOONCAKE_MASTER_SERVER_ADDRESS, MOONCAKE_METADATA_SERVER, load_test_fluxon_cluster_name, - load_test_fluxon_share_file_path, load_test_fluxon_share_mem_path, new_test_consumer, new_test_producer, @@ -103,7 +102,7 @@ def _find_project_root(start: Path) -> Path: WORKER_EXIT_TIMEOUT_SECONDS = 60.0 STOP_KEY_PREFIX = "/test_mpmc_simple_bench/stop/" SUMMARY_KEY_PREFIX = "/test_mpmc_simple_bench/summary/" -SharedBundle = tuple[str, str] +SharedBundle = str PayloadFieldValue = bytes | DLPackBytesView PayloadFields = dict[str, PayloadFieldValue] SINGLE_FIELD_PAYLOAD_KEY = "payload" @@ -159,8 +158,7 @@ def _build_parser() -> argparse.ArgumentParser: main_parser.add_argument("--batch-size", type=int, required=False, default=DEFAULT_BATCH_SIZE) main_parser.add_argument("--prefetch-num", type=int, required=False, default=DEFAULT_PREFETCH_NUM) main_parser.add_argument("--channel-capacity", type=int, 
required=False, default=DEFAULT_CHANNEL_CAPACITY) - main_parser.add_argument("--shared-memory-paths", type=str, required=False) - main_parser.add_argument("--shared-file-paths", type=str, required=False) + main_parser.add_argument("--share-mem-paths", type=str, required=False) producer_parser = subparsers.add_parser("run_producer", help="Run one producer worker") producer_parser.add_argument("--backend-type", required=True, type=str) producer_parser.add_argument("--ip", required=True, type=str) @@ -170,8 +168,7 @@ def _build_parser() -> argparse.ArgumentParser: producer_parser.add_argument("--payload-bytes", required=True, type=int) producer_parser.add_argument("--payload-kind", required=True, type=str, choices=PAYLOAD_KIND_CHOICES) producer_parser.add_argument("--channel-capacity", required=True, type=int) - producer_parser.add_argument("--shared-memory-path", required=False, type=str) - producer_parser.add_argument("--shared-file-path", required=False, type=str) + producer_parser.add_argument("--share-mem-path", required=False, type=str) producer_parser.add_argument("--stop-key", required=True, type=str) consumer_parser = subparsers.add_parser("run_consumer", help="Run one consumer worker") @@ -185,8 +182,7 @@ def _build_parser() -> argparse.ArgumentParser: consumer_parser.add_argument("--payload-kind", required=True, type=str, choices=PAYLOAD_KIND_CHOICES) consumer_parser.add_argument("--prefetch-num", required=True, type=int) consumer_parser.add_argument("--channel-capacity", required=True, type=int) - consumer_parser.add_argument("--shared-memory-path", required=False, type=str) - consumer_parser.add_argument("--shared-file-path", required=False, type=str) + consumer_parser.add_argument("--share-mem-path", required=False, type=str) consumer_parser.add_argument("--stop-key", required=True, type=str) consumer_parser.add_argument("--summary-key", required=True, type=str) return parser @@ -209,8 +205,7 @@ def _run_main(args: argparse.Namespace) -> None: 
_validate_main_args(args) consumer_counts = _parse_consumer_counts(args.consumer_counts) shared_bundles = _parse_shared_bundles( - shared_memory_paths_raw=args.shared_memory_paths, - shared_file_paths_raw=args.shared_file_paths, + share_mem_paths_raw=args.share_mem_paths, ) for consumer_count in consumer_counts: _run_one_case( @@ -272,8 +267,7 @@ def _run_one_case( bootstrap_store = _new_channel_store( role_key=f"{bench_id}_bootstrap", backend_type=KV_SVC_TYPE, - shared_memory_path=bootstrap_bundle[0], - shared_file_path=bootstrap_bundle[1], + share_mem_path=bootstrap_bundle, ) bootstrap_producer = None worker_processes: list[subprocess.Popen[str]] = [] @@ -305,8 +299,7 @@ def _run_one_case( payload_bytes=payload_bytes, payload_kind=payload_kind, channel_capacity=channel_capacity, - shared_memory_path=producer_bundle[0], - shared_file_path=producer_bundle[1], + share_mem_path=producer_bundle, stop_key=stop_key, ) ) @@ -338,10 +331,8 @@ def _run_one_case( str(prefetch_num), "--channel-capacity", str(channel_capacity), - "--shared-memory-path", - consumer_bundle[0], - "--shared-file-path", - consumer_bundle[1], + "--share-mem-path", + consumer_bundle, "--stop-key", stop_key, "--summary-key", @@ -418,8 +409,7 @@ def _run_producer(args: argparse.Namespace) -> None: store = _new_channel_store( role_key=f"{args.bench_id}_producer_{args.producer_id}", backend_type=args.backend_type, - shared_memory_path=args.shared_memory_path, - shared_file_path=args.shared_file_path, + share_mem_path=args.share_mem_path, ) producer = None restore_signal_listener = None @@ -500,8 +490,7 @@ def _run_consumer(args: argparse.Namespace) -> None: store = _new_channel_store( role_key=f"{args.bench_id}_consumer_{args.consumer_id}", backend_type=args.backend_type, - shared_memory_path=args.shared_memory_path, - shared_file_path=args.shared_file_path, + share_mem_path=args.share_mem_path, ) consumer = None restore_signal_listener = None @@ -669,8 +658,7 @@ def _validate_main_args(args: 
argparse.Namespace) -> None: _validate_non_negative_int("prefetch_num", args.prefetch_num) _validate_positive_int("channel_capacity", args.channel_capacity) _parse_shared_bundles( - shared_memory_paths_raw=args.shared_memory_paths, - shared_file_paths_raw=args.shared_file_paths, + share_mem_paths_raw=args.share_mem_paths, ) _validate_sample_window( total_duration_seconds=int(args.duration_seconds), @@ -732,14 +720,12 @@ def _new_channel_store( *, role_key: str, backend_type: str, - shared_memory_path: str | None, - shared_file_path: str | None, + share_mem_path: str | None, ): config = _new_store_config( instance_key=role_key, backend_type=backend_type, - shared_memory_path=shared_memory_path, - shared_file_path=shared_file_path, + share_mem_path=share_mem_path, ) result = new_store(config) if not result.is_ok(): @@ -751,8 +737,7 @@ def _new_store_config( *, instance_key: str, backend_type: str, - shared_memory_path: str | None, - shared_file_path: str | None, + share_mem_path: str | None, ) -> FluxonKvClientConfig: if backend_type == KvClientType.MOONCAKE.value: return FluxonKvClientConfig( @@ -772,14 +757,12 @@ def _new_store_config( ) if backend_type == KvClientType.FLUXON.value: - resolved_shared_memory_path, resolved_shared_file_path = _resolve_fluxon_shared_bundle( - shared_memory_path=shared_memory_path, - shared_file_path=shared_file_path, + resolved_share_mem_path = _resolve_fluxon_shared_bundle( + share_mem_path=share_mem_path, ) fluxon_spec: dict[str, Any] = { "cluster_name": load_test_fluxon_cluster_name(), - "shared_memory_path": resolved_shared_memory_path, - "shared_file_path": resolved_shared_file_path, + "share_mem_path": resolved_share_mem_path, } return FluxonKvClientConfig( { @@ -813,8 +796,7 @@ def _spawn_producer( payload_bytes: int, payload_kind: PayloadKind, channel_capacity: int, - shared_memory_path: str, - shared_file_path: str, + share_mem_path: str, stop_key: str, ) -> subprocess.Popen[str]: return _spawn_worker( @@ -838,10 +820,8 @@ 
def _spawn_producer( payload_kind.value, "--channel-capacity", str(channel_capacity), - "--shared-memory-path", - shared_memory_path, - "--shared-file-path", - shared_file_path, + "--share-mem-path", + share_mem_path, "--stop-key", stop_key, ] @@ -850,21 +830,11 @@ def _spawn_producer( def _parse_shared_bundles( *, - shared_memory_paths_raw: str | None, - shared_file_paths_raw: str | None, + share_mem_paths_raw: str | None, ) -> tuple[SharedBundle, ...]: - if shared_memory_paths_raw is None and shared_file_paths_raw is None: - return ((load_test_fluxon_share_mem_path(), load_test_fluxon_share_file_path()),) - if shared_memory_paths_raw is None or shared_file_paths_raw is None: - raise ValueError("shared-memory-paths and shared-file-paths must be set together") - shared_memory_paths = _parse_csv_paths(raw=shared_memory_paths_raw, arg_name="shared-memory-paths") - shared_file_paths = _parse_csv_paths(raw=shared_file_paths_raw, arg_name="shared-file-paths") - if len(shared_memory_paths) != len(shared_file_paths): - raise ValueError( - "shared-memory-paths and shared-file-paths length mismatch: " - f"{len(shared_memory_paths)} != {len(shared_file_paths)}" - ) - return tuple(zip(shared_memory_paths, shared_file_paths, strict=True)) + if share_mem_paths_raw is None: + return (load_test_fluxon_share_mem_path(),) + return _parse_csv_paths(raw=share_mem_paths_raw, arg_name="share-mem-paths") def _parse_csv_paths(*, raw: str, arg_name: str) -> tuple[str, ...]: @@ -887,18 +857,14 @@ def _select_shared_bundle(shared_bundles: tuple[SharedBundle, ...], worker_idx: def _resolve_fluxon_shared_bundle( *, - shared_memory_path: str | None, - shared_file_path: str | None, + share_mem_path: str | None, ) -> SharedBundle: - if shared_memory_path is None or shared_file_path is None: - raise ValueError( - "fluxon backend requires explicit shared_memory_path/shared_file_path for each worker" - ) - resolved_shared_memory_path = str(shared_memory_path).strip() - resolved_shared_file_path = 
str(shared_file_path).strip() - if resolved_shared_memory_path == "" or resolved_shared_file_path == "": - raise ValueError("shared_memory_path/shared_file_path must be non-empty strings") - return (resolved_shared_memory_path, resolved_shared_file_path) + if share_mem_path is None: + raise ValueError("fluxon backend requires explicit share_mem_path for each worker") + resolved_share_mem_path = str(share_mem_path).strip() + if resolved_share_mem_path == "": + raise ValueError("share_mem_path must be a non-empty string") + return resolved_share_mem_path def _terminate_processes(processes: list[subprocess.Popen[str]]) -> None: diff --git a/fluxon_py/tests/test_api_chan_mpmc/test_mpmc_simple_bench2.py b/fluxon_py/tests/test_api_chan_mpmc/test_mpmc_simple_bench2.py index eabac11..bc9cfa0 100644 --- a/fluxon_py/tests/test_api_chan_mpmc/test_mpmc_simple_bench2.py +++ b/fluxon_py/tests/test_api_chan_mpmc/test_mpmc_simple_bench2.py @@ -67,7 +67,6 @@ def _find_project_root(start: Path) -> Path: MOONCAKE_MASTER_SERVER_ADDRESS, MOONCAKE_METADATA_SERVER, load_test_fluxon_cluster_name, - load_test_fluxon_share_file_path, load_test_fluxon_share_mem_path, new_test_consumer, new_test_producer, @@ -898,8 +897,7 @@ def _new_store_config(*, instance_key: str, backend_type: str) -> FluxonKvClient if backend_type == KvClientType.FLUXON.value: fluxon_spec: dict[str, Any] = { "cluster_name": load_test_fluxon_cluster_name(), - "shared_memory_path": load_test_fluxon_share_mem_path(), - "shared_file_path": load_test_fluxon_share_file_path(), + "share_mem_path": load_test_fluxon_share_mem_path(), } return FluxonKvClientConfig( { diff --git a/fluxon_py/tests/test_backend_fallback_close.py b/fluxon_py/tests/test_backend_fallback_close.py index a33da71..74ad26d 100644 --- a/fluxon_py/tests/test_backend_fallback_close.py +++ b/fluxon_py/tests/test_backend_fallback_close.py @@ -93,6 +93,9 @@ def get_cluster_name(self): def get_etcd_config(self): return [] + def third_party_logs_dir(self): + 
return Result.new_ok("/tmp/fluxon_third_party_logs") + def ensure_zero_contribution_for_channel(self): return None diff --git a/fluxon_py/tests/test_backend_relay_deployconf.template.yaml b/fluxon_py/tests/test_backend_relay_deployconf.template.yaml index 90ea518..d965923 100644 --- a/fluxon_py/tests/test_backend_relay_deployconf.template.yaml +++ b/fluxon_py/tests/test_backend_relay_deployconf.template.yaml @@ -85,8 +85,9 @@ service: etcd_addresses: - "__ETCD_CONTAINER_NAME__:2379" cluster_name: "__CLUSTER_NAME__" - shared_memory_path: "__OWNER1_SHM__" + share_mem_path: "__OWNER1_SHM__" sub_cluster: "owner1" + large_file_paths: ["__OWNER1_LARGE_ROOT__"] p2p_listen_port: 31011 YAML exec python3 -m fluxon_py.runtime.start_owner_kvclient -c all_config.yaml -w "__RUNTIME_ROOT__/work/owner1" @@ -114,8 +115,9 @@ service: etcd_addresses: - "__ETCD_CONTAINER_NAME__:2379" cluster_name: "__CLUSTER_NAME__" - shared_memory_path: "__OWNER2_SHM__" + share_mem_path: "__OWNER2_SHM__" sub_cluster: "owner2" + large_file_paths: ["__OWNER2_LARGE_ROOT__"] p2p_listen_port: 31012 YAML exec python3 -m fluxon_py.runtime.start_owner_kvclient -c all_config.yaml -w "__RUNTIME_ROOT__/work/owner2" @@ -143,8 +145,9 @@ service: etcd_addresses: - "__ETCD_CONTAINER_NAME__:2379" cluster_name: "__CLUSTER_NAME__" - shared_memory_path: "__OWNER3_SHM__" + share_mem_path: "__OWNER3_SHM__" sub_cluster: "owner3" + large_file_paths: ["__OWNER3_LARGE_ROOT__"] p2p_listen_port: 31013 YAML exec python3 -m fluxon_py.runtime.start_owner_kvclient -c all_config.yaml -w "__RUNTIME_ROOT__/work/owner3" @@ -172,8 +175,9 @@ service: etcd_addresses: - "__ETCD_CONTAINER_NAME__:2379" cluster_name: "__CLUSTER_NAME__" - shared_memory_path: "__OWNER4_SHM__" + share_mem_path: "__OWNER4_SHM__" sub_cluster: "owner4" + large_file_paths: ["__OWNER4_LARGE_ROOT__"] p2p_listen_port: 31014 YAML exec python3 -m fluxon_py.runtime.start_owner_kvclient -c all_config.yaml -w "__RUNTIME_ROOT__/work/owner4" diff --git 
a/fluxon_py/tests/test_backend_relay_docker.py b/fluxon_py/tests/test_backend_relay_docker.py index 20b970d..d51bcbb 100644 --- a/fluxon_py/tests/test_backend_relay_docker.py +++ b/fluxon_py/tests/test_backend_relay_docker.py @@ -43,30 +43,34 @@ def main() -> None: mode = sys.argv[1] if mode == "wait-store": if len(sys.argv) != 5: - raise RuntimeError("wait-store requires: cluster_name shared_memory_path timeout_seconds") + raise RuntimeError("wait-store requires: cluster_name share_mem_path timeout_seconds") _wait_store(sys.argv[2], sys.argv[3], float(sys.argv[4])) print("wait-store ok") return if mode == "put": if len(sys.argv) != 6: - raise RuntimeError("put requires: cluster_name shared_memory_path key payload_base64") + raise RuntimeError("put requires: cluster_name share_mem_path key payload_base64") _put(sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5]) print("put ok") return if mode == "get": if len(sys.argv) != 7: - raise RuntimeError("get requires: cluster_name shared_memory_path key expected_base64 timeout_seconds") + raise RuntimeError("get requires: cluster_name share_mem_path key expected_base64 timeout_seconds") _get(sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5], float(sys.argv[6])) print("get ok") return raise RuntimeError(f"unknown mode: {mode}") - def _wait_store(cluster_name: str, shared_memory_path: str, timeout_seconds: float) -> None: + def _wait_store( + cluster_name: str, + share_mem_path: str, + timeout_seconds: float, + ) -> None: deadline = time.time() + timeout_seconds last_error = "" while time.time() < deadline: - result = new_store(_new_config(cluster_name, shared_memory_path)) + result = new_store(_new_config(cluster_name, share_mem_path)) if result.is_ok(): store = result.unwrap() _close_store(store) @@ -76,9 +80,14 @@ def _wait_store(cluster_name: str, shared_memory_path: str, timeout_seconds: flo raise RuntimeError(f"wait-store timed out: {last_error}") - def _put(cluster_name: str, shared_memory_path: str, key: str, 
payload_base64: str) -> None: + def _put( + cluster_name: str, + share_mem_path: str, + key: str, + payload_base64: str, + ) -> None: payload = base64.b64decode(payload_base64.encode("ascii")) - store = _open_store(cluster_name, shared_memory_path) + store = _open_store(cluster_name, share_mem_path) try: put_result = store.put(key, {"payload": payload}) if not put_result.is_ok(): @@ -93,14 +102,14 @@ def _put(cluster_name: str, shared_memory_path: str, key: str, payload_base64: s def _get( cluster_name: str, - shared_memory_path: str, + share_mem_path: str, key: str, expected_base64: str, timeout_seconds: float, ) -> None: expected = base64.b64decode(expected_base64.encode("ascii")) deadline = time.time() + timeout_seconds - store = _open_store(cluster_name, shared_memory_path) + store = _open_store(cluster_name, share_mem_path) try: last_error = "" while time.time() < deadline: @@ -125,21 +134,24 @@ def _get( _close_store(store) - def _new_config(cluster_name: str, shared_memory_path: str) -> FluxonKvClientConfig: + def _new_config( + cluster_name: str, + share_mem_path: str, + ) -> FluxonKvClientConfig: return FluxonKvClientConfig( { "instance_key": f"relay_helper_{os.getpid()}_{int(time.time() * 1000)}", "contribute_to_cluster_pool_size": {"dram": 0, "vram": {}}, "fluxonkv_spec": { "cluster_name": cluster_name, - "shared_memory_path": shared_memory_path, + "share_mem_path": share_mem_path, }, } ) - def _open_store(cluster_name: str, shared_memory_path: str): - result = new_store(_new_config(cluster_name, shared_memory_path)) + def _open_store(cluster_name: str, share_mem_path: str): + result = new_store(_new_config(cluster_name, share_mem_path)) if not result.is_ok(): raise RuntimeError(f"new_store failed: {result.unwrap_error()}") return result.unwrap() @@ -289,7 +301,7 @@ def _relay_wait_for_store( container_name: str, helper_path: str, cluster_name: str, - shared_memory_path: str, + share_mem_path: str, ) -> None: _relay_run( [ @@ -300,7 +312,7 @@ def 
_relay_wait_for_store( helper_path, "wait-store", cluster_name, - shared_memory_path, + share_mem_path, str(RELAY_DOCKER_WAIT_TIMEOUT_SECONDS), ], timeout_seconds=RELAY_DOCKER_WAIT_TIMEOUT_SECONDS + 30, @@ -423,6 +435,10 @@ def test_relay_docker_connectivity() -> int: owner_name: f"{container_runtime_root}/shm/{owner_name}" for owner_name in ("owner1", "owner2", "owner3", "owner4") } + owner_large_root_paths = { + owner_name: f"{container_runtime_root}/large/{owner_name}" + for owner_name in ("owner1", "owner2", "owner3", "owner4") + } _relay_render_template( deployconf_template_path, rendered_deployconf_path, @@ -448,6 +464,10 @@ def test_relay_docker_connectivity() -> int: "__OWNER2_SHM__": owner_shm_paths["owner2"], "__OWNER3_SHM__": owner_shm_paths["owner3"], "__OWNER4_SHM__": owner_shm_paths["owner4"], + "__OWNER1_LARGE_ROOT__": owner_large_root_paths["owner1"], + "__OWNER2_LARGE_ROOT__": owner_large_root_paths["owner2"], + "__OWNER3_LARGE_ROOT__": owner_large_root_paths["owner3"], + "__OWNER4_LARGE_ROOT__": owner_large_root_paths["owner4"], }, ) @@ -487,13 +507,13 @@ def test_relay_docker_connectivity() -> int: container_name=container_names["owner1"], helper_path=helper_container_path, cluster_name=cluster_name, - shared_memory_path=owner_shm_paths["owner1"], + share_mem_path=owner_shm_paths["owner1"], ) _relay_wait_for_store( container_name=container_names["owner4"], helper_path=helper_container_path, cluster_name=cluster_name, - shared_memory_path=owner_shm_paths["owner4"], + share_mem_path=owner_shm_paths["owner4"], ) key = f"/relay_docker/{run_suffix}/payload" diff --git a/fluxon_py/tests/test_config.py b/fluxon_py/tests/test_config.py index 379e3e0..6de5180 100644 --- a/fluxon_py/tests/test_config.py +++ b/fluxon_py/tests/test_config.py @@ -47,6 +47,8 @@ def _build_checks(selected_test_id: Optional[str]) -> List[Tuple[str, Callable[[ ("to_yaml_str_roundtrip", _run_test_to_yaml_str_roundtrip), ("fluxonkv_sub_cluster_config", 
test_fluxonkv_sub_cluster_config), ("fluxonkv_owner_requires_sub_cluster", test_fluxonkv_owner_requires_sub_cluster), + ("fluxonkv_owner_requires_large_file_paths", test_fluxonkv_owner_requires_large_file_paths), + ("fluxonkv_external_forbids_large_file_paths", test_fluxonkv_external_forbids_large_file_paths), ("fluxonkv_p2p_relay_removed", test_fluxonkv_p2p_relay_removed), ("fluxon_client_config_yaml_shape", test_fluxon_client_config_yaml_shape), ("fluxonkv_protocol_field", test_fluxonkv_protocol_field), @@ -142,21 +144,36 @@ def _import_fluxon_pyo3_tool_without_package_init(): _PYO3_TOOL = _import_fluxon_pyo3_tool_without_package_init() +def _owner_large_file_paths(tag: str) -> list[str]: + return [f"/tmp/kvcache_large/{tag}"] + + +def _owner_fluxonkv_base_config( + *, + instance_key: str = "test_instance", + cluster_name: str = "test_cluster", + share_mem_path: str = "/tmp/kvcache_shared_memory/test", + sub_cluster: str = "rack-a", + tag: str = "test", +) -> dict: + return { + "instance_key": instance_key, + "contribute_to_cluster_pool_size": {"dram": 16777216, "vram": {}}, + "fluxonkv_spec": { + "etcd_addresses": ["localhost:2379"], + "cluster_name": cluster_name, + "share_mem_path": share_mem_path, + "sub_cluster": sub_cluster, + "large_file_paths": _owner_large_file_paths(tag), + }, + } + + def test_fluxonkv_sub_cluster_config(): """Test fluxonkv_spec.sub_cluster is accepted and exposed.""" try: config = FluxonKvClientConfig( - { - "instance_key": "test_instance", - "contribute_to_cluster_pool_size": {"dram": 16777216, "vram": {}}, - "fluxonkv_spec": { - "etcd_addresses": ["localhost:2379"], - "cluster_name": "test_cluster", - "shared_memory_path": "/tmp/kvcache_shared_memory/test", - "shared_file_path": "/tmp/kvcache_shared_files/test", - "sub_cluster": "producer_side", - }, - } + _owner_fluxonkv_base_config(sub_cluster="producer_side", tag="sub_cluster") ) assert config.fluxonkv_spec_sub_cluster == "producer_side" print("✅ PASS: 
test_fluxonkv_sub_cluster_config") @@ -225,16 +242,8 @@ def test_fluxon_pyo3_import_authority(): def test_fluxonkv_owner_requires_sub_cluster(): """Ensure owner mode requires a clean non-empty fluxonkv_spec.sub_cluster.""" try: - base = { - "instance_key": "test_instance", - "contribute_to_cluster_pool_size": {"dram": 16777216, "vram": {}}, - "fluxonkv_spec": { - "etcd_addresses": ["localhost:2379"], - "cluster_name": "test_cluster", - "shared_memory_path": "/tmp/kvcache_shared_memory/test", - "shared_file_path": "/tmp/kvcache_shared_files/test", - }, - } + base = _owner_fluxonkv_base_config(tag="owner_requires_sub_cluster") + del base["fluxonkv_spec"]["sub_cluster"] try: FluxonKvClientConfig(copy.deepcopy(base)) @@ -270,20 +279,64 @@ def test_fluxonkv_owner_requires_sub_cluster(): print(f"❌ FAIL: test_fluxonkv_owner_requires_sub_cluster - {e}") -def test_fluxonkv_p2p_relay_removed(): - """Ensure removed fluxonkv_spec.p2p_relay is rejected as an unknown key.""" +def test_fluxonkv_owner_requires_large_file_paths(): + """Ensure owner mode requires explicit large_file_paths roots.""" try: - base = { - "instance_key": "test_instance", - "contribute_to_cluster_pool_size": {"dram": 16777216, "vram": {}}, + base = _owner_fluxonkv_base_config(tag="owner_requires_large_file_paths") + del base["fluxonkv_spec"]["large_file_paths"] + + try: + FluxonKvClientConfig(copy.deepcopy(base)) + print("❌ FAIL: test_fluxonkv_owner_requires_large_file_paths - missing large_file_paths should be rejected") + return + except ValueError: + pass + + invalid_blank = copy.deepcopy(base) + invalid_blank["fluxonkv_spec"]["large_file_paths"] = [" "] + try: + FluxonKvClientConfig(invalid_blank) + print("❌ FAIL: test_fluxonkv_owner_requires_large_file_paths - blank large_file_paths entry should be rejected") + return + except ValueError: + pass + + valid = _owner_fluxonkv_base_config(tag="owner_requires_large_file_paths_valid") + rendered = 
FluxonKvClientConfig(valid).to_fluxon_kv_client_config_yaml_str() + assert "large_file_paths:" in rendered + assert "- /tmp/kvcache_large/owner_requires_large_file_paths_valid" in rendered + print("✅ PASS: test_fluxonkv_owner_requires_large_file_paths") + except Exception as e: + print(f"❌ FAIL: test_fluxonkv_owner_requires_large_file_paths - {e}") + + +def test_fluxonkv_external_forbids_large_file_paths(): + """Ensure zero-contribution external config cannot declare owner-only large_file_paths.""" + try: + external = { + "instance_key": "test_external", + "contribute_to_cluster_pool_size": {"dram": 0, "vram": {}}, "fluxonkv_spec": { - "etcd_addresses": ["localhost:2379"], "cluster_name": "test_cluster", - "shared_memory_path": "/tmp/kvcache_shared_memory/test", - "shared_file_path": "/tmp/kvcache_shared_files/test", - "sub_cluster": "rack-a", + "share_mem_path": "/tmp/kvcache_shared_memory/test", + "large_file_paths": _owner_large_file_paths("external_forbidden"), }, } + try: + FluxonKvClientConfig(external) + print("❌ FAIL: test_fluxonkv_external_forbids_large_file_paths - external large_file_paths should be rejected") + return + except ValueError: + pass + print("✅ PASS: test_fluxonkv_external_forbids_large_file_paths") + except Exception as e: + print(f"❌ FAIL: test_fluxonkv_external_forbids_large_file_paths - {e}") + + +def test_fluxonkv_p2p_relay_removed(): + """Ensure removed fluxonkv_spec.p2p_relay is rejected as an unknown key.""" + try: + base = _owner_fluxonkv_base_config(tag="p2p_relay_removed") _ = FluxonKvClientConfig(copy.deepcopy(base)) @@ -304,23 +357,13 @@ def test_fluxonkv_p2p_relay_removed(): def test_fluxon_client_config_yaml_shape(): """Test YAML shape required by Rust ClientConfigYaml.""" try: - base = { - "instance_key": "test_instance", - "contribute_to_cluster_pool_size": {"dram": 16777216, "vram": {}}, - "fluxonkv_spec": { - "etcd_addresses": ["localhost:2379"], - "cluster_name": "test_cluster", - "shared_memory_path": 
"/tmp/kvcache_shared_memory/test", - "shared_file_path": "/tmp/kvcache_shared_files/test", - "sub_cluster": "rack-a", - }, - } + base = _owner_fluxonkv_base_config(tag="yaml_shape") config = FluxonKvClientConfig(copy.deepcopy(base)) yaml_text = config.to_fluxon_kv_client_config_yaml_str() loaded = yaml.safe_load(yaml_text) - assert loaded["fluxonkv_spec"]["shared_memory_path"] == base["fluxonkv_spec"]["shared_memory_path"] + assert loaded["fluxonkv_spec"]["share_mem_path"] == base["fluxonkv_spec"]["share_mem_path"] assert loaded["fluxonkv_spec"]["sub_cluster"] == base["fluxonkv_spec"]["sub_cluster"] - assert "shared_memory_path" not in loaded + assert "share_mem_path" not in loaded assert "rdma_device_names" not in loaded assert "transfer_engine" not in loaded["fluxonkv_spec"] print("✅ PASS: test_fluxon_client_config_yaml_shape") @@ -338,8 +381,7 @@ def test_fluxonkv_protocol_field(): }, "fluxonkv_spec": { "cluster_name": "test_cluster", - "shared_memory_path": "/tmp/kvcache_shared_memory/test_side_worker", - "shared_file_path": "/tmp/kvcache_shared_files/test_side_worker", + "share_mem_path": "/tmp/kvcache_shared_memory/test_side_worker", }, "test_spec_config": { "enable_side_transfer": True, @@ -361,17 +403,7 @@ def test_fluxonkv_protocol_field(): def test_fluxonkv_runtime_defaults_are_internal(): """Ensure Fluxon KV runtime defaults stay internal and are not serialized into YAML.""" try: - base = { - "instance_key": "test_instance", - "contribute_to_cluster_pool_size": {"dram": 16777216, "vram": {}}, - "fluxonkv_spec": { - "etcd_addresses": ["localhost:2379"], - "cluster_name": "test_cluster", - "shared_memory_path": "/tmp/kvcache_shared_memory/test", - "shared_file_path": "/tmp/kvcache_shared_files/test", - "sub_cluster": "rack-a", - }, - } + base = _owner_fluxonkv_base_config(tag="runtime_defaults") config = FluxonKvClientConfig(copy.deepcopy(base)) assert config.fluxonkv_spec_transfer_engine == "closed" assert config.protocol_rdma_device_names is None @@ 
-387,17 +419,7 @@ def test_fluxonkv_runtime_defaults_are_internal(): def test_fluxonkv_removed_rdma_config_keys(): """Ensure removed Fluxon KV RDMA config keys are rejected.""" try: - base = { - "instance_key": "test_instance", - "contribute_to_cluster_pool_size": {"dram": 16777216, "vram": {}}, - "fluxonkv_spec": { - "etcd_addresses": ["localhost:2379"], - "cluster_name": "test_cluster", - "shared_memory_path": "/tmp/kvcache_shared_memory/test", - "shared_file_path": "/tmp/kvcache_shared_files/test", - "sub_cluster": "rack-a", - }, - } + base = _owner_fluxonkv_base_config(tag="removed_rdma_keys") invalid_rdma = copy.deepcopy(base) invalid_rdma["rdma_device_names"] = "mlx5_0:1" @@ -425,21 +447,11 @@ def test_fluxonkv_removed_rdma_config_keys(): def test_fluxonkv_test_spec_config(): """Ensure test_spec_config is accepted, normalized, and serialized.""" try: - base = { - "instance_key": "test_instance", - "contribute_to_cluster_pool_size": {"dram": 16777216, "vram": {}}, - "fluxonkv_spec": { - "etcd_addresses": ["localhost:2379"], - "cluster_name": "test_cluster", - "shared_memory_path": "/tmp/kvcache_shared_memory/test", - "shared_file_path": "/tmp/kvcache_shared_files/test", - "sub_cluster": "rack-a", - }, - "test_spec_config": { - "disable_observability": True, - "enable_iceoryx_logs": True, - "transport_mode": "transfer_only", - }, + base = _owner_fluxonkv_base_config(tag="test_spec_config") + base["test_spec_config"] = { + "disable_observability": True, + "enable_iceoryx_logs": True, + "transport_mode": "transfer_only", } try: diff --git a/fluxon_py/tests/test_config.yaml b/fluxon_py/tests/test_config.yaml index d47332c..8520ec2 100644 --- a/fluxon_py/tests/test_config.yaml +++ b/fluxon_py/tests/test_config.yaml @@ -1,5 +1,4 @@ kv_svc_type: fluxon etcd_address: 127.0.0.1:2379 cluster_name: fluxon-example-cluster -shared_memory_path: /tmp/fluxon-example-cluster/shm -shared_file_path: /tmp/fluxon-example-cluster/share +share_mem_path: 
/tmp/fluxon-example-cluster/shm diff --git a/fluxon_py/tests/test_fluxon_fs_patcher.py b/fluxon_py/tests/test_fluxon_fs_patcher.py index 943c5d3..3b02a08 100644 --- a/fluxon_py/tests/test_fluxon_fs_patcher.py +++ b/fluxon_py/tests/test_fluxon_fs_patcher.py @@ -23,7 +23,6 @@ def main() -> None: from fluxon_py.api_error import KeyNotFoundError # noqa: E402 from fluxon_py.tests.test_lib import ( # noqa: E402 load_test_fluxon_cluster_name, - load_test_fluxon_share_file_path, load_test_fluxon_share_mem_path, ) @@ -36,24 +35,22 @@ def _new_test_dir(tag: str) -> Path: return p -def _load_ci_cluster() -> tuple[str, str, str]: +def _load_ci_cluster() -> tuple[str, str]: return ( load_test_fluxon_cluster_name(), load_test_fluxon_share_mem_path(), - load_test_fluxon_share_file_path(), ) def _new_fluxon_external_store(*, instance_key: str): - cluster_name, share_mem_path, share_file_path = _load_ci_cluster() + cluster_name, share_mem_path = _load_ci_cluster() cfg = FluxonKvClientConfig( { "instance_key": instance_key, "contribute_to_cluster_pool_size": {"dram": 0, "vram": {}}, "fluxonkv_spec": { "cluster_name": cluster_name, - "shared_memory_path": share_mem_path, - "shared_file_path": share_file_path, + "share_mem_path": share_mem_path, }, } ) diff --git a/fluxon_py/tests/test_fluxon_fs_remote_mount.py b/fluxon_py/tests/test_fluxon_fs_remote_mount.py index a2b9ed9..9283073 100644 --- a/fluxon_py/tests/test_fluxon_fs_remote_mount.py +++ b/fluxon_py/tests/test_fluxon_fs_remote_mount.py @@ -19,7 +19,6 @@ def main() -> None: from fluxon_py.kvclient import new_store # noqa: E402 from fluxon_py.tests.test_lib import ( # noqa: E402 load_test_fluxon_cluster_name, - load_test_fluxon_share_file_path, load_test_fluxon_share_mem_path, ) @@ -32,11 +31,10 @@ def _new_test_dir(tag: str) -> Path: return p -def _load_ci_cluster() -> tuple[str, str, str]: +def _load_ci_cluster() -> tuple[str, str]: return ( load_test_fluxon_cluster_name(), load_test_fluxon_share_mem_path(), - 
load_test_fluxon_share_file_path(), ) @@ -45,7 +43,6 @@ def _new_fluxon_external_store_with_cluster( instance_key: str, cluster_name: str, share_mem_path: str, - share_file_path: str, ): cfg = FluxonKvClientConfig( { @@ -53,8 +50,7 @@ def _new_fluxon_external_store_with_cluster( "contribute_to_cluster_pool_size": {"dram": 0, "vram": {}}, "fluxonkv_spec": { "cluster_name": cluster_name, - "shared_memory_path": share_mem_path, - "shared_file_path": share_file_path, + "share_mem_path": share_mem_path, }, } ) @@ -79,7 +75,7 @@ def setUpClass(cls) -> None: cls._remote_root = (cls._tmp / "remote_root").resolve() cls._remote_root.mkdir(parents=True, exist_ok=False) - cls._cluster_name, cls._share_mem_path, cls._share_file_path = _load_ci_cluster() + cls._cluster_name, cls._share_mem_path = _load_ci_cluster() # Keep the mountpoint under a writable temp directory to avoid relying on root paths. # The engine will create the mountpoint if it does not exist. @@ -91,13 +87,11 @@ def setUpClass(cls) -> None: instance_key=f"test_fluxon_fs_agent_{os.getpid()}", cluster_name=cls._cluster_name, share_mem_path=cls._share_mem_path, - share_file_path=cls._share_file_path, ) cls._client_store = _new_fluxon_external_store_with_cluster( instance_key=f"test_fluxon_fs_client_{os.getpid()}", cluster_name=cls._cluster_name, share_mem_path=cls._share_mem_path, - share_file_path=cls._share_file_path, ) agent_key_res = cls._agent_store.instance_key() diff --git a/fluxon_py/tests/test_lib.py b/fluxon_py/tests/test_lib.py index a246e24..9be7003 100644 --- a/fluxon_py/tests/test_lib.py +++ b/fluxon_py/tests/test_lib.py @@ -35,8 +35,7 @@ load_test_config_mapping, load_test_etcd_address_from_test_config, load_test_fluxon_cluster_name_from_test_config, - load_test_fluxon_shared_file_path_from_test_config, - load_test_fluxon_shared_memory_path_from_test_config, + load_test_fluxon_share_mem_path_from_test_config, load_test_kv_svc_type_from_test_config, ) @@ -85,12 +84,7 @@ def 
load_test_fluxon_cluster_name(*, config_path: Optional[Path] = None) -> str: def load_test_fluxon_share_mem_path(*, config_path: Optional[Path] = None) -> str: """Load required Fluxon shared-memory path from test_config.yaml.""" - return load_test_fluxon_shared_memory_path_from_test_config(config_path=config_path) - - -def load_test_fluxon_share_file_path(*, config_path: Optional[Path] = None) -> str: - """Load required Fluxon shared-file path from test_config.yaml.""" - return load_test_fluxon_shared_file_path_from_test_config(config_path=config_path) + return load_test_fluxon_share_mem_path_from_test_config(config_path=config_path) def load_test_chan_config(*, config_path: Optional[Path] = None) -> Dict[str, int]: @@ -283,12 +277,10 @@ def new_shared_stores( # Strictly require fluxon-specific fields from the shared test/example deployconf. cluster_name = load_test_fluxon_cluster_name() share_mem = load_test_fluxon_share_mem_path() - share_file = load_test_fluxon_share_file_path() spec = { "fluxonkv_spec": { "cluster_name": cluster_name, - "shared_memory_path": share_mem, - "shared_file_path": share_file, + "share_mem_path": share_mem, } } diff --git a/fluxon_py/tests/test_mq/test_example_ctrl_c_exit.py b/fluxon_py/tests/test_mq/test_example_ctrl_c_exit.py index 8e416a2..c1b3193 100644 --- a/fluxon_py/tests/test_mq/test_example_ctrl_c_exit.py +++ b/fluxon_py/tests/test_mq/test_example_ctrl_c_exit.py @@ -97,10 +97,9 @@ def _build_store_config(*, config_path: Path, workdir: Path) -> FluxonKvClientCo producer_cfg = dict(loaded["mpmc_demo"]["producer"]) kvexternal_cfg["instance_key"] = str(producer_cfg["instance_key"]) spec = dict(kvexternal_cfg["fluxonkv_spec"]) - for field_name in ("shared_memory_path", "shared_file_path"): - raw_path = spec.get(field_name) - if isinstance(raw_path, str) and raw_path and not Path(raw_path).is_absolute(): - spec[field_name] = str((workdir / raw_path).resolve()) + raw_path = spec.get("share_mem_path") + if isinstance(raw_path, str) 
and raw_path and not Path(raw_path).is_absolute(): + spec["share_mem_path"] = str((workdir / raw_path).resolve()) kvexternal_cfg["fluxonkv_spec"] = spec return FluxonKvClientConfig(kvexternal_cfg) @@ -223,10 +222,9 @@ def _build_store_config(*, config_path: Path, workdir: Path) -> FluxonKvClientCo consumer_cfg = dict(loaded["mpmc_demo"]["consumer"]) kvexternal_cfg["instance_key"] = str(consumer_cfg["instance_key"]) spec = dict(kvexternal_cfg["fluxonkv_spec"]) - for field_name in ("shared_memory_path", "shared_file_path"): - raw_path = spec.get(field_name) - if isinstance(raw_path, str) and raw_path and not Path(raw_path).is_absolute(): - spec[field_name] = str((workdir / raw_path).resolve()) + raw_path = spec.get("share_mem_path") + if isinstance(raw_path, str) and raw_path and not Path(raw_path).is_absolute(): + spec["share_mem_path"] = str((workdir / raw_path).resolve()) kvexternal_cfg["fluxonkv_spec"] = spec return FluxonKvClientConfig(kvexternal_cfg) @@ -462,7 +460,7 @@ def _build_example_config( unique_suffix: str, cluster_name: str, etcd_endpoint: str, - shared_memory_path: str, + share_mem_path: str, greptime_http_port: int, master_port: int, ) -> dict[str, Any]: @@ -474,7 +472,7 @@ def _build_example_config( "cluster_name": cluster_name, "instance_key": f"example_ctrlc_master_{unique_suffix}", "port": master_port, - "log_dir": str((Path(shared_memory_path).parent / "log" / "master").resolve()), + "log_dir": str((Path(share_mem_path).parent / "log" / "master").resolve()), "monitoring": _monitoring_block(greptime_http_port=greptime_http_port), }, "kvclient": { @@ -483,9 +481,9 @@ def _build_example_config( "fluxonkv_spec": { "etcd_addresses": [etcd_endpoint], "cluster_name": cluster_name, - "shared_memory_path": shared_memory_path, - "shared_file_path": str((Path(shared_memory_path).parent / "sharefile").resolve()), + "share_mem_path": share_mem_path, "sub_cluster": "demo", + "large_file_paths": [str((Path(share_mem_path).parent / "large" / 
"owner").resolve())], }, }, "kvexternal": { @@ -493,8 +491,7 @@ def _build_example_config( "contribute_to_cluster_pool_size": {"dram": 0, "vram": {}}, "fluxonkv_spec": { "cluster_name": cluster_name, - "shared_memory_path": shared_memory_path, - "shared_file_path": str((Path(shared_memory_path).parent / "sharefile").resolve()), + "share_mem_path": share_mem_path, }, }, "mpmc_demo": { @@ -524,8 +521,8 @@ def _write_runtime_subconfig(*, path: Path, config: dict[str, Any], key: str) -> ) -def _kvclient_shared_json_target(*, shared_file_path: Path, cluster_name: str) -> Path: - return shared_file_path / cluster_name / "shared.json" +def _kvclient_shared_json_target(*, share_mem_path: Path, cluster_name: str) -> Path: + return share_mem_path / cluster_name / "shared.json" def _start_local_stack(*, temp_root: Path, config_path: Path) -> list[tuple[subprocess.Popen[str], Path]]: @@ -590,13 +587,13 @@ def _start_local_stack(*, temp_root: Path, config_path: Path) -> list[tuple[subp unique_suffix = uuid.uuid4().hex[:12] cluster_name = f"example_ctrlc_cluster_{unique_suffix}" - shared_memory_path = str((temp_root / "sharemem").resolve()) + share_mem_path = str((temp_root / "sharemem").resolve()) master_port = _pick_free_port() config = _build_example_config( unique_suffix=unique_suffix, cluster_name=cluster_name, etcd_endpoint=etcd_endpoint, - shared_memory_path=shared_memory_path, + share_mem_path=share_mem_path, greptime_http_port=greptime_http_port, master_port=master_port, ) @@ -637,7 +634,7 @@ def _start_local_stack(*, temp_root: Path, config_path: Path) -> list[tuple[subp env=env, ) kvclient_shared_json = _kvclient_shared_json_target( - shared_file_path=Path(str(config["kvclient"]["fluxonkv_spec"]["shared_file_path"])).resolve(), + share_mem_path=Path(str(config["kvclient"]["fluxonkv_spec"]["share_mem_path"])).resolve(), cluster_name=cluster_name, ) _wait_for_path( diff --git a/fluxon_rs/Cargo.lock b/fluxon_rs/Cargo.lock index 4ddcf9b..a4b0ecd 100644 --- 
a/fluxon_rs/Cargo.lock +++ b/fluxon_rs/Cargo.lock @@ -1320,6 +1320,7 @@ dependencies = [ "anyhow", "askama", "base64 0.21.7", + "chrono", "clap", "etcd-client", "fluxon_cli", @@ -1336,6 +1337,7 @@ dependencies = [ "serde_json", "serde_yaml", "sha2", + "tempfile", "thiserror 1.0.69", "tokio", "tracing", diff --git a/fluxon_rs/fluxon_fs/src/agent.rs b/fluxon_rs/fluxon_fs/src/agent.rs index eca583e..a482616 100644 --- a/fluxon_rs/fluxon_fs/src/agent.rs +++ b/fluxon_rs/fluxon_fs/src/agent.rs @@ -1407,20 +1407,23 @@ impl FluxonFsAgent { .get_self_info() .id .to_string(); - let shared_file_path = if self.kv_framework.is_external_mode() { + let cache_root_base = if self.kv_framework.is_external_mode() { self.kv_framework .external_client_api_view() .external_client_api() .inner() - .shared_file_path() + .large_file_paths() + .fs_disk_cache_base_dir() + .map_err(|err| format!("invalid external large_file_paths: {}", err))? } else { self.kv_framework .client_seg_pool_view() .client_seg_pool() - .shared_file_path() - .to_string() + .large_file_paths() + .fs_disk_cache_base_dir() + .map_err(|err| format!("invalid owner large_file_paths: {}", err))? 
}; - let cache_root = resolve_disk_cache_root(Path::new(&shared_file_path), &instance_key); + let cache_root = resolve_disk_cache_root(cache_root_base.as_path(), &instance_key); let cache = RemoteDiskCacheManager::new(cache_root.clone(), disk_cache_max_bytes_from_env()) .map_err(|err| { diff --git a/fluxon_rs/fluxon_fs/src/agent_service.rs b/fluxon_rs/fluxon_fs/src/agent_service.rs index 91315df..395dfbc 100644 --- a/fluxon_rs/fluxon_fs/src/agent_service.rs +++ b/fluxon_rs/fluxon_fs/src/agent_service.rs @@ -56,6 +56,8 @@ use crate::write_session_rpc::{ FsWriteSessionChunkResp, FsWriteSessionDataFrame, }; +pub(crate) mod transfer_agent; + pub const CHUNK_BYTES: usize = 1024 * 1024; pub const READ_CHUNK_BYTES: usize = 8 * 1024 * 1024; pub const WRITE_SESSION_CHUNK_BYTES: usize = crate::agent::REMOTE_WRITE_SESSION_CHUNK_BYTES; @@ -65,6 +67,9 @@ const WRITE_SESSION_MAX_QUEUED_BYTES: usize = const WRITE_SESSION_IDLE_TIMEOUT_SECS: u64 = 180; const WRITE_SESSION_REAP_INTERVAL_SECS: u64 = 30; const WRITE_SESSION_CLOSE_WAIT_TIMEOUT_SECS: u64 = 30; +pub(crate) const TRANSFER_HEARTBEAT_INTERVAL_MS: i64 = 5_000; +pub(crate) const TRANSFER_STREAM_RPC_TIMEOUT_MS: u64 = 60_000; +pub(crate) const TRANSFER_WORKER_COORDINATION_RPC_TIMEOUT_MS: u64 = 30_000; const AGENT_EXPORTS_SNAPSHOT_SCHEMA_VERSION_KEY: &str = "schema_version"; const AGENT_EXPORTS_SNAPSHOT_EXPORTS_JSON_KEY: &str = "exports_json"; const AGENT_EXPORT_NAME_KEY: &str = "export_name"; @@ -4950,6 +4955,7 @@ mod tests { FluxonFsRuntimeAccessModel { users: vec![FluxonFsRuntimeAccessUser { username: "alice".to_string(), + can_manage_users: false, rpc_token_secret_sha256_hex: hex::encode(sha2::Sha256::digest(b"pw")), }], scope_access: vec![FluxonFsScopeAccess { @@ -4965,6 +4971,7 @@ mod tests { FluxonFsRuntimeAccessModel { users: vec![FluxonFsRuntimeAccessUser { username: "alice".to_string(), + can_manage_users: false, rpc_token_secret_sha256_hex: hex::encode(sha2::Sha256::digest(b"pw")), }], scope_access: 
vec![FluxonFsScopeAccess { diff --git a/fluxon_rs/fluxon_fs/src/agent_service/transfer_agent.rs b/fluxon_rs/fluxon_fs/src/agent_service/transfer_agent.rs index f35d8e8..1738ade 100644 --- a/fluxon_rs/fluxon_fs/src/agent_service/transfer_agent.rs +++ b/fluxon_rs/fluxon_fs/src/agent_service/transfer_agent.rs @@ -6024,45 +6024,13 @@ mod tests { use std::os::unix::fs::PermissionsExt; use fluxon_fs_core::config::{ - FluxonFsExport, FluxonFsExportRoutingMode, FluxonFsExportRpcPaths, FluxonFsGlobalConfig, - FluxonFsTransferDispositionWire, + FluxonFsExport, FluxonFsGlobalConfig, FluxonFsTransferDispositionWire, + agent_registry_export_for_name_and_root_v1, }; use tempfile::TempDir; fn test_export(root_dir_abs: &str) -> FluxonFsExport { - FluxonFsExport { - remote_root_dir_abs: root_dir_abs.to_string(), - routing_mode: FluxonFsExportRoutingMode::AgentRegistry, - nodes: Vec::new(), - cache_kv_key_prefix: "/test/cache/".to_string(), - cache_bytes_field_key: "bytes".to_string(), - cache_max_bytes: 1, - rpc_paths: FluxonFsExportRpcPaths { - stat: "/stat".to_string(), - lstat: "/lstat".to_string(), - list_dir: "/list_dir".to_string(), - readlink: "/readlink".to_string(), - setxattr: "/setxattr".to_string(), - getxattr: "/getxattr".to_string(), - listxattr: "/listxattr".to_string(), - removexattr: "/removexattr".to_string(), - read_chunk: "/read_chunk".to_string(), - write_chunk: "/write_chunk".to_string(), - truncate: "/truncate".to_string(), - mkdir: "/mkdir".to_string(), - mkfifo: "/mkfifo".to_string(), - mknod: "/mknod".to_string(), - rmdir: "/rmdir".to_string(), - unlink: "/unlink".to_string(), - link: "/link".to_string(), - symlink: "/symlink".to_string(), - rename: "/rename".to_string(), - chmod: "/chmod".to_string(), - chown: "/chown".to_string(), - lchown: "/lchown".to_string(), - utime: "/utime".to_string(), - }, - } + agent_registry_export_for_name_and_root_v1("src", root_dir_abs) } fn test_exports_handle(root_dir_abs: &str) -> AgentExportsHandle { @@ -6071,6 +6039,7 
@@ mod tests { AgentExportsHandle::new_from_static_cfg( &FluxonFsGlobalConfig { stale_window_ms: 1, + write_session_target_inflight_bytes: 64 * 1024 * 1024, rules: Vec::new(), exports, }, diff --git a/fluxon_rs/fluxon_fs/src/remote_disk_cache.rs b/fluxon_rs/fluxon_fs/src/remote_disk_cache.rs index 736592a..e45f15e 100644 --- a/fluxon_rs/fluxon_fs/src/remote_disk_cache.rs +++ b/fluxon_rs/fluxon_fs/src/remote_disk_cache.rs @@ -389,16 +389,14 @@ pub fn disk_cache_max_bytes_from_env() -> u64 { .unwrap_or(REMOTE_DISK_CACHE_MAX_BYTES_DEFAULT) } -pub fn resolve_disk_cache_root(shared_file_path: &Path, instance_key: &str) -> PathBuf { +pub fn resolve_disk_cache_root(cache_root_base: &Path, instance_key: &str) -> PathBuf { if let Some(raw) = env::var_os(REMOTE_DISK_CACHE_ROOT_ENV) { let trimmed = raw.to_string_lossy().trim().to_string(); if !trimmed.is_empty() { return PathBuf::from(trimmed); } } - shared_file_path - .join(REMOTE_DISK_CACHE_DIRNAME) - .join(safe_cache_component(instance_key)) + cache_root_base.join(safe_cache_component(instance_key)) } fn write_meta(path: &Path, meta: &RemoteDiskCacheIndexMeta) -> io::Result<()> { diff --git a/fluxon_rs/fluxon_kv/examples/kvcli_example_client_config1.yaml b/fluxon_rs/fluxon_kv/examples/kvcli_example_client_config1.yaml index 7781f8b..8119a14 100644 --- a/fluxon_rs/fluxon_kv/examples/kvcli_example_client_config1.yaml +++ b/fluxon_rs/fluxon_kv/examples/kvcli_example_client_config1.yaml @@ -9,7 +9,7 @@ fluxonkv_spec: etcd_addresses: - "127.0.0.1:2379" cluster_name: "cluster_001" - shared_memory_path: "/tmp/kvcache_shared_memory/client_node_001" + share_mem_path: "/tmp/kvcache_shared_memory/client_node_001" # Optional: Additional configurations can be added here # custom_field: "value" diff --git a/fluxon_rs/fluxon_kv/examples/kvcli_example_client_config2.yaml b/fluxon_rs/fluxon_kv/examples/kvcli_example_client_config2.yaml index e2e9464..9d72fe7 100644 --- a/fluxon_rs/fluxon_kv/examples/kvcli_example_client_config2.yaml +++ 
b/fluxon_rs/fluxon_kv/examples/kvcli_example_client_config2.yaml @@ -9,7 +9,7 @@ fluxonkv_spec: etcd_addresses: - "127.0.0.1:2379" cluster_name: "cluster_001" - shared_memory_path: "/tmp/kvcache_shared_memory/client_node_002" + share_mem_path: "/tmp/kvcache_shared_memory/client_node_002" # Optional: Additional configurations can be added here # custom_field: "value" diff --git a/fluxon_rs/fluxon_kv/framework_init_steps.yaml b/fluxon_rs/fluxon_kv/framework_init_steps.yaml index 95e18b5..923ae30 100644 --- a/fluxon_rs/fluxon_kv/framework_init_steps.yaml +++ b/fluxon_rs/fluxon_kv/framework_init_steps.yaml @@ -24,24 +24,24 @@ resources: publish_tags: [master, owner, external] published_by: ClusterManager.step.1.init2 doc: | - - ClusterManager: member watch 已建立(具备持续观测能力) - - 提供: ClusterManager.listen() 可持续消费 ClusterEvent + - ClusterManager: member watch is established and continuous observation is available + - Provides: `ClusterManager.listen()` can continuously consume `ClusterEvent` - id: owner_shared_mem_bundle_ready tags: [external] publish_tags: [] published_by: ExternalClientApi.step.1.prepare doc: | - - external: 等 shared.json+mmap.file -> mmap -> 等 owner member 可观测 - - owner: shared.json 由 owner 的 segment registration 流程写入(不在 init DAG 内显式建模) + - external: wait until owner `shared.json` is readable and passes metadata validation, then wait for `mmap.file` -> mmap -> owner member observability + - owner: `shared.json` is written by the owner's segment-registration flow and is not modeled explicitly inside the init DAG - id: prom_remote_write_wait_ready tags: [owner, external] publish_tags: [master] published_by: MetricReporter.step.1.prepare doc: | - - master: 发布 prom remote_write urls(通过 cluster member state) - - owner/external: 等待直到可观测到 master prom remote_write urls(best-effort) + - master: publish Prometheus remote_write URLs through cluster member state + - owner/external: wait until the master's remote_write URLs become observable, best-effort # Init-step 
DAG source of truth. # @@ -73,9 +73,9 @@ steps: mode: Blocking deps: [] doc: | - - 构造: ClusterManager(无PostView) - - 写 local_state: 保存连接参数/分配本地缓存;若提供 local_ipc_root,则在此处规范化后预填到 self member metadata(不启动任务) - - 依赖: 无 + - Construct `ClusterManager` with no PostView + - Write local state: persist connection args and allocate local caches; if `local_ipc_root` is provided, normalize it here and prefill it into self-member metadata without starting tasks + - Depends on: none exec: kind: Construct call: "ClusterManager::construct" @@ -87,9 +87,9 @@ steps: mode: Blocking deps: [] doc: | - - 构造: P2pModule(无PostView) - - 写 local_state: 分配 P2P 状态/缓存(不启动 transport) - - 依赖: 无 + - Construct `P2pModule` with no PostView + - Write local state: allocate P2P state and caches without starting transport + - Depends on: none exec: kind: Construct call: "P2pModule::construct" @@ -101,9 +101,9 @@ steps: mode: Blocking deps: [] doc: | - - 构造: MasterSegManager(无PostView) - - 写 local_state: 分配段管理器状态(不注册RPC) - - 依赖: 无 + - Construct `MasterSegManager` with no PostView + - Write local state: allocate segment-manager state without registering RPCs + - Depends on: none exec: kind: Construct call: "MasterSegManager::construct" @@ -115,9 +115,9 @@ steps: mode: Blocking deps: [] doc: | - - 构造: MasterKvRouter(无PostView) - - 写 local_state: 分配策略/缓存/通道(不注册RPC/不启动任务) - - 依赖: 无 + - Construct `MasterKvRouter` with no PostView + - Write local state: allocate policy state, caches, and channels without registering RPCs or starting tasks + - Depends on: none exec: kind: Construct call: "MasterKvRouter::construct" @@ -129,9 +129,9 @@ steps: mode: Blocking deps: [] doc: | - - 构造: MetricReporter(无PostView) - - 写 local_state: 准备 registry/缓冲区(不启动 loop) - - 依赖: 无 + - Construct `MetricReporter` with no PostView + - Write local state: prepare registries and buffers without starting loops + - Depends on: none exec: kind: Construct call: "MetricReporter::construct" @@ -143,9 +143,9 @@ steps: mode: Blocking deps: [] doc: 
| - - 构造: ClientKvApi(无PostView) - - 写 local_state: 分配 API 状态(不注册RPC) - - 依赖: 无 + - Construct `ClientKvApi` with no PostView + - Write local state: allocate API state without registering RPCs + - Depends on: none exec: kind: Construct call: "ClientKvApi::construct" @@ -157,9 +157,9 @@ steps: mode: Blocking deps: [] doc: | - - 构造: ClientSegPool(无PostView) - - 写 local_state: 准备共享内存相关状态(不写 cluster) - - 依赖: 无 + - Construct `ClientSegPool` with no PostView + - Write local state: prepare shared-memory-related state without publishing cluster state + - Depends on: none exec: kind: Construct call: "ClientSegPool::construct" @@ -171,9 +171,9 @@ steps: mode: Blocking deps: [] doc: | - - 构造: ClientTransferEngine(无PostView) - - 写 local_state: 准备 engine 状态(不启动后端) - - 依赖: 无 + - Construct `ClientTransferEngine` with no PostView + - Write local state: prepare engine state without starting the backend + - Depends on: none exec: kind: Construct call: "ClientTransferEngine::construct" @@ -185,9 +185,9 @@ steps: mode: Blocking deps: [] doc: | - - 构造: ExternalClientApi(无PostView) - - 写 local_state: 准备 external 状态(不做 owner discovery) - - 依赖: 无 + - Construct `ExternalClientApi` with no PostView + - Write local state: prepare external-client state without doing owner discovery + - Depends on: none exec: kind: Construct call: "ExternalClientApi::construct" @@ -199,9 +199,9 @@ steps: mode: Blocking deps: [] doc: | - - 构造: MasterLeaseManager(无PostView) - - 写 local_state: 分配 lease 表/cleanup 状态(不注册RPC) - - 依赖: 无 + - Construct `MasterLeaseManager` with no PostView + - Write local state: allocate lease-table and cleanup state without registering RPCs + - Depends on: none exec: kind: Construct call: "MasterLeaseManager::construct" @@ -213,9 +213,9 @@ steps: mode: Blocking deps: [] doc: | - - 加入: etcd cluster(lease + self member) - - 启动: watch members/metadata + event broadcast - - 产出: cluster_state 可读写 + cluster_event + - Join the etcd cluster with lease and self member registration + - Start 
member and metadata watches plus event broadcast + - Produces readable and writable `cluster_state` plus `cluster_event` exec: kind: Call call: "ClusterManager::init2_for_init_dag" @@ -227,9 +227,9 @@ steps: deps: - res:cluster_member_watch_ready doc: | - - 启动: P2P transport + rpc core - - 启动: cluster_event listener - - 依赖: res:cluster_member_watch_ready(读成员变化) + - Start P2P transport and the RPC core + - Start the `cluster_event` listener + - Depends on: `res:cluster_member_watch_ready` to observe membership changes exec: kind: Call call: "P2pModule::init2_for_init_dag" @@ -241,8 +241,8 @@ steps: deps: - P2pModule.step.1.init2 doc: | - - 注册: seg RPC handler/caller - - 依赖: P2pModule.step.1.init2(rpc transport) + - Register segment RPC handlers and callers + - Depends on: `P2pModule.step.1.init2` for RPC transport exec: kind: Call call: "MasterSegManager::init2_for_init_dag" @@ -255,9 +255,9 @@ steps: - ClusterManager.step.1.init2 - P2pModule.step.1.init2 doc: | - - 注册: master KV RPC handler/caller - - 启动: router tasks(delete broadcast + cluster listener) - - 依赖: ClusterManager.step.1.init2(监听成员) + P2pModule.step.1.init2(rpc) + - Register master KV RPC handlers and callers + - Start router tasks such as delete broadcast and cluster listener + - Depends on: `ClusterManager.step.1.init2` for membership observation and `P2pModule.step.1.init2` for RPC exec: kind: Call call: "MasterKvRouter::init2_for_init_dag" @@ -269,9 +269,9 @@ steps: deps: - ClusterManager.step.1.init2 doc: | - - 启动: monitoring config watcher(异步更新 monitoring_config) - - 注册: prom remote_write proxy RPC caller/handler - - 依赖: ClusterManager.step.1.init2(读广播) + - Start the monitoring-config watcher that updates `monitoring_config` asynchronously + - Register Prometheus remote_write proxy RPC handlers and callers + - Depends on: `ClusterManager.step.1.init2` to read broadcasts exec: kind: Call call: "MetricReporter::init2_prepare" @@ -283,8 +283,8 @@ steps: deps: - res:prom_remote_write_wait_ready doc: 
| - - 启动: metric_reporter_loop - - 依赖: res:prom_remote_write_wait_ready(内部包含 publish/wait 逻辑) + 本模块 step.1(隐式顺序) + - Start `metric_reporter_loop` + - Depends on: `res:prom_remote_write_wait_ready`, which includes the internal publish-and-wait logic, plus the implicit ordering after this module's step 1 exec: kind: Call call: "MetricReporter::init2_after_prom_remote_write_wait" @@ -297,9 +297,9 @@ steps: - ClusterManager.step.1.init2 - P2pModule.step.1.init2 doc: | - - 注册: KV API RPC handler/caller - - 启动: client_cluster_listener - - 依赖: ClusterManager.step.1.init2(读cluster) + P2pModule.step.1.init2(rpc) + - Register KV API RPC handlers and callers + - Start `client_cluster_listener` + - Depends on: `ClusterManager.step.1.init2` to read cluster state and `P2pModule.step.1.init2` for RPC exec: kind: Call call: "ClientKvApi::init2_for_init_dag" @@ -312,9 +312,9 @@ steps: - ClusterManager.step.1.init2 - P2pModule.step.1.init2 doc: | - - 注册: RequestSegmentRegistration RPC handler - - 写 cluster_state: share_group_binding(owner_id + owner_start_time) - - 依赖: ClusterManager.step.1.init2(写cluster) + P2pModule.step.1.init2(rpc) + - Register the `RequestSegmentRegistration` RPC handler + - Write cluster state: publish `share_group_binding(owner_id + owner_start_time)` + - Depends on: `ClusterManager.step.1.init2` to write cluster state and `P2pModule.step.1.init2` for RPC exec: kind: Call call: "ClientSegPool::init2_for_init_dag" @@ -327,9 +327,9 @@ steps: - ClusterManager.step.1.init2 - P2pModule.step.1.init2 doc: | - - 等待: accessible_ip_ready(node_start_time) - - 构建: transfer engine backend + p2p transfer rpc - - 依赖: ClusterManager.step.1.init2(等ip/监听成员) + P2pModule.step.1.init2(rpc) + - Wait for `accessible_ip_ready(node_start_time)` + - Build the transfer-engine backend plus P2P transfer RPC + - Depends on: `ClusterManager.step.1.init2` to wait for IP visibility and observe members, and `P2pModule.step.1.init2` for RPC exec: kind: Call call: 
"ClientTransferEngine::init2_for_init_dag" @@ -342,8 +342,8 @@ steps: - ClusterManager.step.1.init2 - P2pModule.step.1.init2 doc: | - - 准备: ExternalClientApi init(不等待 owner) - - 依赖: ClusterManager.step.1.init2(成员/广播) + P2pModule.step.1.init2(rpc) + - Prepare `ExternalClientApi` initialization without waiting for the owner + - Depends on: `ClusterManager.step.1.init2` for members and broadcasts, and `P2pModule.step.1.init2` for RPC exec: kind: Call call: "ExternalClientApi::init2_prepare" @@ -355,8 +355,8 @@ steps: deps: - res:owner_shared_mem_bundle_ready doc: | - - external: 完成 owner generation / sub_cluster 绑定 + 注册 RPC - - 依赖: res:owner_shared_mem_bundle_ready + 本模块 step.1(隐式顺序) + - external: finish owner-generation and sub-cluster binding, then register RPCs + - Depends on: `res:owner_shared_mem_bundle_ready` plus the implicit ordering after this module's step 1 exec: kind: Call call: "ExternalClientApi::init2_after_owner_shared_mem_bundle_ready" @@ -368,9 +368,9 @@ steps: deps: - P2pModule.step.1.init2 doc: | - - 注册: lease RPC - - 启动: cleanup loop - - 依赖: P2pModule.step.1.init2(rpc) + - Register lease RPCs + - Start the cleanup loop + - Depends on: `P2pModule.step.1.init2` for RPC exec: kind: Call call: "MasterLeaseManager::init2_for_init_dag" @@ -381,8 +381,8 @@ steps: mode: Blocking deps: [] doc: | - - 注册: 额外 RPC(route-probe/user-rpc) - - 依赖: 本模块 step.1 已完成(隐式顺序) + - Register extra RPCs such as route-probe and user-rpc + - Depends on: the implicit ordering after this module's step 1 exec: kind: Call call: "P2pModule::init3_for_init_dag" @@ -394,9 +394,9 @@ steps: deps: - ClientTransferEngine.step.1.init2 doc: | - - 调用 transfer_engine: register segments + open_self_segment - - 写 cluster_state: publish transfer_ready - - 依赖: ClientTransferEngine.step.1.init2(engine ready) + - Call the transfer engine to register segments and open the self segment + - Write cluster state: publish `transfer_ready` + - Depends on: `ClientTransferEngine.step.1.init2` with the 
engine ready exec: kind: Call call: "ClientSegPool::init3_for_init_dag" diff --git a/fluxon_rs/fluxon_kv/src/client_seg_pool/mod.rs b/fluxon_rs/fluxon_kv/src/client_seg_pool/mod.rs index 7902beb..1aa6954 100644 --- a/fluxon_rs/fluxon_kv/src/client_seg_pool/mod.rs +++ b/fluxon_rs/fluxon_kv/src/client_seg_pool/mod.rs @@ -44,8 +44,8 @@ define_module!( #[derive(Clone, Debug)] pub struct ClientSegPoolNewArg { pub contribute_size: ContributeToClusterPoolSize, - pub shared_memory_path: String, - pub shared_file_path: String, + pub share_mem_path: String, + pub large_file_paths: crate::config::LargeFilePaths, pub cluster_name: String, pub etcd_addresses: Vec, pub attach_existing_meta: Option, @@ -62,8 +62,8 @@ pub struct SharedJsonMeta { pub sub_cluster: Option, pub cluster_name: String, pub etcd_addresses: Vec, - pub shared_memory_path: String, - pub shared_file_path: String, + pub share_mem_path: String, + pub large_file_paths: crate::config::LargeFilePaths, pub protocol_version: String, pub write_ts: Option, } @@ -199,10 +199,10 @@ impl Deref for ClientCpuMemReadGuard { pub struct ClientSegPoolInner { cpu_allocated_mem: std::sync::Arc>>, view: std::sync::OnceLock, - /// Directory path for shared-memory backed files (mmap.file). - shared_memory_path: String, - /// Directory path for regular files (shared.json, side-transfer metadata). - shared_file_path: String, + /// Directory path for the local shared bundle (mmap.file, shared.json, peer metadata). + share_mem_path: String, + /// Ordered large-file roots; concrete subdirectories are derived by fixed relative layout. 
+ large_file_paths: crate::config::LargeFilePaths, side_transfer_worker: bool, attach_owner_ref: Option, @@ -233,15 +233,15 @@ impl ClientSegPoolInner { } impl ClientSegPool { - pub fn side_transfer_peers_dir(shared_file_path: &str) -> std::path::PathBuf { - std::path::Path::new(shared_file_path).join(SIDE_TRANSFER_PEERS_DIRNAME) + pub fn side_transfer_peers_dir(share_mem_path: &str) -> std::path::PathBuf { + std::path::Path::new(share_mem_path).join(SIDE_TRANSFER_PEERS_DIRNAME) } pub fn side_transfer_peer_file_path( - shared_file_path: &str, + share_mem_path: &str, side_id: &str, ) -> std::path::PathBuf { - Self::side_transfer_peers_dir(shared_file_path).join(format!("{side_id}.json")) + Self::side_transfer_peers_dir(share_mem_path).join(format!("{side_id}.json")) } pub fn attach_view(&self, view: ClientSegPoolView) { @@ -255,13 +255,13 @@ impl ClientSegPool { pub async fn construct(arg: ClientSegPoolNewArg) -> Result { tracing::info!( - "Constructing ClientSegPool in Client mode with shared_memory_path: {}", - arg.shared_memory_path + "Constructing ClientSegPool in Client mode with share_mem_path: {}", + arg.share_mem_path ); let contribute_size = arg.contribute_size; - let shared_memory_path = arg.shared_memory_path; - let shared_file_path = arg.shared_file_path; + let share_mem_path = arg.share_mem_path; + let large_file_paths = arg.large_file_paths; let cluster_name = arg.cluster_name; let etcd_addresses = arg.etcd_addresses; let attach_existing_meta = arg.attach_existing_meta; @@ -278,7 +278,7 @@ impl ClientSegPool { if let Some(existing_meta) = attach_existing_meta { tracing::info!( "Attaching existing shared memory for side-transfer worker: path={}, len={}", - shared_memory_path, + share_mem_path, existing_meta.segment_len ); @@ -288,7 +288,7 @@ impl ClientSegPool { use std::ptr; let map_len = existing_meta.segment_len as usize; - let mmap_file_path = Path::new(&shared_memory_path).join("mmap.file"); + let mmap_file_path = 
Path::new(&share_mem_path).join("mmap.file"); let file = OpenOptions::new() .read(true) .write(true) @@ -354,8 +354,8 @@ impl ClientSegPool { layout_validated: AtomicBool::new(false), }))), view: std::sync::OnceLock::new(), - shared_memory_path: shared_memory_path.clone(), - shared_file_path: shared_file_path.clone(), + share_mem_path: share_mem_path.clone(), + large_file_paths: large_file_paths.clone(), side_transfer_worker, attach_owner_ref, cluster_name: cluster_name.clone(), @@ -370,8 +370,8 @@ impl ClientSegPool { let inner = ClientSegPoolInner { cpu_allocated_mem: std::sync::Arc::new(ARwLock::new(None)), view: std::sync::OnceLock::new(), - shared_memory_path: shared_memory_path.clone(), - shared_file_path: shared_file_path.clone(), + share_mem_path: share_mem_path.clone(), + large_file_paths: large_file_paths.clone(), side_transfer_worker, attach_owner_ref, cluster_name: cluster_name.clone(), @@ -394,29 +394,20 @@ impl ClientSegPool { let map_len = contribute_size.dram as usize; - if shared_memory_path.is_empty() { + if share_mem_path.is_empty() { return Err(KvError::SharedMem( crate::rpcresp_kvresult_convert::msg_and_error::SharedMemError::MappingFailed { path: String::new(), len: map_len as u64, - detail: "shared_memory_path is empty; explicit configuration required" - .to_string(), - }, - )); - } - if shared_file_path.is_empty() { - return Err(KvError::SharedMem( - crate::rpcresp_kvresult_convert::msg_and_error::SharedMemError::MetaDataLoadError { - path: String::new(), - detail: "shared_file_path is empty; explicit configuration required" + detail: "share_mem_path is empty; explicit configuration required" .to_string(), }, )); } - let base_path = &shared_memory_path; + let base_path = &share_mem_path; tracing::info!( - "Using shared_memory_path: {} for memory-mapped file", + "Using share_mem_path: {} for memory-mapped file", base_path ); std::fs::create_dir_all(base_path).map_err(|e| { @@ -533,8 +524,8 @@ impl ClientSegPool { layout_validated: 
AtomicBool::new(false), }))), view: std::sync::OnceLock::new(), - shared_memory_path: base_path.to_string(), - shared_file_path: shared_file_path.clone(), + share_mem_path: base_path.to_string(), + large_file_paths, side_transfer_worker, attach_owner_ref, cluster_name, @@ -549,8 +540,12 @@ impl ClientSegPool { &self.0 } - pub fn shared_file_path(&self) -> &str { - &self.inner().shared_file_path + pub fn share_mem_path(&self) -> &str { + &self.inner().share_mem_path + } + + pub fn large_file_paths(&self) -> &crate::config::LargeFilePaths { + &self.inner().large_file_paths } fn transfer_rpc_fast_path_eligible_members(&self) -> Vec { @@ -897,7 +892,7 @@ impl ClientSegPool { }, ) })?; - let peers_dir = Self::side_transfer_peers_dir(&inner.shared_file_path); + let peers_dir = Self::side_transfer_peers_dir(&inner.share_mem_path); std::fs::create_dir_all(&peers_dir).map_err(|e| { KvError::SharedMem( crate::rpcresp_kvresult_convert::msg_and_error::SharedMemError::MetaDataLoadError { @@ -907,7 +902,7 @@ impl ClientSegPool { ) })?; - let peer_path = Self::side_transfer_peer_file_path(&inner.shared_file_path, &self_info.id); + let peer_path = Self::side_transfer_peer_file_path(&inner.share_mem_path, &self_info.id); let tmp_path = peer_path.with_file_name(format!( "{}.tmp.{}.{}", self_info.id, @@ -950,7 +945,7 @@ impl ClientSegPool { return Ok(()); } let self_id = inner.view().cluster_manager().get_self_info().id; - let peer_path = Self::side_transfer_peer_file_path(&inner.shared_file_path, &self_id); + let peer_path = Self::side_transfer_peer_file_path(&inner.share_mem_path, &self_id); match std::fs::remove_file(&peer_path) { Ok(()) => Ok(()), Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(()), @@ -1111,35 +1106,24 @@ impl ClientSegPool { self.wait_required_transfer_rpc_fast_path_ready().await?; use std::path::Path; - let shared_json_path = Path::new(&inner.shared_file_path).join("shared.json"); + let shared_json_path = 
Path::new(&inner.share_mem_path).join("shared.json"); if let Some(parent) = shared_json_path.parent() { std::fs::create_dir_all(parent).map_err(|e| { KvError::SharedMem( crate::rpcresp_kvresult_convert::msg_and_error::SharedMemError::MetaDataLoadError { path: parent.to_string_lossy().to_string(), - detail: format!("Failed to create shared_file_path: {}", e), + detail: format!("Failed to create share_mem_path: {}", e), }, ) })?; } - let shared_memory_canonical = std::fs::canonicalize(&inner.shared_memory_path) - .map_err(|e| { - KvError::SharedMem( - crate::rpcresp_kvresult_convert::msg_and_error::SharedMemError::MetaDataLoadError { - path: inner.shared_memory_path.clone(), - detail: format!("Failed to canonicalize shared_memory_path: {}", e), - }, - ) - })? - .to_string_lossy() - .into_owned(); - let shared_file_canonical = std::fs::canonicalize(&inner.shared_file_path) + let share_mem_canonical = std::fs::canonicalize(&inner.share_mem_path) .map_err(|e| { KvError::SharedMem( crate::rpcresp_kvresult_convert::msg_and_error::SharedMemError::MetaDataLoadError { - path: inner.shared_file_path.clone(), - detail: format!("Failed to canonicalize shared_file_path: {}", e), + path: inner.share_mem_path.clone(), + detail: format!("Failed to canonicalize share_mem_path: {}", e), }, ) })? 
@@ -1159,8 +1143,8 @@ impl ClientSegPool { cluster_name: inner.cluster_name.clone(), etcd_addresses: inner.etcd_addresses.clone(), - shared_memory_path: shared_memory_canonical, - shared_file_path: shared_file_canonical, + share_mem_path: share_mem_canonical, + large_file_paths: inner.large_file_paths.clone(), protocol_version, @@ -1226,8 +1210,8 @@ impl ClientSegPool { )); } - let shared_json_path = std::path::Path::new(&inner.shared_file_path).join("shared.json"); - let mmap_file_path = std::path::Path::new(&inner.shared_memory_path).join("mmap.file"); + let shared_json_path = std::path::Path::new(&inner.share_mem_path).join("shared.json"); + let mmap_file_path = std::path::Path::new(&inner.share_mem_path).join("mmap.file"); if !mmap_file_path.exists() { return Err(KvError::SharedMem( @@ -1343,7 +1327,7 @@ async fn handle_resolve_side_transfer_lane_request( ) -> MsgPack { let self_info = view.cluster_manager().get_self_info(); let peers_dir = - ClientSegPool::side_transfer_peers_dir(&view.client_seg_pool().inner().shared_file_path); + ClientSegPool::side_transfer_peers_dir(&view.client_seg_pool().inner().share_mem_path); tracing::info!( "handle_resolve_side_transfer_lane_request: owner={} lane_idx={} peers_dir={}", self_info.id, diff --git a/fluxon_rs/fluxon_kv/src/config.rs b/fluxon_rs/fluxon_kv/src/config.rs index 218ef69..f9c7691 100644 --- a/fluxon_rs/fluxon_kv/src/config.rs +++ b/fluxon_rs/fluxon_kv/src/config.rs @@ -379,6 +379,34 @@ fn cluster_scoped_shared_path(root: &str, cluster_name: &str) -> KvResult KvResult { + let trimmed = root.trim(); + if trimmed.is_empty() { + return Err(ConfigError::InvalidClientConfig { + detail: format!("{field_name} cannot be empty"), + } + .into_kverror()); + } + Ok(trimmed.to_string()) +} + +fn verify_non_empty_root_path_list(paths: &[String], field_name: &str) -> KvResult> { + if paths.is_empty() { + return Err(ConfigError::InvalidClientConfig { + detail: format!("{field_name} must contain at least one path"), + } + 
.into_kverror()); + } + let mut out = Vec::with_capacity(paths.len()); + for (idx, root) in paths.iter().enumerate() { + out.push(verify_non_empty_root_path( + root, + &format!("{field_name}[{idx}]"), + )?); + } + Ok(out) +} + fn resolve_compiled_rdma_transfer_engine() -> KvResult { Ok(TransferEngineType::Closed) } @@ -549,8 +577,9 @@ pub struct FluxonKvSpecYaml { #[serde(skip_serializing_if = "Option::is_none")] pub etcd_addresses: Option>>, pub cluster_name: String, - pub shared_memory_path: String, - pub shared_file_path: String, + pub share_mem_path: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub large_file_paths: Option, #[serde(skip_serializing_if = "Option::is_none")] pub p2p_listen_port: Option, #[serde(skip_serializing_if = "Option::is_none")] @@ -559,6 +588,10 @@ pub struct FluxonKvSpecYaml { pub sub_cluster: Option>, } +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(transparent)] +pub struct LargeFilePathsYaml(pub Vec); + #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct RedisCompatConfigYaml { @@ -608,6 +641,81 @@ pub struct FluxonKvSpec { pub sub_cluster: Option, } +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(transparent)] +pub struct LargeFilePaths { + pub paths: Vec, +} + +impl LargeFilePaths { + fn require_configured_paths(&self) -> KvResult<()> { + if self.paths.is_empty() { + return Err(ConfigError::InvalidClientConfig { + detail: "large_file_paths must contain at least one path".to_string(), + } + .into_kverror()); + } + Ok(()) + } + + fn resolve_preferred_root_subdir( + &self, + relative_dir: &Path, + target_name: &str, + ) -> KvResult { + self.require_configured_paths()?; + let mut errors = Vec::new(); + for root in &self.paths { + let candidate = Path::new(root).join(relative_dir); + match fs::create_dir_all(&candidate) { + Ok(()) => return Ok(candidate), + Err(err) => errors.push(format!("{} ({})", candidate.display(), err)), + } + } 
+ Err(ConfigError::InvalidClientConfig { + detail: format!( + "large_file_paths contains no usable root for {}; tried: {}", + target_name, + errors.join(", ") + ), + } + .into_kverror()) + } + + pub fn kv_logs_dir(&self, cluster_name: &str) -> KvResult { + let relative_dir = PathBuf::from(format!("{cluster_name}_cluster_kv_logs")); + self.resolve_preferred_root_subdir(&relative_dir, "kv logs") + } + + pub fn third_party_logs_dir(&self, cluster_name: &str) -> KvResult { + let relative_dir = PathBuf::from(format!("{cluster_name}_cluster_third_party_logs")); + self.resolve_preferred_root_subdir(&relative_dir, "third-party logs") + } + + pub fn kv_profiles_dir(&self, cluster_name: &str) -> KvResult { + let relative_dir = PathBuf::from(format!("{cluster_name}_cluster_kv_profiles")); + self.resolve_preferred_root_subdir(&relative_dir, "kv profiles") + } + + pub fn side_transfer_runtime_dir( + &self, + cluster_name: &str, + instance_key: &str, + ) -> KvResult { + let relative_dir = PathBuf::from(format!( + "{cluster_name}_cluster_kv_logs/side_transfer_runtime/{instance_key}" + )); + self.resolve_preferred_root_subdir(&relative_dir, "side-transfer runtime") + } + + pub fn fs_disk_cache_base_dir(&self) -> KvResult { + self.resolve_preferred_root_subdir( + Path::new("fluxon_fs_disk_cache"), + "fluxon fs disk cache", + ) + } +} + /// KV client backend types supported by the system #[derive(Debug, Clone, PartialEq)] pub enum KvClientType { @@ -625,8 +733,8 @@ pub struct ClientConfig { pub pprof_duration_seconds: Option, pub redis_compat_listen_addr: Option, pub fluxonkv_spec: FluxonKvSpec, - pub shared_memory_path: String, // Mandatory shared memory path - pub shared_file_path: String, // Mandatory shared file path + pub share_mem_path: String, // Mandatory shared bundle path + pub large_file_paths: LargeFilePaths, // Mandatory large-file roots for logs and caches pub test_spec_config: TestSpecConfig, } @@ -893,7 +1001,7 @@ impl ClientConfigYaml { .into_kverror()); } - // 
External (zero-contribution) mode forbids additional knobs to keep the schema minimal. + // External (zero-contribution) mode forbids additional owner-derived knobs to keep the schema minimal. if is_external { if self.fluxonkv_spec.redis_compat.is_some() { return Err(ConfigError::InvalidClientConfig { @@ -914,6 +1022,12 @@ impl ClientConfigYaml { } .into_kverror()); } + if self.fluxonkv_spec.large_file_paths.is_some() { + return Err(ConfigError::InvalidClientConfig { + detail: "fluxonkv_spec.large_file_paths is forbidden in zero-contribution mode (it is inherited from owner shared.json)".to_string(), + } + .into_kverror()); + } } // Preserve historical behavior for configs that omit `protocol`, but allow @@ -1040,29 +1154,36 @@ impl ClientConfigYaml { } } - // Validate shared_memory_path (mandatory and non-empty) - if self.fluxonkv_spec.shared_memory_path.trim().is_empty() { + // Validate share_mem_path (mandatory and non-empty) + if self.fluxonkv_spec.share_mem_path.trim().is_empty() { return Err(ConfigError::InvalidInstanceKey { - key: "shared_memory_path cannot be empty".to_string(), + key: "share_mem_path cannot be empty".to_string(), } .into_kverror()); } - if self.fluxonkv_spec.shared_file_path.trim().is_empty() { - return Err(ConfigError::InvalidInstanceKey { - key: "shared_file_path cannot be empty".to_string(), + // Owner mode always needs explicit ordered large-file roots. + // The listen port stays optional at this contract layer: deterministic + // callers may pin it, while shared testbed owners can leave it unset + // and let the runtime bind a free port. 
+ let large_file_paths = if is_external { + LargeFilePaths { paths: Vec::new() } + } else { + let Some(large_file_paths_yaml) = self.fluxonkv_spec.large_file_paths.as_ref() else { + return Err(ConfigError::InvalidClientConfig { + detail: "fluxonkv_spec.large_file_paths is required for owner mode" + .to_string(), + } + .into_kverror()); + }; + LargeFilePaths { + paths: verify_non_empty_root_path_list(&large_file_paths_yaml.0, "large_file_paths")?, } - .into_kverror()); - } + }; - let shared_memory_path = cluster_scoped_shared_path( - &self.fluxonkv_spec.shared_memory_path, - &fluxonkv_spec.cluster_name, - )?; - let shared_file_path = cluster_scoped_shared_path( - &self.fluxonkv_spec.shared_file_path, + let share_mem_path = cluster_scoped_shared_path( + &self.fluxonkv_spec.share_mem_path, &fluxonkv_spec.cluster_name, )?; - let redis_compat_listen_addr = match self.fluxonkv_spec.redis_compat.as_ref() { None | Some(YamlNullable::Null) => None, Some(YamlNullable::Value(rc)) => { @@ -1092,8 +1213,8 @@ impl ClientConfigYaml { pprof_duration_seconds, redis_compat_listen_addr, fluxonkv_spec, - shared_memory_path, - shared_file_path, + share_mem_path, + large_file_paths, test_spec_config, }) } @@ -1414,6 +1535,13 @@ impl MasterConfigYaml { #[cfg(test)] mod tests { use super::*; + use uuid::Uuid; + + fn new_test_dir(prefix: &str) -> PathBuf { + let path = std::env::temp_dir().join(format!("{}_{}", prefix, Uuid::new_v4())); + std::fs::create_dir_all(&path).unwrap(); + path + } #[test] fn cluster_scoped_shared_path_appends_cluster_name() { @@ -1432,8 +1560,8 @@ contribute_to_cluster_pool_size: fluxonkv_spec: etcd_addresses: ["127.0.0.1:2379"] cluster_name: test_cluster - shared_memory_path: /tmp/test_owner - shared_file_path: /tmp/test_owner_files + share_mem_path: /tmp/test_owner + large_file_paths: [/tmp/test_owner_large] sub_cluster: rack-a test_spec_config: disable_observability: true @@ -1451,11 +1579,7 @@ test_spec_config: 
assert!(verified.test_spec_config.enable_iceoryx_logs); assert!(verified.test_spec_config.iceoryx_external_busy_poll); assert!(!verified.test_spec_config.iceoryx_owner_client_busy_poll); - assert_eq!(verified.shared_memory_path, "/tmp/test_owner/test_cluster"); - assert_eq!( - verified.shared_file_path, - "/tmp/test_owner_files/test_cluster" - ); + assert_eq!(verified.share_mem_path, "/tmp/test_owner/test_cluster"); assert_eq!( verified.test_spec_config.transport_mode, Some(TestSpecTransportMode::TransferOnly) @@ -1478,8 +1602,8 @@ contribute_to_cluster_pool_size: fluxonkv_spec: etcd_addresses: ["127.0.0.1:2379"] cluster_name: test_cluster - shared_memory_path: /tmp/test_owner - shared_file_path: /tmp/test_owner_files + share_mem_path: /tmp/test_owner + large_file_paths: [/tmp/test_owner_large] sub_cluster: rack-a "#, ) @@ -1492,6 +1616,73 @@ fluxonkv_spec: assert!(verified.fluxonkv_spec.enable_transfer_rpc_fast_path); } + #[test] + fn client_config_zero_contribution_allows_owner_bootstrapped_large_file_paths() { + let cfg = ClientConfigYaml::from_str( + r#" +instance_key: test_external +fluxonkv_spec: + cluster_name: test_cluster + share_mem_path: /tmp/test_external +"#, + ) + .unwrap(); + let verified = cfg.verify().unwrap(); + assert_eq!(verified.large_file_paths.paths, Vec::::new()); + assert_eq!(verified.fluxonkv_spec.etcd_addresses, Vec::::new()); + assert_eq!(verified.fluxonkv_spec.sub_cluster, None); + } + + #[test] + fn client_config_zero_contribution_rejects_large_file_paths_in_yaml() { + let cfg = ClientConfigYaml::from_str( + r#" +instance_key: test_external +fluxonkv_spec: + cluster_name: test_cluster + share_mem_path: /tmp/test_external + large_file_paths: [/tmp/test_external_large] +"#, + ) + .unwrap(); + let err = cfg.verify().unwrap_err(); + let text = format!("{err}"); + assert!(text.contains("fluxonkv_spec.large_file_paths is forbidden in zero-contribution mode")); + } + + #[test] + fn large_file_paths_prefers_first_usable_root() { + let tempdir 
= new_test_dir("fluxon_large_paths_prefers_first_usable_root"); + let first_root = tempdir.join("first_root"); + let second_root = tempdir.join("second_root"); + std::fs::create_dir_all(&second_root).unwrap(); + + let large_file_paths = LargeFilePaths { + paths: vec![ + first_root.join("child").to_string_lossy().into_owned(), + second_root.to_string_lossy().into_owned(), + ], + }; + + let logs_dir = large_file_paths.kv_logs_dir("test_cluster").unwrap(); + assert_eq!( + logs_dir, + first_root.join("child").join("test_cluster_cluster_kv_logs") + ); + assert!(logs_dir.exists()); + + let third_party_logs_dir = large_file_paths + .third_party_logs_dir("test_cluster") + .unwrap(); + assert_eq!( + third_party_logs_dir, + first_root + .join("child") + .join("test_cluster_cluster_third_party_logs") + ); + assert!(third_party_logs_dir.exists()); + } + #[test] fn client_test_spec_config_accepts_explicit_rdma_device_names() { let cfg = ClientConfigYaml::from_str( @@ -1503,8 +1694,8 @@ contribute_to_cluster_pool_size: fluxonkv_spec: etcd_addresses: ["127.0.0.1:2379"] cluster_name: test_cluster - shared_memory_path: /tmp/test_owner - shared_file_path: /tmp/test_owner_files + share_mem_path: /tmp/test_owner + large_file_paths: [/tmp/test_owner_large] sub_cluster: rack-a test_spec_config: transport_mode: transfer_with_rpc @@ -1556,8 +1747,8 @@ contribute_to_cluster_pool_size: fluxonkv_spec: etcd_addresses: ["127.0.0.1:2379"] cluster_name: test_cluster - shared_memory_path: /tmp/test_owner - shared_file_path: /tmp/test_owner_files + share_mem_path: /tmp/test_owner + large_file_paths: [/tmp/test_owner_large] sub_cluster: rack-a test_spec_config: rdma_device_names: ["mlx5_0"] @@ -1591,8 +1782,8 @@ contribute_to_cluster_pool_size: fluxonkv_spec: etcd_addresses: ["127.0.0.1:2379"] cluster_name: test_cluster - shared_memory_path: /tmp/test_owner - shared_file_path: /tmp/test_owner_files + share_mem_path: /tmp/test_owner + large_file_paths: [/tmp/test_owner_large] sub_cluster: rack-a 
test_spec_config: transport_mode: transfer_with_rpc @@ -1622,8 +1813,8 @@ contribute_to_cluster_pool_size: fluxonkv_spec: etcd_addresses: ["127.0.0.1:2379"] cluster_name: test_cluster - shared_memory_path: /tmp/test_owner - shared_file_path: /tmp/test_owner_files + share_mem_path: /tmp/test_owner + large_file_paths: [/tmp/test_owner_large] sub_cluster: rack-a test_spec_config: require_transfer_rpc_fast_path_ready_timeout_seconds: 45 @@ -1647,8 +1838,8 @@ contribute_to_cluster_pool_size: fluxonkv_spec: etcd_addresses: ["127.0.0.1:2379"] cluster_name: test_cluster - shared_memory_path: /tmp/test_owner - shared_file_path: /tmp/test_owner_files + share_mem_path: /tmp/test_owner + large_file_paths: [/tmp/test_owner_large] sub_cluster: rack-a test_spec_config: tcp_thread_control_lane_count: 0 @@ -1673,8 +1864,8 @@ contribute_to_cluster_pool_size: fluxonkv_spec: etcd_addresses: ["127.0.0.1:2379"] cluster_name: test_cluster - shared_memory_path: /tmp/test_owner - shared_file_path: /tmp/test_owner_files + share_mem_path: /tmp/test_owner + large_file_paths: [/tmp/test_owner_large] sub_cluster: rack-a test_spec_config: transport_mode: transfer_with_rpc @@ -1704,8 +1895,8 @@ contribute_to_cluster_pool_size: fluxonkv_spec: etcd_addresses: ["127.0.0.1:2379"] cluster_name: test_cluster - shared_memory_path: /tmp/test_owner - shared_file_path: /tmp/test_owner_files + share_mem_path: /tmp/test_owner + large_file_paths: [/tmp/test_owner_large] sub_cluster: rack-a test_spec_config: transport_mode: transfer_with_rpc @@ -1728,8 +1919,8 @@ contribute_to_cluster_pool_size: fluxonkv_spec: etcd_addresses: ["127.0.0.1:2379"] cluster_name: test_cluster - shared_memory_path: /tmp/test_owner - shared_file_path: /tmp/test_owner_files + share_mem_path: /tmp/test_owner + large_file_paths: [/tmp/test_owner_large] sub_cluster: rack-a test_spec_config: rdma_device_names: ["mlx5_0"] @@ -1762,8 +1953,7 @@ contribute_to_cluster_pool_size: fluxonkv_spec: etcd_addresses: ["127.0.0.1:2379"] cluster_name: 
test_cluster - shared_memory_path: /tmp/test_owner - shared_file_path: /tmp/test_owner_files + share_mem_path: /tmp/test_owner sub_cluster: rack-a test_spec_config: transport_mode: transfer_only @@ -1782,8 +1972,7 @@ protocol: protocol_type: tcp fluxonkv_spec: cluster_name: test_cluster - shared_memory_path: /tmp/test_side_worker - shared_file_path: /tmp/test_side_worker_files + share_mem_path: /tmp/test_side_worker p2p_listen_port: 18081 test_spec_config: enable_side_transfer: true @@ -1794,13 +1983,9 @@ test_spec_config: let verified = cfg.verify().unwrap(); assert_eq!(verified.protocol.protocol_type, ProtocolType::Tcp); assert_eq!( - verified.shared_memory_path, + verified.share_mem_path, "/tmp/test_side_worker/test_cluster" ); - assert_eq!( - verified.shared_file_path, - "/tmp/test_side_worker_files/test_cluster" - ); assert_eq!( verified.fluxonkv_spec.transfer_engine, TransferEngineType::P2p @@ -1821,8 +2006,7 @@ protocol: protocol_type: tcp fluxonkv_spec: cluster_name: test_cluster - shared_memory_path: /tmp/test_side_worker - shared_file_path: /tmp/test_side_worker_files + share_mem_path: /tmp/test_side_worker test_spec_config: enable_side_transfer: true side_transfer_role: worker @@ -1852,8 +2036,7 @@ protocol: protocol_type: tcp fluxonkv_spec: cluster_name: test_cluster - shared_memory_path: /tmp/test_side_worker - shared_file_path: /tmp/test_side_worker_files + share_mem_path: /tmp/test_side_worker test_spec_config: enable_side_transfer: true side_transfer_role: worker @@ -1881,8 +2064,8 @@ contribute_to_cluster_pool_size: fluxonkv_spec: etcd_addresses: ["127.0.0.1:2379"] cluster_name: test_cluster - shared_memory_path: /tmp/test_owner - shared_file_path: /tmp/test_owner_files + share_mem_path: /tmp/test_owner + large_file_paths: [/tmp/test_owner_large] p2p_listen_port: 18081 sub_cluster: rack-a test_spec_config: @@ -1913,8 +2096,8 @@ contribute_to_cluster_pool_size: fluxonkv_spec: etcd_addresses: ["127.0.0.1:2379"] cluster_name: test_cluster - 
shared_memory_path: /tmp/test_owner - shared_file_path: /tmp/test_owner_files + share_mem_path: /tmp/test_owner + large_file_paths: [/tmp/test_owner_large] sub_cluster: rack-a "#, ) @@ -1938,8 +2121,8 @@ contribute_to_cluster_pool_size: fluxonkv_spec: etcd_addresses: ["127.0.0.1:2379"] cluster_name: test_cluster - shared_memory_path: /tmp/test_owner - shared_file_path: /tmp/test_owner_files + share_mem_path: /tmp/test_owner + large_file_paths: [/tmp/test_owner_large] sub_cluster: rack-a test_spec_config: transport_mode: transfer_with_rpc diff --git a/fluxon_rs/fluxon_kv/src/external_client_api/external_client_test.rs b/fluxon_rs/fluxon_kv/src/external_client_api/external_client_test.rs index f811424..b55f161 100644 --- a/fluxon_rs/fluxon_kv/src/external_client_api/external_client_test.rs +++ b/fluxon_rs/fluxon_kv/src/external_client_api/external_client_test.rs @@ -2,8 +2,8 @@ use std::collections::HashMap; use crate::cluster_manager::NodeID; use crate::config::{ - ClientConfig, ContributeToClusterPoolSize, FluxonKvSpec, MasterConfig, MonitoringConfig, - ProtocolConfig, ProtocolType, TestSpecConfig, TransferEngineType, + ClientConfig, ContributeToClusterPoolSize, FluxonKvSpec, LargeFilePaths, MasterConfig, + MonitoringConfig, ProtocolConfig, ProtocolType, TestSpecConfig, TransferEngineType, }; use crate::master_kv_router::MasterKvRouterView; use crate::{ConfigArg, run_client, run_master}; @@ -80,8 +80,10 @@ fn new_client_config( enable_transfer_rpc_fast_path: true, sub_cluster: None, }, - shared_memory_path: shm_path.to_string(), - shared_file_path: format!("{}_files", shm_path), + share_mem_path: shm_path.to_string(), + large_file_paths: LargeFilePaths { + paths: vec![format!("{}_large", shm_path)], + }, test_spec_config: TestSpecConfig::default(), } } @@ -92,8 +94,7 @@ fn new_zero_contribution_client_config( shm_path: &str, ) -> ClientConfig { // External instance_key MUST be different from owner. 
- // External bootstrap shares both owner bundle roots: shared_memory_path for mmap.file and - // shared_file_path for shared.json / peer metadata. + // External bootstrap shares the owner bundle root for mmap.file, shared.json, and peer metadata. let unique_suffix = std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) .map(|d| d.as_nanos()) @@ -122,8 +123,8 @@ fn new_zero_contribution_client_config( enable_transfer_rpc_fast_path: false, sub_cluster: None, }, - shared_memory_path: shm_path.to_string(), - shared_file_path: format!("{}_files", shm_path), + share_mem_path: shm_path.to_string(), + large_file_paths: LargeFilePaths { paths: Vec::new() }, test_spec_config: TestSpecConfig::default(), } } diff --git a/fluxon_rs/fluxon_kv/src/external_client_api/mod.rs b/fluxon_rs/fluxon_kv/src/external_client_api/mod.rs index f2634be..9cb291f 100644 --- a/fluxon_rs/fluxon_kv/src/external_client_api/mod.rs +++ b/fluxon_rs/fluxon_kv/src/external_client_api/mod.rs @@ -251,8 +251,8 @@ define_module!( /// External Client configuration parameters #[derive(Clone, Debug)] pub struct ExternalClientApiNewArg { - pub shared_memory_path: String, - pub shared_file_path: String, + pub share_mem_path: String, + pub large_file_paths: crate::config::LargeFilePaths, pub expected_cluster_name: String, pub expected_protocol_version: String, pub enable_side_transfer: bool, @@ -310,8 +310,8 @@ pub struct ExternalInner { initial_sub_cluster: OnceLock>, expected_cluster_name: String, expected_protocol_version: String, - external_shared_memory_path: String, - external_shared_file_path: String, + external_share_mem_path: String, + external_large_file_paths: crate::config::LargeFilePaths, _enable_side_transfer: bool, short_circuit_put_payload_path: bool, side_rr_next: AtomicUsize, @@ -350,7 +350,7 @@ impl ExternalClientApi { pub async fn construct(arg: ExternalClientApiNewArg) -> Result { tracing::info!( "Constructing ExternalClientApi in ExternalClient mode (PreView): shm_dir={}", - 
arg.shared_memory_path + arg.share_mem_path ); Ok(Self(ExternalInner { @@ -361,8 +361,8 @@ impl ExternalClientApi { initial_sub_cluster: OnceLock::new(), expected_cluster_name: arg.expected_cluster_name, expected_protocol_version: arg.expected_protocol_version, - external_shared_memory_path: arg.shared_memory_path, - external_shared_file_path: arg.shared_file_path, + external_share_mem_path: arg.share_mem_path, + external_large_file_paths: arg.large_file_paths, _enable_side_transfer: arg.enable_side_transfer, short_circuit_put_payload_path: arg.short_circuit_put_payload_path, side_rr_next: AtomicUsize::new(0), @@ -403,8 +403,7 @@ impl ExternalClientApi { let wait_start_ts = i64::MIN; let OwnerRestartPayload { meta, signature } = task_wait_owner_restart( ext.view.clone_view(), - ext.external_shared_memory_path.clone(), - ext.external_shared_file_path.clone(), + ext.external_share_mem_path.clone(), None, wait_start_ts, None, @@ -414,7 +413,7 @@ impl ExternalClientApi { .await?; let shared_memory_ptr = ExternalInner::init_shared_memory_from_meta( - &ext.external_shared_memory_path, + &ext.external_share_mem_path, &meta, signature, )?; @@ -843,13 +842,11 @@ impl ExternalInner { return Ok(false); }; - let shared_memory_path = self.shared_memory_path(); - let shared_file_path = self.shared_file_path(); - let shared_meta_path = format!("{}/shared.json", shared_file_path); + let share_mem_path = self.share_mem_path(); + let shared_meta_path = format!("{}/shared.json", share_mem_path); let probe = probe_owner_restart_payload( &self.view.clone_view(), - &shared_memory_path, - &shared_file_path, + &share_mem_path, &shared_meta_path, Some(¤t_signature), i64::MIN, @@ -868,7 +865,7 @@ impl ExternalInner { return Ok(false); } - self.finish_owner_recover(&shared_memory_path, payload) + self.finish_owner_recover(&share_mem_path, payload) .await?; Ok(true) } @@ -921,7 +918,7 @@ impl ExternalInner { match self.base_ptr().await { Ok(addr) => Ok(addr), Err(_) => { - let path = 
self.shared_memory_path(); + let path = self.share_mem_path(); let (st, addr) = self .wait_owner_recover_only(&path, *prev_owner_start_time) .await?; @@ -935,10 +932,10 @@ impl ExternalInner { async fn finish_owner_recover( &self, - shared_memory_path: &str, + share_mem_path: &str, payload: OwnerRestartPayload, ) -> KvResult<(i64, usize)> { - self.remap_shared_memory_with_payload(shared_memory_path, &payload) + self.remap_shared_memory_with_payload(share_mem_path, &payload) .await?; self.view .cluster_manager() @@ -960,10 +957,10 @@ impl ExternalInner { async fn wait_owner_recover_only( &self, - shared_memory_path: &str, + share_mem_path: &str, prev_owner_start_time: i64, ) -> KvResult<(i64, usize)> { - self.wait_owner_recover(shared_memory_path, prev_owner_start_time) + self.wait_owner_recover(share_mem_path, prev_owner_start_time) .await } @@ -971,7 +968,7 @@ impl ExternalInner { &self, prev_owner_start_time: &mut i64, ) -> KvResult { - let path = self.shared_memory_path(); + let path = self.share_mem_path(); let (st, addr) = self .wait_owner_recover_only(&path, *prev_owner_start_time) .await?; @@ -987,7 +984,7 @@ impl ExternalInner { return match self.base_ptr().await { Ok(addr) => Ok(addr), Err(_) => { - let path = self.shared_memory_path(); + let path = self.share_mem_path(); let (st, addr) = self .wait_owner_recover_only(&path, *prev_owner_start_time) .await?; @@ -997,7 +994,7 @@ impl ExternalInner { }; } - let path = self.shared_memory_path(); + let path = self.share_mem_path(); let (st, addr) = self .wait_owner_recover_only(&path, *prev_owner_start_time) .await?; @@ -1009,7 +1006,7 @@ impl ExternalInner { /// has advanced. 
async fn wait_owner_recover( &self, - _shared_memory_path: &str, + _share_mem_path: &str, prev_owner_start_time: i64, ) -> KvResult<(i64, usize)> { if let Some(res) = self @@ -1090,11 +1087,11 @@ impl ExternalInner { async fn remap_shared_memory_with_payload( &self, - shared_memory_path: &str, + share_mem_path: &str, payload: &OwnerRestartPayload, ) -> KvResult<()> { let shared_memory = Self::init_shared_memory_from_meta( - shared_memory_path, + share_mem_path, &payload.meta, payload.signature.clone(), )?; @@ -1212,11 +1209,11 @@ impl ExternalInner { } fn init_shared_memory_from_meta( - shared_memory_path: &str, + share_mem_path: &str, meta: &SharedJsonMeta, memory_signature: SharedMetaSignature, ) -> KvResult> { - let mmap_file_path = format!("{}/mmap.file", shared_memory_path); + let mmap_file_path = format!("{}/mmap.file", share_mem_path); Self::init_shared_memory(&mmap_file_path, meta.segment_len, memory_signature) } /// Get the shared storage node ID this client connects to @@ -1227,14 +1224,12 @@ impl ExternalInner { /// Get the configured shared-memory base path (external mode). /// Non-external modes return empty string. - pub fn shared_memory_path(&self) -> String { - self.external_shared_memory_path.clone() + pub fn share_mem_path(&self) -> String { + self.external_share_mem_path.clone() } - /// Get the configured shared-file base path (external mode). - /// Non-external modes return empty string. - pub fn shared_file_path(&self) -> String { - self.external_shared_file_path.clone() + pub fn large_file_paths(&self) -> &crate::config::LargeFilePaths { + &self.external_large_file_paths } fn should_fallback_side_p2p_error(err: &crate::p2p::P2PError) -> bool { @@ -1272,7 +1267,7 @@ impl ExternalInner { // require an extra enable flag once the owner has published ready lanes. 
let owner_id = self.shared_storage_node_id().await?; let owner_start_time = self.current_owner_start_time().await; - let peers_dir = ClientSegPool::side_transfer_peers_dir(&self.external_shared_file_path); + let peers_dir = ClientSegPool::side_transfer_peers_dir(&self.external_share_mem_path); let entries = std::fs::read_dir(&peers_dir).ok()?; let mut ready = Vec::new(); for entry in entries.flatten() { @@ -1534,7 +1529,7 @@ impl ExternalInner { let mut prev_owner_start_time = self.current_owner_start_time().await; let mut recover_attempts = 0usize; if self.base_ptr().await.is_err() { - let path = self.shared_memory_path(); + let path = self.share_mem_path(); tracing::info!("ExternalClientApi.is_exist waiting for owner at: {}", path); let _ = self.ensure_owner_ready(&mut prev_owner_start_time).await?; } @@ -1625,7 +1620,7 @@ impl ExternalInner { // Ensure external mode configured; if not, block until owner is ready once let mut prev_owner_start_time = self.current_owner_start_time().await; if self.base_ptr().await.is_err() { - let path = self.shared_memory_path(); + let path = self.share_mem_path(); tracing::info!( "ExternalClientApi.get detected unmapped shared memory; waiting at: {}", path @@ -1821,7 +1816,7 @@ key={}, attempt={}/{}, err={}", let mut base_addr: usize = match self.base_ptr().await { Ok(addr) => addr, Err(_) => { - let path = self.shared_memory_path(); + let path = self.share_mem_path(); tracing::info!( "ExternalClientApi.put detected unmapped shared memory; waiting for owner to be ready at path: {}", path @@ -1910,7 +1905,7 @@ key={}, attempt={}/{}, err={}", let mut base_addr: usize = match self.base_ptr().await { Ok(addr) => addr, Err(_) => { - let path = self.shared_memory_path(); + let path = self.share_mem_path(); tracing::info!( "ExternalClientApi.put_flat_dict_ptrs detected unmapped shared memory; waiting for owner to be ready at path: {}", path @@ -2300,7 +2295,7 @@ key={}, attempt={}/{}, err={}", let mut prev_owner_start_time = 
self.current_owner_start_time().await; let mut recover_attempts = 0usize; if self.base_ptr().await.is_err() { - let path = self.shared_memory_path(); + let path = self.share_mem_path(); tracing::info!("ExternalClientApi.delete waiting for owner at: {}", path); let _ = self.ensure_owner_ready(&mut prev_owner_start_time).await?; } @@ -2664,8 +2659,7 @@ async fn handle_sync_kv_to_file_external( async fn task_wait_owner_restart( view: ExternalClientApiView, - shared_memory_path: String, - shared_file_path: String, + share_mem_path: String, current_sig_snapshot: Option, wait_start_ts: i64, old_owner_id: Option, @@ -2674,7 +2668,7 @@ async fn task_wait_owner_restart( ) -> KvResult { let shutdown_poller = view.register_shutdown_poller(); let mut cluster_rx = view.cluster_manager().listen(); - let shared_meta_path = format!("{}/shared.json", &shared_file_path); + let shared_meta_path = format!("{}/shared.json", &share_mem_path); let mut waited = 0u64; loop { if !shutdown_poller.is_running() { @@ -2685,8 +2679,7 @@ async fn task_wait_owner_restart( match probe_owner_restart_payload( &view, - &shared_memory_path, - &shared_file_path, + &share_mem_path, &shared_meta_path, current_sig_snapshot.as_ref(), wait_start_ts, @@ -2735,8 +2728,7 @@ fn read_shared_json_snapshot( async fn probe_owner_restart_payload( view: &ExternalClientApiView, - shared_memory_path: &str, - shared_file_path: &str, + share_mem_path: &str, shared_meta_path: &str, current_sig_snapshot: Option<&SharedMetaSignature>, wait_start_ts: i64, @@ -2744,16 +2736,16 @@ async fn probe_owner_restart_payload( expected_cluster_name: &str, expected_protocol_version: &str, ) -> KvResult { - if !fluxon_util::fs_watch::are_files_ready(shared_memory_path, &["mmap.file"]) { + if !fluxon_util::fs_watch::are_files_ready(share_mem_path, &["mmap.file"]) { return Ok(OwnerRestartProbe::Pending(format!( "shared memory mmap.file not ready yet: path={}", - shared_memory_path + share_mem_path ))); } - if 
!fluxon_util::fs_watch::are_files_ready(shared_file_path, &["shared.json"]) { + if !fluxon_util::fs_watch::are_files_ready(share_mem_path, &["shared.json"]) { return Ok(OwnerRestartProbe::Pending(format!( "shared metadata shared.json not ready yet: path={}", - shared_file_path + share_mem_path ))); } @@ -2776,13 +2768,13 @@ async fn probe_owner_restart_payload( if meta.protocol_version != expected_protocol_version { return Ok(OwnerRestartProbe::Pending(format!( "shared.json protocol_version mismatch; waiting: shm_dir='{}' shared='{}' local='{}'", - shared_memory_path, meta.protocol_version, expected_protocol_version + share_mem_path, meta.protocol_version, expected_protocol_version ))); } if meta.cluster_name != expected_cluster_name { return Ok(OwnerRestartProbe::Pending(format!( "shared.json cluster_name mismatch; waiting: shm_dir='{}' shared='{}' local='{}'", - shared_memory_path, meta.cluster_name, expected_cluster_name + share_mem_path, meta.cluster_name, expected_cluster_name ))); } if let Some(old_owner_id) = old_owner_id { @@ -2852,7 +2844,7 @@ impl LogicalModule for ExternalClientApi { async fn shutdown(&self) -> Result<(), Self::Error> { // 只在ExternalClient模式下清理共享内存映射 let ext = &self.0; - if ext.shared_memory_path().is_empty() { + if ext.share_mem_path().is_empty() { tracing::info!("ExternalClientApi shutdown (no shared memory path configured)"); return Ok(()); } diff --git a/fluxon_rs/fluxon_kv/src/kv_test.rs b/fluxon_rs/fluxon_kv/src/kv_test.rs index a287769..5f0a9e2 100644 --- a/fluxon_rs/fluxon_kv/src/kv_test.rs +++ b/fluxon_rs/fluxon_kv/src/kv_test.rs @@ -11,7 +11,7 @@ use crate::cluster_manager::ClusterManagerRdmaControlInit; use crate::config::{ - ClientConfig, ContributeToClusterPoolSize, FluxonKvSpec, MasterConfig, MonitoringConfig, + ClientConfig, ContributeToClusterPoolSize, FluxonKvSpec, LargeFilePaths, MasterConfig, MonitoringConfig, ProtocolConfig, ProtocolType, TestSpecConfig, TestSpecTransportMode, TransferEngineType, }; use 
crate::run_master_with_test_overrides; @@ -609,8 +609,7 @@ struct KvTestClientOptions { transfer_backend_activation_mode: Option, enable_transfer_rpc_fast_path: Option, contribute_to_cluster_pool_size: Option, - shared_memory_path: Option, - shared_file_path: Option, + share_mem_path: Option, etcd_mode: Option, } @@ -639,14 +638,10 @@ impl KvTestClientOptions { .contribute_to_cluster_pool_size .clone() .or_else(|| self.contribute_to_cluster_pool_size.clone()), - shared_memory_path: overrides - .shared_memory_path + share_mem_path: overrides + .share_mem_path .clone() - .or_else(|| self.shared_memory_path.clone()), - shared_file_path: overrides - .shared_file_path - .clone() - .or_else(|| self.shared_file_path.clone()), + .or_else(|| self.share_mem_path.clone()), etcd_mode: overrides .etcd_mode .clone() @@ -767,8 +762,8 @@ struct KvTestRoundOptions { round_profile: KvTestRoundProfile, round_name: String, cluster_name: String, - master_port: u16, - step8_master_port: u16, + master_port: Option, + step8_master_port: Option, master_options: KvTestClientOptions, owner_client_options: KvTestClientOptions, external_client_options: KvTestClientOptions, @@ -800,7 +795,7 @@ impl KvTestRoundOptions { ) } - fn step8_shared_memory_path(&self) -> String { + fn step8_share_mem_path(&self) -> String { format!( "/tmp/kvcache_shared_memory_step8_{}_{}", self.round_name, @@ -808,13 +803,6 @@ impl KvTestRoundOptions { ) } - fn step8_shared_file_path(&self) -> String { - format!( - "/tmp/kvcache_shared_files_step8_{}_{}", - self.round_name, - kv_test_run_scope() - ) - } } #[derive(Clone, Debug)] @@ -850,6 +838,20 @@ fn default_external_contribute_to_cluster_pool_size() -> ContributeToClusterPool } } +fn default_client_large_file_paths( + instance_key: &str, + contribute_to_cluster_pool_size: &ContributeToClusterPoolSize, +) -> LargeFilePaths { + if contribute_to_cluster_pool_size.dram == 0 + && contribute_to_cluster_pool_size.vram.is_empty() + { + return LargeFilePaths { paths: 
Vec::new() }; + } + LargeFilePaths { + paths: vec![format!("/tmp/kvcache_large/{}", instance_key)], + } +} + fn default_owner_test_client_options(round_profile: KvTestRoundProfile) -> KvTestClientOptions { KvTestClientOptions { protocol_config: Some(round_profile.protocol_config()), @@ -858,8 +860,7 @@ fn default_owner_test_client_options(round_profile: KvTestRoundProfile) -> KvTes transfer_backend_activation_mode: round_profile.owner_transfer_backend_activation_mode(), enable_transfer_rpc_fast_path: Some(round_profile.enable_transfer_rpc_fast_path()), contribute_to_cluster_pool_size: Some(default_owner_contribute_to_cluster_pool_size()), - shared_memory_path: None, - shared_file_path: None, + share_mem_path: None, etcd_mode: Some(KvTestEtcdMode::Enabled), } } @@ -872,8 +873,7 @@ fn default_master_test_client_options(round_profile: KvTestRoundProfile) -> KvTe transfer_backend_activation_mode: round_profile.master_transfer_backend_activation_mode(), enable_transfer_rpc_fast_path: Some(round_profile.enable_transfer_rpc_fast_path()), contribute_to_cluster_pool_size: None, - shared_memory_path: None, - shared_file_path: None, + share_mem_path: None, etcd_mode: None, } } @@ -886,13 +886,12 @@ fn default_external_test_client_options() -> KvTestClientOptions { transfer_backend_activation_mode: None, enable_transfer_rpc_fast_path: Some(false), contribute_to_cluster_pool_size: Some(default_external_contribute_to_cluster_pool_size()), - shared_memory_path: None, - shared_file_path: None, + share_mem_path: None, etcd_mode: Some(KvTestEtcdMode::Disabled), } } -fn new_kv_test_round(round_profile: KvTestRoundProfile, master_port: u16) -> KvTestRoundOptions { +fn new_kv_test_round(round_profile: KvTestRoundProfile) -> KvTestRoundOptions { let round_name = round_profile.round_name(); KvTestRoundOptions { round_profile, @@ -900,8 +899,8 @@ fn new_kv_test_round(round_profile: KvTestRoundProfile, master_port: u16) -> KvT // Keep each process run on its own cluster namespace so a 
crashed/aborted previous run // cannot poison the next rerun with stale members. cluster_name: format!("test_cluster_{}_{}", round_name, kv_test_run_scope()), - master_port, - step8_master_port: master_port + 10, + master_port: None, + step8_master_port: None, master_options: default_master_test_client_options(round_profile), owner_client_options: default_owner_test_client_options(round_profile), external_client_options: default_external_test_client_options(), @@ -919,16 +918,16 @@ fn default_kv_test_run_options() -> KvTestRunOptions { .map(str::trim) .filter(|item| !item.is_empty()) { - let (profile, port) = match round_name { - "p2p_only" => (KvTestRoundProfile::P2pOnly, 50220), - "rdma_transfer_only" => (KvTestRoundProfile::RdmaTransferOnly, 50240), - "rdma_transfer_with_rpc" => (KvTestRoundProfile::RdmaTransferWithRpc, 50260), + let profile = match round_name { + "p2p_only" => KvTestRoundProfile::P2pOnly, + "rdma_transfer_only" => KvTestRoundProfile::RdmaTransferOnly, + "rdma_transfer_with_rpc" => KvTestRoundProfile::RdmaTransferWithRpc, other => panic!( "unsupported FLUXON_KV_TEST_ROUNDS entry '{}'; expected one of: p2p_only, rdma_transfer_only, rdma_transfer_with_rpc", other ), }; - rounds.push(new_kv_test_round(profile, port)); + rounds.push(new_kv_test_round(profile)); } if rounds.is_empty() { panic!("FLUXON_KV_TEST_ROUNDS was set but produced no valid rounds"); @@ -938,9 +937,9 @@ fn default_kv_test_run_options() -> KvTestRunOptions { KvTestRunOptions { rounds: vec![ - new_kv_test_round(KvTestRoundProfile::P2pOnly, 50220), - new_kv_test_round(KvTestRoundProfile::RdmaTransferOnly, 50240), - new_kv_test_round(KvTestRoundProfile::RdmaTransferWithRpc, 50260), + new_kv_test_round(KvTestRoundProfile::P2pOnly), + new_kv_test_round(KvTestRoundProfile::RdmaTransferOnly), + new_kv_test_round(KvTestRoundProfile::RdmaTransferWithRpc), ], } } @@ -949,7 +948,7 @@ fn default_kv_test_run_options() -> KvTestRunOptions { fn new_master_launch( round: &KvTestRoundOptions, 
instance_key: &str, - port: u16, + port: Option, ) -> KvTestMasterLaunch { // Read etcd endpoint from project root build_config_ext.yml let etcd = fluxon_util::dev_config::read_etcd_endpoint_from_build_config() @@ -980,7 +979,7 @@ fn new_master_launch( config: MasterConfig { instance_key: round.scoped_instance_key(instance_key), cluster_name: round.cluster_name.clone(), - port: Some(port), + port, etcd_endpoints: vec![etcd.clone()], protocol, transfer_engine, @@ -1020,19 +1019,17 @@ fn build_client_launch( .rdma_control_init .expect("kv_test requires rdma_control_init to be set explicitly"); let transfer_backend_activation_mode = options.transfer_backend_activation_mode; - let shared_memory_path = options - .shared_memory_path + let contribute_to_cluster_pool_size = options + .contribute_to_cluster_pool_size + .unwrap_or(default_owner_contribute_to_cluster_pool_size()); + let share_mem_path = options + .share_mem_path .unwrap_or_else(|| format!("/tmp/kvcache_shared_memory/{}", instance_key)); - let shared_file_path = options - .shared_file_path - .unwrap_or_else(|| format!("/tmp/kvcache_shared_files/{}", instance_key)); let config = ClientConfig { cluster_name: round.cluster_name.clone(), etcd_addresses_raw, instance_key: instance_key.clone(), - contribute_to_cluster_pool_size: options - .contribute_to_cluster_pool_size - .unwrap_or(default_owner_contribute_to_cluster_pool_size()), + contribute_to_cluster_pool_size: contribute_to_cluster_pool_size.clone(), protocol: options.protocol_config.unwrap_or_else(tcp_protocol_config), pprof_duration_seconds: None, redis_compat_listen_addr: None, @@ -1052,8 +1049,11 @@ fn build_client_launch( // kv_test uses a per-instance shared memory path by default so each owner/external share // group is explicit and test overrides only replace this when a scenario intentionally // binds multiple roles to the same owner path. 
- shared_memory_path, - shared_file_path, + share_mem_path, + large_file_paths: default_client_large_file_paths( + &instance_key, + &contribute_to_cluster_pool_size, + ), // Mirror round intent into the generated config so logs and runtime behavior // agree on whether this launch is transfer_only vs transfer_with_rpc. test_spec_config: kv_test_round_test_spec_config(round.round_profile), @@ -1083,7 +1083,7 @@ fn new_client_launch( } /// 创建测试用的ExternalClient配置 -/// external 与 owner 的 instance_key 必须不同;仅共享 owner 的 shared_memory_path +/// external 与 owner 的 instance_key 必须不同;仅共享 owner 的 share_mem_path fn new_external_client_launch( round: &KvTestRoundOptions, external_instance_key: &str, @@ -1108,18 +1108,12 @@ fn new_external_client_launch( if external_options.enable_transfer_rpc_fast_path.is_none() { external_options.enable_transfer_rpc_fast_path = Some(false); } - if external_options.shared_memory_path.is_none() { - external_options.shared_memory_path = Some(format!( + if external_options.share_mem_path.is_none() { + external_options.share_mem_path = Some(format!( "/tmp/kvcache_shared_memory/{}", round.scoped_instance_key(owner_instance_key) )); } - if external_options.shared_file_path.is_none() { - external_options.shared_file_path = Some(format!( - "/tmp/kvcache_shared_files/{}", - round.scoped_instance_key(owner_instance_key) - )); - } build_client_launch( round, round.scoped_instance_key(external_instance_key), @@ -1586,30 +1580,17 @@ async fn shutdown_framework_with_timeout(label: &str, framework: &crate::Framewo async fn run_kv_step8(round: &KvTestRoundOptions) { info!("📋 Step 8: Verifying external client blocking and recovery behavior"); - let step8_shared_memory_path = round.step8_shared_memory_path(); - let step8_shared_file_path = round.step8_shared_file_path(); - if let Err(e) = fs::remove_dir_all(&step8_shared_memory_path) { + let step8_share_mem_path = round.step8_share_mem_path(); + if let Err(e) = fs::remove_dir_all(&step8_share_mem_path) { warn!( 
"Step 8: failed to remove existing shared memory dir {}: {}", - step8_shared_memory_path, e + step8_share_mem_path, e ); } - if let Err(e) = fs::create_dir_all(&step8_shared_memory_path) { + if let Err(e) = fs::create_dir_all(&step8_share_mem_path) { warn!( "Step 8: failed to pre-create shared memory dir {}: {}", - step8_shared_memory_path, e - ); - } - if let Err(e) = fs::remove_dir_all(&step8_shared_file_path) { - warn!( - "Step 8: failed to remove existing shared file dir {}: {}", - step8_shared_file_path, e - ); - } - if let Err(e) = fs::create_dir_all(&step8_shared_file_path) { - warn!( - "Step 8: failed to pre-create shared file dir {}: {}", - step8_shared_file_path, e + step8_share_mem_path, e ); } @@ -1630,15 +1611,13 @@ async fn run_kv_step8(round: &KvTestRoundOptions) { let step8_owner_options = round .owner_client_options .merged_with(&KvTestClientOptions { - shared_memory_path: Some(step8_shared_memory_path.clone()), - shared_file_path: Some(step8_shared_file_path.clone()), + share_mem_path: Some(step8_share_mem_path.clone()), ..Default::default() }); let step8_external_options = round .external_client_options .merged_with(&KvTestClientOptions { - shared_memory_path: Some(step8_shared_memory_path.clone()), - shared_file_path: Some(step8_shared_file_path.clone()), + share_mem_path: Some(step8_share_mem_path.clone()), ..Default::default() }); @@ -1840,23 +1819,17 @@ async fn run_kv_step8(round: &KvTestRoundOptions) { .await; shutdown_framework_with_timeout("step8 master", &master_framework_step8).await; - if let Err(e) = fs::remove_dir_all(&step8_shared_memory_path) { + if let Err(e) = fs::remove_dir_all(&step8_share_mem_path) { warn!( "Step 8: failed to clean shared memory dir {} on exit: {}", - step8_shared_memory_path, e - ); - } - if let Err(e) = fs::remove_dir_all(&step8_shared_file_path) { - warn!( - "Step 8: failed to clean shared file dir {} on exit: {}", - step8_shared_file_path, e + step8_share_mem_path, e ); } } async fn run_kv_round(round: 
&KvTestRoundOptions) { info!( - "Round '{}' uses cluster '{}' and master ports {} / {}", + "Round '{}' uses cluster '{}' and master ports {:?} / {:?}", round.round_name, round.cluster_name, round.master_port, round.step8_master_port ); @@ -2066,7 +2039,7 @@ async fn run_kv_round(round: &KvTestRoundOptions) { // 启动多个客户端节点 let client1_launch = new_client_launch(round, "test_client_1", None); - // external 与 owner 使用不同的 instance_key,但共享 owner 的 shared_memory_path + // external 与 owner 使用不同的 instance_key,但共享 owner 的 share_mem_path let client2_launch = new_external_client_launch(round, "test_client_1_ext2", "test_client_1", None); let client3_launch = diff --git a/fluxon_rs/fluxon_kv/src/kvcore_test_lib.rs b/fluxon_rs/fluxon_kv/src/kvcore_test_lib.rs index 355ca6e..778666f 100644 --- a/fluxon_rs/fluxon_kv/src/kvcore_test_lib.rs +++ b/fluxon_rs/fluxon_kv/src/kvcore_test_lib.rs @@ -121,8 +121,7 @@ fn new_client_config_with_cluster_and_dram( .expect("read raw etcd endpoint from build_config_ext.yml"); // Shared memory path lives under the same test workdir base used by master logs let base = test_workdir_base(); - let shared_memory_path = format!("{}/sharemem/{}", base, instance_key); - let shared_file_path = format!("{}/sharefile/{}", base, instance_key); + let share_mem_path = format!("{}/sharemem/{}", base, instance_key); let conf = ClientConfig { cluster_name: cluster_name.to_string(), etcd_addresses_raw: vec![etcd_raw], @@ -145,8 +144,10 @@ fn new_client_config_with_cluster_and_dram( enable_transfer_rpc_fast_path: true, sub_cluster: None, }, - shared_memory_path, - shared_file_path, + share_mem_path, + large_file_paths: crate::config::LargeFilePaths { + paths: vec![format!("{}/large/{}", base, instance_key)], + }, test_spec_config: TestSpecConfig::default(), }; println!("fluxonkv core created client config for test: {:?}", conf); diff --git a/fluxon_rs/fluxon_kv/src/lib.rs b/fluxon_rs/fluxon_kv/src/lib.rs index b46fd85..edaa386 100644 --- 
a/fluxon_rs/fluxon_kv/src/lib.rs +++ b/fluxon_rs/fluxon_kv/src/lib.rs @@ -105,6 +105,16 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use tracing::{info, warn}; +struct ExternalBootstrapBundle { + meta: SharedJsonMeta, +} + +struct ExternalBootstrapMetadata { + meta: SharedJsonMeta, + share_mem_path: String, + etcd_endpoints: Vec, +} + fn cluster_manager_rdma_control_init_from_transfer_config( _transfer_engine: TransferEngineType, _protocol: &ProtocolConfig, @@ -585,7 +595,7 @@ fn tcp_thread_transport_tuning_from_test_spec_config( } pub async fn load_client_config(config_arg: ConfigArg) -> KvResult { - match config_arg { + let config = match config_arg { ConfigArg::None => { // Try to find default config file match find_default_config_file() { @@ -594,13 +604,13 @@ pub async fn load_client_config(config_arg: ConfigArg) -> KvResult let config_yaml = ClientConfigYaml::from_file(&path)?; let config = config_yaml.verify()?; println!("Client configuration loaded and validated successfully"); - Ok(config) + config } None => Err(ConfigError::FileReadError { detail: "No config file found. 
Please provide a config file with -f option" .to_string(), } - .into_kverror()), + .into_kverror())?, } } ConfigArg::File(config_path) => { @@ -608,13 +618,15 @@ pub async fn load_client_config(config_arg: ConfigArg) -> KvResult let config_yaml = ClientConfigYaml::from_file(&config_path)?; let config = config_yaml.verify()?; println!("Client configuration loaded and validated successfully"); - Ok(config) + config } ConfigArg::Config(config) => { println!("Using provided client configuration"); - Ok(config) + config } - } + }; + + bootstrap_zero_contribution_client_config(config).await } pub async fn load_master_config(config_arg: ConfigArg) -> KvResult { @@ -783,8 +795,8 @@ fn build_side_transfer_worker_config( enable_transfer_rpc_fast_path: false, sub_cluster: None, }, - shared_memory_path: owner_config.shared_memory_path.clone(), - shared_file_path: owner_config.shared_file_path.clone(), + share_mem_path: owner_config.share_mem_path.clone(), + large_file_paths: owner_config.large_file_paths.clone(), test_spec_config, }) } @@ -827,8 +839,8 @@ fn build_side_transfer_worker_config_yaml( fluxonkv_spec: crate::config::FluxonKvSpecYaml { etcd_addresses: None, cluster_name: side_config.cluster_name, - shared_memory_path: side_config.shared_memory_path, - shared_file_path: side_config.shared_file_path, + share_mem_path: side_config.share_mem_path, + large_file_paths: None, p2p_listen_port: side_config.fluxonkv_spec.p2p_listen_port, redis_compat: None, sub_cluster: None, @@ -838,14 +850,14 @@ fn build_side_transfer_worker_config_yaml( } fn side_transfer_runtime_dir(owner_config: &ClientConfig) -> PathBuf { - Path::new(&owner_config.shared_file_path) - .join(format!("{}_cluster_kv_logs", owner_config.cluster_name)) - .join("side_transfer_runtime") - .join(&owner_config.instance_key) + owner_config + .large_file_paths + .side_transfer_runtime_dir(&owner_config.cluster_name, &owner_config.instance_key) + .unwrap_or_else(|err| panic!("invalid owner large_file_paths: {}", 
err)) } fn cluster_manager_local_ipc_root( - shared_memory_path: &str, + share_mem_path: &str, test_spec_config: &TestSpecConfig, ) -> Option { // Test-only override: @@ -863,35 +875,35 @@ fn cluster_manager_local_ipc_root( // they do not need to reuse the same literal filesystem path. // // Causal chain: - // - `shared_memory_path` is authoritative for mmap.file/shared.json coordination and can be long. + // - `share_mem_path` is authoritative for mmap.file/shared.json coordination and can be long. // - iceoryx2 event listeners materialize AF_UNIX socket files under `local_ipc_root`. - // - AF_UNIX paths are short; reusing a long `shared_memory_path` makes listener creation fail + // - AF_UNIX paths are short; reusing a long `share_mem_path` makes listener creation fail // as `ResourceCreationFailed`, even on a clean start with no stale resources. // - Therefore we derive a short, stable alias from the canonical shared-memory root and publish // only that alias as `local_ipc_root`. Some( - derive_short_local_ipc_root(shared_memory_path) + derive_short_local_ipc_root(share_mem_path) .unwrap_or_else(|err| panic!("failed to derive local_ipc_root: {}", err)), ) } -fn derive_short_local_ipc_root(shared_memory_path: &str) -> Result { - if shared_memory_path.trim().is_empty() { - anyhow::bail!("shared_memory_path cannot be empty"); +fn derive_short_local_ipc_root(share_mem_path: &str) -> Result { + if share_mem_path.trim().is_empty() { + anyhow::bail!("share_mem_path cannot be empty"); } - std::fs::create_dir_all(shared_memory_path).map_err(|e| { + std::fs::create_dir_all(share_mem_path).map_err(|e| { anyhow::anyhow!( - "shared_memory_path must be creatable before deriving local_ipc_root: path='{}', err={}", - shared_memory_path, + "share_mem_path must be creatable before deriving local_ipc_root: path='{}', err={}", + share_mem_path, e ) })?; - let canonical = std::fs::canonicalize(shared_memory_path).map_err(|e| { + let canonical = 
std::fs::canonicalize(share_mem_path).map_err(|e| { anyhow::anyhow!( - "shared_memory_path must be canonicalizable before deriving local_ipc_root: path='{}', err={}", - shared_memory_path, + "share_mem_path must be canonicalizable before deriving local_ipc_root: path='{}', err={}", + share_mem_path, e ) })?; @@ -1099,22 +1111,22 @@ fn format_side_transfer_worker_output_tails(worker: &SideTransferWorkerProcess) } fn read_side_transfer_peer_file( - shared_file_path: &str, + share_mem_path: &str, side_id: &str, ) -> Option { - let peer_path = ClientSegPool::side_transfer_peer_file_path(shared_file_path, side_id); + let peer_path = ClientSegPool::side_transfer_peer_file_path(share_mem_path, side_id); let payload = std::fs::read_to_string(&peer_path).ok()?; serde_json::from_str::(&payload).ok() } fn is_side_transfer_worker_ready( _cluster_manager: &ClusterManager, - shared_file_path: &str, + share_mem_path: &str, owner_id: &str, owner_start_time: i64, side_id: &str, ) -> bool { - let Some(meta) = read_side_transfer_peer_file(shared_file_path, side_id) else { + let Some(meta) = read_side_transfer_peer_file(share_mem_path, side_id) else { return false; }; // Peer files are written only after the worker has attached shared memory and finished @@ -1144,8 +1156,8 @@ fn start_side_transfer_worker( } fn cleanup_stale_side_transfer_bootstrap_artifacts(owner_config: &ClientConfig) -> Result<()> { - let shared_file_path = Path::new(&owner_config.shared_file_path); - let shared_json_path = shared_file_path.join("shared.json"); + let share_mem_path = Path::new(&owner_config.share_mem_path); + let shared_json_path = share_mem_path.join("shared.json"); match std::fs::remove_file(&shared_json_path) { Ok(()) => { info!( @@ -1164,7 +1176,7 @@ fn cleanup_stale_side_transfer_bootstrap_artifacts(owner_config: &ClientConfig) } } - let peers_dir = ClientSegPool::side_transfer_peers_dir(&owner_config.shared_file_path); + let peers_dir = 
ClientSegPool::side_transfer_peers_dir(&owner_config.share_mem_path); match std::fs::remove_dir_all(&peers_dir) { Ok(()) => { info!( @@ -1211,7 +1223,7 @@ async fn wait_for_side_transfer_workers_ready( } if is_side_transfer_worker_ready( cluster_manager, - &owner_config.shared_file_path, + &owner_config.share_mem_path, &owner_info.id, owner_info.node_start_time, &worker.side_id, @@ -1569,6 +1581,225 @@ fn merge_startup_member_metadata( Ok(()) } +async fn bootstrap_zero_contribution_client_config(config: ClientConfig) -> KvResult { + let dram = config.contribute_to_cluster_pool_size.dram; + let vram_is_zero = config + .contribute_to_cluster_pool_size + .vram + .values() + .all(|&v| v == 0); + let is_zero_contribution = dram == 0 && vram_is_zero; + if !is_zero_contribution { + return Ok(config); + } + + let metadata = + load_external_bootstrap_metadata(&config.share_mem_path, &config.cluster_name).await?; + let mut final_config = config; + final_config.etcd_addresses_raw = metadata.meta.etcd_addresses.clone(); + final_config.fluxonkv_spec.etcd_addresses = metadata.etcd_endpoints; + final_config.fluxonkv_spec.sub_cluster = metadata.meta.sub_cluster.clone(); + final_config.share_mem_path = metadata.share_mem_path; + final_config.large_file_paths = metadata.meta.large_file_paths; + Ok(final_config) +} + +async fn load_external_bootstrap_metadata( + share_mem_path: &str, + expected_cluster_name: &str, +) -> KvResult { + let build_version = fluxon_util::git_version_build_record::get_current_git_commitid().unwrap(); + let share_mem_dir = Path::new(share_mem_path); + let shared_json_path = share_mem_dir.join("shared.json"); + + let mut waited_ticks: u64 = 0; + loop { + let shared_json_buf = match std::fs::read_to_string(&shared_json_path) { + Ok(v) => v, + Err(e) => { + limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200)).await; + waited_ticks += 1; + if waited_ticks % 25 == 0 { + warn!( + "Waiting owner shared.json readable... 
({}s), path={}, err={}", + waited_ticks / 5, + shared_json_path.to_string_lossy(), + e + ); + } + continue; + } + }; + + let meta: crate::SharedJsonMeta = match serde_json::from_str(&shared_json_buf) { + Ok(v) => v, + Err(e) => { + limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200)).await; + waited_ticks += 1; + if waited_ticks % 25 == 0 { + warn!( + "Waiting owner shared.json schema ready... ({}s), path={}, err={}", + waited_ticks / 5, + shared_json_path.to_string_lossy(), + e + ); + } + continue; + } + }; + + if meta.protocol_version != build_version { + limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200)).await; + waited_ticks += 1; + if waited_ticks % 25 == 0 { + warn!( + "Waiting protocol_version match... ({}s), share_mem_dir='{}', shared='{}', local='{}'", + waited_ticks / 5, + share_mem_dir.to_string_lossy(), + meta.protocol_version, + build_version + ); + } + continue; + } + + if meta.cluster_name != expected_cluster_name { + limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200)).await; + waited_ticks += 1; + if waited_ticks % 25 == 0 { + warn!( + "Waiting cluster_name match... ({}s), share_mem_dir='{}', config='{}', shared.json='{}'", + waited_ticks / 5, + share_mem_dir.to_string_lossy(), + expected_cluster_name, + meta.cluster_name + ); + } + continue; + } + + let share_mem_path_canonical = match std::fs::canonicalize(share_mem_path) { + Ok(v) => v.to_string_lossy().into_owned(), + Err(e) => { + limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200)).await; + waited_ticks += 1; + if waited_ticks % 25 == 0 { + warn!( + "Waiting share_mem_path canonicalizable... 
({}s), share_mem_dir='{}', path='{}', err={}", + waited_ticks / 5, + share_mem_dir.to_string_lossy(), + share_mem_path, + e + ); + } + continue; + } + }; + + let meta_shm_canonical = match std::fs::canonicalize(&meta.share_mem_path) { + Ok(v) => v.to_string_lossy().into_owned(), + Err(e) => { + limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200)).await; + waited_ticks += 1; + if waited_ticks % 25 == 0 { + warn!( + "Waiting shared.json share_mem_path canonicalizable... ({}s), share_mem_dir='{}', path='{}', err={}", + waited_ticks / 5, + share_mem_dir.to_string_lossy(), + meta.share_mem_path, + e + ); + } + continue; + } + }; + + if meta_shm_canonical != share_mem_path_canonical { + limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200)).await; + waited_ticks += 1; + if waited_ticks % 25 == 0 { + warn!( + "Waiting share_mem_path match... ({}s), share_mem_dir='{}', config='{}', shared.json='{}'", + waited_ticks / 5, + share_mem_dir.to_string_lossy(), + share_mem_path_canonical, + meta_shm_canonical + ); + } + continue; + } + + if meta.etcd_addresses.is_empty() { + limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200)).await; + waited_ticks += 1; + if waited_ticks % 25 == 0 { + warn!( + "Waiting shared.json etcd_addresses non-empty... ({}s), share_mem_dir='{}', share_mem_path='{}'", + waited_ticks / 5, + share_mem_dir.to_string_lossy(), + meta_shm_canonical + ); + } + continue; + } + + let etcd_endpoints = match normalize_etcd_addresses(&meta.etcd_addresses) { + Ok(v) => v, + Err(e) => { + limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200)).await; + waited_ticks += 1; + if waited_ticks % 25 == 0 { + warn!( + "Waiting shared.json etcd_addresses valid... 
({}s), share_mem_dir='{}', raw={:?}, err={}", + waited_ticks / 5, + share_mem_dir.to_string_lossy(), + meta.etcd_addresses, + e + ); + } + continue; + } + }; + + return Ok(ExternalBootstrapMetadata { + meta, + share_mem_path: meta_shm_canonical, + etcd_endpoints, + }); + } +} + +async fn wait_for_external_bootstrap_bundle( + config: &ClientConfig, +) -> KvResult { + let metadata = + load_external_bootstrap_metadata(&config.share_mem_path, &config.cluster_name).await?; + let share_mem_dir = Path::new(&metadata.share_mem_path); + let shared_json_path = share_mem_dir.join("shared.json"); + let mmap_file_path = share_mem_dir.join("mmap.file"); + + let mut waited_ticks: u64 = 0; + loop { + if !shared_json_path.exists() || !mmap_file_path.exists() { + limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200)).await; + waited_ticks += 1; + if waited_ticks % 25 == 0 { + info!( + "Waiting owner shared bundle to be ready... ({}s), share_mem_dir={} (shared.json={}, mmap.file={})", + waited_ticks / 5, + share_mem_dir.to_string_lossy(), + shared_json_path.exists(), + mmap_file_path.exists() + ); + } + continue; + } + return Ok(ExternalBootstrapBundle { + meta: metadata.meta, + }); + } +} + async fn run_client_impl( config_arg: ConfigArg, test_overrides: Option, @@ -1598,10 +1829,11 @@ async fn run_client_impl( let build_version = fluxon_util::git_version_build_record::get_current_git_commitid().unwrap(); let source_sha256 = fluxon_util::build_info::SOURCE_SHA256; - // 初始化日志系统:将日志放到共享文件根目录 - // 下的 {cluster_name}_cluster_kv_logs 子目录,避免在 shm 根目录下展开普通文件。 - let kv_logs_dir = Path::new(&config.shared_file_path) - .join(format!("{}_cluster_kv_logs", config.cluster_name)); + // Logs and other large files are isolated from shared.json/peer metadata. 
+ let kv_logs_dir = config + .large_file_paths + .kv_logs_dir(&config.cluster_name) + .map_err(|e| anyhow::anyhow!("invalid large_file_paths for kv logs: {}", e))?; let observability_disabled = config.test_spec_config.disable_observability; let greptime_tracing_rx = if observability_disabled { fluxon_util::init_log(&kv_logs_dir, &config.instance_key); @@ -1626,14 +1858,14 @@ async fn run_client_impl( println!("Client config: {:?}", config); println!( - "Client shared_memory_path resolved to: {:?}", - config.shared_memory_path + "Client share_mem_path resolved to: {:?}", + config.share_mem_path ); info!("Client config: {:?}", config); info!( - "Client shared_memory_path resolved to: {:?}", - config.shared_memory_path + "Client share_mem_path resolved to: {:?}", + config.share_mem_path ); info!("Build version (git commit): {}", build_version); info!("Build version (source-sha256): {}", source_sha256); @@ -1651,263 +1883,10 @@ async fn run_client_impl( config.test_spec_config.side_transfer_role, Some(SideTransferRole::Worker) ); - let mut bootstrapped_shared_meta: Option = None; - - let config = if is_external { - let shared_memory_dir = Path::new(&config.shared_memory_path); - let shared_file_dir = Path::new(&config.shared_file_path); - let shared_json_path = shared_file_dir.join("shared.json"); - let mmap_file_path = shared_memory_dir.join("mmap.file"); - - let mut waited_ticks: u64 = 0; - let (meta, meta_shm_canonical, meta_file_canonical, etcd_endpoints) = loop { - if !shared_json_path.exists() || !mmap_file_path.exists() { - limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200)).await; - waited_ticks += 1; - if waited_ticks % 25 == 0 { - info!( - "Waiting owner shared bundle to be ready... 
({}s), shm_dir={} file_dir={} (shared.json={}, mmap.file={})", - waited_ticks / 5, - shared_memory_dir.to_string_lossy(), - shared_file_dir.to_string_lossy(), - shared_json_path.exists(), - mmap_file_path.exists() - ); - } - continue; - } - - let shared_json_buf = match std::fs::read_to_string(&shared_json_path) { - Ok(v) => v, - Err(e) => { - limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200)) - .await; - waited_ticks += 1; - if waited_ticks % 25 == 0 { - warn!( - "Waiting owner shared.json readable... ({}s), path={}, err={}", - waited_ticks / 5, - shared_json_path.to_string_lossy(), - e - ); - } - continue; - } - }; - - let meta: crate::SharedJsonMeta = match serde_json::from_str(&shared_json_buf) { - Ok(v) => v, - Err(e) => { - limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200)) - .await; - waited_ticks += 1; - if waited_ticks % 25 == 0 { - warn!( - "Waiting owner shared.json schema ready... ({}s), path={}, err={}", - waited_ticks / 5, - shared_json_path.to_string_lossy(), - e - ); - } - continue; - } - }; - - if meta.protocol_version != build_version { - limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200)).await; - waited_ticks += 1; - if waited_ticks % 25 == 0 { - warn!( - "Waiting protocol_version match... ({}s), shm_dir='{}' file_dir='{}', shared='{}', local='{}'", - waited_ticks / 5, - shared_memory_dir.to_string_lossy(), - shared_file_dir.to_string_lossy(), - meta.protocol_version, - build_version - ); - } - continue; - } - - if meta.cluster_name != config.cluster_name { - limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200)).await; - waited_ticks += 1; - if waited_ticks % 25 == 0 { - warn!( - "Waiting cluster_name match... 
({}s), shm_dir='{}' file_dir='{}', config='{}', shared.json='{}'", - waited_ticks / 5, - shared_memory_dir.to_string_lossy(), - shared_file_dir.to_string_lossy(), - config.cluster_name, - meta.cluster_name - ); - } - continue; - } - - let shared_memory_path_canonical = match std::fs::canonicalize( - &config.shared_memory_path, - ) { - Ok(v) => v.to_string_lossy().into_owned(), - Err(e) => { - limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200)) - .await; - waited_ticks += 1; - if waited_ticks % 25 == 0 { - warn!( - "Waiting shared_memory_path canonicalizable... ({}s), shm_dir='{}', path='{}', err={}", - waited_ticks / 5, - shared_memory_dir.to_string_lossy(), - config.shared_memory_path, - e - ); - } - continue; - } - }; - - let meta_shm_canonical = match std::fs::canonicalize(&meta.shared_memory_path) { - Ok(v) => v.to_string_lossy().into_owned(), - Err(e) => { - limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200)) - .await; - waited_ticks += 1; - if waited_ticks % 25 == 0 { - warn!( - "Waiting shared.json shared_memory_path canonicalizable... ({}s), shm_dir='{}', path='{}', err={}", - waited_ticks / 5, - shared_memory_dir.to_string_lossy(), - meta.shared_memory_path, - e - ); - } - continue; - } - }; - let shared_file_path_canonical = match std::fs::canonicalize(&config.shared_file_path) { - Ok(v) => v.to_string_lossy().into_owned(), - Err(e) => { - limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200)) - .await; - waited_ticks += 1; - if waited_ticks % 25 == 0 { - warn!( - "Waiting shared_file_path canonicalizable... 
({}s), file_dir='{}', path='{}', err={}", - waited_ticks / 5, - shared_file_dir.to_string_lossy(), - config.shared_file_path, - e - ); - } - continue; - } - }; - let meta_file_canonical = match std::fs::canonicalize(&meta.shared_file_path) { - Ok(v) => v.to_string_lossy().into_owned(), - Err(e) => { - limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200)) - .await; - waited_ticks += 1; - if waited_ticks % 25 == 0 { - warn!( - "Waiting shared.json shared_file_path canonicalizable... ({}s), file_dir='{}', path='{}', err={}", - waited_ticks / 5, - shared_file_dir.to_string_lossy(), - meta.shared_file_path, - e - ); - } - continue; - } - }; - - if meta_shm_canonical != shared_memory_path_canonical { - limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200)).await; - waited_ticks += 1; - if waited_ticks % 25 == 0 { - warn!( - "Waiting shared_memory_path match... ({}s), shm_dir='{}', config='{}', shared.json='{}'", - waited_ticks / 5, - shared_memory_dir.to_string_lossy(), - shared_memory_path_canonical, - meta_shm_canonical - ); - } - continue; - } - if meta_file_canonical != shared_file_path_canonical { - limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200)).await; - waited_ticks += 1; - if waited_ticks % 25 == 0 { - warn!( - "Waiting shared_file_path match... ({}s), file_dir='{}', config='{}', shared.json='{}'", - waited_ticks / 5, - shared_file_dir.to_string_lossy(), - shared_file_path_canonical, - meta_file_canonical - ); - } - continue; - } - - if meta.etcd_addresses.is_empty() { - limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200)).await; - waited_ticks += 1; - if waited_ticks % 25 == 0 { - warn!( - "Waiting shared.json etcd_addresses non-empty... 
({}s), shm_dir='{}' file_dir='{}', shared_memory_path='{}'", - waited_ticks / 5, - shared_memory_dir.to_string_lossy(), - shared_file_dir.to_string_lossy(), - meta_shm_canonical - ); - } - continue; - } - - let etcd_endpoints = match normalize_etcd_addresses(&meta.etcd_addresses) { - Ok(v) => v, - Err(e) => { - limit_thirdparty::tokio::time::sleep(std::time::Duration::from_millis(200)) - .await; - waited_ticks += 1; - if waited_ticks % 25 == 0 { - warn!( - "Waiting shared.json etcd_addresses valid... ({}s), shm_dir='{}' file_dir='{}', raw={:?}, err={}", - waited_ticks / 5, - shared_memory_dir.to_string_lossy(), - shared_file_dir.to_string_lossy(), - meta.etcd_addresses, - e - ); - } - continue; - } - }; - - break ( - meta, - meta_shm_canonical, - meta_file_canonical, - etcd_endpoints, - ); - }; - bootstrapped_shared_meta = Some(meta.clone()); - // External bootstrap contract: - // - Config provides: instance_key, fluxonkv_spec.cluster_name, fluxonkv_spec.shared_memory_path, - // fluxonkv_spec.shared_file_path, fluxonkv_spec.p2p_listen_port. - // - shared.json provides: cluster_name, etcd_addresses (raw), shared_memory_path (canonical), - // shared_file_path (canonical), protocol_version, sub_cluster. - // - pprof_duration_seconds is not inherited; it is controlled solely by config. 
- let mut final_config = config.clone(); - final_config.etcd_addresses_raw = meta.etcd_addresses.clone(); - final_config.fluxonkv_spec.etcd_addresses = etcd_endpoints; - final_config.fluxonkv_spec.sub_cluster = meta.sub_cluster; - final_config.shared_memory_path = meta_shm_canonical; - final_config.shared_file_path = meta_file_canonical; - final_config + let bootstrapped_shared_meta = if is_side_transfer_worker { + Some(wait_for_external_bootstrap_bundle(&config).await?.meta) } else { - config + None }; if !is_external && config.test_spec_config.side_transfer_worker_count > 0 { @@ -1987,7 +1966,7 @@ async fn run_client_impl( port: None, metadata, local_ipc_root: cluster_manager_local_ipc_root( - &config.shared_memory_path, + &config.share_mem_path, &config.test_spec_config, ), rdma_control_init: rdma_control_init.clone(), @@ -2010,8 +1989,8 @@ async fn run_client_impl( test_spec_config: config.test_spec_config.clone(), }, external_client_api_arg: ExternalClientApiNewArg { - shared_memory_path: config.shared_memory_path.clone(), - shared_file_path: config.shared_file_path.clone(), + share_mem_path: config.share_mem_path.clone(), + large_file_paths: config.large_file_paths.clone(), expected_cluster_name: config.cluster_name.clone(), expected_protocol_version: build_version.clone(), enable_side_transfer: config.test_spec_config.enable_side_transfer, @@ -2033,7 +2012,7 @@ async fn run_client_impl( port: None, metadata, local_ipc_root: cluster_manager_local_ipc_root( - &config.shared_memory_path, + &config.share_mem_path, &config.test_spec_config, ), rdma_control_init, @@ -2061,8 +2040,8 @@ async fn run_client_impl( client_seg_pool_arg: ClientSegPoolNewArg { contribute_size: config.contribute_to_cluster_pool_size.clone(), // Read shared memory path from config (must not be empty). 
- shared_memory_path: config.shared_memory_path.clone(), - shared_file_path: config.shared_file_path.clone(), + share_mem_path: config.share_mem_path.clone(), + large_file_paths: config.large_file_paths.clone(), cluster_name: config.cluster_name.clone(), etcd_addresses: config.etcd_addresses_raw.clone(), attach_existing_meta: if is_side_transfer_worker { @@ -2214,7 +2193,7 @@ async fn run_client_impl( Ok(None) => { if is_side_transfer_worker_ready( cluster_manager, - &reconcile_owner_config.shared_memory_path, + &reconcile_owner_config.share_mem_path, &reconcile_owner_info.id, reconcile_owner_info.node_start_time, &worker.side_id, @@ -2405,8 +2384,10 @@ async fn run_client_impl( } let shutdown_waiter = framework.cluster_manager_view().register_shutdown_waiter(); - let kv_profiles_dir = Path::new(&config.shared_file_path) - .join(format!("{}_cluster_kv_profiles", config.cluster_name)); + let kv_profiles_dir = config + .large_file_paths + .kv_profiles_dir(&config.cluster_name) + .map_err(|e| anyhow::anyhow!("invalid large_file_paths for kv profiles: {}", e))?; profile::spawn_pprof_flamegraph_on_timeout_or_shutdown( config.pprof_duration_seconds, kv_profiles_dir, @@ -2483,8 +2464,10 @@ mod tests { enable_transfer_rpc_fast_path: true, sub_cluster: Some("owner-sub".to_string()), }, - shared_memory_path: "/tmp/fluxon_side_transfer_test".to_string(), - shared_file_path: "/tmp/fluxon_side_transfer_test_files".to_string(), + share_mem_path: "/tmp/fluxon_side_transfer_test".to_string(), + large_file_paths: crate::config::LargeFilePaths { + paths: vec!["/tmp/fluxon_side_transfer_test_large".to_string()], + }, test_spec_config: TestSpecConfig { enable_side_transfer: true, side_transfer_worker_count: 4, @@ -2550,11 +2533,11 @@ mod tests { #[test] fn derive_short_local_ipc_root_is_stable_for_canonical_path() { let tempdir = new_test_dir("fluxon_local_ipc_root_stable"); - let shared_memory_root = tempdir.join("owner_shm"); - std::fs::create_dir_all(&shared_memory_root).unwrap(); 
+ let share_mem_root = tempdir.join("owner_shm"); + std::fs::create_dir_all(&share_mem_root).unwrap(); - let canonical = std::fs::canonicalize(&shared_memory_root).unwrap(); - let alias_a = derive_short_local_ipc_root(shared_memory_root.to_str().unwrap()).unwrap(); + let canonical = std::fs::canonicalize(&share_mem_root).unwrap(); + let alias_a = derive_short_local_ipc_root(share_mem_root.to_str().unwrap()).unwrap(); let alias_b = derive_short_local_ipc_root(canonical.to_str().unwrap()).unwrap(); assert_eq!(alias_a, alias_b); @@ -2566,10 +2549,10 @@ mod tests { #[test] fn derive_short_local_ipc_root_keeps_iceoryx_event_path_short() { let tempdir = new_test_dir("fluxon_local_ipc_root_short"); - let shared_memory_root = tempdir.join( - "this_is_a_deliberately_long_shared_memory_root_name_for_iceoryx_socket_length_checks", + let share_mem_root = tempdir.join( + "this_is_a_deliberately_long_share_mem_root_name_for_iceoryx_socket_length_checks", ); - let alias = derive_short_local_ipc_root(shared_memory_root.to_str().unwrap()).unwrap(); + let alias = derive_short_local_ipc_root(share_mem_root.to_str().unwrap()).unwrap(); let example_event_path = format!("{}/iox2_254771654226413701181693419284.event", alias); assert!(Path::new(&alias).is_absolute()); @@ -2585,17 +2568,17 @@ mod tests { #[test] fn cluster_manager_local_ipc_root_respects_test_disable_switch() { let tempdir = new_test_dir("fluxon_local_ipc_root_disable_switch"); - let shared_memory_root = tempdir.join("owner_shm"); - std::fs::create_dir_all(&shared_memory_root).unwrap(); + let share_mem_root = tempdir.join("owner_shm"); + std::fs::create_dir_all(&share_mem_root).unwrap(); let enabled = cluster_manager_local_ipc_root( - shared_memory_root.to_str().unwrap(), + share_mem_root.to_str().unwrap(), &TestSpecConfig::default(), ); assert!(enabled.is_some()); let disabled = cluster_manager_local_ipc_root( - shared_memory_root.to_str().unwrap(), + share_mem_root.to_str().unwrap(), &TestSpecConfig { disable_local_ipc: 
true, ..Default::default() @@ -2720,6 +2703,7 @@ mod tests { ); assert!(side_cfg_yaml.contribute_to_cluster_pool_size.is_none()); assert!(side_cfg_yaml.fluxonkv_spec.etcd_addresses.is_none()); + assert!(side_cfg_yaml.fluxonkv_spec.large_file_paths.is_none()); assert!(side_cfg_yaml.fluxonkv_spec.sub_cluster.is_none()); assert_eq!(side_cfg_yaml.fluxonkv_spec.p2p_listen_port, Some(42001)); assert_eq!( @@ -2728,6 +2712,87 @@ mod tests { ); } + #[tokio::test] + async fn zero_contribution_bootstrap_inherits_large_file_paths_from_owner_shared_json() { + let tempdir = new_test_dir("fluxon_external_bootstrap_large_paths"); + let share_mem_root = tempdir.join("shared_mem"); + let owner_large_root = tempdir.join("owner_large"); + std::fs::create_dir_all(&share_mem_root).unwrap(); + std::fs::create_dir_all(&owner_large_root).unwrap(); + std::fs::write(share_mem_root.join("mmap.file"), vec![0u8; 4096]).unwrap(); + + let shared_meta = SharedJsonMeta { + owner_id: "owner-a".to_string(), + node_start_time: 123, + segment_len: 4096, + segment_label: Some("cpu:0".to_string()), + sub_cluster: Some("owner-sub".to_string()), + cluster_name: "test_cluster".to_string(), + etcd_addresses: vec!["127.0.0.1:2379".to_string()], + share_mem_path: std::fs::canonicalize(&share_mem_root) + .unwrap() + .to_string_lossy() + .into_owned(), + large_file_paths: crate::config::LargeFilePaths { + paths: vec![owner_large_root.to_string_lossy().into_owned()], + }, + protocol_version: + fluxon_util::git_version_build_record::get_current_git_commitid().unwrap(), + write_ts: Some(chrono::Utc::now().timestamp_micros()), + }; + let shared_meta_json = serde_json::to_string(&shared_meta).unwrap(); + assert!(shared_meta_json.contains("\"large_file_paths\":[")); + assert!(!shared_meta_json.contains("root_paths")); + std::fs::write( + share_mem_root.join("shared.json"), + shared_meta_json.as_bytes(), + ) + .unwrap(); + + let config = ClientConfig { + cluster_name: "test_cluster".to_string(), + etcd_addresses_raw: 
Vec::new(), + instance_key: "external-a".to_string(), + contribute_to_cluster_pool_size: ContributeToClusterPoolSize { + dram: 0, + vram: HashMap::new(), + }, + protocol: ProtocolConfig { + protocol_type: ProtocolType::Tcp, + rdma_device_names: None, + }, + pprof_duration_seconds: None, + redis_compat_listen_addr: None, + fluxonkv_spec: FluxonKvSpec { + etcd_addresses: Vec::new(), + cluster_name: "test_cluster".to_string(), + p2p_listen_port: Some(41001), + transfer_engine: TransferEngineType::P2p, + enable_transfer_rpc_fast_path: false, + sub_cluster: None, + }, + share_mem_path: share_mem_root.to_string_lossy().into_owned(), + large_file_paths: crate::config::LargeFilePaths { paths: Vec::new() }, + test_spec_config: TestSpecConfig::default(), + }; + + let bootstrapped = bootstrap_zero_contribution_client_config(config) + .await + .expect("bootstrap zero-contribution config"); + assert_eq!( + bootstrapped.large_file_paths.paths, + vec![owner_large_root.to_string_lossy().into_owned()] + ); + assert_eq!( + bootstrapped.fluxonkv_spec.sub_cluster, + Some("owner-sub".to_string()) + ); + assert_eq!( + bootstrapped.fluxonkv_spec.etcd_addresses, + vec!["http://127.0.0.1:2379".to_string()] + ); + } + #[test] fn current_exe_name_helpers_detect_python_and_fluxon_kv() { assert!(current_exe_looks_like_python(Path::new( diff --git a/fluxon_rs/fluxon_kv/src/memholder/memholder_test.rs b/fluxon_rs/fluxon_kv/src/memholder/memholder_test.rs index 377a1c2..7bc7a70 100644 --- a/fluxon_rs/fluxon_kv/src/memholder/memholder_test.rs +++ b/fluxon_rs/fluxon_kv/src/memholder/memholder_test.rs @@ -92,8 +92,10 @@ fn new_client_config_with_size( enable_transfer_rpc_fast_path: true, sub_cluster: None, }, - shared_memory_path: format!("/tmp/kvcache_shared_memory/{}", instance_key), - shared_file_path: format!("/tmp/kvcache_shared_files/{}", instance_key), + share_mem_path: format!("/tmp/kvcache_shared_memory/{}", instance_key), + large_file_paths: crate::config::LargeFilePaths { + paths: 
vec![format!("/tmp/kvcache_large/{}", instance_key)], + }, test_spec_config: TestSpecConfig::default(), } } @@ -125,8 +127,8 @@ fn new_zero_contribution_client_config( enable_transfer_rpc_fast_path: false, sub_cluster: None, }, - shared_memory_path: format!("/tmp/kvcache_shared_memory/{}", owner_instance_key), - shared_file_path: format!("/tmp/kvcache_shared_files/{}", owner_instance_key), + share_mem_path: format!("/tmp/kvcache_shared_memory/{}", owner_instance_key), + large_file_paths: crate::config::LargeFilePaths { paths: Vec::new() }, test_spec_config: TestSpecConfig::default(), } } @@ -415,7 +417,7 @@ pub mod test_memholder { sleep(Duration::from_secs(2)).await; let owner_name = "pin_owner"; - // 第二个 owner 必须使用不同的 member key(也会带来不同的 shared_memory_path) + // 第二个 owner 必须使用不同的 member key(也会带来不同的 share_mem_path) let owner2_name = "pin_owner2"; let (owner, _) = run_client(ConfigArg::Config(new_client_config_with_size( owner_name, diff --git a/fluxon_rs/fluxon_ops/Cargo.toml b/fluxon_rs/fluxon_ops/Cargo.toml index 0d54fc5..f4f772a 100644 --- a/fluxon_rs/fluxon_ops/Cargo.toml +++ b/fluxon_rs/fluxon_ops/Cargo.toml @@ -5,6 +5,7 @@ edition = "2024" [dependencies] anyhow = { workspace = true } +chrono = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } serde_yaml = { workspace = true } @@ -28,3 +29,6 @@ fluxon_framework = { path = "../fluxon_framework" } fluxon_util = { path = "../fluxon_util" } fluxon_cli = { path = "../fluxon_cli" } fluxon_proxy = { path = "../fluxon_proxy" } + +[dev-dependencies] +tempfile = { workspace = true } diff --git a/fluxon_rs/fluxon_ops/build.rs b/fluxon_rs/fluxon_ops/build.rs index ae424ef..585fbfc 100644 --- a/fluxon_rs/fluxon_ops/build.rs +++ b/fluxon_rs/fluxon_ops/build.rs @@ -58,14 +58,23 @@ print( String::from_utf8(output.stdout).expect("selection supervisor output must be utf-8") } +fn render_log_shard_helper(repo_root: &Path) -> String { + let helper_path = 
repo_root.join("deployment").join("utils").join("log_shard.py"); + fs::read_to_string(&helper_path) + .unwrap_or_else(|e| panic!("read log shard helper failed: {} ({})", helper_path.display(), e)) +} + fn main() { let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").expect("CARGO_MANIFEST_DIR")); let repo_root = repo_root(&manifest_dir); let source = render_selection_supervisor(&repo_root); + let log_shard_source = render_log_shard_helper(&repo_root); let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR")); let out_path = out_dir.join("selection_supervisor.py"); fs::write(&out_path, source).expect("write embedded selection supervisor source"); + let helper_out_path = out_dir.join("log_shard.py"); + fs::write(&helper_out_path, log_shard_source).expect("write embedded log shard helper"); println!("cargo:rerun-if-changed=build.rs"); println!( @@ -76,4 +85,8 @@ fn main() { .join("selection_supervisor_codegen.py") .display() ); + println!( + "cargo:rerun-if-changed={}", + repo_root.join("deployment").join("utils").join("log_shard.py").display() + ); } diff --git a/fluxon_rs/fluxon_ops/src/lib.rs b/fluxon_rs/fluxon_ops/src/lib.rs index 40f646a..29d9434 100644 --- a/fluxon_rs/fluxon_ops/src/lib.rs +++ b/fluxon_rs/fluxon_ops/src/lib.rs @@ -28,7 +28,8 @@ use fluxon_kv::{ConfigArg, Framework, run_client}; use fluxon_proxy::{HeaderKv, PanelProxyMethod, PanelProxyResp}; use fluxon_util::{ - FluxonCliProxyDescriptorV2, FluxonCliProxyTransportV2, fluxon_cli_proxy_desc_etcd_key_v2, + FluxonCliProxyDescriptorV2, FluxonCliProxyTransportV2, display_runtime_log_path, + fluxon_cli_proxy_desc_etcd_key_v2, resolve_readable_log_path, }; pub const OPS_SERVICE_NAME: &str = "ops"; @@ -57,6 +58,7 @@ const OPS_ATOMIC_GROUP_ANNOTATION_KEY: &str = "fluxon.io/atomic_group"; const OPS_ATOMIC_GROUP_PHASE_ANNOTATION_KEY: &str = "fluxon.io/atomic_group_phase"; const OPS_ATOMIC_GROUP_ORDER_ANNOTATION_KEY: &str = "fluxon.io/atomic_group_order"; const 
OPS_SELECTION_SUPERVISOR_FILENAME: &str = "selection_supervisor.py"; +const OPS_LOG_SHARD_HELPER_FILENAME: &str = "log_shard.py"; const OPS_SELECTION_SUPERVISOR_DIR_NAME: &str = "selection_supervisor"; const OPS_SELECTION_SUPERVISOR_RUN_RESTART_DELAY_SECONDS: u64 = 5; const OPS_SELECTION_SUPERVISOR_RUN_MAX_BACKOFF_SECONDS: u64 = 30; @@ -78,6 +80,7 @@ const DELETE_APPLY_NO_WAIT_DELAY_SECONDS: u64 = 30; const EMBEDDED_SELECTION_SUPERVISOR_SOURCE: &str = include_str!(concat!(env!("OUT_DIR"), "/selection_supervisor.py")); +const EMBEDDED_LOG_SHARD_HELPER_SOURCE: &str = include_str!(concat!(env!("OUT_DIR"), "/log_shard.py")); // Ops controller uses Fluxon user-RPC to talk to ops agents. // Keep the timeout as a fixed constant to avoid config surface area. @@ -225,6 +228,132 @@ fn workload_log_filename(kind: WorkloadKind, name: &str) -> anyhow::Result anyhow::Result { + let logical_name = logical_path + .file_name() + .and_then(|v| v.to_str()) + .ok_or_else(|| anyhow::anyhow!("logical log path must end with a utf-8 filename"))?; + let resolved_name = resolved_path + .file_name() + .and_then(|v| v.to_str()) + .ok_or_else(|| anyhow::anyhow!("resolved log path must end with a utf-8 filename"))?; + if resolved_name == logical_name { + return Ok("base".to_string()); + } + let stem = logical_name + .strip_suffix(".log") + .ok_or_else(|| anyhow::anyhow!("logical log filename must end with .log"))?; + let prefix = format!("{stem}."); + let suffix = ".log"; + if !resolved_name.starts_with(prefix.as_str()) || !resolved_name.ends_with(suffix) { + anyhow::bail!( + "resolved log path is not a recognized shard of logical log: logical={} resolved={}", + logical_name, + resolved_name + ); + } + let shard = &resolved_name[prefix.len()..resolved_name.len() - suffix.len()]; + if shard.is_empty() { + anyhow::bail!( + "resolved log shard identity is empty: logical={} resolved={}", + logical_name, + resolved_name + ); + } + Ok(shard.to_string()) +} + +fn 
workload_log_path_for_shard(logical_path: &Path, shard: &str) -> anyhow::Result { + if shard == "base" { + return Ok(logical_path.to_path_buf()); + } + let date = chrono::NaiveDate::parse_from_str(shard, "%Y-%m-%d") + .map_err(|e| anyhow::anyhow!("invalid workload log shard identity '{}': {}", shard, e))?; + fluxon_util::daily_sharded_log_path(logical_path, date) +} + +fn workload_log_existing_shards(logical_path: &Path) -> anyhow::Result> { + let mut dated_shards = Vec::new(); + let mut has_base = false; + let parent = logical_path.parent().unwrap_or_else(|| Path::new(".")); + let logical_name = logical_path + .file_name() + .and_then(|v| v.to_str()) + .ok_or_else(|| anyhow::anyhow!("logical log path must end with a utf-8 filename"))?; + let stem = logical_name + .strip_suffix(".log") + .ok_or_else(|| anyhow::anyhow!("logical log filename must end with .log"))?; + let prefix = format!("{stem}."); + let suffix = ".log"; + let entries = std::fs::read_dir(parent)?; + for entry in entries { + let entry = entry?; + let path = entry.path(); + if !path.is_file() { + continue; + } + let entry_name = entry.file_name(); + let Some(entry_name) = entry_name.to_str() else { + continue; + }; + if entry_name == logical_name { + has_base = true; + continue; + } + if !entry_name.starts_with(prefix.as_str()) || !entry_name.ends_with(suffix) { + continue; + } + if entry_name.len() <= prefix.len() + suffix.len() { + continue; + } + let shard = &entry_name[prefix.len()..entry_name.len() - suffix.len()]; + if chrono::NaiveDate::parse_from_str(shard, "%Y-%m-%d").is_ok() { + dated_shards.push(shard.to_string()); + } + } + dated_shards.sort(); + dated_shards.dedup(); + if !dated_shards.is_empty() { + return Ok(dated_shards); + } + if has_base { + return Ok(vec!["base".to_string()]); + } + Ok(Vec::new()) +} + +fn workload_log_previous_shard(logical_path: &Path, shard: &str) -> anyhow::Result> { + let shards = workload_log_existing_shards(logical_path)?; + let Some(idx) = 
shards.iter().position(|v| v == shard) else { + return Ok(None); + }; + if idx == 0 { + return Ok(None); + } + Ok(Some(shards[idx - 1].clone())) +} + +fn workload_log_next_shard(logical_path: &Path, shard: &str) -> anyhow::Result> { + let shards = workload_log_existing_shards(logical_path)?; + let Some(idx) = shards.iter().position(|v| v == shard) else { + return Ok(None); + }; + if idx + 1 >= shards.len() { + return Ok(None); + } + Ok(Some(shards[idx + 1].clone())) +} + +fn workload_log_latest_shard_identity(logical_path: &Path) -> anyhow::Result> { + let Some(path) = resolve_readable_log_path(logical_path) else { + return Ok(None); + }; + Ok(Some(workload_log_shard_identity_from_path(logical_path, &path)?)) +} + #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)] #[serde(deny_unknown_fields)] struct WorkloadId { @@ -577,7 +706,7 @@ struct ReadWorkloadLogReq { name: String, direction: LogReadDirection, #[serde(skip_serializing_if = "Option::is_none")] - cursor: Option, + cursor: Option, // Contract: // - max_bytes may be omitted to mean "unlimited" (no byte cap). // - This supports ad-hoc debugging where the caller wants the full log without knowing file_size up-front. 
@@ -599,9 +728,20 @@ struct ReadWorkloadLogResp { #[serde(skip_serializing_if = "Option::is_none")] end_offset: Option, #[serde(skip_serializing_if = "Option::is_none")] + start_cursor: Option, + #[serde(skip_serializing_if = "Option::is_none")] + end_cursor: Option, + #[serde(skip_serializing_if = "Option::is_none")] text: Option, } +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +struct WorkloadLogCursor { + shard: String, + offset: u64, +} + fn ensure_positive_u64(v: u64, field: &str) -> KvResult { if v == 0 { return Err(KvError::Api(ApiError::InvalidArgument { @@ -970,7 +1110,7 @@ fn resolve_python_host_executable(python_exe: &Path) -> anyhow::Result Ok(resolved) } -fn ensure_embedded_selection_supervisor(workdir: &Path) -> anyhow::Result { +fn ensure_embedded_selection_supervisor_runtime(workdir: &Path) -> anyhow::Result<(PathBuf, PathBuf)> { let runtime_dir = workdir.join(OPS_SELECTION_SUPERVISOR_DIR_NAME); std::fs::create_dir_all(&runtime_dir).with_context(|| { format!( @@ -979,6 +1119,7 @@ fn ensure_embedded_selection_supervisor(workdir: &Path) -> anyhow::Result existing != EMBEDDED_SELECTION_SUPERVISOR_SOURCE, Err(e) => { @@ -992,6 +1133,19 @@ fn ensure_embedded_selection_supervisor(workdir: &Path) -> anyhow::Result existing != EMBEDDED_LOG_SHARD_HELPER_SOURCE, + Err(e) => { + if e.kind() == std::io::ErrorKind::NotFound { + true + } else { + return Err(anyhow::Error::new(e).context(format!( + "read embedded log shard helper failed: {}", + helper_path.display() + ))); + } + } + }; if should_write { std::fs::write(&script_path, EMBEDDED_SELECTION_SUPERVISOR_SOURCE).with_context(|| { format!( @@ -1019,13 +1173,21 @@ fn ensure_embedded_selection_supervisor(workdir: &Path) -> anyhow::Result anyhow::Result { let python_exe = resolve_python_host_executable(python_exe)?; - let script_path = ensure_embedded_selection_supervisor(workdir)?; + let (script_path, _helper_path) = 
ensure_embedded_selection_supervisor_runtime(workdir)?; if !hostworkdir.is_absolute() { anyhow::bail!( "hostworkdir must be absolute for shared selection supervisor runtime: {}", @@ -1647,7 +1809,9 @@ fn selection_status_from_live_supervisor( apply_id: runtime_state.as_ref().and_then(|v| v.apply_id.clone()), argv: runtime_state.as_ref().map(|v| v.argv.clone()), cwd: runtime_state.as_ref().and_then(|v| v.cwd.clone()), - log_path: runtime_state.as_ref().map(|v| v.log_path.clone()), + log_path: runtime_state + .as_ref() + .map(|v| display_runtime_log_path(v.log_path.as_str())), started_ts_ms: None, owner_ts_ms: Some(supervisor.owner_ts_ms), supervisor_start_time_ticks: Some(supervisor.start_time_ticks()), @@ -2964,117 +3128,268 @@ impl UserRpcHandler for ReadWorkloadLogChunkHandler { file_size: None, start_offset: None, end_offset: None, + start_cursor: None, + end_cursor: None, text: None, }; return Ok(serde_json::to_vec(&resp).unwrap()); } }; - let path = self.log_dir.join(log_filename); + let logical_path = self.log_dir.join(log_filename); + let make_err_resp = |err: String, file_size: Option| ReadWorkloadLogResp { + ok: false, + err: Some(err), + file_size, + start_offset: None, + end_offset: None, + start_cursor: None, + end_cursor: None, + text: None, + }; + + let (path, shard) = match req.cursor.as_ref() { + Some(cursor) => match workload_log_path_for_shard(&logical_path, &cursor.shard) { + Ok(path) => (path, cursor.shard.clone()), + Err(e) => { + let resp = make_err_resp(format!("{}", e), None); + return Ok(serde_json::to_vec(&resp).unwrap()); + } + }, + None => { + let Some(path) = resolve_readable_log_path(&logical_path) else { + let resp = make_err_resp( + format!("log file is not available yet: logical_path={}", logical_path.display()), + None, + ); + return Ok(serde_json::to_vec(&resp).unwrap()); + }; + let shard = match workload_log_shard_identity_from_path(&logical_path, &path) { + Ok(v) => v, + Err(e) => { + let resp = make_err_resp(format!("{}", e), 
None); + return Ok(serde_json::to_vec(&resp).unwrap()); + } + }; + (path, shard) + } + }; let meta = match std::fs::metadata(&path) { Ok(v) => v, Err(e) => { - let resp = ReadWorkloadLogResp { - ok: false, - err: Some(format!( - "stat log failed: path={} err={}", - path.display(), - e - )), - file_size: None, - start_offset: None, - end_offset: None, - text: None, - }; + let resp = make_err_resp( + format!("stat log failed: path={} err={}", path.display(), e), + None, + ); return Ok(serde_json::to_vec(&resp).unwrap()); } }; let file_size = meta.len(); - let (start, end) = match req.direction { - LogReadDirection::Forward => { - if let Some(cursor) = req.cursor { - if cursor > file_size { - let resp = ReadWorkloadLogResp { - ok: false, - err: Some(format!( - "cursor out of range: cursor={} file_size={}", - cursor, file_size - )), - file_size: Some(file_size), - start_offset: None, - end_offset: None, - text: None, + let (start, end, start_cursor, end_cursor, effective_path, effective_file_size) = + match req.direction { + LogReadDirection::Forward => { + if let Some(cursor) = req.cursor.as_ref() { + if cursor.offset > file_size { + let resp = make_err_resp( + format!( + "cursor out of range: shard={} cursor={} file_size={}", + cursor.shard, cursor.offset, file_size + ), + Some(file_size), + ); + return Ok(serde_json::to_vec(&resp).unwrap()); + } + let mut effective_path = path.clone(); + let mut effective_shard = shard.clone(); + let mut effective_file_size = file_size; + let mut start = cursor.offset; + if cursor.offset == file_size { + if let Ok(Some(next_shard)) = + workload_log_next_shard(&logical_path, &cursor.shard) + { + let next_path = match workload_log_path_for_shard(&logical_path, &next_shard) { + Ok(v) => v, + Err(e) => { + let resp = make_err_resp(format!("{}", e), Some(file_size)); + return Ok(serde_json::to_vec(&resp).unwrap()); + } + }; + match std::fs::metadata(&next_path) { + Ok(next_meta) => { + effective_file_size = next_meta.len(); + 
effective_path = next_path; + effective_shard = next_shard; + start = 0; + } + Err(e) => { + let resp = make_err_resp( + format!( + "stat next log shard failed: path={} err={}", + next_path.display(), + e + ), + Some(file_size), + ); + return Ok(serde_json::to_vec(&resp).unwrap()); + } + } + } else if let Ok(Some(latest_shard)) = + workload_log_latest_shard_identity(&logical_path) + { + if latest_shard != cursor.shard { + let latest_path = + match workload_log_path_for_shard(&logical_path, &latest_shard) { + Ok(v) => v, + Err(e) => { + let resp = make_err_resp(format!("{}", e), Some(file_size)); + return Ok(serde_json::to_vec(&resp).unwrap()); + } + }; + match std::fs::metadata(&latest_path) { + Ok(latest_meta) => { + effective_file_size = latest_meta.len(); + effective_path = latest_path; + effective_shard = latest_shard; + start = 0; + } + Err(e) => { + let resp = make_err_resp( + format!( + "stat latest log shard failed: path={} err={}", + latest_path.display(), + e + ), + Some(file_size), + ); + return Ok(serde_json::to_vec(&resp).unwrap()); + } + } + } + } + } + let end = match max_bytes { + Some(max_bytes) => { + std::cmp::min(effective_file_size, start.saturating_add(max_bytes)) + } + None => effective_file_size, }; + ( + start, + end, + Some(WorkloadLogCursor { + shard: effective_shard.clone(), + offset: start, + }), + Some(WorkloadLogCursor { + shard: effective_shard.clone(), + offset: end, + }), + effective_path, + effective_file_size, + ) + } else { + let end = file_size; + let start = match max_bytes { + Some(max_bytes) => end.saturating_sub(max_bytes), + None => 0, + }; + ( + start, + end, + Some(WorkloadLogCursor { + shard: shard.clone(), + offset: start, + }), + Some(WorkloadLogCursor { + shard: shard.clone(), + offset: end, + }), + path.clone(), + file_size, + ) + } + } + LogReadDirection::Backward => { + let Some(cursor) = req.cursor.as_ref() else { + let resp = make_err_resp( + "cursor is required for Backward reads".to_string(), + 
Some(file_size), + ); + return Ok(serde_json::to_vec(&resp).unwrap()); + }; + if cursor.offset > file_size { + let resp = make_err_resp( + format!( + "cursor out of range: shard={} cursor={} file_size={}", + cursor.shard, cursor.offset, file_size + ), + Some(file_size), + ); return Ok(serde_json::to_vec(&resp).unwrap()); } - let start = cursor; - let end = match max_bytes { - Some(max_bytes) => { - std::cmp::min(file_size, start.saturating_add(max_bytes)) + let mut effective_path = path.clone(); + let mut effective_shard = shard.clone(); + let mut effective_file_size = file_size; + let mut end = cursor.offset; + if cursor.offset == 0 { + if let Ok(Some(prev_shard)) = + workload_log_previous_shard(&logical_path, &cursor.shard) + { + let prev_path = match workload_log_path_for_shard(&logical_path, &prev_shard) { + Ok(v) => v, + Err(e) => { + let resp = make_err_resp(format!("{}", e), Some(file_size)); + return Ok(serde_json::to_vec(&resp).unwrap()); + } + }; + match std::fs::metadata(&prev_path) { + Ok(prev_meta) => { + effective_file_size = prev_meta.len(); + effective_path = prev_path; + effective_shard = prev_shard; + end = effective_file_size; + } + Err(e) => { + let resp = make_err_resp( + format!( + "stat previous log shard failed: path={} err={}", + prev_path.display(), + e + ), + Some(file_size), + ); + return Ok(serde_json::to_vec(&resp).unwrap()); + } + } } - None => file_size, - }; - (start, end) - } else { - // Tail: - // - max_bytes=Some => return the last max_bytes bytes. - // - max_bytes=None => return the whole file. 
- let end = file_size; + } let start = match max_bytes { Some(max_bytes) => end.saturating_sub(max_bytes), None => 0, }; - (start, end) - } - } - LogReadDirection::Backward => { - let Some(cursor) = req.cursor else { - let resp = ReadWorkloadLogResp { - ok: false, - err: Some("cursor is required for Backward reads".to_string()), - file_size: Some(file_size), - start_offset: None, - end_offset: None, - text: None, - }; - return Ok(serde_json::to_vec(&resp).unwrap()); - }; - if cursor > file_size { - let resp = ReadWorkloadLogResp { - ok: false, - err: Some(format!( - "cursor out of range: cursor={} file_size={}", - cursor, file_size - )), - file_size: Some(file_size), - start_offset: None, - end_offset: None, - text: None, - }; - return Ok(serde_json::to_vec(&resp).unwrap()); + ( + start, + end, + Some(WorkloadLogCursor { + shard: effective_shard.clone(), + offset: start, + }), + Some(WorkloadLogCursor { + shard: effective_shard.clone(), + offset: end, + }), + effective_path, + effective_file_size, + ) } - let end = cursor; - let start = match max_bytes { - Some(max_bytes) => end.saturating_sub(max_bytes), - None => 0, - }; - (start, end) - } - }; + }; if end < start { - let resp = ReadWorkloadLogResp { - ok: false, - err: Some(format!( - "internal error: end < start: start={} end={}", - start, end - )), - file_size: Some(file_size), - start_offset: None, - end_offset: None, - text: None, - }; + let resp = make_err_resp( + format!("internal error: end < start: start={} end={}", start, end), + Some(effective_file_size), + ); return Ok(serde_json::to_vec(&resp).unwrap()); } @@ -3086,70 +3401,42 @@ impl UserRpcHandler for ReadWorkloadLogChunkHandler { })?; if let Some(max_bytes_usize) = max_bytes_usize { if len > max_bytes_usize { - let resp = ReadWorkloadLogResp { - ok: false, - err: Some(format!( + let resp = make_err_resp( + format!( "internal error: computed read_len exceeds max_bytes: read_len={} max_bytes={}", len, max_bytes_usize - )), - file_size: 
Some(file_size), - start_offset: None, - end_offset: None, - text: None, - }; + ), + Some(effective_file_size), + ); return Ok(serde_json::to_vec(&resp).unwrap()); } } - let mut f = match std::fs::File::open(&path) { + let mut f = match std::fs::File::open(&effective_path) { Ok(v) => v, Err(e) => { - let resp = ReadWorkloadLogResp { - ok: false, - err: Some(format!( - "open log failed: path={} err={}", - path.display(), - e - )), - file_size: Some(file_size), - start_offset: None, - end_offset: None, - text: None, - }; + let resp = make_err_resp( + format!("open log failed: path={} err={}", effective_path.display(), e), + Some(effective_file_size), + ); return Ok(serde_json::to_vec(&resp).unwrap()); } }; if let Err(e) = std::io::Seek::seek(&mut f, std::io::SeekFrom::Start(start)) { - let resp = ReadWorkloadLogResp { - ok: false, - err: Some(format!( - "seek log failed: path={} err={}", - path.display(), - e - )), - file_size: Some(file_size), - start_offset: None, - end_offset: None, - text: None, - }; + let resp = make_err_resp( + format!("seek log failed: path={} err={}", effective_path.display(), e), + Some(effective_file_size), + ); return Ok(serde_json::to_vec(&resp).unwrap()); } let mut buf: Vec = vec![0; len]; if let Err(e) = std::io::Read::read_exact(&mut f, &mut buf) { - let resp = ReadWorkloadLogResp { - ok: false, - err: Some(format!( - "read log failed: path={} err={}", - path.display(), - e - )), - file_size: Some(file_size), - start_offset: None, - end_offset: None, - text: None, - }; + let resp = make_err_resp( + format!("read log failed: path={} err={}", effective_path.display(), e), + Some(effective_file_size), + ); return Ok(serde_json::to_vec(&resp).unwrap()); } @@ -3162,9 +3449,11 @@ impl UserRpcHandler for ReadWorkloadLogChunkHandler { let resp = ReadWorkloadLogResp { ok: true, err: None, - file_size: Some(file_size), + file_size: Some(effective_file_size), start_offset: Some(start), end_offset: Some(end), + start_cursor, + end_cursor, text: 
Some(text), }; Ok(serde_json::to_vec(&resp).unwrap()) @@ -3773,8 +4062,12 @@ fn desired_workload_matches_running( workloads: &SupervisorBackedWorkloads, desired: &AgentDesiredWorkload, ) -> bool { - let _ = workloads; - let Ok(status) = observe_selection_status(desired.kind, &desired.name, &desired.authority) + let Ok(status) = observe_selection_status_for_scope( + desired.kind, + &desired.name, + &desired.authority, + Some(workloads.scope_key.as_str()), + ) else { return false; }; @@ -3854,7 +4147,6 @@ fn desired_workload_recovery_superseded( workloads: &SupervisorBackedWorkloads, desired: &AgentDesiredWorkload, ) -> anyhow::Result { - let _ = workloads; // English note: // - A newer apply-owned generation overlapping an older applyless bare owner is the expected // phase-1 state of the self-host two-phase handover. @@ -3863,7 +4155,12 @@ fn desired_workload_recovery_superseded( // phase 2 has a chance to cut over. // - Only an owner_ts that is newer than the requested workload and is not this intentional // phase-1 overlap is treated as a hard superseding fact. 
- let status = observe_selection_status(desired.kind, &desired.name, &desired.authority)?; + let status = observe_selection_status_for_scope( + desired.kind, + &desired.name, + &desired.authority, + Some(workloads.scope_key.as_str()), + )?; if phase1_overlap_with_applyless_owner(&status, desired) { return Ok(false); } @@ -8355,20 +8652,22 @@ async function deleteGenerationFromControl() { const LOG_DIR_FORWARD = 'Forward'; const LOG_DIR_BACKWARD = 'Backward'; - let workloadLogTimer = null; - let workloadLogSelection = { instanceKey: '', kind: '', name: '' }; - let workloadLogStartOffset = 0; - let workloadLogEndOffset = 0; - let workloadLogLoadingOlder = false; - let workloadLogAnsiState = newAnsiSgrState(); + let workloadLogTimer = null; + let workloadLogSelection = { instanceKey: '', kind: '', name: '' }; + let workloadLogStartOffset = 0; + let workloadLogEndOffset = 0; + let workloadLogStartCursor = null; + let workloadLogEndCursor = null; + let workloadLogLoadingOlder = false; + let workloadLogAnsiState = newAnsiSgrState(); function isWorkloadLogFollowEnabled() { const cb = document.getElementById('workload_log_follow'); return cb && cb.checked === true; } - function setWorkloadLogHeader() { - const h = document.getElementById('workload_log_header'); + function setWorkloadLogHeader() { + const h = document.getElementById('workload_log_header'); const ik = workloadLogSelection.instanceKey || ''; const kind = workloadLogSelection.kind || ''; const name = workloadLogSelection.name || ''; @@ -8376,10 +8675,14 @@ async function deleteGenerationFromControl() { if (!ik || !kind || !name) { h.textContent = 'No log selected. Click "Logs" in the table.'; return; - } - h.textContent = 'instance_key=' + ik + ' workload=' + kind + '/' + name - + ' range=[' + String(workloadLogStartOffset) + ',' + String(workloadLogEndOffset) + ')'; - } + } + const shardText = (workloadLogEndCursor && workloadLogEndCursor.shard) + ? 
String(workloadLogEndCursor.shard) + : ((workloadLogStartCursor && workloadLogStartCursor.shard) ? String(workloadLogStartCursor.shard) : '-'); + h.textContent = 'instance_key=' + ik + ' workload=' + kind + '/' + name + + ' shard=' + shardText + + ' range=[' + String(workloadLogStartOffset) + ',' + String(workloadLogEndOffset) + ')'; + } function stopWorkloadLogTail() { if (workloadLogTimer != null) { @@ -8388,12 +8691,14 @@ async function deleteGenerationFromControl() { } } - function clearWorkloadLogView() { - stopWorkloadLogTail(); - workloadLogStartOffset = 0; - workloadLogEndOffset = 0; - workloadLogLoadingOlder = false; - workloadLogAnsiState = newAnsiSgrState(); + function clearWorkloadLogView() { + stopWorkloadLogTail(); + workloadLogStartOffset = 0; + workloadLogEndOffset = 0; + workloadLogStartCursor = null; + workloadLogEndCursor = null; + workloadLogLoadingOlder = false; + workloadLogAnsiState = newAnsiSgrState(); setWorkloadLogHeader(); const pre = document.getElementById('workload_log_out'); if (pre) { pre.textContent = '(empty)'; } @@ -8449,11 +8754,13 @@ async function deleteGenerationFromControl() { return; } - const txt = (v.text != null) ? String(v.text) : ''; - workloadLogStartOffset = (v.start_offset != null) ? Number(v.start_offset) : 0; - workloadLogEndOffset = (v.end_offset != null) ? Number(v.end_offset) : workloadLogStartOffset; - workloadLogLoadingOlder = false; - setWorkloadLogHeader(); + const txt = (v.text != null) ? String(v.text) : ''; + workloadLogStartOffset = (v.start_offset != null) ? Number(v.start_offset) : 0; + workloadLogEndOffset = (v.end_offset != null) ? Number(v.end_offset) : workloadLogStartOffset; + workloadLogStartCursor = (v.start_cursor != null) ? v.start_cursor : null; + workloadLogEndCursor = (v.end_cursor != null) ? 
v.end_cursor : null; + workloadLogLoadingOlder = false; + setWorkloadLogHeader(); if (pre) { const r = ansiSgrToHtmlChunkWithState(txt, workloadLogAnsiState); @@ -8473,8 +8780,8 @@ async function deleteGenerationFromControl() { return; } - const v2 = await fetchWorkloadLogChunk(LOG_DIR_FORWARD, workloadLogEndOffset); - if (!v2 || v2.ok !== true) { + const v2 = await fetchWorkloadLogChunk(LOG_DIR_FORWARD, workloadLogEndCursor); + if (!v2 || v2.ok !== true) { // Keep the existing view; update the header so operators see the error. const h = document.getElementById('workload_log_header'); if (h) { @@ -8482,12 +8789,15 @@ async function deleteGenerationFromControl() { h.textContent = 'log tail ERROR: ' + err; } return; - } - const txt2 = (v2.text != null) ? String(v2.text) : ''; - const newEnd = (v2.end_offset != null) ? Number(v2.end_offset) : workloadLogEndOffset; - if (newEnd < workloadLogEndOffset) { - const h = document.getElementById('workload_log_header'); - if (h) { + } + const txt2 = (v2.text != null) ? String(v2.text) : ''; + const newEnd = (v2.end_offset != null) ? Number(v2.end_offset) : workloadLogEndOffset; + const newEndCursor = (v2.end_cursor != null) ? 
v2.end_cursor : workloadLogEndCursor; + const sameShard = workloadLogEndCursor && newEndCursor + && workloadLogEndCursor.shard === newEndCursor.shard; + if (sameShard && newEnd < workloadLogEndOffset) { + const h = document.getElementById('workload_log_header'); + if (h) { h.textContent = 'log tail ERROR: end_offset moved backwards (file truncated/rotated?)' + ' old_end=' + String(workloadLogEndOffset) + ' new_end=' + String(newEnd); @@ -8496,21 +8806,26 @@ async function deleteGenerationFromControl() { return; } - if (txt2.length > 0) { - const r2 = ansiSgrToHtmlChunkWithState(txt2, workloadLogAnsiState); - workloadLogAnsiState = r2.state; - pre2.insertAdjacentHTML('beforeend', r2.html); - workloadLogEndOffset = newEnd; - setWorkloadLogHeader(); + if (txt2.length > 0) { + const r2 = ansiSgrToHtmlChunkWithState(txt2, workloadLogAnsiState); + workloadLogAnsiState = r2.state; + pre2.insertAdjacentHTML('beforeend', r2.html); + workloadLogEndOffset = newEnd; + workloadLogEndCursor = newEndCursor; + if (v2.start_cursor != null && workloadLogStartCursor == null) { + workloadLogStartCursor = v2.start_cursor; + } + setWorkloadLogHeader(); if (follow || atBottom) { pre2.scrollTop = pre2.scrollHeight; } - } else { - workloadLogEndOffset = newEnd; - setWorkloadLogHeader(); - } - }, WORKLOAD_LOG_POLL_INTERVAL_MS); - } + } else { + workloadLogEndOffset = newEnd; + workloadLogEndCursor = newEndCursor; + setWorkloadLogHeader(); + } + }, WORKLOAD_LOG_POLL_INTERVAL_MS); + } async function loadOlderWorkloadLog() { if (workloadLogLoadingOlder) { return; } @@ -8520,39 +8835,46 @@ async function deleteGenerationFromControl() { pre.textContent = 'ERROR: no log selected.'; return; } - if (workloadLogStartOffset <= 0) { - return; - } - workloadLogLoadingOlder = true; - const beforeHeight = pre.scrollHeight; - const v = await fetchWorkloadLogChunk(LOG_DIR_BACKWARD, workloadLogStartOffset); - if (!v || v.ok !== true) { + if (workloadLogStartCursor == null) { + return; + } + 
workloadLogLoadingOlder = true; + const beforeHeight = pre.scrollHeight; + const v = await fetchWorkloadLogChunk(LOG_DIR_BACKWARD, workloadLogStartCursor); + if (!v || v.ok !== true) { const err = v && v.err ? String(v.err) : 'unknown error'; pre.insertAdjacentText('afterbegin', 'ERROR: ' + err + '\n\n'); workloadLogLoadingOlder = false; return; - } - const txt = (v.text != null) ? String(v.text) : ''; - const newStart = (v.start_offset != null) ? Number(v.start_offset) : workloadLogStartOffset; - if (txt.length > 0) { + } + const txt = (v.text != null) ? String(v.text) : ''; + const newStart = (v.start_offset != null) ? Number(v.start_offset) : workloadLogStartOffset; + const newStartCursor = (v.start_cursor != null) ? v.start_cursor : workloadLogStartCursor; + if (txt.length > 0) { // English note: prepend is stateless; this is best-effort because boundary SGR state // cannot be re-applied to already-rendered newer content. pre.insertAdjacentHTML('afterbegin', ansiSgrToHtmlChunkStateless(txt)); - } - workloadLogStartOffset = newStart; - setWorkloadLogHeader(); + } + workloadLogStartOffset = newStart; + workloadLogStartCursor = newStartCursor; + if (v.end_cursor != null) { + workloadLogEndCursor = workloadLogEndCursor || v.end_cursor; + } + setWorkloadLogHeader(); const afterHeight = pre.scrollHeight; pre.scrollTop = (afterHeight - beforeHeight) + pre.scrollTop; workloadLogLoadingOlder = false; } - function openWorkloadLog(instanceKey, kind, name) { - workloadLogSelection = { instanceKey: String(instanceKey || ''), kind: String(kind || ''), name: String(name || '') }; - workloadLogStartOffset = 0; - workloadLogEndOffset = 0; - workloadLogLoadingOlder = false; - setWorkloadLogHeader(); + function openWorkloadLog(instanceKey, kind, name) { + workloadLogSelection = { instanceKey: String(instanceKey || ''), kind: String(kind || ''), name: String(name || '') }; + workloadLogStartOffset = 0; + workloadLogEndOffset = 0; + workloadLogStartCursor = null; + 
workloadLogEndCursor = null; + workloadLogLoadingOlder = false; + setWorkloadLogHeader(); const pre = document.getElementById('workload_log_out'); if (pre) { pre.textContent = 'Selected. Click Tail to start.'; } startWorkloadLogTail(); @@ -10301,7 +10623,7 @@ struct WorkloadLogHttpReq { kind: WorkloadKind, name: String, direction: LogReadDirection, - cursor: Option, + cursor: Option, #[serde(skip_serializing_if = "Option::is_none")] max_bytes: Option, } @@ -10319,6 +10641,8 @@ async fn handle_workload_log( file_size: None, start_offset: None, end_offset: None, + start_cursor: None, + end_cursor: None, text: None, }; return Ok(response_json(StatusCode::BAD_REQUEST, &resp)); @@ -10335,6 +10659,8 @@ async fn handle_workload_log( file_size: None, start_offset: None, end_offset: None, + start_cursor: None, + end_cursor: None, text: None, }; return Ok(response_json(StatusCode::BAD_REQUEST, &resp)); @@ -10379,6 +10705,8 @@ async fn handle_workload_log( file_size: None, start_offset: None, end_offset: None, + start_cursor: None, + end_cursor: None, text: None, }; Ok(response_json(StatusCode::BAD_GATEWAY, &resp)) @@ -13938,6 +14266,90 @@ mod tests { assert!(err_text.contains("owner_ts_ms collision"), "{err_text}"); } + #[test] + fn live_selection_supervisors_isolate_same_label_collision_by_scope_key() { + let snapshot = SelectionSupervisorProcSnapshot { + infos_by_pid: std::collections::HashMap::from([ + ( + 11, + ProcessInfoObservation { + pid: 11, + ppid: 1, + pgid: 11, + state: 'S', + start_time_ticks: 100, + }, + ), + ( + 22, + ProcessInfoObservation { + pid: 22, + ppid: 1, + pgid: 22, + state: 'S', + start_time_ticks: 200, + }, + ), + ]), + children_by_ppid: std::collections::HashMap::new(), + cmdlines: vec![ + ( + 11, + vec![ + "/usr/bin/python3".to_string(), + "selection_supervisor.py".to_string(), + "run".to_string(), + "--label".to_string(), + "DaemonSet/target".to_string(), + "--scope-key".to_string(), + "/tmp/scope-a".to_string(), + "--owner-ts-ms".to_string(), + 
"2".to_string(), + ], + ), + ( + 22, + vec![ + "/usr/bin/python3".to_string(), + "selection_supervisor.py".to_string(), + "run".to_string(), + "--label".to_string(), + "DaemonSet/target".to_string(), + "--scope-key".to_string(), + "/tmp/scope-b".to_string(), + "--owner-ts-ms".to_string(), + "2".to_string(), + ], + ), + ], + zombie_infos: Vec::new(), + }; + + let scoped_a = + live_selection_supervisors(&snapshot, Some("DaemonSet/target"), Some("/tmp/scope-a")) + .unwrap(); + assert_eq!(scoped_a.len(), 1); + assert_eq!(scoped_a[0].pid(), 11); + + let scoped_b = + live_selection_supervisors(&snapshot, Some("DaemonSet/target"), Some("/tmp/scope-b")) + .unwrap(); + assert_eq!(scoped_b.len(), 1); + assert_eq!(scoped_b[0].pid(), 22); + + let listed_a = observe_all_selection_statuses_for_snapshot(&snapshot, Some("/tmp/scope-a")) + .unwrap(); + assert_eq!(listed_a.len(), 1); + assert_eq!(listed_a[0].label, "DaemonSet/target"); + assert_eq!(listed_a[0].pid, Some(11)); + + let listed_b = observe_all_selection_statuses_for_snapshot(&snapshot, Some("/tmp/scope-b")) + .unwrap(); + assert_eq!(listed_b.len(), 1); + assert_eq!(listed_b[0].label, "DaemonSet/target"); + assert_eq!(listed_b[0].pid, Some(22)); + } + #[test] fn live_selection_supervisors_reject_matching_legacy_entry_without_owner_ts_ms() { let snapshot = SelectionSupervisorProcSnapshot { @@ -14405,6 +14817,95 @@ mod tests { .unwrap(); } + #[test] + fn materialize_selection_supervisor_runtime_writes_log_shard_helper() { + let python_exe = PathBuf::from("/usr/bin/python3"); + assert!( + python_exe.is_file(), + "python executable does not exist: {}", + python_exe.display() + ); + let workdir = tempfile::tempdir().unwrap(); + let runtime = + SelectionSupervisorRuntime::materialize(workdir.path(), workdir.path(), python_exe.as_path()) + .unwrap(); + assert!(runtime.script_path.exists()); + assert!( + runtime + .script_path + .parent() + .unwrap() + .join(OPS_LOG_SHARD_HELPER_FILENAME) + .is_file() + ); + } + + #[test] + fn 
detached_selection_supervisor_preserves_early_startup_logs() { + let python_exe = PathBuf::from("/usr/bin/python3"); + assert!( + python_exe.is_file(), + "python executable does not exist: {}", + python_exe.display() + ); + let workdir = tempfile::tempdir().unwrap(); + let runtime = + SelectionSupervisorRuntime::materialize(workdir.path(), workdir.path(), python_exe.as_path()) + .unwrap(); + let log_path = workdir.path().join("startup.log"); + let command = vec![ + python_exe.display().to_string(), + runtime.script_path.display().to_string(), + "run".to_string(), + "--label".to_string(), + "Deployment/startup_demo".to_string(), + "--scope-key".to_string(), + workdir.path().display().to_string(), + "--owner-ts-ms".to_string(), + "0".to_string(), + "--restart-policy".to_string(), + "always".to_string(), + "--restart-delay-seconds".to_string(), + "5".to_string(), + "--max-backoff-seconds".to_string(), + "30".to_string(), + "--crashloop-consecutive-restarts".to_string(), + "0".to_string(), + "--crashloop-interval-lt-seconds".to_string(), + "0".to_string(), + "--".to_string(), + "/bin/true".to_string(), + ]; + let pid = runtime.spawn_detached_command(&log_path, command.as_slice()).unwrap(); + let deadline = Instant::now() + Duration::from_secs(10); + let expected = "owner-ts-ms must be positive"; + let mut saw_expected = false; + while Instant::now() < deadline { + if let Some(path) = resolve_readable_log_path(&log_path) { + let text = std::fs::read_to_string(path).unwrap_or_default(); + if text.contains(expected) { + saw_expected = true; + break; + } + } + std::thread::sleep(Duration::from_millis(100)); + } + if let Some(path) = resolve_readable_log_path(&log_path) { + let text = std::fs::read_to_string(path).unwrap_or_default(); + assert!( + text.contains(expected), + "expected detached supervisor startup logs to reach runtime log, got: {text:?}" + ); + } else { + panic!("runtime log path did not materialize"); + } + assert!(saw_expected, "startup log was not observed 
before timeout"); + let _ = std::process::Command::new("kill") + .arg("-TERM") + .arg(pid.to_string()) + .status(); + } + #[test] fn atomic_group_non_agent_requires_present_before_running_match() { let desired = AgentDesiredWorkload { @@ -14616,4 +15117,115 @@ mod tests { }; assert!(!phase1_overlap_with_applyless_owner(&status, &desired)); } + + #[test] + fn resolve_readable_log_path_prefers_latest_daily_shard() { + let td = tempfile::tempdir().unwrap(); + let base_path = td.path().join("workload__Deployment__demo.log"); + std::fs::write( + td.path().join("workload__Deployment__demo.2026-06-19.log"), + "old\n", + ) + .unwrap(); + std::fs::write( + td.path().join("workload__Deployment__demo.2026-06-20.log"), + "new\n", + ) + .unwrap(); + let resolved = resolve_readable_log_path(&base_path).unwrap(); + assert_eq!( + resolved.file_name().and_then(|v| v.to_str()), + Some("workload__Deployment__demo.2026-06-20.log") + ); + } + + #[test] + fn read_workload_log_forward_cursor_rolls_into_next_shard() { + let td = tempfile::tempdir().unwrap(); + let log_dir = td.path().to_path_buf(); + std::fs::write( + log_dir.join("workload__Deployment__demo.2026-06-19.log"), + "old\n", + ) + .unwrap(); + std::fs::write( + log_dir.join("workload__Deployment__demo.2026-06-20.log"), + "new\n", + ) + .unwrap(); + let handler = ReadWorkloadLogChunkHandler { log_dir }; + let req = ReadWorkloadLogReq { + kind: WorkloadKind::Deployment, + name: "demo".to_string(), + direction: LogReadDirection::Forward, + cursor: Some(WorkloadLogCursor { + shard: "2026-06-19".to_string(), + offset: 4, + }), + max_bytes: Some(65536), + }; + let raw = handler.handle("n1".into(), &serde_json::to_vec(&req).unwrap()).unwrap(); + let resp: ReadWorkloadLogResp = serde_json::from_slice(&raw).unwrap(); + assert!(resp.ok, "{resp:?}"); + assert_eq!(resp.text.as_deref(), Some("new\n")); + assert_eq!( + resp.start_cursor, + Some(WorkloadLogCursor { + shard: "2026-06-20".to_string(), + offset: 0, + }) + ); + assert_eq!( + 
resp.end_cursor, + Some(WorkloadLogCursor { + shard: "2026-06-20".to_string(), + offset: 4, + }) + ); + } + + #[test] + fn read_workload_log_backward_cursor_rolls_into_previous_shard() { + let td = tempfile::tempdir().unwrap(); + let log_dir = td.path().to_path_buf(); + std::fs::write( + log_dir.join("workload__Deployment__demo.2026-06-19.log"), + "old\n", + ) + .unwrap(); + std::fs::write( + log_dir.join("workload__Deployment__demo.2026-06-20.log"), + "new\n", + ) + .unwrap(); + let handler = ReadWorkloadLogChunkHandler { log_dir }; + let req = ReadWorkloadLogReq { + kind: WorkloadKind::Deployment, + name: "demo".to_string(), + direction: LogReadDirection::Backward, + cursor: Some(WorkloadLogCursor { + shard: "2026-06-20".to_string(), + offset: 0, + }), + max_bytes: Some(65536), + }; + let raw = handler.handle("n1".into(), &serde_json::to_vec(&req).unwrap()).unwrap(); + let resp: ReadWorkloadLogResp = serde_json::from_slice(&raw).unwrap(); + assert!(resp.ok, "{resp:?}"); + assert_eq!(resp.text.as_deref(), Some("old\n")); + assert_eq!( + resp.start_cursor, + Some(WorkloadLogCursor { + shard: "2026-06-19".to_string(), + offset: 0, + }) + ); + assert_eq!( + resp.end_cursor, + Some(WorkloadLogCursor { + shard: "2026-06-19".to_string(), + offset: 4, + }) + ); + } } diff --git a/fluxon_rs/fluxon_pyo3/src/lib.rs b/fluxon_rs/fluxon_pyo3/src/lib.rs index b29a083..a73591f 100644 --- a/fluxon_rs/fluxon_pyo3/src/lib.rs +++ b/fluxon_rs/fluxon_pyo3/src/lib.rs @@ -2759,18 +2759,30 @@ impl KvClient { inner_new(config_yaml, py).into_py_object(py) } - /// Return the logs directory for MQ-related components. + /// Return the logs directory for third-party Python components. 
/// - /// For the fluxon unified backend, this is derived from the - /// client's shared_memory_path and cluster_name: - /// {shared_memory_path}/{cluster_name}_cluster_mq_logs - fn logs_dir(&self, py: Python) -> PyObject { - fn logs_dir_inner(client: &KvClient, py: Python) -> ApiResult { - let base = PathBuf::from(&client.config.shared_memory_path); - let dir = base.join(format!("{}_cluster_mq_logs", client.config.cluster_name)); + /// For the fluxon unified backend, this is derived from owner + /// large_file_paths and cluster_name: + /// {large_file_paths[0]}/{cluster_name}_cluster_third_party_logs + fn third_party_logs_dir(&self, py: Python) -> PyObject { + fn third_party_logs_dir_inner(client: &KvClient, py: Python) -> ApiResult { + let dir = match client + .config + .large_file_paths + .third_party_logs_dir(&client.config.cluster_name) + { + Ok(dir) => dir, + Err(e) => { + return ApiResult::new_error(crate::error::py_error_from_kv_error( + py, + &e, + "third_party_logs_dir failed", + )); + } + }; ApiResult::new_success(dir.to_string_lossy().into_owned().into_py(py)) } - logs_dir_inner(self, py).into_py_object(py) + third_party_logs_dir_inner(self, py).into_py_object(py) } /// Return raw etcd addresses (host:port) used by this client. 
diff --git a/fluxon_rs/fluxon_util/build.rs b/fluxon_rs/fluxon_util/build.rs index 0f586d3..2bf7b87 100644 --- a/fluxon_rs/fluxon_util/build.rs +++ b/fluxon_rs/fluxon_util/build.rs @@ -88,12 +88,15 @@ fn collect_crates_for_runtime(ws: &CargoWorkspace) { println!("cargo:rerun-if-changed=Cargo.toml"); } -fn try_discover_git_dir(manifest_dir: &Path) -> Option { +fn try_discover_git_dir(manifest_dir: &Path, workspace_root: &Path) -> Option { + let workspace_search_ceiling = workspace_root.parent().unwrap_or(workspace_root); let mut cur = Some(manifest_dir); while let Some(dir) = cur { let candidate = dir.join(".git"); if candidate.is_dir() { - return Some(candidate); + if candidate.join("HEAD").is_file() { + return Some(candidate); + } } if candidate.is_file() { // Worktree/submodule style: .git is a file containing `gitdir: ` @@ -106,11 +109,17 @@ fn try_discover_git_dir(manifest_dir: &Path) -> Option { .unwrap_or_else(|| panic!("invalid .git file format: {}", candidate.display())) .trim(); let gitdir_path = Path::new(gitdir); - return Some(if gitdir_path.is_absolute() { + let resolved = if gitdir_path.is_absolute() { gitdir_path.to_path_buf() } else { dir.join(gitdir_path) - }); + }; + if resolved.join("HEAD").is_file() { + return Some(resolved); + } + } + if dir == workspace_search_ceiling { + break; } cur = dir.parent(); } @@ -309,7 +318,7 @@ fn main() { v } Err(_) => { - match try_discover_git_dir(&manifest_dir) { + match try_discover_git_dir(&manifest_dir, &ws.workspace_root) { Some(git_dir) => { emit_rerun_hints(&git_dir); resolve_head_commit_id(&git_dir) diff --git a/fluxon_rs/fluxon_util/src/lib.rs b/fluxon_rs/fluxon_util/src/lib.rs index 2f4f9fa..e575a75 100644 --- a/fluxon_rs/fluxon_util/src/lib.rs +++ b/fluxon_rs/fluxon_util/src/lib.rs @@ -36,7 +36,12 @@ pub mod limitrate; // PyO3 helpers: run long-time Python call without holding GIL in caller thread. 
pub mod pyo3; // Re-export for stable public API: existing call sites can keep using `fluxon_util::init_log`. -pub use log::{current_log_file_path, init_log, init_log_test, init_log_with_extra_layer}; +pub use log::{ + current_daily_sharded_log_path, current_log_file_path, daily_sharded_log_path, + display_runtime_log_path, init_log, init_log_test, init_log_with_extra_layer, + latest_existing_daily_sharded_log_path, resolve_readable_log_path, + DEFAULT_DAILY_LOG_RETENTION_DAYS, +}; #[cfg(test)] mod test_util_test; diff --git a/fluxon_rs/fluxon_util/src/log.rs b/fluxon_rs/fluxon_util/src/log.rs index db3d88f..fc6066f 100644 --- a/fluxon_rs/fluxon_util/src/log.rs +++ b/fluxon_rs/fluxon_util/src/log.rs @@ -3,6 +3,7 @@ use std::io; use std::path::{Path, PathBuf}; use std::sync::OnceLock; +use parking_lot::Mutex; use tracing_appender::non_blocking; use tracing_appender::non_blocking::WorkerGuard; use tracing_subscriber::EnvFilter; @@ -20,6 +21,9 @@ mod generated_crates { // RPC fast-path traffic actually entered the closed transfer / verbs backend. Keep the scope explicit: // only these dependency targets are promoted to DEBUG alongside workspace crates. const RDMA_DEBUG_TARGETS: &[&str] = &["fabric_lib", "libfabric_sys", "libibverbs_sys"]; +const LOG_RETENTION_DAYS: usize = 31; +const TEST_LOG_SHARD_WINDOW_SECONDS_ENV: &str = "FLUXON_TEST_LOG_SHARD_WINDOW_SECONDS"; +const TEST_LOG_SHARD_ANCHOR_UNIX_SECONDS_ENV: &str = "FLUXON_TEST_LOG_SHARD_ANCHOR_UNIX_SECONDS"; // Simple UTC timer in RFC3339 seconds (no subsecond precision) struct UtcSecondTimer; @@ -37,6 +41,191 @@ static GLOBAL_CONSOLE_LOG_GUARD: OnceLock = OnceLock::new(); // Expose the current process log file path for sidecar collectors (e.g. OTLP tailer). 
static GLOBAL_LOG_FILE_PATH: OnceLock = OnceLock::new(); +pub const DEFAULT_DAILY_LOG_RETENTION_DAYS: usize = LOG_RETENTION_DAYS; + +#[derive(Clone, Copy, Debug)] +struct LogShardWindowConfig { + window_seconds: i64, + anchor_unix_seconds: i64, +} + +fn read_test_log_shard_window_config() -> anyhow::Result> { + let Some(raw_window) = std::env::var_os(TEST_LOG_SHARD_WINDOW_SECONDS_ENV) else { + return Ok(None); + }; + let raw_window = raw_window + .into_string() + .map_err(|_| anyhow::anyhow!("{TEST_LOG_SHARD_WINDOW_SECONDS_ENV} must be valid utf-8"))?; + let window_text = raw_window.trim(); + if window_text.is_empty() { + return Ok(None); + } + let window_seconds: i64 = window_text.parse().map_err(|e| { + anyhow::anyhow!( + "{TEST_LOG_SHARD_WINDOW_SECONDS_ENV} must be a positive integer: {e}" + ) + })?; + if window_seconds <= 0 { + anyhow::bail!("{TEST_LOG_SHARD_WINDOW_SECONDS_ENV} must be > 0"); + } + + let raw_anchor = std::env::var(TEST_LOG_SHARD_ANCHOR_UNIX_SECONDS_ENV).map_err(|_| { + anyhow::anyhow!( + "{TEST_LOG_SHARD_ANCHOR_UNIX_SECONDS_ENV} is required when {TEST_LOG_SHARD_WINDOW_SECONDS_ENV} is set" + ) + })?; + let anchor_unix_seconds: i64 = raw_anchor.trim().parse().map_err(|e| { + anyhow::anyhow!( + "{TEST_LOG_SHARD_ANCHOR_UNIX_SECONDS_ENV} must be an integer unix timestamp: {e}" + ) + })?; + Ok(Some(LogShardWindowConfig { + window_seconds, + anchor_unix_seconds, + })) +} + +fn resolve_shard_date_from_datetime(now: chrono::DateTime) -> anyhow::Result { + let Some(config) = read_test_log_shard_window_config()? 
else { + return Ok(now.date_naive()); + }; + let unix_seconds = now.timestamp(); + let delta_seconds = unix_seconds - config.anchor_unix_seconds; + if delta_seconds < 0 { + anyhow::bail!( + "test log shard anchor must not be in the future: anchor={}, ts={}", + config.anchor_unix_seconds, + unix_seconds + ); + } + let bucket_index = delta_seconds / config.window_seconds; + let base_date = chrono::NaiveDate::from_ymd_opt(2026, 1, 1) + .expect("valid hard-coded synthetic base date"); + Ok(base_date + chrono::Days::new(bucket_index as u64)) +} + +fn current_shard_date() -> anyhow::Result { + resolve_shard_date_from_datetime(chrono::Utc::now()) +} + +fn cleanup_old_daily_sharded_logs( + base_path: &Path, + retention_days: usize, +) -> anyhow::Result<()> { + let parent = match base_path.parent() { + Some(parent) => parent, + None => return Ok(()), + }; + let file_name = match base_path.file_name().and_then(|v| v.to_str()) { + Some(file_name) => file_name, + None => return Ok(()), + }; + let Some(stem) = file_name.strip_suffix(".log") else { + return Ok(()); + }; + fs::create_dir_all(parent)?; + let keep_since = current_shard_date()? - chrono::Days::new(retention_days.saturating_sub(1) as u64); + let prefix = format!("{stem}."); + for entry in std::fs::read_dir(parent)? 
{ + let entry = entry?; + let path = entry.path(); + if !path.is_file() { + continue; + } + let entry_name = entry.file_name(); + let Some(entry_name) = entry_name.to_str() else { + continue; + }; + if !entry_name.starts_with(prefix.as_str()) || !entry_name.ends_with(".log") { + continue; + } + let date_text = &entry_name[prefix.len()..entry_name.len() - ".log".len()]; + let Ok(shard_date) = chrono::NaiveDate::parse_from_str(date_text, "%Y-%m-%d") else { + continue; + }; + if shard_date < keep_since { + match fs::remove_file(&path) { + Ok(()) => {} + Err(err) if err.kind() == io::ErrorKind::NotFound => {} + Err(err) => return Err(err.into()), + } + } + } + Ok(()) +} + +#[derive(Debug)] +struct DailyShardedFileWriter { + base_path: PathBuf, + retention_days: usize, + state: Mutex, +} + +#[derive(Debug, Default)] +struct DailyShardedFileWriterState { + current_path: Option, + current_file: Option, +} + +impl DailyShardedFileWriter { + fn new(base_path: PathBuf, retention_days: usize) -> Self { + Self { + base_path, + retention_days, + state: Mutex::new(DailyShardedFileWriterState::default()), + } + } + + fn current_path(&self) -> anyhow::Result { + current_daily_sharded_log_path(&self.base_path) + } + + fn rotate_if_needed( + &self, + state: &mut DailyShardedFileWriterState, + ) -> io::Result<()> { + let next_path = self + .current_path() + .map_err(|err| io::Error::new(io::ErrorKind::Other, err.to_string()))?; + if state.current_path.as_ref() == Some(&next_path) && state.current_file.is_some() { + return Ok(()); + } + cleanup_old_daily_sharded_logs(&self.base_path, self.retention_days) + .map_err(|err| io::Error::new(io::ErrorKind::Other, err.to_string()))?; + if let Some(parent) = next_path.parent() { + fs::create_dir_all(parent)?; + } + let file = fs::OpenOptions::new() + .create(true) + .append(true) + .open(&next_path)?; + state.current_path = Some(next_path); + state.current_file = Some(file); + Ok(()) + } +} + +impl io::Write for DailyShardedFileWriter { + fn 
write(&mut self, buf: &[u8]) -> io::Result { + let mut state = self.state.lock(); + self.rotate_if_needed(&mut state)?; + state + .current_file + .as_mut() + .expect("log writer file must exist after rotation") + .write(buf) + } + + fn flush(&mut self) -> io::Result<()> { + let mut state = self.state.lock(); + if let Some(file) = state.current_file.as_mut() { + file.flush() + } else { + Ok(()) + } + } +} + fn setup_global_log_guards(file_guard: WorkerGuard, console_guard: WorkerGuard) { let _ = GLOBAL_FILE_LOG_GUARD.set(file_guard); let _ = GLOBAL_CONSOLE_LOG_GUARD.set(console_guard); @@ -90,9 +279,9 @@ fn third_party_log_target_overrides( targets } -/// Init log for production +/// Init log for production. /// - `log_path`: directory to write log files -/// - `instance_key`: used in file names to disambiguate instances +/// - `instance_key`: used in daily file names to disambiguate instances pub fn init_log(log_path: &Path, instance_key: &str) { init_log_impl(log_path, instance_key, NoopLayer); } @@ -113,6 +302,101 @@ struct NoopLayer; impl tracing_subscriber::Layer for NoopLayer where S: tracing::Subscriber {} +fn current_daily_log_file_path(log_path: &Path, instance_key: &str) -> PathBuf { + current_daily_sharded_log_path(&log_path.join(format!("fluxon-kv-{instance_key}.log"))) + .unwrap_or_else(|_| { + let date = chrono::Utc::now().format("%Y-%m-%d"); + log_path.join(format!("fluxon-kv-{instance_key}.{date}.log")) + }) +} + +pub fn daily_sharded_log_path( + base_path: &Path, + date: chrono::NaiveDate, +) -> anyhow::Result { + let file_name = base_path.file_name().and_then(|v| v.to_str()).ok_or_else(|| { + anyhow::anyhow!( + "log path must end with a valid utf-8 filename: {}", + base_path.display() + ) + })?; + let stem = file_name + .strip_suffix(".log") + .ok_or_else(|| anyhow::anyhow!("log path must end with .log: {}", base_path.display()))?; + Ok(base_path.with_file_name(format!( + "{}.{}.log", + stem, + date.format("%Y-%m-%d") + ))) +} + +pub fn 
current_daily_sharded_log_path(base_path: &Path) -> anyhow::Result { + daily_sharded_log_path(base_path, current_shard_date()?) +} + +pub fn latest_existing_daily_sharded_log_path(base_path: &Path) -> Option { + let parent = base_path.parent()?; + let file_name = base_path.file_name()?.to_str()?; + let stem = file_name.strip_suffix(".log")?; + let prefix = format!("{}.", stem); + let mut latest: Option<(chrono::NaiveDate, PathBuf)> = None; + let entries = std::fs::read_dir(parent).ok()?; + for entry in entries { + let Ok(entry) = entry else { + continue; + }; + let path = entry.path(); + if !path.is_file() { + continue; + } + let entry_name = entry.file_name(); + let Some(entry_name) = entry_name.to_str() else { + continue; + }; + if !entry_name.starts_with(prefix.as_str()) || !entry_name.ends_with(".log") { + continue; + } + if entry_name.len() <= prefix.len() + ".log".len() { + continue; + } + let date_text = &entry_name[prefix.len()..entry_name.len() - ".log".len()]; + let Ok(date) = chrono::NaiveDate::parse_from_str(date_text, "%Y-%m-%d") else { + continue; + }; + let replace = match latest.as_ref() { + Some((prev, _)) => date > *prev, + None => true, + }; + if replace { + latest = Some((date, path)); + } + } + latest.map(|(_, path)| path) +} + +pub fn resolve_readable_log_path(base_path: &Path) -> Option { + if let Ok(current) = current_daily_sharded_log_path(base_path) { + if current.exists() { + return Some(current); + } + } + if let Some(latest) = latest_existing_daily_sharded_log_path(base_path) { + return Some(latest); + } + if base_path.exists() { + return Some(base_path.to_path_buf()); + } + None +} + +pub fn display_runtime_log_path(base_path_text: &str) -> String { + let base_path = Path::new(base_path_text); + resolve_readable_log_path(base_path) + .unwrap_or_else(|| base_path.to_path_buf()) + .display() + .to_string() +} + fn init_log_impl(log_path: &Path, instance_key: &str, extra_layer: L) where L: tracing_subscriber::Layer + Send + Sync + 
'static, @@ -238,83 +522,9 @@ where } } - // Archive existing logs for the same instance into a sibling history directory. - // Scope is strictly within the provided `log_path` (cluster is implied by the dir path), - // and only files of the current `instance_key` are moved. This avoids any cross-instance - // interference and keeps behavior explicit and bounded. - { - let history_dir = log_path.join("history"); - if let Err(e) = fs::create_dir_all(&history_dir) { - panic!( - "[fluxon] Create history directory failed: {:?}. Base log_path: {:?}. \ -This log_path is provided by the caller's configuration. \ -For Master mode it is derived from MasterConfigYaml.log_dir with a subdirectory '_cluster_kv_logs'; \ -for Client mode it is derived from ClientConfigYaml.fluxonkv_spec.shared_memory_path with subdirectory '_cluster_kv_logs'. \ -Please ensure the directory exists and is writable. Underlying OS error: {:?}", - history_dir, log_path, e - ); - } - - // Pattern: fluxon-kv-..log - // No fallback patterns: keep rule strict and explicit. - let prefix = format!("fluxon-kv-{}.", instance_key); - let mut moved = 0usize; - - let iter = fs::read_dir(log_path).unwrap_or_else(|e| { - panic!( - "[fluxon] Read log directory failed at {:?}. This directory is the configured log_path described above. OS error: {:?}", - log_path, e - ) - }); - - for entry in iter { - let entry = entry.unwrap_or_else(|e| { - panic!( - "[fluxon] Failed to read a directory entry under {:?}. OS error: {:?}", - log_path, e - ) - }); - let path = entry.path(); - if !path.is_file() { - continue; - } - let name_os = match path.file_name() { - Some(n) => n, - None => continue, - }; - let name = match name_os.to_str() { - Some(s) => s, - None => continue, - }; - let is_target = name.starts_with(&prefix) && name.ends_with(".log"); - if !is_target { - continue; - } - let dst = history_dir.join(name); - if let Err(err) = fs::rename(&path, &dst) { - panic!( - "[fluxon] Move old log failed: {:?} -> {:?}. 
Base log_path: {:?}. OS error: {:?}", - path, dst, log_path, err - ); - } - moved += 1; - } - - if moved > 0 { - println!( - "[fluxon] Archived {moved} existing logs for instance_key='{instance_key}' into {:?}", - history_dir - ); - } - } - - // Files named with UTC timestamp once per process run - let ts = chrono::Utc::now().format("%Y-%m-%d_%H-%M-%S"); - // File log keeps workspace crates at DEBUG; non-workspace crates default to WARN. // This avoids dumping verbose dependency debug logs (e.g. h2/tower) into file output. - let file_name = format!("fluxon-kv-{instance_key}.{ts}.log"); - let file_path = log_path.join(&file_name); + let file_path = current_daily_log_file_path(log_path, instance_key); // Keep a copy for the whole process lifetime; collectors can clone it. if let Some(prev) = GLOBAL_LOG_FILE_PATH.get() { if prev != &file_path { @@ -326,18 +536,11 @@ Please ensure the directory exists and is writable. Underlying OS error: {:?}", } else { let _ = GLOBAL_LOG_FILE_PATH.set(file_path.clone()); } - let file = match std::fs::OpenOptions::new() - .create(true) - .append(true) - .open(&file_path) - { - Ok(f) => f, - Err(e) => { - eprintln!("Failed to open log file {:?}, err: {:?}", file_path, e); - return; - } - }; - let (file_writer, file_guard) = non_blocking(file); + let file_appender = DailyShardedFileWriter::new( + log_path.join(format!("fluxon-kv-{instance_key}.log")), + LOG_RETENTION_DAYS, + ); + let (file_writer, file_guard) = non_blocking(file_appender); let enable_iceoryx_logs = matches!( std::env::var("FLUXON_ENABLE_ICEORYX_LOGS") .ok() @@ -380,10 +583,9 @@ Please ensure the directory exists and is writable. Underlying OS error: {:?}", setup_global_log_guards(file_guard, console_guard); // Success notice: tell users where logs are written. - let history_dir_for_print = log_path.join("history"); println!( - "[fluxon] Logging initialized. 
base_dir={:?}, history_dir={:?}, instance_key='{}'", - log_path, history_dir_for_print, instance_key + "[fluxon] Logging initialized. base_dir={:?}, retention_days={}, current_file={:?}, instance_key='{}'", + log_path, LOG_RETENTION_DAYS, file_path, instance_key ); } diff --git a/fluxon_rs/fluxon_util/tests/log_mgmt.rs b/fluxon_rs/fluxon_util/tests/log_mgmt.rs new file mode 100644 index 0000000..431c5fc --- /dev/null +++ b/fluxon_rs/fluxon_util/tests/log_mgmt.rs @@ -0,0 +1,134 @@ +use std::fs; +use std::path::Path; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; + +use fluxon_util::DEFAULT_DAILY_LOG_RETENTION_DAYS; +use tempfile::TempDir; + +const TEST_LOG_SHARD_WINDOW_SECONDS_ENV: &str = "FLUXON_TEST_LOG_SHARD_WINDOW_SECONDS"; +const TEST_LOG_SHARD_ANCHOR_UNIX_SECONDS_ENV: &str = "FLUXON_TEST_LOG_SHARD_ANCHOR_UNIX_SECONDS"; + +struct EnvVarGuard { + key: &'static str, + previous: Option, +} + +impl EnvVarGuard { + fn set(key: &'static str, value: impl Into) -> Self { + let previous = std::env::var(key).ok(); + unsafe { + std::env::set_var(key, value.into()); + } + Self { key, previous } + } +} + +impl Drop for EnvVarGuard { + fn drop(&mut self) { + match self.previous.as_deref() { + Some(value) => unsafe { + std::env::set_var(self.key, value); + }, + None => unsafe { + std::env::remove_var(self.key); + }, + } + } +} + +fn count_service_shards(root: &Path, prefix: &str) -> usize { + fs::read_dir(root) + .expect("read log directory") + .filter_map(|entry| entry.ok()) + .map(|entry| entry.file_name().to_string_lossy().to_string()) + .filter(|name| name.starts_with(prefix) && name.ends_with(".log")) + .count() +} + +#[test] +fn kv_log_shards_roll_and_cleanup_with_test_window() { + let temp_dir = TempDir::new().expect("create temp dir"); + let log_path = temp_dir.path(); + let instance_key = "log_mgmt_window"; + let base_prefix = format!("fluxon-kv-{instance_key}"); + let stale_path = log_path.join(format!("{base_prefix}.2025-12-01.log")); + fs::write(&stale_path, 
"stale\n").expect("write stale shard"); + + let now = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("unix epoch") + .as_secs() as i64; + let _window_guard = EnvVarGuard::set(TEST_LOG_SHARD_WINDOW_SECONDS_ENV, "10"); + let _anchor_guard = EnvVarGuard::set(TEST_LOG_SHARD_ANCHOR_UNIX_SECONDS_ENV, (now - 2).to_string()); + + fluxon_util::init_log(log_path, instance_key); + tracing::info!(target: "fluxon_util", "[kv-log-mgmt][phase=before] ts={}", now); + std::thread::sleep(Duration::from_millis(300)); + std::thread::sleep(Duration::from_secs(11)); + let after_ts = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("unix epoch") + .as_secs(); + tracing::info!(target: "fluxon_util", "[kv-log-mgmt][phase=after] ts={after_ts}"); + std::thread::sleep(Duration::from_millis(500)); + + let shard_1 = log_path.join(format!("{base_prefix}.2026-01-01.log")); + let shard_2 = log_path.join(format!("{base_prefix}.2026-01-02.log")); + assert!(shard_1.exists(), "missing shard: {}", shard_1.display()); + assert!(shard_2.exists(), "missing shard: {}", shard_2.display()); + assert!( + !stale_path.exists(), + "stale shard should be removed once retention cleanup runs" + ); + assert_eq!( + count_service_shards(log_path, base_prefix.as_str()), + 2, + "expected exactly two retained shard files within the synthetic test window" + ); + + let shard_1_text = fs::read_to_string(&shard_1).expect("read first shard"); + let shard_2_text = fs::read_to_string(&shard_2).expect("read second shard"); + assert!( + shard_1_text.contains("[kv-log-mgmt][phase=before]"), + "first shard should contain the before marker" + ); + assert!( + !shard_1_text.contains("[kv-log-mgmt][phase=after]"), + "first shard should not contain the after marker" + ); + assert!( + shard_2_text.contains("[kv-log-mgmt][phase=after]"), + "second shard should contain the after marker" + ); + assert!( + !shard_2_text.contains("[kv-log-mgmt][phase=before]"), + "second shard should not contain the before marker" + ); + 
assert_eq!(DEFAULT_DAILY_LOG_RETENTION_DAYS, 31); +} + +#[test] +fn resolve_readable_log_path_ignores_plain_base_log_when_daily_shards_exist() { + let temp_dir = TempDir::new().expect("create temp dir"); + let base_path = temp_dir.path().join("startup.log"); + fs::write(&base_path, "plain\n").expect("write base log"); + let shard_path = temp_dir.path().join("startup.2026-06-21.log"); + fs::write(&shard_path, "shard\n").expect("write shard log"); + + let resolved = fluxon_util::resolve_readable_log_path(&base_path).expect("resolve readable log path"); + assert_eq!(resolved, shard_path); +} + +#[test] +fn latest_existing_daily_sharded_log_path_skips_invalid_candidates() { + let temp_dir = TempDir::new().expect("create temp dir"); + let base_path = temp_dir.path().join("demo.log"); + let invalid_shard_path = temp_dir.path().join("demo.not-a-date.log"); + let valid_shard_path = temp_dir.path().join("demo.2026-06-20.log"); + fs::write(&invalid_shard_path, "invalid\n").expect("write invalid shard"); + fs::write(&valid_shard_path, "valid\n").expect("write valid shard"); + + let resolved = + fluxon_util::latest_existing_daily_sharded_log_path(&base_path).expect("resolve latest shard"); + assert_eq!(resolved, valid_shard_path); +} diff --git a/fluxon_test_stack/benchmark_full_matrix.yaml b/fluxon_test_stack/benchmark_full_matrix.yaml index ffa1f2b..eaf0fbe 100644 --- a/fluxon_test_stack/benchmark_full_matrix.yaml +++ b/fluxon_test_stack/benchmark_full_matrix.yaml @@ -318,7 +318,7 @@ profiles: disable_prefix_index: true fluxonkv_spec: cluster_name: __STACK_CLUSTER_NAME__ - shared_memory_path: __STACK_SHARED_MEMORY_PATH__ + share_mem_path: __STACK_SHARE_MEM_PATH__ mq_base: capacity: 40 ttl_seconds: 90 diff --git a/fluxon_test_stack/ci_2_virt_node.py b/fluxon_test_stack/ci_2_virt_node.py index 28e9b82..405c9a2 100644 --- a/fluxon_test_stack/ci_2_virt_node.py +++ b/fluxon_test_stack/ci_2_virt_node.py @@ -710,6 +710,7 @@ def _render_ci_nix_pack_config( env_cfg = 
_load_yaml_mapping(env_companion_path.resolve(), ctx="CI pack env companion") merged_cfg = copy.deepcopy(static_cfg) merged_cfg.update(copy.deepcopy(env_cfg)) + merged_cfg["project_root"] = str(repo_root.resolve()) profile_cfg = merged_cfg.get("profile") if not isinstance(profile_cfg, dict): @@ -879,6 +880,8 @@ def main() -> int: sys.executable, str((REPO_ROOT / "fluxon_test_stack" / "pack_test_stack_rsc.py").resolve()), "--all-profiles", + "--release-dir", + str(release_dir), "-c", str(pack_metadata["suite_path"]), ] diff --git a/fluxon_test_stack/ci_test_list.yaml b/fluxon_test_stack/ci_test_list.yaml index 6a3c56b..4230559 100644 --- a/fluxon_test_stack/ci_test_list.yaml +++ b/fluxon_test_stack/ci_test_list.yaml @@ -29,6 +29,14 @@ scenes: scales: [n1_kvowner_dram_20gib] profiles: [fluxon_tcp] + ci_top_attention_log_mgmt: + ci: + subject: rust + runtime_contract: rust_self_managed + select: + scales: [n1_kvowner_dram_20gib] + profiles: [fluxon_tcp] + ci_top_attention_mq_core: ci: subject: mq @@ -319,6 +327,10 @@ profiles: doc_site_base_url: example.com ci_top_attention_bin_kvtest: kv_test_rounds: all + ci_top_attention_log_mgmt: + enabled: true + ci_top_attention_log_mgmt: + enabled: true ci_top_attention_mq_core: {} runtime_contracts: cluster_kv_owner: &cluster_kv_owner_runtime @@ -427,7 +439,7 @@ profiles: # Self-host benchmark nodes are launched by the host-side deployer, not inside a # container with /hostworkdir mounted. Use the real host path so both infra44/46 # benchmark nodes can open the same self-host shared-memory root directly. 
- shared_memory_path: __STACK_SHARED_MEMORY_PATH__ + share_mem_path: __STACK_SHARE_MEM_PATH__ mq_base: capacity: 40 ttl_seconds: 90 @@ -465,6 +477,10 @@ profiles: doc_site_base_url: example.com ci_top_attention_bin_kvtest: kv_test_rounds: all + ci_top_attention_log_mgmt: + enabled: true + ci_top_attention_log_mgmt: + enabled: true ci_top_attention_mq_core: {} test_stack: <<: *common_test_stack_runtime @@ -478,6 +494,10 @@ profiles: doc_site_base_url: example.com ci_top_attention_bin_kvtest: kv_test_rounds: all + ci_top_attention_log_mgmt: + enabled: true + ci_top_attention_log_mgmt: + enabled: true ci_top_attention_mq_core: {} test_stack: <<: *common_test_stack_runtime @@ -491,6 +511,10 @@ profiles: doc_site_base_url: example.com ci_top_attention_bin_kvtest: kv_test_rounds: all + ci_top_attention_log_mgmt: + enabled: true + ci_top_attention_log_mgmt: + enabled: true ci_top_attention_mq_core: {} test_stack: <<: *common_test_stack_runtime diff --git a/fluxon_test_stack/deployconf_testbed.yml b/fluxon_test_stack/deployconf_testbed.yml index fe431de..4015beb 100644 --- a/fluxon_test_stack/deployconf_testbed.yml +++ b/fluxon_test_stack/deployconf_testbed.yml @@ -102,9 +102,7 @@ global_envs: FLUXON_CLUSTER_NAME: "fluxon_testbed" FLUXON_SHARED_MEM: "${HOSTWORKDIR}/shm1" - FLUXON_SHARED_FILE: "${HOSTWORKDIR}/shm1_files" - # Test-stack benchmark nodes use FLUXON_SHARED_MEM for mmap.file roots and - # FLUXON_SHARED_MEM2 as the shared-file/log root. Keep both under the current + # Test-stack benchmark nodes use explicit shared bundle roots under the current # hostworkdir so benchmarks do not bleed into fluxon4 paths. 
FLUXON_SHARED_MEM2: "${HOSTWORKDIR}/shm2_files" FLUXON_PPROF_DURATION_SECONDS: "" @@ -335,7 +333,7 @@ service: WORKDIR="${FLUXON_SHARED_MEM}/owner_work_${NODE_ID}" CONFIG_PATH="${WORKDIR}/all_config.yaml" - mkdir -p "${WORKDIR}" "${FLUXON_SHARED_FILE}" + mkdir -p "${WORKDIR}" cat > "${CONFIG_PATH}" < "${WORKDIR}/ops_controller.yaml" < int: script_utils.reset_stage_summary() try: @@ -933,125 +913,13 @@ def _git_stage_ci_source_tree(*, repo_root: Path, stage_root: Path) -> list[str] return selected -def _collect_git_listed_source_relpaths( - *, - repo_root: Path, - git_root: Path, - rel_prefix: str = "", -) -> list[str]: - script_utils.require_cmd("git") - argv = [ - "git", - "ls-files", - "--cached", - "--others", - "--exclude-standard", - "-z", - ] - raw = subprocess.check_output(argv, cwd=str(git_root)) - selected: list[str] = [] - rel_prefix = rel_prefix.strip("/") - for entry in raw.split(b"\0"): - if not entry: - continue - rel = entry.decode("utf-8").strip() - if not rel: - continue - repo_rel = rel if not rel_prefix else f"{rel_prefix}/{rel}" - if _ci_source_relpath_excluded(repo_rel): - continue - src_path = (repo_root / repo_rel).resolve() - if not src_path.exists(): - continue - selected.append(repo_rel) - return selected - - -def _load_rather_no_git_submodule_source_roots( - *, - repo_root: Path, -) -> tuple[tuple[str, Path], ...]: - config_path = (repo_root / DEFAULT_RATHER_NO_GIT_SUBMODULE_CONFIG_RELPATH).resolve() - if not config_path.exists(): - return () - raw_cfg = _load_yaml_file(config_path) - if raw_cfg is None: - return () - if not isinstance(raw_cfg, dict): - raise RuntimeError( - "rather_no_git_submodule config must be a YAML mapping: " - f"{config_path}" - ) - raw_modules = raw_cfg.get("modules") - if raw_modules is None: - return () - if not isinstance(raw_modules, list): - raise RuntimeError( - "rather_no_git_submodule config `modules` must be a list: " - f"{config_path}" - ) - - repo_root = repo_root.resolve() - selected: list[tuple[str, 
Path]] = [] - seen_relpaths: set[str] = set() - for index, raw_item in enumerate(raw_modules): - if not isinstance(raw_item, dict): - raise RuntimeError( - "rather_no_git_submodule config entries must be mappings: " - f"{config_path} modules[{index}]" - ) - raw_path = raw_item.get("path") - if not isinstance(raw_path, str) or not raw_path.strip(): - raise RuntimeError( - "rather_no_git_submodule config path must be a non-empty string: " - f"{config_path} modules[{index}].path" - ) - rel_path = Path(raw_path.strip()) - if rel_path.is_absolute() or ".." in rel_path.parts: - raise RuntimeError( - "rather_no_git_submodule config path must stay within the repo root: " - f"{config_path} modules[{index}].path={raw_path!r}" - ) - relpath = rel_path.as_posix() - if relpath in seen_relpaths: - continue - seen_relpaths.add(relpath) - module_root = (repo_root / rel_path).resolve() - if module_root != repo_root and repo_root not in module_root.parents: - raise RuntimeError( - "rather_no_git_submodule config path escapes the repo root: " - f"{config_path} modules[{index}].path={raw_path!r}" - ) - if not module_root.is_dir(): - raise RuntimeError( - "CI source pack requires configured rather_no_git_submodule path to exist as a directory: " - f"path={relpath} resolved={module_root}" - ) - selected.append((relpath, module_root)) - return tuple(selected) - - def _collect_ci_source_relpaths(*, repo_root: Path) -> list[str]: - repo_root = repo_root.resolve() - selected = set( - _collect_git_listed_source_relpaths( - repo_root=repo_root, - git_root=repo_root, + return list( + collect_source_profile_relpaths( + repo_root=repo_root.resolve(), + profile=SOURCE_SELECTION_PROFILE_SOURCE_PACK, ) ) - for relpath, module_root in _load_rather_no_git_submodule_source_roots( - repo_root=repo_root - ): - selected.update( - _collect_git_listed_source_relpaths( - repo_root=repo_root, - git_root=module_root, - rel_prefix=relpath, - ) - ) - if not selected: - raise RuntimeError("git-based CI source 
selection produced no files") - return sorted(selected) def _compute_ci_source_digest(*, repo_root: Path) -> str: @@ -1060,18 +928,17 @@ def _compute_ci_source_digest(*, repo_root: Path) -> str: relative_to=repo_root, mode=script_utils.PathDigestMode.PACK_INPUTS, algorithm=script_utils.PathHashAlgorithm.SHA256, - ignored_dir_names=CI_SOURCE_DIGEST_IGNORED_DIR_NAMES, - ignored_file_names=CI_SOURCE_DIGEST_IGNORED_FILE_NAMES, - ignored_file_suffixes=CI_SOURCE_DIGEST_IGNORED_FILE_SUFFIXES, + ignored_dir_names=(), + ignored_file_names=(), + ignored_file_suffixes=(), ) def _ci_source_relpath_excluded(relpath: str) -> bool: - if relpath in CI_SOURCE_STAGE_EXCLUDE_NAMES: - return True - if _relpath_matches_exclude_patterns(relpath, CI_SOURCE_COMMON_EXCLUDE_REL_PATHS): - return True - return any(relpath == prefix.rstrip("/") or relpath.startswith(prefix) for prefix in CI_SOURCE_STAGE_EXCLUDE_PREFIXES) + return source_profile_relpath_excluded( + profile=SOURCE_SELECTION_PROFILE_SOURCE_PACK, + relpath=relpath, + ) def _relpath_matches_exclude_patterns(relpath: str, patterns: tuple[str, ...]) -> bool: @@ -1127,8 +994,8 @@ def build_tarball() -> None: src=src, dst=packed_stage_root / rel_name, honor_gitignore=False, + exclude_rel_paths=PACKED_RUNTIME_EXCLUDE_REL_PATHS, ) - _prune_stage_paths(packed_stage_root, PACKED_RUNTIME_EXCLUDE_REL_PATHS) script_utils.tar_gz( cwd=stage_root, out_path=out_path, @@ -1165,7 +1032,7 @@ def _stage_repo_test_rsc_tree(*, repo_test_rsc_root: Path, out_dir: Path) -> Non else: dst.parent.mkdir(parents=True, exist_ok=True) shutil.copy2(src, dst) - _prune_stage_paths(out_dir, TEST_RSC_REPO_TREE_EXCLUDE_REL_PATHS) + script_utils.prune_stage_paths(out_dir, TEST_RSC_REPO_TREE_EXCLUDE_REL_PATHS) def _release_shared_baselines_root(*, release_dir: Path) -> Path: @@ -1196,7 +1063,7 @@ def _stage_release_shared_baselines_into_root(*, release_dir: Path, prepared_roo if baselines_dst.exists(): raise RuntimeError(f"prepared test_rsc baselines path already exists 
before release authority stage: {baselines_dst}") shutil.copytree(shared_baselines_root, baselines_dst, dirs_exist_ok=False) - _prune_stage_paths(baselines_dst, TEST_RSC_REPO_TREE_EXCLUDE_REL_PATHS) + script_utils.prune_stage_paths(baselines_dst, TEST_RSC_REPO_TREE_EXCLUDE_REL_PATHS) def _stage_canonical_profile_prepared_resources_into_root(*, profile_id: str, prepared_root: Path) -> None: @@ -1223,7 +1090,7 @@ def _stage_canonical_profile_prepared_resources_into_root(*, profile_id: str, pr else: dst.parent.mkdir(parents=True, exist_ok=True) shutil.copy2(src, dst) - _prune_stage_paths(dst, TEST_RSC_REPO_TREE_EXCLUDE_REL_PATHS) + script_utils.prune_stage_paths(dst, TEST_RSC_REPO_TREE_EXCLUDE_REL_PATHS) def _stage_prepared_test_rsc(*, prepared_root: Path, out_dir: Path) -> None: @@ -1237,7 +1104,7 @@ def _stage_prepared_test_rsc(*, prepared_root: Path, out_dir: Path) -> None: else: dst.parent.mkdir(parents=True, exist_ok=True) shutil.copy2(src, dst) - _prune_stage_paths(out_dir, TEST_RSC_REPO_TREE_EXCLUDE_REL_PATHS) + script_utils.prune_stage_paths(out_dir, TEST_RSC_REPO_TREE_EXCLUDE_REL_PATHS) def _prepare_baselines_into_root( @@ -1266,7 +1133,7 @@ def _prepare_baselines_into_root( dir_source=dir_source, archive_source=archive_source, ) - _prune_stage_paths(prepared_root, TEST_RSC_REPO_TREE_EXCLUDE_REL_PATHS) + script_utils.prune_stage_paths(prepared_root, TEST_RSC_REPO_TREE_EXCLUDE_REL_PATHS) def _prepare_configured_test_rsc_resources_into_root( @@ -1293,7 +1160,7 @@ def _prepare_configured_test_rsc_resources_into_root( scratch_root=scratch_root, mooncake_cfg=mooncake_cfg_raw, ) - _prune_stage_paths(prepared_root, TEST_RSC_REPO_TREE_EXCLUDE_REL_PATHS) + script_utils.prune_stage_paths(prepared_root, TEST_RSC_REPO_TREE_EXCLUDE_REL_PATHS) def _prepare_python_runtime_wheelhouse_into_root( @@ -1803,7 +1670,7 @@ def _sync_prepared_baselines_into_release_tree(*, prepared_root: Path, release_d release_shared_baselines_root.parent.mkdir(parents=True, exist_ok=True) 
_remove_path(release_shared_baselines_root) shutil.copytree(prepared_baselines_root, release_shared_baselines_root, dirs_exist_ok=False) - _prune_stage_paths(release_shared_baselines_root, TEST_RSC_REPO_TREE_EXCLUDE_REL_PATHS) + script_utils.prune_stage_paths(release_shared_baselines_root, TEST_RSC_REPO_TREE_EXCLUDE_REL_PATHS) def _extract_bundle_archive(*, archive_path: Path, out_dir: Path, expected_root_name: str) -> None: @@ -1831,62 +1698,6 @@ def _remove_path(path: Path) -> None: path.unlink() -def _rsync_stage_filtered( - *, - repo_root: Path, - src: Path, - dst: Path, - honor_gitignore: bool, - exclude_rel_paths: tuple[str, ...] = (), -) -> None: - if not exclude_rel_paths: - script_utils.rsync_stage( - repo_root=repo_root, - src=src, - dst=dst, - honor_gitignore=honor_gitignore, - ) - return - - if not src.exists(): - raise RuntimeError(f"missing required source path for staging: {src}") - if dst.exists(): - raise RuntimeError(f"staging destination already exists (no overwrite): {dst}") - if shutil.which("rsync") is None: - raise RuntimeError("rsync is required for filtered staging, but was not found in PATH") - - dst.parent.mkdir(parents=True, exist_ok=True) - argv = ["rsync", "-a"] - if honor_gitignore: - argv += [ - "--exclude=.git/", - "--exclude-from=.gitignore", - "--filter=:- .gitignore", - ] - for pattern in exclude_rel_paths: - argv.append(f"--exclude={pattern}") - if src.is_dir(): - argv += [str(src) + "/", str(dst) + "/"] - else: - argv += [str(src), str(dst)] - subprocess.check_call(argv, cwd=str(repo_root)) - - -def _prune_stage_paths(stage_root: Path, exclude_rel_paths: tuple[str, ...]) -> None: - if not stage_root.exists(): - return - for path in sorted(stage_root.rglob("*"), reverse=True): - rel_path = path.relative_to(stage_root).as_posix() - for pattern in exclude_rel_paths: - normalized_pattern = pattern.rstrip("/") - if fnmatch.fnmatch(rel_path, normalized_pattern) or fnmatch.fnmatch(path.name, normalized_pattern): - if path.is_dir(): - 
shutil.rmtree(path) - else: - path.unlink(missing_ok=True) - break - - def _test_rsc_manifest_file_list(*, out_dir: Path, prepared_root: Path) -> list[Path]: files: list[Path] = [] for fixed_name in ("src_ci.tar.gz", "fluxon_ci_ext_rsc.tar.gz"): diff --git a/fluxon_test_stack/start_test_bed.py b/fluxon_test_stack/start_test_bed.py index da942de..2b13d22 100644 --- a/fluxon_test_stack/start_test_bed.py +++ b/fluxon_test_stack/start_test_bed.py @@ -7,6 +7,7 @@ import fcntl import json import os +import re import subprocess import sys import time @@ -26,6 +27,7 @@ sys.path.insert(0, str(DEPLOYMENT_DIR)) sys.path.insert(0, str(DEPLOYMENT_UTILS_DIR)) import manual_dispatch_release +from utils import log_shard from selection_runtime import ( atomic_group_member_authority_name as _selection_atomic_group_member_authority_name, atomic_group_member_selection_workload_name as _selection_atomic_group_member_selection_workload_name, @@ -434,11 +436,12 @@ def main() -> None: waves=coverage_bootstrap_waves, bootstrap_bare_services=bootstrap_bare_services, ) - _wait_controller_ready_stable( - controller_url=controller_url, - timeout_seconds=controller_ready_timeout_seconds, - stability_window_seconds=bootstrap_stability_window_seconds, - ) + if bootstrap_mode in (BOOTSTRAP_MODE_BARE_THEN_APPLY, BOOTSTRAP_MODE_BARE_ONLY): + _wait_controller_ready_stable( + controller_url=controller_url, + timeout_seconds=controller_ready_timeout_seconds, + stability_window_seconds=bootstrap_stability_window_seconds, + ) test_runner_ui_summary = _ensure_test_runner_ui_started(ui_cfg=test_runner_ui_cfg) if bootstrap_mode == BOOTSTRAP_MODE_BARE_THEN_APPLY: post_bootstrap_agent_instance_keys = _selection_agent_instance_keys( @@ -766,6 +769,9 @@ def _normalize_bootstrap_deployconf( if isinstance(master_cfg, dict): entrypoint = master_cfg.get("entrypoint") if isinstance(entrypoint, str): + master_port = _extract_master_listen_port(entrypoint=entrypoint) + if master_port is not None: + 
_set_service_port(master_cfg, port=master_port) normalized_entrypoint, removed = _strip_legacy_master_p2p_listen_port(entrypoint=entrypoint) if removed: master_cfg["entrypoint"] = normalized_entrypoint @@ -829,7 +835,6 @@ def _rewrite_same_host_local_multi_node_fixed_ports( master_cfg = _require_mapping(services.get("master"), "deployconf.service.master") ops_agent_cfg = _require_mapping(services.get("ops_agent"), "deployconf.service.ops_agent") ops_controller_cfg = _require_mapping(services.get("ops_controller"), "deployconf.service.ops_controller") - global_envs["TIKV_PD_PEER_PORT"] = str(plan["tikv_pd_peer_port"]) global_envs["TIKV_STATUS_FULL_ADDRESS"] = ( "${${TIKV__NODE_ID}__IP}:" + str(plan["tikv_status_port"]) @@ -845,6 +850,7 @@ def _rewrite_same_host_local_multi_node_fixed_ports( _set_service_port(greptime_cfg, port=plan["greptime_port"]) _set_service_port(tikv_pd_cfg, port=plan["tikv_pd_port"]) _set_service_port(tikv_cfg, port=plan["tikv_port"]) + _set_service_port(master_cfg, port=plan["master_port"]) etcd_entrypoint = _require_str(etcd_cfg.get("entrypoint"), "deployconf.service.etcd.entrypoint") etcd_entrypoint = _replace_expected_substring( @@ -974,6 +980,13 @@ def _set_service_port(service_cfg: dict[str, Any], *, port: int) -> None: service_cfg["in_container_port"] = int(port) +def _extract_master_listen_port(*, entrypoint: str) -> int | None: + match = re.search(r"(?m)^[ \t]*port:\s*(\d+)\s*$", entrypoint) + if match is None: + return None + return _require_port_number(match.group(1), "deployconf.service.master.entrypoint port") + + def _replace_expected_substring(*, value: str, old: str, new: str, ctx: str) -> str: if old in value: return value.replace(old, new) @@ -1400,7 +1413,7 @@ def _test_runner_ui_summary_from_cfg( "url": ui_cfg["url"], "probe_url": ui_cfg["probe_url"], "workdir": str(ui_cfg["workdir"]), - "log_path": str(ui_cfg["log_path"]), + "log_path": str(ui_cfg["active_log_path"]), "history_lookback_days": 
int(ui_cfg["history_lookback_days"]), "history_roots": [str(path) for path in ui_cfg["history_roots"]], "gitops_config_path": ( @@ -1461,7 +1474,8 @@ def _parse_test_runner_ui_config( _require_str(ui_cfg.get("gitops_config_path"), "test_runner_ui.gitops_config_path"), "test_runner_ui.gitops_config_path", ) - log_path = (workdir / TEST_RUNNER_UI_LOG_FILENAME).resolve() + log_path = (workdir.resolve() / TEST_RUNNER_UI_LOG_FILENAME).resolve() + active_log_path = log_shard.daily_sharded_log_path(log_path) return { "enabled": True, "host": host, @@ -1470,6 +1484,7 @@ def _parse_test_runner_ui_config( "probe_url": _test_runner_ui_probe_url(host=host, port=port), "workdir": workdir.resolve(), "log_path": log_path, + "active_log_path": active_log_path, "history_lookback_days": int(history_lookback_days), "history_roots": [path.resolve() for path in history_roots], "gitops_config_path": gitops_config_path.resolve() if gitops_config_path is not None else None, @@ -1498,6 +1513,23 @@ def _bare_service_runtime_log_path(*, local_node_cfg: dict[str, Any], service_na return root / "log" / f"{service_name}.log" +def _resolve_bare_service_readable_runtime_log_path( + *, + local_node_cfg: dict[str, Any], + service_name: str, +) -> Path | None: + runtime_log_path = _bare_service_runtime_log_path( + local_node_cfg=local_node_cfg, + service_name=service_name, + ) + if runtime_log_path is None: + return None + resolved_log_path = log_shard.resolve_readable_log_path(runtime_log_path) + if resolved_log_path is not None: + return resolved_log_path + return runtime_log_path + + def _test_runner_ui_health_payload(*, probe_url: str, timeout_seconds: float) -> dict[str, Any] | None: req = urllib.request.Request(probe_url.rstrip("/") + "/health", method="GET") try: @@ -1590,7 +1622,7 @@ def _ensure_test_runner_ui_started(*, ui_cfg: dict[str, Any]) -> dict[str, Any]: if ui_cfg["gitops_config_path"] is not None: argv.extend(["--gitops-config", str(ui_cfg["gitops_config_path"])]) - log_path = 
Path(ui_cfg["log_path"]).resolve() + log_path = Path(ui_cfg["active_log_path"]).resolve() log_path.parent.mkdir(parents=True, exist_ok=True) log_handle = log_path.open("a", encoding="utf-8") try: @@ -3385,7 +3417,7 @@ def _collect_bare_runtime_statuses( raise ValueError("bare_launch_result.bootstrap_log_path must be a Path") statuses: list[dict[str, Any]] = [] for service_name in expected_service_names: - runtime_log_path = _bare_service_runtime_log_path( + runtime_log_path = _resolve_bare_service_readable_runtime_log_path( local_node_cfg=local_node_cfg, service_name=service_name, ) diff --git a/fluxon_test_stack/test_runner.py b/fluxon_test_stack/test_runner.py index d8cd1c9..2236be5 100644 --- a/fluxon_test_stack/test_runner.py +++ b/fluxon_test_stack/test_runner.py @@ -37,6 +37,11 @@ import yaml +RUNNER_REPO_ROOT = Path(__file__).resolve().parent.parent +RUNNER_DEPLOYMENT_DIR = RUNNER_REPO_ROOT / "deployment" +RUNNER_TEMPLATE_DIR = (RUNNER_REPO_ROOT / "fluxon_test_stack" / "test_runner_templates").resolve() +sys.path.insert(0, str(RUNNER_DEPLOYMENT_DIR)) + from benchmark_role_names import ( KV_NODE_ROLE_SEED, KV_NODE_ROLE_WORKER, @@ -51,6 +56,7 @@ run_top_attention_entries, select_top_attention_entries, ) +from utils import log_shard from test_runner_ci_runtime import ( _assert_ci_runtime_python_abi as _assert_ci_runtime_python_abi_impl, _ci_runtime_python_abi as _ci_runtime_python_abi_impl, @@ -329,10 +335,10 @@ def _test_stack_mode_requires_kv_master(mode: str) -> bool: "workloads may still be stopping", ) _WAIT_DELETE_APPLY_REQUIRES_DELETE_ERR = "wait_delete_apply requires delete_apply first" -RUNNER_REPO_ROOT = Path(__file__).resolve().parent.parent RUNNER_SHARED_RUNTIME_DIR = (RUNNER_REPO_ROOT / "fluxon_test_stack" / "test_runner").resolve() RUNNER_SHARED_LOCK_DIR = (RUNNER_SHARED_RUNTIME_DIR / "locks").resolve() RUNNER_STDIO_LOG_FILENAME = "test_runner.log" +_SERVICE_LOG_RETENTION_DAYS = log_shard.DEFAULT_DAILY_LOG_RETENTION_DAYS 
_ACTIVE_TEST_BED_SELECTION_SUPERVISOR_CHECK_CACHE_KEY: Optional[str] = None # TEST_STACK coordinator uses a stable workload name across cases; if a previous run crashed @@ -401,7 +407,9 @@ def _runner_native_ci_scene_ids() -> Tuple[str, ...]: return ( "ci_top_attention_doc_page_build", "ci_top_attention_bin_kvtest", + "ci_top_attention_log_mgmt", "ci_top_attention_mq_core", + "ci_top_attention_log_mgmt", ) @@ -455,6 +463,7 @@ def _scene_id_uses_runner_native_ci_commands(scene_id: str) -> bool: _RUNNER_STDIO_LOG_FP: Optional[Any] = None _RUNNER_STDIO_KEEPALIVE_FDS: Optional[Tuple[int, int]] = None _RUNNER_STDIO_MIRROR_THREAD: Optional[threading.Thread] = None +_RUNNER_STDIO_ROUTER_THREAD: Optional[threading.Thread] = None _CI_WAIT_HEARTBEAT_INTERVAL_SECONDS = 15.0 _CI_WAIT_TAIL_MAX_CHARS = 8000 _TEST_RUNNER_UI_MAX_LOG_CHUNK_BYTES = 1024 * 1024 @@ -479,6 +488,49 @@ def _ci_log_prefix_lines(text: str, *, now: Optional[float] = None) -> str: return _ci_log_prefix_lines_impl(text, now=now) +def _service_log_base_path(workdir_root: Path, *, filename: str) -> Path: + return (workdir_root / filename).resolve() + + +def _service_log_daily_path(base_path: Path, *, now: Optional[datetime.datetime] = None) -> Path: + return log_shard.daily_sharded_log_path(base_path, now=now) + + +def _service_log_latest_path(base_path: Path) -> Optional[Path]: + return log_shard.latest_existing_daily_sharded_log_path(base_path) + + +def _service_log_resolve_read_path(workdir_root: Path, *, filename: str) -> Optional[Path]: + base_path = _service_log_base_path(workdir_root, filename=filename) + return _service_log_resolve_read_path_from_base(base_path) + + +def _service_log_resolve_read_path_from_base(base_path: Path) -> Optional[Path]: + return log_shard.resolve_readable_log_path(base_path) + + +def _cleanup_old_service_logs(base_path: Path, *, retention_days: int = _SERVICE_LOG_RETENTION_DAYS) -> None: + log_shard.cleanup_old_daily_sharded_logs(base_path, retention_days=retention_days) + + 
+def _start_runner_stdio_log_router(*, base_log_path: Path, read_fd: int) -> None: + def _router_loop() -> None: + log_shard.relay_fd_to_daily_sharded_logs( + base_log_path=str(base_log_path), + read_fd=read_fd, + retention_days=_SERVICE_LOG_RETENTION_DAYS, + ) + + router = threading.Thread( + target=_router_loop, + name="test-runner-stdio-log-router", + daemon=True, + ) + router.start() + global _RUNNER_STDIO_ROUTER_THREAD + _RUNNER_STDIO_ROUTER_THREAD = router + + def _start_runner_stdio_log_mirror(*, log_path: Path, stdout_fd: int) -> None: global _RUNNER_STDIO_MIRROR_THREAD _RUNNER_STDIO_MIRROR_THREAD = _start_runner_stdio_log_mirror_impl( @@ -487,7 +539,11 @@ def _start_runner_stdio_log_mirror(*, log_path: Path, stdout_fd: int) -> None: ) -def _redirect_process_stdio_to_log(workdir_root: Path) -> None: +def _redirect_process_stdio_to_log( + workdir_root: Path, + *, + filename: str = RUNNER_STDIO_LOG_FILENAME, +) -> None: """Route runner stdio to a stable workdir log so long suites survive PTY loss. 
English note: @@ -501,7 +557,7 @@ def _redirect_process_stdio_to_log(workdir_root: Path) -> None: global _RUNNER_STDIO_KEEPALIVE_FDS _RUNNER_STDIO_LOG_FP, _RUNNER_STDIO_KEEPALIVE_FDS = _redirect_process_stdio_to_log_impl( workdir_root=workdir_root, - runner_stdio_log_filename=RUNNER_STDIO_LOG_FILENAME, + runner_stdio_log_filename=filename, stdio_log_fp=_RUNNER_STDIO_LOG_FP, stdio_keepalive_fds=_RUNNER_STDIO_KEEPALIVE_FDS, start_mirror=_start_runner_stdio_log_mirror, @@ -1277,26 +1333,16 @@ def _load_source_stack_contract() -> Dict[str, Any]: global_envs.get("FLUXON_CLUSTER_NAME"), "bootstrap source deployconf.global_envs.FLUXON_CLUSTER_NAME", ) - shared_memory_hostworkdir = _require_str( + share_mem_hostworkdir = _require_str( global_envs.get("FLUXON_SHARED_MEM"), "bootstrap source deployconf.global_envs.FLUXON_SHARED_MEM", ) - shared_file_hostworkdir = _require_str( - global_envs.get("FLUXON_SHARED_MEM2"), - "bootstrap source deployconf.global_envs.FLUXON_SHARED_MEM2", - ) _resolve_stack_contract_path( contract_hostworkdir, - shared_memory_hostworkdir, + share_mem_hostworkdir, field_name="bootstrap source deployconf.global_envs.FLUXON_SHARED_MEM", allow_absolute=True, ) - _resolve_stack_contract_path( - contract_hostworkdir, - shared_file_hostworkdir, - field_name="bootstrap source deployconf.global_envs.FLUXON_SHARED_MEM2", - allow_absolute=False, - ) source_bootstrap_cfg_path = _load_test_bed_bootstrap_config_path() source_bootstrap_cfg = _require_dict( @@ -1331,8 +1377,7 @@ def _load_source_stack_contract() -> Dict[str, Any]: # - /r/fs_s3/* proxy for downloading release artifacts "ops_controller_url": controller_url, "controller_basic_auth": controller_basic_auth, - "shared_memory_hostworkdir": shared_memory_hostworkdir, - "shared_file_hostworkdir": shared_file_hostworkdir, + "share_mem_hostworkdir": share_mem_hostworkdir, } @@ -1341,8 +1386,7 @@ def _write_ci_runtime_test_config( src_root: Path, etcd_address: str, cluster_name: str, - shared_memory_path: str, 
- shared_file_path: str, + share_mem_path: str, ) -> Path: """Materialize the single CI test authority consumed by fluxon_py integration tests. @@ -1361,8 +1405,7 @@ def _write_ci_runtime_test_config( "kv_svc_type": "fluxon", "etcd_address": str(etcd_address), "cluster_name": str(cluster_name), - "shared_memory_path": str(shared_memory_path), - "shared_file_path": str(shared_file_path), + "share_mem_path": str(share_mem_path), }, ) return test_cfg_path @@ -1992,21 +2035,16 @@ def _cluster_scoped_shared_dir(*, root_path: str, cluster_name: str) -> Path: def _shared_bundle_paths_for_cluster( *, - shared_memory_root: str, - shared_file_root: str, + share_mem_root: str, cluster_name: str, ) -> List[Path]: - shared_memory_dir = _cluster_scoped_shared_dir( - root_path=shared_memory_root, - cluster_name=cluster_name, - ) - shared_file_dir = _cluster_scoped_shared_dir( - root_path=shared_file_root, + share_mem_dir = _cluster_scoped_shared_dir( + root_path=share_mem_root, cluster_name=cluster_name, ) return [ - shared_file_dir / "shared.json", - shared_memory_dir / "mmap.file", + share_mem_dir / "shared.json", + share_mem_dir / "mmap.file", ] @@ -2020,35 +2058,28 @@ def _owner_target_slug(*, owner_target: str, ctx: str) -> str: def _owner_bundle_roots_for_target( *, - shared_memory_root: str, - shared_file_root: str, + share_mem_root: str, owner_target: str, ctx: str, -) -> Tuple[str, str]: +) -> str: owner_slug = _owner_target_slug(owner_target=owner_target, ctx=ctx) - return ( - str((Path(shared_memory_root) / owner_slug).resolve()), - str((Path(shared_file_root) / owner_slug).resolve()), - ) + return str((Path(share_mem_root) / owner_slug).resolve()) def _owner_bundle_paths_for_target( *, - shared_memory_root: str, - shared_file_root: str, + share_mem_root: str, cluster_name: str, owner_target: str, ctx: str, ) -> List[Path]: - owner_shared_memory_root, owner_shared_file_root = _owner_bundle_roots_for_target( - shared_memory_root=shared_memory_root, - 
shared_file_root=shared_file_root, + owner_share_mem_root = _owner_bundle_roots_for_target( + share_mem_root=share_mem_root, owner_target=owner_target, ctx=ctx, ) return _shared_bundle_paths_for_cluster( - shared_memory_root=owner_shared_memory_root, - shared_file_root=owner_shared_file_root, + share_mem_root=owner_share_mem_root, cluster_name=cluster_name, ) @@ -2164,7 +2195,7 @@ def _require_explicit_owner_group_processes_for_multi_owner_same_machine( raise ValueError( f"{ctx}.benchmark.owner_group_processes is required when external Fluxon KV owners share a machine: " f"machines={multi_owner_machines}. Without an explicit group size, benchmark nodes and owners " - "silently reuse the same shared_memory_path/shared_file_path roots, which invalidates owner binding." + "silently reuse the same share_mem_path roots, which invalidates owner binding." ) @@ -2177,13 +2208,9 @@ def _load_stack_identity(*, workdir_root: Path) -> Dict[str, Any]: contract.get("controller_basic_auth"), field_name="bootstrap_contract.controller_basic_auth", ) - shared_mem_hostworkdir = _require_str( - contract.get("shared_memory_hostworkdir"), - "bootstrap_contract.shared_memory_hostworkdir", - ) - shared_file_hostworkdir = _require_str( - contract.get("shared_file_hostworkdir"), - "bootstrap_contract.shared_file_hostworkdir", + share_mem_hostworkdir = _require_str( + contract.get("share_mem_hostworkdir"), + "bootstrap_contract.share_mem_hostworkdir", ) cluster_name = _suite_cluster_name_for_workdir(workdir_root) if cluster_name == ops_cluster_name: @@ -2198,18 +2225,12 @@ def _load_stack_identity(*, workdir_root: Path) -> Dict[str, Any]: "cluster_name": cluster_name, "controller_url": ops_controller_url, "controller_basic_auth": controller_basic_auth, - "shared_memory_path": _resolve_stack_contract_path( + "share_mem_path": _resolve_stack_contract_path( hostworkdir, - shared_mem_hostworkdir, - field_name="bootstrap_contract.shared_memory_hostworkdir", + share_mem_hostworkdir, + 
field_name="bootstrap_contract.share_mem_hostworkdir", allow_absolute=True, ), - "shared_file_path": _resolve_stack_contract_path( - hostworkdir, - shared_file_hostworkdir, - field_name="bootstrap_contract.shared_file_hostworkdir", - allow_absolute=False, - ), } @@ -2239,13 +2260,9 @@ def _build_runtime_token_mapping( stack_identity.get("controller_url"), "stack_identity.controller_url", ), - "__STACK_SHARED_MEMORY_PATH__": _require_str( - stack_identity.get("shared_memory_path"), - "stack_identity.shared_memory_path", - ), - "__STACK_SHARED_FILE_PATH__": _require_str( - stack_identity.get("shared_file_path"), - "stack_identity.shared_file_path", + "__STACK_SHARE_MEM_PATH__": _require_str( + stack_identity.get("share_mem_path"), + "stack_identity.share_mem_path", ), } if extra_tokens is not None: @@ -2791,105 +2808,17 @@ def _write_deployer_manifests(resolved_case: Dict[str, Any], run_dir: Path, *, a orig_argv = [cmd0] + args exec_cmd = " ".join(_shell_quote(x) for x in orig_argv) - # Generate a self-contained SigV4 GET downloader (Fluxon FS S3 gateway) and then exec the original argv. 
- bash_script = ( - "set -euo pipefail\n" - "python3 - <<'PY'\n" - "import datetime\n" - "import hashlib\n" - "import hmac\n" - "import os\n" - "import urllib.parse\n" - "import urllib.request\n" - "from pathlib import Path\n" - "\n" - f"BASE_URL = {s3_base_url!r}\n" - f"BUCKET = {s3_bucket!r}\n" - f"OBJECT_KEY = {object_key!r}\n" - f"DEST_PATH = {payload_dest_path_s!r}\n" - f"ACCESS_KEY = {s3_access_key!r}\n" - f"SECRET_KEY = {s3_secret_key!r}\n" - f"REGION = {s3_region!r}\n" - "\n" - "ALG = 'AWS4-HMAC-SHA256'\n" - "SERVICE = 's3'\n" - "TERM = 'aws4_request'\n" - "UNSIGNED = 'UNSIGNED-PAYLOAD'\n" - "\n" - "def _hmac_sha256(key: bytes, msg: bytes) -> bytes:\n" - " return hmac.new(key, msg, hashlib.sha256).digest()\n" - "\n" - "def _sha256_hex(msg: bytes) -> str:\n" - " return hashlib.sha256(msg).hexdigest()\n" - "\n" - "def _derive_signing_key(secret_key: str, scope_date: str, region: str) -> bytes:\n" - " k_date = _hmac_sha256(('AWS4' + secret_key).encode('utf-8'), scope_date.encode('utf-8'))\n" - " k_region = _hmac_sha256(k_date, region.encode('utf-8'))\n" - " k_service = _hmac_sha256(k_region, SERVICE.encode('utf-8'))\n" - " return _hmac_sha256(k_service, TERM.encode('utf-8'))\n" - "\n" - "def _sigv4_headers(*, method: str, signing_path: str, query: str, host: str, scope_date: str, amz_date: str, payload_hash: str) -> dict:\n" - " signed_headers = 'host;x-amz-content-sha256;x-amz-date'\n" - " canonical_headers = ''\n" - " canonical_headers += f'host:{host}\\n'\n" - " canonical_headers += f'x-amz-content-sha256:{payload_hash}\\n'\n" - " canonical_headers += f'x-amz-date:{amz_date}\\n'\n" - " canonical_request = '\\n'.join([method, signing_path, query, canonical_headers, signed_headers, payload_hash])\n" - " cr_hash = _sha256_hex(canonical_request.encode('utf-8'))\n" - " scope = f'{scope_date}/{REGION}/{SERVICE}/{TERM}'\n" - " string_to_sign = '\\n'.join([ALG, amz_date, scope, cr_hash])\n" - " signing_key = _derive_signing_key(SECRET_KEY, scope_date, REGION)\n" - 
" sig = hmac.new(signing_key, string_to_sign.encode('utf-8'), hashlib.sha256).hexdigest()\n" - " auth = f\"{ALG} Credential={ACCESS_KEY}/{scope}, SignedHeaders={signed_headers}, Signature={sig}\"\n" - " return {\n" - " 'Authorization': auth,\n" - " 'x-amz-date': amz_date,\n" - " 'x-amz-content-sha256': payload_hash,\n" - " 'Host': host,\n" - " }\n" - "\n" - "u = urllib.parse.urlparse(BASE_URL)\n" - "if u.scheme not in ('http', 'https'):\n" - " raise ValueError('BASE_URL must be http(s)')\n" - "if not u.netloc:\n" - " raise ValueError('BASE_URL missing host')\n" - "base_path = u.path.rstrip('/')\n" - "if base_path == '':\n" - " raise ValueError('BASE_URL must include a non-root path prefix (e.g. /fs_s3)')\n" - "\n" - "bucket_enc = urllib.parse.quote(BUCKET, safe='-_.~')\n" - "key_enc = urllib.parse.quote(OBJECT_KEY, safe='/-_.~')\n" - "full_path = base_path + '/' + bucket_enc + '/' + key_enc\n" - # Sign the *actual* client-visible request path (including s3_base_url path prefix, e.g. "/fs_s3"). 
- "signing_path = full_path\n" - "url = f'{u.scheme}://{u.netloc}{full_path}'\n" - "\n" - "now = datetime.datetime.utcnow()\n" - "amz_date = now.strftime('%Y%m%dT%H%M%SZ')\n" - "scope_date = now.strftime('%Y%m%d')\n" - "hdrs = _sigv4_headers(method='GET', signing_path=signing_path, query='', host=u.netloc, scope_date=scope_date, amz_date=amz_date, payload_hash=UNSIGNED)\n" - "\n" - "dest = Path(DEST_PATH)\n" - "dest.parent.mkdir(parents=True, exist_ok=True)\n" - "tmp = Path(str(dest) + '.tmp')\n" - "if tmp.exists():\n" - " tmp.unlink()\n" - "req = urllib.request.Request(url, method='GET')\n" - "for k, v in hdrs.items():\n" - " req.add_header(k, v)\n" - "with urllib.request.urlopen(req, timeout=60) as resp:\n" - " if getattr(resp, 'status', None) != 200:\n" - " body = resp.read(4096)\n" - " raise RuntimeError(f'download failed: status={getattr(resp, \"status\", None)} body={body!r}')\n" - " with tmp.open('wb') as f:\n" - " while True:\n" - " b = resp.read(1024 * 1024)\n" - " if not b:\n" - " break\n" - " f.write(b)\n" - "tmp.replace(dest)\n" - "PY\n" - f"exec {exec_cmd}\n" + # Keep the remote wrapper self-contained, but store it as a standalone template + # instead of hardcoding a long inline script in this Python source file. 
+ bash_script = _render_fluxon_fs_s3_payload_wrapper( + s3_base_url=s3_base_url, + s3_bucket=s3_bucket, + object_key=object_key, + payload_dest_path=payload_dest_path_s, + s3_access_key=s3_access_key, + s3_secret_key=s3_secret_key, + s3_region=s3_region, + exec_cmd=exec_cmd, ) # Deployer only consumes argv/cwd; container image is required by the YAML subset parser @@ -3723,30 +3652,19 @@ def _resolved_run_dir_path(resolved_case: Dict[str, Any]) -> Path: return Path(_require_str(runtime.get("run_dir"), "runtime.run_dir")).resolve() -def _ci_shared_memory_path(resolved_case: Dict[str, Any], *, run_dir: Path) -> str: +def _ci_share_mem_path(resolved_case: Dict[str, Any], *, run_dir: Path) -> str: runtime = _require_dict(resolved_case.get("runtime"), "resolved_case.runtime") stack_identity = _require_dict(runtime.get("stack_identity"), "resolved_case.runtime.stack_identity") - shared_memory_root = _require_str( - stack_identity.get("shared_memory_path"), - "resolved_case.runtime.stack_identity.shared_memory_path", + share_mem_root = _require_str( + stack_identity.get("share_mem_path"), + "resolved_case.runtime.stack_identity.share_mem_path", ) # English note: - # - iceoryx2 uses shared_memory_path as a base for per-node paths (e.g. .../nodes//iox2_/.service_tag). + # - iceoryx2 uses share_mem_path as a base for per-node paths (e.g. .../nodes//iox2_/.service_tag). # - The per-node suffix can be long, and some filesystems enforce a max path length of 255 bytes. - # - Therefore shared_memory_path must be short and must not embed run_dir (which can be deep under repo/workdir). 
- token = hashlib.sha256(str(run_dir.resolve()).encode("utf-8")).hexdigest()[:16] - return str((Path(shared_memory_root) / "ci" / token).resolve()) - - -def _ci_shared_file_path(resolved_case: Dict[str, Any], *, run_dir: Path) -> str: - runtime = _require_dict(resolved_case.get("runtime"), "resolved_case.runtime") - stack_identity = _require_dict(runtime.get("stack_identity"), "resolved_case.runtime.stack_identity") - shared_file_root = _require_str( - stack_identity.get("shared_file_path"), - "resolved_case.runtime.stack_identity.shared_file_path", - ) + # - Therefore share_mem_path must be short and must not embed run_dir (which can be deep under repo/workdir). token = hashlib.sha256(str(run_dir.resolve()).encode("utf-8")).hexdigest()[:16] - return str((Path(shared_file_root) / "ci" / token).resolve()) + return str((Path(share_mem_root) / "ci" / token).resolve()) def _ci_owner_shared_bundle_paths(run_dir: Path, *, owner_config_path: Path) -> List[Path]: @@ -3759,14 +3677,9 @@ def _ci_owner_shared_bundle_paths(run_dir: Path, *, owner_config_path: Path) -> fluxonkv_spec.get("cluster_name"), "ci_owner_0.yaml.fluxonkv_spec.cluster_name", ) - shm = _require_str(fluxonkv_spec.get("shared_memory_path"), "ci_owner_0.yaml.fluxonkv_spec.shared_memory_path") - shared_file = _require_str( - fluxonkv_spec.get("shared_file_path"), - "ci_owner_0.yaml.fluxonkv_spec.shared_file_path", - ) + shm = _require_str(fluxonkv_spec.get("share_mem_path"), "ci_owner_0.yaml.fluxonkv_spec.share_mem_path") return _shared_bundle_paths_for_cluster( - shared_memory_root=shm, - shared_file_root=shared_file, + share_mem_root=shm, cluster_name=cluster_name, ) @@ -3781,7 +3694,7 @@ def _wait_ci_owner_shared_bundle_ready_and_stage_shared_json( timeout_s: int, ) -> None: # English note: - # - `shared_memory_path` is host-local. When owner_0 runs on a remote node, the runner host + # - `share_mem_path` is host-local. 
When owner_0 runs on a remote node, the runner host # cannot see shared.json/mmap.file by filesystem path. # - CI execution already depends on the remote shared bundle being ready. Here we additionally # fetch shared.json back to a stable local path for determinism and postmortem. @@ -3802,8 +3715,7 @@ def _wait_ci_owner_shared_bundle_ready_and_stage_shared_json( required_str_keys = ( "owner_id", "cluster_name", - "shared_memory_path", - "shared_file_path", + "share_mem_path", "protocol_version", ) for k in required_str_keys: @@ -3823,17 +3735,11 @@ def _wait_ci_owner_shared_bundle_ready_and_stage_shared_json( f"expected={_ci_cluster_name(resolved_case)!r}" ) expected_shm_dir = str(mmap_file_path.parent.resolve()) - if meta.get("shared_memory_path") != expected_shm_dir: + if meta.get("share_mem_path") != expected_shm_dir: raise ValueError( - f"shared.json shared_memory_path mismatch: shared={meta.get('shared_memory_path')!r} " + f"shared.json share_mem_path mismatch: shared={meta.get('share_mem_path')!r} " f"expected={expected_shm_dir!r}" ) - expected_file_dir = str(shared_json_path.parent.resolve()) - if meta.get("shared_file_path") != expected_file_dir: - raise ValueError( - f"shared.json shared_file_path mismatch: shared={meta.get('shared_file_path')!r} " - f"expected={expected_file_dir!r}" - ) except Exception as exc: # noqa: BLE001 last_err = f"{type(exc).__name__}: {exc}" else: @@ -7027,6 +6933,18 @@ def _runner_native_ci_commands_for_case(case: _ResolvedCase, *, ctx: str) -> Lis "timeout_seconds": 21600, } ] + if scene_id == "ci_top_attention_log_mgmt": + return [ + { + "id": "top_attention_log_mgmt", + "command": ( + "__RUN_DIR__/venv/bin/python3 -u " + "__RUN_DIR__/src/fluxon_test_stack/top_attention_test_index/_log_mgmt.py " + "--case-config __RUN_DIR__/configs/ci_scene_config.yaml" + ), + "timeout_seconds": 21600, + } + ] if scene_id == "ci_top_attention_mq_core": return [ { @@ -8638,6 +8556,13 @@ def _test_stack_kv_owner_runtime_instance_key(*, 
runtime_instance_prefix: str, o return f"{runtime_instance_prefix}__kv_owner__{target_slug}" +def _fluxon_kv_owner_large_file_paths(*, owner_work_root: Path) -> List[str]: + # Owner mode always needs explicit large-file roots, even on surfaces that + # intentionally leave p2p_listen_port implicit. + root = owner_work_root.resolve() + return [str((root / "large").resolve())] + + def _build_test_stack_external_kv_owner_instances( *, scene_mode: str, @@ -8682,8 +8607,7 @@ def _build_test_stack_external_kv_owner_instances( stack_identity = _require_dict(runtime.get("stack_identity"), "runtime.stack_identity") cluster_name = _require_str(stack_identity.get("cluster_name"), "runtime.stack_identity.cluster_name") - shared_memory_root = _require_str(stack_identity.get("shared_memory_path"), "runtime.stack_identity.shared_memory_path") - shared_file_root = _require_str(stack_identity.get("shared_file_path"), "runtime.stack_identity.shared_file_path") + share_mem_root = _require_str(stack_identity.get("share_mem_path"), "runtime.stack_identity.share_mem_path") etcd_endpoints = _test_stack_etcd_addresses(resolved_case) master_port_offset = 0 owner_instances: List[Dict[str, Any]] = [] @@ -8692,6 +8616,8 @@ def _build_test_stack_external_kv_owner_instances( owner_target=target, ctx="external kv owner", ) + # TEST_STACK case-local owners use the compiled slot-based port plan so + # node runtimes in the same case can resolve stable owner peers. 
owner_p2p_listen_port = ( int(kv_p2p_port_base) + int(kv_p2p_port_stride) * int(run_index - 1) @@ -8704,15 +8630,15 @@ def _build_test_stack_external_kv_owner_instances( raise ValueError(f"computed owner_p2p_listen_port out of range: {owner_p2p_listen_port}") if owner_group_processes is None: - owner_shared_memory_path = shared_memory_root - owner_shared_file_path = shared_file_root + owner_share_mem_path = share_mem_root else: - owner_shared_memory_path, owner_shared_file_path = _owner_bundle_roots_for_target( - shared_memory_root=shared_memory_root, - shared_file_root=shared_file_root, + owner_share_mem_path = _owner_bundle_roots_for_target( + share_mem_root=share_mem_root, owner_target=target, ctx="runtime.stack_identity owner bundle roots", ) + owner_services_dir = run_dir / "services" / "kv_owner" / target_slug + owner_large_file_paths = _fluxon_kv_owner_large_file_paths(owner_work_root=owner_services_dir) owner_cfg = { "instance_key": _test_stack_kv_owner_runtime_instance_key( runtime_instance_prefix=runtime_instance_prefix, @@ -8723,8 +8649,8 @@ def _build_test_stack_external_kv_owner_instances( "fluxonkv_spec": { "etcd_addresses": list(etcd_endpoints), "cluster_name": cluster_name, - "shared_memory_path": owner_shared_memory_path, - "shared_file_path": owner_shared_file_path, + "share_mem_path": owner_share_mem_path, + "large_file_paths": owner_large_file_paths, "sub_cluster": FLUXON_KV_OWNER_SUB_CLUSTER, "p2p_listen_port": int(owner_p2p_listen_port), }, @@ -8747,7 +8673,6 @@ def _build_test_stack_external_kv_owner_instances( raise ValueError(f"test_stack owner config already exists (no overwrite): {owner_cfg_path}") _write_yaml_file(owner_cfg_path, owner_cfg) - owner_services_dir = run_dir / "services" / "kv_owner" / target_slug owner_services_dir.mkdir(parents=True, exist_ok=True) owner_inst = copy.deepcopy(coord_tpl) owner_inst["id"] = instance_id @@ -9242,21 +9167,16 @@ def _compile_test_stack_case(resolved_case: Dict[str, Any], *, run_index: int) - 
node_roles: List[str] = [] node_overrides: List[Dict[str, Any]] = [] stack_cluster_name: Optional[str] = None - stack_shared_memory_path: Optional[str] = None - stack_shared_file_path: Optional[str] = None + stack_share_mem_path: Optional[str] = None if backend_kind == TEST_STACK_BACKEND_FLUXON: stack_identity = _require_dict(runtime.get("stack_identity"), "runtime.stack_identity") stack_cluster_name = _require_str( stack_identity.get("cluster_name"), "runtime.stack_identity.cluster_name", ) - stack_shared_memory_path = _require_str( - stack_identity.get("shared_memory_path"), - "runtime.stack_identity.shared_memory_path", - ) - stack_shared_file_path = _require_str( - stack_identity.get("shared_file_path"), - "runtime.stack_identity.shared_file_path", + stack_share_mem_path = _require_str( + stack_identity.get("share_mem_path"), + "runtime.stack_identity.share_mem_path", ) rc = _require_dict(ts_profile.get("runtime_config"), "profile.test_stack.runtime_config") @@ -9956,8 +9876,7 @@ def _compile_test_stack_case(resolved_case: Dict[str, Any], *, run_index: int) - # Benchmark nodes bootstrap from owner shared bundles. Strict dual-owner mode # routes each process group to a different same-machine owner bundle root. 
assert stack_cluster_name is not None - assert stack_shared_memory_path is not None - assert stack_shared_file_path is not None + assert stack_share_mem_path is not None selected_owner_target = _test_stack_owner_target_for_node_process( target=target, process_idx=process_idx, @@ -9966,18 +9885,15 @@ def _compile_test_stack_case(resolved_case: Dict[str, Any], *, run_index: int) - owner_group_processes=owner_group_processes, ) if selected_owner_target is None: - selected_shared_memory_path = stack_shared_memory_path - selected_shared_file_path = stack_shared_file_path + selected_share_mem_path = stack_share_mem_path else: - selected_shared_memory_path, selected_shared_file_path = _owner_bundle_roots_for_target( - shared_memory_root=stack_shared_memory_path, - shared_file_root=stack_shared_file_path, + selected_share_mem_path = _owner_bundle_roots_for_target( + share_mem_root=stack_share_mem_path, owner_target=selected_owner_target, ctx=f"strict dual-owner routing target={target} process_idx={process_idx}", ) fluxonkv_override["cluster_name"] = stack_cluster_name - fluxonkv_override["shared_memory_path"] = selected_shared_memory_path - fluxonkv_override["shared_file_path"] = selected_shared_file_path + fluxonkv_override["share_mem_path"] = selected_share_mem_path fluxonkv_override["p2p_listen_port"] = int(kv_p2p_listen_port) kv["fluxonkv_spec"] = fluxonkv_override elif backend_kind == TEST_STACK_BACKEND_ALLUXIO: @@ -11642,6 +11558,51 @@ def _shell_quote(s: str) -> str: return "'" + s.replace("'", "'\\''") + "'" +def _json_string_literal(value: str) -> str: + return json.dumps(value, ensure_ascii=True) + + +def _render_runner_template(*, template_name: str, replacements: Dict[str, str]) -> str: + template_path = (RUNNER_TEMPLATE_DIR / template_name).resolve() + if template_path.parent != RUNNER_TEMPLATE_DIR: + raise ValueError(f"template must stay under {RUNNER_TEMPLATE_DIR}: {template_path}") + if not template_path.is_file(): + raise ValueError(f"missing runner 
template: {template_path}") + rendered = template_path.read_text(encoding="utf-8") + for token, value in replacements.items(): + rendered = rendered.replace(token, value) + unresolved = sorted(set(re.findall(r"__FLUXON_TMPL_[A-Z0-9_]+__", rendered))) + if unresolved: + raise ValueError(f"unresolved runner template tokens: {unresolved} template={template_path}") + return rendered + + +def _render_fluxon_fs_s3_payload_wrapper( + *, + s3_base_url: str, + s3_bucket: str, + object_key: str, + payload_dest_path: str, + s3_access_key: str, + s3_secret_key: str, + s3_region: str, + exec_cmd: str, +) -> str: + return _render_runner_template( + template_name="payload_fluxon_fs_s3_download_and_exec.sh.template", + replacements={ + "__FLUXON_TMPL_BASE_URL_JSON__": _json_string_literal(s3_base_url), + "__FLUXON_TMPL_BUCKET_JSON__": _json_string_literal(s3_bucket), + "__FLUXON_TMPL_OBJECT_KEY_JSON__": _json_string_literal(object_key), + "__FLUXON_TMPL_DEST_PATH_JSON__": _json_string_literal(payload_dest_path), + "__FLUXON_TMPL_ACCESS_KEY_JSON__": _json_string_literal(s3_access_key), + "__FLUXON_TMPL_SECRET_KEY_JSON__": _json_string_literal(s3_secret_key), + "__FLUXON_TMPL_REGION_JSON__": _json_string_literal(s3_region), + "__FLUXON_TMPL_EXEC_CMD__": exec_cmd, + }, + ) + + def _find_deploy_instance_opt(resolved_case: Dict[str, Any], *, instance_id: str) -> Optional[Dict[str, Any]]: deploy = _require_dict(resolved_case.get("deploy"), "resolved_case.deploy") @@ -13784,8 +13745,7 @@ def _ci_prepare_run_inputs( overlay_live_checkout: bool, etcd_address: str, cluster_name: str, - shared_memory_path: str, - shared_file_path: str, + share_mem_path: str, ) -> None: """Materialize CI run inputs from the case release into an isolated run_dir. 
@@ -13860,8 +13820,7 @@ def _ci_prepare_run_inputs( src_root=src_root, etcd_address=etcd_address, cluster_name=cluster_name, - shared_memory_path=shared_memory_path, - shared_file_path=shared_file_path, + share_mem_path=share_mem_path, ) release_link_path = src_root / "fluxon_release" _materialize_ci_runtime_release_view( @@ -13927,8 +13886,14 @@ def _write_ci_scene_config_yaml( def _write_ci_master_owner_configs( - resolved_case: Dict[str, Any], *, run_dir: Path, cluster_name: str, share_mem_path: str, share_file_path: str, owner_dram_bytes: int + resolved_case: Dict[str, Any], + *, + run_dir: Path, + cluster_name: str, + share_mem_path: str, + owner_dram_bytes: int, ) -> tuple[Path, Path]: + owner_work_root = run_dir / "services" / "owner_0" master_cfg = { "etcd_endpoints": ["__ETCD__"], "cluster_name": cluster_name, @@ -13955,8 +13920,11 @@ def _write_ci_master_owner_configs( "fluxonkv_spec": { "etcd_addresses": ["__ETCD__"], "cluster_name": cluster_name, - "shared_memory_path": share_mem_path, - "shared_file_path": share_file_path, + "share_mem_path": share_mem_path, + # Shared testbed / CI owners keep p2p_listen_port implicit so the + # runtime can bind a free host port, but owner mode still requires + # explicit large-file roots. 
+ "large_file_paths": _fluxon_kv_owner_large_file_paths(owner_work_root=owner_work_root), "sub_cluster": FLUXON_KV_OWNER_SUB_CLUSTER, }, } @@ -14227,7 +14195,6 @@ def _write_ci_runner_script( run_dir: Path, src_root: Path, share_mem_path: str, - share_file_path: str, ) -> Path: commands = _resolved_ci_command_list(resolved_case) venv_python = run_dir / "venv" / "bin" / "python3" @@ -14268,30 +14235,22 @@ def _write_ci_runner_script( readiness_probe_block = "" if requires_owner_shared_bundle: bundle_cluster_name = _ci_cluster_name(resolved_case) - bundle_shared_memory_dir = str( - _cluster_scoped_shared_dir(root_path=share_mem_path, cluster_name=bundle_cluster_name) - ) - bundle_shared_file_dir = str( - _cluster_scoped_shared_dir(root_path=share_file_path, cluster_name=bundle_cluster_name) - ) + bundle_dir = str(_cluster_scoped_shared_dir(root_path=share_mem_path, cluster_name=bundle_cluster_name)) shared_bundle_block = f""" echo "[ci_runner] waiting for owner shared bundle..." deadline=$(( $(date +%s) + {CI_RUNNER_SHARED_BUNDLE_TIMEOUT_S} )) -shm={bundle_shared_memory_dir} -shared_file={bundle_shared_file_dir} +share_mem={bundle_dir} while [ $(date +%s) -lt "$deadline" ]; do - if [ -f "$shared_file/shared.json" ] && [ -f "$shm/mmap.file" ]; then + if [ -f "$share_mem/shared.json" ] && [ -f "$share_mem/mmap.file" ]; then echo "[ci_runner] owner shared bundle ready" break fi sleep 1 done -if [ ! -f "$shared_file/shared.json" ] || [ ! -f "$shm/mmap.file" ]; then +if [ ! -f "$share_mem/shared.json" ] || [ ! 
-f "$share_mem/mmap.file" ]; then echo "[ci_runner] ERROR: owner shared bundle not ready in {CI_RUNNER_SHARED_BUNDLE_TIMEOUT_S}s" - echo "[ci_runner] shm=$shm" - echo "[ci_runner] shared_file=$shared_file" - ls -la "$shm" - ls -la "$shared_file" + echo "[ci_runner] share_mem=$share_mem" + ls -la "$share_mem" fail_and_exit 2 fi """ @@ -15002,13 +14961,9 @@ def _test_stack_external_owner_shared_bundle_paths( stack_identity.get("cluster_name"), "resolved_case.runtime.stack_identity.cluster_name", ) - shared_memory_path = _require_str( - stack_identity.get("shared_memory_path"), - "resolved_case.runtime.stack_identity.shared_memory_path", - ) - shared_file_path = _require_str( - stack_identity.get("shared_file_path"), - "resolved_case.runtime.stack_identity.shared_file_path", + share_mem_path = _require_str( + stack_identity.get("share_mem_path"), + "resolved_case.runtime.stack_identity.share_mem_path", ) if owner_target is not None: scale = _require_dict(resolved_case.get("scale"), "resolved_case.scale") @@ -15038,15 +14993,13 @@ def _test_stack_external_owner_shared_bundle_paths( ) if owner_group_processes is not None: return _owner_bundle_paths_for_target( - shared_memory_root=shared_memory_path, - shared_file_root=shared_file_path, + share_mem_root=share_mem_path, cluster_name=cluster_name, owner_target=owner_target, ctx="TEST_STACK owner shared bundle paths", ) return _shared_bundle_paths_for_cluster( - shared_memory_root=shared_memory_path, - shared_file_root=shared_file_path, + share_mem_root=share_mem_path, cluster_name=cluster_name, ) @@ -15728,7 +15681,9 @@ def _consume_path(path: Path) -> None: return _consume_path((workdir_root / "case_runs.yaml").resolve()) - _consume_path((workdir_root / RUNNER_STDIO_LOG_FILENAME).resolve()) + runner_log_path = _service_log_resolve_read_path(workdir_root, filename=RUNNER_STDIO_LOG_FILENAME) + if isinstance(runner_log_path, Path): + _consume_path(runner_log_path) run_dir = (_ui_case_result_root(workdir_root, case_id) / 
_ui_run_dir_name(run_index)).resolve() _consume_path(run_dir) @@ -15835,7 +15790,7 @@ def _ui_case_overview(workdir_root: Path, *, case_id: str) -> Dict[str, Any]: def _ui_collect_suite_overview(workdir_root: Path) -> Dict[str, Any]: case_ids = _ui_collect_case_ids(workdir_root) cases = [_ui_case_overview(workdir_root, case_id=case_id) for case_id in case_ids] - runner_log_path = (workdir_root / RUNNER_STDIO_LOG_FILENAME).resolve() + runner_log_path = _service_log_resolve_read_path(workdir_root, filename=RUNNER_STDIO_LOG_FILENAME) running_cases = [case for case in cases if case.get("status") == "RUNNING"] incomplete_cases = [case for case in cases if case.get("status") in {"INCOMPLETE", "RESERVED"}] last_updated_unix_s = 0 @@ -15858,7 +15813,7 @@ def _ui_collect_suite_overview(workdir_root: Path) -> Dict[str, Any]: return { "workdir_root": workdir_root.resolve(), "case_runs_path": (workdir_root / "case_runs.yaml").resolve(), - "runner_log_path": runner_log_path if runner_log_path.exists() else None, + "runner_log_path": runner_log_path if isinstance(runner_log_path, Path) and runner_log_path.exists() else None, "running_case_count": len(running_cases), "status": "RUNNING" if running_cases else ("INCOMPLETE" if incomplete_cases else ("IDLE" if cases else "EMPTY")), "last_updated_unix_s": int(last_updated_unix_s), @@ -15959,7 +15914,7 @@ def _ui_workdir_id(workdir_root: Path) -> str: def _ui_workdir_touch_unix_s(workdir_root: Path) -> int: touched = 0 - for name in ("case_runs.yaml", RUNNER_STDIO_LOG_FILENAME): + for name in ("case_runs.yaml",): path = (workdir_root / name).resolve() if not path.exists(): continue @@ -15967,6 +15922,12 @@ def _ui_workdir_touch_unix_s(workdir_root: Path) -> int: touched = max(touched, int(path.stat().st_mtime)) except Exception: continue + runner_log_path = _service_log_resolve_read_path(workdir_root, filename=RUNNER_STDIO_LOG_FILENAME) + if isinstance(runner_log_path, Path) and runner_log_path.exists(): + try: + touched = 
max(touched, int(runner_log_path.stat().st_mtime)) + except Exception: + pass return int(touched) @@ -17405,8 +17366,11 @@ def _handle_api_log_chunk(self, parsed) -> None: self._send_json(400, {"error": "missing workdir_id"}) return suite_workdir = _ui_workdir_by_id(workdir_root, workdir_id, extra_history_roots) - path = (suite_workdir / RUNNER_STDIO_LOG_FILENAME).resolve() - if not path.exists(): + path = _service_log_resolve_read_path( + suite_workdir, + filename=RUNNER_STDIO_LOG_FILENAME, + ) + if not isinstance(path, Path) or not path.exists(): raise FileNotFoundError(f"runner log not found: {path}") elif kind == "run": workdir_id = (qs.get("workdir_id") or [""])[0] diff --git a/fluxon_test_stack/test_runner_runtime_backend.py b/fluxon_test_stack/test_runner_runtime_backend.py index bc46a76..14a85e4 100644 --- a/fluxon_test_stack/test_runner_runtime_backend.py +++ b/fluxon_test_stack/test_runner_runtime_backend.py @@ -64,10 +64,8 @@ def _prepare_ci_case( services_root = (run_dir / "services").resolve() services_root.mkdir(parents=True, exist_ok=True) (services_root / "share_mem").mkdir(parents=True, exist_ok=True) - share_mem_path = ctx._ci_shared_memory_path(resolved_case, run_dir=run_dir) - share_file_path = ctx._ci_shared_file_path(resolved_case, run_dir=run_dir) + share_mem_path = ctx._ci_share_mem_path(resolved_case, run_dir=run_dir) Path(share_mem_path).mkdir(parents=True, exist_ok=True) - Path(share_file_path).mkdir(parents=True, exist_ok=True) venv_python = ctx._create_ci_runtime_venv(run_dir=run_dir) @@ -83,8 +81,7 @@ def _prepare_ci_case( overlay_live_checkout=True, etcd_address=f"{ctx._ci_base_runtime_service_target_ip(resolved_case, service_id='etcd')}:{ctx._ci_base_runtime_service_port(resolved_case, service_id='etcd')}", cluster_name=out_cluster_name, - shared_memory_path=share_mem_path, - shared_file_path=share_file_path, + share_mem_path=share_mem_path, ) prepare_env_exports = ctx._run_ci_prepare_steps( @@ -108,7 +105,6 @@ def _prepare_ci_case( 
run_dir=run_dir, cluster_name=out_cluster_name, share_mem_path=share_mem_path, - share_file_path=share_file_path, owner_dram_bytes=owner_dram_bytes, ) _ = ctx._write_ci_runner_script( @@ -116,7 +112,6 @@ def _prepare_ci_case( run_dir=run_dir, src_root=src_root, share_mem_path=share_mem_path, - share_file_path=share_file_path, ) ci_runner_exit_code_path = (run_dir / "logs" / "ci_runner" / "exit_code.txt").resolve() ci_runner_exit_code_baseline = ctx._observe_file_state(ci_runner_exit_code_path) @@ -196,8 +191,7 @@ def _prepare_test_stack_case( "resolved_case.profile.test_stack.kind", ) owner_instance_ids: List[str] = [] - shared_memory_path: Optional[str] = None - shared_file_path: Optional[str] = None + share_mem_path: Optional[str] = None stack_cluster_name: Optional[str] = None if ctx._test_stack_backend_uses_dedicated_kv_owners(backend_kind=backend_kind, mode=mode): runtime = ctx._require_dict(resolved_case.get("runtime"), "resolved_case.runtime") @@ -216,13 +210,9 @@ def _prepare_test_stack_case( stack_identity.get("cluster_name"), "runtime.stack_identity.cluster_name", ) - shared_memory_path = ctx._require_str( - stack_identity.get("shared_memory_path"), - "runtime.stack_identity.shared_memory_path", - ) - shared_file_path = ctx._require_str( - stack_identity.get("shared_file_path"), - "runtime.stack_identity.shared_file_path", + share_mem_path = ctx._require_str( + stack_identity.get("share_mem_path"), + "runtime.stack_identity.share_mem_path", ) ctx._converge_test_stack_external_owner_shared_bundle_cleanup( resolved_case, @@ -302,7 +292,7 @@ def _prepare_test_stack_case( ctx="TEST_STACK prepare", ) if ctx._test_stack_backend_uses_external_fluxon_kv(backend_kind=backend_kind, mode=mode): - if shared_memory_path is None or shared_file_path is None or stack_cluster_name is None: + if share_mem_path is None or stack_cluster_name is None: raise ValueError( "internal error: TEST_STACK shared bundle identity is missing after pre-deploy cleanup" ) diff --git 
a/fluxon_test_stack/test_runner_templates/payload_fluxon_fs_s3_download_and_exec.sh.template b/fluxon_test_stack/test_runner_templates/payload_fluxon_fs_s3_download_and_exec.sh.template new file mode 100644 index 0000000..ca677bc --- /dev/null +++ b/fluxon_test_stack/test_runner_templates/payload_fluxon_fs_s3_download_and_exec.sh.template @@ -0,0 +1,108 @@ +set -euo pipefail +python3 - <<'PY' +import datetime +import hashlib +import hmac +import urllib.parse +import urllib.request +from pathlib import Path + +BASE_URL = __FLUXON_TMPL_BASE_URL_JSON__ +BUCKET = __FLUXON_TMPL_BUCKET_JSON__ +OBJECT_KEY = __FLUXON_TMPL_OBJECT_KEY_JSON__ +DEST_PATH = __FLUXON_TMPL_DEST_PATH_JSON__ +ACCESS_KEY = __FLUXON_TMPL_ACCESS_KEY_JSON__ +SECRET_KEY = __FLUXON_TMPL_SECRET_KEY_JSON__ +REGION = __FLUXON_TMPL_REGION_JSON__ + +ALG = "AWS4-HMAC-SHA256" +SERVICE = "s3" +TERM = "aws4_request" +UNSIGNED = "UNSIGNED-PAYLOAD" + + +def _hmac_sha256(key: bytes, msg: bytes) -> bytes: + return hmac.new(key, msg, hashlib.sha256).digest() + + +def _sha256_hex(msg: bytes) -> str: + return hashlib.sha256(msg).hexdigest() + + +def _derive_signing_key(secret_key: str, scope_date: str, region: str) -> bytes: + k_date = _hmac_sha256(("AWS4" + secret_key).encode("utf-8"), scope_date.encode("utf-8")) + k_region = _hmac_sha256(k_date, region.encode("utf-8")) + k_service = _hmac_sha256(k_region, SERVICE.encode("utf-8")) + return _hmac_sha256(k_service, TERM.encode("utf-8")) + + +def _sigv4_headers(*, method: str, signing_path: str, query: str, host: str, scope_date: str, amz_date: str, payload_hash: str) -> dict: + signed_headers = "host;x-amz-content-sha256;x-amz-date" + canonical_headers = "" + canonical_headers += f"host:{host}\n" + canonical_headers += f"x-amz-content-sha256:{payload_hash}\n" + canonical_headers += f"x-amz-date:{amz_date}\n" + canonical_request = "\n".join([method, signing_path, query, canonical_headers, signed_headers, payload_hash]) + cr_hash = 
_sha256_hex(canonical_request.encode("utf-8")) + scope = f"{scope_date}/{REGION}/{SERVICE}/{TERM}" + string_to_sign = "\n".join([ALG, amz_date, scope, cr_hash]) + signing_key = _derive_signing_key(SECRET_KEY, scope_date, REGION) + sig = hmac.new(signing_key, string_to_sign.encode("utf-8"), hashlib.sha256).hexdigest() + auth = f"{ALG} Credential={ACCESS_KEY}/{scope}, SignedHeaders={signed_headers}, Signature={sig}" + return { + "Authorization": auth, + "x-amz-date": amz_date, + "x-amz-content-sha256": payload_hash, + "Host": host, + } + + +u = urllib.parse.urlparse(BASE_URL) +if u.scheme not in ("http", "https"): + raise ValueError("BASE_URL must be http(s)") +if not u.netloc: + raise ValueError("BASE_URL missing host") +base_path = u.path.rstrip("/") +if base_path == "": + raise ValueError("BASE_URL must include a non-root path prefix (e.g. /fs_s3)") + +bucket_enc = urllib.parse.quote(BUCKET, safe="-_.~") +key_enc = urllib.parse.quote(OBJECT_KEY, safe="/-_.~") +full_path = base_path + "/" + bucket_enc + "/" + key_enc +signing_path = full_path +url = f"{u.scheme}://{u.netloc}{full_path}" + +now = datetime.datetime.utcnow() +amz_date = now.strftime("%Y%m%dT%H%M%SZ") +scope_date = now.strftime("%Y%m%d") +hdrs = _sigv4_headers( + method="GET", + signing_path=signing_path, + query="", + host=u.netloc, + scope_date=scope_date, + amz_date=amz_date, + payload_hash=UNSIGNED, +) + +dest = Path(DEST_PATH) +dest.parent.mkdir(parents=True, exist_ok=True) +tmp = Path(str(dest) + ".tmp") +if tmp.exists(): + tmp.unlink() +req = urllib.request.Request(url, method="GET") +for k, v in hdrs.items(): + req.add_header(k, v) +with urllib.request.urlopen(req, timeout=60) as resp: + if getattr(resp, "status", None) != 200: + body = resp.read(4096) + raise RuntimeError(f'download failed: status={getattr(resp, "status", None)} body={body!r}') + with tmp.open("wb") as f: + while True: + b = resp.read(1024 * 1024) + if not b: + break + f.write(b) +tmp.replace(dest) +PY +exec 
__FLUXON_TMPL_EXEC_CMD__ diff --git a/fluxon_test_stack/test_runner_ui.py b/fluxon_test_stack/test_runner_ui.py index 9702da4..d7c6ac2 100644 --- a/fluxon_test_stack/test_runner_ui.py +++ b/fluxon_test_stack/test_runner_ui.py @@ -53,6 +53,10 @@ def main() -> None: raw_path=Path(args.workdir), field_name="workdir", ) + test_runner._redirect_process_stdio_to_log( + workdir_root, + filename="test_runner_ui.log", + ) gitops_cfg_path = None if args.gitops_config: gitops_cfg_path = test_runner._resolve_repo_root_cli_path( diff --git a/fluxon_test_stack/tests/test_ci_2_virt_node_contract.py b/fluxon_test_stack/tests/test_ci_2_virt_node_contract.py index 4392be6..6ebbecd 100644 --- a/fluxon_test_stack/tests/test_ci_2_virt_node_contract.py +++ b/fluxon_test_stack/tests/test_ci_2_virt_node_contract.py @@ -29,13 +29,14 @@ def _load_module(): class TestCi2VirtNodeContract(unittest.TestCase): _KVTEST_SCENE_ID = "ci_top_attention_bin_kvtest" _DOC_SCENE_ID = "ci_top_attention_doc_page_build" + _LOG_MGMT_SCENE_ID = "ci_top_attention_log_mgmt" _MQ_SCENE_ID = "ci_top_attention_mq_core" def test_generated_suite_is_public_dual_local_nodes_ci_only(self) -> None: suite_cfg = _ENTRY._load_yaml_mapping(_ENTRY.DEFAULT_SUITE_PATH, ctx="suite") generated = _ENTRY._rewrite_suite_for_local_dual_nodes( suite_cfg=suite_cfg, - scene_ids=[self._DOC_SCENE_ID, self._KVTEST_SCENE_ID], + scene_ids=[self._DOC_SCENE_ID, self._KVTEST_SCENE_ID, self._LOG_MGMT_SCENE_ID, self._MQ_SCENE_ID], primary_node_name="local-node-a", secondary_node_name="local-node-b", host_ip="10.1.1.119", @@ -44,7 +45,10 @@ def test_generated_suite_is_public_dual_local_nodes_ci_only(self) -> None: ) self.assertEqual(generated["run"]["selectors"]["profile_ids"], ["fluxon_tcp_thread"]) - self.assertEqual(set(generated["scenes"].keys()), {self._DOC_SCENE_ID, self._KVTEST_SCENE_ID}) + self.assertEqual( + set(generated["scenes"].keys()), + {self._DOC_SCENE_ID, self._KVTEST_SCENE_ID, self._LOG_MGMT_SCENE_ID, self._MQ_SCENE_ID}, + ) 
self.assertEqual(generated["profiles"]["fluxon_tcp_thread"]["artifact_set"], "fluxon_tcp_thread") self.assertEqual( generated["profiles"]["fluxon_tcp_thread"]["runtime"]["ci"]["scene_configs"][self._KVTEST_SCENE_ID][ @@ -52,6 +56,16 @@ def test_generated_suite_is_public_dual_local_nodes_ci_only(self) -> None: ], "tcp_thread_transport", ) + self.assertEqual( + generated["profiles"]["fluxon_tcp_thread"]["runtime"]["ci"]["scene_configs"][self._LOG_MGMT_SCENE_ID][ + "enabled" + ], + True, + ) + self.assertEqual( + generated["profiles"]["fluxon_tcp_thread"]["runtime"]["ci"]["scene_configs"][self._MQ_SCENE_ID], + {}, + ) self.assertEqual( generated["profiles"]["fluxon_tcp_thread"]["runtime"]["ci"]["deploy"]["target_ip_map"], {"local-node-a": "10.1.1.119", "local-node-b": "10.1.1.119"}, @@ -114,11 +128,21 @@ def test_generated_suite_is_public_dual_local_nodes_ci_only(self) -> None: generated["scenes"][self._KVTEST_SCENE_ID]["select"]["scales"], ["n1_kvowner_dram_20gib"], ) + self.assertEqual( + generated["scenes"][self._LOG_MGMT_SCENE_ID]["select"]["scales"], + ["n1_kvowner_dram_20gib"], + ) + self.assertEqual( + generated["scenes"][self._MQ_SCENE_ID]["select"]["scales"], + ["n1_kvowner_dram_20gib"], + ) self.assertEqual( set(generated["scales"].keys()), {"n1_kvowner_dram_3gib", "n1_kvowner_dram_20gib"}, ) self.assertNotIn("commands", generated["scenes"][self._KVTEST_SCENE_ID]["ci"]) + self.assertNotIn("commands", generated["scenes"][self._LOG_MGMT_SCENE_ID]["ci"]) + self.assertNotIn("commands", generated["scenes"][self._MQ_SCENE_ID]["ci"]) def test_generated_suite_supports_mq_core_ci_scene(self) -> None: suite_cfg = _ENTRY._load_yaml_mapping(_ENTRY.DEFAULT_SUITE_PATH, ctx="suite") @@ -251,7 +275,23 @@ def test_generated_deployconf_rewrites_to_dual_local_nodes(self) -> None: self.assertIn('--wheel "$FLUXON_RELEASE_WHEEL"', generated["global_envs"]["FLUXON_RELEASE_WHEEL_FETCH_CMD"]) self.assertEqual(generated["atomic_groups"]["fluxon_core_controller"]["nodes"], 
["local-node-a", "local-node-b"]) self.assertEqual(generated["service"]["owner"]["node_bind"]["node"], ["local-node-a", "local-node-b"]) + self.assertIn( + 'large_file_paths:', + generated["service"]["owner"]["entrypoint"], + ) + self.assertIn( + '- "${HOSTWORKDIR}/large/owner_${NODE_ID}"', + generated["service"]["owner"]["entrypoint"], + ) self.assertEqual(generated["service"]["ops_controller"]["port"], 19180) + self.assertIn( + 'http_listen_addr: "0.0.0.0:${OPS_CONTROLLER__PORT}"', + generated["service"]["ops_controller"]["entrypoint"], + ) + self.assertNotIn( + 'http_listen_addr: "0.0.0.0:${MASTER__PORT}"', + generated["service"]["ops_controller"]["entrypoint"], + ) self.assertIn("local-node-a", generated["service"]["ops_agent"]["entrypoint"]) self.assertIn("local-node-b", generated["service"]["ops_agent"]["entrypoint"]) self.assertIn(' - "10.1.1.119/32"', generated["service"]["master"]["entrypoint"]) @@ -348,6 +388,54 @@ def fake_run(argv: list[str], *, env=None) -> None: self.assertIn(str(env_path.resolve()), calls[0]) self.assertIn(str((root / "pack_release_runtime").resolve()), calls[0]) + def test_render_ci_nix_pack_config_sets_explicit_project_root(self) -> None: + with tempfile.TemporaryDirectory() as td: + root = Path(td) + static_config_path = root / "static.yaml" + env_companion_path = root / "env.yaml" + out_path = root / "generated" / "setup_and_pack" / "nix" / "pack_fluxonkv_pylib_ci.yaml" + + _ENTRY._write_yaml( + static_config_path, + { + "schema_version": 1, + "runtime": { + "base_system": "manylinux_2_28", + "architectures": ["x86_64"], + "python_abi": "cpython3.10", + }, + "profile": { + "source_kind": "bridge_prebuilt", + "native_runtime_dir_names": ["cxxpacked"], + "target_support_dir_names": ["meson-0.64.0"], + "ext_bundle_dir_name": "cxxpacked", + }, + "assembly": { + "baseline_path": "/tmp/baseline", + }, + }, + ) + _ENTRY._write_yaml( + env_companion_path, + { + "host_paths": { + "root_path": "/tmp/project-data", + }, + }, + ) + + 
rendered_path = _ENTRY._render_ci_nix_pack_config( + static_config_path=static_config_path, + env_companion_path=env_companion_path, + out_path=out_path, + repo_root=REPO_ROOT, + ) + + self.assertEqual(rendered_path, out_path.resolve()) + rendered_cfg = _ENTRY._load_yaml_mapping(rendered_path, ctx="rendered nix pack config") + self.assertEqual(rendered_cfg["project_root"], str(REPO_ROOT.resolve())) + self.assertEqual(rendered_cfg["profile"]["build_root_path"], str(REPO_ROOT.resolve())) + def test_prepare_pack_release_runtime_dirs_creates_expected_layout(self) -> None: with tempfile.TemporaryDirectory() as td: root = Path(td) / "pack_release_runtime" @@ -441,7 +529,7 @@ def test_main_supports_explicit_suite_path(self) -> None: suite_cfg["scenes"] = { key: value for key, value in suite_cfg["scenes"].items() - if key in (self._DOC_SCENE_ID, self._KVTEST_SCENE_ID, self._MQ_SCENE_ID) + if key in (self._DOC_SCENE_ID, self._KVTEST_SCENE_ID, self._LOG_MGMT_SCENE_ID, self._MQ_SCENE_ID) } suite_cfg["profiles"] = {"fluxon_tcp": suite_cfg["profiles"]["fluxon_tcp"]} suite_cfg["run"]["selectors"]["profile_ids"] = ["fluxon_tcp"] @@ -449,6 +537,8 @@ def test_main_supports_explicit_suite_path(self) -> None: suite_cfg["profiles"]["fluxon_tcp"]["runtime"]["ci"]["scene_configs"][self._DOC_SCENE_ID]["doc_site_base_url"] = ( "tele-ai.github.io/Fluxon" ) + suite_cfg["profiles"]["fluxon_tcp"]["runtime"]["ci"]["scene_configs"][self._LOG_MGMT_SCENE_ID]["enabled"] = True + suite_cfg["profiles"]["fluxon_tcp"]["runtime"]["ci"]["scene_configs"][self._MQ_SCENE_ID] = {} _ENTRY._write_yaml(suite_path, suite_cfg) release_dir = REPO_ROOT / "fluxon_release" release_dir.mkdir(parents=True, exist_ok=True) @@ -487,7 +577,7 @@ def test_main_supports_explicit_suite_path(self) -> None: ) self.assertEqual( set(generated_suite["scenes"].keys()), - {self._DOC_SCENE_ID, self._KVTEST_SCENE_ID, self._MQ_SCENE_ID}, + {self._DOC_SCENE_ID, self._KVTEST_SCENE_ID, self._LOG_MGMT_SCENE_ID, self._MQ_SCENE_ID}, ) 
self.assertEqual( generated_suite["profiles"]["fluxon_tcp_thread"]["runtime"]["ci"]["scene_configs"][self._KVTEST_SCENE_ID][ @@ -501,6 +591,12 @@ def test_main_supports_explicit_suite_path(self) -> None: ], "tele-ai.github.io/Fluxon", ) + self.assertEqual( + generated_suite["profiles"]["fluxon_tcp_thread"]["runtime"]["ci"]["scene_configs"][self._LOG_MGMT_SCENE_ID][ + "enabled" + ], + True, + ) self.assertEqual( generated_suite["profiles"]["fluxon_tcp_thread"]["runtime"]["ci"]["scene_configs"][self._MQ_SCENE_ID], {}, @@ -610,6 +706,60 @@ def fake_run(argv: list[str], *, env=None) -> None: str((REPO_ROOT / "fluxon_test_stack" / "pack_test_stack_rsc.py").resolve()), ) + def test_main_passes_explicit_release_dir_to_pack_stage(self) -> None: + with tempfile.TemporaryDirectory() as td: + root = Path(td) + workdir = root / "ci_2_virt_node_workdir" + hostworkdir = root / "hostworkdir" + release_dir = root / "custom_release" + release_dir.mkdir(parents=True, exist_ok=True) + wheel_path = release_dir / "fluxon-0.2.1-cp38-abi3-manylinux_2_28_x86_64.whl" + wheel_path.write_text("", encoding="utf-8") + calls: list[tuple[list[str], dict[str, str] | None]] = [] + + def fake_run(argv: list[str], *, env=None) -> None: + calls.append((list(argv), None if env is None else dict(env))) + + argv = [ + "ci_2_virt_node.py", + "--workdir", + str(workdir), + "--testbed-hostworkdir", + str(hostworkdir), + "--release-dir", + str(release_dir), + "--scene-id", + self._KVTEST_SCENE_ID, + "--skip-builder-image", + "--skip-dispatch", + "--skip-start-testbed", + "--skip-runner", + ] + original_argv = sys.argv[:] + try: + with mock.patch.object(_ENTRY, "_run", side_effect=fake_run): + with mock.patch.object(_ENTRY, "_detect_local_hostname", return_value="runner-host"): + with mock.patch.object(_ENTRY, "_detect_local_ipv4", return_value="10.1.1.119"): + with mock.patch.object(_ENTRY, "_ensure_ci_pack_release_env", return_value=Path("/tmp/env.yaml")): + with mock.patch.object(_ENTRY, 
"_render_ci_nix_pack_config", return_value=Path("/tmp/cfg.yaml")): + sys.argv = argv + rc = _ENTRY.main() + finally: + sys.argv = original_argv + + self.assertEqual(rc, 0) + self.assertGreaterEqual(len(calls), 2) + pack_cmd = calls[1][0] + self.assertEqual( + pack_cmd[1], + str((REPO_ROOT / "fluxon_test_stack" / "pack_test_stack_rsc.py").resolve()), + ) + self.assertIn("--release-dir", pack_cmd) + self.assertEqual( + pack_cmd[pack_cmd.index("--release-dir") + 1], + str(release_dir.resolve()), + ) + def test_main_uses_apply_check_config_for_explicit_apply_validation(self) -> None: with tempfile.TemporaryDirectory() as td: root = Path(td) diff --git a/fluxon_test_stack/tests/test_pack_test_stack_rsc_cli.py b/fluxon_test_stack/tests/test_pack_test_stack_rsc_cli.py index 5f642a9..d87b3fa 100644 --- a/fluxon_test_stack/tests/test_pack_test_stack_rsc_cli.py +++ b/fluxon_test_stack/tests/test_pack_test_stack_rsc_cli.py @@ -261,14 +261,10 @@ def test_git_stage_ci_source_tree_excludes_runtime_outputs(self) -> None: "scripts/_build_doc_site_in_container_inner.py", "fluxon_doc_cn/roadmap.md", "README.md", - "fluxon_release/install.py", - ".dever/run.log", - "skills/demo/SKILL.md", ): path = repo_root / relpath path.parent.mkdir(parents=True, exist_ok=True) path.write_text("x\n", encoding="utf-8") - raw = b"\0".join( [ b"scripts/_build_doc_site_in_container_inner.py", @@ -308,6 +304,18 @@ def test_collect_ci_source_relpaths_excludes_runtime_outputs(self) -> None: path = repo_root / relpath path.parent.mkdir(parents=True, exist_ok=True) path.write_text("x\n", encoding="utf-8") + (repo_root / ".gitignore").write_text( + "\n".join( + [ + "fluxon_release/*", + "!fluxon_release/install.py", + ".dever", + "skills/", + ] + ) + + "\n", + encoding="utf-8", + ) raw = b"\0".join( [ @@ -320,13 +328,20 @@ def test_collect_ci_source_relpaths_excludes_runtime_outputs(self) -> None: ] ) + b"\0" - with mock.patch.object(_PACK.subprocess, "check_output", return_value=raw): + with 
mock.patch.object( + _PACK.collect_source_profile_relpaths.__globals__["git_source_selection_utils"].subprocess, + "check_output", + return_value=raw, + ): relpaths = _PACK._collect_ci_source_relpaths(repo_root=repo_root) self.assertEqual( relpaths, ["README.md", "fluxon_doc_cn/roadmap.md", "scripts/_build_doc_site_in_container_inner.py"], ) + self.assertNotIn("fluxon_release/install.py", relpaths) + self.assertNotIn(".dever/run.log", relpaths) + self.assertNotIn("skills/demo/SKILL.md", relpaths) def test_collect_ci_source_relpaths_includes_rather_no_git_submodule_sources(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: @@ -357,7 +372,11 @@ def fake_check_output(argv, cwd=None): return b"Cargo.toml\0src/lib.rs\0" raise AssertionError(f"unexpected git ls-files cwd: {cwd_path}") - with mock.patch.object(_PACK.subprocess, "check_output", side_effect=fake_check_output): + with mock.patch.object( + _PACK.collect_source_profile_relpaths.__globals__["git_source_selection_utils"].subprocess, + "check_output", + side_effect=fake_check_output, + ): relpaths = _PACK._collect_ci_source_relpaths(repo_root=repo_root) self.assertEqual( @@ -404,13 +423,13 @@ def test_collect_ci_source_relpaths_requires_rather_no_git_submodule_root_to_exi with ( mock.patch.object( - _PACK, - "_collect_git_listed_source_relpaths", + _PACK.collect_source_profile_relpaths.__globals__["git_source_selection_utils"], + "collect_git_listed_source_relpaths", return_value=["scripts/_build_doc_site_in_container_inner.py"], ), self.assertRaisesRegex( RuntimeError, - "requires configured rather_no_git_submodule path to exist", + "CI source pack requires configured rather_no_git_submodule path to exist", ), ): _PACK._collect_ci_source_relpaths(repo_root=repo_root) @@ -442,6 +461,54 @@ def test_compute_ci_source_digest_uses_selected_git_paths_only(self) -> None: digest_roots = digest_mock.call_args.args[0] self.assertEqual(digest_roots, [tracked.resolve()]) + def 
test_prune_stage_paths_applies_glob_patterns(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + stage_root = Path(tmpdir) + keep_path = stage_root / "keep.txt" + pyc_path = stage_root / "pkg" / "drop.pyc" + baseline_file = stage_root / "baselines" / "manifest.txt" + pyc_path.parent.mkdir(parents=True, exist_ok=True) + baseline_file.parent.mkdir(parents=True, exist_ok=True) + keep_path.write_text("keep\n", encoding="utf-8") + pyc_path.write_text("drop\n", encoding="utf-8") + baseline_file.write_text("drop\n", encoding="utf-8") + + _PACK.script_utils.prune_stage_paths( + stage_root, + ("*.pyc", "baselines/"), + ) + + self.assertTrue(keep_path.exists()) + self.assertFalse(pyc_path.exists()) + self.assertFalse(baseline_file.exists()) + + def test_shared_rsync_stage_accepts_exclude_patterns(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + repo_root = Path(tmpdir) + src = repo_root / "src" + dst = repo_root / "dst" + (src / "keep").mkdir(parents=True, exist_ok=True) + (src / "drop").mkdir(parents=True, exist_ok=True) + (src / "keep" / "a.txt").write_text("keep\n", encoding="utf-8") + (src / "drop" / "b.txt").write_text("drop\n", encoding="utf-8") + + run_mock = mock.Mock() + with mock.patch.dict( + _PACK.script_utils.rsync_stage.__globals__, + {"run_cmd_argv": run_mock}, + ): + _PACK.script_utils.rsync_stage( + repo_root=repo_root, + src=src, + dst=dst, + honor_gitignore=False, + exclude_rel_paths=("drop/", "*.tmp"), + ) + + argv = run_mock.call_args.args[0] + self.assertIn("--exclude=drop/", argv) + self.assertIn("--exclude=*.tmp", argv) + if __name__ == "__main__": raise SystemExit(unittest.main()) diff --git a/fluxon_test_stack/tests/test_runner_contract.py b/fluxon_test_stack/tests/test_runner_contract.py index 7c8fddd..f2e5a64 100644 --- a/fluxon_test_stack/tests/test_runner_contract.py +++ b/fluxon_test_stack/tests/test_runner_contract.py @@ -59,6 +59,10 @@ def _build_checks(selected_test_id: Optional[str]) -> List[Tuple[str, 
Callable[[ "ci_top_attention_doc_page_build_uses_online_docker_image", test_ci_top_attention_doc_page_build_uses_online_docker_image, ), + ( + "ci_top_attention_log_mgmt_scene_exists", + test_ci_top_attention_log_mgmt_scene_exists, + ), ( "ci_top_attention_mq_core_uses_cluster_kv_owner_runtime", test_ci_top_attention_mq_core_uses_cluster_kv_owner_runtime, @@ -236,6 +240,50 @@ def test_ci_top_attention_doc_page_build_uses_online_docker_image() -> None: return print("PASS: test_ci_top_attention_doc_page_build_uses_online_docker_image") +def test_ci_top_attention_log_mgmt_scene_exists() -> None: + repo_root = Path(__file__).resolve().parents[2] + suite_cfg_path = repo_root / "fluxon_test_stack" / "ci_test_list.yaml" + suite_cfg = yaml.safe_load(suite_cfg_path.read_text(encoding="utf-8")) + if not isinstance(suite_cfg, dict): + print("FAIL: test_ci_top_attention_log_mgmt_scene_exists - suite config is not a mapping") + return + + suite_for_contract = copy.deepcopy(suite_cfg) + artifact_sets = suite_for_contract.get("artifact_sets") + if not isinstance(artifact_sets, dict): + print("FAIL: test_ci_top_attention_log_mgmt_scene_exists - artifact_sets is not a mapping") + return + for artifact_set in artifact_sets.values(): + if not isinstance(artifact_set, dict): + continue + release_artifacts = artifact_set.get("release_artifacts") + if isinstance(release_artifacts, dict): + python_wheel = release_artifacts.get("python_wheel") + if isinstance(python_wheel, str) and python_wheel.strip(): + artifact_set["release_artifacts"] = {"wheel": python_wheel} + + suite = _TEST_RUNNER._parse_suite_config(suite_for_contract) + scene = suite.scenes.get("ci_top_attention_log_mgmt") + if not isinstance(scene, dict): + print("FAIL: test_ci_top_attention_log_mgmt_scene_exists - missing scene") + return + ci = scene.get("ci") + if not isinstance(ci, dict): + print("FAIL: test_ci_top_attention_log_mgmt_scene_exists - scene.ci missing") + return + if ci.get("subject") != "rust": + print( + 
"FAIL: test_ci_top_attention_log_mgmt_scene_exists - " + f"expected subject 'rust', got {ci.get('subject')!r}" + ) + return + if ci.get("runtime_contract") != "rust_self_managed": + print( + "FAIL: test_ci_top_attention_log_mgmt_scene_exists - " + f"expected runtime_contract 'rust_self_managed', got {ci.get('runtime_contract')!r}" + ) + return + print("PASS: test_ci_top_attention_log_mgmt_scene_exists") def test_ci_top_attention_mq_core_uses_cluster_kv_owner_runtime() -> None: repo_root = Path(__file__).resolve().parents[2] diff --git a/fluxon_test_stack/tests/test_test_runner_testbed_contract.py b/fluxon_test_stack/tests/test_test_runner_testbed_contract.py index 34bf640..86f41cb 100644 --- a/fluxon_test_stack/tests/test_test_runner_testbed_contract.py +++ b/fluxon_test_stack/tests/test_test_runner_testbed_contract.py @@ -38,6 +38,36 @@ def _load_module(): class TestTestRunnerTestbedContract(unittest.TestCase): + def test_write_ci_master_owner_configs_emits_owner_large_file_paths(self) -> None: + with tempfile.TemporaryDirectory() as td: + run_dir = Path(td) + resolved_case = { + "deploy": { + "instances": [ + {"id": "master", "deployer": {"target": "local-node-a"}}, + {"id": "owner_0", "deployer": {"target": "local-node-a"}}, + ], + "target_ip_map": {"local-node-a": "127.0.0.1"}, + } + } + + with mock.patch.object(_RUNNER, "_ci_base_runtime_service_target_ip", side_effect=["127.0.0.1", "127.0.0.1"]): + with mock.patch.object(_RUNNER, "_ci_base_runtime_service_port", side_effect=[19180, 19190]): + _, owner_path = _RUNNER._write_ci_master_owner_configs( + resolved_case, + run_dir=run_dir, + cluster_name="ci_cluster", + share_mem_path="/tmp/ci_shm", + owner_dram_bytes=1073741824, + ) + + owner_cfg = yaml.safe_load(owner_path.read_text(encoding="utf-8")) + self.assertEqual( + owner_cfg["fluxonkv_spec"]["large_file_paths"], + [str((run_dir / "services" / "owner_0" / "large").resolve())], + ) + self.assertNotIn("shared_file_path", owner_cfg["fluxonkv_spec"]) + def 
test_ci_runtime_python_executable_requires_python310_on_path(self) -> None: with mock.patch.object(_RUNNER.shutil, "which", return_value=None): with self.assertRaisesRegex(ValueError, "requires python3.10 on PATH"): @@ -210,6 +240,65 @@ def test_write_ci_scene_config_yaml_emits_structured_scene_config(self) -> None: self.assertEqual(payload["scene_runtime"]["etcd"], {"ip": "127.0.0.1", "port": 2379}) self.assertEqual(payload["scene_runtime"]["greptime"], {"ip": "127.0.0.1", "port": 4000}) + def test_generated_test_stack_owner_config_emits_large_file_paths(self) -> None: + with tempfile.TemporaryDirectory() as td: + run_dir = Path(td) + cfg_dir = run_dir / "configs" + cfg_dir.mkdir(parents=True) + owner_target = "local-node-a" + target_slug = "local-node-a" + runtime_instance_prefix = "case1" + coord_tpl = {"deployer": {"target": ""}} + cluster_nodes = {"local-node-a": {"python_abi": "cpython3.10"}} + resolved_case = { + "runtime": { + "run_dir": str(run_dir), + "stack_identity": { + "cluster_name": "bench_cluster", + "share_mem_path": "/tmp/bench_shm", + }, + } + } + + with mock.patch.object(_RUNNER, "_test_stack_runtime_required_python_abi", return_value="cpython3.10"): + with mock.patch.object(_RUNNER, "_test_stack_etcd_addresses", return_value=["127.0.0.1:19180"]): + with mock.patch.object(_RUNNER, "_test_stack_target_host_venv_python", return_value="/tmp/venv/bin/python3"): + with mock.patch.object(_RUNNER, "_test_stack_runtime_module_command", return_value="owner-cmd"): + owner_instances = _RUNNER._build_test_stack_external_kv_owner_instances( + scene_mode="bench", + resolved_case=resolved_case, + scale={"owner": {"owner_count": 1, "owner_dram_bytes": 1073741824}}, + runtime=resolved_case["runtime"], + run_dir=run_dir, + cfg_dir=cfg_dir, + coord_tpl=coord_tpl, + test_stack_runtime={}, + cluster_nodes=cluster_nodes, + owner_targets=[owner_target], + needs_kv_master=True, + kv_p2p_port_base=31000, + kv_p2p_port_stride=100, + kv_p2p_slot_offset=0, + 
p2p_ports_per_slot=10, + node_total=1, + run_index=1, + runtime_instance_prefix=runtime_instance_prefix, + kv_base={}, + test_spec_config={}, + perf_config=None, + runtime_env={}, + owner_group_processes=None, + owner_cpu_core_by_target={}, + ) + + self.assertEqual(len(owner_instances), 1) + owner_cfg_path = cfg_dir / f"test_stack_kv_owner__{target_slug}.yaml" + owner_cfg = yaml.safe_load(owner_cfg_path.read_text(encoding="utf-8")) + self.assertEqual( + owner_cfg["fluxonkv_spec"]["large_file_paths"], + [str((run_dir / "services" / "kv_owner" / target_slug / "large").resolve())], + ) + def test_ci_source_overlay_includes_fluxon_test_stack(self) -> None: self.assertIn("fluxon_test_stack", _RUNNER._CI_SOURCE_OVERLAY_ROOTS) self.assertNotIn("quartz_prewarm", _RUNNER._CI_SOURCE_OVERLAY_ROOTS) @@ -224,6 +313,31 @@ def test_top_attention_ci_execution_plan_is_runner_native(self) -> None: self.assertEqual(planned[0].ci_commands[0]["id"], "top_attention_bin_kvtest") self.assertIn("--case-config __RUN_DIR__/configs/ci_scene_config.yaml", planned[0].ci_commands[0]["command"]) + def test_top_attention_log_mgmt_ci_execution_plan_is_runner_native(self) -> None: + suite_cfg = yaml.safe_load((_RUNNER.RUNNER_REPO_ROOT / "fluxon_test_stack" / "ci_test_list.yaml").read_text(encoding="utf-8")) + artifact_sets = suite_cfg.get("artifact_sets") + if isinstance(artifact_sets, dict): + for artifact_set in artifact_sets.values(): + if not isinstance(artifact_set, dict): + continue + release_artifacts = artifact_set.get("release_artifacts") + if isinstance(release_artifacts, dict): + python_wheel = release_artifacts.get("python_wheel") + if isinstance(python_wheel, str) and python_wheel.strip(): + artifact_set["release_artifacts"] = {"wheel": python_wheel} + suite = _RUNNER._parse_suite_config(suite_cfg) + cases = _RUNNER._expand_cases(suite) + case = next(item for item in cases if item.scene_id == "ci_top_attention_log_mgmt" and item.profile_id == "fluxon_tcp") + planned = 
_RUNNER._build_ci_execution_plan(case, suite) + self.assertEqual(len(planned), 1) + self.assertEqual(planned[0].ci_commands[0]["id"], "top_attention_log_mgmt") + self.assertIn( + "__RUN_DIR__/src/fluxon_test_stack/top_attention_test_index/_log_mgmt.py", + + planned[0].ci_commands[0]["command"], + ) + self.assertIn("--case-config __RUN_DIR__/configs/ci_scene_config.yaml", planned[0].ci_commands[0]["command"]) + def test_top_attention_mq_core_ci_execution_plan_is_runner_native(self) -> None: suite_cfg = yaml.safe_load((_RUNNER.RUNNER_REPO_ROOT / "fluxon_test_stack" / "ci_test_list.yaml").read_text(encoding="utf-8")) suite = _RUNNER._parse_suite_config(suite_cfg) @@ -270,8 +384,7 @@ def test_ci_prepare_run_inputs_rebuilds_release_view_without_reusing_source_test "kv_svc_type: fluxon", "etcd_address: 127.0.0.1:2379", "cluster_name: fluxon-example-cluster", - "shared_memory_path: /tmp/fluxon-example-cluster/shm", - "shared_file_path: /tmp/fluxon-example-cluster/share", + "share_mem_path: /tmp/fluxon-example-cluster/shm", "", ] ), @@ -405,8 +518,7 @@ def test_ci_prepare_run_inputs_rebuilds_release_view_without_reusing_source_test overlay_live_checkout=False, etcd_address="127.0.0.1:32579", cluster_name="ci_case_cluster", - shared_memory_path="/tmp/ci_case_cluster/shm", - shared_file_path="/tmp/ci_case_cluster/share", + share_mem_path="/tmp/ci_case_cluster/shm", ) release_view_root = src_root / "fluxon_release" @@ -425,8 +537,7 @@ def test_ci_prepare_run_inputs_rebuilds_release_view_without_reusing_source_test "kv_svc_type": "fluxon", "etcd_address": "127.0.0.1:32579", "cluster_name": "ci_case_cluster", - "shared_memory_path": "/tmp/ci_case_cluster/shm", - "shared_file_path": "/tmp/ci_case_cluster/share", + "share_mem_path": "/tmp/ci_case_cluster/shm", }, ) assert_python_abi.assert_called_once_with(venv_python=venv_python) @@ -510,8 +621,7 @@ def test_ci_runner_script_sources_prepare_env_when_present(self) -> None: "ops_cluster_name": "fluxon_testbed", "cluster_name": 
"fluxon_testbed", "controller_url": "http://127.0.0.1:19080/r/ops/fluxon_testbed", - "shared_memory_path": "/tmp/shm", - "shared_file_path": "/tmp/share", + "share_mem_path": "/tmp/shm", }, "deploy_instances": { "case_runtime": [ @@ -535,7 +645,6 @@ def test_ci_runner_script_sources_prepare_env_when_present(self) -> None: run_dir=run_dir, src_root=src_root, share_mem_path="/tmp/shm", - share_file_path="/tmp/share", ) script_text = script_path.read_text(encoding="utf-8") self.assertIn('prepare_env_path="', script_text) @@ -760,8 +869,9 @@ def test_load_source_stack_contract_accepts_same_host_dual_local_hostworkdirs(se contract["ops_controller_url"], "http://127.0.0.1:19080/r/ops/fluxon_testbed", ) - self.assertEqual(contract["shared_memory_hostworkdir"], "${HOSTWORKDIR}/shm1") - self.assertEqual(contract["shared_file_hostworkdir"], "${HOSTWORKDIR}/shm2_files") + self.assertEqual(contract["share_mem_hostworkdir"], "${HOSTWORKDIR}/shm1") + self.assertNotIn("shared_memory_hostworkdir", contract) + self.assertNotIn("shared_file_hostworkdir", contract) def test_load_source_stack_contract_rejects_multi_hostworkdir_remote_layout(self) -> None: with tempfile.TemporaryDirectory() as td: @@ -886,6 +996,80 @@ def test_ci_base_runtime_service_target_ip_uses_loopback_for_same_host_local_nod "127.0.0.1", ) + def test_write_deployer_manifests_renders_payload_wrapper_from_template(self) -> None: + with tempfile.TemporaryDirectory() as td: + run_dir = Path(td) + resolved_case = { + "case": { + "case_id": "bench_case", + "profile_id": "bench_profile", + }, + "scene": { + "bench": { + "subject": "kv", + } + }, + "deploy": { + "instances": [ + { + "id": "worker_0", + "k8s_ref": "deployment/test-worker", + "lifecycle": "service", + "deployer": { + "target": "logic-a", + "payload_file": "wheelhouse/pkg.whl", + "payload_dest_path": "/tmp/run/pkg.whl", + "command": ["/bin/sh", "-lc", "python3 /tmp/run/pkg.whl"], + }, + } + ], + "payload_delivery": { + "kind": 
_RUNNER.PAYLOAD_DELIVERY_KIND_FLUXON_FS_S3, + "s3_base_url": "http://127.0.0.1:19080/fs_s3", + "bucket": "bench-bucket", + "access_key": "bench-ak", + "secret_key": "bench-sk", + "region": "bench-region", + "key_prefix": "case-prefix", + }, + }, + "runtime": { + "workdir_root": str(run_dir.parent), + "run_dir": str(run_dir), + "stack_identity": { + "cluster_name": "fluxon_testbed", + "controller_url": "http://127.0.0.1:19080/r/ops/fluxon_testbed", + "share_mem_path": "/tmp/shm", + }, + }, + "artifact_set": { + "release_root": str(run_dir / "fluxon_release"), + "test_rsc_root": str(run_dir / "test_rsc"), + }, + } + + template_path = ( + _RUNNER.RUNNER_TEMPLATE_DIR / "payload_fluxon_fs_s3_download_and_exec.sh.template" + ).resolve() + self.assertTrue(template_path.is_file()) + + _RUNNER._write_deployer_manifests(resolved_case, run_dir, allow_overwrite=False) + + manifest_docs = list( + yaml.safe_load_all((run_dir / "deployer_deploy.yaml").read_text(encoding="utf-8")) + ) + self.assertEqual(len(manifest_docs), 1) + container = manifest_docs[0]["spec"]["template"]["spec"]["containers"][0] + self.assertEqual(container["command"], ["/bin/bash", "-lc"]) + self.assertEqual(len(container["args"]), 1) + script_text = container["args"][0] + self.assertIn("python3 - <<'PY'", script_text) + self.assertIn('BASE_URL = "http://127.0.0.1:19080/fs_s3"', script_text) + self.assertIn('OBJECT_KEY = "case-prefix/wheelhouse/pkg.whl"', script_text) + self.assertIn('DEST_PATH = "/tmp/run/pkg.whl"', script_text) + self.assertIn('exec /bin/sh -lc', script_text) + self.assertNotIn("__FLUXON_TMPL_", script_text) + if __name__ == "__main__": raise SystemExit(unittest.main()) diff --git a/fluxon_test_stack/tests/test_test_runner_ui_contract.py b/fluxon_test_stack/tests/test_test_runner_ui_contract.py index ff407e2..2abc4ec 100644 --- a/fluxon_test_stack/tests/test_test_runner_ui_contract.py +++ b/fluxon_test_stack/tests/test_test_runner_ui_contract.py @@ -119,6 +119,8 @@ def 
test_redirect_process_stdio_starts_mirror_on_github_actions(self) -> None: workdir = Path(td) original_log_fp = _RUNNER._RUNNER_STDIO_LOG_FP original_keepalive = _RUNNER._RUNNER_STDIO_KEEPALIVE_FDS + saved_stdout = sys.stdout + saved_stderr = sys.stderr with mock.patch.dict(os.environ, {"GITHUB_ACTIONS": "true"}, clear=False): _RUNNER._RUNNER_STDIO_LOG_FP = None _RUNNER._RUNNER_STDIO_KEEPALIVE_FDS = (11, 12) @@ -129,10 +131,18 @@ def test_redirect_process_stdio_starts_mirror_on_github_actions(self) -> None: self.assertEqual(dup2_mock.call_count, 2) start_mirror.assert_called_once() kwargs = start_mirror.call_args.kwargs - self.assertEqual(kwargs["log_path"], (workdir / _RUNNER.RUNNER_STDIO_LOG_FILENAME).resolve()) + expected_log_path = _RUNNER._service_log_base_path( + workdir, filename=_RUNNER.RUNNER_STDIO_LOG_FILENAME + ) + self.assertEqual(kwargs["log_path"], expected_log_path) self.assertEqual(kwargs["stdout_fd"], 11) self.assertNotIn("stderr_fd", kwargs) - if _RUNNER._RUNNER_STDIO_LOG_FP is not None: + sys.stdout = saved_stdout + sys.stderr = saved_stderr + if _RUNNER._RUNNER_STDIO_LOG_FP is not None and _RUNNER._RUNNER_STDIO_LOG_FP not in ( + sys.__stdout__, + sys.__stderr__, + ): _RUNNER._RUNNER_STDIO_LOG_FP.close() _RUNNER._RUNNER_STDIO_LOG_FP = original_log_fp _RUNNER._RUNNER_STDIO_KEEPALIVE_FDS = original_keepalive @@ -142,6 +152,8 @@ def test_redirect_process_stdio_skips_mirror_outside_github_actions(self) -> Non workdir = Path(td) original_log_fp = _RUNNER._RUNNER_STDIO_LOG_FP original_keepalive = _RUNNER._RUNNER_STDIO_KEEPALIVE_FDS + saved_stdout = sys.stdout + saved_stderr = sys.stderr with mock.patch.dict(os.environ, {}, clear=True): _RUNNER._RUNNER_STDIO_LOG_FP = None _RUNNER._RUNNER_STDIO_KEEPALIVE_FDS = (11, 12) @@ -151,7 +163,12 @@ def test_redirect_process_stdio_skips_mirror_outside_github_actions(self) -> Non _RUNNER._redirect_process_stdio_to_log(workdir) self.assertEqual(dup2_mock.call_count, 2) start_mirror.assert_not_called() - if 
_RUNNER._RUNNER_STDIO_LOG_FP is not None: + sys.stdout = saved_stdout + sys.stderr = saved_stderr + if _RUNNER._RUNNER_STDIO_LOG_FP is not None and _RUNNER._RUNNER_STDIO_LOG_FP not in ( + sys.__stdout__, + sys.__stderr__, + ): _RUNNER._RUNNER_STDIO_LOG_FP.close() _RUNNER._RUNNER_STDIO_LOG_FP = original_log_fp _RUNNER._RUNNER_STDIO_KEEPALIVE_FDS = original_keepalive @@ -225,6 +242,20 @@ def test_log_chunk_tail_and_before_window(self) -> None: self.assertEqual(older["text"], "2345") self.assertEqual(older["start"], 2) + def test_service_log_resolve_read_path_prefers_latest_daily_shard(self) -> None: + with tempfile.TemporaryDirectory() as td: + workdir = Path(td) + (workdir / "test_runner.2026-06-19.log").write_text("old\n", encoding="utf-8") + (workdir / "test_runner.2026-06-20.log").write_text("new\n", encoding="utf-8") + resolved = _RUNNER._service_log_resolve_read_path( + workdir, + filename=_RUNNER.RUNNER_STDIO_LOG_FILENAME, + ) + self.assertEqual( + resolved, + (workdir / "test_runner.2026-06-20.log").resolve(), + ) + def test_ops_logs_base_url_derives_from_controller_proxy(self) -> None: url = _RUNNER._ui_ops_logs_base_url("http://127.0.0.1:19080/r/ops/fluxon_testbed") self.assertEqual(url, "http://127.0.0.1:19080/logs") diff --git a/fluxon_test_stack/tests/test_top_attention_log_mgmt_contract.py b/fluxon_test_stack/tests/test_top_attention_log_mgmt_contract.py new file mode 100644 index 0000000..2b92fd0 --- /dev/null +++ b/fluxon_test_stack/tests/test_top_attention_log_mgmt_contract.py @@ -0,0 +1,263 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import importlib.util +import sys +import tempfile +import unittest +from pathlib import Path +from unittest import mock + +import yaml + + +REPO_ROOT = Path(__file__).resolve().parents[2] +MODULE_PATH = REPO_ROOT / "fluxon_test_stack" / "top_attention_test_index" / "_log_mgmt.py" +COMMON_MODULE_PATH = REPO_ROOT / "fluxon_test_stack" / "top_attention_test_index" / "_common.py" + + +def 
_load_module(): + module_dir = MODULE_PATH.parent + sys.path.insert(0, str(module_dir)) + try: + spec = importlib.util.spec_from_file_location("fluxon_test_stack_top_attention_log_mgmt_contract", MODULE_PATH) + assert spec is not None and spec.loader is not None + mod = importlib.util.module_from_spec(spec) + sys.modules[spec.name] = mod + spec.loader.exec_module(mod) + return mod + finally: + if sys.path and sys.path[0] == str(module_dir): + sys.path.pop(0) + + +_ENTRY = _load_module() + + +def _load_common_module(): + module_dir = COMMON_MODULE_PATH.parent + sys.path.insert(0, str(module_dir)) + try: + spec = importlib.util.spec_from_file_location("fluxon_test_stack_top_attention_common_contract", COMMON_MODULE_PATH) + assert spec is not None and spec.loader is not None + mod = importlib.util.module_from_spec(spec) + sys.modules[spec.name] = mod + spec.loader.exec_module(mod) + return mod + finally: + if sys.path and sys.path[0] == str(module_dir): + sys.path.pop(0) + + +_COMMON = _load_common_module() + + +class TestTopAttentionLogMgmtContract(unittest.TestCase): + def test_main_accepts_case_config_and_runs_canonical_tests(self) -> None: + with tempfile.TemporaryDirectory() as td: + run_dir = Path(td) + cfg_dir = run_dir / "configs" + cfg_dir.mkdir(parents=True) + case_cfg = cfg_dir / "ci_scene_config.yaml" + case_cfg.write_text( + yaml.safe_dump( + { + "case": { + "scene_id": "ci_top_attention_log_mgmt", + "scale_id": "n1_kvowner_dram_20gib", + "profile_id": "fluxon_tcp_thread", + "case_id": "ci_top_attention_log_mgmt__n1_kvowner_dram_20gib__fluxon_tcp_thread", + }, + "scene_config": { + "enabled": True, + }, + "scene_runtime": { + "etcd": {"ip": "127.0.0.1", "port": 19180}, + "greptime": {"ip": "127.0.0.1", "port": 19190}, + }, + }, + sort_keys=False, + ), + encoding="utf-8", + ) + + python_calls: list[tuple[str, tuple[str, ...]]] = [] + + def fake_run_python_file(description: str, path: str, extra_args=()): + del description + python_calls.append((path, 
tuple(extra_args))) + return 0 + + with mock.patch.object(_ENTRY, "run_python_file", side_effect=fake_run_python_file): + with mock.patch.object(_ENTRY, "run_cargo", return_value=0) as run_cargo: + with mock.patch.object( + sys, + "argv", + [str(MODULE_PATH), "--case-config", str(case_cfg)], + ): + rc = _ENTRY.main() + + self.assertEqual(rc, 0) + self.assertEqual( + python_calls, + [ + ("deployment/tests/test_log_shard.py", ()), + ( + "deployment/tests/test_selection_supervisor_codegen.py", + ("--test-id", "runtime_log_path_uses_daily_shard_files"), + ), + ( + "deployment/tests/test_selection_supervisor_codegen.py", + ("--test-id", "runtime_log_shards_roll_and_preserve_content_boundaries"), + ), + ], + ) + self.assertEqual( + run_cargo.call_args.args[0], + [ + "test", + "--manifest-path", + str(REPO_ROOT / "fluxon_rs" / "fluxon_util" / "Cargo.toml"), + "--test", + "log_mgmt", + ], + ) + + def test_main_rejects_passthrough_args(self) -> None: + with tempfile.TemporaryDirectory() as td: + run_dir = Path(td) + cfg_dir = run_dir / "configs" + cfg_dir.mkdir(parents=True) + case_cfg = cfg_dir / "ci_scene_config.yaml" + case_cfg.write_text( + yaml.safe_dump( + { + "case": { + "scene_id": "ci_top_attention_log_mgmt", + "scale_id": "n1_kvowner_dram_20gib", + "profile_id": "fluxon_tcp_thread", + "case_id": "ci_top_attention_log_mgmt__n1_kvowner_dram_20gib__fluxon_tcp_thread", + }, + "scene_config": {"enabled": True}, + "scene_runtime": { + "etcd": {"ip": "127.0.0.1", "port": 19180}, + "greptime": {"ip": "127.0.0.1", "port": 19190}, + }, + }, + sort_keys=False, + ), + encoding="utf-8", + ) + + with mock.patch.object( + sys, + "argv", + [ + str(MODULE_PATH), + "--case-config", + str(case_cfg), + "--", + "--nocapture", + ], + ): + with self.assertRaisesRegex(ValueError, "_log_mgmt does not accept passthrough args"): + _ENTRY.main() + + def test_run_python_file_does_not_forward_parent_passthrough(self) -> None: + with mock.patch.object(_COMMON, "call", return_value=0) as 
call_mock: + with mock.patch.object( + sys, + "argv", + [ + str(COMMON_MODULE_PATH), + "--case-config", + "/tmp/should_not_leak.yaml", + "--", + "--nocapture", + ], + ): + rc = _COMMON.run_python_file( + "delegate test", + "deployment/tests/test_log_shard.py", + ) + + self.assertEqual(rc, 0) + self.assertEqual( + call_mock.call_args.args[0], + [ + sys.executable, + "-u", + str(REPO_ROOT / "deployment/tests/test_log_shard.py"), + ], + ) + + def test_run_pytest_does_not_forward_parent_passthrough(self) -> None: + with mock.patch.object(_COMMON, "call", return_value=0) as call_mock: + with mock.patch.object( + sys, + "argv", + [ + str(COMMON_MODULE_PATH), + "--case-config", + "/tmp/should_not_leak.yaml", + "--", + "-k", + "smoke", + ], + ): + rc = _COMMON.run_pytest( + "delegate pytest", + ["fluxon_test_stack/tests/test_top_attention_bin_kvtest_contract.py"], + ) + + self.assertEqual(rc, 0) + self.assertEqual( + call_mock.call_args.args[0], + [ + sys.executable, + "-m", + "pytest", + "fluxon_test_stack/tests/test_top_attention_bin_kvtest_contract.py", + ], + ) + + def test_run_cargo_does_not_forward_parent_passthrough(self) -> None: + with mock.patch.object(_COMMON, "call", return_value=0) as call_mock: + with mock.patch.object( + sys, + "argv", + [ + str(COMMON_MODULE_PATH), + "--case-config", + "/tmp/should_not_leak.yaml", + "--", + "--nocapture", + ], + ): + rc = _COMMON.run_cargo( + [ + "test", + "--manifest-path", + str(REPO_ROOT / "fluxon_rs" / "fluxon_util" / "Cargo.toml"), + "--test", + "log_mgmt", + ], + ) + + self.assertEqual(rc, 0) + self.assertEqual( + call_mock.call_args.args[0], + [ + "cargo", + "test", + "--manifest-path", + str(REPO_ROOT / "fluxon_rs" / "fluxon_util" / "Cargo.toml"), + "--test", + "log_mgmt", + ], + ) + + +if __name__ == "__main__": + raise SystemExit(unittest.main()) diff --git a/fluxon_test_stack/top_attention_test_index/README.md b/fluxon_test_stack/top_attention_test_index/README.md index 516c07e..e36b326 100644 --- 
a/fluxon_test_stack/top_attention_test_index/README.md +++ b/fluxon_test_stack/top_attention_test_index/README.md @@ -47,6 +47,7 @@ Entries: - `_fs_remote_mount.py`: heavier Fluxon FS remote mount integration coverage - `_test_stack_contract.py`: test-stack runner contract coverage - `_deployment_codegen.py`: deployment code generation coverage +- `_log_mgmt.py`: shared-supervisor ops log rolling plus Rust KV log sharding coverage. `ci_test_list.yaml` now exposes this wrapper as the formal `ci_top_attention_log_mgmt` scene, and `test_runner.py` dispatches to it from the runner-native `top_attention` CI execution model. - `_script_tools.py`: script utility coverage - `_cargo_fs_core.py`: cargo tests for the Rust FS core crate - `_cargo_util.py`: cargo tests for the Rust util crate diff --git a/fluxon_test_stack/top_attention_test_index/_common.py b/fluxon_test_stack/top_attention_test_index/_common.py index 3991aa7..c890584 100755 --- a/fluxon_test_stack/top_attention_test_index/_common.py +++ b/fluxon_test_stack/top_attention_test_index/_common.py @@ -32,20 +32,33 @@ def parse_python_passthrough(description: str) -> tuple[str, list[str]]: return args.python, passthrough -def run_pytest(description: str, paths: Iterable[str]) -> int: - python, passthrough = parse_python_passthrough(description) - return call([python, "-m", "pytest", *paths, *passthrough]) - - -def run_python_file(description: str, path: str, extra_args: Iterable[str] = ()) -> int: - python, passthrough = parse_python_passthrough(description) - return call([python, "-u", str(REPO_ROOT / path), *extra_args, *passthrough]) - - -def run_python_files(description: str, paths: Iterable[str]) -> int: - python, passthrough = parse_python_passthrough(description) +def run_pytest( + description: str, + paths: Iterable[str], + *, + passthrough: Sequence[str] | None = None, +) -> int: + python, _ = parse_python_passthrough(description) + effective_passthrough = [] if passthrough is None else list(passthrough) + 
return call([python, "-m", "pytest", *paths, *effective_passthrough]) + + +def run_python_file( + description: str, + path: str, + extra_args: Iterable[str] = (), +) -> int: + python, _ = parse_python_passthrough(description) + return call([python, "-u", str(REPO_ROOT / path), *extra_args]) + + +def run_python_files( + description: str, + paths: Iterable[str], +) -> int: + python, _ = parse_python_passthrough(description) for path in paths: - rc = call([python, "-u", str(REPO_ROOT / path), *passthrough]) + rc = call([python, "-u", str(REPO_ROOT / path)]) if rc != 0: return rc return 0 @@ -150,7 +163,13 @@ def _prepare_cargo_env(env: dict[str, str] | None) -> dict[str, str] | None: return prepared_env -def run_cargo(args: Iterable[str], *, env: dict[str, str] | None = None) -> int: +def run_cargo( + args: Iterable[str], + *, + env: dict[str, str] | None = None, + passthrough: Sequence[str] | None = None, +) -> int: # Rust test binaries launched via cargo run/load depend on the wheel-bundled native # runtime under the active venv. Keep one authoritative search root for all wrappers. 
- return call(["cargo", *args], env=_prepare_cargo_env(env)) + effective_passthrough = [] if passthrough is None else list(passthrough) + return call(["cargo", *args, *effective_passthrough], env=_prepare_cargo_env(env)) diff --git a/fluxon_test_stack/top_attention_test_index/_log_mgmt.py b/fluxon_test_stack/top_attention_test_index/_log_mgmt.py new file mode 100644 index 0000000..9ef7cd5 --- /dev/null +++ b/fluxon_test_stack/top_attention_test_index/_log_mgmt.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse + +from _common import ( + REPO_ROOT, + load_case_config, + run_cargo, + run_python_file, +) + + +TEST_REQUIREMENTS = ["cargo", "etcd", "ops", "submodules"] +SCENE_ID = "ci_top_attention_log_mgmt" + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Flat index entry for shared-supervisor ops log rolling and Rust KV log sharding coverage." + ) + parser.add_argument( + "--case-config", + help="Canonical CI case config YAML emitted by test_runner.", + ) + args, passthrough = parser.parse_known_args() + if args.case_config: + _ = load_case_config(args.case_config, expected_scene_id=SCENE_ID) + if passthrough: + raise ValueError(f"_log_mgmt does not accept passthrough args: {tuple(passthrough)!r}") + + rc = run_python_file( + "Flat index entry for ops/shared-supervisor log shard helper coverage.", + "deployment/tests/test_log_shard.py", + ) + if rc != 0: + return rc + for test_id in ( + "runtime_log_path_uses_daily_shard_files", + "runtime_log_shards_roll_and_preserve_content_boundaries", + ): + rc = run_python_file( + "Flat index entry for ops/shared-supervisor log routing coverage.", + "deployment/tests/test_selection_supervisor_codegen.py", + extra_args=("--test-id", test_id), + ) + if rc != 0: + return rc + return run_cargo([ + "test", + "--manifest-path", + str(REPO_ROOT / "fluxon_rs" / "fluxon_util" / "Cargo.toml"), + "--test", + "log_mgmt", + ]) + + +if __name__ == "__main__": + raise 
SystemExit(main()) diff --git a/scripts/git_source_selection.py b/scripts/git_source_selection.py new file mode 100644 index 0000000..491a0c1 --- /dev/null +++ b/scripts/git_source_selection.py @@ -0,0 +1,163 @@ +from __future__ import annotations + +import subprocess +from pathlib import Path +from typing import Callable + +import yaml + + +DEFAULT_RATHER_NO_GIT_SUBMODULE_CONFIG_RELPATH = Path( + "setup_and_pack/rather_no_git_submodule.yaml" +) + + +def collect_git_listed_source_relpaths( + *, + repo_root: Path, + git_root: Path, + rel_prefix: str = "", + is_excluded: Callable[[str], bool], +) -> list[str]: + argv = [ + "git", + "ls-files", + "--cached", + "--others", + "--exclude-standard", + "-z", + ] + raw = subprocess.check_output(argv, cwd=str(git_root)) + selected: list[str] = [] + rel_prefix = rel_prefix.strip("/") + for entry in raw.split(b"\0"): + if not entry: + continue + rel = entry.decode("utf-8").strip() + if not rel: + continue + repo_rel = rel if not rel_prefix else f"{rel_prefix}/{rel}" + if is_excluded(repo_rel): + continue + source_path = (repo_root / repo_rel).resolve() + if not source_path.exists(): + continue + selected.append(repo_rel) + return selected + + +def load_rather_no_git_submodule_source_roots( + *, + repo_root: Path, + context_name: str, +) -> tuple[tuple[str, Path], ...]: + config_path = (repo_root / DEFAULT_RATHER_NO_GIT_SUBMODULE_CONFIG_RELPATH).resolve() + if not config_path.exists(): + return () + raw_cfg = yaml.safe_load(config_path.read_text(encoding="utf-8")) + if raw_cfg is None: + return () + if not isinstance(raw_cfg, dict): + raise RuntimeError( + "rather_no_git_submodule config must be a YAML mapping: " + f"{config_path}" + ) + raw_modules = raw_cfg.get("modules") + if raw_modules is None: + return () + if not isinstance(raw_modules, list): + raise RuntimeError( + "rather_no_git_submodule config `modules` must be a list: " + f"{config_path}" + ) + + repo_root = repo_root.resolve() + selected: list[tuple[str, Path]] = 
[] + seen_relpaths: set[str] = set() + for index, raw_item in enumerate(raw_modules): + if not isinstance(raw_item, dict): + raise RuntimeError( + "rather_no_git_submodule config entries must be mappings: " + f"{config_path} modules[{index}]" + ) + raw_path = raw_item.get("path") + if not isinstance(raw_path, str) or not raw_path.strip(): + raise RuntimeError( + "rather_no_git_submodule config path must be a non-empty string: " + f"{config_path} modules[{index}].path" + ) + rel_path = Path(raw_path.strip()) + if rel_path.is_absolute() or ".." in rel_path.parts: + raise RuntimeError( + "rather_no_git_submodule config path must stay within the repo root: " + f"{config_path} modules[{index}].path={raw_path!r}" + ) + relpath = rel_path.as_posix() + if relpath in seen_relpaths: + continue + seen_relpaths.add(relpath) + module_root = (repo_root / rel_path).resolve() + if module_root != repo_root and repo_root not in module_root.parents: + raise RuntimeError( + "rather_no_git_submodule config path escapes the repo root: " + f"{config_path} modules[{index}].path={raw_path!r}" + ) + if not module_root.is_dir(): + raise RuntimeError( + f"{context_name} requires configured rather_no_git_submodule path " + f"to exist as a directory: path={relpath} resolved={module_root}" + ) + selected.append((relpath, module_root)) + return tuple(selected) + + +def collect_source_relpaths_with_rather_no_git_submodule( + *, + repo_root: Path, + source_roots: tuple[str, ...], + is_excluded: Callable[[str], bool], + empty_selection_error: str, + rather_no_git_submodule_context_name: str, +) -> list[str]: + repo_root = repo_root.resolve() + selected: set[str] = set() + for source_root in source_roots: + root_path = (repo_root / source_root).resolve() + if not root_path.exists(): + continue + if root_path.is_file(): + relpath = Path(source_root).as_posix() + if not is_excluded(relpath): + selected.add(relpath) + continue + selected.update( + collect_git_listed_source_relpaths( + 
repo_root=repo_root, + git_root=root_path, + rel_prefix="" if source_root == "." else source_root, + is_excluded=is_excluded, + ) + ) + for relpath, module_root in load_rather_no_git_submodule_source_roots( + repo_root=repo_root, + context_name=rather_no_git_submodule_context_name, + ): + selected.update( + collect_git_listed_source_relpaths( + repo_root=repo_root, + git_root=module_root, + rel_prefix=relpath, + is_excluded=is_excluded, + ) + ) + if not selected: + raise RuntimeError(empty_selection_error) + return sorted(selected) + + +__all__ = [ + "DEFAULT_RATHER_NO_GIT_SUBMODULE_CONFIG_RELPATH", + "collect_git_listed_source_relpaths", + "collect_source_relpaths_with_rather_no_git_submodule", + "load_rather_no_git_submodule_source_roots", +] diff --git a/scripts/source_selection_profiles.py b/scripts/source_selection_profiles.py new file mode 100644 index 0000000..6c7493c --- /dev/null +++ b/scripts/source_selection_profiles.py @@ -0,0 +1,134 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +from pathlib import Path +import sys + +SCRIPT_DIR = Path(__file__).resolve().parent +script_dir_str = str(SCRIPT_DIR) +if script_dir_str in sys.path: + sys.path.remove(script_dir_str) +sys.path.insert(0, script_dir_str) + +import git_source_selection as git_source_selection_utils + + +SOURCE_SELECTION_PROFILE_BUILD_SEED = "build_seed" +SOURCE_SELECTION_PROFILE_SOURCE_PACK = "source_pack" +SOURCE_SELECTION_PROFILES = ( + SOURCE_SELECTION_PROFILE_BUILD_SEED, + SOURCE_SELECTION_PROFILE_SOURCE_PACK, +) + +BUILD_SEED_SOURCE_ROOTS: tuple[str, ...] = ( + "README.md", + "setup.py", + "deployment", + "fluxon_py", + "fluxon_release/closed_sdk", + "fluxon_rs", + "scripts/git_source_selection.py", + "scripts/source_selection_profiles.py", + "setup_and_pack", +) +SOURCE_PACK_SOURCE_ROOTS: tuple[str, ...] 
= (".",) + +BUILD_SEED_INCLUDED_RELPATHS: frozenset[str] = frozenset( + { + "fluxon_release/closed_sdk/manifest.json", + "setup_and_pack/pub_prepare_build.yaml", + } +) +SOURCE_PACK_EXCLUDED_RELPATH_PREFIXES: tuple[str, ...] = ( + ".dever/", + "fluxon_release/", + "skills/", +) +SOURCE_PACK_EXCLUDED_RELPATH_NAMES: frozenset[str] = frozenset( + { + ".DS_Store", + } +) + + +@dataclass(frozen=True) +class SourceSelectionProfileSpec: + source_roots: tuple[str, ...] + empty_selection_error: str + rather_no_git_submodule_context_name: str + include_relpaths: frozenset[str] = field(default_factory=frozenset) + + +BUILD_SEED_PROFILE_SPEC = SourceSelectionProfileSpec( + source_roots=BUILD_SEED_SOURCE_ROOTS, + empty_selection_error="public workspace source selection produced no files", + rather_no_git_submodule_context_name="public workspace source selection", + include_relpaths=BUILD_SEED_INCLUDED_RELPATHS, +) +SOURCE_PACK_PROFILE_SPEC = SourceSelectionProfileSpec( + source_roots=SOURCE_PACK_SOURCE_ROOTS, + empty_selection_error="git-based CI source selection produced no files", + rather_no_git_submodule_context_name="CI source pack", +) + + +def get_source_profile_spec(*, profile: str) -> SourceSelectionProfileSpec: + if profile == SOURCE_SELECTION_PROFILE_BUILD_SEED: + return BUILD_SEED_PROFILE_SPEC + if profile == SOURCE_SELECTION_PROFILE_SOURCE_PACK: + return SOURCE_PACK_PROFILE_SPEC + raise ValueError( + f"unsupported source selection profile: {profile!r}; expected one of {SOURCE_SELECTION_PROFILES}" + ) + + +def get_source_profile_source_roots(*, profile: str) -> tuple[str, ...]: + return get_source_profile_spec(profile=profile).source_roots + + +def source_profile_relpath_excluded(*, profile: str, relpath: str) -> bool: + spec = get_source_profile_spec(profile=profile) + normalized = relpath.strip("/") + if not normalized: + return True + if normalized in spec.include_relpaths: + return False + if profile == SOURCE_SELECTION_PROFILE_SOURCE_PACK: + if normalized in 
SOURCE_PACK_EXCLUDED_RELPATH_NAMES: + return True + return any( + normalized == prefix.rstrip("/") or normalized.startswith(prefix) + for prefix in SOURCE_PACK_EXCLUDED_RELPATH_PREFIXES + ) + return False + + +def collect_source_profile_relpaths(*, repo_root: Path, profile: str) -> tuple[str, ...]: + spec = get_source_profile_spec(profile=profile) + return tuple( + git_source_selection_utils.collect_source_relpaths_with_rather_no_git_submodule( + repo_root=repo_root, + source_roots=spec.source_roots, + is_excluded=lambda relpath: source_profile_relpath_excluded( + profile=profile, + relpath=relpath, + ), + empty_selection_error=spec.empty_selection_error, + rather_no_git_submodule_context_name=spec.rather_no_git_submodule_context_name, + ) + ) + + +__all__ = [ + "BUILD_SEED_SOURCE_ROOTS", + "SOURCE_PACK_SOURCE_ROOTS", + "SOURCE_PACK_EXCLUDED_RELPATH_NAMES", + "SOURCE_PACK_EXCLUDED_RELPATH_PREFIXES", + "SOURCE_SELECTION_PROFILE_BUILD_SEED", + "SOURCE_SELECTION_PROFILE_SOURCE_PACK", + "SOURCE_SELECTION_PROFILES", + "collect_source_profile_relpaths", + "get_source_profile_source_roots", + "get_source_profile_spec", + "source_profile_relpath_excluded", +] diff --git a/setup_and_pack/nix/lib_layout.py b/setup_and_pack/nix/lib_layout.py index 05ac4b0..b25d7d1 100644 --- a/setup_and_pack/nix/lib_layout.py +++ b/setup_and_pack/nix/lib_layout.py @@ -10,7 +10,7 @@ import yaml from setup_and_pack.public_workspace_contract import ( - PUBLIC_WORKSPACE_INPUT_RELATIVE_PATHS, + collect_public_workspace_input_relative_paths, _copy_public_workspace_input_path, _sanitize_public_workspace_input, ) @@ -41,6 +41,7 @@ PACK_CONFIG_STATIC_STEM_SUFFIX = "_static" PACK_CONFIG_ENV_STEM_SUFFIX = "_env" DEFAULT_PACK_CONFIG_ENV_DIR_NAME = "setup_and_pack" +PROJECT_ROOT_CONFIG_KEY = "project_root" HOST_PATHS_ROOT_KEY = "root_path" DEFAULT_HOST_PATH_SUFFIXES = { ("store", "project_data_root"): "", @@ -48,35 +49,6 @@ ("manylinux", "cargo_registry_dir"): "manylinux-cache/cargo-registry", 
("manylinux", "cargo_git_dir"): "manylinux-cache/cargo-git", } -BRIDGE_PREBUILT_WORKSPACE_SEED_EXTRA_RELATIVE_PATHS = ( - "setup_and_pack/nix", - "setup_and_pack/lib_tool.py", - "setup_and_pack/pyscript_util.py", - "setup_and_pack/closed_sdk_contract.py", - "setup_and_pack/public_workspace_contract.py", - "setup_and_pack/pub_prepare_build.py", - "setup_and_pack/pub_prepare_build.yaml", - "setup_and_pack/utils/wheel_runtime_helper.py", - "setup_and_pack/utils", - "deployment/utils/placeholder_utils.py", - "deployment/utils/proc_lifecycle_codegen.py", - "deployment/utils/selection_supervisor_codegen.py", - "fluxon_release/closed_sdk", - "fluxon_rs/fluxon_commu_contract", - "fluxon_rs/fluxon_commu", - "fluxon_rs/fluxon_commu_closed_sdk_consumer", - "fluxon_rs/Cargo.lock", -) -BRIDGE_PREBUILT_WORKSPACE_SEED_RELATIVE_PATHS = tuple( - dict.fromkeys( - ( - *PUBLIC_WORKSPACE_INPUT_RELATIVE_PATHS, - *BRIDGE_PREBUILT_WORKSPACE_SEED_EXTRA_RELATIVE_PATHS, - ) - ) -) - - @dataclass(frozen=True) class AssemblyRefs: baseline_path: str @@ -253,7 +225,7 @@ def load_experiment_spec_from_root(*, config_path: Path, config_root: dict) -> E profile_config = _require_mapping(raw_config, "profile") assembly_config = _require_mapping(raw_config, "assembly") - project_root = _detect_project_root(config_path=config_path) + project_root = _resolve_project_root(config_path=config_path, raw_config=raw_config) project_data_root = _require_absolute_path(store_config, "project_data_root") base_system = _require_enum_string(runtime_config, "base_system", SUPPORTED_BASE_SYSTEMS) architectures = tuple( @@ -576,6 +548,25 @@ def _detect_project_root(*, config_path: Path) -> Path: ) +def _resolve_project_root(*, config_path: Path, raw_config: dict) -> Path: + configured_project_root = _optional_non_empty_string(raw_config, PROJECT_ROOT_CONFIG_KEY) + if configured_project_root is not None: + project_root = Path(configured_project_root) + if not project_root.is_absolute(): + raise RuntimeError( + 
f"config.{PROJECT_ROOT_CONFIG_KEY} must be an absolute path: {configured_project_root}" + ) + resolved_project_root = project_root.resolve() + if not _is_project_root_candidate(resolved_project_root): + raise RuntimeError( + "config.project_root must point at a project root containing one of " + f"{PROJECT_ROOT_MARKER_FILE_NAMES + PROJECT_ROOT_MARKER_DIR_NAMES} " + f"and child dirs {PROJECT_ROOT_REQUIRED_CHILD_DIR_NAMES}: {resolved_project_root}" + ) + return resolved_project_root + return _detect_project_root(config_path=config_path) + + def _is_project_root_candidate(candidate_root: Path) -> bool: has_marker = any( (candidate_root / marker_name).exists() @@ -757,7 +748,9 @@ def _materialize_bridge_prebuilt_workspace_seed(*, source_root: Path, target_roo _remove_stale_derived_entry(path=target_root) target_root.mkdir(parents=True, exist_ok=True) target_root.chmod(0o777) - for relative_path in BRIDGE_PREBUILT_WORKSPACE_SEED_RELATIVE_PATHS: + for relative_path in collect_public_workspace_input_relative_paths( + repo_root=source_root + ): source_path = source_root / relative_path if not source_path.exists(): raise RuntimeError( diff --git a/setup_and_pack/nix/pack_fluxonkv_pylib.py b/setup_and_pack/nix/pack_fluxonkv_pylib.py index c44df13..e12f8fe 100644 --- a/setup_and_pack/nix/pack_fluxonkv_pylib.py +++ b/setup_and_pack/nix/pack_fluxonkv_pylib.py @@ -43,6 +43,9 @@ CLOSED_SDK_CONSUMER_BOUNDARY_MODE, rewrite_fluxon_native_export_bundle, ) +from setup_and_pack.public_workspace_contract import ( + collect_public_workspace_input_relative_paths, +) from utils.sudo_prefix_utils import host_sudo_prefix import utils as script_utils ABI3_SMOKE_TEST_INTERPRETERS = ( @@ -142,11 +145,6 @@ ) ) SUPPORTED_TARGET_CACHE_GENERATOR_KINDS = frozenset() -REQUIRED_DEPLOYMENT_UTIL_FILES_FOR_PYO3_BUILD = ( - "placeholder_utils.py", - "proc_lifecycle_codegen.py", - "selection_supervisor_codegen.py", -) TEMP_WORKSPACE_MOUNT_DIRS: list[Path] = [] @@ -160,54 +158,6 @@ def 
_cleanup_temp_workspace_mount_dirs() -> None: atexit.register(_cleanup_temp_workspace_mount_dirs) -REQUIRED_DEPLOYMENT_UTIL_RELATIVE_PATHS_FOR_PYO3_BUILD = tuple( - f"deployment/utils/{name}" for name in REQUIRED_DEPLOYMENT_UTIL_FILES_FOR_PYO3_BUILD -) -PYO3_WORKSPACE_HELPER_RELATIVE_PATHS = ( - "fluxon_rs/rust-toolchain.toml", - "setup_and_pack/lib_tool.py", - "setup_and_pack/pyscript_util.py", - "setup_and_pack/closed_sdk_contract.py", - "setup_and_pack/public_workspace_contract.py", - "setup_and_pack/pub_prepare_build.py", - "setup_and_pack/pub_prepare_build.yaml", - "setup_and_pack/nix/pack_release_in_container.py", - "setup_and_pack/utils/wheel_runtime_helper.py", - "setup_and_pack/nix/lib_layout.py", -) -PYO3_INPUT_RELATIVE_PATHS_COMMON = ( - "fluxon_rs/Cargo.toml", - "fluxon_rs/Cargo.lock", - "fluxon_rs/.cargo", - "fluxon_rs/rust-toolchain.toml", - "fluxon_rs/fluxon_commu_contract", - "fluxon_rs/fluxon_commu_closed_sdk_consumer", - "fluxon_rs/fluxon_pyo3", - "fluxon_rs/limit_thirdparty", - "fluxon_rs/fluxon_commu", - "fluxon_rs/fluxon_kv", - "fluxon_rs/fluxon_framework", - "fluxon_rs/fluxon_framework_compiled", - "fluxon_rs/fluxon_util", - "fluxon_rs/fluxon_mq", - "fluxon_rs/fluxon_cli", - "fluxon_rs/fluxon_ops", - "fluxon_rs/fluxon_proxy_proto", - "fluxon_rs/fluxon_proxy", - "fluxon_rs/fluxon_fs", - "fluxon_rs/fluxon_fs_core", - "fluxon_rs/fluxon_fs_s3_gateway", - "fluxon_rs/fluxon_observability", - "fluxon_rs/moka", - "fluxon_py", - "fluxon_release/closed_sdk", - "setup_and_pack/nix/lib_layout.py", - "setup_and_pack/closed_sdk_contract.py", - "setup_and_pack/public_workspace_contract.py", - "setup_and_pack/lib_tool.py", - "setup_and_pack/pyscript_util.py", - *REQUIRED_DEPLOYMENT_UTIL_RELATIVE_PATHS_FOR_PYO3_BUILD, -) PYO3_INPUT_RELATIVE_PATHS_BY_TRANSPORT_BACKEND = { "fastws": (), "tquic": (), @@ -218,15 +168,6 @@ def _cleanup_temp_workspace_mount_dirs() -> None: PYO3_INPUT_RELATIVE_PATHS_BY_RDMA_BACKEND = { "closed_sdk": ("fluxon_release/closed_sdk",), } 
-PYO3_WORKSPACE_COPY_RELATIVE_PATHS_PUBLIC_NATIVE = () -PYO3_WORKSPACE_COPY_RELATIVE_PATHS_COMMON = tuple( - relative_path - for relative_path in ( - *PYO3_INPUT_RELATIVE_PATHS_COMMON, - *PYO3_WORKSPACE_COPY_RELATIVE_PATHS_PUBLIC_NATIVE, - *PYO3_WORKSPACE_HELPER_RELATIVE_PATHS, - ) -) TRANSPORT_BACKEND_FEATURES = { "fastws": ["fastws_transport"], "tquic": ["tquic_transport"], @@ -257,90 +198,6 @@ def _cleanup_temp_workspace_mount_dirs() -> None: "libstdc++.so.6", "libgomp.so.1", ) -IGNORED_FILE_SUFFIXES = ( - ".gitignore", - ".pkl", - ".pyc", - ".md", - ".rst", - ".html", - ".htm", - ".xml", - ".css", - ".js", - ".map", - ".png", - ".jpg", - ".jpeg", - ".gif", - ".bmp", - ".svg", - ".ico", - ".pdf", - ".ppt", - ".pptx", - ".doc", - ".docx", - ".pem", - ".crt", - ".crl", - ".key", - ".csr", - ".p12", - ".der", - ".serial", - ".old", - ".orig", - ".rej", - ".tar", - ".tar.gz", - ".tgz", - ".tar.xz", - ".txz", - ".tar.bz2", - ".tbz2", - ".zip", - ".7z", - ".xz", - ".bz2", - ".gz", -) -IGNORED_DIR_NAMES = { - ".git", - "__pycache__", - "target", - "wheels", - "docs", - "doc", - "doxygen", - "examples", - "example", - "tests", - "test", - "testdata", - "bench", - "benches", - "benchmark", - "benchmarks", - "fuzz", - "fuzzers", - "packagecache", - "wycheproof_testvectors", - "tfprof", -} -IGNORED_FILE_NAMES = ( - PYO3_CHECKSUM_FILE_NAME, - "configure~", -) - - -def _pyo3_input_relative_paths(transport_backend: str, rdma_backend: str) -> tuple[str, ...]: - return ( - PYO3_INPUT_RELATIVE_PATHS_COMMON - + PYO3_INPUT_RELATIVE_PATHS_BY_TRANSPORT_BACKEND[transport_backend] - + PYO3_INPUT_RELATIVE_PATHS_BY_RDMA_BACKEND[rdma_backend] - ) - def _dedupe_relative_paths(relative_paths: tuple[str, ...]) -> tuple[str, ...]: ordered_relative_paths: list[str] = [] @@ -354,11 +211,13 @@ def _dedupe_relative_paths(relative_paths: tuple[str, ...]) -> tuple[str, ...]: def pyo3_workspace_copy_relative_paths(transport_backend: str, rdma_backend: str) -> tuple[str, ...]: - return 
_dedupe_relative_paths( - PYO3_WORKSPACE_COPY_RELATIVE_PATHS_COMMON - + PYO3_INPUT_RELATIVE_PATHS_BY_TRANSPORT_BACKEND[transport_backend] - + PYO3_INPUT_RELATIVE_PATHS_BY_RDMA_BACKEND[rdma_backend] - ) + del transport_backend + del rdma_backend + return collect_public_workspace_input_relative_paths(repo_root=REPO_ROOT) + + +def _pyo3_input_relative_paths(transport_backend: str, rdma_backend: str) -> tuple[str, ...]: + return pyo3_workspace_copy_relative_paths(transport_backend, rdma_backend) def _wheel_variant_key(transport_backend: str, rdma_backend: str) -> str: @@ -435,9 +294,9 @@ def _compute_inputs_digest(repo_root: Path, relative_paths: tuple[str, ...]) -> relative_to=repo_root, mode=script_utils.PathDigestMode.CONTENTS_ONLY, algorithm=script_utils.PathHashAlgorithm.MD5, - ignored_dir_names=IGNORED_DIR_NAMES, - ignored_file_names=IGNORED_FILE_NAMES, - ignored_file_suffixes=IGNORED_FILE_SUFFIXES, + ignored_dir_names=(), + ignored_file_names=(), + ignored_file_suffixes=(), ) @@ -593,33 +452,6 @@ def current_checksum(self) -> str: _pyo3_input_relative_paths(self.transport_backend, self.rdma_backend), ) + f"|transport_backend={self.transport_backend}|rdma_backend={self.rdma_backend}" - def _legacy_checksum_map(self) -> dict[str, str]: - file_hash: dict[str, str] = {} - for current_root, dirnames, filenames in os.walk(self.rs_root, topdown=True): - current_root_path = Path(current_root) - root_rel = current_root_path.relative_to(self.rs_root).as_posix() - root_text = current_root_path.as_posix() - if root_rel == "target" or root_rel.startswith("target/"): - dirnames[:] = [] - continue - if root_rel == "wheels" or root_rel.startswith("wheels/"): - dirnames[:] = [] - continue - if "/.git/" in root_text or root_text.endswith("/.git"): - dirnames[:] = [] - continue - dirnames[:] = sorted(dir_name for dir_name in dirnames if dir_name != ".git") - for file_name in sorted(filenames): - if file_name in IGNORED_FILE_NAMES or file_name.endswith(IGNORED_FILE_SUFFIXES): - 
continue - file_path = current_root_path / file_name - hash_md5 = hashlib.md5() - with open(file_path, "rb") as f: - for chunk in iter(lambda: f.read(4096), b""): - hash_md5.update(chunk) - file_hash[file_path.relative_to(self.rs_root).as_posix()] = hash_md5.hexdigest() - return file_hash - def find_cached_wheel(self) -> Path | None: if not self.target_wheels_dir.exists(): return None @@ -2976,12 +2808,15 @@ def _build_published_profile_manifest( selected_backend_plan: dict, native_build_authority: dict | None, ) -> dict: - workspace_seed_digest = _compute_inputs_digest( - workspace_seed_dir, - _public_workspace_seed_relative_paths( - transport_backend, - rdma_backend=selected_backend_plan["rdma_backend"], - ), + del transport_backend + workspace_seed_digest = script_utils.compute_paths_digest( + [workspace_seed_dir], + relative_to=workspace_seed_dir, + mode=script_utils.PathDigestMode.CONTENTS_ONLY, + algorithm=script_utils.PathHashAlgorithm.MD5, + ignored_dir_names=(), + ignored_file_names=(), + ignored_file_suffixes=(), ) manifest = { "object_kind": "FluxonManylinuxPublishedProfile", @@ -3137,18 +2972,19 @@ def _copy_workspace_seed_subset( transport_backend: str, rdma_backend: str, ) -> None: + del transport_backend + del rdma_backend target_workspace_seed_dir.mkdir(parents=True, exist_ok=True) target_workspace_seed_dir.chmod(0o777) - for relative_path in _public_workspace_seed_relative_paths( - transport_backend, - rdma_backend=rdma_backend, - ): - source_path = source_workspace_seed_dir / relative_path - if not source_path.exists(): - raise RuntimeError( - f"workspace seed path is missing required publish input: {source_path}" - ) + for source_path in sorted(source_workspace_seed_dir.rglob("*")): + if source_path == source_workspace_seed_dir: + continue + relative_path = source_path.relative_to(source_workspace_seed_dir) target_path = target_workspace_seed_dir / relative_path + if source_path.is_dir() and not source_path.is_symlink(): + 
target_path.mkdir(parents=True, exist_ok=True) + target_path.chmod(0o777) + continue target_path.parent.mkdir(parents=True, exist_ok=True) target_path.parent.chmod(0o777) _sudo_copy_path(source_path=source_path, target_path=target_path) @@ -3248,10 +3084,6 @@ def _run_with_tee_log(*, argv: list[str], log_path: Path) -> None: if return_code != 0: raise RuntimeError(f"docker run failed with exit code {return_code}, log={log_path}") -def _public_workspace_seed_relative_paths(transport_backend: str, *, rdma_backend: str) -> tuple[str, ...]: - return pyo3_workspace_copy_relative_paths(transport_backend, rdma_backend) - - def _require_workspace_seed_fluxon_commu_source_dir(*, workspace_seed_dir: Path, field_name: str) -> Path: source_dir = workspace_seed_dir / FLUXON_COMMU_AUTHORITY_RELATIVE_PATH cargo_toml_path = source_dir / "Cargo.toml" diff --git a/setup_and_pack/public_workspace_contract.py b/setup_and_pack/public_workspace_contract.py index 5cd6b50..cb1574e 100644 --- a/setup_and_pack/public_workspace_contract.py +++ b/setup_and_pack/public_workspace_contract.py @@ -1,47 +1,46 @@ from __future__ import annotations +import os import shutil +import sys from pathlib import Path +REPO_ROOT = Path(__file__).resolve().parent.parent +SCRIPTS_DIR = REPO_ROOT / "scripts" +scripts_dir_str = str(SCRIPTS_DIR) +if scripts_dir_str in sys.path: + sys.path.remove(scripts_dir_str) +sys.path.insert(0, scripts_dir_str) -PUBLIC_WORKSPACE_INPUT_RELATIVE_PATHS = ( - "setup.py", - "fluxon_py", - "fluxon_release/closed_sdk", - "fluxon_rs/Cargo.toml", - "fluxon_rs/Cargo.lock", - "fluxon_rs/.cargo", - "fluxon_rs/rust-toolchain.toml", - "fluxon_rs/fluxon_commu_contract", - "fluxon_rs/fluxon_commu_closed_sdk_consumer", - "fluxon_rs/fluxon_commu", - "fluxon_rs/fluxon_pyo3", - "fluxon_rs/limit_thirdparty", - "fluxon_rs/fluxon_kv", - "fluxon_rs/fluxon_framework", - "fluxon_rs/fluxon_framework_compiled", - "fluxon_rs/fluxon_util", - "fluxon_rs/fluxon_mq", - "fluxon_rs/fluxon_cli", - 
"fluxon_rs/fluxon_ops", - "fluxon_rs/fluxon_proxy_proto", - "fluxon_rs/fluxon_proxy", - "fluxon_rs/fluxon_fs", - "fluxon_rs/fluxon_fs_core", - "fluxon_rs/fluxon_fs_s3_gateway", - "fluxon_rs/fluxon_observability", - "fluxon_rs/moka", +from source_selection_profiles import ( + SOURCE_SELECTION_PROFILE_BUILD_SEED, + collect_source_profile_relpaths, ) def _copy_public_workspace_input_path(source_path: Path, target_path: Path) -> None: target_path.parent.mkdir(parents=True, exist_ok=True) + if source_path.is_symlink(): + if target_path.exists() or target_path.is_symlink(): + if target_path.is_dir() and not target_path.is_symlink(): + shutil.rmtree(target_path) + else: + target_path.unlink() + os.symlink(os.readlink(source_path), target_path) + return if source_path.is_dir(): shutil.copytree(source_path, target_path, symlinks=True, dirs_exist_ok=True) return shutil.copy2(source_path, target_path) +def collect_public_workspace_input_relative_paths(*, repo_root: Path) -> tuple[str, ...]: + return collect_source_profile_relpaths( + repo_root=repo_root, + profile=SOURCE_SELECTION_PROFILE_BUILD_SEED, + ) + + def _sanitize_public_workspace_input(*, workspace_root: Path) -> None: for pycache_dir in workspace_root.rglob("__pycache__"): shutil.rmtree(pycache_dir, ignore_errors=True) @@ -51,9 +50,8 @@ def _sanitize_public_workspace_input(*, workspace_root: Path) -> None: except FileNotFoundError: pass - __all__ = [ - "PUBLIC_WORKSPACE_INPUT_RELATIVE_PATHS", + "collect_public_workspace_input_relative_paths", "_copy_public_workspace_input_path", "_sanitize_public_workspace_input", ] diff --git a/setup_and_pack/tests/test_git_source_selection_utils.py b/setup_and_pack/tests/test_git_source_selection_utils.py new file mode 100644 index 0000000..b28d64d --- /dev/null +++ b/setup_and_pack/tests/test_git_source_selection_utils.py @@ -0,0 +1,182 @@ +from __future__ import annotations + +import importlib.util +import sys +import tempfile +import unittest +from pathlib import Path +from 
unittest import mock + + +REPO_ROOT = Path(__file__).resolve().parents[2] +MODULE_PATH = REPO_ROOT / "scripts" / "git_source_selection.py" +PROFILE_MODULE_PATH = REPO_ROOT / "scripts" / "source_selection_profiles.py" + + +def _load_module(): + spec = importlib.util.spec_from_file_location( + "scripts_git_source_selection_test", + MODULE_PATH, + ) + assert spec is not None and spec.loader is not None + mod = importlib.util.module_from_spec(spec) + sys.modules[spec.name] = mod + spec.loader.exec_module(mod) + return mod + + +_MOD = _load_module() + + +def _load_profile_module(): + scripts_root_str = str(REPO_ROOT / "scripts") + if scripts_root_str in sys.path: + sys.path.remove(scripts_root_str) + sys.path.insert(0, scripts_root_str) + spec = importlib.util.spec_from_file_location( + "scripts_source_selection_profiles_test", + PROFILE_MODULE_PATH, + ) + assert spec is not None and spec.loader is not None + mod = importlib.util.module_from_spec(spec) + sys.modules[spec.name] = mod + spec.loader.exec_module(mod) + return mod + + +_PROFILE_MOD = _load_profile_module() + + +class GitSourceSelectionUtilsTest(unittest.TestCase): + def test_collect_source_relpaths_with_rather_no_git_submodule_merges_module_sources(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + repo_root = Path(tmpdir) + (repo_root / "README.md").write_text("repo\n", encoding="utf-8") + module_root = repo_root / "fluxon_rs" / "moka" + (module_root / "src").mkdir(parents=True, exist_ok=True) + (module_root / "Cargo.toml").write_text("module\n", encoding="utf-8") + (module_root / "src" / "lib.rs").write_text("pub fn x() {}\n", encoding="utf-8") + cfg_path = repo_root / "setup_and_pack" / "rather_no_git_submodule.yaml" + cfg_path.parent.mkdir(parents=True, exist_ok=True) + cfg_path.write_text( + "modules:\n" + " - path: fluxon_rs/moka\n" + " repo: https://example.com/moka.git\n" + " checkout: main\n", + encoding="utf-8", + ) + + def fake_check_output(argv, cwd=None): + del argv + cwd_path = 
Path(cwd).resolve() + if cwd_path == repo_root.resolve(): + return b"README.md\0" + if cwd_path == module_root.resolve(): + return b"Cargo.toml\0src/lib.rs\0" + raise AssertionError(f"unexpected git ls-files cwd: {cwd_path}") + + with mock.patch.object(_MOD.subprocess, "check_output", side_effect=fake_check_output): + relpaths = _MOD.collect_source_relpaths_with_rather_no_git_submodule( + repo_root=repo_root, + source_roots=("README.md",), + is_excluded=lambda _relpath: False, + empty_selection_error="no files", + rather_no_git_submodule_context_name="test source selection", + ) + + self.assertEqual( + relpaths, + [ + "README.md", + "fluxon_rs/moka/Cargo.toml", + "fluxon_rs/moka/src/lib.rs", + ], + ) + + def test_load_rather_no_git_submodule_source_roots_uses_context_name_in_missing_dir_error(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + repo_root = Path(tmpdir) + cfg_path = repo_root / "setup_and_pack" / "rather_no_git_submodule.yaml" + cfg_path.parent.mkdir(parents=True, exist_ok=True) + cfg_path.write_text( + "modules:\n" + " - path: fluxon_rs/moka\n" + " repo: https://example.com/moka.git\n" + " checkout: main\n", + encoding="utf-8", + ) + + with self.assertRaisesRegex( + RuntimeError, + "test source selection requires configured rather_no_git_submodule path to exist", + ): + _MOD.load_rather_no_git_submodule_source_roots( + repo_root=repo_root, + context_name="test source selection", + ) + + def test_source_profiles_only_add_inclusions_beyond_gitignore(self) -> None: + self.assertTrue( + _PROFILE_MOD.source_profile_relpath_excluded( + profile=_PROFILE_MOD.SOURCE_SELECTION_PROFILE_SOURCE_PACK, + relpath=".dever/run.log", + ) + ) + self.assertTrue( + _PROFILE_MOD.source_profile_relpath_excluded( + profile=_PROFILE_MOD.SOURCE_SELECTION_PROFILE_SOURCE_PACK, + relpath="fluxon_release/install.py", + ) + ) + self.assertTrue( + _PROFILE_MOD.source_profile_relpath_excluded( + profile=_PROFILE_MOD.SOURCE_SELECTION_PROFILE_SOURCE_PACK, + 
relpath="skills/demo/SKILL.md", + ) + ) + self.assertFalse( + _PROFILE_MOD.source_profile_relpath_excluded( + profile=_PROFILE_MOD.SOURCE_SELECTION_PROFILE_BUILD_SEED, + relpath="fluxon_release/closed_sdk/manifest.json", + ) + ) + self.assertFalse( + _PROFILE_MOD.source_profile_relpath_excluded( + profile=_PROFILE_MOD.SOURCE_SELECTION_PROFILE_BUILD_SEED, + relpath="fluxon_doc_cn/roadmap.md", + ) + ) + self.assertFalse( + _PROFILE_MOD.source_profile_relpath_excluded( + profile=_PROFILE_MOD.SOURCE_SELECTION_PROFILE_BUILD_SEED, + relpath="deployment/utils/log_shard.py", + ) + ) + self.assertFalse( + _PROFILE_MOD.source_profile_relpath_excluded( + profile=_PROFILE_MOD.SOURCE_SELECTION_PROFILE_BUILD_SEED, + relpath="scripts/source_selection_profiles.py", + ) + ) + self.assertFalse( + _PROFILE_MOD.source_profile_relpath_excluded( + profile=_PROFILE_MOD.SOURCE_SELECTION_PROFILE_BUILD_SEED, + relpath="fluxon_rs/moka/examples/append_value_async.rs", + ) + ) + self.assertFalse( + _PROFILE_MOD.source_profile_relpath_excluded( + profile=_PROFILE_MOD.SOURCE_SELECTION_PROFILE_BUILD_SEED, + relpath="fluxon_rs/moka/tests/entry_api_sync.rs", + ) + ) + self.assertFalse( + _PROFILE_MOD.source_profile_relpath_excluded( + profile=_PROFILE_MOD.SOURCE_SELECTION_PROFILE_BUILD_SEED, + relpath="fluxon_rs/fluxon_cli/templates/landing.html", + ) + ) + + +if __name__ == "__main__": + raise SystemExit(unittest.main()) diff --git a/setup_and_pack/tests/test_lib_layout.py b/setup_and_pack/tests/test_lib_layout.py index dd19442..049be32 100644 --- a/setup_and_pack/tests/test_lib_layout.py +++ b/setup_and_pack/tests/test_lib_layout.py @@ -14,6 +14,10 @@ def _load_lib_layout(): + repo_root_str = str(REPO_ROOT) + if repo_root_str in sys.path: + sys.path.remove(repo_root_str) + sys.path.insert(0, repo_root_str) spec = importlib.util.spec_from_file_location("setup_and_pack_nix_lib_layout_test", LIB_LAYOUT_PATH) assert spec is not None and spec.loader is not None mod = 
importlib.util.module_from_spec(spec) @@ -83,14 +87,17 @@ def test_bridge_prebuilt_materializes_workspace_seed(self) -> None: self.assertTrue(workspace_seed_dir.is_dir()) self.assertTrue((workspace_seed_dir / "setup_and_pack/closed_sdk_contract.py").is_file()) self.assertTrue((workspace_seed_dir / "setup_and_pack/public_workspace_contract.py").is_file()) + self.assertTrue((workspace_seed_dir / "README.md").is_file()) self.assertTrue((workspace_seed_dir / "fluxon_rs/fluxon_commu_contract/Cargo.toml").is_file()) self.assertTrue((workspace_seed_dir / "fluxon_rs/fluxon_commu/Cargo.toml").is_file()) + self.assertTrue((workspace_seed_dir / "fluxon_rs/fluxon_ops/build.rs").is_file()) self.assertTrue((workspace_seed_dir / "fluxon_release/closed_sdk/manifest.json").is_file()) self.assertTrue((workspace_seed_dir / "setup_and_pack/nix/pack_fluxonkv_pylib.py").is_file()) self.assertTrue((workspace_seed_dir / "setup_and_pack/nix/pack_release_in_container.py").is_file()) self.assertTrue((workspace_seed_dir / "setup_and_pack/utils/__init__.py").is_file()) self.assertTrue((workspace_seed_dir / "setup_and_pack/utils/sudo_prefix_utils.py").is_file()) self.assertTrue((workspace_seed_dir / "setup_and_pack/utils/wheel_runtime_helper.py").is_file()) + self.assertTrue((workspace_seed_dir / "deployment/utils/log_shard.py").is_file()) self.assertTrue((workspace_seed_dir / "fluxon_rs/fluxon_kv/Cargo.toml").is_file()) self.assertTrue((workspace_seed_dir / "fluxon_rs/Cargo.lock").is_file()) self.assertTrue((workspace_seed_dir / "fluxon_rs/moka/Cargo.toml").is_file()) @@ -132,6 +139,41 @@ def test_load_experiment_spec_from_root_parses_closed_sdk_search_roots(self) -> (str(closed_sdk_root.resolve()),), ) + def test_load_experiment_spec_from_root_accepts_explicit_project_root(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + project_root = root / "repo" + generated_config_path = root / "generated" / "setup_and_pack" / "nix" / "pack_fluxonkv_pylib_ci.yaml" + 
(project_root / ".git").mkdir(parents=True, exist_ok=True) + (project_root / "setup_and_pack").mkdir(parents=True, exist_ok=True) + generated_config_path.parent.mkdir(parents=True, exist_ok=True) + + spec = _LIB_LAYOUT.load_experiment_spec_from_root( + config_path=generated_config_path, + config_root={ + "project_root": str(project_root.resolve()), + "store": { + "project_data_root": str((root / "project_data").resolve()), + }, + "runtime": { + "base_system": "manylinux_2_28", + "architectures": ["x86_64"], + "python_abi": "cpython3.10", + }, + "profile": { + "source_kind": "bridge_prebuilt", + "native_runtime_dir_names": ["cxxpacked"], + "target_support_dir_names": ["meson-0.64.0"], + "ext_bundle_dir_name": "cxxpacked", + }, + "assembly": { + "baseline_path": str((root / "baseline").resolve()), + }, + }, + ) + + self.assertEqual(spec.project_root, project_root.resolve()) + def test_load_experiment_config_root_expands_host_root_aliases(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: root = Path(tmpdir) diff --git a/setup_and_pack/tests/test_pack_fluxonkv_pylib_bridge_prebuilt.py b/setup_and_pack/tests/test_pack_fluxonkv_pylib_bridge_prebuilt.py index db1bcd7..bae0e86 100644 --- a/setup_and_pack/tests/test_pack_fluxonkv_pylib_bridge_prebuilt.py +++ b/setup_and_pack/tests/test_pack_fluxonkv_pylib_bridge_prebuilt.py @@ -38,6 +38,39 @@ def _load_module(): class BridgePrebuiltAuthorityMaterializationTest(unittest.TestCase): + def test_pyo3_workspace_inputs_follow_dynamic_public_workspace_selection(self) -> None: + relpaths = _PACKMOD.pyo3_workspace_copy_relative_paths( + transport_backend="tcp_thread", + rdma_backend="closed_sdk", + ) + + self.assertIn("README.md", relpaths) + self.assertIn("deployment/utils/log_shard.py", relpaths) + self.assertIn("fluxon_rs/fluxon_ops/build.rs", relpaths) + self.assertIn("fluxon_rs/moka/examples/append_value_async.rs", relpaths) + self.assertIn("fluxon_rs/fluxon_cli/templates/landing.html", relpaths) + 
self.assertNotIn("skills/browser-helm/SKILL.md", relpaths) + self.assertNotIn("fluxon_doc_cn/roadmap.md", relpaths) + + def test_pyo3_workspace_digest_tracks_selected_template_inputs(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + repo_root = Path(tmpdir) + landing_path = repo_root / "fluxon_rs" / "fluxon_cli" / "templates" / "landing.html" + landing_path.parent.mkdir(parents=True, exist_ok=True) + landing_path.write_text("v1\n", encoding="utf-8") + + digest_before = _PACKMOD._compute_inputs_digest( + repo_root, + ("fluxon_rs/fluxon_cli/templates/landing.html",), + ) + landing_path.write_text("v2\n", encoding="utf-8") + digest_after = _PACKMOD._compute_inputs_digest( + repo_root, + ("fluxon_rs/fluxon_cli/templates/landing.html",), + ) + + self.assertNotEqual(digest_before, digest_after) + def test_host_side_materialization_only_creates_placeholders(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: build_root = Path(tmpdir) diff --git a/setup_and_pack/tests/test_quick_start_release_only.py b/setup_and_pack/tests/test_quick_start_release_only.py index 6d41bf3..5b5b29f 100644 --- a/setup_and_pack/tests/test_quick_start_release_only.py +++ b/setup_and_pack/tests/test_quick_start_release_only.py @@ -123,6 +123,41 @@ def _handle_mq_shell_line(line, shutdown_requested, status_lines): self.assertIn("MQ shell status:", stdout.getvalue()) shutdown_requested.set.assert_not_called() + def test_quick_start_owner_configs_include_large_file_paths(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + workdir = Path(tmpdir) + + kv_cfg = _START._gen_kv_config( + "127.0.0.1:12379", + "qs_kv_cluster", + 31000, + 8083, + 0, + 14000, + workdir, + ) + mq_cfg = _START._gen_mq_config( + "127.0.0.1:12379", + "qs_mq_cluster", + 34200, + 14000, + workdir, + panel_port=18080, + ) + fs_cfg = _START._gen_fs_config( + "127.0.0.1:12379", + "qs_fs_cluster", + 34100, + 34180, + 14000, + workdir, + ) + + expected = [str(workdir / "large" / "owner")] + 
self.assertEqual(kv_cfg["kvclient"]["fluxonkv_spec"]["large_file_paths"], expected) + self.assertEqual(mq_cfg["kvclient"]["fluxonkv_spec"]["large_file_paths"], expected) + self.assertEqual(fs_cfg["kvclient"]["fluxonkv_spec"]["large_file_paths"], expected) + def test_pack_fluxon_pylib_cleans_stale_build_artifacts_before_bdist(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: repo_root = Path(tmpdir) diff --git a/setup_and_pack/utils/__init__.py b/setup_and_pack/utils/__init__.py index df414f6..3921245 100644 --- a/setup_and_pack/utils/__init__.py +++ b/setup_and_pack/utils/__init__.py @@ -10,6 +10,7 @@ _iter_digest_entries, build_cached_tarball, compute_paths_digest, + prune_stage_paths, rsync_stage, tar_gz, tarball_rule, @@ -66,6 +67,7 @@ "ArtifactRule", "tarball_rule", "build_cached_tarball", + "prune_stage_paths", "rsync_stage", "tar_gz", "_iter_digest_entries", diff --git a/setup_and_pack/utils/artifact_cache_digest_utils.py b/setup_and_pack/utils/artifact_cache_digest_utils.py index 11739ef..d3780e3 100644 --- a/setup_and_pack/utils/artifact_cache_digest_utils.py +++ b/setup_and_pack/utils/artifact_cache_digest_utils.py @@ -1,8 +1,10 @@ from __future__ import annotations import enum +import fnmatch import hashlib import os +import shutil from dataclasses import dataclass from pathlib import Path from typing import Callable, Collection, Iterator, Sequence @@ -19,6 +21,7 @@ "ArtifactCheck", "ArtifactRule", "tarball_rule", + "prune_stage_paths", "build_cached_tarball", "rsync_stage", "tar_gz", @@ -114,7 +117,14 @@ def build_cached_tarball(*, rule: ArtifactRule, out_path: Path, build_tarball: C rule.write_stamp(check.digest) -def rsync_stage(*, repo_root: Path, src: Path, dst: Path, honor_gitignore: bool) -> None: +def rsync_stage( + *, + repo_root: Path, + src: Path, + dst: Path, + honor_gitignore: bool, + exclude_rel_paths: tuple[str, ...] 
= (), +) -> None: if not src.exists(): print(f"Missing required source path for staging: {src}") raise SystemExit(1) @@ -132,6 +142,8 @@ def rsync_stage(*, repo_root: Path, src: Path, dst: Path, honor_gitignore: bool) "--exclude-from=.gitignore", "--filter=:- .gitignore", ] + for pattern in exclude_rel_paths: + argv.append(f"--exclude={pattern}") if src.is_dir(): argv += [str(src) + "/", str(dst) + "/"] else: @@ -139,6 +151,21 @@ def rsync_stage(*, repo_root: Path, src: Path, dst: Path, honor_gitignore: bool) run_cmd_argv(argv, cwd=repo_root) +def prune_stage_paths(stage_root: Path, exclude_rel_paths: tuple[str, ...]) -> None: + if not stage_root.exists(): + return + for path in sorted(stage_root.rglob("*"), reverse=True): + rel_path = path.relative_to(stage_root).as_posix() + for pattern in exclude_rel_paths: + normalized_pattern = pattern.rstrip("/") + if fnmatch.fnmatch(rel_path, normalized_pattern) or fnmatch.fnmatch(path.name, normalized_pattern): + if path.is_dir() and not path.is_symlink(): + shutil.rmtree(path) + else: + path.unlink(missing_ok=True) + break + + def tar_gz( *, cwd: Path, diff --git a/setup_and_pack/utils/repo_config_utils.py b/setup_and_pack/utils/repo_config_utils.py index 46f4686..ca51703 100644 --- a/setup_and_pack/utils/repo_config_utils.py +++ b/setup_and_pack/utils/repo_config_utils.py @@ -9,8 +9,7 @@ from deployment.utils.deployconf_config_utils import ( load_deployconf_etcd_address, load_deployconf_fluxon_cluster_name, - load_deployconf_fluxon_shared_file_path, - load_deployconf_fluxon_shared_memory_path, + load_deployconf_fluxon_share_mem_path, load_deployconf_mapping, load_deployconf_prom_remote_write_url, load_deployconf_prometheus_base_url, @@ -35,16 +34,14 @@ "load_test_kv_svc_type_from_test_config", "load_test_etcd_address_from_test_config", "load_test_fluxon_cluster_name_from_test_config", - "load_test_fluxon_shared_memory_path_from_test_config", - "load_test_fluxon_shared_file_path_from_test_config", + 
"load_test_fluxon_share_mem_path_from_test_config", "load_deployconf_mapping", "load_deployconf_resolved_global_envs", "load_deployconf_etcd_address", "load_deployconf_prometheus_base_url", "load_deployconf_prom_remote_write_url", "load_deployconf_fluxon_cluster_name", - "load_deployconf_fluxon_shared_memory_path", - "load_deployconf_fluxon_shared_file_path", + "load_deployconf_fluxon_share_mem_path", "load_deployconf_service_ip_port", ] @@ -375,19 +372,10 @@ def load_test_fluxon_cluster_name_from_test_config(*, config_path: Optional[Path return raw.strip() -def load_test_fluxon_shared_memory_path_from_test_config(*, config_path: Optional[Path] = None) -> str: - """Load Fluxon shared-memory root from test_config.yaml as the single test authority.""" +def load_test_fluxon_share_mem_path_from_test_config(*, config_path: Optional[Path] = None) -> str: + """Load Fluxon shared bundle root from test_config.yaml as the single test authority.""" test_cfg = load_test_config_mapping(config_path=config_path) - raw = test_cfg.get("shared_memory_path") + raw = test_cfg.get("share_mem_path") if not isinstance(raw, str) or not raw.strip(): - raise ValueError("test_config.yaml must define non-empty shared_memory_path") - return raw.strip() - - -def load_test_fluxon_shared_file_path_from_test_config(*, config_path: Optional[Path] = None) -> str: - """Load Fluxon shared-file root from test_config.yaml as the single test authority.""" - test_cfg = load_test_config_mapping(config_path=config_path) - raw = test_cfg.get("shared_file_path") - if not isinstance(raw, str) or not raw.strip(): - raise ValueError("test_config.yaml must define non-empty shared_file_path") + raise ValueError("test_config.yaml must define non-empty share_mem_path") return raw.strip()