From e986d0eceb13858b565b2fef3a49e981a1260eda Mon Sep 17 00:00:00 2001
From: Maosheng Liao <maoshengl@nvidia.com>
Date: Tue, 16 Jun 2026 06:59:57 -0700
Subject: [PATCH 1/6] feat(finetune): vendor the 4 Cosmos3 SFT recipe TOMLs

---
 .../finetune/toml/sft_config/llava_ov.toml    | 108 ++++++++++++++++++
 .../toml/sft_config/videophy2_sft_nano.toml   |  91 +++++++++++++++
 .../toml/sft_config/vision_sft_nano.toml      |  91 +++++++++++++++
 .../toml/sft_config/vision_sft_super.toml     |  92 +++++++++++++++
 4 files changed, 382 insertions(+)
 create mode 100644 cookbooks/cosmos3/finetune/toml/sft_config/llava_ov.toml
 create mode 100644 cookbooks/cosmos3/finetune/toml/sft_config/videophy2_sft_nano.toml
 create mode 100644 cookbooks/cosmos3/finetune/toml/sft_config/vision_sft_nano.toml
 create mode 100644 cookbooks/cosmos3/finetune/toml/sft_config/vision_sft_super.toml

diff --git a/cookbooks/cosmos3/finetune/toml/sft_config/llava_ov.toml b/cookbooks/cosmos3/finetune/toml/sft_config/llava_ov.toml
new file mode 100644
index 00000000..41fe3502
--- /dev/null
+++ b/cookbooks/cosmos3/finetune/toml/sft_config/llava_ov.toml
@@ -0,0 +1,108 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# pre_exp012_llava_ov — VLM training on lmms-lab/LLaVA-OneVision-Data
+# via CosmosDataLoader. Base config = cosmos_framework/configs/base/vlm/config.py
+# (selected by [job].task="vlm").
+#
+# One knob that the SFTExperimentConfig dataclass does NOT model — supply
+# it as a CLI extra override at launch time:
+#
+#   data_setting.max_tokens=<int, drives both max_seq_len and dataloader.max_tokens>
+#
+# (The backbone is now modeled — see [model.backbone] below.)
+#
+# Example launch:
+#   torchrun --nproc_per_node=4 -m cosmos_framework.scripts.train \
+#       --sft-toml toml/sft_config/llava_ov.toml -- \
+#       data_setting.max_tokens=16000
+#
+# Per-task remap (see _PATH_REMAPS["vlm"]):
+#   model.parallelism.*            -> model.config.parallelism.*
+#   model.compile.*                -> model.config.compile.*
+#   model.activation_checkpointing.* -> model.config.activation_checkpointing.*
+#   model.precision                -> model.config.precision
+#   model.attn_implementation      -> model.config.policy.attn_implementation
+#   model.backbone.*               -> model.config.policy.backbone.*
+#   model.ema.*                    -> model.config.ema.*
+#   model.{max_num_tokens_after_packing, joint_attn_implementation, lora_*,
+#          tokenizer.*} and dataloader_train.{max_sequence_length, seed} -> SKIPPED
+
+[job]
+task         = "vlm"
+experiment   = "pre_exp012_llava_ov"
+project      = "cosmos3"                                 # matches legacy
+group        = "vlm_llava_ov_demo"
+name         = "pre_exp012_llava_ov"
+wandb_mode   = "disabled"
+
+[model]
+# VLM-only attention impl (PolicyConfig.attn_implementation).
+attn_implementation = "cosmos"     # "cosmos" | "flash_attention_2" | "sdpa" | "eager"
+precision           = "bfloat16"   # was [model.parallelism].precision
+
+[model.backbone]
+model_name = "Qwen/Qwen3-VL-8B-Instruct"   # → model.config.policy.backbone.model_name (VLM remap)
+
+[model.ema]
+enabled         = false
+rate            = 0.1
+iteration_shift = 0
+
+[model.parallelism]
+data_parallel_shard_degree      = 8                  # matches legacy dp_shard_size=8
+data_parallel_replicate_degree  = -1                 # matches legacy dp_replicate_size=-1
+context_parallel_shard_degree   = 1
+cfg_parallel_shard_degree       = 1
+
+[model.compile]
+enabled                         = false              # was [model.parallelism].use_torch_compile
+compile_dynamic                 = true
+
+[model.activation_checkpointing]
+mode                = "full"
+save_ops_regex      = ["fmha"]
+preserve_rng_state  = true
+determinism_check   = "default"
+
+[optimizer]
+betas         = [0.9, 0.95]
+eps           = 1.0e-8                              # skipped for VLM by _PATH_REMAPS
+fused         = true
+lr            = 1.0e-5                              # matches legacy
+weight_decay  = 0.1                                 # matches legacy
+# keys_to_select / lr_multipliers omitted — VLM Trainer defaults apply.
+
+[scheduler]
+cycle_lengths      = [500]                          # matches legacy (VLM_LAMBDACOSINE_KWARGS uses ${trainer.max_iter})
+f_max              = [1.0]
+f_min              = [0.5]                          # matches legacy
+f_start            = [0.05]                         # matches legacy
+verbosity_interval = 0                              # skipped for VLM by _PATH_REMAPS
+warm_up_steps      = [1000]                         # matches legacy. NOTE(review): larger than cycle_lengths / trainer.max_iter (500) — presumably the run ends mid-warm-up; confirm intended
+
+[trainer]
+distributed_parallelism = "fsdp"
+grad_accum_iter         = 1
+logging_iter            = 1
+max_iter                = 500                     # matches legacy
+
+[trainer.callbacks.compile_tokenizer]
+compile_after_iterations = 3
+enabled                  = false
+
+[trainer.callbacks.grad_clip]
+clip_norm    = 1.0
+force_finite = false                                # matches VLM default in cosmos_framework/configs/base/vlm/defaults/callbacks.py:55
+
+[checkpoint]
+keys_to_skip_loading = []
+load_path            = "???"                      # MISSING sentinel; skipped by build_hydra_overrides — supply at runtime
+save_iter            = 100
+
+[dataloader_train]
+# Routed by PATH_REMAPS["vlm"] onto the CosmosDataLoader's nested PoolPackingBatcher:
+#   max_samples_per_batch -> dataloader_train.batcher.max_batch_size
+#   max_sequence_length   -> dataloader_train.batcher.max_tokens  (NOTE(review): the file header lists this key as SKIPPED for VLM — confirm which is current)
+max_samples_per_batch = 1
+max_sequence_length   = 16000
diff --git a/cookbooks/cosmos3/finetune/toml/sft_config/videophy2_sft_nano.toml b/cookbooks/cosmos3/finetune/toml/sft_config/videophy2_sft_nano.toml
new file mode 100644
index 00000000..fa1ae613
--- /dev/null
+++ b/cookbooks/cosmos3/finetune/toml/sft_config/videophy2_sft_nano.toml
@@ -0,0 +1,91 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# videophy2_sft_nano — VLM dialog SFT on VideoPhy-2 via CosmosDataLoader.
+# Base config = cosmos_framework/configs/base/vlm/config.py (selected by [job].task="vlm").
+#
+# Dataset prep:
+#   python -m cosmos_framework.scripts.vlm.prepare_videophy2_from_hf \
+#       --out_root $VIDEOPHYSICS_ROOT --split train  # and again with --split val
+#
+# Required env at launch: VIDEOPHYSICS_ROOT (read by the experiment Python).
+#
+# Example launch:
+#   bash launch_sft_videophy2_nano.sh
+
+[job]
+task         = "vlm"
+experiment   = "videophy2_sft_nano"
+project      = "cosmos3"
+group        = "vlm_videophy2_sft"
+name         = "videophy2_sft_nano"
+wandb_mode   = "disabled"
+
+[model]
+attn_implementation = "cosmos"
+precision           = "bfloat16"                         # was [model.parallelism].precision
+
+[model.backbone]
+model_name = "Qwen/Qwen3-VL-8B-Instruct"
+
+[model.ema]
+enabled         = false
+rate            = 0.1
+iteration_shift = 0
+
+[model.parallelism]
+data_parallel_shard_degree      = 8
+data_parallel_replicate_degree  = -1
+context_parallel_shard_degree   = 1
+cfg_parallel_shard_degree       = 1
+
+[model.compile]
+enabled                         = false                  # was [model.parallelism].use_torch_compile
+compile_dynamic                 = true
+
+[model.activation_checkpointing]
+mode                = "full"
+save_ops_regex      = ["fmha"]
+preserve_rng_state  = true
+determinism_check   = "default"
+
+[optimizer]
+betas         = [0.9, 0.95]
+eps           = 1.0e-8
+fused         = true
+lr            = 1.0e-6
+weight_decay  = 0.1
+
+[scheduler]
+cycle_lengths      = [50]
+f_max              = [1.0]
+f_min              = [0.1]
+f_start            = [0.05]
+verbosity_interval = 0
+warm_up_steps      = [5]
+
+[trainer]
+distributed_parallelism = "fsdp"
+grad_accum_iter         = 8
+logging_iter            = 1
+max_iter                = 50
+
+[trainer.callbacks.compile_tokenizer]
+compile_after_iterations = 3
+enabled                  = false
+
+[trainer.callbacks.grad_clip]
+clip_norm    = 1.0
+force_finite = false
+
+[checkpoint]
+keys_to_skip_loading = []
+load_path            = "???"
+save_iter            = 100
+
+[dataloader_train]
+# Routed by PATH_REMAPS["vlm"] onto the CosmosDataLoader's nested PoolPackingBatcher:
+#   max_samples_per_batch -> dataloader_train.batcher.max_batch_size
+#   max_sequence_length   -> dataloader_train.batcher.max_tokens
+max_samples_per_batch = 1
+max_sequence_length   = 16000
diff --git a/cookbooks/cosmos3/finetune/toml/sft_config/vision_sft_nano.toml b/cookbooks/cosmos3/finetune/toml/sft_config/vision_sft_nano.toml
new file mode 100644
index 00000000..dbb192dc
--- /dev/null
+++ b/cookbooks/cosmos3/finetune/toml/sft_config/vision_sft_nano.toml
@@ -0,0 +1,91 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# vision_sft_nano — T2V / I2V / V2V vision-only SFT (Qwen3-VL-8B / nano)
+# Consumed by cosmos_framework.configs.toml_config.sft_config.load_experiment_from_toml.
+# Uses PackingDataLoader (no dataloader_train.seed slot — keep it omitted here).
+
+[job]
+task         = "vfm"
+experiment   = "vision_sft_nano"
+project      = "cosmos3"
+group        = "sft"
+name         = "vision_sft_nano"
+wandb_mode   = "disabled"
+
+[model]
+max_num_tokens_after_packing = 45056
+joint_attn_implementation    = "two_way"
+precision                    = "bfloat16"                # was [model.parallelism].precision
+
+[model.ema]
+enabled         = true
+rate            = 0.1
+iteration_shift = 0
+
+[model.parallelism]
+data_parallel_shard_degree      = -1                     # -1 = auto from WORLD_SIZE (matches legacy)
+data_parallel_replicate_degree  = 1
+
+[model.compile]
+enabled                         = true                   # was [model.parallelism].use_torch_compile
+compile_dynamic                 = true
+
+[model.activation_checkpointing]
+mode                = "full"
+save_ops_regex      = ["fmha"]
+preserve_rng_state  = true
+determinism_check   = "default"
+
+[model.tokenizer]
+vae_path = "${oc.env:WAN_VAE_PATH}"
+
+[optimizer]
+betas         = [0.9, 0.95]
+eps           = 1.0e-6
+fused         = true
+keys_to_select = [
+    "moe_gen",
+    "time_embedder",
+    "vae2llm",
+    "llm2vae",
+]
+lr            = 2.0e-5
+weight_decay  = 0                                        # int matches legacy YAML repr
+# lr_multipliers intentionally empty for vision SFT (Hydra default {} stands).
+
+[scheduler]
+cycle_lengths      = [1000]
+f_max              = [1.0]
+f_min              = [0.0]
+f_start            = [0.0]
+verbosity_interval = 0
+warm_up_steps      = [50]
+
+[trainer]
+distributed_parallelism = "fsdp"
+grad_accum_iter         = 2
+logging_iter            = 1
+max_iter                = 500
+
+[trainer.callbacks.compile_tokenizer]
+compile_after_iterations = 3
+enabled                  = false
+# warmup_resolutions omitted (None at experiment level)
+
+[trainer.callbacks.grad_clip]
+clip_norm    = 0.1
+force_finite = true
+
+[checkpoint]
+keys_to_skip_loading = ["net_ema."]
+load_path            = "${oc.env:BASE_CHECKPOINT_PATH}"
+save_iter            = 100
+
+[dataloader_train]
+max_sequence_length = 45056
+# Per-caption token cap before truncation. Structured-JSON captions run longer than
+# dense prose (measured max ~1790 tokens), so keep headroom; raise it for longer captions.
+max_caption_tokens = 2048
+# max_samples_per_batch omitted (None — PackingDataLoader doesn't cap by sample count)
+# seed omitted — PackingDataLoader has no seed ctor kwarg
diff --git a/cookbooks/cosmos3/finetune/toml/sft_config/vision_sft_super.toml b/cookbooks/cosmos3/finetune/toml/sft_config/vision_sft_super.toml
new file mode 100644
index 00000000..06a1574a
--- /dev/null
+++ b/cookbooks/cosmos3/finetune/toml/sft_config/vision_sft_super.toml
@@ -0,0 +1,92 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# vision_sft_super — LoRA-only T2V/I2V/V2V SFT on Qwen3-VL-32B (super tier).
+# Consumed by cosmos_framework.configs.toml_config.sft_config.load_experiment_from_toml.
+# Uses PackingDataLoader (no dataloader_train.seed slot — keep it omitted).
+
+[job]
+task         = "vfm"
+experiment   = "vision_sft_super"
+project      = "cosmos3"
+group        = "sft"
+name         = "vision_sft_super"
+wandb_mode   = "disabled"
+
+[model]
+max_num_tokens_after_packing = 45056
+joint_attn_implementation    = "two_way"
+lora_enabled                 = true
+lora_rank                    = 16
+lora_alpha                   = 32
+lora_target_modules          = "q_proj_moe_gen,k_proj_moe_gen,v_proj_moe_gen,o_proj_moe_gen"
+precision                    = "bfloat16"                # was [model.parallelism].precision
+
+[model.ema]
+enabled         = false                                  # super uses LoRA, no EMA
+rate            = 0.1
+iteration_shift = 0
+
+[model.parallelism]
+data_parallel_shard_degree      = -1                     # -1 = auto from WORLD_SIZE (matches legacy)
+data_parallel_replicate_degree  = 1
+context_parallel_shard_degree   = 2                      # super uses CP=2
+cfg_parallel_shard_degree       = 1
+
+[model.compile]
+enabled                         = false                  # super disables compile (was use_torch_compile)
+compile_dynamic                 = true
+
+[model.activation_checkpointing]
+mode                = "full"
+save_ops_regex      = ["fmha"]
+preserve_rng_state  = true
+determinism_check   = "default"
+
+[model.tokenizer]
+vae_path = "${oc.env:WAN_VAE_PATH}"
+
+[optimizer]
+betas          = [0.9, 0.95]
+eps            = 1.0e-6
+fused          = true
+keys_to_select = ["lora_"]                               # train LoRA adapters only
+lr             = 5.0e-4
+weight_decay   = 0                                       # int matches legacy YAML repr
+# lr_multipliers intentionally empty.
+
+[scheduler]
+cycle_lengths      = [1000]
+f_max              = [1.0]
+f_min              = [0.0]
+f_start            = [0.0]
+verbosity_interval = 0
+warm_up_steps      = [50]
+
+[trainer]
+distributed_parallelism = "fsdp"
+grad_accum_iter         = 2
+logging_iter            = 1
+max_iter                = 500
+
+[trainer.callbacks.compile_tokenizer]
+compile_after_iterations = 3
+enabled                  = false
+warmup_resolutions       = ["256", "480", "720"]
+
+[trainer.callbacks.grad_clip]
+clip_norm    = 0.1
+force_finite = true
+
+[checkpoint]
+keys_to_skip_loading = ["net_ema.", "lora_"]             # LoRA tensors freshly init
+load_path            = "${oc.env:BASE_CHECKPOINT_PATH}"
+save_iter            = 100
+
+[dataloader_train]
+max_sequence_length = 45056
+# Per-caption token cap before truncation. Structured-JSON captions run longer than
+# dense prose (measured max ~1790 tokens), so keep headroom; raise it for longer captions.
+max_caption_tokens = 2048
+# max_samples_per_batch omitted (None — PackingDataLoader doesn't cap by count)
+# seed omitted — PackingDataLoader has no seed ctor kwarg

From 1cb31427da4479ce259e2019d8c0911bd4ba796a Mon Sep 17 00:00:00 2001
From: Maosheng Liao <maoshengl@nvidia.com>
Date: Tue, 16 Jun 2026 07:00:53 -0700
Subject: [PATCH 2/6] feat(finetune): vendor + rewire shared SFT launcher for
 the cookbook

---
 .../cosmos3/finetune/_sft_launcher_common.sh  | 98 +++++++++++++++++++
 1 file changed, 98 insertions(+)
 create mode 100644 cookbooks/cosmos3/finetune/_sft_launcher_common.sh

diff --git a/cookbooks/cosmos3/finetune/_sft_launcher_common.sh b/cookbooks/cosmos3/finetune/_sft_launcher_common.sh
new file mode 100644
index 00000000..377b10f9
--- /dev/null
+++ b/cookbooks/cosmos3/finetune/_sft_launcher_common.sh
@@ -0,0 +1,98 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# Shared launch plumbing for the cookbook launch_sft_<recipe>.sh — the
+# structured-TOML / pydantic-schema flow that drives cosmos_framework.scripts.train.
+#
+# REQUIRES: an activated cosmos-framework venv (see the finetune README
+# Prerequisites) so `cosmos_framework` is importable. This launcher does NOT
+# add the framework's .venv/bin to PATH.
+#
+# Caller MUST set before sourcing:
+#   TOML_FILE            recipe TOML, e.g. "toml/sft_config/<recipe>.toml".
+#                        Absolute or cookbook-relative.
+#
+# Caller MAY set before sourcing (presence drives which existence checks fire):
+#   DATASET_PATH         recipe-local dataset dir, e.g. "data/<name>".
+#                        If unset, no dataset existence check fires
+#                        (reasoner / HF-streaming case).
+#   BASE_CHECKPOINT_PATH recipe-local base DCP dir, e.g. "checkpoints/<name>".
+#                        Setting it also enables WAN_VAE_PATH plumbing + check.
+#   WAN_VAE_PATH         override the default checkpoints/wan22_vae/Wan2.2_VAE.pth.
+#   EXTRA_DATASET_CHECK  bash snippet (string) eval'd after the default checks.
+#   TAIL_OVERRIDES       bash array of Hydra CLI overrides appended after `--`
+#                        (e.g. data_setting.max_tokens=16000 for VLM smokes).
+#   MASTER_PORT          torchrun --master_port; default 50012.
+#   NPROC_PER_NODE       torchrun --nproc_per_node; default 8.
+#   LOG_FILENAME         override $LOG_DIR/${LOG_FILENAME}
+#                        (default <toml-stem>_sft.log).
+#
+# Absolute paths are passed through; relative paths are anchored to the cookbook
+# dir (the directory containing this launcher). Paths set in the caller's shell
+# via `export DATASET_PATH=...` etc. win over the launcher's defaults (use the
+# `: "${VAR:=default}"` idiom in the launcher to preserve this).
+
+set -uo pipefail
+
+: "${TOML_FILE:?TOML_FILE must be set before sourcing _sft_launcher_common.sh}"
+
+# Cookbook dir = the wrapper's own directory (cookbooks/cosmos3/finetune/).
+WORKDIR="$(cd "$(dirname "${BASH_SOURCE[1]}")" && pwd)"
+
+# Anchor relative paths to $WORKDIR.
+[[ "$TOML_FILE" = /* ]] || TOML_FILE="$WORKDIR/$TOML_FILE"
+
+if [[ -n "${DATASET_PATH:-}" ]]; then
+    [[ "$DATASET_PATH" = /* ]] || DATASET_PATH="$WORKDIR/$DATASET_PATH"
+    export DATASET_PATH
+fi
+
+if [[ -n "${BASE_CHECKPOINT_PATH:-}" ]]; then
+    [[ "$BASE_CHECKPOINT_PATH" = /* ]] || BASE_CHECKPOINT_PATH="$WORKDIR/$BASE_CHECKPOINT_PATH"
+    WAN_VAE_PATH="${WAN_VAE_PATH:-checkpoints/wan22_vae/Wan2.2_VAE.pth}"
+    [[ "$WAN_VAE_PATH" = /* ]] || WAN_VAE_PATH="$WORKDIR/$WAN_VAE_PATH"
+    export BASE_CHECKPOINT_PATH WAN_VAE_PATH
+fi
+
+OUTPUT_ROOT="${OUTPUT_ROOT:-$WORKDIR/outputs/train}"
+LOG_DIR="$OUTPUT_ROOT/logs"
+TOML_STEM="$(basename "$TOML_FILE" .toml)"
+LOG_FILE="$LOG_DIR/${LOG_FILENAME:-${TOML_STEM}_sft.log}"
+IMAGINAIRE_OUTPUT_ROOT="${IMAGINAIRE_OUTPUT_ROOT:-$OUTPUT_ROOT}"
+mkdir -p "$LOG_DIR"
+
+echo ">>> $(date '+%H:%M:%S') Checking inputs..."
+[[ -f "$TOML_FILE" ]] || { echo "ERROR: TOML not found: $TOML_FILE" >&2; exit 1; }
+if [[ -n "${DATASET_PATH:-}" ]]; then
+    [[ -d "$DATASET_PATH" ]] || { echo "ERROR: DATASET_PATH not found: $DATASET_PATH (run Step 1 of the finetune README, or export DATASET_PATH=<path>)" >&2; exit 1; }
+fi
+if [[ -n "${BASE_CHECKPOINT_PATH:-}" ]]; then
+    [[ -d "$BASE_CHECKPOINT_PATH" ]] || { echo "ERROR: BASE_CHECKPOINT_PATH not found: $BASE_CHECKPOINT_PATH (run Step 2 of the finetune README, or export BASE_CHECKPOINT_PATH=<path>)" >&2; exit 1; }
+    [[ -f "$WAN_VAE_PATH" ]]         || { echo "ERROR: WAN_VAE_PATH not found: $WAN_VAE_PATH (run Step 1 of the finetune README, or export WAN_VAE_PATH=<path>)" >&2; exit 1; }
+fi
+if [[ -n "${EXTRA_DATASET_CHECK:-}" ]]; then eval "$EXTRA_DATASET_CHECK"; fi
+
+cd "$WORKDIR"
+echo ">>> $(date '+%H:%M:%S') WORKDIR:    $WORKDIR"
+echo ">>> $(date '+%H:%M:%S') TOML:       $TOML_FILE"
+[[ -n "${DATASET_PATH:-}" ]]         && echo ">>> $(date '+%H:%M:%S') dataset:    $DATASET_PATH"
+[[ -n "${BASE_CHECKPOINT_PATH:-}" ]] && echo ">>> $(date '+%H:%M:%S') checkpoint: $BASE_CHECKPOINT_PATH"
+echo ">>> $(date '+%H:%M:%S') log:        $LOG_FILE"
+
+# Default empty if caller didn't set; safe under set -u.
+[[ ${TAIL_OVERRIDES+x} ]] || TAIL_OVERRIDES=()
+
+TRAILING_ARGS=()
+if (( ${#TAIL_OVERRIDES[@]} > 0 )); then
+    TRAILING_ARGS=(-- "${TAIL_OVERRIDES[@]}")
+fi
+# bash < 4.4 treats "${arr[@]}" on an empty array as unbound under set -u; the ${arr[@]+...} guard below expands to nothing instead.
+IMAGINAIRE_OUTPUT_ROOT="$IMAGINAIRE_OUTPUT_ROOT" \
+    torchrun --nproc_per_node="${NPROC_PER_NODE:-8}" --master_port="${MASTER_PORT:-50012}" -m cosmos_framework.scripts.train \
+    --sft-toml="$TOML_FILE" \
+    ${TRAILING_ARGS[@]+"${TRAILING_ARGS[@]}"} \
+    2>&1 | tee "$LOG_FILE"
+# PIPESTATUS[0] is the first pipeline stage, so the script exits with torchrun's status rather than tee's.
+EXIT_CODE=${PIPESTATUS[0]}
+echo ">>> $(date '+%H:%M:%S') Done (exit $EXIT_CODE)"
+exit "$EXIT_CODE"

From ca03aa042f4429e7f63ee6dd5067f9e07452beae Mon Sep 17 00:00:00 2001
From: Maosheng Liao <maoshengl@nvidia.com>
Date: Tue, 16 Jun 2026 07:01:57 -0700
Subject: [PATCH 3/6] feat(finetune): vendor + rewire the 4 SFT launch shells

---
 .../cosmos3/finetune/launch_sft_llava_ov.sh   | 43 ++++++++++++++++++
 .../finetune/launch_sft_videophy2_nano.sh     | 45 +++++++++++++++++++
 .../finetune/launch_sft_vision_nano.sh        | 30 +++++++++++++
 .../finetune/launch_sft_vision_super.sh       | 36 +++++++++++++++
 4 files changed, 154 insertions(+)
 create mode 100644 cookbooks/cosmos3/finetune/launch_sft_llava_ov.sh
 create mode 100644 cookbooks/cosmos3/finetune/launch_sft_videophy2_nano.sh
 create mode 100644 cookbooks/cosmos3/finetune/launch_sft_vision_nano.sh
 create mode 100644 cookbooks/cosmos3/finetune/launch_sft_vision_super.sh

diff --git a/cookbooks/cosmos3/finetune/launch_sft_llava_ov.sh b/cookbooks/cosmos3/finetune/launch_sft_llava_ov.sh
new file mode 100644
index 00000000..1967cfca
--- /dev/null
+++ b/cookbooks/cosmos3/finetune/launch_sft_llava_ov.sh
@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# Structured-TOML launch for llava_ov (VLM SFT on
+# lmms-lab/LLaVA-OneVision-Data via CosmosDataLoader). Drives
+# cosmos_framework.scripts.train against toml/sft_config/llava_ov.toml.
+#
+# [job].task = "vlm" — picks cosmos_framework/configs/base/vlm/config.py as the base config.
+#
+# Requires an activated cosmos-framework venv (see the finetune README
+# Prerequisites). Run from cookbooks/cosmos3/finetune/.
+#
+# The dataset streams from the HuggingFace Hub, so DATASET_PATH /
+# WAN_VAE_PATH / BASE_CHECKPOINT_PATH are NOT required.
+#
+# Optional env:
+#   HF_TOKEN               for gated Qwen3-VL-8B-Instruct downloads.
+#   VLM_SAFETENSORS_PATH   local directory of pre-converted Qwen3-VL safetensors
+#                          (e.g. a Cosmos3-Nano LM merged with Qwen3-VL visual via
+#                          `cosmos_framework.scripts.convert_model_to_vlm_safetensors`).
+#                          When set, plumbed to backbone.safetensors_path via a
+#                          tail override. When unset, the framework falls back
+#                          to the public Qwen/Qwen3-VL-8B-Instruct HF snapshot.
+#
+# Usage (8-GPU allocation, framework venv active, from cookbooks/cosmos3/finetune/):
+#   bash launch_sft_llava_ov.sh
+
+TOML_FILE="toml/sft_config/llava_ov.toml"
+# Optional EXTRA_TAIL_OVERRIDES: whitespace-separated Hydra overrides; expanded unquoted so each word becomes one array element (also subject to globbing).
+TAIL_OVERRIDES=(
+    ${EXTRA_TAIL_OVERRIDES:-}
+)
+
+# When VLM_SAFETENSORS_PATH is set, plumb it to backbone.safetensors_path so the
+# framework loads weights from the local snapshot (e.g. a Cosmos3-Nano LM merged
+# with Qwen3-VL visual via `cosmos_framework.scripts.convert_model_to_vlm_safetensors`)
+# while keeping the public HF model_name for tokenizer/architecture discovery.
+if [[ -n "${VLM_SAFETENSORS_PATH:-}" ]]; then
+    TAIL_OVERRIDES+=("model.config.policy.backbone.safetensors_path=$VLM_SAFETENSORS_PATH")
+fi
+
+source "$(dirname "${BASH_SOURCE[0]}")/_sft_launcher_common.sh"
diff --git a/cookbooks/cosmos3/finetune/launch_sft_videophy2_nano.sh b/cookbooks/cosmos3/finetune/launch_sft_videophy2_nano.sh
new file mode 100644
index 00000000..9499cc5c
--- /dev/null
+++ b/cookbooks/cosmos3/finetune/launch_sft_videophy2_nano.sh
@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# Structured-TOML launch for videophy2_sft_nano (VLM dialog SFT on VideoPhy-2
+# via CosmosDataLoader). Drives cosmos_framework.scripts.train against
+# toml/sft_config/videophy2_sft_nano.toml.
+#
+# [job].task = "vlm" — picks cosmos_framework/configs/base/vlm/config.py as the base config.
+#
+# Requires an activated cosmos-framework venv (see the finetune README
+# Prerequisites). Run from cookbooks/cosmos3/finetune/.
+#
+# Required env:
+#   VIDEOPHYSICS_ROOT  dir containing videophy2_train/ and videophy2_val/
+#                      (each with meta.json + media/ + text/). Populate via
+#                      `python -m cosmos_framework.scripts.vlm.prepare_videophy2_from_hf`.
+#
+# Optional env:
+#   HF_TOKEN               for gated Qwen3-VL-8B-Instruct downloads.
+#   VLM_SAFETENSORS_PATH   local directory of pre-converted Qwen3-VL safetensors
+#                          (e.g. Cosmos3-Nano LM merged with Qwen3-VL visual via
+#                          `cosmos_framework.scripts.convert_model_to_vlm_safetensors`).
+#                          When set, plumbed to backbone.safetensors_path via a
+#                          tail override. When unset, the framework falls back
+#                          to the public Qwen/Qwen3-VL-8B-Instruct HF snapshot.
+#
+# Usage (8-GPU allocation, framework venv active, from cookbooks/cosmos3/finetune/):
+#   VIDEOPHYSICS_ROOT=/path/to/videophysics bash launch_sft_videophy2_nano.sh
+
+TOML_FILE="toml/sft_config/videophy2_sft_nano.toml"
+
+TAIL_OVERRIDES=(
+    ${EXTRA_TAIL_OVERRIDES:-}
+)
+
+# When VLM_SAFETENSORS_PATH is set, plumb it to backbone.safetensors_path so the
+# framework loads weights from the local snapshot (e.g. a Cosmos3-Nano LM merged
+# with Qwen3-VL visual via `cosmos_framework.scripts.convert_model_to_vlm_safetensors`)
+# while keeping the public HF model_name for tokenizer/architecture discovery.
+if [[ -n "${VLM_SAFETENSORS_PATH:-}" ]]; then
+    TAIL_OVERRIDES+=("model.config.policy.backbone.safetensors_path=$VLM_SAFETENSORS_PATH")
+fi
+
+source "$(dirname "${BASH_SOURCE[0]}")/_sft_launcher_common.sh"
diff --git a/cookbooks/cosmos3/finetune/launch_sft_vision_nano.sh b/cookbooks/cosmos3/finetune/launch_sft_vision_nano.sh
new file mode 100644
index 00000000..d67c4ddc
--- /dev/null
+++ b/cookbooks/cosmos3/finetune/launch_sft_vision_nano.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# Structured-TOML launch for vision_sft_nano (T2V / I2V / V2V vision-only
+# SFT on Qwen3-VL-8B, 8-GPU FSDP). Drives cosmos_framework.scripts.train against
+# toml/sft_config/vision_sft_nano.toml.
+#
+# Requires an activated cosmos-framework venv (see the finetune README
+# Prerequisites). Run from cookbooks/cosmos3/finetune/.
+#
+# Optional env vars (defaults below point under this cookbook dir; override to
+# put data or checkpoints on a different filesystem):
+#   DATASET_PATH          default: data/BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge
+#                         (must contain train/video_dataset_file.jsonl)
+#   BASE_CHECKPOINT_PATH  default: checkpoints/Cosmos3-Nano
+#   WAN_VAE_PATH          default: checkpoints/wan22_vae/Wan2.2_VAE.pth
+#   HF_TOKEN              if any tokenizer download requires gated HF access
+#   OUTPUT_ROOT           default: outputs/train
+#
+# Usage (8-GPU allocation, framework venv active, from cookbooks/cosmos3/finetune/):
+#   bash launch_sft_vision_nano.sh
+
+TOML_FILE="toml/sft_config/vision_sft_nano.toml"
+: "${DATASET_PATH:=data/BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge}"
+: "${BASE_CHECKPOINT_PATH:=checkpoints/Cosmos3-Nano}"
+
+EXTRA_DATASET_CHECK='[[ -f "$DATASET_PATH/train/video_dataset_file.jsonl" ]] || { echo "ERROR: missing $DATASET_PATH/train/video_dataset_file.jsonl" >&2; exit 1; }'
+
+source "$(dirname "${BASH_SOURCE[0]}")/_sft_launcher_common.sh"
diff --git a/cookbooks/cosmos3/finetune/launch_sft_vision_super.sh b/cookbooks/cosmos3/finetune/launch_sft_vision_super.sh
new file mode 100644
index 00000000..54bfde97
--- /dev/null
+++ b/cookbooks/cosmos3/finetune/launch_sft_vision_super.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# Structured-TOML launch for vision_sft_super (T2V / I2V / V2V LoRA SFT on
+# Qwen3-VL-32B-Instruct, 8-GPU FSDP with CP=2 / DP=4). Drives
+# cosmos_framework.scripts.train against toml/sft_config/vision_sft_super.toml.
+#
+# Requires an activated cosmos-framework venv (see the finetune README
+# Prerequisites). Run from cookbooks/cosmos3/finetune/.
+#
+# Optional env vars (defaults below point under this cookbook dir; override to
+# put data or checkpoints on a different filesystem):
+#   DATASET_PATH          default: data/BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge
+#                         (must contain train/video_dataset_file.jsonl)
+#   BASE_CHECKPOINT_PATH  default: checkpoints/Cosmos3-Super
+#   WAN_VAE_PATH          default: checkpoints/wan22_vae/Wan2.2_VAE.pth
+#   HF_TOKEN              if any tokenizer download requires gated HF access
+#   OUTPUT_ROOT           default: outputs/train
+#
+# Usage (8-GPU allocation, framework venv active, from cookbooks/cosmos3/finetune/):
+#   bash launch_sft_vision_super.sh
+
+TOML_FILE="toml/sft_config/vision_sft_super.toml"
+: "${DATASET_PATH:=data/BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge}"
+: "${BASE_CHECKPOINT_PATH:=checkpoints/Cosmos3-Super}"
+
+EXTRA_DATASET_CHECK='[[ -f "$DATASET_PATH/train/video_dataset_file.jsonl" ]] || { echo "ERROR: missing $DATASET_PATH/train/video_dataset_file.jsonl" >&2; exit 1; }'
+
+# Super-variant env tweaks: clear LD_LIBRARY_PATH to avoid host CUDA/NCCL libs
+# bleeding into the venv, switch the allocator to expandable_segments so the
+# 32B backbone fits without OOM during compile/decode.
+export LD_LIBRARY_PATH=""
+export PYTORCH_ALLOC_CONF="${PYTORCH_ALLOC_CONF:-expandable_segments:True}"
+
+source "$(dirname "${BASH_SOURCE[0]}")/_sft_launcher_common.sh"

From c983cf84f7cf7b7130931173e8d8be565a4be7cf Mon Sep 17 00:00:00 2001
From: Maosheng Liao <maoshengl@nvidia.com>
Date: Tue, 16 Jun 2026 07:06:50 -0700
Subject: [PATCH 4/6] docs(finetune): adapt training.md into the cookbook SFT
 guide

---
 cookbooks/cosmos3/finetune/README.md | 345 +++++++++++++++++++++++++++
 1 file changed, 345 insertions(+)
 create mode 100644 cookbooks/cosmos3/finetune/README.md

diff --git a/cookbooks/cosmos3/finetune/README.md b/cookbooks/cosmos3/finetune/README.md
new file mode 100644
index 00000000..ac0e3ff6
--- /dev/null
+++ b/cookbooks/cosmos3/finetune/README.md
@@ -0,0 +1,345 @@
+# Cosmos3 Fine-Tuning (Supervised Fine-Tuning)
+
+<!--TOC-->
+
+______________________________________________________________________
+
+**Table of Contents**
+
+- [Prerequisites](#prerequisites)
+- [Step 1 - Prepare data and config](#step-1---prepare-data-and-config)
+- [Step 2 — Prepare checkpoint](#step-2--prepare-checkpoint)
+- [Step 3 — Run training](#step-3--run-training)
+  - [Option A (recommended): the paired launch shell](#option-a-recommended-the-paired-launch-shell)
+    - [Overriding the defaults](#overriding-the-defaults)
+  - [Option B: raw `torchrun`](#option-b-raw-torchrun)
+- [Outputs](#outputs)
+- [Export checkpoint to Hugging Face safetensors](#export-checkpoint-to-hugging-face-safetensors)
+- [Config](#config)
+  - [Common Hydra tail overrides](#common-hydra-tail-overrides)
+
+______________________________________________________________________
+
+<!--TOC-->
+
+Fine-tune a pre-trained Cosmos3 model on your own dataset using supervised fine-tuning (SFT). Tested on 8× H100 (80 GB).
+
+## Prerequisites
+
+Training runs through the **cosmos-framework** package: the `cosmos_framework.scripts.train` entry point and the experiment-SKU configs live there, so you must install a framework checkout before running anything in this guide. The recipe TOMLs and launch shells in this folder drive that entry point.
+
+1. **Clone and install cosmos-framework.** Follow the cosmos3 cookbook's [Cosmos Framework setup](../README.md#cosmos-framework) — clone into `packages/cosmos3` and run `uv sync --all-extras --group=cu130-train` (use `cu128-train` on a CUDA 12.x driver). `uv sync` is the install: it installs the `cosmos-framework` project itself (editable) plus all training dependencies into `.venv`; no separate `pip install` is needed.
+
+2. **Activate the framework venv** so `cosmos_framework` is importable. These launch shells deliberately do **not** add `.venv/bin` to `PATH`:
+
+   ```shell
+   source <path-to>/packages/cosmos3/.venv/bin/activate
+   ```
+
+3. **Run every command below from this cookbook directory** (`cookbooks/cosmos3/finetune/`) with that venv active. Data, checkpoints, and outputs default to `data/`, `checkpoints/`, and `outputs/` under this folder (all git-ignored); export `DATASET_PATH` / `BASE_CHECKPOINT_PATH` / `WAN_VAE_PATH` to override (see [Step 3 → Overriding the defaults](#overriding-the-defaults)).
+
+For deeper references see the framework docs: [environment variables](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/environment_variables.md), [FAQ / troubleshooting](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/faq.md) (OOM during SFT, common pitfalls), and the [JSONL dataset format](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/dataset_jsonl.md).
+
+## Step 1 - Prepare data and config
+
+Some datasets are license-gated — visit the dataset's repository page, accept any terms, and authenticate with `uvx hf@latest auth login` (or set `HF_TOKEN`).
+
+The per-recipe download commands below write to `data/<dataset>/` and `checkpoints/wan22_vae/Wan2.2_VAE.pth`, which match the launcher's default `$DATASET_PATH` and `$WAN_VAE_PATH`. See [Step 3 → Option A](#option-a-recommended-the-paired-launch-shell) for how to override these defaults if you'd rather keep data on a different filesystem.
+
+Select one of the following recipes:
+
+<details open><summary><b>Vision SFT (Cosmos3-Nano)</b></summary>
+
+T2V/I2V/V2V SFT on [nvidia/BridgeData2-Subset-Synthetic-Captions](https://huggingface.co/datasets/nvidia/BridgeData2-Subset-Synthetic-Captions/tree/main). `$DATASET_PATH` should be the directory containing `train/video_dataset_file.jsonl`. Each clip carries a structured-JSON caption (`caption_json`) — the model's native prompt format — which the SFT loader trains on by default (the dense narrative is kept as a backup), so training stays aligned with [Inference](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/dataset_jsonl.md#inference); see [JSONL Dataset → Format](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/dataset_jsonl.md#format).
+
+Launch shell: `launch_sft_vision_nano.sh`
+
+```shell
+BASE_CHECKPOINT_NAME=Cosmos3-Nano
+
+# Defaults match the launcher (see Step 3 → Option A to override).
+uvx hf@latest download --repo-type dataset nvidia/BridgeData2-Subset-Synthetic-Captions \
+    --revision 40d018ac1c1a2a4b9734f17fdb21f3d933c49a01 \
+    --local-dir data/BridgeData2-Subset-Synthetic-Captions --quiet
+uvx hf@latest download Wan-AI/Wan2.2-TI2V-5B Wan2.2_VAE.pth \
+    --local-dir checkpoints/wan22_vae --quiet
+```
+
+</details>
+
+<details><summary><b>Vision SFT LoRA (Cosmos3-Super)</b></summary>
+
+LoRA SFT on Qwen3-VL-32B MoT (Cosmos3-Super), on the same Bridge dataset as **Vision SFT (Cosmos3-Nano)**. Step 2 must convert the Cosmos3-Super checkpoint, not Cosmos3-Nano.
+
+Launch shell: `launch_sft_vision_super.sh`
+
+```shell
+BASE_CHECKPOINT_NAME=Cosmos3-Super
+
+# Defaults match the launcher (see Step 3 → Option A to override).
+uvx hf@latest download --repo-type dataset nvidia/BridgeData2-Subset-Synthetic-Captions \
+    --revision 40d018ac1c1a2a4b9734f17fdb21f3d933c49a01 \
+    --local-dir data/BridgeData2-Subset-Synthetic-Captions --quiet
+uvx hf@latest download Wan-AI/Wan2.2-TI2V-5B Wan2.2_VAE.pth \
+    --local-dir checkpoints/wan22_vae --quiet
+```
+
+</details>
+
+<details><summary><b>Reasoner Alignment SFT with LLaVA-OneVision (vfm-vlm)</b></summary>
+
+Alignment SFT for the Reasoner variant on the [lmms-lab/LLaVA-OneVision-Data](https://huggingface.co/datasets/lmms-lab/LLaVA-OneVision-Data) dataset (streamed from HF Hub). Skips Step 2: by default the backbone `Qwen/Qwen3-VL-8B-Instruct` is fetched from the HF Hub by the model downloader at startup — no DCP conversion needed and no required env vars. To instead start from a merged Cosmos3 reasoner snapshot (Cosmos3-Nano LM merged onto the Qwen3-VL visual tower), build it with `convert_model_to_vlm_safetensors` (see [Step 2](#step-2--prepare-checkpoint)) and point `VLM_SAFETENSORS_PATH` at it — same mechanism as the VideoPhy-2 recipe below.
+
+Launch shell: `launch_sft_llava_ov.sh`
+
+```shell
+# No required env vars. The first launch will populate the HF Hub cache under
+# $HF_HOME (defaults to /tmp/hf_cache inside the wrapper); subsequent launches
+# reuse the cached snapshot.
+#
+# (optional) HF_TOKEN raises HF Hub rate limits for the streamed dataset
+# revision lookup — useful if you're running 8-rank fan-out from a single IP:
+# export HF_TOKEN=hf_...
+#
+# (optional) VLM_SAFETENSORS_PATH starts training from a local pre-converted
+# Qwen3-VL safetensors snapshot (e.g. Cosmos3-Nano LM merged with the Qwen3-VL
+# visual tower) instead of the public HF backbone:
+# export VLM_SAFETENSORS_PATH=$PWD/checkpoints/Cosmos3-Nano-VLM
+```
+
+</details>
+
+<details><summary><b>Reasoner Alignment SFT with VideoPhy-2 (Cosmos3-Nano)</b></summary>
+
+Reasoner alignment SFT for 1–5 physical-plausibility scoring on [videophysics/videophy2_train](https://huggingface.co/datasets/videophysics/videophy2_train) (HF test split renamed to `videophy2_val/`). `[job].task = "vlm"`. Bootstraps from `Cosmos3-Nano`'s language-model weights merged onto the public Qwen3-VL-8B-Instruct visual tower; the merged HF directory is consumed via `[model.backbone].safetensors_path` (plumbed by `VLM_SAFETENSORS_PATH`).
+
+Launch shell: `launch_sft_videophy2_nano.sh`
+
+```shell
+# Step 1 (data): materialize the public HF dataset into the canonical local layout
+# (videophy2_{train,val}/{meta.json, media/, text/}).
+python -m cosmos_framework.scripts.vlm.prepare_videophy2_from_hf \
+    --out_root data/videophysics --split both
+```
+
+</details>
+
+## Step 2 — Prepare checkpoint
+
+Convert the base checkpoint to [PyTorch Distributed Checkpoint (DCP)](https://pytorch.org/docs/stable/distributed.checkpoint.html) format. `cosmos_framework.scripts.convert_model_to_dcp` ships in the unified `cosmos_framework/` package, so this step runs from this cookbook directory (with the framework venv active per [Prerequisites](#prerequisites)).
+
+Set `BASE_CHECKPOINT_NAME` to the value from the recipe block you picked in Step 1 (`Cosmos3-Nano` or `Cosmos3-Super`):
+
+```shell
+BASE_CHECKPOINT_NAME=Cosmos3-Nano   # or Cosmos3-Super — match the recipe in Step 1
+
+# Default output dir matches the launcher (see Step 3 → Option A to override).
+python -m cosmos_framework.scripts.convert_model_to_dcp \
+  -o checkpoints/$BASE_CHECKPOINT_NAME \
+  --checkpoint-path $BASE_CHECKPOINT_NAME
+```
+
+`$BASE_CHECKPOINT_NAME` (e.g. `Cosmos3-Nano`, `Cosmos3-Super`) is a registered name in the checkpoint catalog; the converter downloads the matching repo from the Hugging Face Hub and writes the DCP into `checkpoints/$BASE_CHECKPOINT_NAME`.
+
+**Reasoner Alignment SFT with LLaVA-OneVision (vfm-vlm):** Skip this step — the Reasoner alignment SFT loads `Qwen/Qwen3-VL-8B-Instruct` from the HF Hub at startup (no DCP conversion required). To start from a merged Cosmos3 reasoner snapshot instead, build one with `convert_model_to_vlm_safetensors` (see the VideoPhy-2 note below) and pass it via `VLM_SAFETENSORS_PATH`.
+
+**Reasoner Alignment SFT with VideoPhy-2 (Cosmos3-Nano):** Use `cosmos_framework.scripts.convert_model_to_vlm_safetensors` instead.
+
+```shell
+# Step 2 (VLM checkpoint): merge Cosmos3-Nano LM onto the Qwen3-VL visual tower.
+# Replaces the convert_model_to_dcp step used by the VFM recipes above.
+python -m cosmos_framework.scripts.convert_model_to_vlm_safetensors \
+    --checkpoint-path Cosmos3-Nano \
+    -o checkpoints/Cosmos3-Nano-VLM
+```
+
+## Step 3 — Run training
+
+**Weights & Biases (optional):** every recipe TOML defaults to `job.wandb_mode = "disabled"`. To log a run to W&B, flip that field to `"online"` in the TOML and export `WANDB_API_KEY` in your environment before launching.
+
+### Option A (recommended): the paired launch shell
+
+Each recipe ships as a `toml/sft_config/<recipe>.toml` (validated against the pydantic schema at [`cosmos_framework/configs/toml_config/sft_config.py`](https://github.com/NVIDIA/cosmos-framework/blob/main/cosmos_framework/configs/toml_config/sft_config.py)) paired with `launch_sft_<recipe>.sh`; the full upstream catalog is indexed in [the framework's examples index](https://github.com/NVIDIA/cosmos-framework/blob/main/examples/README.md). Each `.sh` sources [`_sft_launcher_common.sh`](_sft_launcher_common.sh) and forwards into `cosmos_framework.scripts.train --sft-toml=<recipe-toml>`. From this cookbook directory, run the launch shell paired with the recipe you set up in Step 1. The wrapper resolves `DATASET_PATH`, `BASE_CHECKPOINT_PATH`, and `WAN_VAE_PATH` from the default locations under this cookbook directory (populated by Step 1 + Step 2), so no env-var setup is required (see [below](#overriding-the-defaults) to override):
+
+```shell
+# from this cookbook directory, after Step 1 + Step 2:
+bash launch_sft_vision_nano.sh
+```
+
+Each launcher's default paths come from the `DATASET_PATH` + `BASE_CHECKPOINT_PATH` defaults declared at the top of its `.sh` (each uses `: "${VAR:=…}"` so any value you `export` in the shell before launching wins over the default):
+
+| Launch shell                   | Post-Training Task | Default $DATASET_PATH (under data/)                        | Default $BASE_CHECKPOINT_PATH (under checkpoints/)          |
+| ------------------------------ | ------------------ | ---------------------------------------------------------- | ----------------------------------------------------------- |
+| `launch_sft_vision_nano.sh`    | Generator SFT      | `BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge` | `Cosmos3-Nano`                                              |
+| `launch_sft_vision_super.sh`   | Generator SFT      | `BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge` | `Cosmos3-Super`                                             |
+| `launch_sft_llava_ov.sh`       | Reasoner SFT       | (none; dataset streams from HF Hub)                        | (none; backbone fetched at startup, or set `VLM_SAFETENSORS_PATH`) |
+| `launch_sft_videophy2_nano.sh` | Reasoner SFT       | (none; set `VIDEOPHYSICS_ROOT` env)                        | (none; set `VLM_SAFETENSORS_PATH` env)                      |
+
+`WAN_VAE_PATH` defaults to `checkpoints/wan22_vae/Wan2.2_VAE.pth` for every non-reasoner recipe.
+
+**Reasoner Alignment SFT with VideoPhy-2 (Cosmos3-Nano):**
+
+```shell
+# Step 3 (launch): export both env vars, then launch.
+export VIDEOPHYSICS_ROOT=$PWD/data/videophysics
+export VLM_SAFETENSORS_PATH=$PWD/checkpoints/Cosmos3-Nano-VLM
+bash launch_sft_videophy2_nano.sh
+```
+
+#### Overriding the defaults
+
+If you'd rather put data or checkpoints on a different filesystem (e.g. a faster SSD or shared mount), download to your chosen path in Step 1 / convert the DCP to your chosen path in Step 2, then export the matching env var(s) before launching:
+
+```shell
+# Example: data on /scratch, base DCP on /nfs/ckpts.
+export DATASET_PATH=/scratch/BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge
+export BASE_CHECKPOINT_PATH=/nfs/ckpts/Cosmos3-Nano
+export WAN_VAE_PATH=/nfs/ckpts/wan22_vae/Wan2.2_VAE.pth
+bash launch_sft_vision_nano.sh
+```
+
+Each env var falls back to its default if unset, so you only need to export the ones you're moving. The downloads / `convert_model_to_dcp` commands in Step 1 + Step 2 just need their `--local-dir` / `-o` argument pointed at the same path you export here. `.gitignore` excludes `data/`, `checkpoints/`, and `outputs/` under this cookbook directory so the multi-GB downloads aren't tracked when you keep the defaults.
+
+### Option B: raw `torchrun`
+
+If you'd rather not use the paired launch shell, invoke `torchrun` directly with the recipe's TOML. Unlike Option A, **raw `torchrun` does not auto-resolve `DATASET_PATH` / `BASE_CHECKPOINT_PATH` / `WAN_VAE_PATH`** — they have to come from your shell:
+
+- `BASE_CHECKPOINT_PATH` and `WAN_VAE_PATH` are read via `${oc.env:BASE_CHECKPOINT_PATH}` / `${oc.env:WAN_VAE_PATH}` at the TOML's `[checkpoint].load_path` / `[model.tokenizer].vae_path` keys.
+- `DATASET_PATH` is read via `${oc.env:DATASET_PATH}` inside the experiment-SKU Python (e.g. `cosmos_framework/configs/base/experiment/sft/<recipe>.py`), not in the TOML.
+
+You have two options to fill them in (pick either, not both):
+
+1. **Export them in the shell before `torchrun`** (whether they point at the default `data/` / `checkpoints/` paths from Step 1+2 or your own overrides) — shown below.
+2. **Edit the TOML by hand** — open `toml/sft_config/<recipe>.toml` and replace the `${oc.env:BASE_CHECKPOINT_PATH}` / `${oc.env:WAN_VAE_PATH}` placeholders with literal paths. Useful if you want a self-contained TOML you can hand to a colleague or commit alongside an experiment record. (Hand-editing won't help for `DATASET_PATH` — that's resolved out of the experiment Python, so you must still export it.)
+
+Run from this cookbook directory (`cookbooks/cosmos3/finetune/`) with the framework venv active; the snippet uses `$PWD` to absolutize the relative paths.
+
+```shell
+# This example uses the vision_sft_nano recipe end-to-end (same recipe as
+# Option A). To switch recipes, swap TOML_FILE + DATASET_PATH per the table in
+# Option A, and Cosmos3-Nano → Cosmos3-Super on the LoRA / super recipes.
+TOML_FILE="toml/sft_config/vision_sft_nano.toml"
+
+# Match the launcher's defaults — or substitute your own paths.
+export DATASET_PATH="$PWD/data/BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge"
+export BASE_CHECKPOINT_PATH="$PWD/checkpoints/Cosmos3-Nano"
+export WAN_VAE_PATH="$PWD/checkpoints/wan22_vae/Wan2.2_VAE.pth"
+
+IMAGINAIRE_OUTPUT_ROOT=outputs/train \
+torchrun --nproc_per_node=8 -m cosmos_framework.scripts.train \
+    --sft-toml=$TOML_FILE
+```
+
+To resume from the latest in-progress checkpoint, point `BASE_CHECKPOINT_PATH` at the run's `checkpoints/iter_<N>/` directory under `$IMAGINAIRE_OUTPUT_ROOT/<project>/<group>/<name>/` (see [Outputs](#outputs) below for the full layout).
+
+## Outputs
+
+Outputs land under `$IMAGINAIRE_OUTPUT_ROOT/<project>/<group>/<name>/`:
+
+1. `config.yaml`, `config.pkl`: Finalized resolved config (YAML for inspection, pickle for re-instantiation).
+1. `launch_info.yaml`, `job_env.yaml`: Job metadata and captured launch environment.
+1. `checkpoints/`:
+    1. `latest_checkpoint.txt`: Pointer file containing the latest checkpoint directory name (e.g. `iter_000000200`).
+    1. `iter_<iter>/`: DCP checkpoint saved every `[checkpoint].save_iter` iterations (zero-padded 9-digit, e.g. `iter_000000200/`):
+        1. `model/`: model weights (sharded `.distcp`).
+        1. `optim/`: optimizer state.
+        1. `scheduler/`: LR scheduler state.
+        1. `trainer/`: training state — includes the `iteration` counter and per-rank `rng_state_<i>` (numpy + random + torch + torch_cuda).
+        1. `dataloader/`: optional per-rank pickle shards (`rank_<i>.pkl`) — only present for dataloaders that implement `has_state()`.
+1. `<callback_name>/`: Callback outputs, one directory per registered callback (e.g. `DeviceMonitor/`, `EveryNDrawSample/`, `norm_monitor/`).
+1. `wandb/`, `wandb_id.txt`: W&B run files — only present when `[job].wandb_mode` is `online` or `offline`.
+
+The shorthand `$RUN_DIR` used in the rest of this page refers to `$IMAGINAIRE_OUTPUT_ROOT/<project>/<group>/<name>`. For example, with `IMAGINAIRE_OUTPUT_ROOT=outputs/train` and the `vision_sft_nano` recipe, `$RUN_DIR` is `outputs/train/cosmos3/sft/vision_sft_nano`.
+
+## Export checkpoint to Hugging Face safetensors
+
+Export the DCP checkpoint produced in Step 3 to a Hugging Face safetensors checkpoint:
+
+```shell
+RUN_DIR=$IMAGINAIRE_OUTPUT_ROOT/<project>/<group>/<name>
+
+CHECKPOINT_ITER=$(cat $RUN_DIR/checkpoints/latest_checkpoint.txt)
+CHECKPOINT_PATH=$RUN_DIR/checkpoints/$CHECKPOINT_ITER
+
+python -m cosmos_framework.scripts.export_model \
+  --checkpoint-path $CHECKPOINT_PATH \
+  --config-file $RUN_DIR/config.yaml \
+  -o $RUN_DIR/model
+```
+
+The exported safetensors land at `$RUN_DIR/model` and can be used in [Inference](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/inference.md) commands by passing `--checkpoint-path $RUN_DIR/model`.
+
+## Config
+
+The recipe TOML is parsed against the pydantic schema [`SFTExperimentConfig`](https://github.com/NVIDIA/cosmos-framework/blob/main/cosmos_framework/configs/toml_config/sft_config.py) at load time. Every top-level key listed below maps to a sub-model in that file; unknown keys raise a `ValidationError` before training starts (`extra="forbid"` on every sub-model). Values may use OmegaConf env interpolation `${oc.env:NAME}` — the recipe TOMLs use this for `BASE_CHECKPOINT_PATH` (`[checkpoint].load_path`) and `WAN_VAE_PATH` (`[model.tokenizer].vae_path`). `DATASET_PATH` is consumed the same way but inside the experiment-SKU Python (`cosmos_framework/configs/base/experiment/sft/<recipe>.py`), not in the TOML.
+
+For the full field-by-field reference (every section, every default, every VFM/VLM applicability note, the `"???"` MISSING sentinel, env interpolation, the VFM↔VLM path-remap table, and how to extend the schema), see [SFT Structured-TOML Config Reference](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/sft_config.md).
+
+The commonly tuned knobs:
+
+1. `[job]`
+    1. `task` — `"vfm"` (generator recipes) or `"vlm"` (Reasoner alignment). Picks the base config: `cosmos_framework/configs/base/config.py` vs `…/vlm/config.py`. Also drives `_PATH_REMAPS` in `toml_config_helper.py`.
+    1. `experiment` — Registered experiment SKU name (e.g. `vision_sft_nano`). Each SKU is a Python file under `cosmos_framework/configs/base/experiment/sft/` that wires up dataloader, model variant, and recipe-specific defaults.
+    1. `project`, `group`, `name` — Components of the run output dir `$IMAGINAIRE_OUTPUT_ROOT/<project>/<group>/<name>/`. Also flow to W&B as the project / group / run name.
+    1. `wandb_mode` — `"online"` (logs to W&B; `WANDB_API_KEY` must be set), `"offline"` (logs locally, sync later with `wandb sync`), or `"disabled"`.
+1. `[model]`
+    1. `max_num_tokens_after_packing` — VFM token-packing target. `-1` disables the cap. VFM only; VLM uses `data_setting.max_tokens` (tail override).
+    1. `joint_attn_implementation` — VFM attention layout: `"two_way"` / `"three_way"` (NATTEN) / `"flex"`.
+    1. `attn_implementation` — VLM attention impl: `"cosmos"` / `"flash_attention_2"` / `"sdpa"` / `"eager"`. VLM only.
+    1. `lora_enabled`, `lora_rank`, `lora_alpha`, `lora_target_modules` — LoRA adapter knobs for the generation pathway. Used by SUPER-tier recipes; NANO-tier leaves `lora_enabled=false`. VFM only.
+1. `[model.ema]`
+    1. `enabled`, `rate`, `iteration_shift` — Exponential moving average of generation-pathway weights. Full fine-tunes typically enable it; LoRA recipes leave it off.
+1. `[model.parallelism]`
+    1. `data_parallel_shard_degree` — FSDP shard degree. `data_parallel_shard_degree × data_parallel_replicate_degree × context_parallel_shard_degree` must equal `WORLD_SIZE`. `-1` autoselects from torchrun world size.
+    1. `data_parallel_replicate_degree` — HSDP replicate degree (outer replicate loop over the shard topology).
+    1. `context_parallel_shard_degree` — Context-parallel shard degree. `>1` splits the sequence dim across ranks (used by super-tier configs: DP=4, CP=2 → 8 GPUs).
+    1. `cfg_parallel_shard_degree` — Classifier-free-guidance shard degree. Almost always `1` for SFT.
+    1. `fsdp_master_dtype` — Master parameter / FSDP reduce dtype: typically `"float32"`.
+1. `[model.compile]`
+    1. `enabled` — Enable `torch.compile`. Improves speed at the cost of memory.
+    1. `compile_dynamic` — Whether to compile with symbolic-shape (dynamic) kernels. `true` (default) is appropriate for training; AR inference may prefer `false` for stable shapes.
+1. `[model]`
+    1. `precision` — Compute dtype for forward/backward: `"bfloat16"` / `"float16"` / `"float32"`. Master weights stay fp32 separately.
+1. `[model.activation_checkpointing]`
+    1. `mode` — `"none"` / `"selective"` (per-op SAC, MoT-only) / `"full"` (per-block checkpointing).
+    1. `save_ops_regex` — Regex patterns for ops to keep saved under `mode="selective"`.
+    1. `preserve_rng_state`, `determinism_check` — Recompute determinism plumbing.
+1. `[model.tokenizer]`
+    1. `vae_path` — Wan2.2 VAE `.pth` path. Recipe TOMLs use `"${oc.env:WAN_VAE_PATH}"`. VFM only.
+1. `[optimizer]`
+    1. `lr` — Base learning rate.
+    1. `betas`, `eps`, `fused`, `weight_decay` — Standard AdamW knobs. `eps` is VFM-only.
+    1. `keys_to_select` — Substring allowlist for trainable params. Empty list = train everything; `["lora_"]` = adapter-only fine-tune.
+1. `[optimizer.lr_multipliers]`
+    1. Inline table of `<substring> = <multiplier>` pairs that scale the LR of params whose name contains the substring. The shipped vision recipes leave this empty (Hydra default `{}` stands).
+1. `[scheduler]`
+    1. `cycle_lengths`, `warm_up_steps` — Cycle length and warmup duration (lists, one entry per cycle), in optimizer steps.
+    1. `f_max`, `f_min`, `f_start` — LR multipliers at peak / trough / step-0 (ratios of `optimizer.lr`).
+    1. `verbosity_interval` — Scheduler-side LR log frequency. VFM only.
+1. `[trainer]`
+    1. `max_iter` — Total optimizer steps.
+    1. `grad_accum_iter` — Micro-batches per optimizer step. Effective global batch = `grad_accum_iter × per-rank batch × world_size`.
+    1. `logging_iter` — Console / W&B scalar log frequency.
+    1. `distributed_parallelism` — `"fsdp"` is the only supported value.
+1. `[trainer.callbacks.compile_tokenizer]`
+    1. `enabled`, `compile_after_iterations`, `warmup_resolutions` — Lazy `torch.compile` of the VAE tokenizer. VFM only.
+1. `[trainer.callbacks.grad_clip]`
+    1. `clip_norm` — Max global L2 norm of the gradient (steps with larger norm are rescaled).
+    1. `force_finite` — Replace NaN/Inf grads with zero (default `true` on VFM, `false` on VLM).
+1. `[checkpoint]`
+    1. `load_path` — Base DCP checkpoint directory to resume from (Step 2 output, or a prior run's `checkpoints/iter_<N>/`). Recipe TOMLs use `"${oc.env:BASE_CHECKPOINT_PATH}"`.
+    1. `save_iter` — Save a new DCP checkpoint every N optimizer steps.
+    1. `keys_to_skip_loading` — Substring blocklist applied at load time. Used to mask EMA / LoRA tensors when warm-starting from a checkpoint that doesn't have them yet.
+1. `[dataloader_train]` — Top-level scalars only; the dataloader's class (LazyCall) and pipeline wiring (datasets, packers, …) stay in the experiment Python.
+    1. `max_samples_per_batch` — Per-micro-batch sample cap (remapped to `max_batch_size` on the VLM packer). Omit the key for no per-count cap (TOML has no `null` value).
+    1. `max_sequence_length` — Per-packed-sequence token cap (remapped to `max_tokens` on the VLM packer).
+    1. `seed` — Dataloader RNG seed (VFM only).
+
+### Common Hydra tail overrides
+
+These knobs aren't part of the pydantic schema today; pass them as trailing `key.path=value` positionals after `--` (the `cosmos_framework.scripts.train` flow forwards them through OmegaConf):
+
+- `model.config.policy.backbone.model_name` — VLM backbone HF identifier (e.g. `Qwen/Qwen3-VL-8B-Instruct`). Used by `launch_sft_llava_ov.sh`.
+- `data_setting.max_tokens` — VLM token-packing cap (the VLM analogue of `[model].max_num_tokens_after_packing`). Used by `launch_sft_llava_ov.sh`.
+
+The launchers wire these via `TAIL_OVERRIDES=(…)`; the helper appends `-- "${TAIL_OVERRIDES[@]}"` after the `--sft-toml=` argument.

From 4fb51967df99311ddba243bd043f4157e8ccb55b Mon Sep 17 00:00:00 2001
From: Maosheng Liao <maoshengl@nvidia.com>
Date: Tue, 16 Jun 2026 07:07:21 -0700
Subject: [PATCH 5/6] chore: gitignore Cosmos3 finetune cookbook runtime
 artifacts

---
 .gitignore | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/.gitignore b/.gitignore
index fdbd9f7c..3da55bb6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -181,6 +181,14 @@ packages/
 cookbooks/cosmos3/generator/audiovisual/outputs/
 outputs/
 
+# Cosmos3 finetune cookbook runtime artifacts (downloads, converted ckpts, runs)
+cookbooks/cosmos3/finetune/data/
+cookbooks/cosmos3/finetune/checkpoints/
+cookbooks/cosmos3/finetune/outputs/
+
+# Superpowers design specs / implementation plans (kept local, not tracked)
+docs/superpowers/
+
 # Streamlit
 .streamlit/
 

From 8abd6d8aade81f8457d5b4cfeaa08568d95645a1 Mon Sep 17 00:00:00 2001
From: Simon Zhang <simonz@nvidia.com>
Date: Tue, 16 Jun 2026 09:27:20 -0700
Subject: [PATCH 6/6] restructure cookbook into per-capability folders +
 address review

  - Place SFT recipes under their capability cookbooks: vision recipes ->
    cookbooks/cosmos3/generator/audiovisual/finetune/, reasoner recipes ->
    cookbooks/cosmos3/reasoner/finetune/ (sibling of existing inference content,
    forward-compatible with #214).
  - Surface the finetune cookbooks on the repo landing page (README Finetune).
  - Trim each README to the happy path; link advanced config + raw torchrun to the
    canonical framework docs (training.md, sft_config.md).
  - Add the recommended NGC PyTorch base image to Prerequisites.
  - Rewrite each launch_sft_*.sh as a simple, self-contained recipe: linear
    numbered steps (download -> convert -> train) with hardcoded paths, dropping
    the shared launcher helper and all env-var override knobs.
---
 .gitignore                                    |  12 +-
 README.md                                     |   9 +-
 cookbooks/cosmos3/finetune/README.md          | 345 ------------------
 .../cosmos3/finetune/_sft_launcher_common.sh  |  98 -----
 .../cosmos3/finetune/launch_sft_llava_ov.sh   |  43 ---
 .../finetune/launch_sft_videophy2_nano.sh     |  45 ---
 .../finetune/launch_sft_vision_nano.sh        |  30 --
 .../finetune/launch_sft_vision_super.sh       |  36 --
 .../generator/audiovisual/finetune/README.md  |  58 +++
 .../finetune/launch_sft_vision_nano.sh        |  39 ++
 .../finetune/launch_sft_vision_super.sh       |  42 +++
 .../toml/sft_config/vision_sft_nano.toml      |   0
 .../toml/sft_config/vision_sft_super.toml     |   0
 cookbooks/cosmos3/reasoner/finetune/README.md |  58 +++
 .../reasoner/finetune/launch_sft_llava_ov.sh  |  16 +
 .../finetune/launch_sft_videophy2_nano.sh     |  32 ++
 .../finetune/toml/sft_config/llava_ov.toml    |   0
 .../toml/sft_config/videophy2_sft_nano.toml   |   0
 18 files changed, 258 insertions(+), 605 deletions(-)
 delete mode 100644 cookbooks/cosmos3/finetune/README.md
 delete mode 100644 cookbooks/cosmos3/finetune/_sft_launcher_common.sh
 delete mode 100644 cookbooks/cosmos3/finetune/launch_sft_llava_ov.sh
 delete mode 100644 cookbooks/cosmos3/finetune/launch_sft_videophy2_nano.sh
 delete mode 100644 cookbooks/cosmos3/finetune/launch_sft_vision_nano.sh
 delete mode 100644 cookbooks/cosmos3/finetune/launch_sft_vision_super.sh
 create mode 100644 cookbooks/cosmos3/generator/audiovisual/finetune/README.md
 create mode 100644 cookbooks/cosmos3/generator/audiovisual/finetune/launch_sft_vision_nano.sh
 create mode 100644 cookbooks/cosmos3/generator/audiovisual/finetune/launch_sft_vision_super.sh
 rename cookbooks/cosmos3/{ => generator/audiovisual}/finetune/toml/sft_config/vision_sft_nano.toml (100%)
 rename cookbooks/cosmos3/{ => generator/audiovisual}/finetune/toml/sft_config/vision_sft_super.toml (100%)
 create mode 100644 cookbooks/cosmos3/reasoner/finetune/README.md
 create mode 100644 cookbooks/cosmos3/reasoner/finetune/launch_sft_llava_ov.sh
 create mode 100644 cookbooks/cosmos3/reasoner/finetune/launch_sft_videophy2_nano.sh
 rename cookbooks/cosmos3/{ => reasoner}/finetune/toml/sft_config/llava_ov.toml (100%)
 rename cookbooks/cosmos3/{ => reasoner}/finetune/toml/sft_config/videophy2_sft_nano.toml (100%)

diff --git a/.gitignore b/.gitignore
index 3da55bb6..c5b9a910 100644
--- a/.gitignore
+++ b/.gitignore
@@ -182,12 +182,12 @@ cookbooks/cosmos3/generator/audiovisual/outputs/
 outputs/
 
 # Cosmos3 finetune cookbook runtime artifacts (downloads, converted ckpts, runs)
-cookbooks/cosmos3/finetune/data/
-cookbooks/cosmos3/finetune/checkpoints/
-cookbooks/cosmos3/finetune/outputs/
-
-# Superpowers design specs / implementation plans (kept local, not tracked)
-docs/superpowers/
+cookbooks/cosmos3/generator/audiovisual/finetune/data/
+cookbooks/cosmos3/generator/audiovisual/finetune/checkpoints/
+cookbooks/cosmos3/generator/audiovisual/finetune/outputs/
+cookbooks/cosmos3/reasoner/finetune/data/
+cookbooks/cosmos3/reasoner/finetune/checkpoints/
+cookbooks/cosmos3/reasoner/finetune/outputs/
 
 # Streamlit
 .streamlit/
diff --git a/README.md b/README.md
index 6d3e51eb..c4fd1aea 100644
--- a/README.md
+++ b/README.md
@@ -646,9 +646,14 @@ Cosmos 3 latency and serving numbers live in [`inference_benchmarks.md`](inferen
 
 ### Finetune
 
-Finetune Cosmos 3 with the [Cosmos Framework](https://github.com/NVIDIA/cosmos-framework), NVIDIA's end-to-end Physical AI framework for training and serving world models. It provides runnable setup, inference, omni-model training, and evaluation workflows for the Generator and Reasoner surfaces, with reference recipes for vision, action, and reasoning post-training.
+Post-train Cosmos 3 on your own data with the supervised fine-tuning (SFT) cookbooks below. Each recipe is a self-contained launch script: a single `bash launch_sft_<recipe>.sh` downloads the data, prepares the base checkpoint, and launches training on 8× H100 GPUs.
 
-See the [Cosmos Framework training guide](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/training.md) for the full post-training workflow, including data preparation, configuration, and launch commands.
+| Cookbook | Surface | Recipes |
+| --- | --- | --- |
+| [Vision generator SFT](cookbooks/cosmos3/generator/audiovisual/finetune/README.md) | Generator | Full SFT (Cosmos3-Nano) and LoRA SFT (Cosmos3-Super) on captioned video |
+| [Reasoner SFT](cookbooks/cosmos3/reasoner/finetune/README.md) | Reasoner | Alignment SFT on LLaVA-OneVision and physical-plausibility SFT on VideoPhy-2 |
+
+These cookbooks run on the [Cosmos Framework](https://github.com/NVIDIA/cosmos-framework), NVIDIA's end-to-end Physical AI framework for training and serving world models. For the full post-training reference — every config field, raw `torchrun` launches, resuming from a checkpoint, and advanced parallelism — see the [Cosmos Framework training guide](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/training.md).
 
 ### Limitations
 
diff --git a/cookbooks/cosmos3/finetune/README.md b/cookbooks/cosmos3/finetune/README.md
deleted file mode 100644
index ac0e3ff6..00000000
--- a/cookbooks/cosmos3/finetune/README.md
+++ /dev/null
@@ -1,345 +0,0 @@
-# Cosmos3 Fine-Tuning (Supervised Fine-Tuning)
-
-<!--TOC-->
-
-______________________________________________________________________
-
-**Table of Contents**
-
-- [Prerequisites](#prerequisites)
-- [Step 1 - Prepare data and config](#step-1---prepare-data-and-config)
-- [Step 2 — Prepare checkpoint](#step-2--prepare-checkpoint)
-- [Step 3 — Run training](#step-3--run-training)
-  - [Option A (recommended): the paired launch shell](#option-a-recommended-the-paired-launch-shell)
-    - [Overriding the defaults](#overriding-the-defaults)
-  - [Option B: raw `torchrun`](#option-b-raw-torchrun)
-- [Outputs](#outputs)
-- [Export checkpoint to Hugging Face safetensors](#export-checkpoint-to-hugging-face-safetensors)
-- [Config](#config)
-  - [Common Hydra tail overrides](#common-hydra-tail-overrides)
-
-______________________________________________________________________
-
-<!--TOC-->
-
-Fine-tune a pre-trained Cosmos3 model on your own dataset using supervised fine-tuning (SFT). Tested on 8× H100 (80 GB).
-
-## Prerequisites
-
-Training runs through the **cosmos-framework** package: the `cosmos_framework.scripts.train` entry point and the experiment-SKU configs live there, so you must install a framework checkout before running anything in this guide. The recipe TOMLs and launch shells in this folder drive that entry point.
-
-1. **Clone and install cosmos-framework.** Follow the cosmos3 cookbook's [Cosmos Framework setup](../README.md#cosmos-framework) — clone into `packages/cosmos3` and run `uv sync --all-extras --group=cu130-train` (use `cu128-train` on a CUDA 12.x driver). `uv sync` is the install: it installs the `cosmos-framework` project itself (editable) plus all training dependencies into `.venv`; no separate `pip install` is needed.
-
-2. **Activate the framework venv** so `cosmos_framework` is importable. These launch shells deliberately do **not** add `.venv/bin` to `PATH`:
-
-   ```shell
-   source <path-to>/packages/cosmos3/.venv/bin/activate
-   ```
-
-3. **Run every command below from this cookbook directory** (`cookbooks/cosmos3/finetune/`) with that venv active. Data, checkpoints, and outputs default to `data/`, `checkpoints/`, and `outputs/` under this folder (all git-ignored); export `DATASET_PATH` / `BASE_CHECKPOINT_PATH` / `WAN_VAE_PATH` to override (see [Step 3 → Overriding the defaults](#overriding-the-defaults)).
-
-For deeper references see the framework docs: [environment variables](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/environment_variables.md), [FAQ / troubleshooting](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/faq.md) (OOM during SFT, common pitfalls), and the [JSONL dataset format](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/dataset_jsonl.md).
-
-## Step 1 - Prepare data and config
-
-Some datasets are license gated — visit the repository page and accept any terms, and authenticate with `uvx hf@latest auth login` (or set `HF_TOKEN`).
-
-The per-recipe download commands below write to `data/<dataset>/` and `checkpoints/wan22_vae/Wan2.2_VAE.pth`, which match the launcher's default `$DATASET_PATH` and `$WAN_VAE_PATH`. See [Step 3 → Option A](#option-a-recommended-the-paired-launch-shell) for how to override these defaults if you'd rather keep data on a different filesystem.
-
-Select one of the following recipes:
-
-<details open><summary><b>Vision SFT (Cosmos3-Nano)</b></summary>
-
-T2V/I2V/V2V SFT on [nvidia/BridgeData2-Subset-Synthetic-Captions](https://huggingface.co/datasets/nvidia/BridgeData2-Subset-Synthetic-Captions/tree/main). `$DATASET_PATH` should be the directory containing `train/video_dataset_file.jsonl`. Each clip carries a structured-JSON caption (`caption_json`) — the model's native prompt format — which the SFT loader trains on by default (the dense narrative is kept as a backup), so training stays aligned with [Inference](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/dataset_jsonl.md#inference); see [JSONL Dataset → Format](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/dataset_jsonl.md#format).
-
-Launch shell: `launch_sft_vision_nano.sh`
-
-```shell
-BASE_CHECKPOINT_NAME=Cosmos3-Nano
-
-# Defaults match the launcher (see Step 3 → Option A to override).
-uvx hf@latest download --repo-type dataset nvidia/BridgeData2-Subset-Synthetic-Captions \
-    --revision 40d018ac1c1a2a4b9734f17fdb21f3d933c49a01 \
-    --local-dir data/BridgeData2-Subset-Synthetic-Captions --quiet
-uvx hf@latest download Wan-AI/Wan2.2-TI2V-5B Wan2.2_VAE.pth \
-    --local-dir checkpoints/wan22_vae --quiet
-```
-
-</details>
-
-<details><summary><b>Vision SFT LoRA (Cosmos3-Super)</b></summary>
-
-LoRA SFT on Qwen3-VL-32B MoT (Cosmos3-Super), on the same Bridge dataset as **Vision SFT (Cosmos3-Nano)**. Step 2 must convert the Cosmos3-Super checkpoint, not Cosmos3-Nano.
-
-Launch shell: `launch_sft_vision_super.sh`
-
-```shell
-BASE_CHECKPOINT_NAME=Cosmos3-Super
-
-# Defaults match the launcher (see Step 3 → Option A to override).
-uvx hf@latest download --repo-type dataset nvidia/BridgeData2-Subset-Synthetic-Captions \
-    --revision 40d018ac1c1a2a4b9734f17fdb21f3d933c49a01 \
-    --local-dir data/BridgeData2-Subset-Synthetic-Captions --quiet
-uvx hf@latest download Wan-AI/Wan2.2-TI2V-5B Wan2.2_VAE.pth \
-    --local-dir checkpoints/wan22_vae --quiet
-```
-
-</details>
-
-<details><summary><b>Reasoner Alignment SFT with LLaVA-OneVision (vfm-vlm)</b></summary>
-
-Alignment SFT for the Reasoner variant on the [lmms-lab/LLaVA-OneVision-Data](https://huggingface.co/datasets/lmms-lab/LLaVA-OneVision-Data) dataset (streamed from HF Hub). Skips Step 2: by default the backbone `Qwen/Qwen3-VL-8B-Instruct` is fetched from the HF Hub by the model downloader at startup — no DCP conversion needed and no required env vars. To instead start from a merged Cosmos3 reasoner snapshot (Cosmos3-Nano LM merged onto the Qwen3-VL visual tower), build it with `convert_model_to_vlm_safetensors` (see [Step 2](#step-2--prepare-checkpoint)) and point `VLM_SAFETENSORS_PATH` at it — same mechanism as the VideoPhy-2 recipe below.
-
-Launch shell: `launch_sft_llava_ov.sh`
-
-```shell
-# No required env vars. The first launch will populate the HF Hub cache under
-# $HF_HOME (defaults to /tmp/hf_cache inside the wrapper); subsequent launches
-# reuse the cached snapshot.
-#
-# (optional) HF_TOKEN raises HF Hub rate limits for the streamed dataset
-# revision lookup — useful if you're running 8-rank fan-out from a single IP:
-# export HF_TOKEN=hf_...
-#
-# (optional) VLM_SAFETENSORS_PATH starts training from a local pre-converted
-# Qwen3-VL safetensors snapshot (e.g. Cosmos3-Nano LM merged with the Qwen3-VL
-# visual tower) instead of the public HF backbone:
-# export VLM_SAFETENSORS_PATH=$PWD/checkpoints/Cosmos3-Nano-VLM
-```
-
-</details>
-
-<details><summary><b>Reasoner Alignment SFT with VideoPhy-2 (Cosmos3-Nano)</b></summary>
-
-Reasoner alignment SFT for 1–5 physical-plausibility scoring on [videophysics/videophy2_train](https://huggingface.co/datasets/videophysics/videophy2_train) (HF test split renamed to `videophy2_val/`). `[job].task = "vlm"`. Bootstraps from `Cosmos3-Nano`'s language-model weights merged onto the public Qwen3-VL-8B-Instruct visual tower; the merged HF directory is consumed via `[model.backbone].safetensors_path` (plumbed by `VLM_SAFETENSORS_PATH`).
-
-Launch shell: `launch_sft_videophy2_nano.sh`
-
-```shell
-# Step 1 (data): materialize the public HF dataset into the canonical local layout
-# (videophy2_{train,val}/{meta.json, media/, text/}).
-python -m cosmos_framework.scripts.vlm.prepare_videophy2_from_hf \
-    --out_root data/videophysics --split both
-```
-
-</details>
-
-## Step 2 — Prepare checkpoint
-
-Convert the base checkpoint to [PyTorch Distributed Checkpoint (DCP)](https://pytorch.org/docs/stable/distributed.checkpoint.html) format. `cosmos_framework.scripts.convert_model_to_dcp` ships in the unified `cosmos_framework/` package, so this step runs from this cookbook directory (with the framework venv active per [Prerequisites](#prerequisites)).
-
-Set `BASE_CHECKPOINT_NAME` to the value from the recipe block you picked in Step 1 (`Cosmos3-Nano` or `Cosmos3-Super`):
-
-```shell
-BASE_CHECKPOINT_NAME=Cosmos3-Nano   # or Cosmos3-Super — match the recipe in Step 1
-
-# Default output dir matches the launcher (see Step 3 → Option A to override).
-python -m cosmos_framework.scripts.convert_model_to_dcp \
-  -o checkpoints/$BASE_CHECKPOINT_NAME \
-  --checkpoint-path $BASE_CHECKPOINT_NAME
-```
-
-`$BASE_CHECKPOINT_NAME` (e.g. `Cosmos3-Nano`, `Cosmos3-Super`) is a registered name in the checkpoint catalog; the converter downloads the matching repo from the Hugging Face Hub and writes the DCP into `checkpoints/$BASE_CHECKPOINT_NAME`.
-
-**Reasoner Alignment SFT with LLaVA-OneVision (vfm-vlm):** Skip this step — the Reasoner alignment SFT loads `Qwen/Qwen3-VL-8B-Instruct` from the HF Hub at startup (no DCP conversion required). To start from a merged Cosmos3 reasoner snapshot instead, build one with `convert_model_to_vlm_safetensors` (see the VideoPhy-2 note below) and pass it via `VLM_SAFETENSORS_PATH`.
-
-**Reasoner Alignment SFT with VideoPhy-2 (Cosmos3-Nano):** Use `cosmos_framework.scripts.convert_model_to_vlm_safetensors` instead.
-
-```shell
-# Step 2 (VLM checkpoint): merge Cosmos3-Nano LM onto the Qwen3-VL visual tower.
-# Replaces the convert_model_to_dcp step used by the VFM recipes above.
-python -m cosmos_framework.scripts.convert_model_to_vlm_safetensors \
-    --checkpoint-path Cosmos3-Nano \
-    -o checkpoints/Cosmos3-Nano-VLM
-```
-
-## Step 3 — Run training
-
-**Weights & Biases (optional):** every recipe TOML defaults to `job.wandb_mode = "disabled"`. To log a run to W&B, flip that field to `"online"` in the TOML and export `WANDB_API_KEY` in your environment before launching.
-
-### Option A (recommended): the paired launch shell
-
-Each recipe ships as a `toml/sft_config/<recipe>.toml` (validated against the pydantic schema at [`cosmos_framework/configs/toml_config/sft_config.py`](https://github.com/NVIDIA/cosmos-framework/blob/main/cosmos_framework/configs/toml_config/sft_config.py)) paired with `launch_sft_<recipe>.sh`; the full upstream catalog is indexed in [the framework's examples index](https://github.com/NVIDIA/cosmos-framework/blob/main/examples/README.md). Each `.sh` sources [`_sft_launcher_common.sh`](_sft_launcher_common.sh) and forwards into `cosmos_framework.scripts.train --sft-toml=<recipe-toml>`. From this cookbook directory, run the launch shell paired with the recipe you set up in Step 1. The wrapper resolves `DATASET_PATH`, `BASE_CHECKPOINT_PATH`, and `WAN_VAE_PATH` from the default locations under this cookbook directory (populated by Step 1 + Step 2), so no env-var setup is required (see [below](#overriding-the-defaults) to override):
-
-```shell
-# from this cookbook directory, after Step 1 + Step 2:
-bash launch_sft_vision_nano.sh
-```
-
-Each launcher's default paths come from the `DATASET_PATH` + `BASE_CHECKPOINT_PATH` defaults declared at the top of its `.sh` (each uses `: "${VAR:=…}"` so any value you `export` in the shell before launching wins over the default):
-
-| Launch shell                   | Post-Training Task | Default $DATASET_PATH (under data/)                        | Default $BASE_CHECKPOINT_PATH (under checkpoints/)          |
-| ------------------------------ | ------------------ | ---------------------------------------------------------- | ----------------------------------------------------------- |
-| `launch_sft_vision_nano.sh`    | Generator SFT      | `BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge` | `Cosmos3-Nano`                                              |
-| `launch_sft_vision_super.sh`   | Generator SFT      | `BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge` | `Cosmos3-Super`                                             |
-| `launch_sft_llava_ov.sh`       | Reasoner SFT       | (none; dataset streams from HF Hub)                        | (none; backbone fetched at startup, or set `VLM_SAFETENSORS_PATH`) |
-| `launch_sft_videophy2_nano.sh` | Reasoner SFT       | (none; set `VIDEOPHYSICS_ROOT` env)                        | (none; set `VLM_SAFETENSORS_PATH` env)                      |
-
-`WAN_VAE_PATH` defaults to `checkpoints/wan22_vae/Wan2.2_VAE.pth` for every non-reasoner recipe.
-
-**Reasoner Alignment SFT with VideoPhy-2 (Cosmos3-Nano):**
-
-```shell
-# Step 3 (launch): export both env vars, then launch.
-export VIDEOPHYSICS_ROOT=$PWD/data/videophysics
-export VLM_SAFETENSORS_PATH=$PWD/checkpoints/Cosmos3-Nano-VLM
-bash launch_sft_videophy2_nano.sh
-```
-
-#### Overriding the defaults
-
-If you'd rather put data or checkpoints on a different filesystem (e.g. a faster SSD or shared mount), download to your chosen path in Step 1 / convert the DCP to your chosen path in Step 2, then export the matching env var(s) before launching:
-
-```shell
-# Example: data on /scratch, base DCP on /nfs/ckpts.
-export DATASET_PATH=/scratch/BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge
-export BASE_CHECKPOINT_PATH=/nfs/ckpts/Cosmos3-Nano
-export WAN_VAE_PATH=/nfs/ckpts/wan22_vae/Wan2.2_VAE.pth
-bash launch_sft_vision_nano.sh
-```
-
-Each env var falls back to its default if unset, so you only need to export the ones you're moving. The downloads / `convert_model_to_dcp` commands in Step 1 + Step 2 just need their `--local-dir` / `-o` argument pointed at the same path you export here. `.gitignore` excludes `data/`, `checkpoints/`, and `outputs/` under this cookbook directory so the multi-GB downloads aren't tracked when you keep the defaults.
-
-### Option B: raw `torchrun`
-
-If you'd rather not use the paired launch shell, invoke `torchrun` directly with the recipe's TOML. Unlike Option A, **raw `torchrun` does not auto-resolve `DATASET_PATH` / `BASE_CHECKPOINT_PATH` / `WAN_VAE_PATH`** — they have to come from your shell:
-
-- `BASE_CHECKPOINT_PATH` and `WAN_VAE_PATH` are read via `${oc.env:BASE_CHECKPOINT_PATH}` / `${oc.env:WAN_VAE_PATH}` at the TOML's `[checkpoint].load_path` / `[model.tokenizer].vae_path` keys.
-- `DATASET_PATH` is read via `${oc.env:DATASET_PATH}` inside the experiment-SKU Python (e.g. `cosmos_framework/configs/base/experiment/sft/<recipe>.py`), not in the TOML.
-
-You have two options to fill them in (pick either, not both):
-
-1. **Export them in the shell before `torchrun`** (whether they point at the default `data/` / `checkpoints/` paths from Step 1+2 or your own overrides) — shown below.
-2. **Edit the TOML by hand** — open `toml/sft_config/<recipe>.toml` and replace the `${oc.env:BASE_CHECKPOINT_PATH}` / `${oc.env:WAN_VAE_PATH}` placeholders with literal paths. Useful if you want a self-contained TOML you can hand to a colleague or commit alongside an experiment record. (Hand-editing won't help for `DATASET_PATH` — that's resolved out of the experiment Python, so you must still export it.)
-
-Run from this cookbook directory (`cookbooks/cosmos3/finetune/`) with the framework venv active; the snippet uses `$PWD` to absolutize the relative paths.
-
-```shell
-# This example uses the vision_sft_nano recipe end-to-end (same recipe as
-# Option A). To switch recipes, swap TOML_FILE + DATASET_PATH per the table in
-# Option A, and Cosmos3-Nano → Cosmos3-Super on the LoRA / super recipes.
-TOML_FILE="toml/sft_config/vision_sft_nano.toml"
-
-# Match the launcher's defaults — or substitute your own paths.
-export DATASET_PATH="$PWD/data/BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge"
-export BASE_CHECKPOINT_PATH="$PWD/checkpoints/Cosmos3-Nano"
-export WAN_VAE_PATH="$PWD/checkpoints/wan22_vae/Wan2.2_VAE.pth"
-
-IMAGINAIRE_OUTPUT_ROOT=outputs/train \
-torchrun --nproc_per_node=8 -m cosmos_framework.scripts.train \
-    --sft-toml=$TOML_FILE
-```
-
-To resume from the latest in-progress checkpoint, point `BASE_CHECKPOINT_PATH` at the run's `checkpoints/iter_<N>/` directory under `$IMAGINAIRE_OUTPUT_ROOT/<project>/<group>/<name>/` (see [Outputs](#outputs) below for the full layout).
-
-## Outputs
-
-Outputs land under `$IMAGINAIRE_OUTPUT_ROOT/<project>/<group>/<name>/`:
-
-1. `config.yaml`, `config.pkl`: Finalized resolved config (YAML for inspection, pickle for re-instantiation).
-1. `launch_info.yaml`, `job_env.yaml`: Job metadata and captured launch environment.
-1. `checkpoints/`:
-    1. `latest_checkpoint.txt`: Pointer file containing the latest checkpoint directory name (e.g. `iter_000000200`).
-    1. `iter_<iter>/`: DCP checkpoint saved every `[train.ckpt].save_freq` iterations (zero-padded 9-digit, e.g. `iter_000000200/`):
-        1. `model/`: model weights (sharded `.distcp`).
-        1. `optim/`: optimizer state.
-        1. `scheduler/`: LR scheduler state.
-        1. `trainer/`: training state — includes the `iteration` counter and per-rank `rng_state_<i>` (numpy + random + torch + torch_cuda).
-        1. `dataloader/`: optional per-rank pickle shards (`rank_<i>.pkl`) — only present for dataloaders that implement `has_state()`.
-1. `<callback_name>/`: Callback outputs, one directory per registered callback (e.g. `DeviceMonitor/`, `EveryNDrawSample/`, `norm_monitor/`).
-1. `wandb/`, `wandb_id.txt`: Wandb run files — only present when `[job].wandb_mode` is `online` or `offline`.
-
-The shorthand `$RUN_DIR` used in the rest of this page refers to `$IMAGINAIRE_OUTPUT_ROOT/<project>/<group>/<name>`. For example, with `IMAGINAIRE_OUTPUT_ROOT=outputs/train` and the `vision_sft_nano` recipe, `$RUN_DIR` is `outputs/train/cosmos3/sft/vision_sft_nano`.
-
-## Export checkpoint to Hugging Face safetensors
-
-Export the DCP checkpoint produced in Step 3 to a Hugging Face safetensors checkpoint:
-
-```shell
-RUN_DIR=$IMAGINAIRE_OUTPUT_ROOT/<project>/<group>/<name>
-
-CHECKPOINT_ITER=$(cat $RUN_DIR/checkpoints/latest_checkpoint.txt)
-CHECKPOINT_PATH=$RUN_DIR/checkpoints/$CHECKPOINT_ITER
-
-python -m cosmos_framework.scripts.export_model \
-  --checkpoint-path $CHECKPOINT_PATH \
-  --config-file $RUN_DIR/config.yaml \
-  -o $RUN_DIR/model
-```
-
-The exported safetensors land at `$RUN_DIR/model` and can be used in [Inference](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/inference.md) commands by passing `--checkpoint-path $RUN_DIR/model`.
-
-## Config
-
-The recipe TOML is parsed against the pydantic schema [`SFTExperimentConfig`](https://github.com/NVIDIA/cosmos-framework/blob/main/cosmos_framework/configs/toml_config/sft_config.py) at load time. Every top-level key listed below maps to a sub-model in that file; unknown keys raise a `ValidationError` before training starts (`extra="forbid"` on every sub-model). Values may use OmegaConf env interpolation `${oc.env:NAME}` — the recipe TOMLs use this for `BASE_CHECKPOINT_PATH` (`[checkpoint].load_path`) and `WAN_VAE_PATH` (`[model.tokenizer].vae_path`). `DATASET_PATH` is consumed the same way but inside the experiment-SKU Python (`cosmos_framework/configs/base/experiment/sft/<recipe>.py`), not in the TOML.
-
-For the full field-by-field reference (every section, every default, every VFM/VLM applicability note, the `"???"` MISSING sentinel, env interpolation, the VFM↔VLM path-remap table, and how to extend the schema), see [SFT Structured-TOML Config Reference](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/sft_config.md).
-
-The commonly tuned knobs:
-
-1. `[job]`
-    1. `task` — `"vfm"` (generator recipes) or `"vlm"` (Reasoner alignment). Picks the base config: `cosmos_framework/configs/base/config.py` vs `…/vlm/config.py`. Also drives `PATH_REMAPS` in `toml_config_helper.py`.
-    1. `experiment` — Registered experiment SKU name (e.g. `vision_sft_nano`). Each SKU is a Python file under `cosmos_framework/configs/base/experiment/sft/` that wires up dataloader, model variant, and recipe-specific defaults.
-    1. `project`, `group`, `name` — Components of the run output dir `$IMAGINAIRE_OUTPUT_ROOT/<project>/<group>/<name>/`. Also flow to W&B as the project / group / run name.
-    1. `wandb_mode` — `"online"` (logs to W&B; `WANDB_API_KEY` must be set), `"offline"` (logs locally, sync later with `wandb sync`), or `"disabled"`.
-1. `[model]`
-    1. `max_num_tokens_after_packing` — VFM token-packing target. `-1` disables the cap. VFM only; VLM uses `data_setting.max_tokens` (tail override).
-    1. `joint_attn_implementation` — VFM attention layout: `"two_way"` / `"three_way"` (NATTEN) / `"flex"`.
-    1. `attn_implementation` — VLM attention impl: `"cosmos"` / `"flash_attention_2"` / `"sdpa"` / `"eager"`. VLM only.
-    1. `lora_enabled`, `lora_rank`, `lora_alpha`, `lora_target_modules` — LoRA adapter knobs for the generation pathway. Used by SUPER-tier recipes; NANO-tier leaves `lora_enabled=false`. VFM only.
-1. `[model.ema]`
-    1. `enabled`, `rate`, `iteration_shift` — Exponential moving average of generation-pathway weights. Full fine-tunes typically enable it; LoRA recipes leave it off.
-1. `[model.parallelism]`
-    1. `data_parallel_shard_degree` — FSDP shard degree. `data_parallel_shard_degree × data_parallel_replicate_degree × context_parallel_shard_degree` must equal `WORLD_SIZE`. `-1` autoselects from torchrun world size.
-    1. `data_parallel_replicate_degree` — HSDP replicate degree (outer replicate loop over the shard topology).
-    1. `context_parallel_shard_degree` — Context-parallel shard degree. `>1` splits the sequence dim across ranks (used by super-tier configs: DP=4, CP=2 → 8 GPUs).
-    1. `cfg_parallel_shard_degree` — Classifier-free-guidance shard degree. Almost always `1` for SFT.
-    1. `fsdp_master_dtype` — Master parameter / FSDP reduce dtype: typically `"float32"`.
-1. `[model.compile]`
-    1. `enabled` — Enable `torch.compile`. Improves speed at the cost of memory.
-    1. `compile_dynamic` — Whether to compile with symbolic-shape (dynamic) kernels. `True` (default) is appropriate for training; AR inference may prefer `False` for stable shapes.
-1. `[model]`
-    1. `precision` — Compute dtype for forward/backward: `"bfloat16"` / `"float16"` / `"float32"`. Master weights stay fp32 separately.
-1. `[model.activation_checkpointing]`
-    1. `mode` — `"none"` / `"selective"` (per-op SAC, MoT-only) / `"full"` (per-block checkpointing).
-    1. `save_ops_regex` — Regex patterns for ops to keep saved under `mode="selective"`.
-    1. `preserve_rng_state`, `determinism_check` — Recompute determinism plumbing.
-1. `[model.tokenizer]`
-    1. `vae_path` — Wan2.2 VAE `.pth` path. Recipe TOMLs use `"${oc.env:WAN_VAE_PATH}"`. VFM only.
-1. `[optimizer]`
-    1. `lr` — Base learning rate.
-    1. `betas`, `eps`, `fused`, `weight_decay` — Standard AdamW knobs. `eps` is VFM-only.
-    1. `keys_to_select` — Substring allowlist for trainable params. Empty list = train everything; `["lora_"]` = adapter-only fine-tune.
-1. `[optimizer.lr_multipliers]`
-    1. Inline table of `<substring> = <multiplier>` pairs that scale the LR of params whose name contains the substring. The shipped vision recipes leave this empty (Hydra default `{}` stands).
-1. `[scheduler]`
-    1. `cycle_lengths`, `warm_up_steps` — Cycle length and warmup duration (lists, one entry per cycle), in optimizer steps.
-    1. `f_max`, `f_min`, `f_start` — LR multipliers at peak / trough / step-0 (ratios of `optimizer.lr`).
-    1. `verbosity_interval` — Scheduler-side LR log frequency. VFM only.
-1. `[trainer]`
-    1. `max_iter` — Total optimizer steps.
-    1. `grad_accum_iter` — Micro-batches per optimizer step. Effective global batch = `grad_accum_iter × per-rank batch × world_size`.
-    1. `logging_iter` — Console / W&B scalar log frequency.
-    1. `distributed_parallelism` — `"fsdp"` is the only supported value.
-1. `[trainer.callbacks.compile_tokenizer]`
-    1. `enabled`, `compile_after_iterations`, `warmup_resolutions` — Lazy `torch.compile` of the VAE tokenizer. VFM only.
-1. `[trainer.callbacks.grad_clip]`
-    1. `clip_norm` — Max global L2 norm of the gradient (steps with larger norm are rescaled).
-    1. `force_finite` — Replace NaN/Inf grads with zero (default `true` on VFM, `false` on VLM).
-1. `[checkpoint]`
-    1. `load_path` — Base DCP checkpoint directory to resume from (Step 2 output, or a prior run's `checkpoints/iter_<N>/`). Recipe TOMLs use `"${oc.env:BASE_CHECKPOINT_PATH}"`.
-    1. `save_iter` — Save a new DCP checkpoint every N optimizer steps.
-    1. `keys_to_skip_loading` — Substring blocklist applied at load time. Used to mask EMA / LoRA tensors when warm-starting from a checkpoint that doesn't have them yet.
-1. `[dataloader_train]` — Top-level scalars only; the dataloader's class (LazyCall) and pipeline wiring (datasets, packers, …) stay in the experiment Python.
-    1. `max_samples_per_batch` — Per-micro-batch sample cap (remapped to `max_batch_size` on the VLM packer). `null` / omitted = no per-count cap.
-    1. `max_sequence_length` — Per-packed-sequence token cap (remapped to `max_tokens` on the VLM packer).
-    1. `seed` — Dataloader RNG seed (VFM only).
-
-### Common Hydra tail overrides
-
-These knobs aren't part of the pydantic schema today; pass them as trailing `key.path=value` positionals after `--` (the `cosmos_framework.scripts.train` flow forwards them through OmegaConf):
-
-- `model.config.policy.backbone.model_name` — VLM backbone HF identifier (e.g. `Qwen/Qwen3-VL-8B-Instruct`). Used by `launch_sft_llava_ov.sh`.
-- `data_setting.max_tokens` — VLM token-packing cap (the VLM analogue of `[model].max_num_tokens_after_packing`). Used by `launch_sft_llava_ov.sh`.
-
-The launchers wire these via `TAIL_OVERRIDES=(…)`; the helper appends `-- "${TAIL_OVERRIDES[@]}"` after the `--sft-toml=` argument.
diff --git a/cookbooks/cosmos3/finetune/_sft_launcher_common.sh b/cookbooks/cosmos3/finetune/_sft_launcher_common.sh
deleted file mode 100644
index 377b10f9..00000000
--- a/cookbooks/cosmos3/finetune/_sft_launcher_common.sh
+++ /dev/null
@@ -1,98 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: OpenMDW-1.1
-
-# Shared launch plumbing for the cookbook launch_sft_<recipe>.sh — the
-# structured-TOML / pydantic-schema flow that drives cosmos_framework.scripts.train.
-#
-# REQUIRES: an activated cosmos-framework venv (see the finetune README
-# Prerequisites) so `cosmos_framework` is importable. This launcher does NOT
-# add the framework's .venv/bin to PATH.
-#
-# Caller MUST set before sourcing:
-#   TOML_FILE            recipe TOML, e.g. "toml/sft_config/<recipe>.toml".
-#                        Absolute or cookbook-relative.
-#
-# Caller MAY set before sourcing (presence drives which existence checks fire):
-#   DATASET_PATH         recipe-local dataset dir, e.g. "data/<name>".
-#                        If unset, no dataset existence check fires
-#                        (reasoner / HF-streaming case).
-#   BASE_CHECKPOINT_PATH recipe-local base DCP dir, e.g. "checkpoints/<name>".
-#                        Setting it also enables WAN_VAE_PATH plumbing + check.
-#   WAN_VAE_PATH         override the default checkpoints/wan22_vae/Wan2.2_VAE.pth.
-#   EXTRA_DATASET_CHECK  bash snippet (string) eval'd after the default checks.
-#   TAIL_OVERRIDES       bash array of Hydra CLI overrides appended after `--`
-#                        (e.g. data_setting.max_tokens=16000 for VLM smokes).
-#   MASTER_PORT          torchrun --master_port; default 50012.
-#   NPROC_PER_NODE       torchrun --nproc_per_node; default 8.
-#   LOG_FILENAME         override $LOG_DIR/${LOG_FILENAME}
-#                        (default <toml-stem>_sft.log).
-#
-# Absolute paths are passed through; relative paths are anchored to the cookbook
-# dir (the directory containing this launcher). Paths set in the caller's shell
-# via `export DATASET_PATH=...` etc. win over the launcher's defaults (use the
-# `: "${VAR:=default}"` idiom in the launcher to preserve this).
-
-set -uo pipefail
-
-: "${TOML_FILE:?TOML_FILE must be set before sourcing _sft_launcher_common.sh}"
-
-# Cookbook dir = the wrapper's own directory (cookbooks/cosmos3/finetune/).
-WORKDIR="$(cd "$(dirname "${BASH_SOURCE[1]}")" && pwd)"
-
-# Anchor relative paths to $WORKDIR.
-[[ "$TOML_FILE" = /* ]] || TOML_FILE="$WORKDIR/$TOML_FILE"
-
-if [[ -n "${DATASET_PATH:-}" ]]; then
-    [[ "$DATASET_PATH" = /* ]] || DATASET_PATH="$WORKDIR/$DATASET_PATH"
-    export DATASET_PATH
-fi
-
-if [[ -n "${BASE_CHECKPOINT_PATH:-}" ]]; then
-    [[ "$BASE_CHECKPOINT_PATH" = /* ]] || BASE_CHECKPOINT_PATH="$WORKDIR/$BASE_CHECKPOINT_PATH"
-    WAN_VAE_PATH="${WAN_VAE_PATH:-checkpoints/wan22_vae/Wan2.2_VAE.pth}"
-    [[ "$WAN_VAE_PATH" = /* ]] || WAN_VAE_PATH="$WORKDIR/$WAN_VAE_PATH"
-    export BASE_CHECKPOINT_PATH WAN_VAE_PATH
-fi
-
-OUTPUT_ROOT="${OUTPUT_ROOT:-$WORKDIR/outputs/train}"
-LOG_DIR="$OUTPUT_ROOT/logs"
-TOML_STEM="$(basename "$TOML_FILE" .toml)"
-LOG_FILE="$LOG_DIR/${LOG_FILENAME:-${TOML_STEM}_sft.log}"
-IMAGINAIRE_OUTPUT_ROOT="${IMAGINAIRE_OUTPUT_ROOT:-$OUTPUT_ROOT}"
-mkdir -p "$LOG_DIR"
-
-echo ">>> $(date '+%H:%M:%S') Checking inputs..."
-[[ -f "$TOML_FILE" ]] || { echo "ERROR: TOML not found: $TOML_FILE" >&2; exit 1; }
-if [[ -n "${DATASET_PATH:-}" ]]; then
-    [[ -d "$DATASET_PATH" ]] || { echo "ERROR: DATASET_PATH not found: $DATASET_PATH (run Step 1 of the finetune README, or export DATASET_PATH=<path>)" >&2; exit 1; }
-fi
-if [[ -n "${BASE_CHECKPOINT_PATH:-}" ]]; then
-    [[ -d "$BASE_CHECKPOINT_PATH" ]] || { echo "ERROR: BASE_CHECKPOINT_PATH not found: $BASE_CHECKPOINT_PATH (run Step 2 of the finetune README, or export BASE_CHECKPOINT_PATH=<path>)" >&2; exit 1; }
-    [[ -f "$WAN_VAE_PATH" ]]         || { echo "ERROR: WAN_VAE_PATH not found: $WAN_VAE_PATH (run Step 1 of the finetune README, or export WAN_VAE_PATH=<path>)" >&2; exit 1; }
-fi
-if [[ -n "${EXTRA_DATASET_CHECK:-}" ]]; then eval "$EXTRA_DATASET_CHECK"; fi
-
-cd "$WORKDIR"
-echo ">>> $(date '+%H:%M:%S') WORKDIR:    $WORKDIR"
-echo ">>> $(date '+%H:%M:%S') TOML:       $TOML_FILE"
-[[ -n "${DATASET_PATH:-}" ]]         && echo ">>> $(date '+%H:%M:%S') dataset:    $DATASET_PATH"
-[[ -n "${BASE_CHECKPOINT_PATH:-}" ]] && echo ">>> $(date '+%H:%M:%S') checkpoint: $BASE_CHECKPOINT_PATH"
-echo ">>> $(date '+%H:%M:%S') log:        $LOG_FILE"
-
-# Default empty if caller didn't set; safe under set -u.
-[[ ${TAIL_OVERRIDES+x} ]] || TAIL_OVERRIDES=()
-
-TRAILING_ARGS=()
-if (( ${#TAIL_OVERRIDES[@]} > 0 )); then
-    TRAILING_ARGS=(-- "${TAIL_OVERRIDES[@]}")
-fi
-
-IMAGINAIRE_OUTPUT_ROOT="$IMAGINAIRE_OUTPUT_ROOT" \
-    torchrun --nproc_per_node="${NPROC_PER_NODE:-8}" --master_port="${MASTER_PORT:-50012}" -m cosmos_framework.scripts.train \
-    --sft-toml="$TOML_FILE" \
-    "${TRAILING_ARGS[@]}" \
-    2>&1 | tee "$LOG_FILE"
-
-EXIT_CODE=${PIPESTATUS[0]}
-echo ">>> $(date '+%H:%M:%S') Done (exit $EXIT_CODE)"
-exit $EXIT_CODE
diff --git a/cookbooks/cosmos3/finetune/launch_sft_llava_ov.sh b/cookbooks/cosmos3/finetune/launch_sft_llava_ov.sh
deleted file mode 100644
index 1967cfca..00000000
--- a/cookbooks/cosmos3/finetune/launch_sft_llava_ov.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/usr/bin/env bash
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: OpenMDW-1.1
-
-# Structured-TOML launch for llava_ov (VLM SFT on
-# lmms-lab/LLaVA-OneVision-Data via CosmosDataLoader). Drives
-# cosmos_framework.scripts.train against toml/sft_config/llava_ov.toml.
-#
-# [job].task = "vlm" — picks cosmos_framework/configs/base/vlm/config.py as the base config.
-#
-# Requires an activated cosmos-framework venv (see the finetune README
-# Prerequisites). Run from cookbooks/cosmos3/finetune/.
-#
-# The dataset streams from the HuggingFace Hub, so DATASET_PATH /
-# WAN_VAE_PATH / BASE_CHECKPOINT_PATH are NOT required.
-#
-# Optional env:
-#   HF_TOKEN               for gated Qwen3-VL-8B-Instruct downloads.
-#   VLM_SAFETENSORS_PATH   local directory of pre-converted Qwen3-VL safetensors
-#                          (e.g. a Cosmos3-Nano LM merged with Qwen3-VL visual via
-#                          `cosmos_framework.scripts.convert_model_to_vlm_safetensors`).
-#                          When set, plumbed to backbone.safetensors_path via a
-#                          tail override. When unset, the framework falls back
-#                          to the public Qwen/Qwen3-VL-8B-Instruct HF snapshot.
-#
-# Usage (8-GPU allocation, framework venv active, from cookbooks/cosmos3/finetune/):
-#   bash launch_sft_llava_ov.sh
-
-TOML_FILE="toml/sft_config/llava_ov.toml"
-
-TAIL_OVERRIDES=(
-    ${EXTRA_TAIL_OVERRIDES:-}
-)
-
-# When VLM_SAFETENSORS_PATH is set, plumb it to backbone.safetensors_path so the
-# framework loads weights from the local snapshot (e.g. a Cosmos3-Nano LM merged
-# with Qwen3-VL visual via `cosmos_framework.scripts.convert_model_to_vlm_safetensors`)
-# while keeping the public HF model_name for tokenizer/architecture discovery.
-if [[ -n "${VLM_SAFETENSORS_PATH:-}" ]]; then
-    TAIL_OVERRIDES+=("model.config.policy.backbone.safetensors_path=$VLM_SAFETENSORS_PATH")
-fi
-
-source "$(dirname "${BASH_SOURCE[0]}")/_sft_launcher_common.sh"
diff --git a/cookbooks/cosmos3/finetune/launch_sft_videophy2_nano.sh b/cookbooks/cosmos3/finetune/launch_sft_videophy2_nano.sh
deleted file mode 100644
index 9499cc5c..00000000
--- a/cookbooks/cosmos3/finetune/launch_sft_videophy2_nano.sh
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/usr/bin/env bash
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: OpenMDW-1.1
-
-# Structured-TOML launch for videophy2_sft_nano (VLM dialog SFT on VideoPhy-2
-# via CosmosDataLoader). Drives cosmos_framework.scripts.train against
-# toml/sft_config/videophy2_sft_nano.toml.
-#
-# [job].task = "vlm" — picks cosmos_framework/configs/base/vlm/config.py as the base config.
-#
-# Requires an activated cosmos-framework venv (see the finetune README
-# Prerequisites). Run from cookbooks/cosmos3/finetune/.
-#
-# Required env:
-#   VIDEOPHYSICS_ROOT  dir containing videophy2_train/ and videophy2_val/
-#                      (each with meta.json + media/ + text/). Populate via
-#                      `python -m cosmos_framework.scripts.vlm.prepare_videophy2_from_hf`.
-#
-# Optional env:
-#   HF_TOKEN               for gated Qwen3-VL-8B-Instruct downloads.
-#   VLM_SAFETENSORS_PATH   local directory of pre-converted Qwen3-VL safetensors
-#                          (e.g. Cosmos3-Nano LM merged with Qwen3-VL visual via
-#                          `cosmos_framework.scripts.convert_model_to_vlm_safetensors`).
-#                          When set, plumbed to backbone.safetensors_path via a
-#                          tail override. When unset, the framework falls back
-#                          to the public Qwen/Qwen3-VL-8B-Instruct HF snapshot.
-#
-# Usage (8-GPU allocation, framework venv active, from cookbooks/cosmos3/finetune/):
-#   VIDEOPHYSICS_ROOT=/path/to/videophysics bash launch_sft_videophy2_nano.sh
-
-TOML_FILE="toml/sft_config/videophy2_sft_nano.toml"
-
-TAIL_OVERRIDES=(
-    ${EXTRA_TAIL_OVERRIDES:-}
-)
-
-# When VLM_SAFETENSORS_PATH is set, plumb it to backbone.safetensors_path so the
-# framework loads weights from the local snapshot (e.g. a Cosmos3-Nano LM merged
-# with Qwen3-VL visual via `cosmos_framework.scripts.convert_model_to_vlm_safetensors`)
-# while keeping the public HF model_name for tokenizer/architecture discovery.
-if [[ -n "${VLM_SAFETENSORS_PATH:-}" ]]; then
-    TAIL_OVERRIDES+=("model.config.policy.backbone.safetensors_path=$VLM_SAFETENSORS_PATH")
-fi
-
-source "$(dirname "${BASH_SOURCE[0]}")/_sft_launcher_common.sh"
diff --git a/cookbooks/cosmos3/finetune/launch_sft_vision_nano.sh b/cookbooks/cosmos3/finetune/launch_sft_vision_nano.sh
deleted file mode 100644
index d67c4ddc..00000000
--- a/cookbooks/cosmos3/finetune/launch_sft_vision_nano.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/usr/bin/env bash
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: OpenMDW-1.1
-
-# Structured-TOML launch for vision_sft_nano (T2V / I2V / V2V vision-only
-# SFT on Qwen3-VL-8B, 8-GPU FSDP). Drives cosmos_framework.scripts.train against
-# toml/sft_config/vision_sft_nano.toml.
-#
-# Requires an activated cosmos-framework venv (see the finetune README
-# Prerequisites). Run from cookbooks/cosmos3/finetune/.
-#
-# Optional env vars (defaults below point under this cookbook dir; override to
-# put data or checkpoints on a different filesystem):
-#   DATASET_PATH          default: data/BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge
-#                         (must contain train/video_dataset_file.jsonl)
-#   BASE_CHECKPOINT_PATH  default: checkpoints/Cosmos3-Nano
-#   WAN_VAE_PATH          default: checkpoints/wan22_vae/Wan2.2_VAE.pth
-#   HF_TOKEN              if any tokenizer download requires gated HF access
-#   OUTPUT_ROOT           default: outputs/train
-#
-# Usage (8-GPU allocation, framework venv active, from cookbooks/cosmos3/finetune/):
-#   bash launch_sft_vision_nano.sh
-
-TOML_FILE="toml/sft_config/vision_sft_nano.toml"
-: "${DATASET_PATH:=data/BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge}"
-: "${BASE_CHECKPOINT_PATH:=checkpoints/Cosmos3-Nano}"
-
-EXTRA_DATASET_CHECK='[[ -f "$DATASET_PATH/train/video_dataset_file.jsonl" ]] || { echo "ERROR: missing $DATASET_PATH/train/video_dataset_file.jsonl" >&2; exit 1; }'
-
-source "$(dirname "${BASH_SOURCE[0]}")/_sft_launcher_common.sh"
diff --git a/cookbooks/cosmos3/finetune/launch_sft_vision_super.sh b/cookbooks/cosmos3/finetune/launch_sft_vision_super.sh
deleted file mode 100644
index 54bfde97..00000000
--- a/cookbooks/cosmos3/finetune/launch_sft_vision_super.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/usr/bin/env bash
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: OpenMDW-1.1
-
-# Structured-TOML launch for vision_sft_super (T2V / I2V / V2V LoRA SFT on
-# Qwen3-VL-32B-Instruct, 8-GPU FSDP with CP=2 / DP=4). Drives
-# cosmos_framework.scripts.train against toml/sft_config/vision_sft_super.toml.
-#
-# Requires an activated cosmos-framework venv (see the finetune README
-# Prerequisites). Run from cookbooks/cosmos3/finetune/.
-#
-# Optional env vars (defaults below point under this cookbook dir; override to
-# put data or checkpoints on a different filesystem):
-#   DATASET_PATH          default: data/BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge
-#                         (must contain train/video_dataset_file.jsonl)
-#   BASE_CHECKPOINT_PATH  default: checkpoints/Cosmos3-Super
-#   WAN_VAE_PATH          default: checkpoints/wan22_vae/Wan2.2_VAE.pth
-#   HF_TOKEN              if any tokenizer download requires gated HF access
-#   OUTPUT_ROOT           default: outputs/train
-#
-# Usage (8-GPU allocation, framework venv active, from cookbooks/cosmos3/finetune/):
-#   bash launch_sft_vision_super.sh
-
-TOML_FILE="toml/sft_config/vision_sft_super.toml"
-: "${DATASET_PATH:=data/BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge}"
-: "${BASE_CHECKPOINT_PATH:=checkpoints/Cosmos3-Super}"
-
-EXTRA_DATASET_CHECK='[[ -f "$DATASET_PATH/train/video_dataset_file.jsonl" ]] || { echo "ERROR: missing $DATASET_PATH/train/video_dataset_file.jsonl" >&2; exit 1; }'
-
-# Super-variant env tweaks: clear LD_LIBRARY_PATH to avoid host CUDA/NCCL libs
-# bleeding into the venv, switch the allocator to expandable_segments so the
-# 32B backbone fits without OOM during compile/decode.
-export LD_LIBRARY_PATH=""
-export PYTORCH_ALLOC_CONF="${PYTORCH_ALLOC_CONF:-expandable_segments:True}"
-
-source "$(dirname "${BASH_SOURCE[0]}")/_sft_launcher_common.sh"
diff --git a/cookbooks/cosmos3/generator/audiovisual/finetune/README.md b/cookbooks/cosmos3/generator/audiovisual/finetune/README.md
new file mode 100644
index 00000000..77dd1a04
--- /dev/null
+++ b/cookbooks/cosmos3/generator/audiovisual/finetune/README.md
@@ -0,0 +1,58 @@
+# Cosmos3 Vision Generator Fine-Tuning (SFT)
+
+Supervised fine-tuning (SFT) of the Cosmos3 video generator on your own captioned video data. Tested on 8×H100 (80 GB).
+
+| Recipe | Launch shell | Base model | Dataset |
+| --- | --- | --- | --- |
+| Vision SFT (full) | `launch_sft_vision_nano.sh` | Cosmos3-Nano | [BridgeData2-Subset-Synthetic-Captions](https://huggingface.co/datasets/nvidia/BridgeData2-Subset-Synthetic-Captions) |
+| Vision SFT (LoRA) | `launch_sft_vision_super.sh` | Cosmos3-Super | same as above |
+
+Both recipes train on structured-JSON captions (`caption_json`, the model's native prompt format), so training stays aligned with inference.
+
+## Prerequisites
+
+1. **Install the framework.** These recipes drive `cosmos_framework.scripts.train`, so install a cosmos-framework checkout first — follow the shared [Cosmos Framework setup](../../../README.md#cosmos-framework) (clone into `packages/cosmos3`, then `uv sync --all-extras --group=cu130-train`; use `cu128-train` on a CUDA 12.x driver).
+2. **Recommended container.** For a curated CUDA + PyTorch base, NVIDIA recommends starting from the NGC PyTorch container **`nvcr.io/nvidia/pytorch:25.09-py3`** (CUDA 13; use **`:25.06-py3`** for a CUDA 12.8 driver). See the framework [setup guide](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/setup.md#recommended-base-image).
+3. **Activate** the framework venv so `cosmos_framework` is importable: `source <path-to>/packages/cosmos3/.venv/bin/activate`.
+4. **Hugging Face access.** Some assets are license-gated — accept terms on the dataset/model pages and authenticate once with `uvx hf@latest auth login` (or export `HF_TOKEN`).
+5. **Run from this directory** (`cookbooks/cosmos3/generator/audiovisual/finetune/`). Downloads, converted checkpoints, and run outputs default to `data/`, `checkpoints/`, and `outputs/` here (all git-ignored).
+
+## Quick start
+
+Each launcher is a complete recipe — run it from this folder and it downloads the dataset, fetches the Wan2.2 VAE, converts the base checkpoint, then runs 8-GPU training (the download/convert steps are skipped if their outputs already exist):
+
+```shell
+bash launch_sft_vision_nano.sh      # full SFT on Cosmos3-Nano
+# or
+bash launch_sft_vision_super.sh     # LoRA SFT on Cosmos3-Super
+```
+
+Paths are fixed at the top of each script and point into this folder's git-ignored `data/` and `checkpoints/` directories — edit them there to put data or checkpoints on another filesystem.
+
+## Outputs
+
+Training writes to `outputs/train/<project>/<group>/<name>/`:
+
+- `checkpoints/iter_<N>/` — DCP checkpoint (model / optim / scheduler / trainer state); `checkpoints/latest_checkpoint.txt` names the newest.
+- `config.yaml`, launch metadata, logs, and one directory per registered callback.
+
+## Export to Hugging Face safetensors
+
+```shell
+RUN_DIR=outputs/train/<project>/<group>/<name>
+CKPT=$RUN_DIR/checkpoints/$(cat "$RUN_DIR/checkpoints/latest_checkpoint.txt")
+python -m cosmos_framework.scripts.export_model \
+    --checkpoint-path "$CKPT" --config-file "$RUN_DIR/config.yaml" -o "$RUN_DIR/model"
+```
+
+Use the exported `$RUN_DIR/model` with the [audiovisual inference cookbook](../README.md).
+
+## Advanced configuration
+
+These recipes are intentionally minimal. For the full post-training reference — raw `torchrun`, resuming, every TOML field, parallelism / LoRA / EMA knobs, and the VFM↔VLM remap — see the canonical framework docs:
+
+- [Post-Training (SFT) guide](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/training.md)
+- [SFT structured-TOML config reference](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/sft_config.md)
+- [JSONL dataset format](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/dataset_jsonl.md) · [environment variables](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/environment_variables.md) · [FAQ / OOM during SFT](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/faq.md)
+
+> SFT here is a multi-GPU `torchrun` job, so these cookbooks ship as launch scripts + this README rather than a one-click notebook.
diff --git a/cookbooks/cosmos3/generator/audiovisual/finetune/launch_sft_vision_nano.sh b/cookbooks/cosmos3/generator/audiovisual/finetune/launch_sft_vision_nano.sh
new file mode 100644
index 00000000..52b3d9f2
--- /dev/null
+++ b/cookbooks/cosmos3/generator/audiovisual/finetune/launch_sft_vision_nano.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# Complete recipe: Vision SFT on Cosmos3-Nano (T2V / I2V / V2V, 8x H100).
+# Run from this folder with the cosmos-framework venv active (see README):
+#   bash launch_sft_vision_nano.sh
+# It downloads the data, prepares the base checkpoint, and trains — in order.
+# Paths are fixed under this (git-ignored) folder; edit them below to relocate.
+
+set -euo pipefail
+cd "$(dirname "${BASH_SOURCE[0]}")"
+
+DATASET_DIR="$PWD/data/BridgeData2-Subset-Synthetic-Captions"
+CHECKPOINT_DIR="$PWD/checkpoints/Cosmos3-Nano"
+VAE_PATH="$PWD/checkpoints/wan22_vae/Wan2.2_VAE.pth"
+
+# 1. Download the SFT dataset (skipped if present; license-gated — accept terms + 'uvx hf@latest auth login').
+if [[ ! -f "$DATASET_DIR/sft_dataset_bridge/train/video_dataset_file.jsonl" ]]; then
+    uvx hf@latest download --repo-type dataset nvidia/BridgeData2-Subset-Synthetic-Captions \
+        --revision 40d018ac1c1a2a4b9734f17fdb21f3d933c49a01 --local-dir "$DATASET_DIR"
+fi
+
+# 2. Download the Wan2.2 VAE (skipped if present).
+if [[ ! -f "$VAE_PATH" ]]; then
+    uvx hf@latest download Wan-AI/Wan2.2-TI2V-5B Wan2.2_VAE.pth --local-dir "$(dirname "$VAE_PATH")"
+fi
+
+# 3. Convert the base checkpoint to DCP (skipped if present).
+if [[ ! -d "$CHECKPOINT_DIR" ]]; then
+    python -m cosmos_framework.scripts.convert_model_to_dcp -o "$CHECKPOINT_DIR" --checkpoint-path Cosmos3-Nano
+fi
+
+# 4. Train (8-GPU FSDP). The TOML reads these three paths from the environment.
+export DATASET_PATH="$DATASET_DIR/sft_dataset_bridge"
+export BASE_CHECKPOINT_PATH="$CHECKPOINT_DIR"
+export WAN_VAE_PATH="$VAE_PATH"
+IMAGINAIRE_OUTPUT_ROOT="$PWD/outputs/train" torchrun --nproc_per_node=8 \
+    -m cosmos_framework.scripts.train --sft-toml="toml/sft_config/vision_sft_nano.toml"
diff --git a/cookbooks/cosmos3/generator/audiovisual/finetune/launch_sft_vision_super.sh b/cookbooks/cosmos3/generator/audiovisual/finetune/launch_sft_vision_super.sh
new file mode 100644
index 00000000..e4dd114d
--- /dev/null
+++ b/cookbooks/cosmos3/generator/audiovisual/finetune/launch_sft_vision_super.sh
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# Complete recipe: Vision LoRA SFT on Cosmos3-Super (T2V / I2V / V2V, 8x H100).
+# Run from this folder with the cosmos-framework venv active (see README):
+#   bash launch_sft_vision_super.sh
+# It downloads the data, prepares the base checkpoint, and trains — in order.
+# Paths are fixed under this (git-ignored) folder; edit them below to relocate.
+
+set -euo pipefail
+cd "$(dirname "${BASH_SOURCE[0]}")"
+
+DATASET_DIR="$PWD/data/BridgeData2-Subset-Synthetic-Captions"
+CHECKPOINT_DIR="$PWD/checkpoints/Cosmos3-Super"
+VAE_PATH="$PWD/checkpoints/wan22_vae/Wan2.2_VAE.pth"
+
+# 1. Download the SFT dataset (skipped if present; license-gated — accept terms + 'uvx hf@latest auth login').
+if [[ ! -f "$DATASET_DIR/sft_dataset_bridge/train/video_dataset_file.jsonl" ]]; then
+    uvx hf@latest download --repo-type dataset nvidia/BridgeData2-Subset-Synthetic-Captions \
+        --revision 40d018ac1c1a2a4b9734f17fdb21f3d933c49a01 --local-dir "$DATASET_DIR"
+fi
+
+# 2. Download the Wan2.2 VAE (skipped if present).
+if [[ ! -f "$VAE_PATH" ]]; then
+    uvx hf@latest download Wan-AI/Wan2.2-TI2V-5B Wan2.2_VAE.pth --local-dir "$(dirname "$VAE_PATH")"
+fi
+
+# 3. Convert the base checkpoint to DCP (skipped if present).
+if [[ ! -d "$CHECKPOINT_DIR" ]]; then
+    python -m cosmos_framework.scripts.convert_model_to_dcp -o "$CHECKPOINT_DIR" --checkpoint-path Cosmos3-Super
+fi
+
+# 4. Train (8-GPU FSDP, CP=2 / DP=4). LD_LIBRARY_PATH is cleared so host CUDA/NCCL
+#    libs don't leak into the venv; expandable_segments keeps the 32B backbone from OOMing.
+export LD_LIBRARY_PATH=""
+export PYTORCH_ALLOC_CONF="expandable_segments:True"
+export DATASET_PATH="$DATASET_DIR/sft_dataset_bridge"
+export BASE_CHECKPOINT_PATH="$CHECKPOINT_DIR"
+export WAN_VAE_PATH="$VAE_PATH"
+IMAGINAIRE_OUTPUT_ROOT="$PWD/outputs/train" torchrun --nproc_per_node=8 \
+    -m cosmos_framework.scripts.train --sft-toml="toml/sft_config/vision_sft_super.toml"
diff --git a/cookbooks/cosmos3/finetune/toml/sft_config/vision_sft_nano.toml b/cookbooks/cosmos3/generator/audiovisual/finetune/toml/sft_config/vision_sft_nano.toml
similarity index 100%
rename from cookbooks/cosmos3/finetune/toml/sft_config/vision_sft_nano.toml
rename to cookbooks/cosmos3/generator/audiovisual/finetune/toml/sft_config/vision_sft_nano.toml
diff --git a/cookbooks/cosmos3/finetune/toml/sft_config/vision_sft_super.toml b/cookbooks/cosmos3/generator/audiovisual/finetune/toml/sft_config/vision_sft_super.toml
similarity index 100%
rename from cookbooks/cosmos3/finetune/toml/sft_config/vision_sft_super.toml
rename to cookbooks/cosmos3/generator/audiovisual/finetune/toml/sft_config/vision_sft_super.toml
diff --git a/cookbooks/cosmos3/reasoner/finetune/README.md b/cookbooks/cosmos3/reasoner/finetune/README.md
new file mode 100644
index 00000000..ff7816da
--- /dev/null
+++ b/cookbooks/cosmos3/reasoner/finetune/README.md
@@ -0,0 +1,58 @@
+# Cosmos3 Reasoner Fine-Tuning (SFT)
+
+Supervised fine-tuning (SFT) of the Cosmos3 Reasoner (VLM) on your own data. Tested on 8×H100 (80 GB).
+
+| Recipe | Launch shell | Dataset | Notes |
+| --- | --- | --- | --- |
+| Alignment SFT (LLaVA-OneVision) | `launch_sft_llava_ov.sh` | [lmms-lab/LLaVA-OneVision-Data](https://huggingface.co/datasets/lmms-lab/LLaVA-OneVision-Data) | Streams from HF; backbone fetched at startup — no local prep |
+| Physical-plausibility SFT (VideoPhy-2) | `launch_sft_videophy2_nano.sh` | [videophysics/videophy2_train](https://huggingface.co/datasets/videophysics/videophy2_train) | 1–5 plausibility scoring; dataset + checkpoint auto-prepared |
+
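+Both use `[job].task = "vlm"`. The LLaVA-OneVision recipe bootstraps from the public `Qwen/Qwen3-VL-8B-Instruct` backbone fetched at startup; the VideoPhy-2 recipe trains from a merged Cosmos3-Nano VLM checkpoint (Cosmos3-Nano LM on the Qwen3-VL-8B-Instruct visual tower) that its launcher builds.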
+
+## Prerequisites
+
+1. **Install the framework.** These recipes drive `cosmos_framework.scripts.train`, so install a cosmos-framework checkout first — follow the shared [Cosmos Framework setup](../../README.md#cosmos-framework) (clone into `packages/cosmos3`, then `uv sync --all-extras --group=cu130-train`; use `cu128-train` on a CUDA 12.x driver).
+2. **Recommended container.** For a curated CUDA + PyTorch base, NVIDIA recommends starting from the NGC PyTorch container **`nvcr.io/nvidia/pytorch:25.09-py3`** (CUDA 13; use **`:25.06-py3`** for a CUDA 12.8 driver). See the framework [setup guide](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/setup.md#recommended-base-image).
+3. **Activate** the framework venv so `cosmos_framework` is importable: `source <path-to>/packages/cosmos3/.venv/bin/activate`.
+4. **Hugging Face access.** The Qwen3-VL backbone and datasets are fetched from HF — authenticate once with `uvx hf@latest auth login` (or export `HF_TOKEN`); accept any dataset terms first.
+5. **Run from this directory** (`cookbooks/cosmos3/reasoner/finetune/`). Any downloads, converted checkpoints, and run outputs default to `data/`, `checkpoints/`, and `outputs/` here (all git-ignored).
+
+## Quick start
+
+Each launcher is a complete recipe — just run it from this folder:
+
+```shell
+bash launch_sft_llava_ov.sh          # alignment SFT; dataset streams from HF, backbone fetched at startup
+# or
+bash launch_sft_videophy2_nano.sh    # first run materializes VideoPhy-2 + builds the merged Cosmos3-Nano VLM checkpoint, then trains
+```
+
+The VideoPhy-2 download/convert steps are skipped once their outputs exist. Its paths are fixed at the top of `launch_sft_videophy2_nano.sh` and point into this folder's git-ignored `data/` and `checkpoints/` directories — edit them there to relocate data or checkpoints.
+
+## Outputs
+
+Training writes to `outputs/train/<project>/<group>/<name>/`:
+
+- `checkpoints/iter_<N>/` — DCP checkpoint (model / optim / scheduler / trainer state); `checkpoints/latest_checkpoint.txt` names the newest.
+- `config.yaml`, launch metadata, logs, and one directory per registered callback.
+
+## Export to Hugging Face safetensors
+
+```shell
+RUN_DIR=outputs/train/<project>/<group>/<name>
+CKPT=$RUN_DIR/checkpoints/$(cat "$RUN_DIR/checkpoints/latest_checkpoint.txt")
+python -m cosmos_framework.scripts.export_model \
+    --checkpoint-path "$CKPT" --config-file "$RUN_DIR/config.yaml" -o "$RUN_DIR/model"
+```
+
+Use the exported `$RUN_DIR/model` with the [reasoner inference cookbook](../README.md).
+
+## Advanced configuration
+
+These recipes are intentionally minimal. For the full post-training reference — raw `torchrun`, resuming, every TOML field, and advanced parallelism — see the canonical framework docs:
+
+- [Post-Training (SFT) guide](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/training.md)
+- [SFT structured-TOML config reference](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/sft_config.md)
+- [JSONL dataset format](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/dataset_jsonl.md) · [environment variables](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/environment_variables.md) · [FAQ / OOM during SFT](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/faq.md)
+
+> SFT here is a multi-GPU `torchrun` job, so these cookbooks ship as launch scripts + this README rather than a one-click notebook.
diff --git a/cookbooks/cosmos3/reasoner/finetune/launch_sft_llava_ov.sh b/cookbooks/cosmos3/reasoner/finetune/launch_sft_llava_ov.sh
new file mode 100644
index 00000000..844f5a3b
--- /dev/null
+++ b/cookbooks/cosmos3/reasoner/finetune/launch_sft_llava_ov.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# Complete recipe: Reasoner alignment SFT on LLaVA-OneVision (8x H100).
+# Run from this folder with the cosmos-framework venv active (see README):
+#   bash launch_sft_llava_ov.sh
+# The dataset streams from HuggingFace and the Qwen3-VL-8B-Instruct backbone is
+# fetched at startup, so there's nothing to download first — this just trains.
+
+set -euo pipefail
+cd "$(dirname "${BASH_SOURCE[0]}")"
+
+# Train (8-GPU FSDP).
+IMAGINAIRE_OUTPUT_ROOT="$PWD/outputs/train" torchrun --nproc_per_node=8 \
+    -m cosmos_framework.scripts.train --sft-toml="toml/sft_config/llava_ov.toml"
diff --git a/cookbooks/cosmos3/reasoner/finetune/launch_sft_videophy2_nano.sh b/cookbooks/cosmos3/reasoner/finetune/launch_sft_videophy2_nano.sh
new file mode 100644
index 00000000..30648a8a
--- /dev/null
+++ b/cookbooks/cosmos3/reasoner/finetune/launch_sft_videophy2_nano.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# Complete recipe: Reasoner physical-plausibility SFT on VideoPhy-2 (8x H100).
+# Run from this folder with the cosmos-framework venv active (see README):
+#   bash launch_sft_videophy2_nano.sh
+# It materializes the dataset, builds the merged Cosmos3-Nano VLM checkpoint, and
+# trains — in order. Paths are fixed under this (git-ignored) folder.
+
+set -euo pipefail
+cd "$(dirname "${BASH_SOURCE[0]}")"
+
+VIDEOPHYSICS_ROOT="$PWD/data/videophysics"
+VLM_CHECKPOINT="$PWD/checkpoints/Cosmos3-Nano-VLM"
+
+# 1. Materialize the VideoPhy-2 dataset (skipped if present).
+if [[ ! -d "$VIDEOPHYSICS_ROOT/videophy2_train" ]]; then
+    python -m cosmos_framework.scripts.vlm.prepare_videophy2_from_hf --out_root "$VIDEOPHYSICS_ROOT" --split both
+fi
+
+# 2. Merge Cosmos3-Nano LM onto the Qwen3-VL-8B-Instruct visual tower (skipped if present).
+if [[ ! -d "$VLM_CHECKPOINT" ]]; then
+    python -m cosmos_framework.scripts.convert_model_to_vlm_safetensors --checkpoint-path Cosmos3-Nano -o "$VLM_CHECKPOINT"
+fi
+
+# 3. Train (8-GPU FSDP). VIDEOPHYSICS_ROOT is read from the environment; the
+#    merged checkpoint is supplied as a config override after `--`.
+export VIDEOPHYSICS_ROOT
+IMAGINAIRE_OUTPUT_ROOT="$PWD/outputs/train" torchrun --nproc_per_node=8 \
+    -m cosmos_framework.scripts.train --sft-toml="toml/sft_config/videophy2_sft_nano.toml" \
+    -- model.config.policy.backbone.safetensors_path="$VLM_CHECKPOINT"
diff --git a/cookbooks/cosmos3/finetune/toml/sft_config/llava_ov.toml b/cookbooks/cosmos3/reasoner/finetune/toml/sft_config/llava_ov.toml
similarity index 100%
rename from cookbooks/cosmos3/finetune/toml/sft_config/llava_ov.toml
rename to cookbooks/cosmos3/reasoner/finetune/toml/sft_config/llava_ov.toml
diff --git a/cookbooks/cosmos3/finetune/toml/sft_config/videophy2_sft_nano.toml b/cookbooks/cosmos3/reasoner/finetune/toml/sft_config/videophy2_sft_nano.toml
similarity index 100%
rename from cookbooks/cosmos3/finetune/toml/sft_config/videophy2_sft_nano.toml
rename to cookbooks/cosmos3/reasoner/finetune/toml/sft_config/videophy2_sft_nano.toml