From e986d0eceb13858b565b2fef3a49e981a1260eda Mon Sep 17 00:00:00 2001 From: Maosheng Liao Date: Tue, 16 Jun 2026 06:59:57 -0700 Subject: [PATCH 1/6] feat(finetune): vendor the 4 Cosmos3 SFT recipe TOMLs --- .../finetune/toml/sft_config/llava_ov.toml | 108 ++++++++++++++++++ .../toml/sft_config/videophy2_sft_nano.toml | 91 +++++++++++++++ .../toml/sft_config/vision_sft_nano.toml | 91 +++++++++++++++ .../toml/sft_config/vision_sft_super.toml | 92 +++++++++++++++ 4 files changed, 382 insertions(+) create mode 100644 cookbooks/cosmos3/finetune/toml/sft_config/llava_ov.toml create mode 100644 cookbooks/cosmos3/finetune/toml/sft_config/videophy2_sft_nano.toml create mode 100644 cookbooks/cosmos3/finetune/toml/sft_config/vision_sft_nano.toml create mode 100644 cookbooks/cosmos3/finetune/toml/sft_config/vision_sft_super.toml diff --git a/cookbooks/cosmos3/finetune/toml/sft_config/llava_ov.toml b/cookbooks/cosmos3/finetune/toml/sft_config/llava_ov.toml new file mode 100644 index 00000000..41fe3502 --- /dev/null +++ b/cookbooks/cosmos3/finetune/toml/sft_config/llava_ov.toml @@ -0,0 +1,108 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +# pre_exp012_llava_ov — VLM training on lmms-lab/LLaVA-OneVision-Data +# via CosmosDataLoader. Base config = cosmos_framework/configs/base/vlm/config.py +# (selected by [job].task="vlm"). +# +# One knob that the SFTExperimentConfig dataclass does NOT model — supply +# it as a CLI extra override at launch time: +# +# data_setting.max_tokens= +# +# (The backbone is now modeled — see [model.backbone] below.) 
+# +# Example launch: +# torchrun --nproc_per_node=4 -m cosmos_framework.scripts.train \ +# --sft-toml toml/sft_config/llava_ov.toml -- \ +# data_setting.max_tokens=16000 +# +# Per-task remap (see _PATH_REMAPS["vlm"]): +# model.parallelism.* -> model.config.parallelism.* +# model.compile.* -> model.config.compile.* +# model.activation_checkpointing.* -> model.config.activation_checkpointing.* +# model.precision -> model.config.precision +# model.attn_implementation -> model.config.policy.attn_implementation +# model.backbone.* -> model.config.policy.backbone.* +# model.ema.* -> model.config.ema.* +# model.{max_num_tokens_after_packing, joint_attn_implementation, lora_*, +# tokenizer.*} and dataloader_train.{max_sequence_length, seed} -> SKIPPED + +[job] +task = "vlm" +experiment = "pre_exp012_llava_ov" +project = "cosmos3" # matches legacy +group = "vlm_llava_ov_demo" +name = "pre_exp012_llava_ov" +wandb_mode = "disabled" + +[model] +# VLM-only attention impl (PolicyConfig.attn_implementation). 
+attn_implementation = "cosmos" # "cosmos" | "flash_attention_2" | "sdpa" | "eager" +precision = "bfloat16" # was [model.parallelism].precision + +[model.backbone] +model_name = "Qwen/Qwen3-VL-8B-Instruct" # → model.config.policy.backbone.model_name (VLM remap) + +[model.ema] +enabled = false +rate = 0.1 +iteration_shift = 0 + +[model.parallelism] +data_parallel_shard_degree = 8 # matches legacy dp_shard_size=8 +data_parallel_replicate_degree = -1 # matches legacy dp_replicate_size=-1 +context_parallel_shard_degree = 1 +cfg_parallel_shard_degree = 1 + +[model.compile] +enabled = false # was [model.parallelism].use_torch_compile +compile_dynamic = true + +[model.activation_checkpointing] +mode = "full" +save_ops_regex = ["fmha"] +preserve_rng_state = true +determinism_check = "default" + +[optimizer] +betas = [0.9, 0.95] +eps = 1.0e-8 # skipped for VLM by _PATH_REMAPS +fused = true +lr = 1.0e-5 # matches legacy +weight_decay = 0.1 # matches legacy +# keys_to_select / lr_multipliers omitted — VLM Trainer defaults apply. + +[scheduler] +cycle_lengths = [500] # matches legacy (VLM_LAMBDACOSINE_KWARGS uses ${trainer.max_iter}) +f_max = [1.0] +f_min = [0.5] # matches legacy +f_start = [0.05] # matches legacy +verbosity_interval = 0 # skipped for VLM by _PATH_REMAPS +warm_up_steps = [1000] # matches legacy + +[trainer] +distributed_parallelism = "fsdp" +grad_accum_iter = 1 +logging_iter = 1 +max_iter = 500 # matches legacy + +[trainer.callbacks.compile_tokenizer] +compile_after_iterations = 3 +enabled = false + +[trainer.callbacks.grad_clip] +clip_norm = 1.0 +force_finite = false # matches VLM default in cosmos_framework/configs/base/vlm/defaults/callbacks.py:55 + +[checkpoint] +keys_to_skip_loading = [] +load_path = "???" 
# MISSING sentinel; skipped by build_hydra_overrides — supply at runtime +save_iter = 100 + +[dataloader_train] +# Routed by PATH_REMAPS["vlm"] onto the CosmosDataLoader's nested PoolPackingBatcher: +# max_samples_per_batch -> dataloader_train.batcher.max_batch_size +# max_sequence_length -> dataloader_train.batcher.max_tokens +max_samples_per_batch = 1 +max_sequence_length = 16000 diff --git a/cookbooks/cosmos3/finetune/toml/sft_config/videophy2_sft_nano.toml b/cookbooks/cosmos3/finetune/toml/sft_config/videophy2_sft_nano.toml new file mode 100644 index 00000000..fa1ae613 --- /dev/null +++ b/cookbooks/cosmos3/finetune/toml/sft_config/videophy2_sft_nano.toml @@ -0,0 +1,91 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +# videophy2_sft_nano — VLM dialog SFT on VideoPhy-2 via CosmosDataLoader. +# Base config = cosmos_framework/configs/base/vlm/config.py (selected by [job].task="vlm"). +# +# Dataset prep: +# python -m cosmos_framework.scripts.vlm.prepare_videophy2_from_hf \ +# --out_root $VIDEOPHYSICS_ROOT --split train # and again with --split val +# +# Required env at launch: VIDEOPHYSICS_ROOT (read by the experiment Python). 
+# +# Example launch: +# bash launch_sft_videophy2_nano.sh + +[job] +task = "vlm" +experiment = "videophy2_sft_nano" +project = "cosmos3" +group = "vlm_videophy2_sft" +name = "videophy2_sft_nano" +wandb_mode = "disabled" + +[model] +attn_implementation = "cosmos" +precision = "bfloat16" # was [model.parallelism].precision + +[model.backbone] +model_name = "Qwen/Qwen3-VL-8B-Instruct" + +[model.ema] +enabled = false +rate = 0.1 +iteration_shift = 0 + +[model.parallelism] +data_parallel_shard_degree = 8 +data_parallel_replicate_degree = -1 +context_parallel_shard_degree = 1 +cfg_parallel_shard_degree = 1 + +[model.compile] +enabled = false # was [model.parallelism].use_torch_compile +compile_dynamic = true + +[model.activation_checkpointing] +mode = "full" +save_ops_regex = ["fmha"] +preserve_rng_state = true +determinism_check = "default" + +[optimizer] +betas = [0.9, 0.95] +eps = 1.0e-8 +fused = true +lr = 1.0e-6 +weight_decay = 0.1 + +[scheduler] +cycle_lengths = [50] +f_max = [1.0] +f_min = [0.1] +f_start = [0.05] +verbosity_interval = 0 +warm_up_steps = [5] + +[trainer] +distributed_parallelism = "fsdp" +grad_accum_iter = 8 +logging_iter = 1 +max_iter = 50 + +[trainer.callbacks.compile_tokenizer] +compile_after_iterations = 3 +enabled = false + +[trainer.callbacks.grad_clip] +clip_norm = 1.0 +force_finite = false + +[checkpoint] +keys_to_skip_loading = [] +load_path = "???" 
+save_iter = 100 + +[dataloader_train] +# Routed by PATH_REMAPS["vlm"] onto the CosmosDataLoader's nested PoolPackingBatcher: +# max_samples_per_batch -> dataloader_train.batcher.max_batch_size +# max_sequence_length -> dataloader_train.batcher.max_tokens +max_samples_per_batch = 1 +max_sequence_length = 16000 diff --git a/cookbooks/cosmos3/finetune/toml/sft_config/vision_sft_nano.toml b/cookbooks/cosmos3/finetune/toml/sft_config/vision_sft_nano.toml new file mode 100644 index 00000000..dbb192dc --- /dev/null +++ b/cookbooks/cosmos3/finetune/toml/sft_config/vision_sft_nano.toml @@ -0,0 +1,91 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +# vision_sft_nano — T2V / I2V / V2V vision-only SFT (Qwen3-VL-8B / nano) +# Consumed by cosmos_framework.configs.toml_config.sft_config.load_experiment_from_toml. +# Uses PackingDataLoader (no dataloader_train.seed slot — keep it omitted here). + +[job] +task = "vfm" +experiment = "vision_sft_nano" +project = "cosmos3" +group = "sft" +name = "vision_sft_nano" +wandb_mode = "disabled" + +[model] +max_num_tokens_after_packing = 45056 +joint_attn_implementation = "two_way" +precision = "bfloat16" # was [model.parallelism].precision + +[model.ema] +enabled = true +rate = 0.1 +iteration_shift = 0 + +[model.parallelism] +data_parallel_shard_degree = -1 # -1 = auto from WORLD_SIZE (matches legacy) +data_parallel_replicate_degree = 1 + +[model.compile] +enabled = true # was [model.parallelism].use_torch_compile +compile_dynamic = true + +[model.activation_checkpointing] +mode = "full" +save_ops_regex = ["fmha"] +preserve_rng_state = true +determinism_check = "default" + +[model.tokenizer] +vae_path = "${oc.env:WAN_VAE_PATH}" + +[optimizer] +betas = [0.9, 0.95] +eps = 1.0e-6 +fused = true +keys_to_select = [ + "moe_gen", + "time_embedder", + "vae2llm", + "llm2vae", +] +lr = 2.0e-5 +weight_decay = 0 # int matches legacy YAML repr +# 
lr_multipliers intentionally empty for vision SFT (Hydra default {} stands). + +[scheduler] +cycle_lengths = [1000] +f_max = [1.0] +f_min = [0.0] +f_start = [0.0] +verbosity_interval = 0 +warm_up_steps = [50] + +[trainer] +distributed_parallelism = "fsdp" +grad_accum_iter = 2 +logging_iter = 1 +max_iter = 500 + +[trainer.callbacks.compile_tokenizer] +compile_after_iterations = 3 +enabled = false +# warmup_resolutions omitted (None at experiment level) + +[trainer.callbacks.grad_clip] +clip_norm = 0.1 +force_finite = true + +[checkpoint] +keys_to_skip_loading = ["net_ema."] +load_path = "${oc.env:BASE_CHECKPOINT_PATH}" +save_iter = 100 + +[dataloader_train] +max_sequence_length = 45056 +# Per-caption token cap before truncation. Structured-JSON captions run longer than +# dense prose (measured max ~1790 tokens), so keep headroom; raise it for longer captions. +max_caption_tokens = 2048 +# max_samples_per_batch omitted (None — PackingDataLoader doesn't cap by sample count) +# seed omitted — PackingDataLoader has no seed ctor kwarg diff --git a/cookbooks/cosmos3/finetune/toml/sft_config/vision_sft_super.toml b/cookbooks/cosmos3/finetune/toml/sft_config/vision_sft_super.toml new file mode 100644 index 00000000..06a1574a --- /dev/null +++ b/cookbooks/cosmos3/finetune/toml/sft_config/vision_sft_super.toml @@ -0,0 +1,92 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +# vision_sft_super — LoRA-only T2V/I2V/V2V SFT on Qwen3-VL-32B (super tier). +# Consumed by cosmos_framework.configs.toml_config.sft_config.load_experiment_from_toml. +# Uses PackingDataLoader (no dataloader_train.seed slot — keep it omitted). 
+ +[job] +task = "vfm" +experiment = "vision_sft_super" +project = "cosmos3" +group = "sft" +name = "vision_sft_super" +wandb_mode = "disabled" + +[model] +max_num_tokens_after_packing = 45056 +joint_attn_implementation = "two_way" +lora_enabled = true +lora_rank = 16 +lora_alpha = 32 +lora_target_modules = "q_proj_moe_gen,k_proj_moe_gen,v_proj_moe_gen,o_proj_moe_gen" +precision = "bfloat16" # was [model.parallelism].precision + +[model.ema] +enabled = false # super uses LoRA, no EMA +rate = 0.1 +iteration_shift = 0 + +[model.parallelism] +data_parallel_shard_degree = -1 # -1 = auto from WORLD_SIZE (matches legacy) +data_parallel_replicate_degree = 1 +context_parallel_shard_degree = 2 # super uses CP=2 +cfg_parallel_shard_degree = 1 + +[model.compile] +enabled = false # super disables compile (was use_torch_compile) +compile_dynamic = true + +[model.activation_checkpointing] +mode = "full" +save_ops_regex = ["fmha"] +preserve_rng_state = true +determinism_check = "default" + +[model.tokenizer] +vae_path = "${oc.env:WAN_VAE_PATH}" + +[optimizer] +betas = [0.9, 0.95] +eps = 1.0e-6 +fused = true +keys_to_select = ["lora_"] # train LoRA adapters only +lr = 5.0e-4 +weight_decay = 0 # int matches legacy YAML repr +# lr_multipliers intentionally empty. + +[scheduler] +cycle_lengths = [1000] +f_max = [1.0] +f_min = [0.0] +f_start = [0.0] +verbosity_interval = 0 +warm_up_steps = [50] + +[trainer] +distributed_parallelism = "fsdp" +grad_accum_iter = 2 +logging_iter = 1 +max_iter = 500 + +[trainer.callbacks.compile_tokenizer] +compile_after_iterations = 3 +enabled = false +warmup_resolutions = ["256", "480", "720"] + +[trainer.callbacks.grad_clip] +clip_norm = 0.1 +force_finite = true + +[checkpoint] +keys_to_skip_loading = ["net_ema.", "lora_"] # LoRA tensors freshly init +load_path = "${oc.env:BASE_CHECKPOINT_PATH}" +save_iter = 100 + +[dataloader_train] +max_sequence_length = 45056 +# Per-caption token cap before truncation. 
Structured-JSON captions run longer than
+# dense prose (measured max ~1790 tokens), so keep headroom; raise it for longer captions.
+max_caption_tokens = 2048
+# max_samples_per_batch omitted (None — PackingDataLoader doesn't cap by count)
+# seed omitted — PackingDataLoader has no seed ctor kwarg
From 1cb31427da4479ce259e2019d8c0911bd4ba796a Mon Sep 17 00:00:00 2001
From: Maosheng Liao
Date: Tue, 16 Jun 2026 07:00:53 -0700
Subject: [PATCH 2/6] feat(finetune): vendor + rewire shared SFT launcher for the cookbook

---
 .../cosmos3/finetune/_sft_launcher_common.sh | 98 +++++++++++++++++++
 1 file changed, 98 insertions(+)
 create mode 100644 cookbooks/cosmos3/finetune/_sft_launcher_common.sh

diff --git a/cookbooks/cosmos3/finetune/_sft_launcher_common.sh b/cookbooks/cosmos3/finetune/_sft_launcher_common.sh
new file mode 100644
index 00000000..377b10f9
--- /dev/null
+++ b/cookbooks/cosmos3/finetune/_sft_launcher_common.sh
@@ -0,0 +1,98 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# Shared launch plumbing for the cookbook launch_sft_<recipe>.sh wrappers — the
+# structured-TOML / pydantic-schema flow that drives cosmos_framework.scripts.train.
+#
+# REQUIRES: an activated cosmos-framework venv (see the finetune README
+# Prerequisites) so `cosmos_framework` is importable. This launcher does NOT
+# add the framework's .venv/bin to PATH.
+#
+# Caller MUST set before sourcing:
+#   TOML_FILE              recipe TOML, e.g. "toml/sft_config/<recipe>.toml".
+#                          Absolute or cookbook-relative.
+#
+# Caller MAY set before sourcing (presence drives which existence checks fire):
+#   DATASET_PATH           recipe-local dataset dir, e.g. "data/<dataset>".
+#                          If unset, no dataset existence check fires
+#                          (reasoner / HF-streaming case).
+#   BASE_CHECKPOINT_PATH   recipe-local base DCP dir, e.g. "checkpoints/<model>".
+#                          Setting it also enables WAN_VAE_PATH plumbing + check.
+#   WAN_VAE_PATH           override the default checkpoints/wan22_vae/Wan2.2_VAE.pth.
+#   EXTRA_DATASET_CHECK    bash snippet (string) eval'd after the default checks.
+#   TAIL_OVERRIDES         bash array of Hydra CLI overrides appended after `--`
+#                          (e.g. data_setting.max_tokens=16000 for VLM smokes).
+#   MASTER_PORT            torchrun --master_port; default 50012.
+#   NPROC_PER_NODE         torchrun --nproc_per_node; default 8.
+#   OUTPUT_ROOT            output root, default outputs/train; also the default IMAGINAIRE_OUTPUT_ROOT.
+#   LOG_FILENAME           log file name under $OUTPUT_ROOT/logs (default <toml_stem>_sft.log).
+#
+# Absolute paths are passed through; relative paths are anchored to the cookbook
+# dir (the directory containing this launcher). Values exported by the caller win
+# over wrapper defaults (wrappers set defaults with the `: "${VAR:=default}"` idiom).
+
+set -uo pipefail
+
+: "${TOML_FILE:?TOML_FILE must be set before sourcing _sft_launcher_common.sh}"
+
+# Cookbook dir = the wrapper's own directory (cookbooks/cosmos3/finetune/).
+WORKDIR="$(cd "$(dirname "${BASH_SOURCE[1]}")" && pwd)"
+
+# Anchor relative paths to $WORKDIR.
+[[ "$TOML_FILE" = /* ]] || TOML_FILE="$WORKDIR/$TOML_FILE"
+
+if [[ -n "${DATASET_PATH:-}" ]]; then
+  [[ "$DATASET_PATH" = /* ]] || DATASET_PATH="$WORKDIR/$DATASET_PATH"
+  export DATASET_PATH
+fi
+
+if [[ -n "${BASE_CHECKPOINT_PATH:-}" ]]; then
+  [[ "$BASE_CHECKPOINT_PATH" = /* ]] || BASE_CHECKPOINT_PATH="$WORKDIR/$BASE_CHECKPOINT_PATH"
+  WAN_VAE_PATH="${WAN_VAE_PATH:-checkpoints/wan22_vae/Wan2.2_VAE.pth}"
+  [[ "$WAN_VAE_PATH" = /* ]] || WAN_VAE_PATH="$WORKDIR/$WAN_VAE_PATH"
+  export BASE_CHECKPOINT_PATH WAN_VAE_PATH
+fi
+
+OUTPUT_ROOT="${OUTPUT_ROOT:-outputs/train}"
+[[ "$OUTPUT_ROOT" = /* ]] || OUTPUT_ROOT="$WORKDIR/$OUTPUT_ROOT"  # mkdir runs before `cd`, tee after it
+LOG_DIR="$OUTPUT_ROOT/logs"
+TOML_STEM="$(basename "$TOML_FILE" .toml)"
+LOG_FILE="$LOG_DIR/${LOG_FILENAME:-${TOML_STEM}_sft.log}"
+IMAGINAIRE_OUTPUT_ROOT="${IMAGINAIRE_OUTPUT_ROOT:-$OUTPUT_ROOT}"
+mkdir -p "$LOG_DIR"
+
+echo ">>> $(date '+%H:%M:%S') Checking inputs..."
+[[ -f "$TOML_FILE" ]] || { echo "ERROR: TOML not found: $TOML_FILE" >&2; exit 1; }
+if [[ -n "${DATASET_PATH:-}" ]]; then
+  [[ -d "$DATASET_PATH" ]] || { echo "ERROR: DATASET_PATH not found: $DATASET_PATH (run Step 1 of the finetune README, or export DATASET_PATH=<path>)" >&2; exit 1; }
+fi
+if [[ -n "${BASE_CHECKPOINT_PATH:-}" ]]; then
+  [[ -d "$BASE_CHECKPOINT_PATH" ]] || { echo "ERROR: BASE_CHECKPOINT_PATH not found: $BASE_CHECKPOINT_PATH (run Step 2 of the finetune README, or export BASE_CHECKPOINT_PATH=<path>)" >&2; exit 1; }
+  [[ -f "$WAN_VAE_PATH" ]] || { echo "ERROR: WAN_VAE_PATH not found: $WAN_VAE_PATH (run Step 1 of the finetune README, or export WAN_VAE_PATH=<path>)" >&2; exit 1; }
+fi
+if [[ -n "${EXTRA_DATASET_CHECK:-}" ]]; then eval "$EXTRA_DATASET_CHECK"; fi
+
+cd "$WORKDIR"
+echo ">>> $(date '+%H:%M:%S') WORKDIR: $WORKDIR"
+echo ">>> $(date '+%H:%M:%S') TOML: $TOML_FILE"
+[[ -n "${DATASET_PATH:-}" ]] && echo ">>> $(date '+%H:%M:%S') dataset: $DATASET_PATH"
+[[ -n "${BASE_CHECKPOINT_PATH:-}" ]] && echo ">>> $(date '+%H:%M:%S') checkpoint: $BASE_CHECKPOINT_PATH"
+echo ">>> $(date '+%H:%M:%S') log: $LOG_FILE"
+
+# Default empty if caller didn't set; safe under set -u.
+[[ ${TAIL_OVERRIDES+x} ]] || TAIL_OVERRIDES=()
+
+TRAILING_ARGS=()
+if (( ${#TAIL_OVERRIDES[@]} > 0 )); then
+  TRAILING_ARGS=(-- "${TAIL_OVERRIDES[@]}")
+fi
+# ${arr[@]+"${arr[@]}"}: before bash 4.4, expanding an empty array under `set -u` is an "unbound variable" error.
+IMAGINAIRE_OUTPUT_ROOT="$IMAGINAIRE_OUTPUT_ROOT" \
+  torchrun --nproc_per_node="${NPROC_PER_NODE:-8}" --master_port="${MASTER_PORT:-50012}" -m cosmos_framework.scripts.train \
+  --sft-toml="$TOML_FILE" \
+  ${TRAILING_ARGS[@]+"${TRAILING_ARGS[@]}"} \
+  2>&1 | tee "$LOG_FILE"
+
+EXIT_CODE=${PIPESTATUS[0]}
+echo ">>> $(date '+%H:%M:%S') Done (exit $EXIT_CODE)"
+exit $EXIT_CODE
From ca03aa042f4429e7f63ee6dd5067f9e07452beae Mon Sep 17 00:00:00 2001
From: Maosheng Liao
Date: Tue, 16 Jun 2026 07:01:57 -0700
Subject: [PATCH 3/6] feat(finetune): vendor + rewire the 4 SFT launch shells

---
 .../cosmos3/finetune/launch_sft_llava_ov.sh   | 43 ++++++++++++++++++
 .../finetune/launch_sft_videophy2_nano.sh     | 45 +++++++++++++++++++
 .../finetune/launch_sft_vision_nano.sh        | 30 +++++++++++++
 .../finetune/launch_sft_vision_super.sh       | 36 +++++++++++++++
 4 files changed, 154 insertions(+)
 create mode 100644 cookbooks/cosmos3/finetune/launch_sft_llava_ov.sh
 create mode 100644 cookbooks/cosmos3/finetune/launch_sft_videophy2_nano.sh
 create mode 100644 cookbooks/cosmos3/finetune/launch_sft_vision_nano.sh
 create mode 100644 cookbooks/cosmos3/finetune/launch_sft_vision_super.sh

diff --git a/cookbooks/cosmos3/finetune/launch_sft_llava_ov.sh b/cookbooks/cosmos3/finetune/launch_sft_llava_ov.sh
new file mode 100644
index 00000000..1967cfca
--- /dev/null
+++ b/cookbooks/cosmos3/finetune/launch_sft_llava_ov.sh
@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# Structured-TOML launch for llava_ov (VLM SFT on
+# lmms-lab/LLaVA-OneVision-Data via CosmosDataLoader). Drives
+# cosmos_framework.scripts.train against toml/sft_config/llava_ov.toml.
+#
+# [job].task = "vlm" — picks cosmos_framework/configs/base/vlm/config.py as the base config.
+#
+# Requires an activated cosmos-framework venv (see the finetune README
+# Prerequisites). Run from cookbooks/cosmos3/finetune/.
+#
+# The dataset streams from the HuggingFace Hub, so DATASET_PATH /
+# WAN_VAE_PATH / BASE_CHECKPOINT_PATH are NOT required.
+#
+# Optional env:
+#   HF_TOKEN               for gated Qwen3-VL-8B-Instruct downloads.
+#   VLM_SAFETENSORS_PATH   local directory of pre-converted Qwen3-VL safetensors
+#                          (e.g. a Cosmos3-Nano LM merged with Qwen3-VL visual via
+#                          `cosmos_framework.scripts.convert_model_to_vlm_safetensors`).
+#                          When set, plumbed to backbone.safetensors_path via a tail override.
+#                          When unset, the framework falls back to the public Qwen/Qwen3-VL-8B-Instruct HF snapshot.
+#   EXTRA_TAIL_OVERRIDES   whitespace-separated extra Hydra overrides, passed through after `--`.
+#
+# Usage (8-GPU allocation, framework venv active, from cookbooks/cosmos3/finetune/):
+#   bash launch_sft_llava_ov.sh
+
+TOML_FILE="toml/sft_config/llava_ov.toml"
+
+set -f  # noglob while word-splitting, so Hydra values containing [ ] ? * stay literal
+TAIL_OVERRIDES=( ${EXTRA_TAIL_OVERRIDES:-} )
+set +f
+
+# When VLM_SAFETENSORS_PATH is set, plumb it to backbone.safetensors_path so the
+# framework loads weights from the local snapshot (e.g. a Cosmos3-Nano LM merged
+# with Qwen3-VL visual via `cosmos_framework.scripts.convert_model_to_vlm_safetensors`)
+# while keeping the public HF model_name for tokenizer/architecture discovery.
+if [[ -n "${VLM_SAFETENSORS_PATH:-}" ]]; then
+  TAIL_OVERRIDES+=("model.config.policy.backbone.safetensors_path=$VLM_SAFETENSORS_PATH")
+fi
+
+source "$(dirname "${BASH_SOURCE[0]}")/_sft_launcher_common.sh"
diff --git a/cookbooks/cosmos3/finetune/launch_sft_videophy2_nano.sh b/cookbooks/cosmos3/finetune/launch_sft_videophy2_nano.sh
new file mode 100644
index 00000000..9499cc5c
--- /dev/null
+++ b/cookbooks/cosmos3/finetune/launch_sft_videophy2_nano.sh
@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# Structured-TOML launch for videophy2_sft_nano (VLM dialog SFT on VideoPhy-2
+# via CosmosDataLoader). Drives cosmos_framework.scripts.train against
+# toml/sft_config/videophy2_sft_nano.toml.
+#
+# [job].task = "vlm" — picks cosmos_framework/configs/base/vlm/config.py as the base config.
+#
+# Requires an activated cosmos-framework venv (see the finetune README
+# Prerequisites). Run from cookbooks/cosmos3/finetune/.
+#
+# Required env:
+#   VIDEOPHYSICS_ROOT      dir containing videophy2_train/ and videophy2_val/
+#                          (each with meta.json + media/ + text/). Populate via
+#                          `python -m cosmos_framework.scripts.vlm.prepare_videophy2_from_hf`.
+#
+# Optional env:
+#   HF_TOKEN               for gated Qwen3-VL-8B-Instruct downloads.
+#   VLM_SAFETENSORS_PATH   local directory of pre-converted Qwen3-VL safetensors
+#                          (e.g. Cosmos3-Nano LM merged with Qwen3-VL visual via
+#                          `cosmos_framework.scripts.convert_model_to_vlm_safetensors`).
+#                          When set, plumbed to backbone.safetensors_path via a tail override.
+#                          When unset, the framework falls back to the public Qwen/Qwen3-VL-8B-Instruct HF snapshot.
+#   EXTRA_TAIL_OVERRIDES   whitespace-separated extra Hydra overrides, passed through after `--`.
+#
+# Usage (8-GPU allocation, framework venv active, from cookbooks/cosmos3/finetune/):
+#   VIDEOPHYSICS_ROOT=/path/to/videophysics bash launch_sft_videophy2_nano.sh
+
+TOML_FILE="toml/sft_config/videophy2_sft_nano.toml"
+
+# Fail fast here rather than inside torchrun: VIDEOPHYSICS_ROOT is required (see "Required env").
+: "${VIDEOPHYSICS_ROOT:?VIDEOPHYSICS_ROOT must be set (dir containing videophy2_train/ and videophy2_val/)}"
+
+set -f  # noglob while word-splitting, so Hydra values containing [ ] ? * stay literal
+TAIL_OVERRIDES=( ${EXTRA_TAIL_OVERRIDES:-} )
+set +f
+
+# Optional local backbone weights: VLM_SAFETENSORS_PATH (see "Optional env") becomes a tail override.
+if [[ -n "${VLM_SAFETENSORS_PATH:-}" ]]; then
+  TAIL_OVERRIDES+=("model.config.policy.backbone.safetensors_path=$VLM_SAFETENSORS_PATH")
+fi
+
+source "$(dirname "${BASH_SOURCE[0]}")/_sft_launcher_common.sh"
diff --git a/cookbooks/cosmos3/finetune/launch_sft_vision_nano.sh b/cookbooks/cosmos3/finetune/launch_sft_vision_nano.sh
new file mode 100644
index 00000000..d67c4ddc
--- /dev/null
+++ b/cookbooks/cosmos3/finetune/launch_sft_vision_nano.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# Structured-TOML launch for vision_sft_nano (T2V / I2V / V2V vision-only
+# SFT on Qwen3-VL-8B, 8-GPU FSDP). Drives cosmos_framework.scripts.train against
+# toml/sft_config/vision_sft_nano.toml.
+#
+# Requires an activated cosmos-framework venv (see the finetune README
+# Prerequisites). Run from cookbooks/cosmos3/finetune/.
+#
+# Optional env vars (defaults below point under this cookbook dir; override to
+# put data or checkpoints on a different filesystem):
+#   DATASET_PATH          default: data/BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge
+#                         (must contain train/video_dataset_file.jsonl)
+#   BASE_CHECKPOINT_PATH  default: checkpoints/Cosmos3-Nano
+#   WAN_VAE_PATH          default: checkpoints/wan22_vae/Wan2.2_VAE.pth
+#   HF_TOKEN              if any tokenizer download requires gated HF access
+#   OUTPUT_ROOT           default: outputs/train
+#
+# Usage (8-GPU allocation, framework venv active, from cookbooks/cosmos3/finetune/):
+#   bash launch_sft_vision_nano.sh
+
+TOML_FILE="toml/sft_config/vision_sft_nano.toml"
+: "${DATASET_PATH:=data/BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge}"
+: "${BASE_CHECKPOINT_PATH:=checkpoints/Cosmos3-Nano}"
+# Single-quoted so $DATASET_PATH expands when _sft_launcher_common.sh evals it, after anchoring the path.
+EXTRA_DATASET_CHECK='[[ -f "$DATASET_PATH/train/video_dataset_file.jsonl" ]] || { echo "ERROR: missing $DATASET_PATH/train/video_dataset_file.jsonl" >&2; exit 1; }'
+
+source "$(dirname "${BASH_SOURCE[0]}")/_sft_launcher_common.sh"
diff --git a/cookbooks/cosmos3/finetune/launch_sft_vision_super.sh b/cookbooks/cosmos3/finetune/launch_sft_vision_super.sh
new file mode 100644
index 00000000..54bfde97
--- /dev/null
+++ b/cookbooks/cosmos3/finetune/launch_sft_vision_super.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# Structured-TOML launch for vision_sft_super (T2V / I2V / V2V LoRA SFT on
+# Qwen3-VL-32B-Instruct, 8-GPU FSDP with CP=2 / DP=4). Drives
+# cosmos_framework.scripts.train against toml/sft_config/vision_sft_super.toml.
+#
+# Requires an activated cosmos-framework venv (see the finetune README
+# Prerequisites). Run from cookbooks/cosmos3/finetune/.
+#
+# Optional env vars (defaults below point under this cookbook dir; override to
+# put data or checkpoints on a different filesystem):
+#   DATASET_PATH          default: data/BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge
+#                         (must contain train/video_dataset_file.jsonl)
+#   BASE_CHECKPOINT_PATH  default: checkpoints/Cosmos3-Super
+#   WAN_VAE_PATH          default: checkpoints/wan22_vae/Wan2.2_VAE.pth
+#   HF_TOKEN              if any tokenizer download requires gated HF access
+#   OUTPUT_ROOT           default: outputs/train
+#
+# Usage (8-GPU allocation, framework venv active, from cookbooks/cosmos3/finetune/):
+#   bash launch_sft_vision_super.sh
+
+TOML_FILE="toml/sft_config/vision_sft_super.toml"
+: "${DATASET_PATH:=data/BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge}"
+: "${BASE_CHECKPOINT_PATH:=checkpoints/Cosmos3-Super}"
+# Single-quoted so $DATASET_PATH expands when _sft_launcher_common.sh evals it, after anchoring the path.
+EXTRA_DATASET_CHECK='[[ -f "$DATASET_PATH/train/video_dataset_file.jsonl" ]] || { echo "ERROR: missing $DATASET_PATH/train/video_dataset_file.jsonl" >&2; exit 1; }'
+
+# Super-variant env tweaks: clear LD_LIBRARY_PATH to avoid host CUDA/NCCL libs
+# bleeding into the venv, switch the allocator to expandable_segments so the
+# 32B backbone fits without OOM during compile/decode.
+export LD_LIBRARY_PATH=""
+export PYTORCH_ALLOC_CONF="${PYTORCH_ALLOC_CONF:-expandable_segments:True}"
+
+source "$(dirname "${BASH_SOURCE[0]}")/_sft_launcher_common.sh"
From c983cf84f7cf7b7130931173e8d8be565a4be7cf Mon Sep 17 00:00:00 2001
From: Maosheng Liao
Date: Tue, 16 Jun 2026 07:06:50 -0700
Subject: [PATCH 4/6] docs(finetune): adapt training.md into the cookbook SFT guide

---
 cookbooks/cosmos3/finetune/README.md | 345 +++++++++++++++++++++++++++
 1 file changed, 345 insertions(+)
 create mode 100644 cookbooks/cosmos3/finetune/README.md

diff --git a/cookbooks/cosmos3/finetune/README.md b/cookbooks/cosmos3/finetune/README.md
new file mode 100644
index 00000000..ac0e3ff6
--- /dev/null
+++ b/cookbooks/cosmos3/finetune/README.md
@@ -0,0 +1,345 @@
+# Cosmos3 Fine-Tuning (Supervised Fine-Tuning)
+
+
+
+______________________________________________________________________
+
+**Table of Contents**
+
+- [Prerequisites](#prerequisites)
+- [Step 1 - Prepare data and config](#step-1---prepare-data-and-config)
+- [Step 2 — Prepare checkpoint](#step-2--prepare-checkpoint)
+- [Step 3 — Run training](#step-3--run-training)
+  - [Option A (recommended): the paired launch shell](#option-a-recommended-the-paired-launch-shell)
+  - [Overriding the defaults](#overriding-the-defaults)
+  - [Option B: raw `torchrun`](#option-b-raw-torchrun)
+- [Outputs](#outputs)
+- [Export checkpoint to Hugging Face safetensors](#export-checkpoint-to-hugging-face-safetensors)
+- [Config](#config)
+  - [Common Hydra tail overrides](#common-hydra-tail-overrides)
+
+______________________________________________________________________
+
+
+
+Fine-tune a pre-trained Cosmos3 model on your own dataset using supervised fine-tuning (SFT). Tested on 8× H100 (80 GB).
+ +## Prerequisites + +Training runs through the **cosmos-framework** package: the `cosmos_framework.scripts.train` entry point and the experiment-SKU configs live there, so you must install a framework checkout before running anything in this guide. The recipe TOMLs and launch shells in this folder drive that entry point. + +1. **Clone and install cosmos-framework.** Follow the cosmos3 cookbook's [Cosmos Framework setup](../README.md#cosmos-framework) — clone into `packages/cosmos3` and run `uv sync --all-extras --group=cu130-train` (use `cu128-train` on a CUDA 12.x driver). `uv sync` is the install: it installs the `cosmos-framework` project itself (editable) plus all training dependencies into `.venv`; no separate `pip install` is needed. + +2. **Activate the framework venv** so `cosmos_framework` is importable. These launch shells deliberately do **not** add `.venv/bin` to `PATH`: + + ```shell + source <repo-root>/packages/cosmos3/.venv/bin/activate + ``` + +3. **Run every command below from this cookbook directory** (`cookbooks/cosmos3/finetune/`) with that venv active. Data, checkpoints, and outputs default to `data/`, `checkpoints/`, and `outputs/` under this folder (all git-ignored); export `DATASET_PATH` / `BASE_CHECKPOINT_PATH` / `WAN_VAE_PATH` to override (see [Step 3 → Overriding the defaults](#overriding-the-defaults)). + +For deeper references see the framework docs: [environment variables](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/environment_variables.md), [FAQ / troubleshooting](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/faq.md) (OOM during SFT, common pitfalls), and the [JSONL dataset format](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/dataset_jsonl.md). + +## Step 1 - Prepare data and config + +Some datasets are license-gated — visit the repository page and accept any terms, and authenticate with `uvx hf@latest auth login` (or set `HF_TOKEN`). 
+ +The per-recipe download commands below write to `data/<dataset>/` and `checkpoints/wan22_vae/Wan2.2_VAE.pth`, which match the launcher's default `$DATASET_PATH` and `$WAN_VAE_PATH`. See [Step 3 → Option A](#option-a-recommended-the-paired-launch-shell) for how to override these defaults if you'd rather keep data on a different filesystem. + +Select one of the following recipes: + +
<details><summary>Vision SFT (Cosmos3-Nano)</summary> + +T2V/I2V/V2V SFT on [nvidia/BridgeData2-Subset-Synthetic-Captions](https://huggingface.co/datasets/nvidia/BridgeData2-Subset-Synthetic-Captions/tree/main). `$DATASET_PATH` should be the directory containing `train/video_dataset_file.jsonl`. Each clip carries a structured-JSON caption (`caption_json`) — the model's native prompt format — which the SFT loader trains on by default (the dense narrative is kept as a backup), so training stays aligned with [Inference](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/dataset_jsonl.md#inference); see [JSONL Dataset → Format](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/dataset_jsonl.md#format). + +Launch shell: `launch_sft_vision_nano.sh` + +```shell +BASE_CHECKPOINT_NAME=Cosmos3-Nano + +# Defaults match the launcher (see Step 3 → Option A to override). +uvx hf@latest download --repo-type dataset nvidia/BridgeData2-Subset-Synthetic-Captions \ + --revision 40d018ac1c1a2a4b9734f17fdb21f3d933c49a01 \ + --local-dir data/BridgeData2-Subset-Synthetic-Captions --quiet +uvx hf@latest download Wan-AI/Wan2.2-TI2V-5B Wan2.2_VAE.pth \ + --local-dir checkpoints/wan22_vae --quiet +``` + +</details>
+ +
Vision SFT LoRA (Cosmos3-Super) + +LoRA SFT on Qwen3-VL-32B MoT (Cosmos3-Super), on the same Bridge dataset as **Vision SFT (Cosmos3-Nano)**. Step 2 must convert the Cosmos3-Super checkpoint, not Cosmos3-Nano. + +Launch shell: `launch_sft_vision_super.sh` + +```shell +BASE_CHECKPOINT_NAME=Cosmos3-Super + +# Defaults match the launcher (see Step 3 → Option A to override). +uvx hf@latest download --repo-type dataset nvidia/BridgeData2-Subset-Synthetic-Captions \ + --revision 40d018ac1c1a2a4b9734f17fdb21f3d933c49a01 \ + --local-dir data/BridgeData2-Subset-Synthetic-Captions --quiet +uvx hf@latest download Wan-AI/Wan2.2-TI2V-5B Wan2.2_VAE.pth \ + --local-dir checkpoints/wan22_vae --quiet +``` + +
+ +
Reasoner Alignment SFT with LLaVA-OneVision (vfm-vlm) + +Alignment SFT for the Reasoner variant on the [lmms-lab/LLaVA-OneVision-Data](https://huggingface.co/datasets/lmms-lab/LLaVA-OneVision-Data) dataset (streamed from HF Hub). Skips Step 2: by default the backbone `Qwen/Qwen3-VL-8B-Instruct` is fetched from the HF Hub by the model downloader at startup — no DCP conversion needed and no required env vars. To instead start from a merged Cosmos3 reasoner snapshot (Cosmos3-Nano LM merged onto the Qwen3-VL visual tower), build it with `convert_model_to_vlm_safetensors` (see [Step 2](#step-2--prepare-checkpoint)) and point `VLM_SAFETENSORS_PATH` at it — same mechanism as the VideoPhy-2 recipe below. + +Launch shell: `launch_sft_llava_ov.sh` + +```shell +# No required env vars. The first launch will populate the HF Hub cache under +# $HF_HOME (defaults to /tmp/hf_cache inside the wrapper); subsequent launches +# reuse the cached snapshot. +# +# (optional) HF_TOKEN raises HF Hub rate limits for the streamed dataset +# revision lookup — useful if you're running 8-rank fan-out from a single IP: +# export HF_TOKEN=hf_... +# +# (optional) VLM_SAFETENSORS_PATH starts training from a local pre-converted +# Qwen3-VL safetensors snapshot (e.g. Cosmos3-Nano LM merged with the Qwen3-VL +# visual tower) instead of the public HF backbone: +# export VLM_SAFETENSORS_PATH=$PWD/checkpoints/Cosmos3-Nano-VLM +``` + +
+ +
Reasoner Alignment SFT with VideoPhy-2 (Cosmos3-Nano) + +Reasoner alignment SFT for 1–5 physical-plausibility scoring on [videophysics/videophy2_train](https://huggingface.co/datasets/videophysics/videophy2_train) (HF test split renamed to `videophy2_val/`). `[job].task = "vlm"`. Bootstraps from `Cosmos3-Nano`'s language-model weights merged onto the public Qwen3-VL-8B-Instruct visual tower; the merged HF directory is consumed via `[model.backbone].safetensors_path` (plumbed by `VLM_SAFETENSORS_PATH`). + +Launch shell: `launch_sft_videophy2_nano.sh` + +```shell +# Step 1 (data): materialize the public HF dataset into the canonical local layout +# (videophy2_{train,val}/{meta.json, media/, text/}). +python -m cosmos_framework.scripts.vlm.prepare_videophy2_from_hf \ + --out_root data/videophysics --split both +``` + +
+ +## Step 2 — Prepare checkpoint + +Convert the base checkpoint to [PyTorch Distributed Checkpoint (DCP)](https://pytorch.org/docs/stable/distributed.checkpoint.html) format. `cosmos_framework.scripts.convert_model_to_dcp` ships in the unified `cosmos_framework/` package, so this step runs from this cookbook directory (with the framework venv active per [Prerequisites](#prerequisites)). + +Set `BASE_CHECKPOINT_NAME` to the value from the recipe block you picked in Step 1 (`Cosmos3-Nano` or `Cosmos3-Super`): + +```shell +BASE_CHECKPOINT_NAME=Cosmos3-Nano # or Cosmos3-Super — match the recipe in Step 1 + +# Default output dir matches the launcher (see Step 3 → Option A to override). +python -m cosmos_framework.scripts.convert_model_to_dcp \ + -o checkpoints/$BASE_CHECKPOINT_NAME \ + --checkpoint-path $BASE_CHECKPOINT_NAME +``` + +`$BASE_CHECKPOINT_NAME` (e.g. `Cosmos3-Nano`, `Cosmos3-Super`) is a registered name in the checkpoint catalog; the converter downloads the matching repo from the Hugging Face Hub and writes the DCP into `checkpoints/$BASE_CHECKPOINT_NAME`. + +**Reasoner Alignment SFT with LLaVA-OneVision (vfm-vlm):** Skip this step — the Reasoner alignment SFT loads `Qwen/Qwen3-VL-8B-Instruct` from the HF Hub at startup (no DCP conversion required). To start from a merged Cosmos3 reasoner snapshot instead, build one with `convert_model_to_vlm_safetensors` (see the VideoPhy-2 note below) and pass it via `VLM_SAFETENSORS_PATH`. + +**Reasoner Alignment SFT with VideoPhy-2 (Cosmos3-Nano):** Use `cosmos_framework.scripts.convert_model_to_vlm_safetensors` instead. + +```shell +# Step 2 (VLM checkpoint): merge Cosmos3-Nano LM onto the Qwen3-VL visual tower. +# Replaces the convert_model_to_dcp step used by the VFM recipes above. 
+python -m cosmos_framework.scripts.convert_model_to_vlm_safetensors \ + --checkpoint-path Cosmos3-Nano \ + -o checkpoints/Cosmos3-Nano-VLM +``` + +## Step 3 — Run training + +**Weights & Biases (optional):** every recipe TOML defaults to `job.wandb_mode = "disabled"`. To log a run to W&B, flip that field to `"online"` in the TOML and export `WANDB_API_KEY` in your environment before launching. + +### Option A (recommended): the paired launch shell + +Each recipe ships as a `toml/sft_config/<recipe>.toml` (validated against the pydantic schema at [`cosmos_framework/configs/toml_config/sft_config.py`](https://github.com/NVIDIA/cosmos-framework/blob/main/cosmos_framework/configs/toml_config/sft_config.py)) paired with `launch_sft_<name>.sh`; the full upstream catalog is indexed in [the framework's examples index](https://github.com/NVIDIA/cosmos-framework/blob/main/examples/README.md). Each `.sh` sources [`_sft_launcher_common.sh`](_sft_launcher_common.sh) and forwards into `cosmos_framework.scripts.train --sft-toml=<path-to-toml>`. From this cookbook directory, run the launch shell paired with the recipe you set up in Step 1.
The wrapper resolves `DATASET_PATH`, `BASE_CHECKPOINT_PATH`, and `WAN_VAE_PATH` from the default locations under this cookbook directory (populated by Step 1 + Step 2), so no env-var setup is required (see [below](#overriding-the-defaults) to override): + +```shell +# from this cookbook directory, after Step 1 + Step 2: +bash launch_sft_vision_nano.sh +``` + +Each launcher's default paths come from the `DATASET_PATH` + `BASE_CHECKPOINT_PATH` defaults declared at the top of its `.sh` (each uses `: "${VAR:=…}"` so any value you `export` in the shell before launching wins over the default): + +| Launch shell | Post-Training Task | Default $DATASET_PATH (under data/) | Default $BASE_CHECKPOINT_PATH (under checkpoints/) | +| ------------------------------ | ------------------ | ---------------------------------------------------------- | ----------------------------------------------------------- | +| `launch_sft_vision_nano.sh` | Generator SFT | `BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge` | `Cosmos3-Nano` | +| `launch_sft_vision_super.sh` | Generator SFT | `BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge` | `Cosmos3-Super` | +| `launch_sft_llava_ov.sh` | Reasoner SFT | (none; dataset streams from HF Hub) | (none; backbone fetched at startup, or set `VLM_SAFETENSORS_PATH`) | +| `launch_sft_videophy2_nano.sh` | Reasoner SFT | (none; set `VIDEOPHYSICS_ROOT` env) | (none; set `VLM_SAFETENSORS_PATH` env) | + +`WAN_VAE_PATH` defaults to `checkpoints/wan22_vae/Wan2.2_VAE.pth` for every non-reasoner recipe. + +**Reasoner Alignment SFT with VideoPhy-2 (Cosmos3-Nano):** + +```shell +# Step 3 (launch): export both env vars, then launch. +export VIDEOPHYSICS_ROOT=$PWD/data/videophysics +export VLM_SAFETENSORS_PATH=$PWD/checkpoints/Cosmos3-Nano-VLM +bash launch_sft_videophy2_nano.sh +``` + +#### Overriding the defaults + +If you'd rather put data or checkpoints on a different filesystem (e.g. 
a faster SSD or shared mount), download to your chosen path in Step 1 / convert the DCP to your chosen path in Step 2, then export the matching env var(s) before launching: + +```shell +# Example: data on /scratch, base DCP on /nfs/ckpts. +export DATASET_PATH=/scratch/BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge +export BASE_CHECKPOINT_PATH=/nfs/ckpts/Cosmos3-Nano +export WAN_VAE_PATH=/nfs/ckpts/wan22_vae/Wan2.2_VAE.pth +bash launch_sft_vision_nano.sh +``` + +Each env var falls back to its default if unset, so you only need to export the ones you're moving. The downloads / `convert_model_to_dcp` commands in Step 1 + Step 2 just need their `--local-dir` / `-o` argument pointed at the same path you export here. `.gitignore` excludes `data/`, `checkpoints/`, and `outputs/` under this cookbook directory so the multi-GB downloads aren't tracked when you keep the defaults. + +### Option B: raw `torchrun` + +If you'd rather not use the paired launch shell, invoke `torchrun` directly with the recipe's TOML. Unlike Option A, **raw `torchrun` does not auto-resolve `DATASET_PATH` / `BASE_CHECKPOINT_PATH` / `WAN_VAE_PATH`** — they have to come from your shell: + +- `BASE_CHECKPOINT_PATH` and `WAN_VAE_PATH` are read via `${oc.env:BASE_CHECKPOINT_PATH}` / `${oc.env:WAN_VAE_PATH}` at the TOML's `[checkpoint].load_path` / `[model.tokenizer].vae_path` keys. +- `DATASET_PATH` is read via `${oc.env:DATASET_PATH}` inside the experiment-SKU Python (e.g. `cosmos_framework/configs/base/experiment/sft/.py`), not in the TOML. + +You have two options to fill them in (pick either, not both): + +1. **Export them in the shell before `torchrun`** (whether they point at the default `data/` / `checkpoints/` paths from Step 1+2 or your own overrides) — shown below. +2. **Edit the TOML by hand** — open `toml/sft_config/.toml` and replace the `${oc.env:BASE_CHECKPOINT_PATH}` / `${oc.env:WAN_VAE_PATH}` placeholders with literal paths. 
Useful if you want a self-contained TOML you can hand to a colleague or commit alongside an experiment record. (Hand-editing won't help for `DATASET_PATH` — that's resolved out of the experiment Python, so you must still export it.) + +Run from this cookbook directory (`cookbooks/cosmos3/finetune/`) with the framework venv active; the snippet uses `$PWD` to absolutize the relative paths. + +```shell +# This example uses the vision_sft_nano recipe end-to-end (same recipe as +# Option A). To switch recipes, swap TOML_FILE + DATASET_PATH per the table in +# Option A, and Cosmos3-Nano → Cosmos3-Super on the LoRA / super recipes. +TOML_FILE="toml/sft_config/vision_sft_nano.toml" + +# Match the launcher's defaults — or substitute your own paths. +export DATASET_PATH="$PWD/data/BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge" +export BASE_CHECKPOINT_PATH="$PWD/checkpoints/Cosmos3-Nano" +export WAN_VAE_PATH="$PWD/checkpoints/wan22_vae/Wan2.2_VAE.pth" + +IMAGINAIRE_OUTPUT_ROOT=outputs/train \ +torchrun --nproc_per_node=8 -m cosmos_framework.scripts.train \ + --sft-toml=$TOML_FILE +``` + +To resume from the latest in-progress checkpoint, point `BASE_CHECKPOINT_PATH` at the run's `checkpoints/iter_<N>/` directory under `$IMAGINAIRE_OUTPUT_ROOT/<project>/<group>/<name>/` (see [Outputs](#outputs) below for the full layout). + +## Outputs + +Outputs land under `$IMAGINAIRE_OUTPUT_ROOT/<project>/<group>/<name>/`: + +1. `config.yaml`, `config.pkl`: Finalized resolved config (YAML for inspection, pickle for re-instantiation). +1. `launch_info.yaml`, `job_env.yaml`: Job metadata and captured launch environment. +1. `checkpoints/`: + 1. `latest_checkpoint.txt`: Pointer file containing the latest checkpoint directory name (e.g. `iter_000000200`). + 1. `iter_<N>/`: DCP checkpoint saved every `[checkpoint].save_iter` iterations (zero-padded 9-digit, e.g. `iter_000000200/`): + 1. `model/`: model weights (sharded `.distcp`). + 1. `optim/`: optimizer state. + 1. `scheduler/`: LR scheduler state. + 1.
`trainer/`: training state — includes the `iteration` counter and per-rank `rng_state_<rank>` (numpy + random + torch + torch_cuda). + 1. `dataloader/`: optional per-rank pickle shards (`rank_<rank>.pkl`) — only present for dataloaders that implement `has_state()`. +1. `<callback_name>/`: Callback outputs, one directory per registered callback (e.g. `DeviceMonitor/`, `EveryNDrawSample/`, `norm_monitor/`). +1. `wandb/`, `wandb_id.txt`: Wandb run files — only present when `[job].wandb_mode` is `online` or `offline`. + +The shorthand `$RUN_DIR` used in the rest of this page refers to `$IMAGINAIRE_OUTPUT_ROOT/<project>/<group>/<name>`. For example, with `IMAGINAIRE_OUTPUT_ROOT=outputs/train` and the `vision_sft_nano` recipe, `$RUN_DIR` is `outputs/train/cosmos3/sft/vision_sft_nano`. + +## Export checkpoint to Hugging Face safetensors + +Export the DCP checkpoint produced in Step 3 to a Hugging Face safetensors checkpoint: + +```shell +# Substitute your run's values, e.g. outputs/train/cosmos3/sft/vision_sft_nano +RUN_DIR=$IMAGINAIRE_OUTPUT_ROOT/<project>/<group>/<name> + +CHECKPOINT_ITER=$(cat $RUN_DIR/checkpoints/latest_checkpoint.txt) +CHECKPOINT_PATH=$RUN_DIR/checkpoints/$CHECKPOINT_ITER + +python -m cosmos_framework.scripts.export_model \ + --checkpoint-path $CHECKPOINT_PATH \ + --config-file $RUN_DIR/config.yaml \ + -o $RUN_DIR/model +``` + +The exported safetensors land at `$RUN_DIR/model` and can be used in [Inference](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/inference.md) commands by passing `--checkpoint-path $RUN_DIR/model`. + +## Config + +The recipe TOML is parsed against the pydantic schema [`SFTExperimentConfig`](https://github.com/NVIDIA/cosmos-framework/blob/main/cosmos_framework/configs/toml_config/sft_config.py) at load time. Every top-level key listed below maps to a sub-model in that file; unknown keys raise a `ValidationError` before training starts (`extra="forbid"` on every sub-model). Values may use OmegaConf env interpolation `${oc.env:NAME}` — the recipe TOMLs use this for `BASE_CHECKPOINT_PATH` (`[checkpoint].load_path`) and `WAN_VAE_PATH` (`[model.tokenizer].vae_path`).
`DATASET_PATH` is consumed the same way but inside the experiment-SKU Python (`cosmos_framework/configs/base/experiment/sft/.py`), not in the TOML. + +For the full field-by-field reference (every section, every default, every VFM/VLM applicability note, the `"???"` MISSING sentinel, env interpolation, the VFM↔VLM path-remap table, and how to extend the schema), see [SFT Structured-TOML Config Reference](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/sft_config.md). + +The commonly tuned knobs: + +1. `[job]` + 1. `task` — `"vfm"` (generator recipes) or `"vlm"` (Reasoner alignment). Picks the base config: `cosmos_framework/configs/base/config.py` vs `…/vlm/config.py`. Also drives `PATH_REMAPS` in `toml_config_helper.py`. + 1. `experiment` — Registered experiment SKU name (e.g. `vision_sft_nano`). Each SKU is a Python file under `cosmos_framework/configs/base/experiment/sft/` that wires up dataloader, model variant, and recipe-specific defaults. + 1. `project`, `group`, `name` — Components of the run output dir `$IMAGINAIRE_OUTPUT_ROOT////`. Also flow to W&B as the project / group / run name. + 1. `wandb_mode` — `"online"` (logs to W&B; `WANDB_API_KEY` must be set), `"offline"` (logs locally, sync later with `wandb sync`), or `"disabled"`. +1. `[model]` + 1. `max_num_tokens_after_packing` — VFM token-packing target. `-1` disables the cap. VFM only; VLM uses `data_setting.max_tokens` (tail override). + 1. `joint_attn_implementation` — VFM attention layout: `"two_way"` / `"three_way"` (NATTEN) / `"flex"`. + 1. `attn_implementation` — VLM attention impl: `"cosmos"` / `"flash_attention_2"` / `"sdpa"` / `"eager"`. VLM only. + 1. `lora_enabled`, `lora_rank`, `lora_alpha`, `lora_target_modules` — LoRA adapter knobs for the generation pathway. Used by SUPER-tier recipes; NANO-tier leaves `lora_enabled=false`. VFM only. +1. `[model.ema]` + 1. `enabled`, `rate`, `iteration_shift` — Exponential moving average of generation-pathway weights. 
Full fine-tunes typically enable it; LoRA recipes leave it off. +1. `[model.parallelism]` + 1. `data_parallel_shard_degree` — FSDP shard degree. `data_parallel_shard_degree × data_parallel_replicate_degree × context_parallel_shard_degree` must equal `WORLD_SIZE`. `-1` autoselects from torchrun world size. + 1. `data_parallel_replicate_degree` — HSDP replicate degree (outer replicate loop over the shard topology). + 1. `context_parallel_shard_degree` — Context-parallel shard degree. `>1` splits the sequence dim across ranks (used by super-tier configs: DP=4, CP=2 → 8 GPUs). + 1. `cfg_parallel_shard_degree` — Classifier-free-guidance shard degree. Almost always `1` for SFT. + 1. `fsdp_master_dtype` — Master parameter / FSDP reduce dtype: typically `"float32"`. +1. `[model.compile]` + 1. `enabled` — Enable `torch.compile`. Improves speed at the cost of memory. + 1. `compile_dynamic` — Whether to compile with symbolic-shape (dynamic) kernels. `True` (default) is appropriate for training; AR inference may prefer `False` for stable shapes. +1. `[model]` + 1. `precision` — Compute dtype for forward/backward: `"bfloat16"` / `"float16"` / `"float32"`. Master weights stay fp32 separately. +1. `[model.activation_checkpointing]` + 1. `mode` — `"none"` / `"selective"` (per-op SAC, MoT-only) / `"full"` (per-block checkpointing). + 1. `save_ops_regex` — Regex patterns for ops to keep saved under `mode="selective"`. + 1. `preserve_rng_state`, `determinism_check` — Recompute determinism plumbing. +1. `[model.tokenizer]` + 1. `vae_path` — Wan2.2 VAE `.pth` path. Recipe TOMLs use `"${oc.env:WAN_VAE_PATH}"`. VFM only. +1. `[optimizer]` + 1. `lr` — Base learning rate. + 1. `betas`, `eps`, `fused`, `weight_decay` — Standard AdamW knobs. `eps` is VFM-only. + 1. `keys_to_select` — Substring allowlist for trainable params. Empty list = train everything; `["lora_"]` = adapter-only fine-tune. +1. `[optimizer.lr_multipliers]` + 1. 
Inline table of ` = ` pairs that scale the LR of params whose name contains the substring. The shipped vision recipes leave this empty (Hydra default `{}` stands). +1. `[scheduler]` + 1. `cycle_lengths`, `warm_up_steps` — Cycle length and warmup duration (lists, one entry per cycle), in optimizer steps. + 1. `f_max`, `f_min`, `f_start` — LR multipliers at peak / trough / step-0 (ratios of `optimizer.lr`). + 1. `verbosity_interval` — Scheduler-side LR log frequency. VFM only. +1. `[trainer]` + 1. `max_iter` — Total optimizer steps. + 1. `grad_accum_iter` — Micro-batches per optimizer step. Effective global batch = `grad_accum_iter × per-rank batch × world_size`. + 1. `logging_iter` — Console / W&B scalar log frequency. + 1. `distributed_parallelism` — `"fsdp"` is the only supported value. +1. `[trainer.callbacks.compile_tokenizer]` + 1. `enabled`, `compile_after_iterations`, `warmup_resolutions` — Lazy `torch.compile` of the VAE tokenizer. VFM only. +1. `[trainer.callbacks.grad_clip]` + 1. `clip_norm` — Max global L2 norm of the gradient (steps with larger norm are rescaled). + 1. `force_finite` — Replace NaN/Inf grads with zero (default `true` on VFM, `false` on VLM). +1. `[checkpoint]` + 1. `load_path` — Base DCP checkpoint directory to resume from (Step 2 output, or a prior run's `checkpoints/iter_/`). Recipe TOMLs use `"${oc.env:BASE_CHECKPOINT_PATH}"`. + 1. `save_iter` — Save a new DCP checkpoint every N optimizer steps. + 1. `keys_to_skip_loading` — Substring blocklist applied at load time. Used to mask EMA / LoRA tensors when warm-starting from a checkpoint that doesn't have them yet. +1. `[dataloader_train]` — Top-level scalars only; the dataloader's class (LazyCall) and pipeline wiring (datasets, packers, …) stay in the experiment Python. + 1. `max_samples_per_batch` — Per-micro-batch sample cap (remapped to `max_batch_size` on the VLM packer). `null` / omitted = no per-count cap. + 1. 
`max_sequence_length` — Per-packed-sequence token cap (remapped to `max_tokens` on the VLM packer). + 1. `seed` — Dataloader RNG seed (VFM only). + +### Common Hydra tail overrides + +These knobs aren't part of the pydantic schema today; pass them as trailing `key.path=value` positionals after `--` (the `cosmos_framework.scripts.train` flow forwards them through OmegaConf): + +- `model.config.policy.backbone.model_name` — VLM backbone HF identifier (e.g. `Qwen/Qwen3-VL-8B-Instruct`). Used by `launch_sft_llava_ov.sh`. +- `data_setting.max_tokens` — VLM token-packing cap (the VLM analogue of `[model].max_num_tokens_after_packing`). Used by `launch_sft_llava_ov.sh`. + +The launchers wire these via `TAIL_OVERRIDES=(…)`; the helper appends `-- "${TAIL_OVERRIDES[@]}"` after the `--sft-toml=` argument. From 4fb51967df99311ddba243bd043f4157e8ccb55b Mon Sep 17 00:00:00 2001 From: Maosheng Liao Date: Tue, 16 Jun 2026 07:07:21 -0700 Subject: [PATCH 5/6] chore: gitignore Cosmos3 finetune cookbook runtime artifacts --- .gitignore | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.gitignore b/.gitignore index fdbd9f7c..3da55bb6 100644 --- a/.gitignore +++ b/.gitignore @@ -181,6 +181,14 @@ packages/ cookbooks/cosmos3/generator/audiovisual/outputs/ outputs/ +# Cosmos3 finetune cookbook runtime artifacts (downloads, converted ckpts, runs) +cookbooks/cosmos3/finetune/data/ +cookbooks/cosmos3/finetune/checkpoints/ +cookbooks/cosmos3/finetune/outputs/ + +# Superpowers design specs / implementation plans (kept local, not tracked) +docs/superpowers/ + # Streamlit .streamlit/ From 8abd6d8aade81f8457d5b4cfeaa08568d95645a1 Mon Sep 17 00:00:00 2001 From: Simon Zhang Date: Tue, 16 Jun 2026 09:27:20 -0700 Subject: [PATCH 6/6] restructure cookbook into per-capability folders + address review - Place SFT recipes under their capability cookbooks: vision recipes -> cookbooks/cosmos3/generator/audiovisual/finetune/, reasoner recipes -> cookbooks/cosmos3/reasoner/finetune/ (sibling of 
existing inference content, forward-compatible with #214). - Surface the finetune cookbooks on the repo landing page (README Finetune). - Trim each README to the happy path; link advanced config + raw torchrun to the canonical framework docs (training.md, sft_config.md). - Add the recommended NGC PyTorch base image to Prerequisites. - Rewrite each launch_sft_*.sh as a simple, self-contained recipe: linear numbered steps (download -> convert -> train) with hardcoded paths, dropping the shared launcher helper and all env-var override knobs. --- .gitignore | 12 +- README.md | 9 +- cookbooks/cosmos3/finetune/README.md | 345 ------------------ .../cosmos3/finetune/_sft_launcher_common.sh | 98 ----- .../cosmos3/finetune/launch_sft_llava_ov.sh | 43 --- .../finetune/launch_sft_videophy2_nano.sh | 45 --- .../finetune/launch_sft_vision_nano.sh | 30 -- .../finetune/launch_sft_vision_super.sh | 36 -- .../generator/audiovisual/finetune/README.md | 58 +++ .../finetune/launch_sft_vision_nano.sh | 39 ++ .../finetune/launch_sft_vision_super.sh | 42 +++ .../toml/sft_config/vision_sft_nano.toml | 0 .../toml/sft_config/vision_sft_super.toml | 0 cookbooks/cosmos3/reasoner/finetune/README.md | 58 +++ .../reasoner/finetune/launch_sft_llava_ov.sh | 16 + .../finetune/launch_sft_videophy2_nano.sh | 32 ++ .../finetune/toml/sft_config/llava_ov.toml | 0 .../toml/sft_config/videophy2_sft_nano.toml | 0 18 files changed, 258 insertions(+), 605 deletions(-) delete mode 100644 cookbooks/cosmos3/finetune/README.md delete mode 100644 cookbooks/cosmos3/finetune/_sft_launcher_common.sh delete mode 100644 cookbooks/cosmos3/finetune/launch_sft_llava_ov.sh delete mode 100644 cookbooks/cosmos3/finetune/launch_sft_videophy2_nano.sh delete mode 100644 cookbooks/cosmos3/finetune/launch_sft_vision_nano.sh delete mode 100644 cookbooks/cosmos3/finetune/launch_sft_vision_super.sh create mode 100644 cookbooks/cosmos3/generator/audiovisual/finetune/README.md create mode 100644 
cookbooks/cosmos3/generator/audiovisual/finetune/launch_sft_vision_nano.sh create mode 100644 cookbooks/cosmos3/generator/audiovisual/finetune/launch_sft_vision_super.sh rename cookbooks/cosmos3/{ => generator/audiovisual}/finetune/toml/sft_config/vision_sft_nano.toml (100%) rename cookbooks/cosmos3/{ => generator/audiovisual}/finetune/toml/sft_config/vision_sft_super.toml (100%) create mode 100644 cookbooks/cosmos3/reasoner/finetune/README.md create mode 100644 cookbooks/cosmos3/reasoner/finetune/launch_sft_llava_ov.sh create mode 100644 cookbooks/cosmos3/reasoner/finetune/launch_sft_videophy2_nano.sh rename cookbooks/cosmos3/{ => reasoner}/finetune/toml/sft_config/llava_ov.toml (100%) rename cookbooks/cosmos3/{ => reasoner}/finetune/toml/sft_config/videophy2_sft_nano.toml (100%) diff --git a/.gitignore b/.gitignore index 3da55bb6..c5b9a910 100644 --- a/.gitignore +++ b/.gitignore @@ -182,12 +182,12 @@ cookbooks/cosmos3/generator/audiovisual/outputs/ outputs/ # Cosmos3 finetune cookbook runtime artifacts (downloads, converted ckpts, runs) -cookbooks/cosmos3/finetune/data/ -cookbooks/cosmos3/finetune/checkpoints/ -cookbooks/cosmos3/finetune/outputs/ - -# Superpowers design specs / implementation plans (kept local, not tracked) -docs/superpowers/ +cookbooks/cosmos3/generator/audiovisual/finetune/data/ +cookbooks/cosmos3/generator/audiovisual/finetune/checkpoints/ +cookbooks/cosmos3/generator/audiovisual/finetune/outputs/ +cookbooks/cosmos3/reasoner/finetune/data/ +cookbooks/cosmos3/reasoner/finetune/checkpoints/ +cookbooks/cosmos3/reasoner/finetune/outputs/ # Streamlit .streamlit/ diff --git a/README.md b/README.md index 6d3e51eb..c4fd1aea 100644 --- a/README.md +++ b/README.md @@ -646,9 +646,14 @@ Cosmos 3 latency and serving numbers live in [`inference_benchmarks.md`](inferen ### Finetune -Finetune Cosmos 3 with the [Cosmos Framework](https://github.com/NVIDIA/cosmos-framework), NVIDIA's end-to-end Physical AI framework for training and serving world models. 
It provides runnable setup, inference, omni-model training, and evaluation workflows for the Generator and Reasoner surfaces, with reference recipes for vision, action, and reasoning post-training. +Post-train Cosmos 3 on your own data with the supervised fine-tuning (SFT) cookbooks below. Each recipe is a self-contained launch script: a single `bash launch_sft_<recipe>.sh` downloads the data, prepares the base checkpoint, and runs 8×H100 training. -See the [Cosmos Framework training guide](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/training.md) for the full post-training workflow, including data preparation, configuration, and launch commands. +| Cookbook | Surface | Recipes | +| --- | --- | --- | +| [Vision generator SFT](cookbooks/cosmos3/generator/audiovisual/finetune/README.md) | Generator | Full SFT (Cosmos3-Nano) and LoRA SFT (Cosmos3-Super) on captioned video | +| [Reasoner SFT](cookbooks/cosmos3/reasoner/finetune/README.md) | Reasoner | Alignment SFT on LLaVA-OneVision and physical-plausibility SFT on VideoPhy-2 | + +These cookbooks run on the [Cosmos Framework](https://github.com/NVIDIA/cosmos-framework), NVIDIA's end-to-end Physical AI framework for training and serving world models. For the full post-training reference — every config field, raw `torchrun`, resuming, and advanced parallelism — see the [Cosmos Framework training guide](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/training.md).
### Limitations diff --git a/cookbooks/cosmos3/finetune/README.md b/cookbooks/cosmos3/finetune/README.md deleted file mode 100644 index ac0e3ff6..00000000 --- a/cookbooks/cosmos3/finetune/README.md +++ /dev/null @@ -1,345 +0,0 @@ -# Cosmos3 Fine-Tuning (Supervised Fine-Tuning) - - - -______________________________________________________________________ - -**Table of Contents** - -- [Prerequisites](#prerequisites) -- [Step 1 - Prepare data and config](#step-1---prepare-data-and-config) -- [Step 2 — Prepare checkpoint](#step-2--prepare-checkpoint) -- [Step 3 — Run training](#step-3--run-training) - - [Option A (recommended): the paired launch shell](#option-a-recommended-the-paired-launch-shell) - - [Overriding the defaults](#overriding-the-defaults) - - [Option B: raw `torchrun`](#option-b-raw-torchrun) -- [Outputs](#outputs) -- [Export checkpoint to Hugging Face safetensors](#export-checkpoint-to-hugging-face-safetensors) -- [Config](#config) - - [Common Hydra tail overrides](#common-hydra-tail-overrides) - -______________________________________________________________________ - - - -Fine-tune a pre-trained Cosmos3 model on your own dataset using supervised fine-tuning (SFT). Tested on 8× H100 (80 GB). - -## Prerequisites - -Training runs through the **cosmos-framework** package: the `cosmos_framework.scripts.train` entry point and the experiment-SKU configs live there, so you must install a framework checkout before running anything in this guide. The recipe TOMLs and launch shells in this folder drive that entry point. - -1. **Clone and install cosmos-framework.** Follow the cosmos3 cookbook's [Cosmos Framework setup](../README.md#cosmos-framework) — clone into `packages/cosmos3` and run `uv sync --all-extras --group=cu130-train` (use `cu128-train` on a CUDA 12.x driver). `uv sync` is the install: it installs the `cosmos-framework` project itself (editable) plus all training dependencies into `.venv`; no separate `pip install` is needed. - -2. 
**Activate the framework venv** so `cosmos_framework` is importable. These launch shells deliberately do **not** add `.venv/bin` to `PATH`: - - ```shell - source /packages/cosmos3/.venv/bin/activate - ``` - -3. **Run every command below from this cookbook directory** (`cookbooks/cosmos3/finetune/`) with that venv active. Data, checkpoints, and outputs default to `data/`, `checkpoints/`, and `outputs/` under this folder (all git-ignored); export `DATASET_PATH` / `BASE_CHECKPOINT_PATH` / `WAN_VAE_PATH` to override (see [Step 3 → Overriding the defaults](#overriding-the-defaults)). - -For deeper references see the framework docs: [environment variables](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/environment_variables.md), [FAQ / troubleshooting](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/faq.md) (OOM during SFT, common pitfalls), and the [JSONL dataset format](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/dataset_jsonl.md). - -## Step 1 - Prepare data and config - -Some datasets are license gated — visit the repository page and accept any terms, and authenticate with `uvx hf@latest auth login` (or set `HF_TOKEN`). - -The per-recipe download commands below write to `data//` and `checkpoints/wan22_vae/Wan2.2_VAE.pth`, which match the launcher's default `$DATASET_PATH` and `$WAN_VAE_PATH`. See [Step 3 → Option A](#option-a-recommended-the-paired-launch-shell) for how to override these defaults if you'd rather keep data on a different filesystem. - -Select one of the following recipes: - -
Vision SFT (Cosmos3-Nano) - -T2V/I2V/V2V SFT on [nvidia/BridgeData2-Subset-Synthetic-Captions](https://huggingface.co/datasets/nvidia/BridgeData2-Subset-Synthetic-Captions/tree/main). `$DATASET_PATH` should be the directory containing `train/video_dataset_file.jsonl`. Each clip carries a structured-JSON caption (`caption_json`) — the model's native prompt format — which the SFT loader trains on by default (the dense narrative is kept as a backup), so training stays aligned with [Inference](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/dataset_jsonl.md#inference); see [JSONL Dataset → Format](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/dataset_jsonl.md#format). - -Launch shell: `launch_sft_vision_nano.sh` - -```shell -BASE_CHECKPOINT_NAME=Cosmos3-Nano - -# Defaults match the launcher (see Step 3 → Option A to override). -uvx hf@latest download --repo-type dataset nvidia/BridgeData2-Subset-Synthetic-Captions \ - --revision 40d018ac1c1a2a4b9734f17fdb21f3d933c49a01 \ - --local-dir data/BridgeData2-Subset-Synthetic-Captions --quiet -uvx hf@latest download Wan-AI/Wan2.2-TI2V-5B Wan2.2_VAE.pth \ - --local-dir checkpoints/wan22_vae --quiet -``` - -
- -
Vision SFT LoRA (Cosmos3-Super) - -LoRA SFT on Qwen3-VL-32B MoT (Cosmos3-Super), on the same Bridge dataset as **Vision SFT (Cosmos3-Nano)**. Step 2 must convert the Cosmos3-Super checkpoint, not Cosmos3-Nano. - -Launch shell: `launch_sft_vision_super.sh` - -```shell -BASE_CHECKPOINT_NAME=Cosmos3-Super - -# Defaults match the launcher (see Step 3 → Option A to override). -uvx hf@latest download --repo-type dataset nvidia/BridgeData2-Subset-Synthetic-Captions \ - --revision 40d018ac1c1a2a4b9734f17fdb21f3d933c49a01 \ - --local-dir data/BridgeData2-Subset-Synthetic-Captions --quiet -uvx hf@latest download Wan-AI/Wan2.2-TI2V-5B Wan2.2_VAE.pth \ - --local-dir checkpoints/wan22_vae --quiet -``` - -
- -
Reasoner Alignment SFT with LLaVA-OneVision (vfm-vlm) - -Alignment SFT for the Reasoner variant on the [lmms-lab/LLaVA-OneVision-Data](https://huggingface.co/datasets/lmms-lab/LLaVA-OneVision-Data) dataset (streamed from HF Hub). Skips Step 2: by default the backbone `Qwen/Qwen3-VL-8B-Instruct` is fetched from the HF Hub by the model downloader at startup — no DCP conversion needed and no required env vars. To instead start from a merged Cosmos3 reasoner snapshot (Cosmos3-Nano LM merged onto the Qwen3-VL visual tower), build it with `convert_model_to_vlm_safetensors` (see [Step 2](#step-2--prepare-checkpoint)) and point `VLM_SAFETENSORS_PATH` at it — same mechanism as the VideoPhy-2 recipe below. - -Launch shell: `launch_sft_llava_ov.sh` - -```shell -# No required env vars. The first launch will populate the HF Hub cache under -# $HF_HOME (defaults to /tmp/hf_cache inside the wrapper); subsequent launches -# reuse the cached snapshot. -# -# (optional) HF_TOKEN raises HF Hub rate limits for the streamed dataset -# revision lookup — useful if you're running 8-rank fan-out from a single IP: -# export HF_TOKEN=hf_... -# -# (optional) VLM_SAFETENSORS_PATH starts training from a local pre-converted -# Qwen3-VL safetensors snapshot (e.g. Cosmos3-Nano LM merged with the Qwen3-VL -# visual tower) instead of the public HF backbone: -# export VLM_SAFETENSORS_PATH=$PWD/checkpoints/Cosmos3-Nano-VLM -``` - -
- -
Reasoner Alignment SFT with VideoPhy-2 (Cosmos3-Nano) - -Reasoner alignment SFT for 1–5 physical-plausibility scoring on [videophysics/videophy2_train](https://huggingface.co/datasets/videophysics/videophy2_train) (HF test split renamed to `videophy2_val/`). `[job].task = "vlm"`. Bootstraps from `Cosmos3-Nano`'s language-model weights merged onto the public Qwen3-VL-8B-Instruct visual tower; the merged HF directory is consumed via `[model.backbone].safetensors_path` (plumbed by `VLM_SAFETENSORS_PATH`). - -Launch shell: `launch_sft_videophy2_nano.sh` - -```shell -# Step 1 (data): materialize the public HF dataset into the canonical local layout -# (videophy2_{train,val}/{meta.json, media/, text/}). -python -m cosmos_framework.scripts.vlm.prepare_videophy2_from_hf \ - --out_root data/videophysics --split both -``` - -
- -## Step 2 — Prepare checkpoint - -Convert the base checkpoint to [PyTorch Distributed Checkpoint (DCP)](https://pytorch.org/docs/stable/distributed.checkpoint.html) format. `cosmos_framework.scripts.convert_model_to_dcp` ships in the unified `cosmos_framework/` package, so this step runs from this cookbook directory (with the framework venv active per [Prerequisites](#prerequisites)). - -Set `BASE_CHECKPOINT_NAME` to the value from the recipe block you picked in Step 1 (`Cosmos3-Nano` or `Cosmos3-Super`): - -```shell -BASE_CHECKPOINT_NAME=Cosmos3-Nano # or Cosmos3-Super — match the recipe in Step 1 - -# Default output dir matches the launcher (see Step 3 → Option A to override). -python -m cosmos_framework.scripts.convert_model_to_dcp \ - -o checkpoints/$BASE_CHECKPOINT_NAME \ - --checkpoint-path $BASE_CHECKPOINT_NAME -``` - -`$BASE_CHECKPOINT_NAME` (e.g. `Cosmos3-Nano`, `Cosmos3-Super`) is a registered name in the checkpoint catalog; the converter downloads the matching repo from the Hugging Face Hub and writes the DCP into `checkpoints/$BASE_CHECKPOINT_NAME`. - -**Reasoner Alignment SFT with LLaVA-OneVision (vfm-vlm):** Skip this step — the Reasoner alignment SFT loads `Qwen/Qwen3-VL-8B-Instruct` from the HF Hub at startup (no DCP conversion required). To start from a merged Cosmos3 reasoner snapshot instead, build one with `convert_model_to_vlm_safetensors` (see the VideoPhy-2 note below) and pass it via `VLM_SAFETENSORS_PATH`. - -**Reasoner Alignment SFT with VideoPhy-2 (Cosmos3-Nano):** Use `cosmos_framework.scripts.convert_model_to_vlm_safetensors` instead. - -```shell -# Step 2 (VLM checkpoint): merge Cosmos3-Nano LM onto the Qwen3-VL visual tower. -# Replaces the convert_model_to_dcp step used by the VFM recipes above. 
-python -m cosmos_framework.scripts.convert_model_to_vlm_safetensors \ - --checkpoint-path Cosmos3-Nano \ - -o checkpoints/Cosmos3-Nano-VLM -``` - -## Step 3 — Run training - -**Weights & Biases (optional):** every recipe TOML defaults to `job.wandb_mode = "disabled"`. To log a run to W&B, flip that field to `"online"` in the TOML and export `WANDB_API_KEY` in your environment before launching. - -### Option A (recommended): the paired launch shell - -Each recipe ships as a `toml/sft_config/.toml` (validated against the pydantic schema at [`cosmos_framework/configs/toml_config/sft_config.py`](https://github.com/NVIDIA/cosmos-framework/blob/main/cosmos_framework/configs/toml_config/sft_config.py)) paired with `launch_sft_.sh`; the full upstream catalog is indexed in [the framework's examples index](https://github.com/NVIDIA/cosmos-framework/blob/main/examples/README.md). Each `.sh` sources [`_sft_launcher_common.sh`](_sft_launcher_common.sh) and forwards into `cosmos_framework.scripts.train --sft-toml=`. From this cookbook directory, run the launch shell paired with the recipe you set up in Step 1. 
The wrapper resolves `DATASET_PATH`, `BASE_CHECKPOINT_PATH`, and `WAN_VAE_PATH` from the default locations under this cookbook directory (populated by Step 1 + Step 2), so no env-var setup is required (see [below](#overriding-the-defaults) to override): - -```shell -# from this cookbook directory, after Step 1 + Step 2: -bash launch_sft_vision_nano.sh -``` - -Each launcher's default paths come from the `DATASET_PATH` + `BASE_CHECKPOINT_PATH` defaults declared at the top of its `.sh` (each uses `: "${VAR:=…}"` so any value you `export` in the shell before launching wins over the default): - -| Launch shell | Post-Training Task | Default $DATASET_PATH (under data/) | Default $BASE_CHECKPOINT_PATH (under checkpoints/) | -| ------------------------------ | ------------------ | ---------------------------------------------------------- | ----------------------------------------------------------- | -| `launch_sft_vision_nano.sh` | Generator SFT | `BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge` | `Cosmos3-Nano` | -| `launch_sft_vision_super.sh` | Generator SFT | `BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge` | `Cosmos3-Super` | -| `launch_sft_llava_ov.sh` | Reasoner SFT | (none; dataset streams from HF Hub) | (none; backbone fetched at startup, or set `VLM_SAFETENSORS_PATH`) | -| `launch_sft_videophy2_nano.sh` | Reasoner SFT | (none; set `VIDEOPHYSICS_ROOT` env) | (none; set `VLM_SAFETENSORS_PATH` env) | - -`WAN_VAE_PATH` defaults to `checkpoints/wan22_vae/Wan2.2_VAE.pth` for every non-reasoner recipe. - -**Reasoner Alignment SFT with VideoPhy-2 (Cosmos3-Nano):** - -```shell -# Step 3 (launch): export both env vars, then launch. -export VIDEOPHYSICS_ROOT=$PWD/data/videophysics -export VLM_SAFETENSORS_PATH=$PWD/checkpoints/Cosmos3-Nano-VLM -bash launch_sft_videophy2_nano.sh -``` - -#### Overriding the defaults - -If you'd rather put data or checkpoints on a different filesystem (e.g. 
a faster SSD or shared mount), download to your chosen path in Step 1 / convert the DCP to your chosen path in Step 2, then export the matching env var(s) before launching: - -```shell -# Example: data on /scratch, base DCP on /nfs/ckpts. -export DATASET_PATH=/scratch/BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge -export BASE_CHECKPOINT_PATH=/nfs/ckpts/Cosmos3-Nano -export WAN_VAE_PATH=/nfs/ckpts/wan22_vae/Wan2.2_VAE.pth -bash launch_sft_vision_nano.sh -``` - -Each env var falls back to its default if unset, so you only need to export the ones you're moving. The downloads / `convert_model_to_dcp` commands in Step 1 + Step 2 just need their `--local-dir` / `-o` argument pointed at the same path you export here. `.gitignore` excludes `data/`, `checkpoints/`, and `outputs/` under this cookbook directory so the multi-GB downloads aren't tracked when you keep the defaults. - -### Option B: raw `torchrun` - -If you'd rather not use the paired launch shell, invoke `torchrun` directly with the recipe's TOML. Unlike Option A, **raw `torchrun` does not auto-resolve `DATASET_PATH` / `BASE_CHECKPOINT_PATH` / `WAN_VAE_PATH`** — they have to come from your shell: - -- `BASE_CHECKPOINT_PATH` and `WAN_VAE_PATH` are read via `${oc.env:BASE_CHECKPOINT_PATH}` / `${oc.env:WAN_VAE_PATH}` at the TOML's `[checkpoint].load_path` / `[model.tokenizer].vae_path` keys. -- `DATASET_PATH` is read via `${oc.env:DATASET_PATH}` inside the experiment-SKU Python (e.g. `cosmos_framework/configs/base/experiment/sft/.py`), not in the TOML. - -You have two options to fill them in (pick either, not both): - -1. **Export them in the shell before `torchrun`** (whether they point at the default `data/` / `checkpoints/` paths from Step 1+2 or your own overrides) — shown below. -2. **Edit the TOML by hand** — open `toml/sft_config/.toml` and replace the `${oc.env:BASE_CHECKPOINT_PATH}` / `${oc.env:WAN_VAE_PATH}` placeholders with literal paths. 
Useful if you want a self-contained TOML you can hand to a colleague or commit alongside an experiment record. (Hand-editing won't help for `DATASET_PATH` — that's resolved out of the experiment Python, so you must still export it.) - -Run from this cookbook directory (`cookbooks/cosmos3/finetune/`) with the framework venv active; the snippet uses `$PWD` to absolutize the relative paths. - -```shell -# This example uses the vision_sft_nano recipe end-to-end (same recipe as -# Option A). To switch recipes, swap TOML_FILE + DATASET_PATH per the table in -# Option A, and Cosmos3-Nano → Cosmos3-Super on the LoRA / super recipes. -TOML_FILE="toml/sft_config/vision_sft_nano.toml" - -# Match the launcher's defaults — or substitute your own paths. -export DATASET_PATH="$PWD/data/BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge" -export BASE_CHECKPOINT_PATH="$PWD/checkpoints/Cosmos3-Nano" -export WAN_VAE_PATH="$PWD/checkpoints/wan22_vae/Wan2.2_VAE.pth" - -IMAGINAIRE_OUTPUT_ROOT=outputs/train \ -torchrun --nproc_per_node=8 -m cosmos_framework.scripts.train \ - --sft-toml=$TOML_FILE -``` - -To resume from the latest in-progress checkpoint, point `BASE_CHECKPOINT_PATH` at the run's `checkpoints/iter_/` directory under `$IMAGINAIRE_OUTPUT_ROOT////` (see [Outputs](#outputs) below for the full layout). - -## Outputs - -Outputs land under `$IMAGINAIRE_OUTPUT_ROOT////`: - -1. `config.yaml`, `config.pkl`: Finalized resolved config (YAML for inspection, pickle for re-instantiation). -1. `launch_info.yaml`, `job_env.yaml`: Job metadata and captured launch environment. -1. `checkpoints/`: - 1. `latest_checkpoint.txt`: Pointer file containing the latest checkpoint directory name (e.g. `iter_000000200`). - 1. `iter_/`: DCP checkpoint saved every `[train.ckpt].save_freq` iterations (zero-padded 9-digit, e.g. `iter_000000200/`): - 1. `model/`: model weights (sharded `.distcp`). - 1. `optim/`: optimizer state. - 1. `scheduler/`: LR scheduler state. - 1. 
`trainer/`: training state — includes the `iteration` counter and per-rank `rng_state_` (numpy + random + torch + torch_cuda). - 1. `dataloader/`: optional per-rank pickle shards (`rank_.pkl`) — only present for dataloaders that implement `has_state()`. -1. `/`: Callback outputs, one directory per registered callback (e.g. `DeviceMonitor/`, `EveryNDrawSample/`, `norm_monitor/`). -1. `wandb/`, `wandb_id.txt`: Wandb run files — only present when `[job].wandb_mode` is `online` or `offline`. - -The shorthand `$RUN_DIR` used in the rest of this page refers to `$IMAGINAIRE_OUTPUT_ROOT///`. For example, with `IMAGINAIRE_OUTPUT_ROOT=outputs/train` and the `vision_sft_nano` recipe, `$RUN_DIR` is `outputs/train/cosmos3/sft/vision_sft_nano`. - -## Export checkpoint to Hugging Face safetensors - -Export the DCP checkpoint produced in Step 3 to a Hugging Face safetensors checkpoint: - -```shell -RUN_DIR=$IMAGINAIRE_OUTPUT_ROOT/// - -CHECKPOINT_ITER=$(cat $RUN_DIR/checkpoints/latest_checkpoint.txt) -CHECKPOINT_PATH=$RUN_DIR/checkpoints/$CHECKPOINT_ITER - -python -m cosmos_framework.scripts.export_model \ - --checkpoint-path $CHECKPOINT_PATH \ - --config-file $RUN_DIR/config.yaml \ - -o $RUN_DIR/model -``` - -The exported safetensors land at `$RUN_DIR/model` and can be used in [Inference](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/inference.md) commands by passing `--checkpoint-path $RUN_DIR/model`. - -## Config - -The recipe TOML is parsed against the pydantic schema [`SFTExperimentConfig`](https://github.com/NVIDIA/cosmos-framework/blob/main/cosmos_framework/configs/toml_config/sft_config.py) at load time. Every top-level key listed below maps to a sub-model in that file; unknown keys raise a `ValidationError` before training starts (`extra="forbid"` on every sub-model). Values may use OmegaConf env interpolation `${oc.env:NAME}` — the recipe TOMLs use this for `BASE_CHECKPOINT_PATH` (`[checkpoint].load_path`) and `WAN_VAE_PATH` (`[model.tokenizer].vae_path`). 
`DATASET_PATH` is consumed the same way but inside the experiment-SKU Python (`cosmos_framework/configs/base/experiment/sft/.py`), not in the TOML. - -For the full field-by-field reference (every section, every default, every VFM/VLM applicability note, the `"???"` MISSING sentinel, env interpolation, the VFM↔VLM path-remap table, and how to extend the schema), see [SFT Structured-TOML Config Reference](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/sft_config.md). - -The commonly tuned knobs: - -1. `[job]` - 1. `task` — `"vfm"` (generator recipes) or `"vlm"` (Reasoner alignment). Picks the base config: `cosmos_framework/configs/base/config.py` vs `…/vlm/config.py`. Also drives `PATH_REMAPS` in `toml_config_helper.py`. - 1. `experiment` — Registered experiment SKU name (e.g. `vision_sft_nano`). Each SKU is a Python file under `cosmos_framework/configs/base/experiment/sft/` that wires up dataloader, model variant, and recipe-specific defaults. - 1. `project`, `group`, `name` — Components of the run output dir `$IMAGINAIRE_OUTPUT_ROOT////`. Also flow to W&B as the project / group / run name. - 1. `wandb_mode` — `"online"` (logs to W&B; `WANDB_API_KEY` must be set), `"offline"` (logs locally, sync later with `wandb sync`), or `"disabled"`. -1. `[model]` - 1. `max_num_tokens_after_packing` — VFM token-packing target. `-1` disables the cap. VFM only; VLM uses `data_setting.max_tokens` (tail override). - 1. `joint_attn_implementation` — VFM attention layout: `"two_way"` / `"three_way"` (NATTEN) / `"flex"`. - 1. `attn_implementation` — VLM attention impl: `"cosmos"` / `"flash_attention_2"` / `"sdpa"` / `"eager"`. VLM only. - 1. `lora_enabled`, `lora_rank`, `lora_alpha`, `lora_target_modules` — LoRA adapter knobs for the generation pathway. Used by SUPER-tier recipes; NANO-tier leaves `lora_enabled=false`. VFM only. -1. `[model.ema]` - 1. `enabled`, `rate`, `iteration_shift` — Exponential moving average of generation-pathway weights. 
Full fine-tunes typically enable it; LoRA recipes leave it off. -1. `[model.parallelism]` - 1. `data_parallel_shard_degree` — FSDP shard degree. `data_parallel_shard_degree × data_parallel_replicate_degree × context_parallel_shard_degree` must equal `WORLD_SIZE`. `-1` autoselects from torchrun world size. - 1. `data_parallel_replicate_degree` — HSDP replicate degree (outer replicate loop over the shard topology). - 1. `context_parallel_shard_degree` — Context-parallel shard degree. `>1` splits the sequence dim across ranks (used by super-tier configs: DP=4, CP=2 → 8 GPUs). - 1. `cfg_parallel_shard_degree` — Classifier-free-guidance shard degree. Almost always `1` for SFT. - 1. `fsdp_master_dtype` — Master parameter / FSDP reduce dtype: typically `"float32"`. -1. `[model.compile]` - 1. `enabled` — Enable `torch.compile`. Improves speed at the cost of memory. - 1. `compile_dynamic` — Whether to compile with symbolic-shape (dynamic) kernels. `True` (default) is appropriate for training; AR inference may prefer `False` for stable shapes. -1. `[model]` - 1. `precision` — Compute dtype for forward/backward: `"bfloat16"` / `"float16"` / `"float32"`. Master weights stay fp32 separately. -1. `[model.activation_checkpointing]` - 1. `mode` — `"none"` / `"selective"` (per-op SAC, MoT-only) / `"full"` (per-block checkpointing). - 1. `save_ops_regex` — Regex patterns for ops to keep saved under `mode="selective"`. - 1. `preserve_rng_state`, `determinism_check` — Recompute determinism plumbing. -1. `[model.tokenizer]` - 1. `vae_path` — Wan2.2 VAE `.pth` path. Recipe TOMLs use `"${oc.env:WAN_VAE_PATH}"`. VFM only. -1. `[optimizer]` - 1. `lr` — Base learning rate. - 1. `betas`, `eps`, `fused`, `weight_decay` — Standard AdamW knobs. `eps` is VFM-only. - 1. `keys_to_select` — Substring allowlist for trainable params. Empty list = train everything; `["lora_"]` = adapter-only fine-tune. -1. `[optimizer.lr_multipliers]` - 1. 
Inline table of ` = ` pairs that scale the LR of params whose name contains the substring. The shipped vision recipes leave this empty (Hydra default `{}` stands). -1. `[scheduler]` - 1. `cycle_lengths`, `warm_up_steps` — Cycle length and warmup duration (lists, one entry per cycle), in optimizer steps. - 1. `f_max`, `f_min`, `f_start` — LR multipliers at peak / trough / step-0 (ratios of `optimizer.lr`). - 1. `verbosity_interval` — Scheduler-side LR log frequency. VFM only. -1. `[trainer]` - 1. `max_iter` — Total optimizer steps. - 1. `grad_accum_iter` — Micro-batches per optimizer step. Effective global batch = `grad_accum_iter × per-rank batch × world_size`. - 1. `logging_iter` — Console / W&B scalar log frequency. - 1. `distributed_parallelism` — `"fsdp"` is the only supported value. -1. `[trainer.callbacks.compile_tokenizer]` - 1. `enabled`, `compile_after_iterations`, `warmup_resolutions` — Lazy `torch.compile` of the VAE tokenizer. VFM only. -1. `[trainer.callbacks.grad_clip]` - 1. `clip_norm` — Max global L2 norm of the gradient (steps with larger norm are rescaled). - 1. `force_finite` — Replace NaN/Inf grads with zero (default `true` on VFM, `false` on VLM). -1. `[checkpoint]` - 1. `load_path` — Base DCP checkpoint directory to resume from (Step 2 output, or a prior run's `checkpoints/iter_/`). Recipe TOMLs use `"${oc.env:BASE_CHECKPOINT_PATH}"`. - 1. `save_iter` — Save a new DCP checkpoint every N optimizer steps. - 1. `keys_to_skip_loading` — Substring blocklist applied at load time. Used to mask EMA / LoRA tensors when warm-starting from a checkpoint that doesn't have them yet. -1. `[dataloader_train]` — Top-level scalars only; the dataloader's class (LazyCall) and pipeline wiring (datasets, packers, …) stay in the experiment Python. - 1. `max_samples_per_batch` — Per-micro-batch sample cap (remapped to `max_batch_size` on the VLM packer). `null` / omitted = no per-count cap. - 1. 
`max_sequence_length` — Per-packed-sequence token cap (remapped to `max_tokens` on the VLM packer). - 1. `seed` — Dataloader RNG seed (VFM only). - -### Common Hydra tail overrides - -These knobs aren't part of the pydantic schema today; pass them as trailing `key.path=value` positionals after `--` (the `cosmos_framework.scripts.train` flow forwards them through OmegaConf): - -- `model.config.policy.backbone.model_name` — VLM backbone HF identifier (e.g. `Qwen/Qwen3-VL-8B-Instruct`). Used by `launch_sft_llava_ov.sh`. -- `data_setting.max_tokens` — VLM token-packing cap (the VLM analogue of `[model].max_num_tokens_after_packing`). Used by `launch_sft_llava_ov.sh`. - -The launchers wire these via `TAIL_OVERRIDES=(…)`; the helper appends `-- "${TAIL_OVERRIDES[@]}"` after the `--sft-toml=` argument. diff --git a/cookbooks/cosmos3/finetune/_sft_launcher_common.sh b/cookbooks/cosmos3/finetune/_sft_launcher_common.sh deleted file mode 100644 index 377b10f9..00000000 --- a/cookbooks/cosmos3/finetune/_sft_launcher_common.sh +++ /dev/null @@ -1,98 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: OpenMDW-1.1 - -# Shared launch plumbing for the cookbook launch_sft_.sh — the -# structured-TOML / pydantic-schema flow that drives cosmos_framework.scripts.train. -# -# REQUIRES: an activated cosmos-framework venv (see the finetune README -# Prerequisites) so `cosmos_framework` is importable. This launcher does NOT -# add the framework's .venv/bin to PATH. -# -# Caller MUST set before sourcing: -# TOML_FILE recipe TOML, e.g. "toml/sft_config/.toml". -# Absolute or cookbook-relative. -# -# Caller MAY set before sourcing (presence drives which existence checks fire): -# DATASET_PATH recipe-local dataset dir, e.g. "data/". -# If unset, no dataset existence check fires -# (reasoner / HF-streaming case). -# BASE_CHECKPOINT_PATH recipe-local base DCP dir, e.g. "checkpoints/". 
-# Setting it also enables WAN_VAE_PATH plumbing + check. -# WAN_VAE_PATH override the default checkpoints/wan22_vae/Wan2.2_VAE.pth. -# EXTRA_DATASET_CHECK bash snippet (string) eval'd after the default checks. -# TAIL_OVERRIDES bash array of Hydra CLI overrides appended after `--` -# (e.g. data_setting.max_tokens=16000 for VLM smokes). -# MASTER_PORT torchrun --master_port; default 50012. -# NPROC_PER_NODE torchrun --nproc_per_node; default 8. -# LOG_FILENAME override $LOG_DIR/${LOG_FILENAME} -# (default _sft.log). -# -# Absolute paths are passed through; relative paths are anchored to the cookbook -# dir (the directory containing this launcher). Paths set in the caller's shell -# via `export DATASET_PATH=...` etc. win over the launcher's defaults (use the -# `: "${VAR:=default}"` idiom in the launcher to preserve this). - -set -uo pipefail - -: "${TOML_FILE:?TOML_FILE must be set before sourcing _sft_launcher_common.sh}" - -# Cookbook dir = the wrapper's own directory (cookbooks/cosmos3/finetune/). -WORKDIR="$(cd "$(dirname "${BASH_SOURCE[1]}")" && pwd)" - -# Anchor relative paths to $WORKDIR. -[[ "$TOML_FILE" = /* ]] || TOML_FILE="$WORKDIR/$TOML_FILE" - -if [[ -n "${DATASET_PATH:-}" ]]; then - [[ "$DATASET_PATH" = /* ]] || DATASET_PATH="$WORKDIR/$DATASET_PATH" - export DATASET_PATH -fi - -if [[ -n "${BASE_CHECKPOINT_PATH:-}" ]]; then - [[ "$BASE_CHECKPOINT_PATH" = /* ]] || BASE_CHECKPOINT_PATH="$WORKDIR/$BASE_CHECKPOINT_PATH" - WAN_VAE_PATH="${WAN_VAE_PATH:-checkpoints/wan22_vae/Wan2.2_VAE.pth}" - [[ "$WAN_VAE_PATH" = /* ]] || WAN_VAE_PATH="$WORKDIR/$WAN_VAE_PATH" - export BASE_CHECKPOINT_PATH WAN_VAE_PATH -fi - -OUTPUT_ROOT="${OUTPUT_ROOT:-$WORKDIR/outputs/train}" -LOG_DIR="$OUTPUT_ROOT/logs" -TOML_STEM="$(basename "$TOML_FILE" .toml)" -LOG_FILE="$LOG_DIR/${LOG_FILENAME:-${TOML_STEM}_sft.log}" -IMAGINAIRE_OUTPUT_ROOT="${IMAGINAIRE_OUTPUT_ROOT:-$OUTPUT_ROOT}" -mkdir -p "$LOG_DIR" - -echo ">>> $(date '+%H:%M:%S') Checking inputs..." 
-[[ -f "$TOML_FILE" ]] || { echo "ERROR: TOML not found: $TOML_FILE" >&2; exit 1; } -if [[ -n "${DATASET_PATH:-}" ]]; then - [[ -d "$DATASET_PATH" ]] || { echo "ERROR: DATASET_PATH not found: $DATASET_PATH (run Step 1 of the finetune README, or export DATASET_PATH=)" >&2; exit 1; } -fi -if [[ -n "${BASE_CHECKPOINT_PATH:-}" ]]; then - [[ -d "$BASE_CHECKPOINT_PATH" ]] || { echo "ERROR: BASE_CHECKPOINT_PATH not found: $BASE_CHECKPOINT_PATH (run Step 2 of the finetune README, or export BASE_CHECKPOINT_PATH=)" >&2; exit 1; } - [[ -f "$WAN_VAE_PATH" ]] || { echo "ERROR: WAN_VAE_PATH not found: $WAN_VAE_PATH (run Step 1 of the finetune README, or export WAN_VAE_PATH=)" >&2; exit 1; } -fi -if [[ -n "${EXTRA_DATASET_CHECK:-}" ]]; then eval "$EXTRA_DATASET_CHECK"; fi - -cd "$WORKDIR" -echo ">>> $(date '+%H:%M:%S') WORKDIR: $WORKDIR" -echo ">>> $(date '+%H:%M:%S') TOML: $TOML_FILE" -[[ -n "${DATASET_PATH:-}" ]] && echo ">>> $(date '+%H:%M:%S') dataset: $DATASET_PATH" -[[ -n "${BASE_CHECKPOINT_PATH:-}" ]] && echo ">>> $(date '+%H:%M:%S') checkpoint: $BASE_CHECKPOINT_PATH" -echo ">>> $(date '+%H:%M:%S') log: $LOG_FILE" - -# Default empty if caller didn't set; safe under set -u. 
-[[ ${TAIL_OVERRIDES+x} ]] || TAIL_OVERRIDES=() - -TRAILING_ARGS=() -if (( ${#TAIL_OVERRIDES[@]} > 0 )); then - TRAILING_ARGS=(-- "${TAIL_OVERRIDES[@]}") -fi - -IMAGINAIRE_OUTPUT_ROOT="$IMAGINAIRE_OUTPUT_ROOT" \ - torchrun --nproc_per_node="${NPROC_PER_NODE:-8}" --master_port="${MASTER_PORT:-50012}" -m cosmos_framework.scripts.train \ - --sft-toml="$TOML_FILE" \ - "${TRAILING_ARGS[@]}" \ - 2>&1 | tee "$LOG_FILE" - -EXIT_CODE=${PIPESTATUS[0]} -echo ">>> $(date '+%H:%M:%S') Done (exit $EXIT_CODE)" -exit $EXIT_CODE diff --git a/cookbooks/cosmos3/finetune/launch_sft_llava_ov.sh b/cookbooks/cosmos3/finetune/launch_sft_llava_ov.sh deleted file mode 100644 index 1967cfca..00000000 --- a/cookbooks/cosmos3/finetune/launch_sft_llava_ov.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env bash -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: OpenMDW-1.1 - -# Structured-TOML launch for llava_ov (VLM SFT on -# lmms-lab/LLaVA-OneVision-Data via CosmosDataLoader). Drives -# cosmos_framework.scripts.train against toml/sft_config/llava_ov.toml. -# -# [job].task = "vlm" — picks cosmos_framework/configs/base/vlm/config.py as the base config. -# -# Requires an activated cosmos-framework venv (see the finetune README -# Prerequisites). Run from cookbooks/cosmos3/finetune/. -# -# The dataset streams from the HuggingFace Hub, so DATASET_PATH / -# WAN_VAE_PATH / BASE_CHECKPOINT_PATH are NOT required. -# -# Optional env: -# HF_TOKEN for gated Qwen3-VL-8B-Instruct downloads. -# VLM_SAFETENSORS_PATH local directory of pre-converted Qwen3-VL safetensors -# (e.g. a Cosmos3-Nano LM merged with Qwen3-VL visual via -# `cosmos_framework.scripts.convert_model_to_vlm_safetensors`). -# When set, plumbed to backbone.safetensors_path via a -# tail override. When unset, the framework falls back -# to the public Qwen/Qwen3-VL-8B-Instruct HF snapshot. 
-# -# Usage (8-GPU allocation, framework venv active, from cookbooks/cosmos3/finetune/): -# bash launch_sft_llava_ov.sh - -TOML_FILE="toml/sft_config/llava_ov.toml" - -TAIL_OVERRIDES=( - ${EXTRA_TAIL_OVERRIDES:-} -) - -# When VLM_SAFETENSORS_PATH is set, plumb it to backbone.safetensors_path so the -# framework loads weights from the local snapshot (e.g. a Cosmos3-Nano LM merged -# with Qwen3-VL visual via `cosmos_framework.scripts.convert_model_to_vlm_safetensors`) -# while keeping the public HF model_name for tokenizer/architecture discovery. -if [[ -n "${VLM_SAFETENSORS_PATH:-}" ]]; then - TAIL_OVERRIDES+=("model.config.policy.backbone.safetensors_path=$VLM_SAFETENSORS_PATH") -fi - -source "$(dirname "${BASH_SOURCE[0]}")/_sft_launcher_common.sh" diff --git a/cookbooks/cosmos3/finetune/launch_sft_videophy2_nano.sh b/cookbooks/cosmos3/finetune/launch_sft_videophy2_nano.sh deleted file mode 100644 index 9499cc5c..00000000 --- a/cookbooks/cosmos3/finetune/launch_sft_videophy2_nano.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env bash -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: OpenMDW-1.1 - -# Structured-TOML launch for videophy2_sft_nano (VLM dialog SFT on VideoPhy-2 -# via CosmosDataLoader). Drives cosmos_framework.scripts.train against -# toml/sft_config/videophy2_sft_nano.toml. -# -# [job].task = "vlm" — picks cosmos_framework/configs/base/vlm/config.py as the base config. -# -# Requires an activated cosmos-framework venv (see the finetune README -# Prerequisites). Run from cookbooks/cosmos3/finetune/. -# -# Required env: -# VIDEOPHYSICS_ROOT dir containing videophy2_train/ and videophy2_val/ -# (each with meta.json + media/ + text/). Populate via -# `python -m cosmos_framework.scripts.vlm.prepare_videophy2_from_hf`. -# -# Optional env: -# HF_TOKEN for gated Qwen3-VL-8B-Instruct downloads. -# VLM_SAFETENSORS_PATH local directory of pre-converted Qwen3-VL safetensors -# (e.g. 
Cosmos3-Nano LM merged with Qwen3-VL visual via -# `cosmos_framework.scripts.convert_model_to_vlm_safetensors`). -# When set, plumbed to backbone.safetensors_path via a -# tail override. When unset, the framework falls back -# to the public Qwen/Qwen3-VL-8B-Instruct HF snapshot. -# -# Usage (8-GPU allocation, framework venv active, from cookbooks/cosmos3/finetune/): -# VIDEOPHYSICS_ROOT=/path/to/videophysics bash launch_sft_videophy2_nano.sh - -TOML_FILE="toml/sft_config/videophy2_sft_nano.toml" - -TAIL_OVERRIDES=( - ${EXTRA_TAIL_OVERRIDES:-} -) - -# When VLM_SAFETENSORS_PATH is set, plumb it to backbone.safetensors_path so the -# framework loads weights from the local snapshot (e.g. a Cosmos3-Nano LM merged -# with Qwen3-VL visual via `cosmos_framework.scripts.convert_model_to_vlm_safetensors`) -# while keeping the public HF model_name for tokenizer/architecture discovery. -if [[ -n "${VLM_SAFETENSORS_PATH:-}" ]]; then - TAIL_OVERRIDES+=("model.config.policy.backbone.safetensors_path=$VLM_SAFETENSORS_PATH") -fi - -source "$(dirname "${BASH_SOURCE[0]}")/_sft_launcher_common.sh" diff --git a/cookbooks/cosmos3/finetune/launch_sft_vision_nano.sh b/cookbooks/cosmos3/finetune/launch_sft_vision_nano.sh deleted file mode 100644 index d67c4ddc..00000000 --- a/cookbooks/cosmos3/finetune/launch_sft_vision_nano.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env bash -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: OpenMDW-1.1 - -# Structured-TOML launch for vision_sft_nano (T2V / I2V / V2V vision-only -# SFT on Qwen3-VL-8B, 8-GPU FSDP). Drives cosmos_framework.scripts.train against -# toml/sft_config/vision_sft_nano.toml. -# -# Requires an activated cosmos-framework venv (see the finetune README -# Prerequisites). Run from cookbooks/cosmos3/finetune/. 
-# -# Optional env vars (defaults below point under this cookbook dir; override to -# put data or checkpoints on a different filesystem): -# DATASET_PATH default: data/BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge -# (must contain train/video_dataset_file.jsonl) -# BASE_CHECKPOINT_PATH default: checkpoints/Cosmos3-Nano -# WAN_VAE_PATH default: checkpoints/wan22_vae/Wan2.2_VAE.pth -# HF_TOKEN if any tokenizer download requires gated HF access -# OUTPUT_ROOT default: outputs/train -# -# Usage (8-GPU allocation, framework venv active, from cookbooks/cosmos3/finetune/): -# bash launch_sft_vision_nano.sh - -TOML_FILE="toml/sft_config/vision_sft_nano.toml" -: "${DATASET_PATH:=data/BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge}" -: "${BASE_CHECKPOINT_PATH:=checkpoints/Cosmos3-Nano}" - -EXTRA_DATASET_CHECK='[[ -f "$DATASET_PATH/train/video_dataset_file.jsonl" ]] || { echo "ERROR: missing $DATASET_PATH/train/video_dataset_file.jsonl" >&2; exit 1; }' - -source "$(dirname "${BASH_SOURCE[0]}")/_sft_launcher_common.sh" diff --git a/cookbooks/cosmos3/finetune/launch_sft_vision_super.sh b/cookbooks/cosmos3/finetune/launch_sft_vision_super.sh deleted file mode 100644 index 54bfde97..00000000 --- a/cookbooks/cosmos3/finetune/launch_sft_vision_super.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env bash -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: OpenMDW-1.1 - -# Structured-TOML launch for vision_sft_super (T2V / I2V / V2V LoRA SFT on -# Qwen3-VL-32B-Instruct, 8-GPU FSDP with CP=2 / DP=4). Drives -# cosmos_framework.scripts.train against toml/sft_config/vision_sft_super.toml. -# -# Requires an activated cosmos-framework venv (see the finetune README -# Prerequisites). Run from cookbooks/cosmos3/finetune/. 
-# -# Optional env vars (defaults below point under this cookbook dir; override to -# put data or checkpoints on a different filesystem): -# DATASET_PATH default: data/BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge -# (must contain train/video_dataset_file.jsonl) -# BASE_CHECKPOINT_PATH default: checkpoints/Cosmos3-Super -# WAN_VAE_PATH default: checkpoints/wan22_vae/Wan2.2_VAE.pth -# HF_TOKEN if any tokenizer download requires gated HF access -# OUTPUT_ROOT default: outputs/train -# -# Usage (8-GPU allocation, framework venv active, from cookbooks/cosmos3/finetune/): -# bash launch_sft_vision_super.sh - -TOML_FILE="toml/sft_config/vision_sft_super.toml" -: "${DATASET_PATH:=data/BridgeData2-Subset-Synthetic-Captions/sft_dataset_bridge}" -: "${BASE_CHECKPOINT_PATH:=checkpoints/Cosmos3-Super}" - -EXTRA_DATASET_CHECK='[[ -f "$DATASET_PATH/train/video_dataset_file.jsonl" ]] || { echo "ERROR: missing $DATASET_PATH/train/video_dataset_file.jsonl" >&2; exit 1; }' - -# Super-variant env tweaks: clear LD_LIBRARY_PATH to avoid host CUDA/NCCL libs -# bleeding into the venv, switch the allocator to expandable_segments so the -# 32B backbone fits without OOM during compile/decode. -export LD_LIBRARY_PATH="" -export PYTORCH_ALLOC_CONF="${PYTORCH_ALLOC_CONF:-expandable_segments:True}" - -source "$(dirname "${BASH_SOURCE[0]}")/_sft_launcher_common.sh" diff --git a/cookbooks/cosmos3/generator/audiovisual/finetune/README.md b/cookbooks/cosmos3/generator/audiovisual/finetune/README.md new file mode 100644 index 00000000..77dd1a04 --- /dev/null +++ b/cookbooks/cosmos3/generator/audiovisual/finetune/README.md @@ -0,0 +1,58 @@ +# Cosmos3 Vision Generator Fine-Tuning (SFT) + +Supervised fine-tuning (SFT) of the Cosmos3 video generator on your own captioned video data. Tested on 8×H100 (80 GB). 
+ +| Recipe | Launch shell | Base model | Dataset | +| --- | --- | --- | --- | +| Vision SFT (full) | `launch_sft_vision_nano.sh` | Cosmos3-Nano | [BridgeData2-Subset-Synthetic-Captions](https://huggingface.co/datasets/nvidia/BridgeData2-Subset-Synthetic-Captions) | +| Vision SFT (LoRA) | `launch_sft_vision_super.sh` | Cosmos3-Super | same as above | + +Both recipes train on structured-JSON captions (`caption_json`, the model's native prompt format), so training stays aligned with inference. + +## Prerequisites + +1. **Install the framework.** These recipes drive `cosmos_framework.scripts.train`, so install a cosmos-framework checkout first — follow the shared [Cosmos Framework setup](../../../README.md#cosmos-framework) (clone into `packages/cosmos3`, then `uv sync --all-extras --group=cu130-train`; use `cu128-train` on a CUDA 12.x driver). +2. **Recommended container.** For a curated CUDA + PyTorch base, NVIDIA recommends starting from the NGC PyTorch container **`nvcr.io/nvidia/pytorch:25.09-py3`** (CUDA 13; use **`:25.06-py3`** for a CUDA 12.8 driver). See the framework [setup guide](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/setup.md#recommended-base-image). +3. **Activate** the framework venv so `cosmos_framework` is importable: `source <repo-root>/packages/cosmos3/.venv/bin/activate`. +4. **Hugging Face access.** Some assets are license-gated — accept terms on the dataset/model pages and authenticate once with `uvx hf@latest auth login` (or export `HF_TOKEN`). +5. **Run from this directory** (`cookbooks/cosmos3/generator/audiovisual/finetune/`). Downloads, converted checkpoints, and run outputs default to `data/`, `checkpoints/`, and `outputs/` here (all git-ignored). 
+ +## Quick start + +Each launcher is a complete recipe — run it from this folder and it downloads the dataset, fetches the Wan2.2 VAE, converts the base checkpoint, then runs 8-GPU training (the download/convert steps are skipped if their outputs already exist): + +```shell +bash launch_sft_vision_nano.sh # full SFT on Cosmos3-Nano +# or +bash launch_sft_vision_super.sh # LoRA SFT on Cosmos3-Super +``` + +Paths are fixed at the top of each script (under this git-ignored folder) — edit them there to put data or checkpoints on another filesystem. + +## Outputs + +Training writes to `outputs/train/<project>/<group>/<name>/`: + +- `checkpoints/iter_<N>/` — DCP checkpoint (model / optim / scheduler / trainer state); `checkpoints/latest_checkpoint.txt` names the newest. +- `config.yaml`, launch metadata, logs, and one directory per registered callback. + +## Export to Hugging Face safetensors + +```shell +RUN_DIR=outputs/train/<project>/<group>/<name> +CKPT=$RUN_DIR/checkpoints/$(cat "$RUN_DIR/checkpoints/latest_checkpoint.txt") +python -m cosmos_framework.scripts.export_model \ + --checkpoint-path "$CKPT" --config-file "$RUN_DIR/config.yaml" -o "$RUN_DIR/model" +``` + +Use the exported `$RUN_DIR/model` with the [audiovisual inference cookbook](../README.md). + +## Advanced configuration + +These recipes are intentionally minimal. 
For the full post-training reference — raw `torchrun`, resuming, every TOML field, parallelism / LoRA / EMA knobs, and the VFM↔VLM remap — see the canonical framework docs: + +- [Post-Training (SFT) guide](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/training.md) +- [SFT structured-TOML config reference](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/sft_config.md) +- [JSONL dataset format](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/dataset_jsonl.md) · [environment variables](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/environment_variables.md) · [FAQ / OOM during SFT](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/faq.md) + +> SFT here is a multi-GPU `torchrun` job, so these cookbooks ship as launch scripts + this README rather than a one-click notebook. diff --git a/cookbooks/cosmos3/generator/audiovisual/finetune/launch_sft_vision_nano.sh b/cookbooks/cosmos3/generator/audiovisual/finetune/launch_sft_vision_nano.sh new file mode 100644 index 00000000..52b3d9f2 --- /dev/null +++ b/cookbooks/cosmos3/generator/audiovisual/finetune/launch_sft_vision_nano.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +# Complete recipe: Vision SFT on Cosmos3-Nano (T2V / I2V / V2V, 8x H100). +# Run from this folder with the cosmos-framework venv active (see README): +# bash launch_sft_vision_nano.sh +# It downloads the data, prepares the base checkpoint, and trains — in order. +# Paths are fixed under this (git-ignored) folder; edit them below to relocate. + +set -euo pipefail +cd "$(dirname "${BASH_SOURCE[0]}")" + +DATASET_DIR="$PWD/data/BridgeData2-Subset-Synthetic-Captions" +CHECKPOINT_DIR="$PWD/checkpoints/Cosmos3-Nano" +VAE_PATH="$PWD/checkpoints/wan22_vae/Wan2.2_VAE.pth" + +# 1. Download the SFT dataset (skipped if present; license-gated — accept terms + 'uvx hf@latest auth login'). 
+if [[ ! -f "$DATASET_DIR/sft_dataset_bridge/train/video_dataset_file.jsonl" ]]; then + uvx hf@latest download --repo-type dataset nvidia/BridgeData2-Subset-Synthetic-Captions \ + --revision 40d018ac1c1a2a4b9734f17fdb21f3d933c49a01 --local-dir "$DATASET_DIR" +fi + +# 2. Download the Wan2.2 VAE (skipped if present). +if [[ ! -f "$VAE_PATH" ]]; then + uvx hf@latest download Wan-AI/Wan2.2-TI2V-5B Wan2.2_VAE.pth --local-dir "$(dirname "$VAE_PATH")" +fi + +# 3. Convert the base checkpoint to DCP (skipped if present). +if [[ ! -d "$CHECKPOINT_DIR" ]]; then + python -m cosmos_framework.scripts.convert_model_to_dcp -o "$CHECKPOINT_DIR" --checkpoint-path Cosmos3-Nano +fi + +# 4. Train (8-GPU FSDP). The TOML reads these three paths from the environment. +export DATASET_PATH="$DATASET_DIR/sft_dataset_bridge" +export BASE_CHECKPOINT_PATH="$CHECKPOINT_DIR" +export WAN_VAE_PATH="$VAE_PATH" +IMAGINAIRE_OUTPUT_ROOT="$PWD/outputs/train" torchrun --nproc_per_node=8 \ + -m cosmos_framework.scripts.train --sft-toml="toml/sft_config/vision_sft_nano.toml" diff --git a/cookbooks/cosmos3/generator/audiovisual/finetune/launch_sft_vision_super.sh b/cookbooks/cosmos3/generator/audiovisual/finetune/launch_sft_vision_super.sh new file mode 100644 index 00000000..e4dd114d --- /dev/null +++ b/cookbooks/cosmos3/generator/audiovisual/finetune/launch_sft_vision_super.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +# Complete recipe: Vision LoRA SFT on Cosmos3-Super (T2V / I2V / V2V, 8x H100). +# Run from this folder with the cosmos-framework venv active (see README): +# bash launch_sft_vision_super.sh +# It downloads the data, prepares the base checkpoint, and trains — in order. +# Paths are fixed under this (git-ignored) folder; edit them below to relocate. 
+ +set -euo pipefail +cd "$(dirname "${BASH_SOURCE[0]}")" + +DATASET_DIR="$PWD/data/BridgeData2-Subset-Synthetic-Captions" +CHECKPOINT_DIR="$PWD/checkpoints/Cosmos3-Super" +VAE_PATH="$PWD/checkpoints/wan22_vae/Wan2.2_VAE.pth" + +# 1. Download the SFT dataset (skipped if present; license-gated — accept terms + 'uvx hf@latest auth login'). +if [[ ! -f "$DATASET_DIR/sft_dataset_bridge/train/video_dataset_file.jsonl" ]]; then + uvx hf@latest download --repo-type dataset nvidia/BridgeData2-Subset-Synthetic-Captions \ + --revision 40d018ac1c1a2a4b9734f17fdb21f3d933c49a01 --local-dir "$DATASET_DIR" +fi + +# 2. Download the Wan2.2 VAE (skipped if present). +if [[ ! -f "$VAE_PATH" ]]; then + uvx hf@latest download Wan-AI/Wan2.2-TI2V-5B Wan2.2_VAE.pth --local-dir "$(dirname "$VAE_PATH")" +fi + +# 3. Convert the base checkpoint to DCP (skipped if present). +if [[ ! -d "$CHECKPOINT_DIR" ]]; then + python -m cosmos_framework.scripts.convert_model_to_dcp -o "$CHECKPOINT_DIR" --checkpoint-path Cosmos3-Super +fi + +# 4. Train (8-GPU FSDP, CP=2 / DP=4). LD_LIBRARY_PATH is cleared so host CUDA/NCCL libs +# don't bleed into the venv; expandable_segments lets the 32B backbone fit without OOM. 
+export LD_LIBRARY_PATH="" +export PYTORCH_ALLOC_CONF="expandable_segments:True" +export DATASET_PATH="$DATASET_DIR/sft_dataset_bridge" +export BASE_CHECKPOINT_PATH="$CHECKPOINT_DIR" +export WAN_VAE_PATH="$VAE_PATH" +IMAGINAIRE_OUTPUT_ROOT="$PWD/outputs/train" torchrun --nproc_per_node=8 \ + -m cosmos_framework.scripts.train --sft-toml="toml/sft_config/vision_sft_super.toml" diff --git a/cookbooks/cosmos3/finetune/toml/sft_config/vision_sft_nano.toml b/cookbooks/cosmos3/generator/audiovisual/finetune/toml/sft_config/vision_sft_nano.toml similarity index 100% rename from cookbooks/cosmos3/finetune/toml/sft_config/vision_sft_nano.toml rename to cookbooks/cosmos3/generator/audiovisual/finetune/toml/sft_config/vision_sft_nano.toml diff --git a/cookbooks/cosmos3/finetune/toml/sft_config/vision_sft_super.toml b/cookbooks/cosmos3/generator/audiovisual/finetune/toml/sft_config/vision_sft_super.toml similarity index 100% rename from cookbooks/cosmos3/finetune/toml/sft_config/vision_sft_super.toml rename to cookbooks/cosmos3/generator/audiovisual/finetune/toml/sft_config/vision_sft_super.toml diff --git a/cookbooks/cosmos3/reasoner/finetune/README.md b/cookbooks/cosmos3/reasoner/finetune/README.md new file mode 100644 index 00000000..ff7816da --- /dev/null +++ b/cookbooks/cosmos3/reasoner/finetune/README.md @@ -0,0 +1,58 @@ +# Cosmos3 Reasoner Fine-Tuning (SFT) + +Supervised fine-tuning (SFT) of the Cosmos3 Reasoner (VLM) on your own data. Tested on 8×H100 (80 GB). 
+ +| Recipe | Launch shell | Dataset | Notes | +| --- | --- | --- | --- | +| Alignment SFT (LLaVA-OneVision) | `launch_sft_llava_ov.sh` | [lmms-lab/LLaVA-OneVision-Data](https://huggingface.co/datasets/lmms-lab/LLaVA-OneVision-Data) | Streams from HF; backbone fetched at startup — no local prep | +| Physical-plausibility SFT (VideoPhy-2) | `launch_sft_videophy2_nano.sh` | [videophysics/videophy2_train](https://huggingface.co/datasets/videophysics/videophy2_train) | 1–5 plausibility scoring; dataset + checkpoint auto-prepared | + +Both use `[job].task = "vlm"` and bootstrap from `Qwen/Qwen3-VL-8B-Instruct` (optionally a merged Cosmos3-Nano reasoner snapshot). + +## Prerequisites + +1. **Install the framework.** These recipes drive `cosmos_framework.scripts.train`, so install a cosmos-framework checkout first — follow the shared [Cosmos Framework setup](../../README.md#cosmos-framework) (clone into `packages/cosmos3`, then `uv sync --all-extras --group=cu130-train`; use `cu128-train` on a CUDA 12.x driver). +2. **Recommended container.** For a curated CUDA + PyTorch base, NVIDIA recommends starting from the NGC PyTorch container **`nvcr.io/nvidia/pytorch:25.09-py3`** (CUDA 13; use **`:25.06-py3`** for a CUDA 12.8 driver). See the framework [setup guide](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/setup.md#recommended-base-image). +3. **Activate** the framework venv so `cosmos_framework` is importable: `source <repo-root>/packages/cosmos3/.venv/bin/activate`. +4. **Hugging Face access.** The Qwen3-VL backbone and datasets are fetched from HF — authenticate once with `uvx hf@latest auth login` (or export `HF_TOKEN`); accept any dataset terms first. +5. **Run from this directory** (`cookbooks/cosmos3/reasoner/finetune/`). Any downloads, converted checkpoints, and run outputs default to `data/`, `checkpoints/`, and `outputs/` here (all git-ignored). 
+ +## Quick start + +Each launcher is a complete recipe — just run it from this folder: + +```shell +bash launch_sft_llava_ov.sh # alignment SFT; dataset streams from HF, backbone fetched at startup +# or +bash launch_sft_videophy2_nano.sh # first run materializes VideoPhy-2 + builds the merged Cosmos3-Nano VLM checkpoint, then trains +``` + +The VideoPhy-2 download/convert steps are skipped once their outputs exist. Paths are fixed at the top of each script (under this git-ignored folder) — edit them there to relocate data or checkpoints. + +## Outputs + +Training writes to `outputs/train/<project>/<group>/<name>/`: + +- `checkpoints/iter_<N>/` — DCP checkpoint (model / optim / scheduler / trainer state); `checkpoints/latest_checkpoint.txt` names the newest. +- `config.yaml`, launch metadata, logs, and one directory per registered callback. + +## Export to Hugging Face safetensors + +```shell +RUN_DIR=outputs/train/<project>/<group>/<name> +CKPT=$RUN_DIR/checkpoints/$(cat "$RUN_DIR/checkpoints/latest_checkpoint.txt") +python -m cosmos_framework.scripts.export_model \ + --checkpoint-path "$CKPT" --config-file "$RUN_DIR/config.yaml" -o "$RUN_DIR/model" +``` + +Use the exported `$RUN_DIR/model` with the [reasoner inference cookbook](../README.md). + +## Advanced configuration + +These recipes are intentionally minimal. 
For the full post-training reference — raw `torchrun`, resuming, every TOML field, and advanced parallelism — see the canonical framework docs: + +- [Post-Training (SFT) guide](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/training.md) +- [SFT structured-TOML config reference](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/sft_config.md) +- [JSONL dataset format](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/dataset_jsonl.md) · [environment variables](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/environment_variables.md) · [FAQ / OOM during SFT](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/faq.md) + +> SFT here is a multi-GPU `torchrun` job, so these cookbooks ship as launch scripts + this README rather than a one-click notebook. diff --git a/cookbooks/cosmos3/reasoner/finetune/launch_sft_llava_ov.sh b/cookbooks/cosmos3/reasoner/finetune/launch_sft_llava_ov.sh new file mode 100644 index 00000000..844f5a3b --- /dev/null +++ b/cookbooks/cosmos3/reasoner/finetune/launch_sft_llava_ov.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +# Complete recipe: Reasoner alignment SFT on LLaVA-OneVision (8x H100). +# Run from this folder with the cosmos-framework venv active (see README): +# bash launch_sft_llava_ov.sh +# The dataset streams from HuggingFace and the Qwen3-VL-8B-Instruct backbone is +# fetched at startup, so there's nothing to download first — this just trains. + +set -euo pipefail +cd "$(dirname "${BASH_SOURCE[0]}")" + +# Train (8-GPU FSDP). 
+IMAGINAIRE_OUTPUT_ROOT="$PWD/outputs/train" torchrun --nproc_per_node=8 \ + -m cosmos_framework.scripts.train --sft-toml="toml/sft_config/llava_ov.toml" diff --git a/cookbooks/cosmos3/reasoner/finetune/launch_sft_videophy2_nano.sh b/cookbooks/cosmos3/reasoner/finetune/launch_sft_videophy2_nano.sh new file mode 100644 index 00000000..30648a8a --- /dev/null +++ b/cookbooks/cosmos3/reasoner/finetune/launch_sft_videophy2_nano.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +# Complete recipe: Reasoner physical-plausibility SFT on VideoPhy-2 (8x H100). +# Run from this folder with the cosmos-framework venv active (see README): +# bash launch_sft_videophy2_nano.sh +# It materializes the dataset, builds the merged Cosmos3-Nano VLM checkpoint, and +# trains — in order. Paths are fixed under this (git-ignored) folder. + +set -euo pipefail +cd "$(dirname "${BASH_SOURCE[0]}")" + +VIDEOPHYSICS_ROOT="$PWD/data/videophysics" +VLM_CHECKPOINT="$PWD/checkpoints/Cosmos3-Nano-VLM" + +# 1. Materialize the VideoPhy-2 dataset (skipped if present). +if [[ ! -d "$VIDEOPHYSICS_ROOT/videophy2_train" ]]; then + python -m cosmos_framework.scripts.vlm.prepare_videophy2_from_hf --out_root "$VIDEOPHYSICS_ROOT" --split both +fi + +# 2. Merge Cosmos3-Nano LM onto the Qwen3-VL-8B-Instruct visual tower (skipped if present). +if [[ ! -d "$VLM_CHECKPOINT" ]]; then + python -m cosmos_framework.scripts.convert_model_to_vlm_safetensors --checkpoint-path Cosmos3-Nano -o "$VLM_CHECKPOINT" +fi + +# 3. Train (8-GPU FSDP). VIDEOPHYSICS_ROOT is read from the environment; the +# merged checkpoint is supplied as a config override after `--`. 
+export VIDEOPHYSICS_ROOT +IMAGINAIRE_OUTPUT_ROOT="$PWD/outputs/train" torchrun --nproc_per_node=8 \ + -m cosmos_framework.scripts.train --sft-toml="toml/sft_config/videophy2_sft_nano.toml" \ + -- model.config.policy.backbone.safetensors_path="$VLM_CHECKPOINT" diff --git a/cookbooks/cosmos3/finetune/toml/sft_config/llava_ov.toml b/cookbooks/cosmos3/reasoner/finetune/toml/sft_config/llava_ov.toml similarity index 100% rename from cookbooks/cosmos3/finetune/toml/sft_config/llava_ov.toml rename to cookbooks/cosmos3/reasoner/finetune/toml/sft_config/llava_ov.toml diff --git a/cookbooks/cosmos3/finetune/toml/sft_config/videophy2_sft_nano.toml b/cookbooks/cosmos3/reasoner/finetune/toml/sft_config/videophy2_sft_nano.toml similarity index 100% rename from cookbooks/cosmos3/finetune/toml/sft_config/videophy2_sft_nano.toml rename to cookbooks/cosmos3/reasoner/finetune/toml/sft_config/videophy2_sft_nano.toml