-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrain_sft.sh
More file actions
53 lines (46 loc) · 2.52 KB
/
train_sft.sh
File metadata and controls
53 lines (46 loc) · 2.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/usr/bin/env bash
# ═══════════════════════════════════════════════════════════════════════════════
# Launch SFT training with Axolotl
# Run this AFTER CPT is complete and lora_weights is set in sft_config.yaml
# (run_pipeline.sh does this automatically after merging the CPT LoRA).
#
# Known issues fixed vs. the naive invocation:
# - AXOLOTL_DO_NOT_TRACK=1 : whitelist.yaml is missing in GitHub HEAD installs
# - source .env : loads HF_TOKEN so gated model download works
# - auto-resume : passes --resume_from_checkpoint if a checkpoint
# exists in output/sft/ (axolotl pydantic requires
# a string path, not resume_from_checkpoint: true)
# - REPO + cd : works correctly whether called directly or via
# run_pipeline.sh (which already cd'd to REPO)
# ═══════════════════════════════════════════════════════════════════════════════
set -euo pipefail

# Resolve the directory that contains this script and run from there, so the
# relative paths used below (.env, configs/, output/) do not depend on the
# caller's working directory.
REPO="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$REPO"

# Activate training venv (nohup/cron starts with a bare env)
# shellcheck source=/dev/null
source "$REPO/.venv/bin/activate"

# Load HF_TOKEN, optional WANDB_* etc.
# `set -a` marks every variable assigned while .env is sourced for export, so
# the child training process inherits them; `set +a` restores normal behavior.
set -a
# shellcheck source=/dev/null
source .env
set +a

# axolotl GitHub HEAD installs lack whitelist.yaml — disable telemetry to avoid crash
export AXOLOTL_DO_NOT_TRACK=1

echo "=== Starting SFT training ==="
echo "Config : configs/sft_config.yaml"
echo "Output : output/sft"
echo ""

# Auto-resume: if a checkpoint exists in output/sft/, continue from it.
# axolotl's pydantic schema requires a string path, not a boolean.
#
# Discover candidates with a glob instead of parsing `ls` output:
#   - the trailing slash restricts matches to directories, so stray files
#     named checkpoint-* are never offered as a resume target;
#   - nullglob turns "no match" into an empty array rather than the literal
#     pattern, so no error suppression is needed.
shopt -s nullglob
ckpt_dirs=(output/sft/checkpoint-*/)
shopt -u nullglob

LATEST_CKPT=""
if (( ${#ckpt_dirs[@]} > 0 )); then
  # Version sort so that checkpoint-1000 ranks above checkpoint-200.
  # "${ckpt_dirs[@]%/}" drops the trailing slash added by the glob.
  LATEST_CKPT=$(printf '%s\n' "${ckpt_dirs[@]%/}" | sort -V | tail -n 1)
fi

# Build the launch command once; the resume flag is appended only when a
# checkpoint was found, so both paths share the exact same base invocation.
train_cmd=(accelerate launch -m axolotl.cli.train configs/sft_config.yaml)
if [[ -n "$LATEST_CKPT" ]]; then
  echo "Resuming from checkpoint: $LATEST_CKPT"
  train_cmd+=(--resume_from_checkpoint "$LATEST_CKPT")
else
  echo "No checkpoint found — starting from scratch"
fi

# Under `set -e`, a non-zero exit from training aborts the script here, so the
# completion banner below is printed only on success.
"${train_cmd[@]}"

echo ""
echo "=== SFT training complete ==="
echo "Adapter saved to: output/sft"
echo ""
echo "To merge the SFT LoRA into the base model:"
echo " accelerate launch -m axolotl.cli.merge_lora configs/sft_config.yaml \\"
echo " --lora-model-dir output/sft"