-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrain_cpt.sh
More file actions
38 lines (31 loc) · 1.59 KB
/
train_cpt.sh
File metadata and controls
38 lines (31 loc) · 1.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#!/usr/bin/env bash
# ═══════════════════════════════════════════════════════════════════════════════
# Launch CPT training with Axolotl
# Run this on the DGX Spark after the data pipeline is complete.
# ═══════════════════════════════════════════════════════════════════════════════
# Strict mode: abort on command failure, unset variables, and failed pipelines.
set -euo pipefail
# Resolve the directory that contains this script and run from there, so the
# relative paths used below (configs/, output/) do not depend on the caller's
# working directory.
REPO="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
cd -- "$REPO"
# Activate training venv (nohup/cron starts with a bare env)
# shellcheck source=/dev/null
source "$REPO/.venv/bin/activate"
# Load HF_TOKEN, optional WANDB_* etc.
# The file is sourced by explicit path: for a name without a slash, bash
# searches $PATH before the current directory, so a bare `source .env` could
# pick up an unrelated file.
if [[ ! -r "$REPO/.env" ]]; then
  printf 'error: cannot read %s\n' "$REPO/.env" >&2
  exit 1
fi
# set -a exports every variable assigned while the file is sourced, so the
# values reach the training process; set +a restores normal behaviour.
# shellcheck source=/dev/null
set -a; source "$REPO/.env"; set +a
# NOTE(review): presumably opts out of Axolotl's usage telemetry — confirm
# against the Axolotl documentation.
export AXOLOTL_DO_NOT_TRACK=1
# Startup banner: one line per argument, followed by a blank separator line.
printf '%s\n' \
  "=== Starting CPT training ===" \
  "Config : configs/cpt_config.yaml" \
  "Output : output/cpt" \
  ""
#######################################
# Print the checkpoint directory with the highest step number.
# Only directories named checkpoint-<digits> are considered; regular files
# and entries with a non-numeric suffix are ignored. Step numbers are
# compared numerically, so checkpoint-100 ranks above checkpoint-20.
# Arguments: $1 - directory that holds the checkpoint-* subdirectories
# Outputs:   path of the latest checkpoint on stdout; nothing if none exists
# Returns:   0 in all cases (a missing directory simply yields no output)
#######################################
find_latest_checkpoint() {
  local out_dir=$1
  local candidate step
  local best="" best_step=-1
  for candidate in "$out_dir"/checkpoint-*; do
    # Also skips the literal, unexpanded pattern when the glob matches nothing.
    [[ -d "$candidate" ]] || continue
    step=${candidate##*/checkpoint-}
    [[ "$step" =~ ^[0-9]+$ ]] || continue
    # 10# forces base 10 so a zero-padded step is not parsed as octal.
    if (( 10#$step > best_step )); then
      best_step=$(( 10#$step ))
      best=$candidate
    fi
  done
  if [[ -n "$best" ]]; then
    printf '%s\n' "$best"
  fi
}

# Auto-resume: find latest checkpoint and pass it as a path string.
# axolotl's pydantic schema requires a string path, not a boolean.
LATEST_CKPT=$(find_latest_checkpoint output/cpt)
# Build the command once as an array; the resume flag is appended only when a
# checkpoint was found, so both cases share a single launch line.
launch_cmd=(accelerate launch -m axolotl.cli.train configs/cpt_config.yaml)
if [[ -n "$LATEST_CKPT" ]]; then
  echo "Resuming from checkpoint: $LATEST_CKPT"
  launch_cmd+=(--resume_from_checkpoint "$LATEST_CKPT")
else
  echo "No checkpoint found — starting from scratch"
fi
"${launch_cmd[@]}"
# Completion banner: a blank separator line, then the status and output path.
printf '\n%s\n%s\n' \
  "=== CPT training complete ===" \
  "Adapter saved to: output/cpt"