-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdvc.yaml
More file actions
68 lines (66 loc) · 1.81 KB
/
dvc.yaml
File metadata and controls
68 lines (66 loc) · 1.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# DVC pipeline with three stages, chained through their outputs:
#   tokenize -> out/tokenize -> train -> out/train/checkpoints/best.ckpt -> export
stages:
  # Configure and build the C++ tokenizer with CMake (Release), then run the
  # resulting binary. The stage's declared output is out/tokenize.
  tokenize:
    # Folded scalar (>-): the lines below are joined with single spaces into
    # one shell command, so the `&&` chain stops at the first failing step.
    cmd: >-
      mkdir -p tokenizer/build
      && cmake -S tokenizer -B tokenizer/build -DCMAKE_BUILD_TYPE=Release
      && cmake --build tokenizer/build -j$(nproc)
      && ./tokenizer/build/tokenize
    deps:
      - tokenizer/CMakeLists.txt
      - tokenizer/src/lib/dataloader.cpp
      - tokenizer/src/lib/dataloader.hpp
      - tokenizer/src/lib/io.cpp
      - tokenizer/src/lib/io.hpp
      - tokenizer/src/lib/text.cpp
      - tokenizer/src/lib/text.hpp
      - tokenizer/src/lib/threading.cpp
      - tokenizer/src/lib/threading.hpp
      - tokenizer/src/lib/tokenizer.cpp
      - tokenizer/src/lib/tokenizer.hpp
      - tokenizer/src/tokenize.cpp
    params:
      # Tracks the whole `tokenize` section of the params file.
      - tokenize
    outs:
      - out/tokenize

  # Run the training script on the tokenize stage's output directory.
  train:
    cmd: uv run python src/train.py
    deps:
      - out/tokenize
      - src/train.py
      - src/models/qwen3.py
      - src/dataloaders/token_datamodule.py
      - src/dataloaders/token_dataloader.py
      - src/trainers/trainer.py
    params:
      - tokenize.dataset_dir
      - tokenize.tok_file
      - tokenize.vocab_size
      - data.dataset_dir
      - data.seq_length
      - data.max_tokens
      - data.num_workers
      - data.bos_token_id
      - data.eos_token_id
      - data.pad_token_id
      - data.split_ratio
      - model
      - training
    outs:
      - out/train/checkpoints/best.ckpt
      - out/train/checkpoints/latest.ckpt
      # persist: DVC does not delete this output before re-running the stage.
      - out/train/logs:
          persist: true
    metrics:
      # cache: false keeps the metrics file out of the DVC cache.
      - out/train/checkpoints/metrics.json:
          cache: false

  # Run the export script on the best checkpoint produced by the train stage.
  export:
    cmd: uv run python src/export.py
    deps:
      - out/train/checkpoints/best.ckpt
      - src/export.py
      - src/models/qwen3.py
    params:
      - tokenize.vocab_size
      - tokenize.tok_file
      - tokenize.bos_token
      - tokenize.eos_token
      - tokenize.pad_token
      - training.save_dir
    outs:
      - out/export