Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Build directories, bytecode caches and compiled Python files located
# two directory levels below the build-context root.
*/*/build*
*/*/__pycache__
*/*/*.pyc

# Tooling metadata and output directories kept out of the build context.
# NOTE(review): comm_traces and *outputs* look like locally generated run
# artifacts -- confirm before relying on this description.
.pytest_cache
.git
comm_traces
*outputs*
2 changes: 1 addition & 1 deletion 3rdparty/torchtitan
Submodule torchtitan updated 333 files
30 changes: 30 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Base image: ROCm PyTorch nightly, pinned to an explicit tag.
FROM rocm/pytorch-nightly:20260429082157-rocm7.2.2

# System packages. The apt package lists are deleted in the same layer so
# they do not end up in the image.
RUN apt-get update && apt-get install -y \
    git-lfs \
    pkg-config \
    && rm -rf /var/lib/apt/lists/*

# Refresh the local PCI ID database.
RUN update-pciids

# Python dependencies. Every specifier that contains shell metacharacters
# (<, >, [, ]) is quoted so the shell passes it to pip verbatim;
# huggingface-hub is listed once, with its extras, instead of twice.
# The amd_smi package is then installed from the ROCm tree inside the image.
RUN pip install --no-cache-dir "datasets>=3.6.0" \
    transformers tabulate wandb fsspec tyro "tokenizers>=0.15.0" safetensors \
    tensorboard pre-commit yapf pybind11 meson-python torchdata pytablewriter \
    "antlr4-python3-runtime==4.11.0" sympy math_verify more_itertools peft \
    accelerate pillow "numpy<2" opencv-python-headless scipy \
    numba "huggingface-hub[cli,hf_transfer]" "packaging>=24.2" \
    "setuptools>=77.0.3,<80.0.0" "setuptools-scm>=8" \
    protobuf-protoc-bin fmt && \
    pip install --no-cache-dir /opt/rocm/share/amd_smi

# lm-evaluation-harness, installed in editable mode from a shallow clone.
# NOTE(review): the clone tracks the default branch head, so rebuilds are not
# reproducible -- consider pinning a tag or commit.
RUN cd /var/lib/jenkins && \
    git clone --depth 1 https://github.com/EleutherAI/lm-evaluation-harness && \
    cd lm-evaluation-harness && \
    pip install --no-cache-dir -e .

# Copy the build context (filtered by .dockerignore) into the image.
COPY . /var/lib/jenkins/alto

# Editable installs: the bundled torchtitan checkout first (without build
# isolation), then the alto package itself.
RUN cd /var/lib/jenkins/alto/3rdparty/torchtitan && \
    pip install --no-cache-dir --no-build-isolation -e . && \
    cd /var/lib/jenkins/alto && \
    pip install --no-cache-dir -e .
10 changes: 3 additions & 7 deletions LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -29,19 +29,15 @@ ALTO uses or references the following third-party projects:
- License: BSD License
- Repository: https://github.com/pytorch/torchtitan

2. Megatron-LM
- License: Apache License 2.0
- Repository: https://github.com/nvidia/megatron-lm

3. vLLM
2. vLLM
- License: Apache License 2.0
- Repository: https://github.com/vllm-project/vllm

4. compressed-tensors
3. compressed-tensors
- License: Apache License 2.0
- Repository: https://github.com/vllm-project/compressed-tensors

5. llm-compressor
4. llm-compressor
- License: Apache License 2.0
- Repository: https://github.com/vllm-project/llm-compressor

Expand Down
4 changes: 2 additions & 2 deletions alto/kernels/dispatch/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@

from .config import TrainingOpConfig
from .conversion import swap_params
from .attention import LPScaledDotProductAttentionWrapper
from .attention import LPScaledDotProductAttention

__all__ = [
"TrainingOpConfig",
"swap_params",
"LPScaledDotProductAttentionWrapper",
"LPScaledDotProductAttention",
]
6 changes: 3 additions & 3 deletions alto/kernels/dispatch/attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@
# SPDX-License-Identifier: MIT

import torch
from torchtitan.models.common.attention import (ScaledDotProductAttentionWrapper)
from torchtitan.models.common.attention import ScaledDotProductAttention

from alto.kernels.fp4.mxfp4.triton_flash_attention_mxfp4 import triton_attention_mxfp4
from .config import TrainingOpConfig

__all__ = ["LPScaledDotProductAttentionWrapper"]
__all__ = ["LPScaledDotProductAttention"]


class LPScaledDotProductAttentionWrapper(ScaledDotProductAttentionWrapper):
class LPScaledDotProductAttention(ScaledDotProductAttention):

def __init__(self, config: TrainingOpConfig):
super().__init__()
Expand Down
5 changes: 2 additions & 3 deletions alto/models/deepseek_v3/config_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

def deepseek_v3_debugmodel() -> Trainer.Config:
config = deepseek_v3_debugmodel_orig()
config.profiling.enable_profiling = False
config.profiler.enable_profiling = False
config.training.steps = 10
config.training.local_batch_size = 4
config.training.global_batch_size = 16
Expand All @@ -43,7 +43,7 @@ def deepseek_v3_16b() -> Trainer.Config:
config = deepseek_v3_16b_orig()
config.hf_assets_path = "/huggingface/hub/models--deepseek-ai--deepseek-moe-16b-base/snapshots/521d2bc4fb69a3f3ae565310fcc3b65f97af2580"
config.dump_folder = "deepseek_v3_16b-outputs"
config.profiling.enable_profiling = False
config.profiler.enable_profiling = False
config.training.steps = 0
config.training.local_batch_size = 1
config.training.seq_len = 4096
Expand All @@ -62,7 +62,6 @@ def deepseek_v3_16b() -> Trainer.Config:
config.validator.freq = 10
config.validator.steps = 10
config.activation_checkpoint.mode = "none"
config.activation_checkpoint.selective_ac_option = "1"
config.debug.seed = 1234
return config

Expand Down
8 changes: 3 additions & 5 deletions alto/models/gpt_oss/config_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

def gpt_oss_debugmodel() -> Trainer.Config:
config = gpt_oss_debugmodel_orig()
config.profiling.enable_profiling = False
config.profiler.enable_profiling = False
config.training.steps = 10
config.training.local_batch_size = 4
config.training.global_batch_size = 16
Expand All @@ -44,7 +44,7 @@ def gpt_oss_20b() -> Trainer.Config:
config = gpt_oss_20b_orig()
config.hf_assets_path = "/huggingface/hub/models--openai--gpt-oss-20b/snapshots/6cee5e81ee83917806bbde320786a8fb61efebee/"
config.dump_folder = "gpt_oss_20b-outputs"
config.profiling.enable_profiling = False
config.profiler.enable_profiling = False
config.training.steps = 0
config.training.local_batch_size = 1
config.training.seq_len = 8192
Expand All @@ -64,7 +64,6 @@ def gpt_oss_20b() -> Trainer.Config:
config.validator.freq = 10
config.validator.steps = 10
config.activation_checkpoint.mode = "none"
config.activation_checkpoint.selective_ac_option = "1"
config.debug.seed = 1234
return config

Expand All @@ -73,7 +72,7 @@ def gpt_oss_20b_pretrain() -> Trainer.Config:
config = gpt_oss_20b_orig()
config.hf_assets_path = "/huggingface/hub/models--openai--gpt-oss-20b/snapshots/6cee5e81ee83917806bbde320786a8fb61efebee/"
config.dump_folder = "gpt_oss_20b-pretrain-subset-lr4e-4-outputs"
config.profiling.enable_profiling = False
config.profiler.enable_profiling = False
config.training.steps = 1200000
config.training.local_batch_size = 1
config.training.global_batch_size = 16
Expand Down Expand Up @@ -103,7 +102,6 @@ def gpt_oss_20b_pretrain() -> Trainer.Config:
config.validator.freq = 768
config.validator.steps = 64
config.activation_checkpoint.mode = "selective"
config.activation_checkpoint.selective_ac_option = "1"
config.debug.seed = 1234
return config

Expand Down
10 changes: 5 additions & 5 deletions alto/models/llama3/config_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@

def llama3_debugmodel() -> Trainer.Config:
config = llama3_debugmodel_orig()
config.profiling.enable_profiling = False
config.profiler.enable_profiling = False
config.training.steps = 0
config.training.local_batch_size = 4
config.training.global_batch_size = 16
Expand Down Expand Up @@ -82,7 +82,7 @@ def llama3_1b() -> Trainer.Config:
config = llama3_1b_orig()
config.hf_assets_path = "/group/archive_dataset_6_nobkup/archive_modelzoo/sequence_learning/weights/nlp-pretrained-model/meta-llama/Llama-3.2-1B"
config.metrics.log_freq = 1
config.profiling.enable_profiling = False
config.profiler.enable_profiling = False
config.training.steps = 0
config.training.local_batch_size = 1
config.training.global_batch_size = 10
Expand Down Expand Up @@ -123,7 +123,7 @@ def llama3_8b() -> Trainer.Config:
config = llama3_8b_orig()
config.hf_assets_path = "/huggingface/hub/models--unsloth--Llama-3.1-8B/snapshots/3f0d51f8e5640f98f1a96ea9044a0e55c0a83814"
config.metrics.log_freq = 1
config.profiling.enable_profiling = False
config.profiler.enable_profiling = False
config.training.steps = 0
config.training.local_batch_size = 1
config.training.seq_len = 8192
Expand Down Expand Up @@ -188,7 +188,7 @@ def llama3_8b() -> Trainer.Config:
config = llama3_8b_orig()
config.hf_assets_path = LLAMA3_8B_PATH
config.metrics.log_freq = 1
config.profiling.enable_profiling = False
config.profiler.enable_profiling = False
config.training.steps = 0
config.training.local_batch_size = 1
config.training.global_batch_size = 8
Expand Down Expand Up @@ -352,7 +352,7 @@ def instella_3b() -> Trainer.Config:
config = instella_3b_orig()
config.hf_assets_path = "/group/ossmodelzoo/hanwang2/huggingface/hub/models--amd--Instella-3B-Stage1/snapshots/cb33253ab0a5b9f2ea0b98f3edd818d46454580e"
config.metrics.log_freq = 1
config.profiling.enable_profiling = False
config.profiler.enable_profiling = False
config.training.steps = 0
config.training.local_batch_size = 1
config.training.global_batch_size = 10
Expand Down
34 changes: 17 additions & 17 deletions alto/models/llama3/configs/recipe.yaml
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
# sparsity_stage:
# sparsity_modifiers:
# WandaModifier:
# sparsity: 0.5
# mask_structure: "2:4"
# targets: ["Linear"]
# ignore: ["output"]
sparsity_stage:
sparsity_modifiers:
WandaModifier:
sparsity: 0.5
mask_structure: "2:4"
targets: ["Linear"]
ignore: ["output"]
quantization_stage:
quantization_modifiers:
QuantizationModifier:
Expand All @@ -23,13 +23,13 @@ quantization_stage:
strategy: "tensor"
observer: "minmax"
targets: ["Linear"]
SelfDistillationModifier:
criterion: "LogitsDistillationLoss"
targets: ["__all__"]
steps: 10
warmup_steps: 0
lr: 1e-4
min_lr_factor: 1.0
decay_ratio: null
decay_type: "linear"
optimizer: "AdamW"
# SelfDistillationModifier:
# criterion: "LogitsDistillationLoss"
# targets: ["__all__"]
# steps: 10
# warmup_steps: 0
# lr: 1e-4
# min_lr_factor: 1.0
# decay_ratio: null
# decay_type: "linear"
# optimizer: "AdamW"
Loading