AMD-AGI · hann-wang · Apr 16, 2026 · Apr 16, 2026 · Apr 17, 2026 · Apr 20, 2026
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,11 @@
+# Entries named build* located exactly two directories below the context root.
+*/*/build*
+# Python bytecode caches at any depth (** matches zero or more directories).
+**/__pycache__
+**/*.pyc
+
+# Test cache, VCS metadata, and run artifacts (e.g. the *-outputs dump folders).
+.pytest_cache
+.git
+comm_traces
+*outputs*
diff --git a/3rdparty/torchtitan b/3rdparty/torchtitan
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,42 @@
+# ALTO image built on top of the ROCm PyTorch nightly image.
+FROM rocm/pytorch-nightly:20260429082157-rocm7.2.2
+
+# System packages; the apt lists are deleted in the same layer so they do not
+# end up in the image.
+RUN apt-get update && apt-get install -y \
+    git-lfs \
+    pkg-config \
+    && rm -rf /var/lib/apt/lists/*
+
+# Refresh the PCI ID database (needs network access at build time).
+RUN update-pciids
+
+# Python dependencies. Specifiers with extras or version operators are quoted
+# so the shell never treats [ ] < > as glob or redirection syntax.
+# amd_smi is installed from the local /opt/rocm/share/amd_smi directory.
+RUN pip install --no-cache-dir "datasets>=3.6.0" \
+    transformers tabulate wandb fsspec tyro "tokenizers>=0.15.0" safetensors \
+    tensorboard pre-commit yapf pybind11 meson-python torchdata pytablewriter \
+    "antlr4-python3-runtime==4.11.0" sympy math_verify more_itertools peft \
+    accelerate pillow "numpy<2" opencv-python-headless scipy \
+    numba "huggingface-hub[cli,hf_transfer]" "packaging>=24.2" \
+    "setuptools>=77.0.3,<80.0.0" "setuptools-scm>=8" \
+    protobuf-protoc-bin fmt && \
+    pip install --no-cache-dir /opt/rocm/share/amd_smi
+
+# lm-evaluation-harness, editable install.
+# NOTE(review): the shallow clone tracks the default branch HEAD, so rebuilds
+# are not reproducible; consider pinning a tag or commit.
+RUN cd /var/lib/jenkins && \
+    git clone --depth 1 https://github.com/EleutherAI/lm-evaluation-harness && \
+    cd lm-evaluation-harness && \
+    pip install -e .
+
+# Copy the build context (filtered by .dockerignore) into the image.
+COPY . /var/lib/jenkins/alto
+
+# Editable installs: the vendored torchtitan first, then ALTO itself.
+RUN cd /var/lib/jenkins/alto/3rdparty/torchtitan && \
+    pip install --no-build-isolation -e . && \
+    cd /var/lib/jenkins/alto && \
+    pip install -e .
diff --git a/LICENSE b/LICENSE
@@ -29,19 +29,15 @@ ALTO uses or references the following third-party projects:
    - License: BSD License
    - Repository: https://github.com/pytorch/torchtitan
 
-2. Megatron-LM
-   - License: Apache License 2.0
-   - Repository: https://github.com/nvidia/megatron-lm
-
-3. vLLM
+2. vLLM
    - License: Apache License 2.0
    - Repository: https://github.com/vllm-project/vllm
 
-4. compressed-tensors
+3. compressed-tensors
    - License: Apache License 2.0
    - Repository: https://github.com/vllm-project/compressed-tensors
 
-5. llm-compressor
+4. llm-compressor
    - License: Apache License 2.0
    - Repository: https://github.com/vllm-project/llm-compressor
 

diff --git a/alto/kernels/dispatch/__init__.py b/alto/kernels/dispatch/__init__.py
@@ -4,10 +4,10 @@
 
 from .config import TrainingOpConfig
 from .conversion import swap_params
-from .attention import LPScaledDotProductAttentionWrapper
+from .attention import LPScaledDotProductAttention
 
 __all__ = [
     "TrainingOpConfig",
     "swap_params",
-    "LPScaledDotProductAttentionWrapper",
+    "LPScaledDotProductAttention",
 ]
diff --git a/alto/kernels/dispatch/attention.py b/alto/kernels/dispatch/attention.py
@@ -3,15 +3,15 @@
 # SPDX-License-Identifier: MIT
 
 import torch
-from torchtitan.models.common.attention import (ScaledDotProductAttentionWrapper)
+from torchtitan.models.common.attention import ScaledDotProductAttention
 
 from alto.kernels.fp4.mxfp4.triton_flash_attention_mxfp4 import triton_attention_mxfp4
 from .config import TrainingOpConfig
 
-__all__ = ["LPScaledDotProductAttentionWrapper"]
+__all__ = ["LPScaledDotProductAttention"]
 
 
-class LPScaledDotProductAttentionWrapper(ScaledDotProductAttentionWrapper):
+class LPScaledDotProductAttention(ScaledDotProductAttention):
 
     def __init__(self, config: TrainingOpConfig):
         super().__init__()

diff --git a/alto/models/deepseek_v3/config_registry.py b/alto/models/deepseek_v3/config_registry.py
@@ -21,7 +21,7 @@
 
 def deepseek_v3_debugmodel() -> Trainer.Config:
     config = deepseek_v3_debugmodel_orig()
-    config.profiling.enable_profiling = False
+    config.profiler.enable_profiling = False
     config.training.steps = 10
     config.training.local_batch_size = 4
     config.training.global_batch_size = 16
@@ -43,7 +43,7 @@ def deepseek_v3_16b() -> Trainer.Config:
     config = deepseek_v3_16b_orig()
     config.hf_assets_path = "/huggingface/hub/models--deepseek-ai--deepseek-moe-16b-base/snapshots/521d2bc4fb69a3f3ae565310fcc3b65f97af2580"
     config.dump_folder = "deepseek_v3_16b-outputs"
-    config.profiling.enable_profiling = False
+    config.profiler.enable_profiling = False
     config.training.steps = 0
     config.training.local_batch_size = 1
     config.training.seq_len = 4096
@@ -62,7 +62,6 @@ def deepseek_v3_16b() -> Trainer.Config:
     config.validator.freq = 10
     config.validator.steps = 10
     config.activation_checkpoint.mode = "none"
-    config.activation_checkpoint.selective_ac_option = "1"
     config.debug.seed = 1234
     return config
 

diff --git a/alto/models/gpt_oss/config_registry.py b/alto/models/gpt_oss/config_registry.py
@@ -22,7 +22,7 @@
 
 def gpt_oss_debugmodel() -> Trainer.Config:
     config = gpt_oss_debugmodel_orig()
-    config.profiling.enable_profiling = False
+    config.profiler.enable_profiling = False
     config.training.steps = 10
     config.training.local_batch_size = 4
     config.training.global_batch_size = 16
@@ -44,7 +44,7 @@ def gpt_oss_20b() -> Trainer.Config:
     config = gpt_oss_20b_orig()
     config.hf_assets_path = "/huggingface/hub/models--openai--gpt-oss-20b/snapshots/6cee5e81ee83917806bbde320786a8fb61efebee/"
     config.dump_folder = "gpt_oss_20b-outputs"
-    config.profiling.enable_profiling = False
+    config.profiler.enable_profiling = False
     config.training.steps = 0
     config.training.local_batch_size = 1
     config.training.seq_len = 8192
@@ -64,7 +64,6 @@ def gpt_oss_20b() -> Trainer.Config:
     config.validator.freq = 10
     config.validator.steps = 10
     config.activation_checkpoint.mode = "none"
-    config.activation_checkpoint.selective_ac_option = "1"
     config.debug.seed = 1234
     return config
 
@@ -73,7 +72,7 @@ def gpt_oss_20b_pretrain() -> Trainer.Config:
     config = gpt_oss_20b_orig()
     config.hf_assets_path = "/huggingface/hub/models--openai--gpt-oss-20b/snapshots/6cee5e81ee83917806bbde320786a8fb61efebee/"
     config.dump_folder = "gpt_oss_20b-pretrain-subset-lr4e-4-outputs"
-    config.profiling.enable_profiling = False
+    config.profiler.enable_profiling = False
     config.training.steps = 1200000
     config.training.local_batch_size = 1
     config.training.global_batch_size = 16
@@ -103,7 +102,6 @@ def gpt_oss_20b_pretrain() -> Trainer.Config:
     config.validator.freq = 768
     config.validator.steps = 64
     config.activation_checkpoint.mode = "selective"
-    config.activation_checkpoint.selective_ac_option = "1"
     config.debug.seed = 1234
     return config
 

diff --git a/alto/models/llama3/config_registry.py b/alto/models/llama3/config_registry.py
@@ -51,7 +51,7 @@
 
 def llama3_debugmodel() -> Trainer.Config:
     config = llama3_debugmodel_orig()
-    config.profiling.enable_profiling = False
+    config.profiler.enable_profiling = False
     config.training.steps = 0
     config.training.local_batch_size = 4
     config.training.global_batch_size = 16
@@ -82,7 +82,7 @@ def llama3_1b() -> Trainer.Config:
     config = llama3_1b_orig()
     config.hf_assets_path = "/group/archive_dataset_6_nobkup/archive_modelzoo/sequence_learning/weights/nlp-pretrained-model/meta-llama/Llama-3.2-1B"
     config.metrics.log_freq = 1
-    config.profiling.enable_profiling = False
+    config.profiler.enable_profiling = False
     config.training.steps = 0
     config.training.local_batch_size = 1
     config.training.global_batch_size = 10
@@ -123,7 +123,7 @@ def llama3_8b() -> Trainer.Config:
     config = llama3_8b_orig()
     config.hf_assets_path = "/huggingface/hub/models--unsloth--Llama-3.1-8B/snapshots/3f0d51f8e5640f98f1a96ea9044a0e55c0a83814"
     config.metrics.log_freq = 1
-    config.profiling.enable_profiling = False
+    config.profiler.enable_profiling = False
     config.training.steps = 0
     config.training.local_batch_size = 1
     config.training.seq_len = 8192
@@ -188,7 +188,7 @@ def llama3_8b() -> Trainer.Config:
     config = llama3_8b_orig()
     config.hf_assets_path = LLAMA3_8B_PATH
     config.metrics.log_freq = 1
-    config.profiling.enable_profiling = False
+    config.profiler.enable_profiling = False
     config.training.steps = 0
     config.training.local_batch_size = 1
     config.training.global_batch_size = 8
@@ -352,7 +352,7 @@ def instella_3b() -> Trainer.Config:
     config = instella_3b_orig()
     config.hf_assets_path = "/group/ossmodelzoo/hanwang2/huggingface/hub/models--amd--Instella-3B-Stage1/snapshots/cb33253ab0a5b9f2ea0b98f3edd818d46454580e"
     config.metrics.log_freq = 1
-    config.profiling.enable_profiling = False
+    config.profiler.enable_profiling = False
     config.training.steps = 0
     config.training.local_batch_size = 1
     config.training.global_batch_size = 10

diff --git a/alto/models/llama3/configs/recipe.yaml b/alto/models/llama3/configs/recipe.yaml
@@ -1,10 +1,10 @@
-# sparsity_stage:
-#   sparsity_modifiers:
-#     WandaModifier:
-#       sparsity: 0.5
-#       mask_structure: "2:4"
-#       targets: ["Linear"]
-#       ignore: ["output"]
+sparsity_stage:
+  sparsity_modifiers:
+    WandaModifier:
+      sparsity: 0.5
+      mask_structure: "2:4"
+      targets: ["Linear"]
+      ignore: ["output"]
 quantization_stage:
   quantization_modifiers:
     QuantizationModifier:
@@ -23,13 +23,13 @@ quantization_stage:
             strategy: "tensor"
             observer: "minmax"
           targets: ["Linear"]
-    SelfDistillationModifier:
-      criterion: "LogitsDistillationLoss"
-      targets: ["__all__"]
-      steps: 10
-      warmup_steps: 0
-      lr: 1e-4
-      min_lr_factor: 1.0
-      decay_ratio: null
-      decay_type: "linear"
-      optimizer: "AdamW"
+    # SelfDistillationModifier:
+    #   criterion: "LogitsDistillationLoss"
+    #   targets: ["__all__"]
+    #   steps: 10
+    #   warmup_steps: 0
+    #   lr: 1e-4
+    #   min_lr_factor: 1.0
+    #   decay_ratio: null
+    #   decay_type: "linear"
+    #   optimizer: "AdamW"