google · sizhit2 · Apr 28, 2026 · Apr 28, 2026 · Apr 28, 2026
@@ -29,3 +29,10 @@ does not match the exact configuration listed in the table.
 | GRPO | **Qwen3 0.6b**| LoRA  | v5e-1         | Num of generation = 4, batch_size = 1 | Train: fsdp Rollout: tp | |
 | GRPO | **Qwen3 14b** | Full  | v5p-2         | Num of generation = 4, batch_size = 4 | Train: fsdp Rollout: tp | |
 | GRPO | **Qwen3 14b** | LoRA  | v5p-2         | Num of generation = 4, batch_size = 4 | Train: fsdp Rollout: tp | |
+
+## PPO
+
+| Algo | Model         | Type  | Min Resources | Max Training Micro Batch Size     | Sharding              | Launch Script                 |
+| :--- | :------------ | :---- | :------------ | :-------------------------------- | :-------------------- | :---------------------------- |
+| PPO  | **Gemma2-2b** | Full  | v5e-4         | Num of generation = 4, batch_size = 1 | Train: fsdp Rollout: tp | *[run_gemma2_2b.sh](examples/rl/ppo/gsm8k/run_gemma2_2b.sh)* |
+| PPO  | **Gemma2-2b** | LoRA  | v5e-4         | Num of generation = 4, batch_size = 1 | Train: fsdp Rollout: tp | *[run_gemma2_2b.sh](examples/rl/ppo/gsm8k/run_gemma2_2b.sh)* |
@@ -0,0 +1,80 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Base model settings: gemma2_2b_it, with model_source set to "kaggle".
+model_config:
+  model_name: "gemma2_2b_it"
+  model_id: "gemma2_2b_it"
+  model_path: "google/gemma-2/flax/gemma2-2b-it"
+  model_source: "kaggle"
+  # 2 x 4 device mesh with axes named fsdp and tp; the actor and rollout
+  # sections below repeat the same layout.
+  mesh:
+    shape: "(2,4)"
+    axis_names: "('fsdp','tp')"
+  rng_seed: 42
+actor_model_config:
+  # LoRA settings for the actor; module_path is a `|`-separated list of
+  # module-name patterns.
+  lora_config:
+    rank: 64
+    alpha: 64.0
+    module_path: ".*q_einsum|.*kv_einsum|.*gate_proj|.*down_proj|.*up_proj|.*attn_vec_einsum"
+  mesh:
+    shape: "(2,4)"
+    axis_names: "('fsdp','tp')"
+rollout_model_config:
+  mesh:
+    shape: "(2,4)"
+    axis_names: "('fsdp','tp')"
+tokenizer_config:
+  tokenizer_type: "sentencepiece"
+  add_bos: False
+dataset_name: "gsm8k"
+batch_size: 1
+# NOTE(review): num_batches, decay_steps and max_steps are all 3738;
+# presumably they must be changed together when the run length changes.
+num_batches: 3738
+num_test_batches: 100
+num_train_epochs: 1
+rl_training_config:
+  actor_optimizer_config:
+    opt_type: "adamw"
+    peak_value: 3e-6
+    schedule_type: "warmup_cosine_decay_schedule"
+    init_value: 0.0
+    end_value: 0.0
+    # warmup_steps equals warmup_ratio * max_steps rounded to an integer
+    # (0.1 * 3738 = 373.8 -> 374); decay_steps equals max_steps.
+    warmup_ratio: 0.1
+    warmup_steps: 374
+    decay_steps: 3738
+    b1: 0.9
+    b2: 0.99
+    weight_decay: 0.1
+    max_grad_norm: 0.1
+  eval_every_n_steps: 10
+  max_steps: 3738
+  metrics_logging_options:
+    flush_every_n_steps: 20
+  checkpointing_options:
+    save_interval_steps: 500
+    max_to_keep: 4
+  profiler_options: {}
+# Sampling settings used when generating rollouts.
+rollout_config:
+  total_generation_steps: 768
+  max_prompt_length: 256
+  temperature: 0.9
+  top_p: 1.0
+  top_k: 50
+rollout_engine: "vanilla"
+offload_to_cpu: False
+# PPO hyperparameters.
+ppo_config:
+  num_generations: 2
+  num_iterations: 1
+  beta: 0.08
+  epsilon: 0.2
+# Reward functions are referenced by file path.
+reward_functions:
+  - "tunix/cli/reward_fn/gsm8k.py"
@@ -0,0 +1,54 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+set -x # Enable xtrace
+
+batch_size=${batch_size:-1}
+num_batches=${num_batches:-3738}
+num_train_epochs=${num_train_epochs:-1}
+warmup_ratio=${warmup_ratio:-0.1}
+train_fraction=${train_fraction:-1.0}
+
+echo "Using parameters:"
+echo "  Batch Size: $batch_size"
+echo "  Num Batches: $num_batches"
+echo "  Num Epochs: $num_train_epochs"
+echo "  Warmup Ratio: $warmup_ratio"
+echo "  Train Fraction: $train_fraction"
+
+max_steps_float=$(awk "BEGIN {print $batch_size * $num_batches * $num_train_epochs * $train_fraction}")
+
+max_steps=$(printf "%.0f" "$max_steps_float")
+
+
+warmup_steps=$(awk "BEGIN {printf \"%.0f\", $warmup_ratio * $max_steps}")
+
+echo "Max steps: $max_steps"
+echo "Rounded warmup steps: $warmup_steps"
+
+python3 -m tunix.cli.ppo_main \
+  tunix/cli/base_config.yaml \
+  override_config_file=examples/rl/ppo/gsm8k/configs/gemma2_2b.yaml \
+  model_config.model_download_path="/tmp/models/gemma2-2b" \
+  model_config.intermediate_ckpt_dir="/tmp/intermediate_ckpt/1" \
+  tokenizer_config.tokenizer_path="/tmp/models/gemma2-2b/models/google/gemma-2/flax/gemma2-2b-it/1/tokenizer.model" \
+  batch_size=$batch_size \
+  num_batches=$num_batches \
+  num_train_epochs=$num_train_epochs \
+  rl_training_config.actor_optimizer_config.warmup_ratio=$warmup_ratio \
+  rl_training_config.actor_optimizer_config.warmup_steps=$warmup_steps \
+  rl_training_config.actor_optimizer_config.decay_steps=$max_steps \
+  rl_training_config.max_steps=$max_steps \
+  rl_training_config.metrics_logging_options.log_dir="/tmp/tensorboard/ppo"
@@ -0,0 +1,135 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for building a `PPOPipeline` from YAML and invoking its PPO trainer.
+
+Covers the `training_mode: "ppo"` configuration path of `tunix.cli.ppo_main`.
+"""
+
+import os
+import pathlib
+import tempfile
+from unittest import mock
+
+from absl.testing import absltest
+import omegaconf
+from tunix.cli import ppo_main
+from tunix.rl import rl_cluster as rl_cluster_lib
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+# Directory at parents[2] of this file's resolved path (presumably the
+# repository root). NOTE(review): not referenced anywhere else in this module.
+_REPO_ROOT = pathlib.Path(__file__).resolve().parents[2]
+
+
+def _make_pipeline(extra_yaml: str) -> ppo_main.PPOPipeline:
+  """Write a minimal valid YAML and instantiate PPOPipeline against it."""
+  base = """
+model_config:
+  model_name: "test_model"
+  model_id: "test/model"
+  model_source: "huggingface"
+  model_display: false
+  rng_seed: 0
+  intermediate_ckpt_dir: "/tmp/ckpt"
+
+actor_model_config:
+  mesh:
+    shape: "(1,1)"
+    axis_names: "('fsdp','tp')"
+
+reference_model_config:
+  mesh:
+    shape: "(1,1)"
+    axis_names: "('fsdp','tp')"
+
+rollout_model_config:
+  mesh:
+    shape: "(1,1)"
+    axis_names: "('fsdp','tp')"
+
+tokenizer_config:
+  tokenizer_type: "huggingface"
+  tokenizer_path: "test/model"
+  add_bos: false
+  add_eos: false
+
+rollout_engine: "vanilla"
+offload_to_cpu: false
+
+rollout_config:
+  max_prompt_length: 256
+  total_generation_steps: 512
+  temperature: 1.0
+  top_p: null
+  top_k: null
+
+rl_training_config:
+  max_steps: 1
+  eval_every_n_steps: 1
+  mini_batch_size: 1
+  train_micro_batch_size: 1
+  actor_optimizer_config:
+    opt_type: "adamw"
+    learning_rate: 1.0e-6
+    schedule_type: "warmup_cosine_decay_schedule"
+    init_value: 0.0
+    end_value: 0.0
+    warmup_ratio: 0.1
+    b1: 0.9
+    b2: 0.99
+    weight_decay: 0.01
+    max_grad_norm: 1.0
+  metrics_logging_options:
+    log_dir: "/tmp/tb_test"
+    flush_every_n_steps: 1
+  checkpointing_options:
+    save_interval_steps: 100
+    max_to_keep: 1
+  checkpoint_root_directory: "/tmp/ckpt_test"
+
+batch_size: 1
+num_batches: 1
+num_train_epochs: 1
+train_fraction: 1.0
+"""
+  with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
+    f.write(base + extra_yaml)
+    path = f.name
+
+  # Patch HF_TOKEN so tokenizer validation passes
+  with mock.patch.dict(os.environ, {"HF_TOKEN": "fake"}):
+    pipeline = ppo_main.PPOPipeline(["", path])
+  os.unlink(path)
+  return pipeline
+
+class DispatchTest(absltest.TestCase):
+
+  def test_ppo_dispatches_to_ppo(self):
+    yaml = """
+training_mode: "ppo"
+ppo_config:
+  num_generations: 2
+  num_iterations: 1
+"""
+    pipeline = _make_pipeline(yaml)
+    self.assertEqual(pipeline.config["training_mode"], "ppo")
+
+    with mock.patch.object(pipeline, "run_ppo_trainer") as mockrun_ppo_trainer:
+      pipeline.run_ppo_trainer()
+      mockrun_ppo_trainer.assert_called_once_with()
+
+# Run the absltest runner when this file is executed directly.
+if __name__ == "__main__":
+  absltest.main()