LLM360 · twkillian · Nov 5, 2025 · Nov 6, 2025 · Nov 6, 2025 · Nov 8, 2025
diff --git a/recipe/dapo/dapo_ray_trainer.py b/recipe/dapo/dapo_ray_trainer.py
@@ -251,9 +251,10 @@ def fit(self):
                         batch = new_batch if batch is None else DataProto.concat([batch, new_batch])
 
                         prompt_bsz = self.config.data.train_batch_size
-                        if num_prompt_in_batch < prompt_bsz:
+                        max_num_gen_batches = self.config.algorithm.filter_groups.max_num_gen_batches
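+                        if num_prompt_in_batch < prompt_bsz and max_num_gen_batches > 1: # Added by Reasoning360. TWK NOTE: the second condition skips re-generation when zero-variance filtering is on but the batch is not grown dynamically (max_num_gen_batches <= 1). NOTE(review): this also makes the `max_num_gen_batches <= 0` "endless trials" check below unreachable -- confirm that is intended.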
                             print(f"{num_prompt_in_batch=} < {prompt_bsz=}")
-                            max_num_gen_batches = self.config.algorithm.filter_groups.max_num_gen_batches
+                            # max_num_gen_batches = self.config.algorithm.filter_groups.max_num_gen_batches
                             if max_num_gen_batches <= 0 or num_gen_batches < max_num_gen_batches:
                                 print(f"{num_gen_batches=}. Keep generating...")
                                 progress_bar.update(1)
@@ -267,9 +268,14 @@ def fit(self):
                                     + " You could also try set max_num_gen_batches=0 to enable endless trials."
                                 )
                         else:
-                            # Align the batch
-                            traj_bsz = self.config.data.train_batch_size * self.config.actor_rollout_ref.rollout.n
-                            batch = batch[:traj_bsz]
+                            # Added by Reasoning360: the batch may hold fewer than prompt_bsz prompts after zero-variance filtering, so only truncate when it is full.
+                            if num_prompt_in_batch >= prompt_bsz:
+                                # Align the batch
+                                traj_bsz = self.config.data.train_batch_size * self.config.actor_rollout_ref.rollout.n
+                                batch = batch[:traj_bsz]
+                            else:
+                                # TODO(TWK): rescale the smaller batch so that batch * n is divisible by k_partitions (n_gpus...)
+                                print(f"Final {num_prompt_in_batch=} < {prompt_bsz=} after {num_gen_batches=} generation batches. Proceeding with smaller batch...")
 
                     # === Updating ===
 

diff --git a/scripts/tools/serve_llm_as_verifier.sh b/scripts/tools/serve_llm_as_verifier.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 #SBATCH --job-name=server_llm_as_verifier
-#SBATCH --partition=main
+#SBATCH --partition=higherprio
 #SBATCH --nodes=1
 #SBATCH --ntasks=1
 #SBATCH --cpus-per-task=64

diff --git a/scripts/tools/serve_math_llm_as_verifier.sh b/scripts/tools/serve_math_llm_as_verifier.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+#SBATCH --job-name=server_math_llm_as_verifier
+#SBATCH --partition=higherprio
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=64
+#SBATCH --gres=gpu:8
+#SBATCH --time=720:00:00
+#SBATCH --output=slurm/serve_math_llm_as_verifier_%j.log
+#SBATCH --error=slurm/serve_math_llm_as_verifier_%j.log
+
+# Serve openai/gpt-oss-120b with vLLM on one 8-GPU node and print the
+# endpoint URL (MATH_LLM_JUDGE_URL) to the job log.
+
+# Port shared by the advertised URL and the server; override via JUDGE_PORT.
+JUDGE_PORT="${JUDGE_PORT:-8000}"
+
+# (1) detect this node's primary IP (first address printed by `hostname -I`)
+NODE_IP=$(hostname -I | awk '{print $1}')
+if [ -z "$NODE_IP" ]; then
+    echo "ERROR: could not detect NODE_IP from 'hostname -I'" >&2
+    exit 1
+fi
+echo "Detected NODE_IP = $NODE_IP"
+
+# (2) export judge URL; the export only reaches processes started from this
+#     script, so clients in other jobs must copy the URL echoed below
+export MATH_LLM_JUDGE_URL="http://${NODE_IP}:${JUDGE_PORT}"
+echo "MATH_LLM_JUDGE_URL=$MATH_LLM_JUDGE_URL"
+
+# (3) launch the vLLM server bound to that IP and port
+vllm serve openai/gpt-oss-120b --host "$NODE_IP" --port "$JUDGE_PORT" --data-parallel-size 8 --enable-expert-parallel