 
 # Add import for Mamba models
 from bionemo.evo2.models.mamba import MAMBA_MODEL_OPTIONS, MambaModel
+from bionemo.evo2.run.peft import Evo2LoRA
 from bionemo.llm.lightning import LightningPassthroughPredictionMixin
 from bionemo.llm.model.biobert.lightning import get_batch_on_this_context_parallel_rank
 from bionemo.llm.utils.callbacks import PredictionWriter
@@ -130,6 +131,13 @@ def parse_args():
         "know a model was trained with a specific interpolation factor for ROPE, provide it here, it can make a big "
         "difference in accuracy.",
     )
+    ap.add_argument(
+        "--lora-checkpoint-path",
+        type=Path,
+        required=False,
+        default=None,
+        help="Path to the LoRA checkpoint to restore adapter states from.",
+    )
     return ap.parse_args()


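For orientation: the new flag is optional and defaults to `None`, so existing invocations are unchanged and LoRA only engages when a path is supplied. A hedged sketch of the expected flow; the entry-point name and the early existence check are illustrative additions, not part of this patch:

```python
# Hypothetical invocation (entry-point name is an assumption):
#   predict_evo2 ... --lora-checkpoint-path /results/evo2_lora/checkpoints/last
args = parse_args()

# Illustrative early validation, not in this patch: fail fast if the adapter
# checkpoint is missing instead of erroring deep inside trainer setup.
if args.lora_checkpoint_path is not None and not args.lora_checkpoint_path.exists():
    raise FileNotFoundError(f"LoRA checkpoint not found: {args.lora_checkpoint_path}")
```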
@@ -168,6 +176,10 @@ def __init__(
         self.output_log_prob_seqs = output_log_prob_seqs
         self.log_prob_collapse_option = log_prob_collapse_option
 
+    def configure_model(self, *args, **kwargs) -> None:
+        super().configure_model(*args, **kwargs)
+        self.trainer.strategy._init_model_parallel = True
+
     def predict_step(self, batch, batch_idx: int | None = None) -> Tensor:
         """Alias for forward_step; also logs the pad mask since sequences may not all have the same length."""
         if len(batch) == 0:
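The `configure_model` override is the piece that makes the PEFT path work here: NeMo's `MegatronStrategy` keeps an internal `_init_model_parallel` flag, and re-arming it after the base model is configured appears intended to ensure model-parallel initialization covers the module after the LoRA transform has wrapped it. A toy illustration of the assumed semantics (plain Python; `ToyStrategy` and `ToyPredictor` are invented for illustration, not NeMo classes):

```python
# Toy model of the assumed flag semantics, not NeMo code.
class ToyStrategy:
    def __init__(self):
        self._init_model_parallel = False

    def init_model_parallel(self, module):
        if self._init_model_parallel:
            print(f"(re)initializing model parallel for {type(module).__name__}")


class ToyPredictor:
    def __init__(self, strategy):
        self.strategy = strategy

    def configure_model(self):
        # Mirrors the override above: re-arm the strategy so parallel state is
        # set up over the transformed (e.g. LoRA-wrapped) module.
        self.strategy._init_model_parallel = True


strategy = ToyStrategy()
predictor = ToyPredictor(strategy)
predictor.configure_model()
strategy.init_model_parallel(predictor)  # -> "(re)initializing model parallel for ToyPredictor"
```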
@@ -326,6 +338,7 @@ def predict(
     hybrid_override_pattern: str | None = None,
     num_layers: int | None = None,
     seq_len_interpolation_factor: int | None = None,
+    lora_checkpoint_path: Path | None = None,
 ):
     """Inference workflow for Evo2.
 
@@ -342,57 +355,26 @@ def predict(
             f"Requested model parallel size {model_parallel_size} is greater than the "
             f"number of available CUDA devices {torch.cuda.device_count()}"
         )
-    # Create PTL trainer.
-    trainer = nl.Trainer(
-        accelerator="gpu",
-        devices=model_parallel_size,
-        strategy=nl.MegatronStrategy(
-            drop_last_batch=False,
-            tensor_model_parallel_size=tensor_parallel_size,
-            pipeline_model_parallel_size=pipeline_model_parallel_size,
-            context_parallel_size=context_parallel_size,
-            pipeline_dtype=torch.bfloat16,
-            ckpt_load_optimizer=False,  # Needs to be false for a normal model checkpoint.
-            ckpt_save_optimizer=False,
-            ckpt_async_save=False,
-            sequence_parallel=tensor_parallel_size > 1 and sequence_parallel,
-            save_ckpt_format=ckpt_format,
-            ckpt_load_strictness="log_all",
-            data_sampler=nl.MegatronDataSampler(
-                micro_batch_size=batch_size,
-                global_batch_size=batch_size,
-                seq_len=8192,
-                output_log=False,  # this is needed for predict step to work
-            ),
-        ),
-        log_every_n_steps=1,
-        limit_val_batches=10,
-        num_sanity_val_steps=0,
-        callbacks=[
-            PredictionWriter(
-                output_dir=output_dir,
-                write_interval="epoch",
-                batch_dim_key_defaults={"token_logits": 0},
-                seq_dim_key_defaults={"token_logits": 1},
-            )
-        ],
-        plugins=nl.MegatronMixedPrecision(
-            precision="bf16-mixed",
-            params_dtype=torch.bfloat16,
-            # Only use FP8 in this plugin when using full FP8 precision and FP8.
-            # Otherwise use vortex_style_fp8 in the model config.
-            fp8="hybrid" if fp8 and full_fp8 else None,
-            fp8_amax_history_len=16 if fp8 and full_fp8 else 1,
-            fp8_amax_compute_algo="max" if fp8 and full_fp8 else "most_recent",
-        ),
-    )
+
+    callbacks = [
+        PredictionWriter(
+            output_dir=output_dir,
+            write_interval="epoch",
+            batch_dim_key_defaults={"token_logits": 0},
+            seq_dim_key_defaults={"token_logits": 1},
+        )
+    ]
+
     # The following two config options are really only used for testing, but may also be useful for getting output from
     # specific layers of the model.
     config_modifiers_init = {}
     if hybrid_override_pattern is not None:
         config_modifiers_init["hybrid_override_pattern"] = hybrid_override_pattern
     if num_layers is not None:
         config_modifiers_init["num_layers"] = num_layers
+
+    tokenizer = get_nmt_tokenizer("byte-level")
+
     # Select model config based on model type
     if model_type == "hyena":
         if "-1m" in model_size and "nv" not in model_size and seq_len_interpolation_factor is None:
@@ -412,6 +394,20 @@ def predict(
             vortex_style_fp8=fp8 and not full_fp8,
             **config_modifiers_init,
         )
+
+        if lora_checkpoint_path:
+            model_transform = Evo2LoRA(peft_ckpt_path=str(lora_checkpoint_path))
+            callbacks.append(model_transform)
+        else:
+            model_transform = None
+
+        model = HyenaPredictor(
+            config,
+            tokenizer=tokenizer,
+            output_log_prob_seqs=output_log_prob_seqs,
+            log_prob_collapse_option=log_prob_collapse_option,
+            model_transform=model_transform,
+        )
     else:  # mamba
         if model_size not in MAMBA_MODEL_OPTIONS:
             raise ValueError(f"Invalid model size for Mamba: {model_size}")
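Note the dual registration of the adapter: the same `Evo2LoRA` instance is passed to the predictor as `model_transform` (so the module is wrapped with LoRA adapters when the model is configured) and appended to `callbacks` (so, assuming it follows NeMo's PEFT callback pattern, the adapter weights at `peft_ckpt_path` are restored during trainer setup). A condensed sketch of that assumed contract, reusing `config` and `tokenizer` from the surrounding function; the checkpoint path is hypothetical:

```python
from pathlib import Path

from bionemo.evo2.run.peft import Evo2LoRA

lora_path = Path("/results/evo2_lora/checkpoints/last")  # hypothetical path
transform = Evo2LoRA(peft_ckpt_path=str(lora_path))

callbacks = [transform]  # callback role: restore adapter weights at setup time
model = HyenaPredictor(
    config,                     # from the surrounding predict() scope
    tokenizer=tokenizer,        # from the surrounding predict() scope
    model_transform=transform,  # transform role: wrap the module with adapters
    # other kwargs from the patch (output_log_prob_seqs, log_prob_collapse_option)
    # omitted for brevity
)
```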
@@ -422,6 +418,50 @@ def predict(
             **config_modifiers_init,
         )
 
+        model = MambaPredictor(
+            config,
+            tokenizer=tokenizer,
+            output_log_prob_seqs=output_log_prob_seqs,
+            log_prob_collapse_option=log_prob_collapse_option,
+        )
+
+    # Create PTL trainer.
+    trainer = nl.Trainer(
+        accelerator="gpu",
+        devices=model_parallel_size,
+        strategy=nl.MegatronStrategy(
+            drop_last_batch=False,
+            tensor_model_parallel_size=tensor_parallel_size,
+            pipeline_model_parallel_size=pipeline_model_parallel_size,
+            context_parallel_size=context_parallel_size,
+            pipeline_dtype=torch.bfloat16,
+            ckpt_load_optimizer=False,  # Needs to be False for a normal model checkpoint.
+            ckpt_save_optimizer=False,
+            ckpt_async_save=False,
+            sequence_parallel=tensor_parallel_size > 1 and sequence_parallel,
+            save_ckpt_format=ckpt_format,
+            ckpt_load_strictness="log_all",
+            data_sampler=nl.MegatronDataSampler(
+                micro_batch_size=batch_size,
+                global_batch_size=batch_size,
+                seq_len=8192,
+                output_log=False,  # this is needed for predict step to work
+            ),
+        ),
+        log_every_n_steps=1,
+        limit_val_batches=10,
+        num_sanity_val_steps=0,
+        callbacks=callbacks,
+        plugins=nl.MegatronMixedPrecision(
+            precision="bf16-mixed",
+            params_dtype=torch.bfloat16,
+            # Only use FP8 in this plugin when both the fp8 and full_fp8 flags are set.
+            # Otherwise use vortex_style_fp8 in the model config.
+            fp8="hybrid" if fp8 and full_fp8 else None,
+            fp8_amax_history_len=16 if fp8 and full_fp8 else 1,
+            fp8_amax_compute_algo="max" if fp8 and full_fp8 else "most_recent",
+        ),
+    )
     trainer.strategy._setup_optimizers = False
 
     nemo_logger = NeMoLogger(log_dir=work_dir)
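The trainer construction itself is unchanged by this patch; it moved below model construction because the `callbacks` list is consumed when `nl.Trainer` is built, so the optional LoRA callback must be appended first. A trivial illustration of the ordering constraint (`ToyTrainer` is invented for illustration):

```python
# ToyTrainer only shows that callbacks are captured at construction time,
# so anything appended afterwards would be invisible to the trainer.
class ToyTrainer:
    def __init__(self, callbacks):
        self.callbacks = list(callbacks)  # snapshot taken here


cbs = ["prediction_writer"]
cbs.append("lora_transform")  # must happen before ToyTrainer is built
trainer = ToyTrainer(callbacks=cbs)
print(trainer.callbacks)  # ['prediction_writer', 'lora_transform']
```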
@@ -437,23 +477,6 @@ def predict(
             load_artifacts=False,
         ),
     )
-    tokenizer = get_nmt_tokenizer("byte-level")
-
-    # Create appropriate model based on type
-    if model_type == "hyena":
-        model = HyenaPredictor(
-            config,
-            tokenizer=tokenizer,
-            output_log_prob_seqs=output_log_prob_seqs,
-            log_prob_collapse_option=log_prob_collapse_option,
-        )
-    else:  # mamba
-        model = MambaPredictor(
-            config,
-            tokenizer=tokenizer,
-            output_log_prob_seqs=output_log_prob_seqs,
-            log_prob_collapse_option=log_prob_collapse_option,
-        )
 
     resume.setup(trainer, model)  # this pulls weights from the starting checkpoint.
 
@@ -488,6 +511,7 @@ def main():
         hybrid_override_pattern=args.hybrid_override_pattern,
         seq_len_interpolation_factor=args.seq_len_interpolation_factor,
         num_layers=args.num_layers,
+        lora_checkpoint_path=args.lora_checkpoint_path,
     )

