From 2f9747156885cfb26cb8e407ab0141b28518a104 Mon Sep 17 00:00:00 2001 From: Anoushka Bhutani Date: Mon, 11 May 2026 11:49:01 -0400 Subject: [PATCH 1/3] cache models and uv packages --- .github/workflows/test-hf-models.yml | 14 ++- opt/package/test_hf_org.py | 132 ++++++++++++++++----------- opt/package/test_inference.py | 9 +- 3 files changed, 92 insertions(+), 63 deletions(-) diff --git a/.github/workflows/test-hf-models.yml b/.github/workflows/test-hf-models.yml index 6c777cb7..5f59a7c9 100644 --- a/.github/workflows/test-hf-models.yml +++ b/.github/workflows/test-hf-models.yml @@ -20,6 +20,13 @@ on: jobs: test-models: runs-on: ubuntu-latest + env: + HF_HOME: ${{ runner.temp }}/huggingface + HF_HUB_CACHE: ${{ runner.temp }}/huggingface/hub + HF_HUB_DISABLE_PROGRESS_BARS: "1" + HF_HUB_DISABLE_XET: "1" + HF_MODULES_CACHE: ${{ runner.temp }}/huggingface/modules + WANDB_MODE: disabled strategy: fail-fast: false matrix: @@ -48,11 +55,12 @@ jobs: - name: Install project dependencies run: uv sync --all-extras --dev + - name: Free package download cache + run: uv cache clean + - name: Run tests - env: - WANDB_MODE: disabled run: | - uv run pytest opt/package/test_hf_org.py::${{ matrix.test-class }} \ + uv run --no-sync pytest opt/package/test_hf_org.py::${{ matrix.test-class }} \ -v \ --log-cli-level=INFO \ --durations=10 \ diff --git a/opt/package/test_hf_org.py b/opt/package/test_hf_org.py index 575ad4bb..9b31ffca 100644 --- a/opt/package/test_hf_org.py +++ b/opt/package/test_hf_org.py @@ -1,11 +1,16 @@ """Tests for models in the mist-models HuggingFace organization.""" +from contextlib import contextmanager +import gc import logging import os from pathlib import Path +import tempfile +from typing import Iterator + import pytest from huggingface_hub import HfApi -from transformers import AutoModel +from transformers import AutoConfig, AutoModel from .test_inference import ( single_molecule_smiles, @@ -19,6 +24,37 @@ logger = logging.getLogger(__name__) 
+@contextmanager +def loaded_hf_model(model_id: str, hf_token: str | None) -> Iterator[object]: + """Load one HF model in an isolated cache that is removed after the check.""" + with tempfile.TemporaryDirectory( + prefix="hf-model-cache-", dir=os.getenv("RUNNER_TEMP") or None + ) as cache_dir: + model = AutoModel.from_pretrained( + model_id, + trust_remote_code=True, + token=hf_token, + cache_dir=cache_dir, + ) + try: + yield model + finally: + del model + gc.collect() + + +def load_hf_config(model_id: str, hf_token: str | None): + with tempfile.TemporaryDirectory( + prefix="hf-config-cache-", dir=os.getenv("RUNNER_TEMP") or None + ) as cache_dir: + return AutoConfig.from_pretrained( + model_id, + trust_remote_code=True, + token=hf_token, + cache_dir=cache_dir, + ) + + @pytest.fixture def hf_token(): # Not testing private models for now @@ -49,27 +85,24 @@ def test_predict_single_molecules( for model_id in single_models: logger.info(f"Testing {model_id}") - model = AutoModel.from_pretrained( - model_id, trust_remote_code=True, token=hf_token - ) - - if "RobertaPreLayerNormModel" in type(model).__name__: - logger.info("Skipping encoder-only model") - continue - - predictions = model.predict(single_molecule_smiles) - assert predictions is not None - - if isinstance(predictions, dict): - assert len(predictions) > 0 - for task_name, task_data in predictions.items(): - if isinstance(task_data, dict) and "value" in task_data: - values = task_data["value"] - assert len(values) == len(single_molecule_smiles) - validate_predictions(values, name=f"{model_id}:{task_name}") - else: - assert len(predictions) == len(single_molecule_smiles) - validate_predictions(predictions, name=model_id) + with loaded_hf_model(model_id, hf_token) as model: + if "RobertaPreLayerNormModel" in type(model).__name__: + logger.info("Skipping encoder-only model") + continue + + predictions = model.predict(single_molecule_smiles) + assert predictions is not None + + if isinstance(predictions, dict): + 
assert len(predictions) > 0 + for task_name, task_data in predictions.items(): + if isinstance(task_data, dict) and "value" in task_data: + values = task_data["value"] + assert len(values) == len(single_molecule_smiles) + validate_predictions(values, name=f"{model_id}:{task_name}") + else: + assert len(predictions) == len(single_molecule_smiles) + validate_predictions(predictions, name=model_id) class TestHFOrgConductivityModels: @@ -81,18 +114,15 @@ def test_predict_mixtures(self, hf_org_models, hf_token, conductivity_test_data) ] for model_id in cond_models: logger.info(f"Testing {model_id}") - model = AutoModel.from_pretrained( - model_id, trust_remote_code=True, token=hf_token - ) - - predictions = model.predict(conductivity_test_data) - assert predictions is not None + with loaded_hf_model(model_id, hf_token) as model: + predictions = model.predict(conductivity_test_data) + assert predictions is not None - if isinstance(predictions, dict): - for key, value in predictions.items(): - validate_predictions(value, name=f"{model_id}:{key}") - else: - validate_predictions(predictions, name=model_id) + if isinstance(predictions, dict): + for key, value in predictions.items(): + validate_predictions(value, name=f"{model_id}:{key}") + else: + validate_predictions(predictions, name=model_id) class TestHFOrgExcessPhysicsModels: @@ -104,32 +134,26 @@ def test_predict_binary_mixture(self, hf_org_models, hf_token, excess_test_data) for model_id in excess_models: logger.info(f"Testing {model_id}") - model = AutoModel.from_pretrained( - model_id, trust_remote_code=True, token=hf_token - ) + with loaded_hf_model(model_id, hf_token) as model: + predictions = model.predict( + smiles_list=test_case["smiles_list"], + composition=test_case["composition"], + temperature=test_case["temperature"], + ) - predictions = model.predict( - smiles_list=test_case["smiles_list"], - composition=test_case["composition"], - temperature=test_case["temperature"], - ) + assert predictions is not None 
- assert predictions is not None - - if isinstance(predictions, dict): - for key, value in predictions.items(): - validate_predictions(value, name=f"{model_id}:{key}") + if isinstance(predictions, dict): + for key, value in predictions.items(): + validate_predictions(value, name=f"{model_id}:{key}") class TestHFOrgModelIntegrity: def test_all_models_config(self, hf_org_models, hf_token): for model_id in hf_org_models: logger.info(f"Checking config for {model_id}") - model = AutoModel.from_pretrained( - model_id, trust_remote_code=True, token=hf_token - ) - assert hasattr(model, "config") - assert model.config is not None + config = load_hf_config(model_id, hf_token) + assert config is not None def test_models_required_files(self, hf_org_models, hf_token): api = HfApi(token=hf_token) @@ -149,10 +173,8 @@ def test_models_required_files(self, hf_org_models, hf_token): def test_multi_channel_model_labels(self, hf_org_models, hf_token): for model_id in hf_org_models: logger.info(f"Checking channels for {model_id}") - model = AutoModel.from_pretrained( - model_id, trust_remote_code=True, token=hf_token - ) - check_multi_channel_labels(model, model_id) + config = load_hf_config(model_id, hf_token) + check_multi_channel_labels(config, model_id) if __name__ == "__main__": diff --git a/opt/package/test_inference.py b/opt/package/test_inference.py index f4e73072..3e5901da 100644 --- a/opt/package/test_inference.py +++ b/opt/package/test_inference.py @@ -115,15 +115,14 @@ def get_model_type_from_path(path: Path) -> str: return "single" -def check_multi_channel_labels(model, model_name: str): +def check_multi_channel_labels(model_or_config, model_name: str): """Verify that multi-output models have channel labels.""" - if "RobertaPreLayerNormModel" in type(model).__name__: + type_name = type(model_or_config).__name__ + if "RobertaPreLayerNorm" in type_name: return - if not hasattr(model, "config"): - return + config = getattr(model_or_config, "config", model_or_config) - config 
= model.config if not hasattr(config, "task_network"): return From 2224a11cfb0af0bbf379d2dc78021bd0fd67d5f1 Mon Sep 17 00:00:00 2001 From: Anoushka Bhutani Date: Mon, 11 May 2026 11:54:08 -0400 Subject: [PATCH 2/3] move HF cache env vars to later step --- .github/workflows/test-hf-models.yml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/workflows/test-hf-models.yml b/.github/workflows/test-hf-models.yml index 5f59a7c9..407427d1 100644 --- a/.github/workflows/test-hf-models.yml +++ b/.github/workflows/test-hf-models.yml @@ -21,11 +21,6 @@ jobs: test-models: runs-on: ubuntu-latest env: - HF_HOME: ${{ runner.temp }}/huggingface - HF_HUB_CACHE: ${{ runner.temp }}/huggingface/hub - HF_HUB_DISABLE_PROGRESS_BARS: "1" - HF_HUB_DISABLE_XET: "1" - HF_MODULES_CACHE: ${{ runner.temp }}/huggingface/modules WANDB_MODE: disabled strategy: fail-fast: false @@ -59,6 +54,12 @@ jobs: run: uv cache clean - name: Run tests + env: + HF_HOME: ${{ runner.temp }}/huggingface + HF_HUB_CACHE: ${{ runner.temp }}/huggingface/hub + HF_HUB_DISABLE_PROGRESS_BARS: "1" + HF_HUB_DISABLE_XET: "1" + HF_MODULES_CACHE: ${{ runner.temp }}/huggingface/modules run: | uv run --no-sync pytest opt/package/test_hf_org.py::${{ matrix.test-class }} \ -v \ --log-cli-level=INFO \ From 902c47074bf86b3ee1ddae5f25ec3e47f3f7b259 Mon Sep 17 00:00:00 2001 From: Anoushka Bhutani Date: Mon, 11 May 2026 16:02:08 -0400 Subject: [PATCH 3/3] separate each model check into a test --- .github/workflows/test-hf-models.yml | 3 +- opt/package/test_hf_org.py | 221 ++++++++++++++++----------- 2 files changed, 129 insertions(+), 95 deletions(-) diff --git a/.github/workflows/test-hf-models.yml b/.github/workflows/test-hf-models.yml index 407427d1..2752aae1 100644 --- a/.github/workflows/test-hf-models.yml +++ b/.github/workflows/test-hf-models.yml @@ -64,8 +64,7 @@ jobs: uv run --no-sync pytest opt/package/test_hf_org.py::${{ matrix.test-class }} \ -v \ --log-cli-level=INFO \ - --durations=10 \ + 
--durations=10 - name: Upload test results if: always() diff --git a/opt/package/test_hf_org.py b/opt/package/test_hf_org.py index 9b31ffca..9701f62c 100644 --- a/opt/package/test_hf_org.py +++ b/opt/package/test_hf_org.py @@ -1,6 +1,7 @@ """Tests for models in the mist-models HuggingFace organization.""" from contextlib import contextmanager +from functools import lru_cache import gc import logging import os @@ -22,6 +23,7 @@ ) logger = logging.getLogger(__name__) +HF_ORG = "mist-models" @contextmanager @@ -55,6 +57,69 @@ def load_hf_config(model_id: str, hf_token: str | None): ) +@lru_cache +def list_hf_org_model_ids(hf_token: str | None) -> tuple[str, ...]: + api = HfApi(token=hf_token) + model_ids = tuple(m.id for m in api.list_models(author=HF_ORG)) + logger.info("Found %d models in %s organization", len(model_ids), HF_ORG) + return model_ids + + +def parametrize_model_ids(metafunc, fixture_name: str, model_ids: tuple[str, ...]): + if model_ids: + metafunc.parametrize( + fixture_name, model_ids, ids=lambda m: m.rsplit("/", 1)[-1] + ) + return + + metafunc.parametrize( + fixture_name, + [ + pytest.param( + None, + marks=pytest.mark.skip(reason=f"No {fixture_name} models found"), + ) + ], + ids=["no-models"], + ) + + +def pytest_generate_tests(metafunc): + model_ids = list_hf_org_model_ids(os.getenv("HF_TOKEN")) + + if "hf_model_id" in metafunc.fixturenames: + parametrize_model_ids(metafunc, "hf_model_id", model_ids) + + if "single_model_id" in metafunc.fixturenames: + parametrize_model_ids( + metafunc, + "single_model_id", + tuple( + m for m in model_ids if get_model_type_from_path(Path(m)) == "single" + ), + ) + + if "conductivity_model_id" in metafunc.fixturenames: + parametrize_model_ids( + metafunc, + "conductivity_model_id", + tuple( + m + for m in model_ids + if get_model_type_from_path(Path(m)) == "conductivity" + ), + ) + + if "excess_model_id" in metafunc.fixturenames: + parametrize_model_ids( + metafunc, + "excess_model_id", + tuple( + m for m in 
model_ids if get_model_type_from_path(Path(m)) == "excess" + ), + ) + + @pytest.fixture def hf_token(): # Not testing private models for now @@ -62,119 +127,89 @@ def hf_token(): return os.getenv("HF_TOKEN") -@pytest.fixture -def hf_org_models(hf_token): - api = HfApi(token=hf_token) - models = list(api.list_models(author="mist-models")) - - if not models: - pytest.skip("No models found in mist-models organization") - - model_ids = [m.id for m in models] - logger.info(f"Found {len(model_ids)} models in mist-models organization") - return model_ids - - class TestHFOrgSingleMoleculeModels: def test_predict_single_molecules( - self, hf_org_models, hf_token, single_molecule_smiles + self, single_model_id, hf_token, single_molecule_smiles ): - single_models = [ - m for m in hf_org_models if get_model_type_from_path(Path(m)) == "single" - ] - - for model_id in single_models: - logger.info(f"Testing {model_id}") - with loaded_hf_model(model_id, hf_token) as model: - if "RobertaPreLayerNormModel" in type(model).__name__: - logger.info("Skipping encoder-only model") - continue - - predictions = model.predict(single_molecule_smiles) - assert predictions is not None - - if isinstance(predictions, dict): - assert len(predictions) > 0 - for task_name, task_data in predictions.items(): - if isinstance(task_data, dict) and "value" in task_data: - values = task_data["value"] - assert len(values) == len(single_molecule_smiles) - validate_predictions(values, name=f"{model_id}:{task_name}") - else: - assert len(predictions) == len(single_molecule_smiles) - validate_predictions(predictions, name=model_id) + logger.info(f"Testing {single_model_id}") + with loaded_hf_model(single_model_id, hf_token) as model: + if "RobertaPreLayerNormModel" in type(model).__name__: + pytest.skip("Skipping encoder-only model") + + predictions = model.predict(single_molecule_smiles) + assert predictions is not None + + if isinstance(predictions, dict): + assert len(predictions) > 0 + for task_name, 
task_data in predictions.items(): + if isinstance(task_data, dict) and "value" in task_data: + values = task_data["value"] + assert len(values) == len(single_molecule_smiles) + validate_predictions( + values, name=f"{single_model_id}:{task_name}" + ) + else: + assert len(predictions) == len(single_molecule_smiles) + validate_predictions(predictions, name=single_model_id) class TestHFOrgConductivityModels: - def test_predict_mixtures(self, hf_org_models, hf_token, conductivity_test_data): - cond_models = [ - m - for m in hf_org_models - if get_model_type_from_path(Path(m)) == "conductivity" - ] - for model_id in cond_models: - logger.info(f"Testing {model_id}") - with loaded_hf_model(model_id, hf_token) as model: - predictions = model.predict(conductivity_test_data) - assert predictions is not None - - if isinstance(predictions, dict): - for key, value in predictions.items(): - validate_predictions(value, name=f"{model_id}:{key}") - else: - validate_predictions(predictions, name=model_id) + def test_predict_mixtures( + self, conductivity_model_id, hf_token, conductivity_test_data + ): + logger.info(f"Testing {conductivity_model_id}") + with loaded_hf_model(conductivity_model_id, hf_token) as model: + predictions = model.predict(conductivity_test_data) + assert predictions is not None + + if isinstance(predictions, dict): + for key, value in predictions.items(): + validate_predictions(value, name=f"{conductivity_model_id}:{key}") + else: + validate_predictions(predictions, name=conductivity_model_id) class TestHFOrgExcessPhysicsModels: - def test_predict_binary_mixture(self, hf_org_models, hf_token, excess_test_data): - excess_models = [ - m for m in hf_org_models if get_model_type_from_path(Path(m)) == "excess" - ] + def test_predict_binary_mixture(self, excess_model_id, hf_token, excess_test_data): test_case = excess_test_data[0] + logger.info(f"Testing {excess_model_id}") + with loaded_hf_model(excess_model_id, hf_token) as model: + predictions = model.predict( + 
smiles_list=test_case["smiles_list"], + composition=test_case["composition"], + temperature=test_case["temperature"], + ) - for model_id in excess_models: - logger.info(f"Testing {model_id}") - with loaded_hf_model(model_id, hf_token) as model: - predictions = model.predict( - smiles_list=test_case["smiles_list"], - composition=test_case["composition"], - temperature=test_case["temperature"], - ) - - assert predictions is not None + assert predictions is not None - if isinstance(predictions, dict): - for key, value in predictions.items(): - validate_predictions(value, name=f"{model_id}:{key}") + if isinstance(predictions, dict): + for key, value in predictions.items(): + validate_predictions(value, name=f"{excess_model_id}:{key}") class TestHFOrgModelIntegrity: - def test_all_models_config(self, hf_org_models, hf_token): - for model_id in hf_org_models: - logger.info(f"Checking config for {model_id}") - config = load_hf_config(model_id, hf_token) - assert config is not None + def test_all_models_config(self, hf_model_id, hf_token): + logger.info(f"Checking config for {hf_model_id}") + config = load_hf_config(hf_model_id, hf_token) + assert config is not None - def test_models_required_files(self, hf_org_models, hf_token): + def test_models_required_files(self, hf_model_id, hf_token): api = HfApi(token=hf_token) + model_info = api.model_info(hf_model_id) + siblings = {f.rfilename for f in model_info.siblings} - for model_id in hf_org_models: - model_info = api.model_info(model_id) - siblings = {f.rfilename for f in model_info.siblings} + assert "config.json" in siblings, f"{hf_model_id} missing config.json" + assert "README.md" in siblings, f"{hf_model_id} missing README.md" - assert "config.json" in siblings, f"{model_id} missing config.json" - assert "README.md" in siblings, f"{model_id} missing README.md" - - has_weights = any( - "safetensors" in f or "pytorch_model.bin" in f for f in siblings - ) - assert has_weights, f"{model_id} missing model weights" + 
has_weights = any( + "safetensors" in f or "pytorch_model.bin" in f for f in siblings + ) + assert has_weights, f"{hf_model_id} missing model weights" - def test_multi_channel_model_labels(self, hf_org_models, hf_token): - for model_id in hf_org_models: - logger.info(f"Checking channels for {model_id}") - config = load_hf_config(model_id, hf_token) - check_multi_channel_labels(config, model_id) + def test_multi_channel_model_labels(self, hf_model_id, hf_token): + logger.info(f"Checking channels for {hf_model_id}") + config = load_hf_config(hf_model_id, hf_token) + check_multi_channel_labels(config, hf_model_id) if __name__ == "__main__":