# --- .github/workflows/medcat-den_release.yml: job appended under `jobs:` ---

  # test-time models for download
  upload-test-models:
    needs: test-and-publish-to-PyPI
    # A reusable workflow (`on: workflow_call`) must be referenced at the job
    # level; it cannot be used as a step. The path must match the file that
    # actually exists: upload-test-models-to-release.yml.
    uses: ./.github/workflows/upload-test-models-to-release.yml
    permissions:
      contents: write  # needed to attach assets to the release
    with:
      tag: ${{ github.ref_name }}

# --- .github/workflows/medcat-v2_main.yml: job added under `jobs:` ---

  test-resource-utils:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
      # Runs in ./medcat-v2 (workflow default working-directory); fails when
      # the two copies of resource_fetch.py have diverged.
      - name: Check test utils are in sync
        run: diff tests/resource_fetch.py ../medcat-den/tests/resource_fetch.py

# --- .github/workflows/medcat-v2_release.yml: job appended under `jobs:` ---

  # test-time models for download
  upload-test-models:
    needs: release
    # Job-level call of the reusable workflow (see note in medcat-den_release.yml).
    uses: ./.github/workflows/upload-test-models-to-release.yml
    permissions:
      contents: write  # needed to attach assets to the release
    with:
      tag: ${{ github.ref_name }}

# --- .github/workflows/upload-test-models-to-release.yml (new file) ---

# .github/workflows/upload-test-models-to-release.yml
name: Upload test models to release

on:
  workflow_call:
    inputs:
      tag:
        required: true
        type: string

jobs:
  upload-test-models:
    runs-on: ubuntu-latest
    permissions:
      contents: write
    steps:
      - uses: actions/checkout@v4

      - name: Upload test models to release
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # Pass the tag through the environment instead of interpolating it
          # into the script, so it is never parsed as shell syntax.
          RELEASE_TAG: ${{ inputs.tag }}
        run: |
          gh release upload "$RELEASE_TAG" medcat-test-models/* --clobber
But to me the alternatives +# are note better: +# a) keep and install a separate local project - not portable +# b) publish and install from PyPI - extra maintenance burden + + +import os +import pooch +import importlib +from enum import Enum + + +_REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')) +_CENTRAL_RESOURCES = os.path.join(_REPO_ROOT, 'medcat-test-models') + +class DefinedResource(Enum): + v1_model = "mct_v1_model_pack.zip" + v2_model = "mct2_model_pack.zip" + + +def _get_version(project_name: str = 'medcat') -> str: + # NOTE: plan to use this for medcat-den as well + try: + pkg = importlib.import_module(project_name) + ver = getattr(pkg, '__version__') + if ver is None: + raise + return "%2F".join((project_name, f"v{ver}")) + except ImportError: + raise RuntimeError( + f"Could not determine version for '{project_name}'. " + f"Is the package installed?" + ) + + +def _download_resource(version: str, relative_path: str) -> str: + url = f"https://github.com/CogStack/cogstack-nlp/releases/download/{version}/{relative_path}" + try: + return pooch.retrieve( + url=url, + known_hash=None, + path=pooch.os_cache('medcat_tests'), + fname=relative_path, + ) + except Exception as e: + raise FileNotFoundError( + f"Test resource '{relative_path}' not found locally in '{_CENTRAL_RESOURCES}' " + f"and could not be fetched from release {version!r}. " + f"If developing locally, ensure 'medcat-test-models/' exists at the repo root. " + f"Original error: {e}" + ) from e + + +def get_resource(relative_path: str | DefinedResource, project_name: str = 'medcat') -> str: + """ + Returns a local path to the requested test resource. + Prefers the central repo location (medcat-test-models/) if available, + falls back to downloading from the corresponding release via pooch. 
+ """ + # allow passing string version of defined resoure (e.g v1_model) + try: + relative_path = DefinedResource[relative_path] + except KeyError: + pass # treat as a literal path + if isinstance(relative_path, DefinedResource): + relative_path = relative_path.value + central_path = os.path.join(_CENTRAL_RESOURCES, relative_path) + + if os.path.exists(central_path): + return central_path + + version = _get_version(project_name) + return _download_resource(version, relative_path) diff --git a/medcat-den/tests/resources/mct2_model_pack.zip b/medcat-test-models/mct2_model_pack.zip similarity index 100% rename from medcat-den/tests/resources/mct2_model_pack.zip rename to medcat-test-models/mct2_model_pack.zip diff --git a/medcat-den/tests/resources/mct_v1_model_pack.zip b/medcat-test-models/mct_v1_model_pack.zip similarity index 100% rename from medcat-den/tests/resources/mct_v1_model_pack.zip rename to medcat-test-models/mct_v1_model_pack.zip diff --git a/medcat-v2/pyproject.toml b/medcat-v2/pyproject.toml index 769f49bbb..9dd28f816 100644 --- a/medcat-v2/pyproject.toml +++ b/medcat-v2/pyproject.toml @@ -86,6 +86,7 @@ dev = [ "types-tqdm", "types-setuptools", "types-PyYAML", + "pooch", ] spacy = [ "spacy", diff --git a/medcat-v2/tests/__init__.py b/medcat-v2/tests/__init__.py index dd45aecd9..55d3eb60a 100644 --- a/medcat-v2/tests/__init__.py +++ b/medcat-v2/tests/__init__.py @@ -2,14 +2,14 @@ import os import shutil +from .resource_fetch import get_resource + RESOURCES_PATH = os.path.join(os.path.dirname(__file__), "resources") -EXAMPLE_MODEL_PACK_ZIP = os.path.join(RESOURCES_PATH, "mct2_model_pack.zip") -UNPACKED_EXAMPLE_MODEL_PACK_PATH = os.path.join( - RESOURCES_PATH, "mct2_model_pack") -V1_MODEL_PACK_PATH = os.path.join(RESOURCES_PATH, "mct_v1_model_pack.zip") -UNPACKED_V1_MODEL_PACK_PATH = os.path.join( - RESOURCES_PATH, "mct_v1_model_pack") +EXAMPLE_MODEL_PACK_ZIP = get_resource("mct2_model_pack.zip") +UNPACKED_EXAMPLE_MODEL_PACK_PATH = 
EXAMPLE_MODEL_PACK_ZIP.removesuffix(".zip") +V1_MODEL_PACK_PATH = get_resource("mct_v1_model_pack.zip") +UNPACKED_V1_MODEL_PACK_PATH = V1_MODEL_PACK_PATH.removesuffix(".zip") # unpack model pack at start so we can access stuff like Vocab diff --git a/medcat-v2/tests/resource_fetch.py b/medcat-v2/tests/resource_fetch.py new file mode 100644 index 000000000..6ac710958 --- /dev/null +++ b/medcat-v2/tests/resource_fetch.py @@ -0,0 +1,78 @@ +# NOTE: this file is designed to be copied across the following sub-folders +# 1. medcat-v2/tests/resource_fetch.py +# 2. medcat-den/tests/resource_fetch.py +# So if you make changes here, copy them over to the others as well. +# +# NB! This does mean we have duplicate code. But to me the alternatives +# are note better: +# a) keep and install a separate local project - not portable +# b) publish and install from PyPI - extra maintenance burden + + +import os +import pooch +import importlib +from enum import Enum + + +_REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')) +_CENTRAL_RESOURCES = os.path.join(_REPO_ROOT, 'medcat-test-models') + +class DefinedResource(Enum): + v1_model = "mct_v1_model_pack.zip" + v2_model = "mct2_model_pack.zip" + + +def _get_version(project_name: str = 'medcat') -> str: + # NOTE: plan to use this for medcat-den as well + try: + pkg = importlib.import_module(project_name) + ver = getattr(pkg, '__version__') + if ver is None: + raise + return "%2F".join((project_name, f"v{ver}")) + except ImportError: + raise RuntimeError( + f"Could not determine version for '{project_name}'. " + f"Is the package installed?" 
+ ) + + +def _download_resource(version: str, relative_path: str) -> str: + url = f"https://github.com/CogStack/cogstack-nlp/releases/download/{version}/{relative_path}" + try: + return pooch.retrieve( + url=url, + known_hash=None, + path=pooch.os_cache('medcat_tests'), + fname=relative_path, + ) + except Exception as e: + raise FileNotFoundError( + f"Test resource '{relative_path}' not found locally in '{_CENTRAL_RESOURCES}' " + f"and could not be fetched from release {version!r}. " + f"If developing locally, ensure 'medcat-test-models/' exists at the repo root. " + f"Original error: {e}" + ) from e + + +def get_resource(relative_path: str | DefinedResource, project_name: str = 'medcat') -> str: + """ + Returns a local path to the requested test resource. + Prefers the central repo location (medcat-test-models/) if available, + falls back to downloading from the corresponding release via pooch. + """ + # allow passing string version of defined resoure (e.g v1_model) + try: + relative_path = DefinedResource[relative_path] + except KeyError: + pass # treat as a literal path + if isinstance(relative_path, DefinedResource): + relative_path = relative_path.value + central_path = os.path.join(_CENTRAL_RESOURCES, relative_path) + + if os.path.exists(central_path): + return central_path + + version = _get_version(project_name) + return _download_resource(version, relative_path) diff --git a/medcat-v2/tests/resources/mct2_model_pack.zip b/medcat-v2/tests/resources/mct2_model_pack.zip deleted file mode 100644 index b6bc74e49..000000000 Binary files a/medcat-v2/tests/resources/mct2_model_pack.zip and /dev/null differ diff --git a/medcat-v2/tests/resources/mct_v1_model_pack.zip b/medcat-v2/tests/resources/mct_v1_model_pack.zip deleted file mode 100644 index 500584e4c..000000000 Binary files a/medcat-v2/tests/resources/mct_v1_model_pack.zip and /dev/null differ diff --git a/medcat-v2/tests/utils/legacy/test_conversion_all.py 
b/medcat-v2/tests/utils/legacy/test_conversion_all.py index a49409f52..5ceb52dc3 100644 --- a/medcat-v2/tests/utils/legacy/test_conversion_all.py +++ b/medcat-v2/tests/utils/legacy/test_conversion_all.py @@ -7,11 +7,11 @@ import unittest.mock from .test_convert_vocab import TESTS_PATH +from ... import V1_MODEL_PACK_PATH class ConversionFromZIPTests(unittest.TestCase): - MODEL_FOLDER = os.path.join(TESTS_PATH, "resources", - "mct_v1_model_pack.zip") + MODEL_FOLDER = V1_MODEL_PACK_PATH @classmethod def setUpClass(cls):