From 71f2aa200de57942910cb4444308919d93f49abf Mon Sep 17 00:00:00 2001 From: jackson Date: Tue, 26 Aug 2025 15:47:56 +0000 Subject: [PATCH 1/3] [#124] FEATURE: add support for multiple GPU nodes https://jacksonjjacobs.com/openproject/work_packages/124 From c185ba350baa5d7638b2afc5637a406599a62bbe Mon Sep 17 00:00:00 2001 From: jackson Date: Tue, 26 Aug 2025 19:53:38 +0000 Subject: [PATCH 2/3] Update devcontainer to configure ray cluster for proper GPU management. Update readme --- .devcontainer/devcontainer.json | 8 ++++++++ README.md | 15 ++++++++++----- quickannotator/dl/ray_jackson.py | 6 ++++-- 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 9a4240ce..49f33e70 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -37,6 +37,14 @@ "-p", "6006:6006" ], + "containerEnv": { + // We always want to manage CUDA_VISIBLE_DEVICES ourselves. + "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES": "0", + + // We set CUDA_VISIBLE_DEVICES here, as each container will need to set visible GPUs independently. + "CUDA_VISIBLE_DEVICES": "0,1" + }, + "postCreateCommand": "ln -sf /opt/QuickAnnotator/quickannotator/client/package.json /opt/package.json && ln -sf /opt/QuickAnnotator/quickannotator/client/package-lock.json /opt/package-lock.json && uv pip install -e ." } diff --git a/README.md b/README.md index 72d7a6e3..feff5f02 100644 --- a/README.md +++ b/README.md @@ -54,21 +54,26 @@ By default, QuickAnnotator uses a SQLite database. If you would like to use a po git checkout v2.0 ``` +2. Modify `devcontainer.json` to suit your use case. Particularly, change the value of `CUDA_VISIBLE_DEVICES` to your desired GPU ids. + 2. Within VS Code, open the cloned repository and click on the "Reopen in Container" button to build the devcontainer. This will create a docker container with all the necessary dependencies to run QuickAnnotator. 
![image](https://github.com/user-attachments/assets/b776577f-a4c2-4eb8-858c-c603ac20cc6d) ### Usage -1. Connect to a Ray cluster. Ray is used to run operations which require asyncronous processing. There are three ways to connect to a Ray cluster: - **Default**: By default QA will initialize a local Ray cluster within the docker container. - Note: The default ray cluster does not host the Ray dashboard. +Once the devcontainer is built, run the following commands within the container terminal to use QuickAnnotator: + +1. Connect to a Ray cluster. Ray is used to run operations which require asynchronous processing. There are two ways to connect to a Ray cluster: - **Manual local cluster**: Run the following command to start a Ray cluster with the Ray dashboard: ```bash ray start --head --dashboard-host 0.0.0.0 ``` - **Pre-existing cluster**: If you would like QA to connect to an existing Ray cluster, use the `--cluster_address` argument. + - **Pre-existing cluster**: To add the container to an existing cluster, use the `--cluster_address` argument. + ```bash + ray start --cluster_address + ``` -2. Once the devcontainer is built, you can run the following command to start the QuickAnnotator server: +2. 
Run the following command to start the QuickAnnotator server: ``` (venv) root@e4392ecdd8ef:/opt/QuickAnnotator# quickannotator * Serving Flask app '__main__' diff --git a/quickannotator/dl/ray_jackson.py b/quickannotator/dl/ray_jackson.py index 9fef03dd..8c40ce53 100644 --- a/quickannotator/dl/ray_jackson.py +++ b/quickannotator/dl/ray_jackson.py @@ -1,4 +1,5 @@ import logging +import os from quickannotator.db.logging import LoggingManager import ray from ray.train import ScalingConfig @@ -49,12 +50,13 @@ def start_dlproc(self, allow_pred=True): self.setProcRunningSince() total_gpus = ray.cluster_resources().get("GPU", 0) - self.logger.info(f"Total GPUs available: {total_gpus}") + self.logger.info(f"{os.environ['RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES']=}") + self.logger.info(f"{os.environ['CUDA_VISIBLE_DEVICES']=}") scaling_config = ray.train.ScalingConfig( num_workers=int(total_gpus), use_gpu=True, resources_per_worker={"GPU": .01}, - placement_strategy="STRICT_SPREAD" + # placement_strategy="STRICT_SPREAD" #TODO: remove ) trainer = ray.train.torch.TorchTrainer( From 1ce08f419a3351d65a2b7cafcad4ebf74ed3a4b2 Mon Sep 17 00:00:00 2001 From: jackson Date: Tue, 26 Aug 2025 20:09:44 +0000 Subject: [PATCH 3/3] Fix README to update argument for connecting to pre-existing ray cluster --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index feff5f02..acdc7f4c 100644 --- a/README.md +++ b/README.md @@ -68,9 +68,9 @@ Once the devcontainer is built, run the following commands within the container ```bash ray start --head --dashboard-host 0.0.0.0 ``` - - **Pre-existing cluster**: To add the container to an existing cluster, use the `--cluster_address` argument. + - **Pre-existing cluster**: To add the container to an existing cluster, use the `--address` argument. ```bash - ray start --cluster_address + ray start --address ``` 2. Run the following command to start the QuickAnnotator server: