From 71f2aa200de57942910cb4444308919d93f49abf Mon Sep 17 00:00:00 2001
From: jackson <jjaco34@emory.edu>
Date: Tue, 26 Aug 2025 15:47:56 +0000
Subject: [PATCH 1/3] [#124] FEATURE: add support for multiple GPU nodes

https://jacksonjjacobs.com/openproject/work_packages/124

From c185ba350baa5d7638b2afc5637a406599a62bbe Mon Sep 17 00:00:00 2001
From: jackson <jacksonjjacobs@gmail.com>
Date: Tue, 26 Aug 2025 19:53:38 +0000
Subject: [PATCH 2/3] Update devcontainer to configure ray cluster for proper
 GPU management. Update readme

---
 .devcontainer/devcontainer.json  |  8 ++++++++
 README.md                        | 15 ++++++++++-----
 quickannotator/dl/ray_jackson.py |  6 ++++--
 3 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index 9a4240ce..49f33e70 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -37,6 +37,14 @@
 		"-p", "6006:6006"
 	],
 
+	"containerEnv": {
+		// Tell Ray not to override CUDA_VISIBLE_DEVICES; we manage it ourselves. NOTE(review): confirm Ray treats the value "0" as enabling this flag.
+		"RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES": "0",
+		
+		// We set CUDA_VISIBLE_DEVICES here, as each container will need to set visible GPUs independently.
+		"CUDA_VISIBLE_DEVICES": "0,1"
+	},
+
 	"postCreateCommand": "ln -sf /opt/QuickAnnotator/quickannotator/client/package.json /opt/package.json && ln -sf /opt/QuickAnnotator/quickannotator/client/package-lock.json /opt/package-lock.json && uv pip install -e ."
 
 }
diff --git a/README.md b/README.md
index 72d7a6e3..feff5f02 100644
--- a/README.md
+++ b/README.md
@@ -54,21 +54,26 @@ By default, QuickAnnotator uses a SQLite database. If you would like to use a po
     git checkout v2.0
     ```
 
+2. Modify `.devcontainer/devcontainer.json` to suit your use case. In particular, set `CUDA_VISIBLE_DEVICES` to a comma-separated list of the GPU ids this container should use (e.g. `0,1`).
+
 2. Within VS Code, open the cloned repository and click on the "Reopen in Container" button to build the devcontainer. This will create a docker container with all the necessary dependencies to run QuickAnnotator.
 ![image](https://github.com/user-attachments/assets/b776577f-a4c2-4eb8-858c-c603ac20cc6d)
 
 
 ### Usage
-1. Connect to a Ray cluster. Ray is used to run operations which require asyncronous processing. There are three ways to connect to a Ray cluster:
-    - **Default**: By default QA will initialize a local Ray cluster within the docker container. 
-        - Note: The default ray cluster does not host the Ray dashboard.
+Once the devcontainer is built, run the following commands within the container terminal to use QuickAnnotator:
+
+1. Connect to a Ray cluster. Ray is used to run operations which require asynchronous processing. There are two ways to connect to a Ray cluster:
     - **Manual local cluster**: Run the following command to start a Ray cluster with the Ray dashboard:
         ```bash
         ray start --head --dashboard-host 0.0.0.0
         ```
-    - **Pre-existing cluster**: If you would like QA to connect to an existing Ray cluster, use the `--cluster_address` argument.
+    - **Pre-existing cluster**: To add the container to an existing cluster, use the `--cluster_address` argument.
+        ```bash
+        ray start --cluster_address <cluster_address>
+        ```
 
-2. Once the devcontainer is built, you can run the following command to start the QuickAnnotator server:
+2. Run the following command to start the QuickAnnotator server:
     ```
     (venv) root@e4392ecdd8ef:/opt/QuickAnnotator# quickannotator
     * Serving Flask app '__main__'
diff --git a/quickannotator/dl/ray_jackson.py b/quickannotator/dl/ray_jackson.py
index 9fef03dd..8c40ce53 100644
--- a/quickannotator/dl/ray_jackson.py
+++ b/quickannotator/dl/ray_jackson.py
@@ -1,4 +1,5 @@
 import logging
+import os
 from quickannotator.db.logging import LoggingManager
 import ray
 from ray.train import ScalingConfig
@@ -49,12 +50,13 @@ def start_dlproc(self, allow_pred=True):
         self.setProcRunningSince()
 
         total_gpus = ray.cluster_resources().get("GPU", 0)
-        self.logger.info(f"Total GPUs available: {total_gpus}")
+        self.logger.info(f"{os.environ['RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES']=}")
+        self.logger.info(f"{os.environ['CUDA_VISIBLE_DEVICES']=}")
         scaling_config = ray.train.ScalingConfig(
             num_workers=int(total_gpus),
             use_gpu=True,
             resources_per_worker={"GPU": .01},
-            placement_strategy="STRICT_SPREAD"
+            # placement_strategy="STRICT_SPREAD"  #TODO: remove
         )
     
         trainer = ray.train.torch.TorchTrainer(

From 1ce08f419a3351d65a2b7cafcad4ebf74ed3a4b2 Mon Sep 17 00:00:00 2001
From: jackson <jacksonjjacobs@gmail.com>
Date: Tue, 26 Aug 2025 20:09:44 +0000
Subject: [PATCH 3/3] Fix README to update argument for connecting to
 pre-existing ray cluster

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index feff5f02..acdc7f4c 100644
--- a/README.md
+++ b/README.md
@@ -68,9 +68,9 @@ Once the devcontainer is built, run the following commands within the container
         ```bash
         ray start --head --dashboard-host 0.0.0.0
         ```
-    - **Pre-existing cluster**: To add the container to an existing cluster, use the `--cluster_address` argument.
+    - **Pre-existing cluster**: To add the container to an existing cluster, use the `--address` argument.
         ```bash
-        ray start --cluster_address <cluster_address>
+        ray start --address <cluster_address>
         ```
 
 2. Run the following command to start the QuickAnnotator server: