From e971662922195f0aec6f6b1eeba541b2b75cb0df Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Fri, 15 May 2026 17:32:20 -0700
Subject: [PATCH 01/39] Refactor: move model provisioning policies to
 streamwise/model_provisioner/

Move the 6 policy/allocator files (greedy, milp, naive_baseline, hexgen,
helix, policies) from simulator/ into streamwise/model_provisioner/ so
they can be reused by both the simulator evaluation framework and the
StreamWise serving system.

- Create streamwise/model_provisioner/ package with __init__.py that
  adds simulator/ to sys.path for foundation module access
- Populate the previously empty simulator/__init__.py so it adds
  streamwise/ to sys.path, making model_provisioner importable from
  simulator code
- Update all imports across simulator files and 20 test files
- Let data_loading.py accept str | Path for data_dir params, and make the
  defaults resolve to the data/ directory next to the module instead of
  the CWD-relative "data/"
- Fix mypy issue in wrapper/run_httpserver.py (stop rebinding
  payload_bytes to a bytearray; use a separate payload_buffer variable)
- Add .venv to .flake8 exclude and .gitignore

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .flake8                                       |  1 +
 .gitignore                                    |  3 +++
 simulator/__init__.py                         | 15 ++++++++++++++
 simulator/actions.py                          |  2 +-
 simulator/auto_model_allocator.py             | 12 +++++------
 simulator/data_loading.py                     | 12 ++++++-----
 simulator/model_allocator.py                  |  2 +-
 simulator/multirequests.py                    |  2 +-
 simulator/provisioning.py                     |  2 +-
 streamwise/model_provisioner/__init__.py      | 15 ++++++++++++++
 .../model_provisioner}/greedy.py              |  6 +++---
 .../model_provisioner}/helix.py               |  6 +++---
 .../model_provisioner}/hexgen.py              |  8 ++++----
 .../model_provisioner}/milp.py                |  2 +-
 .../model_provisioner}/naive_baseline.py      |  4 ++--
 .../model_provisioner}/policies.py            |  0
 tests/simulator/test_auto_model_allocator.py  | 20 +++++++++----------
 tests/simulator/test_data_loading.py          |  2 +-
 tests/simulator/test_evaluator.py             |  4 ++--
 tests/simulator/test_greedy.py                |  6 +++---
 tests/simulator/test_helix.py                 |  6 +++---
 tests/simulator/test_hexgen.py                |  8 ++++----
 tests/simulator/test_milp.py                  |  6 +++---
 tests/simulator/test_models.py                |  6 +++---
 tests/simulator/test_multirequests_derive.py  |  2 +-
 tests/simulator/test_simulator.py             |  8 ++++----
 tests/simulator/test_simulator_actions.py     |  2 +-
 tests/simulator/test_simulator_baseline.py    | 12 +++++------
 tests/simulator/test_simulator_energy.py      | 10 +++++-----
 .../simulator/test_simulator_multirequests.py |  2 +-
 tests/simulator/test_simulator_plotutils.py   |  2 +-
 tests/simulator/test_simulator_policies.py    |  6 +++---
 .../simulator/test_simulator_provisioning.py  |  8 ++++----
 tests/simulator/test_simulator_types.py       |  4 ++--
 tests/simulator/test_simulator_utils.py       |  2 +-
 tests/simulator/test_workflows.py             |  4 ++--
 wrapper/run_httpserver.py                     |  4 ++--
 37 files changed, 126 insertions(+), 90 deletions(-)
 create mode 100644 streamwise/model_provisioner/__init__.py
 rename {simulator => streamwise/model_provisioner}/greedy.py (99%)
 rename {simulator => streamwise/model_provisioner}/helix.py (99%)
 rename {simulator => streamwise/model_provisioner}/hexgen.py (99%)
 rename {simulator => streamwise/model_provisioner}/milp.py (99%)
 rename {simulator => streamwise/model_provisioner}/naive_baseline.py (99%)
 rename {simulator => streamwise/model_provisioner}/policies.py (100%)

diff --git a/.flake8 b/.flake8
index 13cb9ba1..b32f0349 100644
--- a/.flake8
+++ b/.flake8
@@ -3,3 +3,4 @@ max-line-length = 120
 # Ignore E402: module-level import not at top of file
 # Ignore W503: line break before binary operator (incompatible with W504)
 ignore = E402,W503
+exclude = .venv
diff --git a/.gitignore b/.gitignore
index 51130c5b..9807bf14 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,6 +11,9 @@
 *.sln.docstates
 *.env
 
+# Environment files
+.venv/
+
 # User-specific files (MonoDevelop/Xamarin Studio)
 *.userprefs
 
diff --git a/simulator/__init__.py b/simulator/__init__.py
index e69de29b..263309ff 100644
--- a/simulator/__init__.py
+++ b/simulator/__init__.py
@@ -0,0 +1,15 @@
+"""
+Simulator package — provisioning sweeps, multi-request analysis, and plotting
+on top of the model_provisioner allocation policies.
+
+The allocation policy implementations live in ``streamwise/model_provisioner/``.
+"""
+import os
+import sys
+
+# Make model_provisioner importable for simulator modules.
+_STREAMWISE_DIR = os.path.normpath(
+    os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "streamwise")
+)
+if _STREAMWISE_DIR not in sys.path:
+    sys.path.insert(0, _STREAMWISE_DIR)
diff --git a/simulator/actions.py b/simulator/actions.py
index debea677..69af1618 100644
--- a/simulator/actions.py
+++ b/simulator/actions.py
@@ -27,7 +27,7 @@
 from sim_types import Objective
 from sim_types import Policy
 
-from policies import STREAMWISE_POLICY
+from model_provisioner.policies import STREAMWISE_POLICY
 
 from models import get_model_allocation
 
diff --git a/simulator/auto_model_allocator.py b/simulator/auto_model_allocator.py
index ea0fda61..3ca86cb7 100644
--- a/simulator/auto_model_allocator.py
+++ b/simulator/auto_model_allocator.py
@@ -19,7 +19,7 @@
 from sim_types import GPUType
 from sim_types import Result
 
-from policies import STREAMWISE_POLICY
+from model_provisioner.policies import STREAMWISE_POLICY
 
 from model_allocator import ModelAllocator
 
@@ -47,7 +47,7 @@ def __init__(
     def _build_allocator(self) -> ModelAllocator:
         """Create concrete allocator based on configured solver."""
         if self.policy.solver == Solver.GREEDY:
-            from greedy import GreedyAllocator
+            from model_provisioner.greedy import GreedyAllocator
             return GreedyAllocator(
                 workflow=self.workflow,
                 latency_data=self.latency_data,
@@ -55,7 +55,7 @@ def _build_allocator(self) -> ModelAllocator:
                 policy=self.policy,
             )
         if self.policy.solver == Solver.NAIVE:
-            from naive_baseline import NaiveAllocator
+            from model_provisioner.naive_baseline import NaiveAllocator
             return NaiveAllocator(
                 workflow=self.workflow,
                 latency_data=self.latency_data,
@@ -63,7 +63,7 @@ def _build_allocator(self) -> ModelAllocator:
                 policy=self.policy,
             )
         if self.policy.solver in {Solver.GUROBI, Solver.HIGHS}:
-            from milp import MILPAllocator
+            from model_provisioner.milp import MILPAllocator
             return MILPAllocator(
                 workflow=self.workflow,
                 latency_data=self.latency_data,
@@ -71,7 +71,7 @@ def _build_allocator(self) -> ModelAllocator:
                 policy=self.policy,
             )
         if self.policy.solver == Solver.HEXGEN:
-            from hexgen import HexGenAllocator
+            from model_provisioner.hexgen import HexGenAllocator
             return HexGenAllocator(
                 workflow=self.workflow,
                 latency_data=self.latency_data,
@@ -79,7 +79,7 @@ def _build_allocator(self) -> ModelAllocator:
                 policy=self.policy,
             )
         if self.policy.solver == Solver.HELIX:
-            from helix import HelixAllocator
+            from model_provisioner.helix import HelixAllocator
             return HelixAllocator(
                 workflow=self.workflow,
                 latency_data=self.latency_data,
diff --git a/simulator/data_loading.py b/simulator/data_loading.py
index 6ee59ec5..af37e5b8 100644
--- a/simulator/data_loading.py
+++ b/simulator/data_loading.py
@@ -28,15 +28,17 @@
 from constants import POWER_GPU_IDLE
 from constants import POWER_GPU_TDP
 
+_DEFAULT_DATA_DIR = Path(__file__).resolve().parent / "data"
+
 
 def load_latency_data(
-    data_dir: str = "data/",
+    data_dir: str | Path = _DEFAULT_DATA_DIR,
 ) -> LatencyData:
     """
     Load latency and throughput mapping data from CSV files.
 
     Args:
-        data_dir (str): The directory where the CSV files are stored.
+        data_dir: The directory where the CSV files are stored.
     Returns:
         LatencyData: An object containing all loaded latency data.
     """
@@ -107,13 +109,13 @@ def load_latency_data(
 
 
 def load_power_data(
-    data_dir: str = "data/"
+    data_dir: str | Path = _DEFAULT_DATA_DIR
 ) -> PowerData:
     """
     Load power consumption data from CSV files.
 
     Args:
-        data_dir (str): The directory where the CSV files are stored.
+        data_dir: The directory where the CSV files are stored.
     Returns:
         PowerData: An object containing all loaded power consumption data.
     """
@@ -216,7 +218,7 @@ def load_power_data(
 
 
 def load_adaptive_quality_data(
-    data_dir: str,
+    data_dir: str | Path,
     level: QualityLevel,
 ) -> LatencyData:
     """Load latency data for adaptive quality."""
diff --git a/simulator/model_allocator.py b/simulator/model_allocator.py
index ab1c7e39..0f773a51 100644
--- a/simulator/model_allocator.py
+++ b/simulator/model_allocator.py
@@ -27,7 +27,7 @@
 from models import UpscalerModelAllocation
 from models import OthersModelAllocation
 
-from policies import NAIVE_POLICY
+from model_provisioner.policies import NAIVE_POLICY
 
 
 class ModelAllocator(ABC):
diff --git a/simulator/multirequests.py b/simulator/multirequests.py
index 4fee5d55..a8d87a8b 100644
--- a/simulator/multirequests.py
+++ b/simulator/multirequests.py
@@ -18,7 +18,7 @@
 
 from workflows import PODCAST_WORKFLOW
 
-from policies import STREAMWISE_POLICY
+from model_provisioner.policies import STREAMWISE_POLICY
 
 from auto_model_allocator import AutoModelAllocator
 
diff --git a/simulator/provisioning.py b/simulator/provisioning.py
index 43612b53..dd4f2a89 100644
--- a/simulator/provisioning.py
+++ b/simulator/provisioning.py
@@ -33,7 +33,7 @@
 
 from auto_model_allocator import AutoModelAllocator
 
-from policies import STREAMWISE_POLICY
+from model_provisioner.policies import STREAMWISE_POLICY
 
 from constants import SECONDS_IN_HOUR
 
diff --git a/streamwise/model_provisioner/__init__.py b/streamwise/model_provisioner/__init__.py
new file mode 100644
index 00000000..c79b0cde
--- /dev/null
+++ b/streamwise/model_provisioner/__init__.py
@@ -0,0 +1,15 @@
+"""
+Model Provisioner — allocation policy implementations for GPU resource distribution.
+
+Contains greedy, naive, MILP, HexGen, and Helix allocation strategies.
+The foundation types (sim_types, constants, models, etc.) live in simulator/.
+"""
+import os
+import sys
+
+# Add simulator/ to sys.path so policy files can import foundation modules.
+_SIMULATOR_DIR = os.path.normpath(
+    os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "..", "simulator")
+)
+if _SIMULATOR_DIR not in sys.path:
+    sys.path.insert(0, _SIMULATOR_DIR)
diff --git a/simulator/greedy.py b/streamwise/model_provisioner/greedy.py
similarity index 99%
rename from simulator/greedy.py
rename to streamwise/model_provisioner/greedy.py
index 459742e5..8c1a1dd0 100644
--- a/simulator/greedy.py
+++ b/streamwise/model_provisioner/greedy.py
@@ -33,9 +33,9 @@
 
 from model_allocator import ModelAllocator
 
-from policies import STREAMWISE_POLICY
-from policies import MAX_ITERATIONS
-from policies import USE_ALL_GPUS
+from .policies import STREAMWISE_POLICY
+from .policies import MAX_ITERATIONS
+from .policies import USE_ALL_GPUS
 
 from actions import gen_actions
 from actions import choose_action
diff --git a/simulator/helix.py b/streamwise/model_provisioner/helix.py
similarity index 99%
rename from simulator/helix.py
rename to streamwise/model_provisioner/helix.py
index 5891538f..e8fededf 100644
--- a/simulator/helix.py
+++ b/streamwise/model_provisioner/helix.py
@@ -43,10 +43,10 @@
 
 from evaluator import evaluate_model_allocation
 
-from milp import MILPAllocator
+from .milp import MILPAllocator
 
-from policies import HELIX_POLICY
-from policies import MAX_DEVICES
+from .policies import HELIX_POLICY
+from .policies import MAX_DEVICES
 
 from constants import DEVICE_OPTIONS
 
diff --git a/simulator/hexgen.py b/streamwise/model_provisioner/hexgen.py
similarity index 99%
rename from simulator/hexgen.py
rename to streamwise/model_provisioner/hexgen.py
index 64c64160..4f37768a 100644
--- a/simulator/hexgen.py
+++ b/streamwise/model_provisioner/hexgen.py
@@ -30,15 +30,15 @@
 from evaluator import calc_used_gpus
 from evaluator import evaluate_model_allocation
 
-from greedy import GreedyAllocator
+from .greedy import GreedyAllocator
 
 from actions import gen_actions
 from actions import choose_action
 from actions import apply_action
 
-from policies import HEXGEN_POLICY
-from policies import MAX_ITERATIONS
-from policies import USE_ALL_GPUS
+from .policies import HEXGEN_POLICY
+from .policies import MAX_ITERATIONS
+from .policies import USE_ALL_GPUS
 
 
 def _get_model_order(workflow: WorkflowConfig) -> list[Model]:
diff --git a/simulator/milp.py b/streamwise/model_provisioner/milp.py
similarity index 99%
rename from simulator/milp.py
rename to streamwise/model_provisioner/milp.py
index 7a84e754..67749258 100644
--- a/simulator/milp.py
+++ b/streamwise/model_provisioner/milp.py
@@ -40,7 +40,7 @@
 from constants import NUM_GPUS_PER_SERVER
 from constants import SECONDS_IN_HOUR
 
-from policies import STREAMWISE_MILP_POLICY
+from .policies import STREAMWISE_MILP_POLICY
 
 
 MAX_INSTANCES = 16
diff --git a/simulator/naive_baseline.py b/streamwise/model_provisioner/naive_baseline.py
similarity index 99%
rename from simulator/naive_baseline.py
rename to streamwise/model_provisioner/naive_baseline.py
index 9f9c550c..ec95904e 100644
--- a/simulator/naive_baseline.py
+++ b/streamwise/model_provisioner/naive_baseline.py
@@ -31,8 +31,8 @@
 
 from evaluator import evaluate_model_allocation
 
-from policies import NAIVE_POLICY
-from policies import MAX_DEVICES
+from .policies import NAIVE_POLICY
+from .policies import MAX_DEVICES
 
 from model_allocator import ModelAllocator
 
diff --git a/simulator/policies.py b/streamwise/model_provisioner/policies.py
similarity index 100%
rename from simulator/policies.py
rename to streamwise/model_provisioner/policies.py
diff --git a/tests/simulator/test_auto_model_allocator.py b/tests/simulator/test_auto_model_allocator.py
index a9aa17d6..f7550822 100644
--- a/tests/simulator/test_auto_model_allocator.py
+++ b/tests/simulator/test_auto_model_allocator.py
@@ -23,7 +23,7 @@
 
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
+with temp_sys_path("simulator", "streamwise"):
     from sim_types import GPUType
     from sim_types import Model
     from sim_types import QualityLevel
@@ -33,18 +33,18 @@
 
     from data_loading import load_latency_data
 
-    from policies import STREAMWISE_POLICY
-    from policies import NAIVE_POLICY
-    from policies import HEXGEN_POLICY
-    from policies import HELIX_POLICY
+    from model_provisioner.policies import STREAMWISE_POLICY
+    from model_provisioner.policies import NAIVE_POLICY
+    from model_provisioner.policies import HEXGEN_POLICY
+    from model_provisioner.policies import HELIX_POLICY
 
     from auto_model_allocator import AutoModelAllocator
 
-    from greedy import GreedyAllocator
-    from naive_baseline import NaiveAllocator
-    from hexgen import HexGenAllocator
-    from helix import HelixAllocator
-    from milp import MILPAllocator
+    from model_provisioner.greedy import GreedyAllocator
+    from model_provisioner.naive_baseline import NaiveAllocator
+    from model_provisioner.hexgen import HexGenAllocator
+    from model_provisioner.helix import HelixAllocator
+    from model_provisioner.milp import MILPAllocator
 
     from workflows import PODCAST_WORKFLOW
 
diff --git a/tests/simulator/test_data_loading.py b/tests/simulator/test_data_loading.py
index 129a2f3b..72337375 100644
--- a/tests/simulator/test_data_loading.py
+++ b/tests/simulator/test_data_loading.py
@@ -11,7 +11,7 @@
 
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
+with temp_sys_path("simulator", "streamwise"):
     from sim_types import QualityLevel
 
     from data_loading import load_latency_data
diff --git a/tests/simulator/test_evaluator.py b/tests/simulator/test_evaluator.py
index a162e99b..b3c37e73 100644
--- a/tests/simulator/test_evaluator.py
+++ b/tests/simulator/test_evaluator.py
@@ -8,7 +8,7 @@
 from tests.test_utils import assert_equals_approx
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
+with temp_sys_path("simulator", "streamwise"):
     from constants import DEFAULT_WORKFLOW_CONFIG
     from constants import SECONDS_IN_HOUR
 
@@ -20,7 +20,7 @@
 
     from evaluator import evaluate_model_allocation
 
-    from policies import STREAMWISE_POLICY
+    from model_provisioner.policies import STREAMWISE_POLICY
 
     from models import FluxModelAllocation
     from models import GemmaModelAllocation
diff --git a/tests/simulator/test_greedy.py b/tests/simulator/test_greedy.py
index c33d6991..bfa2996e 100644
--- a/tests/simulator/test_greedy.py
+++ b/tests/simulator/test_greedy.py
@@ -8,7 +8,7 @@
 
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
+with temp_sys_path("simulator", "streamwise"):
     from constants import DEFAULT_WORKFLOW_CONFIG
     from constants import SECONDS_IN_HOUR
 
@@ -21,9 +21,9 @@
     from data_loading import load_latency_data
     from data_loading import load_power_data
 
-    from greedy import GreedyAllocator
+    from model_provisioner.greedy import GreedyAllocator
 
-    from policies import STREAMWISE_POLICY
+    from model_provisioner.policies import STREAMWISE_POLICY
 
 
 def test_allocate_8A_8H() -> None:
diff --git a/tests/simulator/test_helix.py b/tests/simulator/test_helix.py
index a336595d..7261b902 100644
--- a/tests/simulator/test_helix.py
+++ b/tests/simulator/test_helix.py
@@ -12,7 +12,7 @@
 
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
+with temp_sys_path("simulator", "streamwise"):
     from constants import DEFAULT_WORKFLOW_CONFIG
     from sim_types import GPUType
     from sim_types import Model
@@ -20,8 +20,8 @@
     from sim_types import Solver
     from data_loading import load_latency_data
     from data_loading import load_power_data
-    from helix import HelixAllocator
-    from policies import HELIX_POLICY
+    from model_provisioner.helix import HelixAllocator
+    from model_provisioner.policies import HELIX_POLICY
 
 
 def test_get_model_order() -> None:
diff --git a/tests/simulator/test_hexgen.py b/tests/simulator/test_hexgen.py
index 99e7eef5..3d77867b 100644
--- a/tests/simulator/test_hexgen.py
+++ b/tests/simulator/test_hexgen.py
@@ -7,12 +7,12 @@
 
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
+with temp_sys_path("simulator", "streamwise"):
     from constants import DEFAULT_WORKFLOW_CONFIG
     from sim_types import GPUType
     from data_loading import load_latency_data
-    from hexgen import HexGenAllocator
-    from hexgen import _get_model_order
+    from model_provisioner.hexgen import HexGenAllocator
+    from model_provisioner.hexgen import _get_model_order
     from sim_types import MODEL_ORDER
 
 
@@ -154,7 +154,7 @@ def test_no_gpus_error() -> None:
 
 def test_is_subclass_of_greedy() -> None:
     """HexGenAllocator should extend GreedyAllocator."""
-    from greedy import GreedyAllocator
+    from model_provisioner.greedy import GreedyAllocator
     latency_data = load_latency_data("simulator/data/")
     allocator = HexGenAllocator(
         workflow=DEFAULT_WORKFLOW_CONFIG,
diff --git a/tests/simulator/test_milp.py b/tests/simulator/test_milp.py
index 70c4bfa8..52a308bd 100644
--- a/tests/simulator/test_milp.py
+++ b/tests/simulator/test_milp.py
@@ -13,7 +13,7 @@
 
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
+with temp_sys_path("simulator", "streamwise"):
     from sim_types import LatencyData
     from sim_types import PowerData
     from sim_types import GPUType
@@ -27,11 +27,11 @@
     from constants import DEFAULT_WORKFLOW_CONFIG
     from constants import SECONDS_IN_HOUR
 
-    from policies import STREAMWISE_MILP_POLICY
+    from model_provisioner.policies import STREAMWISE_MILP_POLICY
 
     from workflows import WORKFLOWS
 
-    from milp import MILPAllocator
+    from model_provisioner.milp import MILPAllocator
 
     from evaluator import evaluate_model_allocation
 
diff --git a/tests/simulator/test_models.py b/tests/simulator/test_models.py
index 57e00a0a..eccb449b 100644
--- a/tests/simulator/test_models.py
+++ b/tests/simulator/test_models.py
@@ -16,7 +16,7 @@
 
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
+with temp_sys_path("simulator", "streamwise"):
     from sim_types import GPUType
     from sim_types import Model
     from sim_types import ModelAllocation
@@ -29,8 +29,8 @@
     from data_loading import load_latency_data
     from data_loading import load_power_data
 
-    from policies import STREAMWISE_POLICY
-    from policies import NAIVE_POLICY
+    from model_provisioner.policies import STREAMWISE_POLICY
+    from model_provisioner.policies import NAIVE_POLICY
 
     from models import get_model_allocation
     from models import _calculate_total_time
diff --git a/tests/simulator/test_multirequests_derive.py b/tests/simulator/test_multirequests_derive.py
index 8e7ed798..c809ccd0 100644
--- a/tests/simulator/test_multirequests_derive.py
+++ b/tests/simulator/test_multirequests_derive.py
@@ -7,7 +7,7 @@
 from tests.test_utils import assert_equal_dict
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
+with temp_sys_path("simulator", "streamwise"):
     from sim_types import GPUType
     from sim_types import Model
     from sim_types import QualityLevel
diff --git a/tests/simulator/test_simulator.py b/tests/simulator/test_simulator.py
index fc791151..d698bb9d 100644
--- a/tests/simulator/test_simulator.py
+++ b/tests/simulator/test_simulator.py
@@ -13,7 +13,7 @@
 
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
+with temp_sys_path("simulator", "streamwise"):
     from sim_types import WorkflowConfig
     from sim_types import Model
     from sim_types import Objective
@@ -26,10 +26,10 @@
     from data_loading import load_power_data
 
     from auto_model_allocator import AutoModelAllocator
-    from greedy import GreedyAllocator
+    from model_provisioner.greedy import GreedyAllocator
 
-    from policies import STREAMWISE_POLICY
-    from policies import NAIVE_POLICY
+    from model_provisioner.policies import STREAMWISE_POLICY
+    from model_provisioner.policies import NAIVE_POLICY
 
 
 def test_estimate_total_time() -> None:
diff --git a/tests/simulator/test_simulator_actions.py b/tests/simulator/test_simulator_actions.py
index dd3bf4fd..539946c5 100644
--- a/tests/simulator/test_simulator_actions.py
+++ b/tests/simulator/test_simulator_actions.py
@@ -7,7 +7,7 @@
 
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
+with temp_sys_path("simulator", "streamwise"):
     from sim_types import Action
     from sim_types import ActionName
     from sim_types import GPUType
diff --git a/tests/simulator/test_simulator_baseline.py b/tests/simulator/test_simulator_baseline.py
index 64282777..b195a1cf 100644
--- a/tests/simulator/test_simulator_baseline.py
+++ b/tests/simulator/test_simulator_baseline.py
@@ -11,7 +11,7 @@
 
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
+with temp_sys_path("simulator", "streamwise"):
     from sim_types import GPUType
     from sim_types import Model
 
@@ -24,12 +24,12 @@
     from data_loading import load_power_data
 
     from auto_model_allocator import AutoModelAllocator
-    from naive_baseline import NaiveAllocator
-    from greedy import GreedyAllocator
+    from model_provisioner.naive_baseline import NaiveAllocator
+    from model_provisioner.greedy import GreedyAllocator
 
-    from policies import NAIVE_POLICY
-    from policies import BASELINE_POLICIES
-    from policies import STREAMWISE_POLICY
+    from model_provisioner.policies import NAIVE_POLICY
+    from model_provisioner.policies import BASELINE_POLICIES
+    from model_provisioner.policies import STREAMWISE_POLICY
 
     from workflows import SHORTS_WORKFLOW
     from workflows import WORKFLOWS
diff --git a/tests/simulator/test_simulator_energy.py b/tests/simulator/test_simulator_energy.py
index 16b6e8bf..c96fd128 100644
--- a/tests/simulator/test_simulator_energy.py
+++ b/tests/simulator/test_simulator_energy.py
@@ -9,7 +9,7 @@
 
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
+with temp_sys_path("simulator", "streamwise"):
     from constants import DEFAULT_WORKFLOW_CONFIG
 
     from sim_types import GPUType
@@ -21,11 +21,11 @@
     from data_loading import load_power_data
 
     from auto_model_allocator import AutoModelAllocator
-    from greedy import GreedyAllocator
-    from naive_baseline import NaiveAllocator
+    from model_provisioner.greedy import GreedyAllocator
+    from model_provisioner.naive_baseline import NaiveAllocator
 
-    from policies import NAIVE_POLICY
-    from policies import STREAMWISE_POLICY
+    from model_provisioner.policies import NAIVE_POLICY
+    from model_provisioner.policies import STREAMWISE_POLICY
 
 
 def test_energy() -> None:
diff --git a/tests/simulator/test_simulator_multirequests.py b/tests/simulator/test_simulator_multirequests.py
index 972596ec..6403baba 100644
--- a/tests/simulator/test_simulator_multirequests.py
+++ b/tests/simulator/test_simulator_multirequests.py
@@ -7,7 +7,7 @@
 from tests.test_utils import assert_equals_approx
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
+with temp_sys_path("simulator", "streamwise"):
     from multirequests import QPM_LIST
     from multirequests import get_replicas
     from multirequests import get_costs
diff --git a/tests/simulator/test_simulator_plotutils.py b/tests/simulator/test_simulator_plotutils.py
index cee69368..b3bdead9 100644
--- a/tests/simulator/test_simulator_plotutils.py
+++ b/tests/simulator/test_simulator_plotutils.py
@@ -6,7 +6,7 @@
 
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
+with temp_sys_path("simulator", "streamwise"):
     from plot_utils import plot_ttff_vs_cost
     from plot_utils import plot_ttff_vs_energy
     from plot_utils import plot_adaptive_quality
diff --git a/tests/simulator/test_simulator_policies.py b/tests/simulator/test_simulator_policies.py
index ffab5ba0..d9e1421f 100644
--- a/tests/simulator/test_simulator_policies.py
+++ b/tests/simulator/test_simulator_policies.py
@@ -11,9 +11,9 @@
 
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
-    from policies import STREAMWISE_POLICY
-    from policies import BASELINE_POLICIES
+with temp_sys_path("simulator", "streamwise"):
+    from model_provisioner.policies import STREAMWISE_POLICY
+    from model_provisioner.policies import BASELINE_POLICIES
 
     from sim_types import Objective
 
diff --git a/tests/simulator/test_simulator_provisioning.py b/tests/simulator/test_simulator_provisioning.py
index 6bd142ae..fb5d46fd 100644
--- a/tests/simulator/test_simulator_provisioning.py
+++ b/tests/simulator/test_simulator_provisioning.py
@@ -7,7 +7,7 @@
 
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
+with temp_sys_path("simulator", "streamwise"):
     from constants import DEFAULT_WORKFLOW_CONFIG
 
     from provisioning import get_provisioning_results
@@ -23,9 +23,9 @@
 
     from data_loading import load_latency_data
 
-    from policies import NAIVE_POLICY
-    from policies import STREAMWISE_POLICY
-    from policies import HEXGEN_POLICY
+    from model_provisioner.policies import NAIVE_POLICY
+    from model_provisioner.policies import STREAMWISE_POLICY
+    from model_provisioner.policies import HEXGEN_POLICY
 
 
 @pytest.mark.parametrize("gpu_type", [gpu_type for gpu_type in GPUType])
diff --git a/tests/simulator/test_simulator_types.py b/tests/simulator/test_simulator_types.py
index 8bfc292f..223a3260 100644
--- a/tests/simulator/test_simulator_types.py
+++ b/tests/simulator/test_simulator_types.py
@@ -8,7 +8,7 @@
 
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
+with temp_sys_path("simulator", "streamwise"):
     from sim_types import Model
     from sim_types import GPUType
 
@@ -20,7 +20,7 @@
     from models import GemmaModelAllocation
     from models import FluxModelAllocation
 
-    from policies import STREAMWISE_POLICY
+    from model_provisioner.policies import STREAMWISE_POLICY
 
     from workflows import PODCAST_WORKFLOW
 
diff --git a/tests/simulator/test_simulator_utils.py b/tests/simulator/test_simulator_utils.py
index 9711a696..b78d675d 100644
--- a/tests/simulator/test_simulator_utils.py
+++ b/tests/simulator/test_simulator_utils.py
@@ -6,7 +6,7 @@
 
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
+with temp_sys_path("simulator", "streamwise"):
     from sim_types import Model
     from sim_types import GPUType
     from sim_types import ModelAllocation
diff --git a/tests/simulator/test_workflows.py b/tests/simulator/test_workflows.py
index bff7ed56..b38dc2ab 100644
--- a/tests/simulator/test_workflows.py
+++ b/tests/simulator/test_workflows.py
@@ -15,7 +15,7 @@
 
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
+with temp_sys_path("simulator", "streamwise"):
     from sim_types import WorkflowConfig, Model, QualityLevel, GPUType
     from constants import (
         FPS,
@@ -28,7 +28,7 @@
     )
     from data_loading import load_latency_data
     from auto_model_allocator import AutoModelAllocator
-    from policies import STREAMWISE_POLICY, NAIVE_POLICY
+    from model_provisioner.policies import STREAMWISE_POLICY, NAIVE_POLICY
     from workflows import (
         MAX_FT_FRAMES,
         SUBSCENE_SECONDS,
diff --git a/wrapper/run_httpserver.py b/wrapper/run_httpserver.py
index a9ec16ad..6ca398fe 100644
--- a/wrapper/run_httpserver.py
+++ b/wrapper/run_httpserver.py
@@ -1266,8 +1266,8 @@ async def send_task(gen_task: dict) -> None:
 
     try:
         payload_bytes = await asyncio.to_thread(pickle.dumps, gen_task)
-        payload_bytes = bytearray(payload_bytes)
-        payload_tensor = torch.frombuffer(payload_bytes, dtype=torch.uint8).to("cuda")
+        payload_buffer = bytearray(payload_bytes)
+        payload_tensor = torch.frombuffer(payload_buffer, dtype=torch.uint8).to("cuda")
         payload_size = torch.tensor([payload_tensor.numel()], dtype=torch.int64, device="cuda")
 
         if payload_size.item() > MAX_PAYLOAD_BYTES:

From 3c324f712d7ca3ef8be622026fea622ddff2aa87 Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Sat, 16 May 2026 16:32:27 -0700
Subject: [PATCH 02/39] Fix model_provisioner __init__.py to support Docker
 layout

Support both local dev (../../simulator) and Docker (../simulator) paths
when resolving the simulator directory for foundation module imports.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 streamwise/model_provisioner/__init__.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/streamwise/model_provisioner/__init__.py b/streamwise/model_provisioner/__init__.py
index c79b0cde..9d75609c 100644
--- a/streamwise/model_provisioner/__init__.py
+++ b/streamwise/model_provisioner/__init__.py
@@ -8,8 +8,13 @@
 import sys
 
 # Add simulator/ to sys.path so policy files can import foundation modules.
-_SIMULATOR_DIR = os.path.normpath(
-    os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "..", "simulator")
-)
-if _SIMULATOR_DIR not in sys.path:
-    sys.path.insert(0, _SIMULATOR_DIR)
+# Supports both local dev layout (../../simulator) and Docker layout (../simulator).
+_HERE = os.path.dirname(os.path.abspath(__file__))
+_CANDIDATES = [
+    os.path.normpath(os.path.join(_HERE, "..", "..", "simulator")),  # local dev layout
+    os.path.normpath(os.path.join(_HERE, "..", "simulator")),  # Docker layout
+]
+for _path in filter(os.path.isdir, _CANDIDATES):
+    if _path not in sys.path:
+        sys.path.insert(0, _path)
+    break  # first existing layout wins, even if it was already on sys.path

From 8f9bc0b8fc5db35fc2a6781ef0d89f0432b081f0 Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Fri, 15 May 2026 17:34:27 -0700
Subject: [PATCH 03/39] Add auto-deploy feature to StreamWise dashboard

Add an 'Auto Deploy' button to the web dashboard that automatically
optimizes GPU resource allocation across workflow components using the
model provisioner's greedy allocator.

- Add streamwise/allocator_bridge.py: maps allocator output to K8s
  deployment parameters (Model enum -> container names, GPU specs)
- Add /api/auto_deploy, /api/auto_deploy/confirm and
  /api/auto_deploy/workflows routes to streamwise.py for computing and
  confirming deployment plans and listing the available options
- Add auto-deploy UI section to add_pod.html with GPU budget inputs,
  workflow selector, and deployment plan preview
- Add comprehensive tests for allocator bridge and auto-deploy API

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 streamwise/allocator_bridge.py                | 250 ++++++++++++++++
 streamwise/streamwise.py                      | 118 ++++++++
 streamwise/templates/add_pod.html             | 190 ++++++++++++
 tests/streamwise/test_allocator_bridge.py     | 282 ++++++++++++++++++
 .../streamwise/test_streamwise_auto_deploy.py | 226 ++++++++++++++
 5 files changed, 1066 insertions(+)
 create mode 100644 streamwise/allocator_bridge.py
 create mode 100644 tests/streamwise/test_allocator_bridge.py
 create mode 100644 tests/streamwise/test_streamwise_auto_deploy.py

diff --git a/streamwise/allocator_bridge.py b/streamwise/allocator_bridge.py
new file mode 100644
index 00000000..b1e610d2
--- /dev/null
+++ b/streamwise/allocator_bridge.py
@@ -0,0 +1,250 @@
+"""
+Bridge between the model provisioner's allocator output and StreamWise pod deployment.
+
+Translates ModelAllocation results (abstract Model enum + GPU counts) into concrete
+container deployment parameters compatible with pod_manager.add_pod().
+"""
+
+from __future__ import annotations
+
+import os
+
+import model_provisioner  # noqa: F401 — adds simulator/ to sys.path
+
+from dataclasses import dataclass
+from typing import Optional
+
+from sim_types import GPUType
+from sim_types import Model
+from sim_types import Result
+
+from auto_model_allocator import AutoModelAllocator
+from data_loading import load_latency_data
+from model_provisioner.policies import STREAMWISE_POLICY
+from workflows import WORKFLOWS
+
+
+# Mapping from simulator Model enum to concrete container names used by pod_manager.
+# Some Model entries map to multiple containers (e.g., OTHERS -> kokoro + yolo).
+MODEL_TO_CONTAINERS: dict[Model, list[str]] = {
+    Model.GEMMA: ["gemma"],
+    Model.FLUX: ["flux"],
+    Model.HF: ["hunyuanframepackf1"],
+    Model.HF_VAE: ["hunyuanframepackvae"],
+    Model.FT: ["fantasytalking"],
+    Model.FT_VAE: [],  # FT_VAE is handled within fantasytalking container
+    Model.UPSCALER: ["realesrgan"],
+    Model.OTHERS: ["kokoro", "yolo"],
+}
+
+# Default CPU/memory/storage for each container when deployed via auto-deploy.
+# Format: (cpu_cores, memory_gib, ephemeral_storage_gib)
+CONTAINER_RESOURCES: dict[str, tuple[int, int, int]] = {
+    "gemma": (16, 192, 64),
+    "flux": (12, 128, 64),
+    "hunyuanframepackf1": (24, 128, 64),
+    "hunyuanframepackvae": (4, 32, 16),
+    "fantasytalking": (12, 192, 64),
+    "realesrgan": (4, 32, 16),
+    "kokoro": (2, 8, 16),
+    "yolo": (4, 8, 16),
+}
+
+# GPU type string used by pod_manager (lowercase)
+GPU_TYPE_TO_POD_STR: dict[GPUType, str] = {
+    GPUType.A100: "a100",
+    GPUType.H100: "h100",
+    GPUType.H200: "h200",
+    GPUType.GB200: "gb200",
+}
+
+# MIG containers: these use a MIG slice instead of a full GPU
+MIG_CONTAINERS: dict[str, str] = {
+    "kokoro": "1g.10gb",
+    "yolo": "1g.10gb",
+    "realesrgan": "1g.10gb",
+}
+
+# Mapping from StreamWise app name to simulator workflow key
+APP_TO_WORKFLOW: dict[str, str] = {
+    "streamcast": "podcast",
+    "streampersona": "slide",
+    "streamchat": "chat",
+    "streamshort": "short",
+    "streammovie": "movie",
+    "streamanimate": "story",
+    "streamlecture": "lecture",
+    "streamdub": "dubbing",
+    "streamedit": "editing",
+}
+
+
+@dataclass
+class DeploymentSpec:
+    """A single container deployment specification."""
+    container_name: str  # container name used by pod_manager (see MODEL_TO_CONTAINERS)
+    cpu: int  # CPU cores
+    memory_gib: int  # memory in GiB
+    ephemeral_storage_gib: int  # ephemeral storage in GiB
+    gpu: int  # GPU count requested for the container; 1 when mig_profile is set
+    gpu_type: Optional[str]  # lowercase pod_manager GPU string, e.g. "a100"
+    mig_profile: Optional[str]  # MIG slice such as "1g.10gb", or None for a full GPU
+
+
+@dataclass
+class DeploymentPlan:
+    """Complete deployment plan produced by the auto-allocator."""
+    specs: list[DeploymentSpec]
+    result: Result
+    workflow_name: str
+    gpu_budget: dict[str, int]
+
+
+def _get_data_dir() -> str:
+    """Return $SIMULATOR_DATA_DIR if set, else ``<this file's dir>/../simulator/data``."""
+    default_path = os.path.join(os.path.dirname(__file__), "..", "simulator", "data")
+    return os.getenv("SIMULATOR_DATA_DIR", default_path)
+
+
+def get_available_workflows() -> list[str]:
+    """List the StreamWise app names (keys of APP_TO_WORKFLOW) offered in the UI."""
+    return list(APP_TO_WORKFLOW)
+
+
+def get_available_gpu_types() -> list[str]:
+    """List the value of every GPUType member, for the UI's GPU budget inputs."""
+    return [member.value for member in GPUType]
+
+
+def run_allocator(
+    gpu_budget: dict[str, int],
+    workflow_name: str,
+) -> DeploymentPlan:
+    """
+    Run the greedy model allocator and return a deployment plan.
+
+    Args:
+        gpu_budget: GPU counts keyed by GPU type string (e.g., {"A100": 8, "H100": 0}).
+        workflow_name: StreamWise app name (e.g., "streamcast").
+
+    Returns:
+        DeploymentPlan with concrete container deployment specs.
+
+    Raises:
+        ValueError: If workflow_name or a GPU type is unknown, or the budget totals under 8 GPUs.
+    """
+    # Validate workflow
+    workflow_key = APP_TO_WORKFLOW.get(workflow_name)
+    if workflow_key is None:
+        raise ValueError(
+            f"Unknown workflow '{workflow_name}'. "
+            f"Available: {list(APP_TO_WORKFLOW.keys())}")
+
+    workflow = WORKFLOWS[workflow_key]
+
+    # Parse GPU budget into GPUType enum (entries with count <= 0 are ignored)
+    num_gpus: dict[GPUType, int] = {}
+    for gpu_str, count in gpu_budget.items():
+        try:
+            gpu_type = GPUType(gpu_str)
+        except ValueError:
+            raise ValueError(
+                f"Unknown GPU type '{gpu_str}'. "
+                f"Available: {[g.value for g in GPUType]}") from None
+        if count > 0:
+            num_gpus[gpu_type] = count
+
+    if not num_gpus or sum(num_gpus.values()) < 8:
+        raise ValueError("Total GPU budget must be at least 8 GPUs.")
+
+    # Load latency data and run allocator
+    data_dir = _get_data_dir()
+    latency_data = load_latency_data(data_dir=data_dir)
+
+    allocator = AutoModelAllocator(
+        workflow=workflow,
+        latency_data=latency_data,
+        policy=STREAMWISE_POLICY,
+    )
+
+    result = allocator.allocate(num_gpus=num_gpus, verbose=False)
+
+    # Convert result to deployment specs
+    specs = result_to_deployment_specs(result)
+
+    return DeploymentPlan(
+        specs=specs,
+        result=result,
+        workflow_name=workflow_name,
+        gpu_budget=gpu_budget,
+    )
+
+
+def result_to_deployment_specs(result: Result) -> list[DeploymentSpec]:
+    """
+    Convert an allocator Result into a list of DeploymentSpec objects.
+
+    Emits one spec per replica for every container mapped to a model with replicas > 0.
+    """
+    specs: list[DeploymentSpec] = []
+
+    for gpu_type, model_dict in result.models.items():
+        gpu_type_str = GPU_TYPE_TO_POD_STR[gpu_type]
+
+        for model, allocations in model_dict.items():
+            containers = MODEL_TO_CONTAINERS.get(model, [])
+            if not containers:
+                continue  # model has no container of its own (e.g. FT_VAE)
+
+            for allocation in allocations:
+                if allocation.replicas <= 0:
+                    continue
+
+                for container_name in containers:
+                    resources = CONTAINER_RESOURCES.get(container_name, (4, 16, 16))  # generic fallback
+                    cpu, memory_gib, ephemeral_storage_gib = resources
+
+                    mig_profile = MIG_CONTAINERS.get(container_name)
+                    gpu_count = allocation.devices if not mig_profile else 1  # MIG: one slice
+
+                    for _ in range(allocation.replicas):
+                        specs.append(DeploymentSpec(
+                            container_name=container_name,
+                            cpu=cpu,
+                            memory_gib=memory_gib,
+                            ephemeral_storage_gib=ephemeral_storage_gib,
+                            gpu=gpu_count,
+                            gpu_type=gpu_type_str,
+                            mig_profile=mig_profile,
+                        ))
+
+    return specs
+
+
+def deployment_plan_to_json(plan: DeploymentPlan) -> dict:
+    """Serialize a DeploymentPlan to a JSON-friendly dict."""
+    return {
+        "workflow_name": plan.workflow_name,
+        "gpu_budget": plan.gpu_budget,
+        "metrics": {
+            "total_time_s": round(plan.result.total_time_s, 2),
+            "ttff_s": round(plan.result.ttff_s, 2),
+            "cost": round(plan.result.cost, 4),
+            "gpus_used": {
+                gpu_type.value: count
+                for gpu_type, count in plan.result.gpus_used.items()
+            },
+        },
+        "specs": [
+            {
+                "container_name": spec.container_name,
+                "cpu": spec.cpu,
+                "memory_gib": spec.memory_gib,
+                "ephemeral_storage_gib": spec.ephemeral_storage_gib,
+                "gpu": spec.gpu,
+                "gpu_type": spec.gpu_type,
+                "mig_profile": spec.mig_profile,
+            }
+            for spec in plan.specs
+        ],
+    }
diff --git a/streamwise/streamwise.py b/streamwise/streamwise.py
index 1c63eacf..0ce24ac5 100644
--- a/streamwise/streamwise.py
+++ b/streamwise/streamwise.py
@@ -34,6 +34,7 @@
 import pod_manager
 import node_manager
 import job_manager
+import allocator_bridge
 
 from service_manager import get_services
 from service_manager import get_service_timestamps
@@ -726,6 +727,123 @@ async def api_add_pod() -> QuartReturn:
         return jsonify({"error": str(ex)}), HTTPStatus.INTERNAL_SERVER_ERROR
 
 
+@route("/api/auto_deploy", methods=["POST"])
+async def api_auto_deploy() -> QuartReturn:
+    """Run the model allocator to produce an optimized deployment plan.
+
+    Expects JSON body:
+        {
+            "gpu_budget": {"A100": 8, "H100": 0, ...},
+            "workflow": "streamcast"
+        }
+
+    Returns the deployment plan with estimated metrics and per-container specs.
+    """
+    try:
+        data = await request.get_json()
+        if not data or not isinstance(data, dict):  # a JSON array/scalar has no .get()
+            return jsonify({"error": "Request body must be JSON"}), HTTPStatus.BAD_REQUEST
+
+        gpu_budget = data.get("gpu_budget")
+        workflow_name = data.get("workflow")
+
+        if not gpu_budget or not isinstance(gpu_budget, dict):
+            return jsonify({"error": "Missing or invalid 'gpu_budget' field"}), HTTPStatus.BAD_REQUEST
+        if not workflow_name or not isinstance(workflow_name, str):
+            return jsonify({"error": "Missing or invalid 'workflow' field"}), HTTPStatus.BAD_REQUEST
+
+        plan = allocator_bridge.run_allocator(
+            gpu_budget=gpu_budget,
+            workflow_name=workflow_name,
+        )
+        return jsonify(allocator_bridge.deployment_plan_to_json(plan)), HTTPStatus.OK
+
+    except ValueError as ve:
+        return jsonify({"error": str(ve)}), HTTPStatus.BAD_REQUEST
+    except Exception as ex:
+        logging.exception("Error in auto_deploy: %s", ex)
+        return jsonify({"error": str(ex)}), HTTPStatus.INTERNAL_SERVER_ERROR
+
+
+@route("/api/auto_deploy/confirm", methods=["POST"])
+async def api_auto_deploy_confirm() -> QuartReturn:
+    """Execute a deployment plan produced by /api/auto_deploy.
+
+    Expects JSON body:
+        {
+            "specs": [
+                {
+                    "container_name": "gemma",
+                    "cpu": 16,
+                    "memory_gib": 192,
+                    "ephemeral_storage_gib": 64,
+                    "gpu": 2,
+                    "gpu_type": "a100",
+                    "mig_profile": null
+                },
+                ...
+            ]
+        }
+
+    Deploys every spec; per-spec failures (including malformed entries) are returned in "errors".
+    """
+    try:
+        data = await request.get_json()
+        if not data or not isinstance(data, dict):  # a JSON array/scalar has no .get()
+            return jsonify({"error": "Request body must be JSON"}), HTTPStatus.BAD_REQUEST
+
+        specs = data.get("specs")
+        if not specs or not isinstance(specs, list):
+            return jsonify({"error": "Missing or invalid 'specs' field"}), HTTPStatus.BAD_REQUEST
+
+        deployed: List[str] = []
+        errors: List[str] = []
+
+        for spec in specs:
+            container_name = spec.get("container_name") if isinstance(spec, dict) else None
+            if not container_name:
+                errors.append("Spec missing 'container_name'")
+                continue
+
+            try:
+                await pod_manager.add_pod(
+                    container_name=container_name,
+                    cpu=int(spec.get("cpu", 4)),
+                    memory_gib=int(spec.get("memory_gib", 16)),
+                    ephemeral_storage_gib=int(spec.get("ephemeral_storage_gib", 16)),
+                    gpu=int(spec.get("gpu", 0)),
+                    gpu_type=spec.get("gpu_type"),
+                    mig_profile=spec.get("mig_profile"),
+                    namespace=NAMESPACE,
+                    k8s_cluster=k8s_cluster,
+                )
+                deployed.append(container_name)
+            except Exception as pod_ex:
+                msg = f"Failed to deploy '{container_name}': {pod_ex}"
+                logging.error(msg)
+                errors.append(msg)
+
+        status = HTTPStatus.OK if not errors else HTTPStatus.MULTI_STATUS
+        return jsonify({
+            "deployed": deployed,
+            "errors": errors,
+            "message": f"Deployed {len(deployed)}/{len(specs)} containers.",
+        }), status
+
+    except Exception as ex:
+        logging.exception("Error in auto_deploy/confirm: %s", ex)
+        return jsonify({"error": str(ex)}), HTTPStatus.INTERNAL_SERVER_ERROR
+
+
+@route("/api/auto_deploy/workflows", methods=["GET"])
+async def api_auto_deploy_workflows() -> QuartReturn:
+    """Return available workflows and GPU types for the auto-deploy UI."""
+    return jsonify({
+        "workflows": allocator_bridge.get_available_workflows(),
+        "gpu_types": allocator_bridge.get_available_gpu_types(),
+    }), HTTPStatus.OK
+
+
 @route("/api/node/<node_name>", methods=["DELETE"])
 async def api_remove_node(node_name: str) -> QuartReturn:
     return await node_manager.remove_node(
diff --git a/streamwise/templates/add_pod.html b/streamwise/templates/add_pod.html
index d61952aa..f5496e10 100644
--- a/streamwise/templates/add_pod.html
+++ b/streamwise/templates/add_pod.html
@@ -384,6 +384,94 @@ <h2 class="mt-5">🧩 Applications</h2>
             </form>
         {% endif %}
 
+        <!-- Auto-Deploy Section -->
+        <h2 class="mt-5">🤖 Auto Deploy</h2>
+        <p>Specify your GPU budget and the optimizer will determine the best allocation for each component:</p>
+
+        <form id="auto-deploy-form">
+            <fieldset class="border rounded-3 p-3 mb-4">
+                <legend class="float-none w-auto px-2 fw-semibold">
+                    💰 GPU Budget
+                </legend>
+                <div class="row g-3 mb-3">
+                    <div class="col-md-3">
+                        <label for="auto_gpu_a100" class="form-label">A100</label>
+                        <input type="number" class="form-control" id="auto_gpu_a100" name="gpu_a100"
+                            min="0" max="64" value="8">
+                    </div>
+                    <div class="col-md-3">
+                        <label for="auto_gpu_h100" class="form-label">H100</label>
+                        <input type="number" class="form-control" id="auto_gpu_h100" name="gpu_h100"
+                            min="0" max="64" value="0">
+                    </div>
+                    <div class="col-md-3">
+                        <label for="auto_gpu_h200" class="form-label">H200</label>
+                        <input type="number" class="form-control" id="auto_gpu_h200" name="gpu_h200"
+                            min="0" max="64" value="0">
+                    </div>
+                    <div class="col-md-3">
+                        <label for="auto_gpu_gb200" class="form-label">GB200</label>
+                        <input type="number" class="form-control" id="auto_gpu_gb200" name="gpu_gb200"
+                            min="0" max="64" value="0">
+                    </div>
+                </div>
+            </fieldset>
+
+            <fieldset class="border rounded-3 p-3 mb-4">
+                <legend class="float-none w-auto px-2 fw-semibold">
+                    🎬 Workflow
+                </legend>
+                <div class="mb-3">
+                    <label for="auto_workflow" class="form-label">Application workflow</label>
+                    <select class="form-select" id="auto_workflow" name="workflow">
+                        <option value="streamcast" selected>🎙️ StreamCast (Podcast)</option>
+                        <option value="streampersona">👤 StreamPersona (Slide)</option>
+                        <option value="streamchat">💬 StreamChat (Video Chat)</option>
+                        <option value="streamshort">🎬 StreamShort (Shorts)</option>
+                        <option value="streammovie">🎬 StreamMovie (Movie)</option>
+                        <option value="streamanimate">🎞️ StreamAnimate (Story)</option>
+                        <option value="streamlecture">📚 StreamLecture (Lecture)</option>
+                        <option value="streamdub">🎤 StreamDub (Dubbing)</option>
+                        <option value="streamedit">✂️ StreamEdit (Editing)</option>
+                    </select>
+                </div>
+            </fieldset>
+
+            <div class="text-end mb-3">
+                <button type="submit" class="btn btn-warning" style="width: 200px;"
+                    id="auto-deploy-optimize-btn">
+                    🤖 Optimize
+                </button>
+            </div>
+        </form>
+
+        <!-- Auto-deploy results (hidden until optimize is clicked) -->
+        <div id="auto-deploy-results" style="display:none;">
+            <h4>📊 Optimized Deployment Plan</h4>
+            <div id="auto-deploy-metrics" class="alert alert-success mb-3"></div>
+            <table class="table table-sm table-bordered" id="auto-deploy-plan-table">
+                <thead>
+                    <tr>
+                        <th>Container</th>
+                        <th>GPU</th>
+                        <th>GPU Type</th>
+                        <th>CPU</th>
+                        <th>Memory</th>
+                        <th>MIG</th>
+                    </tr>
+                </thead>
+                <tbody id="auto-deploy-plan-body"></tbody>
+            </table>
+            <div class="text-end">
+                <button type="button" class="btn btn-success" style="width: 200px;"
+                    id="auto-deploy-confirm-btn">
+                    ✅ Confirm Deploy
+                </button>
+            </div>
+        </div>
+
+        <div id="auto-deploy-error" class="alert alert-danger mt-3" style="display:none;"></div>
+
         <script src="{{ url_for('static', filename='js/form-utils.js') }}"></script>
         <script>
             // Keep aligned with deployment/helm/values.yaml and services.json
@@ -685,6 +773,108 @@ <h2 class="mt-5">🧩 Applications</h2>
                     });
                 });
             }
+            // Auto-Deploy
+            const autoDeployForm = document.getElementById('auto-deploy-form');
+            if (autoDeployForm) {
+                let currentPlan = null;
+
+                autoDeployForm.addEventListener('submit', function(e) {
+                    e.preventDefault();
+                    const btn = document.getElementById('auto-deploy-optimize-btn');
+                    btn.disabled = true;
+                    btn.textContent = '⏳ Optimizing...';
+
+                    const gpuBudget = {
+                        'A100': parseInt(document.getElementById('auto_gpu_a100').value, 10) || 0,
+                        'H100': parseInt(document.getElementById('auto_gpu_h100').value, 10) || 0,
+                        'H200': parseInt(document.getElementById('auto_gpu_h200').value, 10) || 0,
+                        'GB200': parseInt(document.getElementById('auto_gpu_gb200').value, 10) || 0,
+                    };
+                    const workflow = document.getElementById('auto_workflow').value;
+
+                    const errorDiv = document.getElementById('auto-deploy-error');
+                    const resultsDiv = document.getElementById('auto-deploy-results');
+                    errorDiv.style.display = 'none';
+                    resultsDiv.style.display = 'none';
+
+                    fetch('/api/auto_deploy', {
+                        method: 'POST',
+                        headers: {'Content-Type': 'application/json'},
+                        body: JSON.stringify({gpu_budget: gpuBudget, workflow: workflow}),
+                        credentials: 'same-origin'
+                    })
+                    .then(response => response.json().then(data => ({ok: response.ok, data})))
+                    .then(({ok, data}) => {
+                        btn.disabled = false;
+                        btn.textContent = '🤖 Optimize';
+                        if (!ok) {
+                            errorDiv.textContent = data.error || 'Unknown error';
+                            errorDiv.style.display = '';
+                            return;
+                        }
+                        currentPlan = data;
+                        // Show metrics
+                        const metrics = data.metrics;
+                        document.getElementById('auto-deploy-metrics').innerHTML =
+                            `<strong>Total Time:</strong> ${metrics.total_time_s}s &nbsp;|&nbsp; ` +
+                            `<strong>TTFF:</strong> ${metrics.ttff_s}s &nbsp;|&nbsp; ` +
+                            `<strong>Cost:</strong> $${metrics.cost} &nbsp;|&nbsp; ` +
+                            `<strong>GPUs Used:</strong> ${JSON.stringify(metrics.gpus_used)}`;
+                        // Show plan table; every string field is HTML-escaped before insertion
+                        const tbody = document.getElementById('auto-deploy-plan-body');
+                        tbody.innerHTML = '';
+                        data.specs.forEach(spec => {
+                            const row = document.createElement('tr');
+                            row.innerHTML =
+                                `<td>${escapeHtml(spec.container_name)}</td>` +
+                                `<td>${spec.gpu}</td>` +
+                                `<td>${escapeHtml(spec.gpu_type || 'any')}</td>` +
+                                `<td>${spec.cpu}</td>` +
+                                `<td>${spec.memory_gib} GiB</td>` +
+                                `<td>${escapeHtml(spec.mig_profile || '-')}</td>`;
+                            tbody.appendChild(row);
+                        });
+                        resultsDiv.style.display = '';
+                    })
+                    .catch(err => {
+                        btn.disabled = false;
+                        btn.textContent = '🤖 Optimize';
+                        errorDiv.textContent = 'Network error: ' + err;
+                        errorDiv.style.display = '';
+                    });
+                });
+
+                document.getElementById('auto-deploy-confirm-btn').addEventListener('click', function() {
+                    if (!currentPlan || !currentPlan.specs) return;
+
+                    const btn = this;
+                    btn.disabled = true;
+                    btn.textContent = '⏳ Deploying...';
+
+                    fetch('/api/auto_deploy/confirm', {
+                        method: 'POST',
+                        headers: {'Content-Type': 'application/json'},
+                        body: JSON.stringify({specs: currentPlan.specs}),
+                        credentials: 'same-origin'
+                    })
+                    .then(response => response.json().then(data => ({ok: response.ok, data})))
+                    .then(({ok, data}) => {
+                        btn.disabled = false;
+                        btn.textContent = '✅ Confirm Deploy';
+                        if (data.errors && data.errors.length > 0) {
+                            alert('Deployed ' + data.deployed.length + ' containers.\nErrors:\n' + data.errors.join('\n'));
+                        } else {
+                            alert(ok ? (data.message || 'Deployment complete!') : 'Error: ' + (data.error || 'Deployment failed'));
+                        }
+                        if (ok) window.location.href = '/';  // 207 partial success still redirects; 4xx/5xx stays here
+                    })
+                    .catch(err => {
+                        btn.disabled = false;
+                        btn.textContent = '✅ Confirm Deploy';
+                        alert('Error: ' + err);
+                    });
+                });
+            }
         </script>
         <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.5/dist/js/bootstrap.bundle.min.js"
             integrity="sha384-k6d4wzSIapyDyv1kpU366/PK5hCdSbCRGRCMv+eplOQJWyd1fbcAu9OCUj5zNLiq"
diff --git a/tests/streamwise/test_allocator_bridge.py b/tests/streamwise/test_allocator_bridge.py
new file mode 100644
index 00000000..569e4073
--- /dev/null
+++ b/tests/streamwise/test_allocator_bridge.py
@@ -0,0 +1,282 @@
+"""
+Tests for streamwise/allocator_bridge.py.
+
+Covers:
+- Model-to-container name mapping.
+- Result to deployment specs conversion.
+- run_allocator end-to-end (with real latency data).
+- Error handling for invalid inputs.
+"""
+
+from __future__ import annotations
+
+import sys
+import os
+
+import pytest
+
+# Add current path and simulator/ permanently so lazy imports
+# (e.g. GreedyAllocator via auto_model_allocator) resolve at test time.
+sys.path.append(os.getcwd())
+sys.path[:0] = [os.path.join(os.getcwd(), "simulator")]
+
+from tests.test_utils import temp_sys_path
+
+with temp_sys_path("streamwise", "simulator"):
+    from allocator_bridge import (
+        MODEL_TO_CONTAINERS,
+        CONTAINER_RESOURCES,
+        GPU_TYPE_TO_POD_STR,
+        APP_TO_WORKFLOW,
+        DeploymentSpec,
+        DeploymentPlan,
+        get_available_workflows,
+        get_available_gpu_types,
+        result_to_deployment_specs,
+        deployment_plan_to_json,
+        run_allocator,
+    )
+    from sim_types import GPUType, Model, Result
+    from models import (
+        GemmaModelAllocation,
+        FluxModelAllocation,
+        HFModelAllocation,
+        HFVAEModelAllocation,
+        FTModelAllocation,
+        OthersModelAllocation,
+        UpscalerModelAllocation,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Mapping correctness
+# ---------------------------------------------------------------------------
+
+def test_model_to_containers_covers_all_models() -> None:
+    """Every Model enum value must have a mapping entry."""
+    for model in Model:
+        assert model in MODEL_TO_CONTAINERS, f"Missing mapping for {model}"
+
+
+def test_container_resources_covers_all_mapped_containers() -> None:
+    """Every container referenced in MODEL_TO_CONTAINERS must have resource defaults."""
+    for model, containers in MODEL_TO_CONTAINERS.items():
+        for container in containers:
+            assert container in CONTAINER_RESOURCES, (
+                f"Missing CONTAINER_RESOURCES for '{container}' (from {model})")
+
+
+def test_gpu_type_to_pod_str_covers_all_gpu_types() -> None:
+    """Every GPUType enum value must have a pod string mapping."""
+    for gpu_type in GPUType:
+        assert gpu_type in GPU_TYPE_TO_POD_STR
+
+
+def test_app_to_workflow_has_expected_entries() -> None:
+    """Key StreamWise apps should map to workflows."""
+    assert "streamcast" in APP_TO_WORKFLOW
+    assert "streampersona" in APP_TO_WORKFLOW
+    assert "streamchat" in APP_TO_WORKFLOW
+
+
+# ---------------------------------------------------------------------------
+# Utility functions
+# ---------------------------------------------------------------------------
+
+def test_get_available_workflows() -> None:
+    workflows = get_available_workflows()
+    assert isinstance(workflows, list)
+    assert "streamcast" in workflows
+    assert len(workflows) >= 5
+
+
+def test_get_available_gpu_types() -> None:
+    gpu_types = get_available_gpu_types()
+    assert isinstance(gpu_types, list)
+    assert "A100" in gpu_types
+    assert "H100" in gpu_types
+
+
+# ---------------------------------------------------------------------------
+# result_to_deployment_specs
+# ---------------------------------------------------------------------------
+
+def test_result_to_deployment_specs_basic() -> None:
+    """A simple result with one active allocation maps to the right container."""
+    models = {
+        GPUType.A100: {
+            Model.GEMMA: [GemmaModelAllocation(gpu_type=GPUType.A100, devices=1, replicas=1)],
+            Model.FLUX: [FluxModelAllocation(gpu_type=GPUType.A100, devices=2, replicas=1)],
+            Model.HF: [HFModelAllocation(gpu_type=GPUType.A100, devices=2, replicas=2)],
+            Model.HF_VAE: [HFVAEModelAllocation(gpu_type=GPUType.A100, devices=1, replicas=1)],
+            Model.FT: [FTModelAllocation(gpu_type=GPUType.A100, devices=1, replicas=0)],
+            Model.FT_VAE: [],
+            Model.UPSCALER: [UpscalerModelAllocation(gpu_type=GPUType.A100, devices=1, replicas=0)],
+            Model.OTHERS: [OthersModelAllocation(gpu_type=GPUType.A100, devices=1, replicas=1)],
+        }
+    }
+    result = Result(
+        total_time_s=100.0,
+        ttff_s=10.0,
+        cost=1.0,
+        gpus_used={GPUType.A100: 8},
+        gpus_total={GPUType.A100: 8},
+        models=models,
+    )
+
+    specs = result_to_deployment_specs(result)
+    assert isinstance(specs, list)
+    assert len(specs) > 0
+
+    container_names = [s.container_name for s in specs]
+    assert "gemma" in container_names
+    assert "flux" in container_names
+    assert "hunyuanframepackf1" in container_names  # HF model
+    assert "hunyuanframepackvae" in container_names  # HF_VAE model
+
+    # OTHERS maps to kokoro + yolo
+    assert "kokoro" in container_names
+    assert "yolo" in container_names
+
+    # Check GPU type mapping
+    gemma_spec = next(s for s in specs if s.container_name == "gemma")
+    assert gemma_spec.gpu_type == "a100"
+    assert gemma_spec.gpu == 1
+
+    # MIG containers get mig_profile set
+    kokoro_spec = next(s for s in specs if s.container_name == "kokoro")
+    assert kokoro_spec.mig_profile == "1g.10gb"
+
+
+def test_result_to_deployment_specs_skips_zero_replicas() -> None:
+    """Allocations with zero replicas should not produce deployment specs."""
+    models = {
+        GPUType.A100: {
+            Model.GEMMA: [GemmaModelAllocation(gpu_type=GPUType.A100, devices=1, replicas=0)],
+            Model.FLUX: [FluxModelAllocation(gpu_type=GPUType.A100, devices=1, replicas=0)],
+            Model.HF: [HFModelAllocation(gpu_type=GPUType.A100, devices=1, replicas=0)],
+            Model.HF_VAE: [],
+            Model.FT: [],
+            Model.FT_VAE: [],
+            Model.UPSCALER: [],
+            Model.OTHERS: [],
+        }
+    }
+    result = Result(
+        total_time_s=0.0,
+        ttff_s=0.0,
+        cost=0.0,
+        gpus_used={GPUType.A100: 0},
+        gpus_total={GPUType.A100: 8},
+        models=models,
+    )
+    specs = result_to_deployment_specs(result)
+    assert specs == []
+
+
+def test_result_to_deployment_specs_multiple_replicas() -> None:
+    """Multiple replicas should produce multiple deployment specs for same container."""
+    models = {
+        GPUType.H100: {
+            Model.GEMMA: [GemmaModelAllocation(gpu_type=GPUType.H100, devices=1, replicas=1)],
+            Model.FLUX: [FluxModelAllocation(gpu_type=GPUType.H100, devices=1, replicas=1)],
+            Model.HF: [HFModelAllocation(gpu_type=GPUType.H100, devices=2, replicas=3)],
+            Model.HF_VAE: [],
+            Model.FT: [],
+            Model.FT_VAE: [],
+            Model.UPSCALER: [],
+            Model.OTHERS: [],
+        }
+    }
+    result = Result(
+        total_time_s=50.0,
+        ttff_s=5.0,
+        cost=0.5,
+        gpus_used={GPUType.H100: 8},
+        gpus_total={GPUType.H100: 16},
+        models=models,
+    )
+    specs = result_to_deployment_specs(result)
+    hf_specs = [s for s in specs if s.container_name == "hunyuanframepackf1"]
+    assert len(hf_specs) == 3  # 3 replicas
+    for spec in hf_specs:
+        assert spec.gpu == 2
+        assert spec.gpu_type == "h100"
+
+
+# ---------------------------------------------------------------------------
+# deployment_plan_to_json
+# ---------------------------------------------------------------------------
+
+def test_deployment_plan_to_json() -> None:
+    """Serialization should produce all expected keys."""
+    result = Result(
+        total_time_s=100.0,
+        ttff_s=10.0,
+        cost=1.5,
+        gpus_used={GPUType.A100: 8},
+        gpus_total={GPUType.A100: 8},
+        models={},
+    )
+    plan = DeploymentPlan(
+        specs=[
+            DeploymentSpec(
+                container_name="gemma", cpu=16, memory_gib=192,
+                ephemeral_storage_gib=64, gpu=2, gpu_type="a100", mig_profile=None)
+        ],
+        result=result,
+        workflow_name="streamcast",
+        gpu_budget={"A100": 8},
+    )
+    data = deployment_plan_to_json(plan)
+    assert data["workflow_name"] == "streamcast"
+    assert data["gpu_budget"] == {"A100": 8}
+    assert data["metrics"]["total_time_s"] == 100.0
+    assert data["metrics"]["ttff_s"] == 10.0
+    assert len(data["specs"]) == 1
+    assert data["specs"][0]["container_name"] == "gemma"
+
+
+# ---------------------------------------------------------------------------
+# run_allocator (integration with real data)
+# ---------------------------------------------------------------------------
+
+def test_run_allocator_streamcast_8_a100() -> None:
+    """Run allocator for StreamCast with 8 A100s — should produce a valid plan."""
+    plan = run_allocator(
+        gpu_budget={"A100": 8},
+        workflow_name="streamcast",
+    )
+    assert isinstance(plan, DeploymentPlan)
+    assert len(plan.specs) > 0
+    assert plan.result.total_time_s > 0
+    assert plan.result.ttff_s > 0
+    assert plan.workflow_name == "streamcast"
+
+
+def test_run_allocator_streamchat_8_h100() -> None:
+    """Run allocator for StreamChat with 8 H100s."""
+    plan = run_allocator(
+        gpu_budget={"H100": 8},
+        workflow_name="streamchat",
+    )
+    assert isinstance(plan, DeploymentPlan)
+    assert len(plan.specs) > 0
+
+
+def test_run_allocator_invalid_workflow() -> None:
+    """Unknown workflow name raises ValueError."""
+    with pytest.raises(ValueError, match="Unknown workflow"):
+        run_allocator(gpu_budget={"A100": 8}, workflow_name="nonexistent")
+
+
+def test_run_allocator_invalid_gpu_type() -> None:
+    """Unknown GPU type raises ValueError."""
+    with pytest.raises(ValueError, match="Unknown GPU type"):
+        run_allocator(gpu_budget={"RTX4090": 8}, workflow_name="streamcast")
+
+
+def test_run_allocator_insufficient_gpus() -> None:
+    """Too few GPUs raises ValueError."""
+    with pytest.raises(ValueError, match="at least 8"):
+        run_allocator(gpu_budget={"A100": 4}, workflow_name="streamcast")
diff --git a/tests/streamwise/test_streamwise_auto_deploy.py b/tests/streamwise/test_streamwise_auto_deploy.py
new file mode 100644
index 00000000..a191785a
--- /dev/null
+++ b/tests/streamwise/test_streamwise_auto_deploy.py
@@ -0,0 +1,226 @@
+"""
+Tests for the auto-deploy API endpoints in streamwise.py.
+
+Covers:
+- POST /api/auto_deploy — returns optimized plan.
+- POST /api/auto_deploy/confirm — deploys the plan.
+- GET /api/auto_deploy/workflows — lists available options.
+- Error cases (missing fields, invalid inputs).
+"""
+
+from __future__ import annotations
+
+import sys
+
+import pytest
+
+from http import HTTPStatus
+from unittest.mock import patch
+
+from tests.test_utils import temp_sys_path
+from tests.k8s_mock import K8sMock
+
+mock_k8s = K8sMock()
+
+mock_modules = {}
+mock_modules.update(mock_k8s.get_sub_modules())
+
+import streamwise.http_session_manager  # noqa: F401 — registers the streamwise package
+
+# Permanently inject K8s mocks into sys.modules (not via context manager)
+# so that simulator modules loaded alongside streamwise remain importable
+# after setup completes.
+_original_modules = {}
+for mod_name, mock_mod in mock_modules.items():
+    _original_modules[mod_name] = sys.modules.get(mod_name)
+    sys.modules[mod_name] = mock_mod
+
+with temp_sys_path("streamwise"):
+    from streamwise import streamwise as sw
+
+
+def _get_client():  # type: ignore[no-untyped-def]
+    app = sw.app
+    return app.test_client()
+
+
+@pytest.fixture(scope="function", autouse=True)
+def setup_k8s_cluster() -> None:
+    sw.k8s_cluster = "unittest"
+    sw.use_https = False
+
+
+# ---------------------------------------------------------------------------
+# GET /api/auto_deploy/workflows
+# ---------------------------------------------------------------------------
+
+@pytest.mark.asyncio
+async def test_auto_deploy_workflows() -> None:
+    """Should return available workflows and GPU types."""
+    client = _get_client()
+    response = await client.get("/api/auto_deploy/workflows")
+    assert response.status_code == HTTPStatus.OK
+    data = await response.get_json()
+    assert "workflows" in data
+    assert "gpu_types" in data
+    assert "streamcast" in data["workflows"]
+    assert "A100" in data["gpu_types"]
+
+
+# ---------------------------------------------------------------------------
+# POST /api/auto_deploy
+# ---------------------------------------------------------------------------
+
+@pytest.mark.asyncio
+async def test_auto_deploy_success() -> None:
+    """Valid request returns an optimized deployment plan."""
+    fake_json = {
+        "workflow_name": "streamcast",
+        "gpu_budget": {"A100": 8},
+        "metrics": {"total_time_s": 3.5, "ttff_s": 1.0, "cost": 12.0, "gpus_used": {"A100": 3}},
+        "specs": [
+            {"container_name": "gemma", "cpu": 4, "memory_gib": 16,
+             "ephemeral_storage_gib": 10, "gpu": 1, "gpu_type": "A100", "mig_profile": None},
+            {"container_name": "flux", "cpu": 4, "memory_gib": 16,
+             "ephemeral_storage_gib": 10, "gpu": 2, "gpu_type": "A100", "mig_profile": None},
+        ],
+    }
+    # Patch on the actual module object that streamwise.py holds a reference to.
+    with patch.object(sw.allocator_bridge, "run_allocator") as mock_alloc, \
+         patch.object(sw.allocator_bridge, "deployment_plan_to_json", return_value=fake_json):
+        mock_alloc.return_value = "fake_plan"
+        client = _get_client()
+        response = await client.post(
+            "/api/auto_deploy",
+            json={
+                "gpu_budget": {"A100": 8},
+                "workflow": "streamcast",
+            },
+        )
+    assert response.status_code == HTTPStatus.OK
+    data = await response.get_json()
+    assert "specs" in data
+    assert "metrics" in data
+    assert len(data["specs"]) == 2
+    assert data["metrics"]["total_time_s"] == 3.5
+
+
+@pytest.mark.asyncio
+async def test_auto_deploy_missing_gpu_budget() -> None:
+    """Missing gpu_budget field returns 400."""
+    client = _get_client()
+    response = await client.post(
+        "/api/auto_deploy",
+        json={"workflow": "streamcast"},
+    )
+    assert response.status_code == HTTPStatus.BAD_REQUEST
+
+
+@pytest.mark.asyncio
+async def test_auto_deploy_missing_workflow() -> None:
+    """Missing workflow field returns 400."""
+    client = _get_client()
+    response = await client.post(
+        "/api/auto_deploy",
+        json={"gpu_budget": {"A100": 8}},
+    )
+    assert response.status_code == HTTPStatus.BAD_REQUEST
+
+
+@pytest.mark.asyncio
+async def test_auto_deploy_invalid_workflow() -> None:
+    """Invalid workflow name returns 400."""
+    client = _get_client()
+    response = await client.post(
+        "/api/auto_deploy",
+        json={
+            "gpu_budget": {"A100": 8},
+            "workflow": "nonexistent",
+        },
+    )
+    assert response.status_code == HTTPStatus.BAD_REQUEST
+    data = await response.get_json()
+    assert "error" in data
+
+
+@pytest.mark.asyncio
+async def test_auto_deploy_insufficient_gpus() -> None:
+    """Too few GPUs returns 400."""
+    client = _get_client()
+    response = await client.post(
+        "/api/auto_deploy",
+        json={
+            "gpu_budget": {"A100": 2},
+            "workflow": "streamcast",
+        },
+    )
+    assert response.status_code == HTTPStatus.BAD_REQUEST
+
+
+@pytest.mark.asyncio
+async def test_auto_deploy_no_json_body() -> None:
+    """No JSON body returns 400."""
+    client = _get_client()
+    response = await client.post("/api/auto_deploy")
+    assert response.status_code == HTTPStatus.BAD_REQUEST
+
+
+# ---------------------------------------------------------------------------
+# POST /api/auto_deploy/confirm
+# ---------------------------------------------------------------------------
+
+@pytest.mark.asyncio
+async def test_auto_deploy_confirm_success() -> None:
+    """Valid confirm request deploys containers."""
+    client = _get_client()
+    specs = [
+        {
+            "container_name": "gemma",
+            "cpu": 16,
+            "memory_gib": 192,
+            "ephemeral_storage_gib": 64,
+            "gpu": 2,
+            "gpu_type": "a100",
+            "mig_profile": None,
+        },
+        {
+            "container_name": "flux",
+            "cpu": 12,
+            "memory_gib": 128,
+            "ephemeral_storage_gib": 64,
+            "gpu": 2,
+            "gpu_type": "a100",
+            "mig_profile": None,
+        },
+    ]
+    response = await client.post(
+        "/api/auto_deploy/confirm",
+        json={"specs": specs},
+    )
+    # Should succeed (mocked K8s)
+    assert response.status_code in (HTTPStatus.OK, HTTPStatus.MULTI_STATUS)
+    data = await response.get_json()
+    assert "deployed" in data
+    assert "message" in data
+
+
+@pytest.mark.asyncio
+async def test_auto_deploy_confirm_missing_specs() -> None:
+    """Missing specs returns 400."""
+    client = _get_client()
+    response = await client.post(
+        "/api/auto_deploy/confirm",
+        json={},
+    )
+    assert response.status_code == HTTPStatus.BAD_REQUEST
+
+
+@pytest.mark.asyncio
+async def test_auto_deploy_confirm_empty_specs() -> None:
+    """Empty specs list returns 400."""
+    client = _get_client()
+    response = await client.post(
+        "/api/auto_deploy/confirm",
+        json={"specs": []},
+    )
+    assert response.status_code == HTTPStatus.BAD_REQUEST

From 124d7cbed27f387e2268618a94105943c03fc8e0 Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Fri, 15 May 2026 21:29:01 -0700
Subject: [PATCH 04/39] Fix: ensure model_provisioner is importable regardless
 of working directory

Add streamwise/ directory to sys.path explicitly in allocator_bridge.py
so model_provisioner can be found when Python is invoked from a different
working directory (e.g., in Docker/pipeline environments).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 streamwise/allocator_bridge.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/streamwise/allocator_bridge.py b/streamwise/allocator_bridge.py
index b1e610d2..40e3fe87 100644
--- a/streamwise/allocator_bridge.py
+++ b/streamwise/allocator_bridge.py
@@ -7,9 +7,12 @@
 
 from __future__ import annotations
 
+import sys
 import os
 
-import model_provisioner  # noqa: F401 — adds simulator/ to sys.path
+# Ensure the streamwise/ directory is on sys.path so model_provisioner is importable.
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+import model_provisioner  # noqa: E402, F401 — adds simulator/ to sys.path
 
 from dataclasses import dataclass
 from typing import Optional

From 38997c3afa00caa07c894a3379a75fa55bb2867c Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Sat, 16 May 2026 16:27:39 -0700
Subject: [PATCH 05/39] Fix: include model_provisioner and simulator in Docker
 image

The StreamWise Docker image was missing the model_provisioner/ package
and simulator/ foundation modules needed by the auto-deploy feature.

- Update deployment/setup_image.sh to copy model_provisioner/ and
  simulator/ into the Docker build context
- Update Dockerfile to COPY both directories
- Fix model_provisioner/__init__.py to find simulator/ in both local dev
  layout (../../simulator) and Docker layout (../simulator)
- Guard sys.path.insert with dedup check in allocator_bridge.py

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 deployment/setup_image.sh        | 2 ++
 deployment/streamwise/Dockerfile | 2 ++
 streamwise/allocator_bridge.py   | 6 ++++--
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/deployment/setup_image.sh b/deployment/setup_image.sh
index e8585bc1..1efeca94 100644
--- a/deployment/setup_image.sh
+++ b/deployment/setup_image.sh
@@ -90,6 +90,8 @@ if [[ "$IMAGE_NAME" == "streamwise" ]]; then
   cp "$APP_DIR"/*.bash "$IMAGE_DIR/docker_files/"
   [[ -d "$APP_DIR/static" ]] && cp -R "$APP_DIR/static" "$IMAGE_DIR/docker_files/"
   [[ -d "$APP_DIR/templates" ]] && cp -R "$APP_DIR/templates" "$IMAGE_DIR/docker_files/"
+  [[ -d "$APP_DIR/model_provisioner" ]] && cp -R "$APP_DIR/model_provisioner" "$IMAGE_DIR/docker_files/"
+  [[ -d "$MAIN_DIR/simulator" ]] && cp -R "$MAIN_DIR/simulator" "$IMAGE_DIR/docker_files/"
   cp "$MAIN_DIR/services.json" "$IMAGE_DIR/docker_files/"
 
   # Certs directory (empty by default; populated with --certfile/--keyfile for embedded HTTPS)
diff --git a/deployment/streamwise/Dockerfile b/deployment/streamwise/Dockerfile
index f7cef49c..a0c05e94 100644
--- a/deployment/streamwise/Dockerfile
+++ b/deployment/streamwise/Dockerfile
@@ -21,6 +21,8 @@ COPY *.sh .
 COPY *.json .
 COPY templates ./templates
 COPY static ./static
+COPY model_provisioner ./model_provisioner
+COPY simulator ./simulator
 
 # TLS certificates (optional — populated by setup_image.sh --certfile/--keyfile or mounted at runtime)
 RUN mkdir -p /certs
diff --git a/streamwise/allocator_bridge.py b/streamwise/allocator_bridge.py
index 40e3fe87..679084bf 100644
--- a/streamwise/allocator_bridge.py
+++ b/streamwise/allocator_bridge.py
@@ -10,8 +10,10 @@
 import sys
 import os
 
-# Ensure the streamwise/ directory is on sys.path so model_provisioner is importable.
-sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+# Ensure the directory containing this file is on sys.path so model_provisioner is importable.
+_HERE = os.path.dirname(os.path.abspath(__file__))
+if _HERE not in sys.path:
+    sys.path.insert(0, _HERE)
 import model_provisioner  # noqa: E402, F401 — adds simulator/ to sys.path
 
 from dataclasses import dataclass

From c8f5f2a0470ae76cccf5cd00445f74105df89968 Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Sat, 16 May 2026 20:32:32 -0700
Subject: [PATCH 06/39] Add pandas and tabulate to StreamWise requirements

The model_provisioner and simulator foundation modules (sim_types,
data_loading, utils, greedy) require pandas and tabulate which were not
previously needed by the StreamWise Docker image.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 streamwise/requirements.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/streamwise/requirements.txt b/streamwise/requirements.txt
index b5695973..468a2136 100644
--- a/streamwise/requirements.txt
+++ b/streamwise/requirements.txt
@@ -5,3 +5,5 @@ aiohttp
 numpy
 scipy
 colorlog
+pandas
+tabulate

From fc1abc951609931d6a09a5bc721306683f5e3bae Mon Sep 17 00:00:00 2001
From: Haoran Qiu <jamesqiu@connect.hku.hk>
Date: Sun, 17 May 2026 15:24:47 -0700
Subject: [PATCH 07/39] Add fallback for GPUs that do not support MIG

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 streamwise/allocator_bridge.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/streamwise/allocator_bridge.py b/streamwise/allocator_bridge.py
index 679084bf..293c887f 100644
--- a/streamwise/allocator_bridge.py
+++ b/streamwise/allocator_bridge.py
@@ -63,13 +63,24 @@
     GPUType.GB200: "gb200",
 }
 
-# MIG containers: these use a MIG slice instead of a full GPU
+# MIG is only supported by pod_manager on these GPU types.
+MIG_CAPABLE_GPU_TYPES: frozenset[GPUType] = frozenset({GPUType.A100, GPUType.H100})
+
+# Containers that prefer a MIG slice when the selected GPU type supports MIG.
 MIG_CONTAINERS: dict[str, str] = {
     "kokoro": "1g.10gb",
     "yolo": "1g.10gb",
     "realesrgan": "1g.10gb",
 }
 
+
+def get_mig_profile(container_name: str, gpu_type: GPUType) -> Optional[str]:
+    """Return a MIG profile only when the selected GPU type supports MIG."""
+    if gpu_type not in MIG_CAPABLE_GPU_TYPES:
+        return None
+    return MIG_CONTAINERS.get(container_name)
+
+
 # Mapping from StreamWise app name to simulator workflow key
 APP_TO_WORKFLOW: dict[str, str] = {
     "streamcast": "podcast",

From 367760be50952f616852ac1380e012314f4b551a Mon Sep 17 00:00:00 2001
From: Haoran Qiu <jamesqiu@connect.hku.hk>
Date: Sun, 17 May 2026 15:26:53 -0700
Subject: [PATCH 08/39] Run run_allocator in a thread via asyncio.to_thread

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 streamwise/streamwise.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/streamwise/streamwise.py b/streamwise/streamwise.py
index 0ce24ac5..4062bbf4 100644
--- a/streamwise/streamwise.py
+++ b/streamwise/streamwise.py
@@ -752,7 +752,8 @@ async def api_auto_deploy() -> QuartReturn:
         if not workflow_name or not isinstance(workflow_name, str):
             return jsonify({"error": "Missing or invalid 'workflow' field"}), HTTPStatus.BAD_REQUEST
 
-        plan = allocator_bridge.run_allocator(
+        plan = await asyncio.to_thread(
+            allocator_bridge.run_allocator,
             gpu_budget=gpu_budget,
             workflow_name=workflow_name,
         )

From 81ec0a99256a9952721a03892dcce030ecac30f7 Mon Sep 17 00:00:00 2001
From: Haoran Qiu <jamesqiu@connect.hku.hk>
Date: Sun, 17 May 2026 15:28:59 -0700
Subject: [PATCH 09/39] Mock pod_manager.add_pod in auto-deploy confirm test

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 tests/streamwise/test_streamwise_auto_deploy.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/tests/streamwise/test_streamwise_auto_deploy.py b/tests/streamwise/test_streamwise_auto_deploy.py
index a191785a..e9cbe2ee 100644
--- a/tests/streamwise/test_streamwise_auto_deploy.py
+++ b/tests/streamwise/test_streamwise_auto_deploy.py
@@ -193,15 +193,17 @@ async def test_auto_deploy_confirm_success() -> None:
             "mig_profile": None,
         },
     ]
-    response = await client.post(
-        "/api/auto_deploy/confirm",
-        json={"specs": specs},
-    )
-    # Should succeed (mocked K8s)
+    with patch("streamwise.pod_manager.add_pod") as mock_add_pod:
+        response = await client.post(
+            "/api/auto_deploy/confirm",
+            json={"specs": specs},
+        )
+    # Should succeed without invoking the real pod_manager.add_pod flow
     assert response.status_code in (HTTPStatus.OK, HTTPStatus.MULTI_STATUS)
     data = await response.get_json()
     assert "deployed" in data
     assert "message" in data
+    assert mock_add_pod.call_count == len(specs)
 
 
 @pytest.mark.asyncio

From 32461f81b39150b4eef8575c1e2e7a4684d82838 Mon Sep 17 00:00:00 2001
From: Haoran Qiu <jamesqiu@connect.hku.hk>
Date: Sun, 17 May 2026 15:29:28 -0700
Subject: [PATCH 10/39] Add GPU count check

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 streamwise/streamwise.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/streamwise/streamwise.py b/streamwise/streamwise.py
index 4062bbf4..b5ffcf4a 100644
--- a/streamwise/streamwise.py
+++ b/streamwise/streamwise.py
@@ -749,6 +749,19 @@ async def api_auto_deploy() -> QuartReturn:
 
         if not gpu_budget or not isinstance(gpu_budget, dict):
             return jsonify({"error": "Missing or invalid 'gpu_budget' field"}), HTTPStatus.BAD_REQUEST
+        for gpu_type_name, count in gpu_budget.items():
+            if isinstance(count, bool) or not isinstance(count, int) or count < 0:
+                return (
+                    jsonify(
+                        {
+                            "error": (
+                                "Invalid 'gpu_budget' field: each GPU type count must be a "
+                                "non-negative integer"
+                            )
+                        }
+                    ),
+                    HTTPStatus.BAD_REQUEST,
+                )
         if not workflow_name or not isinstance(workflow_name, str):
             return jsonify({"error": "Missing or invalid 'workflow' field"}), HTTPStatus.BAD_REQUEST
 

From 28e322ebb07fce9fd45f1a14378c8d41ef66cb11 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 17 May 2026 22:30:10 +0000
Subject: [PATCH 11/39] Restore cwd-relative default for simulator data loading

Agent-Logs-Url: https://github.com/Azure/realtimevideogen/sessions/d53596b3-c563-4deb-af27-7226e9dac364

Co-authored-by: James-QiuHaoran <22564180+James-QiuHaoran@users.noreply.github.com>
---
 simulator/data_loading.py            | 2 +-
 tests/simulator/test_data_loading.py | 9 +++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/simulator/data_loading.py b/simulator/data_loading.py
index af37e5b8..1b37bb60 100644
--- a/simulator/data_loading.py
+++ b/simulator/data_loading.py
@@ -28,7 +28,7 @@
 from constants import POWER_GPU_IDLE
 from constants import POWER_GPU_TDP
 
-_DEFAULT_DATA_DIR = Path(__file__).resolve().parent / "data"
+_DEFAULT_DATA_DIR = Path("data")
 
 
 def load_latency_data(
diff --git a/tests/simulator/test_data_loading.py b/tests/simulator/test_data_loading.py
index 72337375..0bc63ad0 100644
--- a/tests/simulator/test_data_loading.py
+++ b/tests/simulator/test_data_loading.py
@@ -4,6 +4,7 @@
 
 import sys
 import os
+from pathlib import Path
 import pytest
 
 # Add current path
@@ -59,3 +60,11 @@ def test_adaptive_quality() -> None:
             "simulator/data/",
             "nonexisting"
         )
+
+
+def test_default_data_dir_is_cwd_relative(monkeypatch: pytest.MonkeyPatch) -> None:
+    repo_root = Path(__file__).resolve().parents[2]
+    monkeypatch.chdir(repo_root / "simulator")
+
+    assert load_latency_data() is not None
+    assert load_power_data() is not None

From 0bd57d953fd28d830f386c2f470d2e342626e472 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 17 May 2026 22:30:35 +0000
Subject: [PATCH 12/39] Fix auto-deploy confirm handling for add_pod error
 statuses

Agent-Logs-Url: https://github.com/Azure/realtimevideogen/sessions/e52c100d-a760-4a1a-8771-416155f3e835

Co-authored-by: James-QiuHaoran <22564180+James-QiuHaoran@users.noreply.github.com>
---
 streamwise/streamwise.py                      | 18 +++++++++++--
 .../streamwise/test_streamwise_auto_deploy.py | 26 +++++++++++++++++++
 2 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/streamwise/streamwise.py b/streamwise/streamwise.py
index b5ffcf4a..68f43d12 100644
--- a/streamwise/streamwise.py
+++ b/streamwise/streamwise.py
@@ -820,7 +820,7 @@ async def api_auto_deploy_confirm() -> QuartReturn:
                 continue
 
             try:
-                await pod_manager.add_pod(
+                add_pod_result = await pod_manager.add_pod(
                     container_name=container_name,
                     cpu=int(spec.get("cpu", 4)),
                     memory_gib=int(spec.get("memory_gib", 16)),
@@ -831,7 +831,21 @@ async def api_auto_deploy_confirm() -> QuartReturn:
                     namespace=NAMESPACE,
                     k8s_cluster=k8s_cluster,
                 )
-                deployed.append(container_name)
+
+                status_code = HTTPStatus.OK
+                if isinstance(add_pod_result, tuple) and len(add_pod_result) >= 2:
+                    status_value = add_pod_result[1]
+                    if isinstance(status_value, HTTPStatus):
+                        status_code = status_value
+                    elif isinstance(status_value, int):
+                        status_code = HTTPStatus(status_value)
+
+                if status_code >= HTTPStatus.BAD_REQUEST:
+                    msg = f"Failed to deploy '{container_name}' (status={int(status_code)})"
+                    logging.error(msg)
+                    errors.append(msg)
+                else:
+                    deployed.append(container_name)
             except Exception as pod_ex:
                 msg = f"Failed to deploy '{container_name}': {pod_ex}"
                 logging.error(msg)
diff --git a/tests/streamwise/test_streamwise_auto_deploy.py b/tests/streamwise/test_streamwise_auto_deploy.py
index e9cbe2ee..6496c87f 100644
--- a/tests/streamwise/test_streamwise_auto_deploy.py
+++ b/tests/streamwise/test_streamwise_auto_deploy.py
@@ -217,6 +217,32 @@ async def test_auto_deploy_confirm_missing_specs() -> None:
     assert response.status_code == HTTPStatus.BAD_REQUEST
 
 
+@pytest.mark.asyncio
+async def test_auto_deploy_confirm_tracks_add_pod_status_failures() -> None:
+    """Non-2xx add_pod return statuses are surfaced as deployment errors."""
+    client = _get_client()
+    specs = [
+        {"container_name": "gemma", "gpu": 2, "gpu_type": "a100"},
+        {"container_name": "flux", "gpu": 2, "gpu_type": "a100"},
+    ]
+    with patch.object(
+        sw.pod_manager,
+        "add_pod",
+        side_effect=[
+            (None, HTTPStatus.OK),
+            (None, HTTPStatus.BAD_REQUEST),
+        ],
+    ):
+        response = await client.post("/api/auto_deploy/confirm", json={"specs": specs})
+
+    assert response.status_code == HTTPStatus.MULTI_STATUS
+    data = await response.get_json()
+    assert data["deployed"] == ["gemma"]
+    assert len(data["errors"]) == 1
+    assert "flux" in data["errors"][0]
+    assert "status=400" in data["errors"][0]
+
+
 @pytest.mark.asyncio
 async def test_auto_deploy_confirm_empty_specs() -> None:
     """Empty specs list returns 400."""

From bc173a272ee8afd3f233766a5e41774b357d3ddb Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Fri, 5 Jun 2026 23:02:39 -0700
Subject: [PATCH 13/39] Look for simulator data next to allocator_bridge.py

---
 streamwise/allocator_bridge.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/streamwise/allocator_bridge.py b/streamwise/allocator_bridge.py
index 293c887f..b698af91 100644
--- a/streamwise/allocator_bridge.py
+++ b/streamwise/allocator_bridge.py
@@ -118,7 +118,7 @@ class DeploymentPlan:
 
 def _get_data_dir() -> str:
     """Get the path to the simulator data directory."""
-    default_path = os.path.join(os.path.dirname(__file__), "..", "simulator", "data")
+    default_path = os.path.join(os.path.dirname(__file__), "simulator", "data")
     return os.getenv("SIMULATOR_DATA_DIR", default_path)
 
 

From 1cd59409ebdd9527e5c5d7dad44de4eddc177c10 Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Fri, 5 Jun 2026 23:23:16 -0700
Subject: [PATCH 14/39] Add MIG_AVAILABLE flag and budget-exceeded warning

---
 streamwise/allocator_bridge.py    | 48 +++++++++++++++++++++++++++++--
 streamwise/templates/add_pod.html | 15 +++++++++-
 2 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/streamwise/allocator_bridge.py b/streamwise/allocator_bridge.py
index b698af91..3c44b693 100644
--- a/streamwise/allocator_bridge.py
+++ b/streamwise/allocator_bridge.py
@@ -67,15 +67,24 @@
 MIG_CAPABLE_GPU_TYPES: frozenset[GPUType] = frozenset({GPUType.A100, GPUType.H100})
 
 # Containers that prefer a MIG slice when the selected GPU type supports MIG.
+# When MIG is available on the cluster, these services use a MIG slice (shared GPU).
+# When MIG is NOT available, they fall back to 1 full GPU each and the extra GPUs
+# are counted against the budget (with a warning if exceeded).
 MIG_CONTAINERS: dict[str, str] = {
     "kokoro": "1g.10gb",
     "yolo": "1g.10gb",
     "realesrgan": "1g.10gb",
 }
 
+# Whether MIG is actually configured on the cluster.
+# When False, MIG_CONTAINERS entries fall back to full GPUs.
+MIG_AVAILABLE: bool = False
+
 
 def get_mig_profile(container_name: str, gpu_type: GPUType) -> Optional[str]:
-    """Return a MIG profile only when the selected GPU type supports MIG."""
+    """Return a MIG profile only when MIG is available and the GPU type supports it."""
+    if not MIG_AVAILABLE:
+        return None
     if gpu_type not in MIG_CAPABLE_GPU_TYPES:
         return None
     return MIG_CONTAINERS.get(container_name)
@@ -201,6 +210,7 @@ def result_to_deployment_specs(result: Result) -> list[DeploymentSpec]:
     Convert an allocator Result into a list of DeploymentSpec objects.
 
     Each ModelAllocation with replicas > 0 is mapped to one or more container deployments.
+    When MIG is unavailable, containers that would normally use MIG slices get 1 full GPU instead.
     """
     specs: list[DeploymentSpec] = []
 
@@ -220,8 +230,15 @@ def result_to_deployment_specs(result: Result) -> list[DeploymentSpec]:
                     resources = CONTAINER_RESOURCES.get(container_name, (4, 16, 16))
                     cpu, memory_gib, ephemeral_storage_gib = resources
 
-                    mig_profile = MIG_CONTAINERS.get(container_name)
-                    gpu_count = allocation.devices if not mig_profile else 1
+                    mig_profile: Optional[str] = None
+                    if MIG_AVAILABLE and container_name in MIG_CONTAINERS:
+                        mig_profile = MIG_CONTAINERS[container_name]
+                        gpu_count = 1
+                    elif container_name in MIG_CONTAINERS:
+                        # MIG not available: use 1 full GPU instead of a MIG slice
+                        gpu_count = 1
+                    else:
+                        gpu_count = allocation.devices
 
                     for _ in range(allocation.replicas):
                         specs.append(DeploymentSpec(
@@ -239,6 +256,27 @@ def result_to_deployment_specs(result: Result) -> list[DeploymentSpec]:
 
 def deployment_plan_to_json(plan: DeploymentPlan) -> dict:
     """Serialize a DeploymentPlan to a JSON-friendly dict."""
+    # Calculate actual GPUs used by the deployment specs (may differ from allocator
+    # when MIG is unavailable and services fall back to full GPUs).
+    actual_gpus: dict[str, int] = {}
+    for spec in plan.specs:
+        if spec.mig_profile:
+            continue  # MIG slices don't count against full GPU budget
+        gpu_type_key = spec.gpu_type or "unknown"
+        actual_gpus[gpu_type_key] = actual_gpus.get(gpu_type_key, 0) + spec.gpu
+
+    total_budget = sum(plan.gpu_budget.values())
+    total_actual = sum(actual_gpus.values())
+    budget_exceeded = total_actual > total_budget
+
+    warnings: list[str] = []
+    if budget_exceeded:
+        warnings.append(
+            f"Deployment requires {total_actual} full GPUs but budget is "
+            f"{total_budget}. "
+            f"{'Enable MIG to fit lightweight services (kokoro, yolo, realesrgan) on shared GPU slices.' if not MIG_AVAILABLE else ''}"
+        )
+
     return {
         "workflow_name": plan.workflow_name,
         "gpu_budget": plan.gpu_budget,
@@ -250,7 +288,11 @@ def deployment_plan_to_json(plan: DeploymentPlan) -> dict:
                 gpu_type.value: count
                 for gpu_type, count in plan.result.gpus_used.items()
             },
+            "actual_gpus_needed": actual_gpus,
+            "budget_exceeded": budget_exceeded,
         },
+        "warnings": warnings,
+        "mig_available": MIG_AVAILABLE,
         "specs": [
             {
                 "container_name": spec.container_name,
diff --git a/streamwise/templates/add_pod.html b/streamwise/templates/add_pod.html
index f5496e10..2df1a61b 100644
--- a/streamwise/templates/add_pod.html
+++ b/streamwise/templates/add_pod.html
@@ -448,6 +448,7 @@ <h2 class="mt-5">🤖 Auto Deploy</h2>
         <!-- Auto-deploy results (hidden until optimize is clicked) -->
         <div id="auto-deploy-results" style="display:none;">
             <h4>📊 Optimized Deployment Plan</h4>
+            <div id="auto-deploy-warning" class="alert alert-warning mb-3" style="display:none;"></div>
             <div id="auto-deploy-metrics" class="alert alert-success mb-3"></div>
             <table class="table table-sm table-bordered" id="auto-deploy-plan-table">
                 <thead>
@@ -813,13 +814,25 @@ <h4>📊 Optimized Deployment Plan</h4>
                             return;
                         }
                         currentPlan = data;
+                        // Show warnings if any
+                        const warningDiv = document.getElementById('auto-deploy-warning');
+                        if (data.warnings && data.warnings.length > 0) {
+                            warningDiv.innerHTML = data.warnings.map(w =>
+                                `⚠️ ${escapeHtml(w)}`).join('<br>');
+                            warningDiv.style.display = '';
+                        } else {
+                            warningDiv.style.display = 'none';
+                        }
                         // Show metrics
                         const metrics = data.metrics;
+                        const actualGpus = metrics.actual_gpus_needed
+                            ? JSON.stringify(metrics.actual_gpus_needed) : JSON.stringify(metrics.gpus_used);
                         document.getElementById('auto-deploy-metrics').innerHTML =
                             `<strong>Total Time:</strong> ${metrics.total_time_s}s &nbsp;|&nbsp; ` +
                             `<strong>TTFF:</strong> ${metrics.ttff_s}s &nbsp;|&nbsp; ` +
                             `<strong>Cost:</strong> $${metrics.cost} &nbsp;|&nbsp; ` +
-                            `<strong>GPUs Used:</strong> ${JSON.stringify(metrics.gpus_used)}`;
+                            `<strong>GPUs Needed:</strong> ${actualGpus}` +
+                            (metrics.budget_exceeded ? ' <span class="text-danger">(exceeds budget!)</span>' : '');
                         // Show plan table
                         const tbody = document.getElementById('auto-deploy-plan-body');
                         tbody.innerHTML = '';

From d01ee1cdc7669507cd16322bf0f5e3171fbadbb9 Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Fri, 5 Jun 2026 23:26:34 -0700
Subject: [PATCH 15/39] Return a clear 400 error when the allocator's GPU budget assertion fails

---
 streamwise/streamwise.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/streamwise/streamwise.py b/streamwise/streamwise.py
index 68f43d12..ecb16565 100644
--- a/streamwise/streamwise.py
+++ b/streamwise/streamwise.py
@@ -774,6 +774,13 @@ async def api_auto_deploy() -> QuartReturn:
 
     except ValueError as ve:
         return jsonify({"error": str(ve)}), HTTPStatus.BAD_REQUEST
+    except AssertionError as ae:
+        msg = str(ae) if str(ae) else (
+            "GPU budget too small. Each GPU type must have at least 8 GPUs "
+            "(one full server). Use a single GPU type with 8+ GPUs, or "
+            "ensure each type has at least 8."
+        )
+        return jsonify({"error": msg}), HTTPStatus.BAD_REQUEST
     except Exception as ex:
         logging.exception("Error in auto_deploy: %s", ex)
         return jsonify({"error": str(ex)}), HTTPStatus.INTERNAL_SERVER_ERROR

From 14c2aee4d99eba5b6b57673ddb72e003f49dc9f1 Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Tue, 9 Jun 2026 10:42:49 -0700
Subject: [PATCH 16/39] Auto-populate GPU budget from cluster state in
 auto-deploy UI

Add /api/auto_deploy/cluster_gpus endpoint that aggregates allocatable GPUs
by type (H100, A100, etc.) from all ready nodes. The auto-deploy form fetches
this on page load and pre-fills the GPU budget text boxes.

Also fixed NVIDIA device plugin toleration for Spot nodes (needed to register
GPUs on AKS Spot node pools).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 streamwise/streamwise.py            | 54 +++++++++++++++++++++
 streamwise/templates/add_pod.html   | 21 +++++++++
 tests/streamwise/test_streamwise.py | 73 +++++++++++++++++++++++++++++
 3 files changed, 148 insertions(+)

diff --git a/streamwise/streamwise.py b/streamwise/streamwise.py
index ecb16565..02ef92cd 100644
--- a/streamwise/streamwise.py
+++ b/streamwise/streamwise.py
@@ -879,6 +879,60 @@ async def api_auto_deploy_workflows() -> QuartReturn:
     }), HTTPStatus.OK
 
 
+@route("/api/auto_deploy/cluster_gpus", methods=["GET"])
+async def api_auto_deploy_cluster_gpus() -> QuartReturn:
+    """Return aggregated GPU counts by type from the current cluster.
+
+    Inspects all ready nodes and sums up allocatable GPUs grouped by the
+    nvidia.com/gpu.product label (mapped to canonical names like A100, H100, etc.).
+    """
+    try:
+        nodes = await get_k8s_nodes(k8s_cluster)
+        gpu_counts: dict[str, int] = {}
+        for node in nodes:
+            if not node.get("is_ready"):
+                continue
+            gpu_model = node.get("gpu_model", "N/A")
+            if gpu_model == "N/A":
+                continue
+            gpu_count = node.get("allocatable_resources", {}).get("gpu", 0)
+            if isinstance(gpu_count, str):
+                try:
+                    gpu_count = int(gpu_count)
+                except ValueError:
+                    continue
+            if gpu_count <= 0:
+                continue
+            # Map gpu_model label to canonical type name
+            canonical = _gpu_label_to_canonical(gpu_model)
+            gpu_counts[canonical] = gpu_counts.get(canonical, 0) + gpu_count
+        return jsonify({"gpu_budget": gpu_counts}), HTTPStatus.OK
+    except Exception as ex:
+        logging.exception("Error in cluster_gpus: %s", ex)
+        return jsonify({"error": str(ex)}), HTTPStatus.INTERNAL_SERVER_ERROR
+
+
+def _gpu_label_to_canonical(gpu_model: str) -> str:
+    """Map a GPU product label to a canonical type name for the allocator."""
+    model_upper = gpu_model.upper()
+    if "H100" in model_upper:
+        return "H100"
+    elif "H200" in model_upper:
+        return "H200"
+    elif "A100" in model_upper:
+        return "A100"
+    elif "GB200" in model_upper:
+        return "GB200"
+    elif "GB300" in model_upper:
+        return "GB300"
+    elif "V100" in model_upper:
+        return "V100"
+    elif "A10" in model_upper:
+        return "A10"
+    # Fallback: return as-is
+    return gpu_model
+
+
 @route("/api/node/<node_name>", methods=["DELETE"])
 async def api_remove_node(node_name: str) -> QuartReturn:
     return await node_manager.remove_node(
diff --git a/streamwise/templates/add_pod.html b/streamwise/templates/add_pod.html
index 2df1a61b..0f64a08b 100644
--- a/streamwise/templates/add_pod.html
+++ b/streamwise/templates/add_pod.html
@@ -779,6 +779,27 @@ <h4>📊 Optimized Deployment Plan</h4>
             if (autoDeployForm) {
                 let currentPlan = null;
 
+                // Auto-populate GPU budget from cluster state
+                fetch('/api/auto_deploy/cluster_gpus', {credentials: 'same-origin'})
+                    .then(r => r.json())
+                    .then(data => {
+                        if (data.gpu_budget) {
+                            const fieldMap = {
+                                'A100': 'auto_gpu_a100',
+                                'H100': 'auto_gpu_h100',
+                                'H200': 'auto_gpu_h200',
+                                'GB200': 'auto_gpu_gb200',
+                            };
+                            for (const [gpuType, fieldId] of Object.entries(fieldMap)) {
+                                const el = document.getElementById(fieldId);
+                                if (el && data.gpu_budget[gpuType] !== undefined) {
+                                    el.value = data.gpu_budget[gpuType];
+                                }
+                            }
+                        }
+                    })
+                    .catch(err => console.warn('Could not auto-populate GPU budget:', err));
+
                 autoDeployForm.addEventListener('submit', function(e) {
                     e.preventDefault();
                     const btn = document.getElementById('auto-deploy-optimize-btn');
diff --git a/tests/streamwise/test_streamwise.py b/tests/streamwise/test_streamwise.py
index 6ec7d09c..41df2012 100644
--- a/tests/streamwise/test_streamwise.py
+++ b/tests/streamwise/test_streamwise.py
@@ -730,3 +730,76 @@ def test_set_verify_ssl_true() -> None:
         assert http_session_manager.VERIFY_SSL is True
     finally:
         http_session_manager.set_verify_ssl(original)
+
+
+@pytest.mark.asyncio
+async def test_api_cluster_gpus_aggregates_by_type() -> None:
+    """The cluster_gpus endpoint aggregates GPU counts by canonical type name."""
+    mock_nodes = [
+        {
+            "node_name": "h100-node-0",
+            "is_ready": True,
+            "gpu_model": "NVIDIA-H100-80GB-HBM3",
+            "allocatable_resources": {"gpu": "8"},
+        },
+        {
+            "node_name": "h100-node-1",
+            "is_ready": True,
+            "gpu_model": "NVIDIA-H100-80GB-HBM3",
+            "allocatable_resources": {"gpu": "8"},
+        },
+        {
+            "node_name": "a100-node-0",
+            "is_ready": True,
+            "gpu_model": "NVIDIA A100-SXM4-80GB",
+            "allocatable_resources": {"gpu": "8"},
+        },
+        {
+            "node_name": "cpu-node",
+            "is_ready": True,
+            "gpu_model": "N/A",
+            "allocatable_resources": {"gpu": "0"},
+        },
+    ]
+    client = _get_client()
+    with patch("streamwise.streamwise.get_k8s_nodes", new=AsyncMock(return_value=mock_nodes)):
+        response = await client.get("/api/auto_deploy/cluster_gpus")
+    assert response.status_code == HTTPStatus.OK
+    data = await response.get_json()
+    assert data["gpu_budget"] == {"H100": 16, "A100": 8}
+
+
+@pytest.mark.asyncio
+async def test_api_cluster_gpus_skips_not_ready_nodes() -> None:
+    """The cluster_gpus endpoint skips nodes that are not ready."""
+    mock_nodes = [
+        {
+            "node_name": "h100-node-0",
+            "is_ready": True,
+            "gpu_model": "NVIDIA-H100-80GB-HBM3",
+            "allocatable_resources": {"gpu": "8"},
+        },
+        {
+            "node_name": "h100-node-1",
+            "is_ready": False,
+            "gpu_model": "NVIDIA-H100-80GB-HBM3",
+            "allocatable_resources": {"gpu": "8"},
+        },
+    ]
+    client = _get_client()
+    with patch("streamwise.streamwise.get_k8s_nodes", new=AsyncMock(return_value=mock_nodes)):
+        response = await client.get("/api/auto_deploy/cluster_gpus")
+    assert response.status_code == HTTPStatus.OK
+    data = await response.get_json()
+    assert data["gpu_budget"] == {"H100": 8}
+
+
+@pytest.mark.asyncio
+async def test_api_cluster_gpus_empty_cluster() -> None:
+    """The cluster_gpus endpoint returns empty budget for a cluster with no GPU nodes."""
+    client = _get_client()
+    with patch("streamwise.streamwise.get_k8s_nodes", new=AsyncMock(return_value=[])):
+        response = await client.get("/api/auto_deploy/cluster_gpus")
+    assert response.status_code == HTTPStatus.OK
+    data = await response.get_json()
+    assert data["gpu_budget"] == {}

From 7e3345411416639adb27912e9f7e6e0b62cf7361 Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Tue, 9 Jun 2026 10:48:08 -0700
Subject: [PATCH 17/39] Document GPU Spot node setup: toleration patch and node
 labels

The upstream NVIDIA device plugin DaemonSet lacks the Spot toleration, so
it won't schedule on AKS Spot GPU nodes. Document that the local manifest
(deployment/k8s/nvidia-device-plugin-ds.yaml) already includes this fix,
and provide the patch command as a fallback.

Also document the need for manual nvidia.com/gpu.product labels on nodes
until GPU Feature Discovery is installed.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 deployment/aks/README.md | 59 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 58 insertions(+), 1 deletion(-)

diff --git a/deployment/aks/README.md b/deployment/aks/README.md
index e72feffb..d9f10be2 100644
--- a/deployment/aks/README.md
+++ b/deployment/aks/README.md
@@ -220,6 +220,61 @@ kubectl create namespace gpu-resources
 kubectl apply -f deployment/k8s/nvidia-device-plugin-ds.yaml
 ```
 
+### 5.0 Critical: Spot Node Toleration and GPU Labels
+
+AKS Spot node pools apply the taint `kubernetes.azure.com/scalesetpriority=spot:NoSchedule`.
+The **upstream** NVIDIA device plugin (`https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/...`) only tolerates `nvidia.com/gpu`, so **it will not schedule on Spot GPU nodes**.
+
+**Always use the local manifest** (`deployment/k8s/nvidia-device-plugin-ds.yaml`) which already includes the Spot toleration. If you already applied the upstream manifest, patch it:
+```bash
+kubectl patch daemonset nvidia-device-plugin-daemonset -n kube-system \
+  --type='json' \
+  -p='[{"op":"add","path":"/spec/template/spec/tolerations/-","value":{"key":"kubernetes.azure.com/scalesetpriority","operator":"Equal","value":"spot","effect":"NoSchedule"}}]'
+```
+
+Without this patch, `nvidia.com/gpu` will report 0 on Spot nodes and GPU pods will remain Pending.
+
+**Label GPU nodes** (required for pod scheduling with nodeAffinity):
+
+AKS does not automatically apply `nvidia.com/gpu.product` labels. Without GPU Feature Discovery (NFD+GFD), you must label nodes manually:
+```bash
+# For H100 nodes:
+kubectl label node <node-name> nvidia.com/gpu.product=NVIDIA-H100-80GB-HBM3
+
+# For A100 nodes:
+kubectl label node <node-name> nvidia.com/gpu.product=NVIDIA-A100-SXM4-80GB
+
+# For H200 nodes:
+kubectl label node <node-name> nvidia.com/gpu.product=NVIDIA-H200-141GB-HBM3
+```
+
+To label all nodes in a GPU pool at once:
+```bash
+kubectl label nodes -l agentpool=gpuh100 nvidia.com/gpu.product=NVIDIA-H100-80GB-HBM3
+```
+
+> **⚠️ These labels are lost on node eviction/recreation.** If a Spot node is evicted and a new one
+> joins, you must re-apply the label. For a permanent solution, install
+> [NVIDIA GPU Feature Discovery](https://github.com/NVIDIA/gpu-feature-discovery) which
+> automatically detects and labels GPU hardware.
+
+After patching the toleration and labeling nodes, verify GPU registration:
+```bash
+# Confirm device plugin pods are running on GPU nodes
+kubectl get pods -n kube-system -l name=nvidia-device-plugin-ds -o wide
+
+# Confirm GPUs are registered
+kubectl get nodes -o custom-columns="NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu,LABEL:.metadata.labels.nvidia\.com/gpu\.product"
+```
+
+Expected output for 4× H100 nodes:
+```
+NAME                            GPU   LABEL
+aks-gpuh100-xxxxx-vmss000000    8     NVIDIA-H100-80GB-HBM3
+aks-gpuh100-xxxxx-vmss000001    8     NVIDIA-H100-80GB-HBM3
+...
+```
+
 Scale the GPU spot node pool up (it starts at 0 nodes):
 
 ```bash
@@ -342,7 +397,9 @@ kubectl get events -n rtgen --sort-by='.lastTimestamp'
 Common issues:
 - **Image pull errors**: Verify ACR is attached to AKS (`az aks check-acr -g $AZ_RESOURCE_GROUP -n $AKS_CLUSTER --acr <acrName>`)
 - **Pods stuck in Pending (Insufficient cpu)**: The system node pool doesn't have enough CPU. Scale up with `az aks nodepool scale` or use a larger VM size (see Sizing note in Step 1)
-- **GPU not available**: Ensure the GPU node pool is scaled up and the NVIDIA device plugin is running
+- **GPU not available (Spot nodes)**: The NVIDIA device plugin may not be running on Spot GPU nodes because it lacks the Spot toleration. See Step 5.0 for the patch command
+- **GPU not available (0 GPUs on node)**: Ensure the NVIDIA device plugin daemonset pod is Running on the GPU node. If the node still reports 0 GPUs, restart the device plugin pod or the node
+- **Pods stuck with "node(s) didn't match Pod's node affinity/selector"**: GPU pods use `nodeAffinity` requiring `nvidia.com/gpu.product` label. Label your GPU nodes per Step 5.0
 - **MIG node reports 0 GPUs**: The `mixed`-strategy device plugin cannot enumerate devices until MIG mode is enabled and MIG instances are created. Complete the full [MIG Setup Guide](../k8s/MIG.md) — once instances exist the plugin will register `nvidia.com/gpu` (full GPUs) and `nvidia.com/mig-<profile>` (MIG slices) within ~30–60 seconds
 - **Spot VM evicted**: Spot VMs may be evicted at any time. Re-run `az aks nodepool scale` to restore the node. After re-scaling a MIG node you must repeat the [MIG Setup Guide](../k8s/MIG.md) since MIG state does not persist across evictions
 - **LoadBalancer stuck in Pending**: Verify the public IP exists (`az network public-ip show -g $AZ_RESOURCE_GROUP --name aks-pods-public-ip`) and the AKS identity has Network Contributor role on the resource group

From 3a16c292e43ce83b6a8cbb65662956efe3d7bc44 Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Tue, 9 Jun 2026 10:51:32 -0700
Subject: [PATCH 18/39] Fix: default GPU budget to 0, let cluster state
 populate values

Previously A100 defaulted to 8 even when no A100 nodes exist. Now all
fields default to 0 and are populated only from the cluster_gpus API.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 streamwise/templates/add_pod.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/streamwise/templates/add_pod.html b/streamwise/templates/add_pod.html
index 0f64a08b..94b980ad 100644
--- a/streamwise/templates/add_pod.html
+++ b/streamwise/templates/add_pod.html
@@ -397,7 +397,7 @@ <h2 class="mt-5">🤖 Auto Deploy</h2>
                     <div class="col-md-3">
                         <label for="auto_gpu_a100" class="form-label">A100</label>
                         <input type="number" class="form-control" id="auto_gpu_a100" name="gpu_a100"
-                            min="0" max="64" value="8">
+                            min="0" max="64" value="0">
                     </div>
                     <div class="col-md-3">
                         <label for="auto_gpu_h100" class="form-label">H100</label>

From 2075db834b82e9695881d95020e2c1fe22720c5d Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Tue, 9 Jun 2026 11:16:43 -0700
Subject: [PATCH 19/39] Fix allocator budget: OTHERS=2 without MIG, co-locate
 HF_VAE

When MIG is unavailable, patch DEVICE_OPTIONS so OTHERS (kokoro+yolo)
counts as 2 full GPUs instead of 1 MIG slice.

Mark hunyuanframepackvae as a co-located container (gpu=0) since it
shares resources with the HunyuanFramePack server.

This ensures budget=16 produces exactly 16 GPUs for StreamCast.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 streamwise/allocator_bridge.py | 33 ++++++++++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/streamwise/allocator_bridge.py b/streamwise/allocator_bridge.py
index 3c44b693..dd47cbda 100644
--- a/streamwise/allocator_bridge.py
+++ b/streamwise/allocator_bridge.py
@@ -24,6 +24,7 @@
 from sim_types import Result
 
 from auto_model_allocator import AutoModelAllocator
+from constants import DEVICE_OPTIONS
 from data_loading import load_latency_data
 from model_provisioner.policies import STREAMWISE_POLICY
 from workflows import WORKFLOWS
@@ -76,6 +77,11 @@
     "realesrgan": "1g.10gb",
 }
 
+# Containers that are co-located with their parent model (sharing GPUs on the same server).
+# The allocator counts their GPUs as part of the parent model's allocation, so they should
+# deploy with gpu=0 to avoid double-counting.
+COLOCATED_CONTAINERS: frozenset[str] = frozenset({"hunyuanframepackvae"})
+
 # Whether MIG is actually configured on the cluster.
 # When False, MIG_CONTAINERS entries fall back to full GPUs.
 MIG_AVAILABLE: bool = False
@@ -182,6 +188,17 @@ def run_allocator(
     if not num_gpus or sum(num_gpus.values()) < 8:
         raise ValueError("Total GPU budget must be at least 8 GPUs.")
 
+    # When MIG is not available, adjust DEVICE_OPTIONS so the allocator reserves
+    # enough GPUs for containers that would normally share a MIG GPU.
+    # OTHERS (kokoro + yolo) needs 2 full GPUs instead of 1 MIG-shared GPU.
+    original_device_options: dict[Model, list[int]] = {}
+    if not MIG_AVAILABLE:
+        for model, containers in MODEL_TO_CONTAINERS.items():
+            mig_count = sum(1 for c in containers if c in MIG_CONTAINERS)
+            if mig_count > 1 and DEVICE_OPTIONS.get(model) == [1]:
+                original_device_options[model] = DEVICE_OPTIONS[model]
+                DEVICE_OPTIONS[model] = [mig_count]
+
     # Load latency data and run allocator
     data_dir = _get_data_dir()
     latency_data = load_latency_data(data_dir=data_dir)
@@ -194,6 +211,10 @@ def run_allocator(
 
     result = allocator.allocate(num_gpus=num_gpus, verbose=False)
 
+    # Restore original DEVICE_OPTIONS
+    for model, original in original_device_options.items():
+        DEVICE_OPTIONS[model] = original
+
     # Convert result to deployment specs
     specs = result_to_deployment_specs(result)
 
@@ -231,7 +252,10 @@ def result_to_deployment_specs(result: Result) -> list[DeploymentSpec]:
                     cpu, memory_gib, ephemeral_storage_gib = resources
 
                     mig_profile: Optional[str] = None
-                    if MIG_AVAILABLE and container_name in MIG_CONTAINERS:
+                    if container_name in COLOCATED_CONTAINERS:
+                        # Co-located with parent model; shares GPU on the same server
+                        gpu_count = 0
+                    elif MIG_AVAILABLE and container_name in MIG_CONTAINERS:
                         mig_profile = MIG_CONTAINERS[container_name]
                         gpu_count = 1
                     elif container_name in MIG_CONTAINERS:
@@ -271,10 +295,13 @@ def deployment_plan_to_json(plan: DeploymentPlan) -> dict:
 
     warnings: list[str] = []
     if budget_exceeded:
+        mig_hint = (
+            "Enable MIG to fit lightweight services (kokoro, yolo, realesrgan) "
+            "on shared GPU slices."
+        ) if not MIG_AVAILABLE else ""
         warnings.append(
             f"Deployment requires {total_actual} full GPUs but budget is "
-            f"{total_budget}. "
-            f"{'Enable MIG to fit lightweight services (kokoro, yolo, realesrgan) on shared GPU slices.' if not MIG_AVAILABLE else ''}"
+            f"{total_budget}. {mig_hint}"
         )
 
     return {

From b239abcaf4a6953ea7f0187f6a152326aa39303e Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Tue, 9 Jun 2026 11:18:08 -0700
Subject: [PATCH 20/39] Update test to reflect MIG_AVAILABLE=False behavior

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 tests/streamwise/test_allocator_bridge.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tests/streamwise/test_allocator_bridge.py b/tests/streamwise/test_allocator_bridge.py
index 569e4073..0394ad2b 100644
--- a/tests/streamwise/test_allocator_bridge.py
+++ b/tests/streamwise/test_allocator_bridge.py
@@ -143,9 +143,14 @@ def test_result_to_deployment_specs_basic() -> None:
     assert gemma_spec.gpu_type == "a100"
     assert gemma_spec.gpu == 1
 
-    # MIG containers get mig_profile set
+    # Without MIG, kokoro gets no mig_profile (full GPU)
     kokoro_spec = next(s for s in specs if s.container_name == "kokoro")
-    assert kokoro_spec.mig_profile == "1g.10gb"
+    assert kokoro_spec.mig_profile is None
+    assert kokoro_spec.gpu == 1
+
+    # Co-located container gets gpu=0
+    vae_spec = next(s for s in specs if s.container_name == "hunyuanframepackvae")
+    assert vae_spec.gpu == 0
 
 
 def test_result_to_deployment_specs_skips_zero_replicas() -> None:

From 4713651c0cfee03ffc4e38a19ed89167c0ceb37b Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Tue, 9 Jun 2026 11:55:26 -0700
Subject: [PATCH 21/39] Fix per-type GPU budget overflow by trimming excess
 replicas

The allocator counts OTHERS as 1 GPU (SINGLE_DEVICE_MODELS constraint),
but without MIG, kokoro and yolo each need a full GPU (2 in total). This
caused per-type budget violations (e.g., A100=9 when budget=8).

Instead of patching DEVICE_OPTIONS (ineffective due to allocator
constraints), detect per-type overflow after allocation and trim
excess replicas of the most-replicated container on the overflowing
type. This limits the throughput loss while respecting per-type budgets.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 streamwise/allocator_bridge.py | 82 +++++++++++++++++++++++++++-------
 1 file changed, 66 insertions(+), 16 deletions(-)

diff --git a/streamwise/allocator_bridge.py b/streamwise/allocator_bridge.py
index dd47cbda..90dce1bc 100644
--- a/streamwise/allocator_bridge.py
+++ b/streamwise/allocator_bridge.py
@@ -24,7 +24,6 @@
 from sim_types import Result
 
 from auto_model_allocator import AutoModelAllocator
-from constants import DEVICE_OPTIONS
 from data_loading import load_latency_data
 from model_provisioner.policies import STREAMWISE_POLICY
 from workflows import WORKFLOWS
@@ -137,6 +136,57 @@ def _get_data_dir() -> str:
     return os.getenv("SIMULATOR_DATA_DIR", default_path)
 
 
+# Reverse mapping from pod gpu_type string to GPUType enum
+_POD_STR_TO_GPU_TYPE: dict[str, GPUType] = {v: k for k, v in GPU_TYPE_TO_POD_STR.items()}
+
+
+def _calc_actual_gpus_per_type(specs: list['DeploymentSpec']) -> dict[GPUType, int]:
+    """Calculate actual GPUs needed per GPUType from deployment specs."""
+    result: dict[GPUType, int] = {}
+    for spec in specs:
+        if spec.mig_profile:
+            continue
+        gpu_type = _POD_STR_TO_GPU_TYPE.get(spec.gpu_type or "")
+        if gpu_type is not None:
+            result[gpu_type] = result.get(gpu_type, 0) + spec.gpu
+    return result
+
+
+def _trim_specs_for_type(
+    specs: list['DeploymentSpec'], gpu_type_str: str, excess: int
+) -> list['DeploymentSpec']:
+    """
+    Remove replicas from specs to reduce GPU usage on a specific type by `excess` GPUs.
+
+    Prefers removing replicas of the most-replicated scalable container (typically
+    realesrgan/upscaler) to minimize impact on pipeline throughput.
+    """
+    # Count replicas per container on this GPU type (only scalable ones)
+    from collections import Counter
+    type_counts: Counter[str] = Counter()
+    for spec in specs:
+        if spec.gpu_type == gpu_type_str and spec.gpu > 0 and spec.container_name not in COLOCATED_CONTAINERS:
+            type_counts[spec.container_name] += 1
+
+    # Prefer trimming containers with most replicas (least impact per removal)
+    trimmed = 0
+    result_specs = list(specs)
+    for container_name, _count in type_counts.most_common():
+        if trimmed >= excess:
+            break
+        # Remove replicas from the end of the list
+        for i in range(len(result_specs) - 1, -1, -1):
+            if trimmed >= excess:
+                break
+            spec = result_specs[i]
+            if (spec.container_name == container_name
+                    and spec.gpu_type == gpu_type_str
+                    and spec.gpu > 0):
+                trimmed += spec.gpu
+                result_specs.pop(i)
+    return result_specs
+
+
 def get_available_workflows() -> list[str]:
     """Return list of available workflow names for the UI."""
     return list(APP_TO_WORKFLOW.keys())
@@ -188,17 +238,6 @@ def run_allocator(
     if not num_gpus or sum(num_gpus.values()) < 8:
         raise ValueError("Total GPU budget must be at least 8 GPUs.")
 
-    # When MIG is not available, adjust DEVICE_OPTIONS so the allocator reserves
-    # enough GPUs for containers that would normally share a MIG GPU.
-    # OTHERS (kokoro + yolo) needs 2 full GPUs instead of 1 MIG-shared GPU.
-    original_device_options: dict[Model, list[int]] = {}
-    if not MIG_AVAILABLE:
-        for model, containers in MODEL_TO_CONTAINERS.items():
-            mig_count = sum(1 for c in containers if c in MIG_CONTAINERS)
-            if mig_count > 1 and DEVICE_OPTIONS.get(model) == [1]:
-                original_device_options[model] = DEVICE_OPTIONS[model]
-                DEVICE_OPTIONS[model] = [mig_count]
-
     # Load latency data and run allocator
     data_dir = _get_data_dir()
     latency_data = load_latency_data(data_dir=data_dir)
@@ -211,13 +250,24 @@ def run_allocator(
 
     result = allocator.allocate(num_gpus=num_gpus, verbose=False)
 
-    # Restore original DEVICE_OPTIONS
-    for model, original in original_device_options.items():
-        DEVICE_OPTIONS[model] = original
-
     # Convert result to deployment specs
     specs = result_to_deployment_specs(result)
 
+    # When MIG is unavailable, deployment specs may use more GPUs per type than the
+    # allocator budgeted (e.g., OTHERS allocates 1 GPU but kokoro+yolo each need a
+    # full GPU = 2). Detect per-type overflow and trim excess replicas.
+    if not MIG_AVAILABLE:
+        actual_per_type = _calc_actual_gpus_per_type(specs)
+        for gpu_type, budget_count in num_gpus.items():
+            actual = actual_per_type.get(gpu_type, 0)
+            if actual <= budget_count:
+                continue
+            # Need to trim (actual - budget_count) GPUs from this type.
+            # Remove replicas of the most-replicated scalable container on this type.
+            excess = actual - budget_count
+            gpu_type_str = GPU_TYPE_TO_POD_STR[gpu_type]
+            specs = _trim_specs_for_type(specs, gpu_type_str, excess)
+
     return DeploymentPlan(
         specs=specs,
         result=result,

From 485f579e1c06524aab58b15c62a4d9c4c750f926 Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Tue, 9 Jun 2026 11:57:03 -0700
Subject: [PATCH 22/39] Add end-to-end demo skills walkthrough

---
 .github/streamwise-demo-skills.md | 225 ++++++++++++++++++++++++++++++
 1 file changed, 225 insertions(+)
 create mode 100644 .github/streamwise-demo-skills.md

diff --git a/.github/streamwise-demo-skills.md b/.github/streamwise-demo-skills.md
new file mode 100644
index 00000000..679f3fa1
--- /dev/null
+++ b/.github/streamwise-demo-skills.md
@@ -0,0 +1,225 @@
+# StreamWise Demo: End-to-End AKS Deployment with GPU Spot Probing
+
+This document summarizes the full deployment walkthrough performed on 2026-06-05,
+from capacity probing through to a running StreamWise instance on AKS with 32 H100 GPUs.
+
+## Overview
+
+| Step | Tool/Skill | Outcome |
+|------|-----------|---------|
+| 1. SKU Discovery | `az vm list-skus` | Found `Standard_ND96isrf_H100_v5` unrestricted in eastus2 and SwedenCentral |
+| 2. Capacity Pre-Check | CCC Kusto query | SwedenCentral AZ03 has 6 allocable Spot VMs — enough for 4 nodes |
+| 3. AKS Deployment | Bicep (`aks.bicep`) | Cluster + 4 H100 Spot nodes + networking provisioned in ~12 min |
+| 4. K8s Setup | kubectl | Namespace, secrets, PV/PVC, NVIDIA device plugin |
+| 5. StreamWise Deploy | kubectl + YAML templating | Pod running, web UI accessible at public IP:8081 |
+
+## Step 1: GPU SKU Discovery (`azure-gpu-spot-probe` skill)
+
+### What We Learned
+
+The `azure-gpu-spot-probe` skill provides a structured approach to finding GPU Spot capacity:
+
+1. **List available SKUs** with `az vm list-skus --size H100` to find all H100 variants and their restriction status.
+2. **Cross-reference with CCC data** (Kusto query on `onecapacityfollower.centralus.kusto.windows.net`) to see actual allocable Spot VMs per region/zone.
+3. **Key insight:** CCC data is fleet-wide, not per-subscription. A region may show capacity but still be `Location`-restricted for your subscription.
+
+### H100 SKU Variants
+
+| SKU | Key Difference |
+|-----|---------------|
+| `Standard_ND96isr_H100_v5` | 8× H100, InfiniBand |
+| `Standard_ND96isrf_H100_v5` | 8× H100, InfiniBand (refresh/newer) |
+| `Standard_ND96is_H100_v5` | 8× H100, no InfiniBand suffix |
+| `Standard_ND96is_noIB_H100_v5` | 8× H100, explicitly no InfiniBand |
+
+### Subscription Access Results
+
+For `Standard_ND96isrf_H100_v5`:
+- **Unrestricted:** eastus2 (zones 1,2), SwedenCentral (zones 1,2,3)
+- **Location-blocked:** eastus, centralus, northcentralus
+
+### CCC Capacity Data
+
+```
+Region          | SKU                          | AllocableSpotVMs
+swedencentral   | Standard_ND96isrf_H100_v5    | 6
+eastus          | Standard_ND96isr_H100_v5     | 16 (but sub is blocked)
+centralus       | Standard_ND96isr_H100_v5     | 83 (but sub is blocked)
+```
+
+**Decision:** SwedenCentral — 6 allocable Spot VMs, subscription unrestricted, zones 1/2/3.
+
+## Step 2: AKS Cluster Deployment
+
+### Bicep Template (`deployment/aks/aks.bicep`)
+
+The template provisions:
+- **System node pool:** Standard_D16s_v5 (1 node) for StreamWise/StreamCast/system pods
+- **GPU Spot node pool (`spoth100`):** Full-GPU nodes for heavy models
+- **GPU MIG Spot node pool (`spoth100mig`):** For mixed-mode (7 full GPUs + 1 MIG-partitioned)
+- **Networking:** Static public IP, NAT gateway, NSG (ports 8000–9000), VNet with disabled default outbound
+
+### Deployment Command
+
+```bash
+az deployment group create \
+  --name AKSDeployment \
+  --resource-group hqiu-streamwise-aks-cluster \
+  --template-file deployment/aks/aks.bicep \
+  --parameters \
+    clusterName=hqiu-streamwise-aks-cluster-cluster \
+    gpuNodeVmSize=Standard_ND96isrf_H100_v5 \
+    gpuNodePoolName=spoth100 \
+    gpuMigNodePoolName=spoth100mig \
+    gpuNodeCount=4 \
+    acrName=inigogrtgen \
+    acrResourceGroup=inigog-acr
+```
+
+### Gotcha: ACR Role Assignment Failure
+
+The Bicep template includes a cross-resource-group ACR role assignment. If the role already exists
+(e.g., from a prior deployment), it fails with `RoleAssignmentUpdateNotPermitted`. The cluster
+itself still succeeds — just attach ACR manually:
+
+```bash
+az aks update -g <rg> -n <cluster> --attach-acr <acrName>
+```
+
+### Gotcha: ACR Login Server Name
+
+ACR names like `inigogrtgen` may have a login server of `inigogrtgen-<hash>.azurecr.io`, NOT
+`inigogrtgen.azurecr.io`. Always verify with:
+
+```bash
+az acr show --name <acr> --query loginServer -o tsv
+```
+
+## Step 3: Kubernetes Setup
+
+```bash
+kubectl create namespace rtgen
+kubectl create secret generic hf-token -n rtgen --from-literal=token=$HF_TOKEN
+kubectl apply -f deployment/k8s/local-pv.yaml
+kubectl apply -f deployment/k8s/local-pvc.yaml -n rtgen
+kubectl create namespace gpu-resources
+kubectl apply -f deployment/k8s/nvidia-device-plugin-ds.yaml
+```
+
+## Step 4: StreamWise Deployment
+
+The `streamwise-pod.yaml` uses shell variable placeholders (`${ACR_URL}`, `${LOAD_BALANCER_IP}`,
+`${RESOURCE_GROUP_NAME}`). On Linux use `envsubst`; on Windows use PowerShell string replacement:
+
+```powershell
+$yaml = Get-Content "deployment/aks/streamwise-pod.yaml" -Raw
+$yaml = $yaml -replace '\$\{ACR_URL\}', 'inigogrtgen-cjd9f3dydte2bzbb.azurecr.io'
+$yaml = $yaml -replace '\$\{RESOURCE_GROUP_NAME\}', 'hqiu-streamwise-aks-cluster'
+$yaml = $yaml -replace '\$\{LOAD_BALANCER_IP\}', '4.223.71.250'
+$yaml | kubectl apply -f -
+```
+
+### First Pull Time
+
+The StreamWise image is ~9 GB. First pull takes 5–10 minutes (pod shows `ContainerCreating`).
+
+## Step 5: Verification
+
+```bash
+kubectl get pods -n rtgen       # Should show 1/1 Running
+kubectl get svc -n rtgen        # Should show LoadBalancer with external IP
+curl http://<IP>:8081/          # Should return HTTP 200
+```
+
+## Final Result
+
+| Property | Value |
+|----------|-------|
+| Cluster | `hqiu-streamwise-aks-cluster-cluster` |
+| Region | SwedenCentral |
+| GPU Nodes | 4 × Standard_ND96isrf_H100_v5 (32 H100 GPUs) |
+| Public IP | 4.223.71.250 |
+| StreamWise URL | http://4.223.71.250:8081 |
+| FQDN | http://streamwise-fnv3ci.swedencentral.cloudapp.azure.com:8081 |
+
+## Auto-Deployment Feature
+
+From the StreamWise web UI at port 8081, you can deploy all GPU model services with one click,
+or via REST API:
+
+```bash
+# Deploy all services
+curl -X POST "http://4.223.71.250:8081/api/service"
+
+# Deploy individual services
+curl -X POST "http://4.223.71.250:8081/api/pod" \
+  -d "container_name=kokoro" -d "gpu=1" -d "memory=8" -d "cpu=2"
+
+# List deployed services
+curl "http://4.223.71.250:8081/api/services"
+```
+
+## Step 6: Rebuilding the Image for New Features
+
+When the deployed image is stale (e.g., missing the auto-deploy feature from a newer branch),
+rebuild and push from a local Docker install or use ACR Tasks.
+
+### Local Docker Build (recommended on Windows)
+
+```powershell
+# Prepare build context (see deployment/setup_image.sh for the full script)
+# Fix CRLF line endings in bash files before building on Windows:
+$content = [System.IO.File]::ReadAllText("deployment\streamwise\docker_files\run_httpserver.bash")
+$content = $content -replace "`r`n", "`n"
+[System.IO.File]::WriteAllText("deployment\streamwise\docker_files\run_httpserver.bash", $content, [System.Text.UTF8Encoding]::new($false))
+
+# Build and push
+docker buildx build --platform linux/amd64 `
+  --build-arg DOCKER_REPO=inigogrtgen-cjd9f3dydte2bzbb.azurecr.io `
+  --build-arg BASE_TAG=v0.5.0 `
+  -t "inigogrtgen-cjd9f3dydte2bzbb.azurecr.io/streamwise:v0.6.2-autodeploy" `
+  "deployment\streamwise\docker_files" --push
+```
+
+### ACR Cloud Build (broken on Windows due to Unicode encoding)
+
+`az acr build` streams build logs through the Azure CLI, which crashes on Windows cp1252
+terminals when pip outputs Unicode progress bars. Use local Docker build instead.
+
+### Redeploy the Pod
+
+```powershell
+kubectl delete pod streamwise -n rtgen --force --grace-period=0
+# Re-apply YAML with updated image tag
+```
+
+### Gotcha: CRLF Line Endings in Bash Scripts
+
+When building Docker images on Windows, `COPY *.bash .` preserves CRLF line endings.
+Linux containers then fail with `$'\r': command not found`. **Always convert to LF** before
+building, e.g. with a `dos2unix` step or a `.gitattributes` rule such as `*.bash text eol=lf`.
+
+### Gotcha: Simulator Data Path in Docker
+
+The Dockerfile copies the simulator into `/streamwise/simulator/`, but `allocator_bridge.py`
+originally resolved the data path as `os.path.dirname(__file__) + "/../simulator/data"` which
+evaluates to `/simulator/data/` (doesn't exist). Fixed by changing to:
+
+```python
+default_path = os.path.join(os.path.dirname(__file__), "simulator", "data")
+```
+
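+This resolves relative to `__file__` (not the cwd): in Docker, where `allocator_bridge.py` lives in `/streamwise/`, it yields `/streamwise/simulator/data`. In a local checkout `simulator/` sits at the repo root rather than under `streamwise/`, so set `SIMULATOR_DATA_DIR` there.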
+
+## Key Lessons Learned
+
+1. **Always check CCC capacity before probing** — saves time and money on doomed create attempts.
+2. **CCC ≠ subscription access** — fleet capacity doesn't mean your subscription can use it.
+3. **ACR login servers may have hashed suffixes** — always `az acr show --query loginServer`.
+4. **ACR role assignments are not idempotent** — re-running the deployment fails with `RoleAssignmentUpdateNotPermitted` when the assignment already exists, but the cluster itself is unaffected.
+5. **Spot VMs can be evicted** — plan for re-scaling and MIG reconfiguration after eviction.
+6. **Image pulls are slow** — ~9 GB images take 5–10 min on first pull; be patient.
+7. **Zone mapping is opaque** — CCC AZ03 doesn't necessarily mean subscription zone 3.
+8. **CRLF kills Linux containers** — always fix line endings when building on Windows.
+9. **ACR cloud build broken on Windows** — use local Docker build + push instead.
+10. **Relative paths shift in Docker** — verify path assumptions match the container layout (`WORKDIR` + `COPY` targets).

From 9d1637bcc26ab1f9ac732bb9a9c30d02138b51b2 Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Tue, 9 Jun 2026 13:38:06 -0700
Subject: [PATCH 23/39] Fixed lint and comments

---
 .github/streamwise-demo-skills.md |  6 +--
 deployment/aks/README.md          |  2 +-
 streamwise/allocator_bridge.py    | 58 +++++++------------------
 streamwise/container_config.py    | 70 +++++++++++++++++++++++++++++++
 streamwise/streamwise.py          | 44 ++++++++++---------
 5 files changed, 114 insertions(+), 66 deletions(-)
 create mode 100644 streamwise/container_config.py

diff --git a/.github/streamwise-demo-skills.md b/.github/streamwise-demo-skills.md
index 679f3fa1..8d9c8cc0 100644
--- a/.github/streamwise-demo-skills.md
+++ b/.github/streamwise-demo-skills.md
@@ -40,7 +40,7 @@ For `Standard_ND96isrf_H100_v5`:
 
 ### CCC Capacity Data
 
-```
+```bash
 Region          | SKU                          | AllocableSpotVMs
 swedencentral   | Standard_ND96isrf_H100_v5    | 6
 eastus          | Standard_ND96isr_H100_v5     | 16 (but sub is blocked)
@@ -139,8 +139,8 @@ curl http://<IP>:8081/          # Should return HTTP 200
 | Region | SwedenCentral |
 | GPU Nodes | 4 × Standard_ND96isrf_H100_v5 (32 H100 GPUs) |
 | Public IP | 4.223.71.250 |
-| StreamWise URL | http://4.223.71.250:8081 |
-| FQDN | http://streamwise-fnv3ci.swedencentral.cloudapp.azure.com:8081 |
+| StreamWise URL | http://<IP>:8081 |
+| FQDN | http://<ADDRESS>:8081 |
 
 ## Auto-Deployment Feature
 
diff --git a/deployment/aks/README.md b/deployment/aks/README.md
index d9f10be2..6daf9743 100644
--- a/deployment/aks/README.md
+++ b/deployment/aks/README.md
@@ -268,7 +268,7 @@ kubectl get nodes -o custom-columns="NAME:.metadata.name,GPU:.status.allocatable
 ```
 
 Expected output for 4× H100 nodes:
-```
+```bash
 NAME                            GPU   LABEL
 aks-gpuh100-xxxxx-vmss000000    8     NVIDIA-H100-80GB-HBM3
 aks-gpuh100-xxxxx-vmss000001    8     NVIDIA-H100-80GB-HBM3
diff --git a/streamwise/allocator_bridge.py b/streamwise/allocator_bridge.py
index 90dce1bc..0b582cdf 100644
--- a/streamwise/allocator_bridge.py
+++ b/streamwise/allocator_bridge.py
@@ -14,6 +14,9 @@
 _HERE = os.path.dirname(os.path.abspath(__file__))
 if _HERE not in sys.path:
     sys.path.insert(0, _HERE)
+_REPO_ROOT = os.path.dirname(_HERE)
+if _REPO_ROOT not in sys.path:
+    sys.path.insert(0, _REPO_ROOT)
 import model_provisioner  # noqa: E402, F401 — adds simulator/ to sys.path
 
 from dataclasses import dataclass
@@ -24,8 +27,15 @@
 from sim_types import Result
 
 from auto_model_allocator import AutoModelAllocator
+from container_config import COLOCATED_CONTAINERS
+from container_config import CONTAINER_RESOURCES
+from container_config import GPU_TYPE_TO_POD_STR
+from container_config import MIG_AVAILABLE
+from container_config import MIG_CAPABLE_GPU_TYPES
+from container_config import MIG_CONTAINERS
 from data_loading import load_latency_data
 from model_provisioner.policies import STREAMWISE_POLICY
+from streamwise_apps import STREAMWISE_APPS
 from workflows import WORKFLOWS
 
 
@@ -42,49 +52,6 @@
     Model.OTHERS: ["kokoro", "yolo"],
 }
 
-# Default CPU/memory/storage for each container when deployed via auto-deploy.
-# Format: (cpu_cores, memory_gib, ephemeral_storage_gib)
-CONTAINER_RESOURCES: dict[str, tuple[int, int, int]] = {
-    "gemma": (16, 192, 64),
-    "flux": (12, 128, 64),
-    "hunyuanframepackf1": (24, 128, 64),
-    "hunyuanframepackvae": (4, 32, 16),
-    "fantasytalking": (12, 192, 64),
-    "realesrgan": (4, 32, 16),
-    "kokoro": (2, 8, 16),
-    "yolo": (4, 8, 16),
-}
-
-# GPU type string used by pod_manager (lowercase)
-GPU_TYPE_TO_POD_STR: dict[GPUType, str] = {
-    GPUType.A100: "a100",
-    GPUType.H100: "h100",
-    GPUType.H200: "h200",
-    GPUType.GB200: "gb200",
-}
-
-# MIG is only supported by pod_manager on these GPU types.
-MIG_CAPABLE_GPU_TYPES: frozenset[GPUType] = frozenset({GPUType.A100, GPUType.H100})
-
-# Containers that prefer a MIG slice when the selected GPU type supports MIG.
-# When MIG is available on the cluster, these services use a MIG slice (shared GPU).
-# When MIG is NOT available, they fall back to 1 full GPU each and the extra GPUs
-# are counted against the budget (with a warning if exceeded).
-MIG_CONTAINERS: dict[str, str] = {
-    "kokoro": "1g.10gb",
-    "yolo": "1g.10gb",
-    "realesrgan": "1g.10gb",
-}
-
-# Containers that are co-located with their parent model (sharing GPUs on the same server).
-# The allocator counts their GPUs as part of the parent model's allocation, so they should
-# deploy with gpu=0 to avoid double-counting.
-COLOCATED_CONTAINERS: frozenset[str] = frozenset({"hunyuanframepackvae"})
-
-# Whether MIG is actually configured on the cluster.
-# When False, MIG_CONTAINERS entries fall back to full GPUs.
-MIG_AVAILABLE: bool = False
-
 
 def get_mig_profile(container_name: str, gpu_type: GPUType) -> Optional[str]:
     """Return a MIG profile only when MIG is available and the GPU type supports it."""
@@ -108,6 +75,11 @@ def get_mig_profile(container_name: str, gpu_type: GPUType) -> Optional[str]:
     "streamedit": "editing",
 }
 
+# Ensure allocator knows about all StreamWise apps (catch drift early).
+assert set(APP_TO_WORKFLOW.keys()) == set(STREAMWISE_APPS), (
+    f"APP_TO_WORKFLOW keys {set(APP_TO_WORKFLOW.keys())} != STREAMWISE_APPS {set(STREAMWISE_APPS)}"
+)
+
 
 @dataclass
 class DeploymentSpec:
diff --git a/streamwise/container_config.py b/streamwise/container_config.py
new file mode 100644
index 00000000..a9da9ced
--- /dev/null
+++ b/streamwise/container_config.py
@@ -0,0 +1,70 @@
+"""
+Shared container deployment configuration for StreamWise.
+
+Central source of truth for container resource defaults, MIG profiles,
+GPU type mappings, and related deployment constants. Both allocator_bridge
+and streamwise.py import from here to avoid duplication.
+"""
+
+from __future__ import annotations
+
+import sys
+import os
+
+_HERE = os.path.dirname(os.path.abspath(__file__))
+if _HERE not in sys.path:
+    sys.path.insert(0, _HERE)
+
+_REPO_ROOT = os.path.dirname(_HERE)
+if _REPO_ROOT not in sys.path:
+    sys.path.insert(0, _REPO_ROOT)
+
+# model_provisioner import adds simulator/ to sys.path
+import model_provisioner  # noqa: E402, F401
+
+from sim_types import GPUType  # noqa: E402
+
+
+# Default CPU/memory/storage for each container when deployed via auto-deploy.
+# Format: (cpu_cores, memory_gib, ephemeral_storage_gib)
+# Keep in sync with the Helm values in deployment/helm/values.yaml.
+CONTAINER_RESOURCES: dict[str, tuple[int, int, int]] = {
+    "gemma": (16, 192, 64),
+    "flux": (12, 128, 64),
+    "hunyuanframepackf1": (24, 128, 64),
+    "hunyuanframepackvae": (4, 32, 16),
+    "fantasytalking": (12, 192, 64),
+    "realesrgan": (4, 32, 16),
+    "kokoro": (2, 8, 16),
+    "yolo": (4, 8, 16),
+}
+
+# GPU type string used by pod_manager (lowercase).
+GPU_TYPE_TO_POD_STR: dict[GPUType, str] = {
+    GPUType.A100: "a100",
+    GPUType.H100: "h100",
+    GPUType.H200: "h200",
+    GPUType.GB200: "gb200",
+}
+
+# MIG is only supported by pod_manager on these GPU types.
+MIG_CAPABLE_GPU_TYPES: frozenset[GPUType] = frozenset({GPUType.A100, GPUType.H100})
+
+# Containers that prefer a MIG slice when the selected GPU type supports MIG.
+# When MIG is available on the cluster, these services use a MIG slice (shared GPU).
+# When MIG is NOT available, they fall back to 1 full GPU each and the extra GPUs
+# are counted against the budget (with a warning if exceeded).
+MIG_CONTAINERS: dict[str, str] = {
+    "kokoro": "1g.10gb",
+    "yolo": "1g.10gb",
+    "realesrgan": "1g.10gb",
+}
+
+# Containers that are co-located with their parent model (sharing GPUs on the same server).
+# The allocator counts their GPUs as part of the parent model's allocation, so they should
+# deploy with gpu=0 to avoid double-counting.
+COLOCATED_CONTAINERS: frozenset[str] = frozenset({"hunyuanframepackvae"})
+
+# Whether MIG is actually configured on the cluster.
+# When False, MIG_CONTAINERS entries fall back to full GPUs.
+MIG_AVAILABLE: bool = False
diff --git a/streamwise/streamwise.py b/streamwise/streamwise.py
index 02ef92cd..ffedb40c 100644
--- a/streamwise/streamwise.py
+++ b/streamwise/streamwise.py
@@ -35,6 +35,8 @@
 import node_manager
 import job_manager
 import allocator_bridge
+from container_config import CONTAINER_RESOURCES
+from container_config import MIG_CONTAINERS
 
 from service_manager import get_services
 from service_manager import get_service_timestamps
@@ -595,25 +597,29 @@ async def api_add_service(
 ) -> QuartReturn:
     """API interface to add pods for all services."""
     try:
-        # CPU, memory GiB, ephemeral storage GiB, GPU count, GPU type
-        # Keep in sync with the helm values
-        container_dict: dict[str, tuple[int, int, int, Union[int, str]]] = {
-            "podcasttranscript": (1, 4, 16, 0),
-            "slidetranscript": (1, 4, 16, 0),
-            "gemma": (16, 192, 64, min(2, max_gpus)),
-            # "hunyuanframepackf1": (32, 192, 64, min(2, max_gpus)),
-            "hunyuanframepackf1": (24, 128, 64, min(2, max_gpus)),
-            "hunyuanframepackvae": (4, 32, 16, 1),
-            # "flux": (16, 192, 64, min(2, max_gpus)),
-            "flux": (12, 128, 64, min(2, max_gpus)),
-            "fluxkontext": (12, 128, 64, 1),
-            # "fantasytalking": (16, 256, 64, min(2, max_gpus)),
-            "fantasytalking": (12, 192, 64, min(2, max_gpus)),
-            "realesrgan": (4, 32, 16, "1g.10gb"),
-            "yolo": (4, 8, 16, "1g.10gb"),
-            "kokoro": (2, 8, 16, "1g.10gb"),
-            "whisper": (2, 8, 16, 1),
-        }
+        # Build container_dict from shared constants (CONTAINER_RESOURCES + MIG_CONTAINERS).
+        # Format: container_name -> (cpu, memory_gib, ephemeral_storage_gib, gpu_info)
+        # gpu_info is either an int (GPU count) or a MIG profile string.
+        container_dict: dict[str, tuple[int, int, int, Union[int, str]]] = {}
+
+        # Services not in CONTAINER_RESOURCES (CPU-only or extra services)
+        container_dict["podcasttranscript"] = (1, 4, 16, 0)
+        container_dict["slidetranscript"] = (1, 4, 16, 0)
+
+        for name, (cpu, mem, storage) in CONTAINER_RESOURCES.items():
+            if name in MIG_CONTAINERS:
+                container_dict[name] = (cpu, mem, storage, MIG_CONTAINERS[name])
+            else:
+                container_dict[name] = (cpu, mem, storage, min(2, max_gpus))
+
+        # Additional services not covered by CONTAINER_RESOURCES
+        container_dict["fluxkontext"] = (12, 128, 64, 1)
+        container_dict["whisper"] = (2, 8, 16, 1)
+
+        # hunyuanframepackvae uses exactly 1 GPU (not scaled by max_gpus)
+        cpu, mem, storage = CONTAINER_RESOURCES["hunyuanframepackvae"]
+        container_dict["hunyuanframepackvae"] = (cpu, mem, storage, 1)
+
         for container_name, (cpu, mem_gib, sotrage_gib, gpu_info) in container_dict.items():
             num_gpus, mig_profile = parse_gpu_info(gpu_info)
             await pod_manager.add_pod(

From 8886bafc69d773c356a2c9eac8c3958673d73807 Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Tue, 9 Jun 2026 14:06:52 -0700
Subject: [PATCH 24/39] Fixed data path

---
 streamwise/allocator_bridge.py                  | 2 +-
 tests/streamwise/test_streamwise_auto_deploy.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/streamwise/allocator_bridge.py b/streamwise/allocator_bridge.py
index 0b582cdf..543eb96a 100644
--- a/streamwise/allocator_bridge.py
+++ b/streamwise/allocator_bridge.py
@@ -104,7 +104,7 @@ class DeploymentPlan:
 
 def _get_data_dir() -> str:
     """Get the path to the simulator data directory."""
-    default_path = os.path.join(os.path.dirname(__file__), "simulator", "data")
+    default_path = os.path.join(_REPO_ROOT, "simulator", "data")
     return os.getenv("SIMULATOR_DATA_DIR", default_path)
 
 
diff --git a/tests/streamwise/test_streamwise_auto_deploy.py b/tests/streamwise/test_streamwise_auto_deploy.py
index 6496c87f..7c6b630e 100644
--- a/tests/streamwise/test_streamwise_auto_deploy.py
+++ b/tests/streamwise/test_streamwise_auto_deploy.py
@@ -193,7 +193,7 @@ async def test_auto_deploy_confirm_success() -> None:
             "mig_profile": None,
         },
     ]
-    with patch("streamwise.pod_manager.add_pod") as mock_add_pod:
+    with patch.object(sw.pod_manager, "add_pod") as mock_add_pod:
         response = await client.post(
             "/api/auto_deploy/confirm",
             json={"specs": specs},

From cb82b02a010d50c15aba02efd97b384689e49d63 Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Tue, 9 Jun 2026 15:29:37 -0700
Subject: [PATCH 25/39] Move auto-deploy to standalone page and deploy app
 container

- Add /auto_deploy route with standalone auto_deploy.html page
- Add robot icon + 'Auto Deploy' button on main index page above Applications
- Remove auto-deploy section from add_pod.html (wrappers/apps add page)
- Confirm endpoint now also deploys the application container (e.g., streamcast)
  when a workflow name is provided in the request

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 streamwise/streamwise.py              |  61 +++++--
 streamwise/templates/add_pod.html     | 224 -----------------------
 streamwise/templates/auto_deploy.html | 250 ++++++++++++++++++++++++++
 streamwise/templates/index.html       |  12 ++
 4 files changed, 309 insertions(+), 238 deletions(-)
 create mode 100644 streamwise/templates/auto_deploy.html

diff --git a/streamwise/streamwise.py b/streamwise/streamwise.py
index ffedb40c..5662e66a 100644
--- a/streamwise/streamwise.py
+++ b/streamwise/streamwise.py
@@ -548,6 +548,12 @@ async def add_pod(service_name: str) -> str:
     )
 
 
+@route("/auto_deploy", methods=["GET"])
+async def auto_deploy_page() -> str:
+    """Render the standalone auto-deploy page."""
+    return await render_template("auto_deploy.html")
+
+
 @route("/api/pod/<pod_name>", methods=["DELETE"])
 async def api_remove_pod(pod_name: str) -> QuartReturn:
     """API interface to remove a pod by name."""
@@ -798,21 +804,12 @@ async def api_auto_deploy_confirm() -> QuartReturn:
 
     Expects JSON body:
         {
-            "specs": [
-                {
-                    "container_name": "gemma",
-                    "cpu": 16,
-                    "memory_gib": 192,
-                    "ephemeral_storage_gib": 64,
-                    "gpu": 2,
-                    "gpu_type": "a100",
-                    "mig_profile": null
-                },
-                ...
-            ]
+            "specs": [...],
+            "workflow": "streamcast"  (optional: also deploys the application container)
         }
 
-    Deploys all containers in the plan.
+    Deploys all model wrapper containers in the plan, plus the application
+    container if a workflow name is provided.
     """
     try:
         data = await request.get_json()
@@ -823,6 +820,8 @@ async def api_auto_deploy_confirm() -> QuartReturn:
         if not specs or not isinstance(specs, list):
             return jsonify({"error": "Missing or invalid 'specs' field"}), HTTPStatus.BAD_REQUEST
 
+        workflow = data.get("workflow")
+
         deployed: List[str] = []
         errors: List[str] = []
 
@@ -864,11 +863,45 @@ async def api_auto_deploy_confirm() -> QuartReturn:
                 logging.error(msg)
                 errors.append(msg)
 
+        # Also deploy the application container if workflow is specified
+        if workflow and workflow in STREAMWISE_APPS:
+            try:
+                add_pod_result = await pod_manager.add_pod(
+                    container_name=workflow,
+                    cpu=4,
+                    memory_gib=16,
+                    ephemeral_storage_gib=16,
+                    gpu=0,
+                    gpu_type=None,
+                    mig_profile=None,
+                    namespace=NAMESPACE,
+                    k8s_cluster=k8s_cluster,
+                )
+                status_code = HTTPStatus.OK
+                if isinstance(add_pod_result, tuple) and len(add_pod_result) >= 2:
+                    status_value = add_pod_result[1]
+                    if isinstance(status_value, HTTPStatus):
+                        status_code = status_value
+                    elif isinstance(status_value, int):
+                        status_code = HTTPStatus(status_value)
+                if status_code >= HTTPStatus.BAD_REQUEST:
+                    msg = f"Failed to deploy app '{workflow}' (status={int(status_code)})"
+                    logging.error(msg)
+                    errors.append(msg)
+                else:
+                    deployed.append(workflow)
+            except Exception as app_ex:
+                msg = f"Failed to deploy app '{workflow}': {app_ex}"
+                logging.error(msg)
+                errors.append(msg)
+
+        total_deployed = len(deployed)
+        total_specs = len(specs) + (1 if workflow and workflow in STREAMWISE_APPS else 0)
         status = HTTPStatus.OK if not errors else HTTPStatus.MULTI_STATUS
         return jsonify({
             "deployed": deployed,
             "errors": errors,
-            "message": f"Deployed {len(deployed)}/{len(specs)} containers.",
+            "message": f"Deployed {total_deployed}/{total_specs} containers.",
         }), status
 
     except Exception as ex:
diff --git a/streamwise/templates/add_pod.html b/streamwise/templates/add_pod.html
index 94b980ad..d61952aa 100644
--- a/streamwise/templates/add_pod.html
+++ b/streamwise/templates/add_pod.html
@@ -384,95 +384,6 @@ <h2 class="mt-5">🧩 Applications</h2>
             </form>
         {% endif %}
 
-        <!-- Auto-Deploy Section -->
-        <h2 class="mt-5">🤖 Auto Deploy</h2>
-        <p>Specify your GPU budget and the optimizer will determine the best allocation for each component:</p>
-
-        <form id="auto-deploy-form">
-            <fieldset class="border rounded-3 p-3 mb-4">
-                <legend class="float-none w-auto px-2 fw-semibold">
-                    💰 GPU Budget
-                </legend>
-                <div class="row g-3 mb-3">
-                    <div class="col-md-3">
-                        <label for="auto_gpu_a100" class="form-label">A100</label>
-                        <input type="number" class="form-control" id="auto_gpu_a100" name="gpu_a100"
-                            min="0" max="64" value="0">
-                    </div>
-                    <div class="col-md-3">
-                        <label for="auto_gpu_h100" class="form-label">H100</label>
-                        <input type="number" class="form-control" id="auto_gpu_h100" name="gpu_h100"
-                            min="0" max="64" value="0">
-                    </div>
-                    <div class="col-md-3">
-                        <label for="auto_gpu_h200" class="form-label">H200</label>
-                        <input type="number" class="form-control" id="auto_gpu_h200" name="gpu_h200"
-                            min="0" max="64" value="0">
-                    </div>
-                    <div class="col-md-3">
-                        <label for="auto_gpu_gb200" class="form-label">GB200</label>
-                        <input type="number" class="form-control" id="auto_gpu_gb200" name="gpu_gb200"
-                            min="0" max="64" value="0">
-                    </div>
-                </div>
-            </fieldset>
-
-            <fieldset class="border rounded-3 p-3 mb-4">
-                <legend class="float-none w-auto px-2 fw-semibold">
-                    🎬 Workflow
-                </legend>
-                <div class="mb-3">
-                    <label for="auto_workflow" class="form-label">Application workflow</label>
-                    <select class="form-select" id="auto_workflow" name="workflow">
-                        <option value="streamcast" selected>🎙️ StreamCast (Podcast)</option>
-                        <option value="streampersona">👤 StreamPersona (Slide)</option>
-                        <option value="streamchat">💬 StreamChat (Video Chat)</option>
-                        <option value="streamshort">🎬 StreamShort (Shorts)</option>
-                        <option value="streammovie">🎬 StreamMovie (Movie)</option>
-                        <option value="streamanimate">🎞️ StreamAnimate (Story)</option>
-                        <option value="streamlecture">📚 StreamLecture (Lecture)</option>
-                        <option value="streamdub">🎤 StreamDub (Dubbing)</option>
-                        <option value="streamedit">✂️ StreamEdit (Editing)</option>
-                    </select>
-                </div>
-            </fieldset>
-
-            <div class="text-end mb-3">
-                <button type="submit" class="btn btn-warning" style="width: 200px;"
-                    id="auto-deploy-optimize-btn">
-                    🤖 Optimize
-                </button>
-            </div>
-        </form>
-
-        <!-- Auto-deploy results (hidden until optimize is clicked) -->
-        <div id="auto-deploy-results" style="display:none;">
-            <h4>📊 Optimized Deployment Plan</h4>
-            <div id="auto-deploy-warning" class="alert alert-warning mb-3" style="display:none;"></div>
-            <div id="auto-deploy-metrics" class="alert alert-success mb-3"></div>
-            <table class="table table-sm table-bordered" id="auto-deploy-plan-table">
-                <thead>
-                    <tr>
-                        <th>Container</th>
-                        <th>GPU</th>
-                        <th>GPU Type</th>
-                        <th>CPU</th>
-                        <th>Memory</th>
-                        <th>MIG</th>
-                    </tr>
-                </thead>
-                <tbody id="auto-deploy-plan-body"></tbody>
-            </table>
-            <div class="text-end">
-                <button type="button" class="btn btn-success" style="width: 200px;"
-                    id="auto-deploy-confirm-btn">
-                    ✅ Confirm Deploy
-                </button>
-            </div>
-        </div>
-
-        <div id="auto-deploy-error" class="alert alert-danger mt-3" style="display:none;"></div>
-
         <script src="{{ url_for('static', filename='js/form-utils.js') }}"></script>
         <script>
             // Keep aligned with deployment/helm/values.yaml and services.json
@@ -774,141 +685,6 @@ <h4>📊 Optimized Deployment Plan</h4>
                     });
                 });
             }
-            // Auto-Deploy
-            const autoDeployForm = document.getElementById('auto-deploy-form');
-            if (autoDeployForm) {
-                let currentPlan = null;
-
-                // Auto-populate GPU budget from cluster state
-                fetch('/api/auto_deploy/cluster_gpus', {credentials: 'same-origin'})
-                    .then(r => r.json())
-                    .then(data => {
-                        if (data.gpu_budget) {
-                            const fieldMap = {
-                                'A100': 'auto_gpu_a100',
-                                'H100': 'auto_gpu_h100',
-                                'H200': 'auto_gpu_h200',
-                                'GB200': 'auto_gpu_gb200',
-                            };
-                            for (const [gpuType, fieldId] of Object.entries(fieldMap)) {
-                                const el = document.getElementById(fieldId);
-                                if (el && data.gpu_budget[gpuType] !== undefined) {
-                                    el.value = data.gpu_budget[gpuType];
-                                }
-                            }
-                        }
-                    })
-                    .catch(err => console.warn('Could not auto-populate GPU budget:', err));
-
-                autoDeployForm.addEventListener('submit', function(e) {
-                    e.preventDefault();
-                    const btn = document.getElementById('auto-deploy-optimize-btn');
-                    btn.disabled = true;
-                    btn.textContent = '⏳ Optimizing...';
-
-                    const gpuBudget = {
-                        'A100': parseInt(document.getElementById('auto_gpu_a100').value) || 0,
-                        'H100': parseInt(document.getElementById('auto_gpu_h100').value) || 0,
-                        'H200': parseInt(document.getElementById('auto_gpu_h200').value) || 0,
-                        'GB200': parseInt(document.getElementById('auto_gpu_gb200').value) || 0,
-                    };
-                    const workflow = document.getElementById('auto_workflow').value;
-
-                    const errorDiv = document.getElementById('auto-deploy-error');
-                    const resultsDiv = document.getElementById('auto-deploy-results');
-                    errorDiv.style.display = 'none';
-                    resultsDiv.style.display = 'none';
-
-                    fetch('/api/auto_deploy', {
-                        method: 'POST',
-                        headers: {'Content-Type': 'application/json'},
-                        body: JSON.stringify({gpu_budget: gpuBudget, workflow: workflow}),
-                        credentials: 'same-origin'
-                    })
-                    .then(response => response.json().then(data => ({ok: response.ok, data})))
-                    .then(({ok, data}) => {
-                        btn.disabled = false;
-                        btn.textContent = '🤖 Optimize';
-                        if (!ok) {
-                            errorDiv.textContent = data.error || 'Unknown error';
-                            errorDiv.style.display = '';
-                            return;
-                        }
-                        currentPlan = data;
-                        // Show warnings if any
-                        const warningDiv = document.getElementById('auto-deploy-warning');
-                        if (data.warnings && data.warnings.length > 0) {
-                            warningDiv.innerHTML = data.warnings.map(w =>
-                                `⚠️ ${escapeHtml(w)}`).join('<br>');
-                            warningDiv.style.display = '';
-                        } else {
-                            warningDiv.style.display = 'none';
-                        }
-                        // Show metrics
-                        const metrics = data.metrics;
-                        const actualGpus = metrics.actual_gpus_needed
-                            ? JSON.stringify(metrics.actual_gpus_needed) : JSON.stringify(metrics.gpus_used);
-                        document.getElementById('auto-deploy-metrics').innerHTML =
-                            `<strong>Total Time:</strong> ${metrics.total_time_s}s &nbsp;|&nbsp; ` +
-                            `<strong>TTFF:</strong> ${metrics.ttff_s}s &nbsp;|&nbsp; ` +
-                            `<strong>Cost:</strong> $${metrics.cost} &nbsp;|&nbsp; ` +
-                            `<strong>GPUs Needed:</strong> ${actualGpus}` +
-                            (metrics.budget_exceeded ? ' <span class="text-danger">(exceeds budget!)</span>' : '');
-                        // Show plan table
-                        const tbody = document.getElementById('auto-deploy-plan-body');
-                        tbody.innerHTML = '';
-                        data.specs.forEach(spec => {
-                            const row = document.createElement('tr');
-                            row.innerHTML =
-                                `<td>${escapeHtml(spec.container_name)}</td>` +
-                                `<td>${spec.gpu}</td>` +
-                                `<td>${escapeHtml(spec.gpu_type || 'any')}</td>` +
-                                `<td>${spec.cpu}</td>` +
-                                `<td>${spec.memory_gib} GiB</td>` +
-                                `<td>${spec.mig_profile || '-'}</td>`;
-                            tbody.appendChild(row);
-                        });
-                        resultsDiv.style.display = '';
-                    })
-                    .catch(err => {
-                        btn.disabled = false;
-                        btn.textContent = '🤖 Optimize';
-                        errorDiv.textContent = 'Network error: ' + err;
-                        errorDiv.style.display = '';
-                    });
-                });
-
-                document.getElementById('auto-deploy-confirm-btn').addEventListener('click', function() {
-                    if (!currentPlan || !currentPlan.specs) return;
-
-                    const btn = this;
-                    btn.disabled = true;
-                    btn.textContent = '⏳ Deploying...';
-
-                    fetch('/api/auto_deploy/confirm', {
-                        method: 'POST',
-                        headers: {'Content-Type': 'application/json'},
-                        body: JSON.stringify({specs: currentPlan.specs}),
-                        credentials: 'same-origin'
-                    })
-                    .then(response => response.json().then(data => ({ok: response.ok, data})))
-                    .then(({ok, data}) => {
-                        btn.disabled = false;
-                        btn.textContent = '✅ Confirm Deploy';
-                        if (data.errors && data.errors.length > 0) {
-                            alert('Deployed ' + data.deployed.length + ' containers.\nErrors:\n' + data.errors.join('\n'));
-                        } else {
-                            alert(data.message || 'Deployment complete!');
-                        }
-                        window.location.href = '/';
-                    })
-                    .catch(err => {
-                        btn.disabled = false;
-                        btn.textContent = '✅ Confirm Deploy';
-                        alert('Error: ' + err);
-                    });
-                });
-            }
         </script>
         <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.5/dist/js/bootstrap.bundle.min.js"
             integrity="sha384-k6d4wzSIapyDyv1kpU366/PK5hCdSbCRGRCMv+eplOQJWyd1fbcAu9OCUj5zNLiq"
diff --git a/streamwise/templates/auto_deploy.html b/streamwise/templates/auto_deploy.html
new file mode 100644
index 00000000..3b141aeb
--- /dev/null
+++ b/streamwise/templates/auto_deploy.html
@@ -0,0 +1,250 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="utf-8">
+    <title>Auto Deploy - StreamWise</title>
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.5/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-SgOJa3DmI69IUzQ2PVdRZhwQ+dy64/BUtbMJw1MZ8t5HZApcHrRKUc4W0kG879m7" crossorigin="anonymous">
+    <link rel="stylesheet" href="{{ url_for('static', filename='style.css') }}">
+    <link rel="icon" href="data:image/svg+xml,<svg xmlns=%22http://www.w3.org/2000/svg%22 viewBox=%220 0 100 100%22><text y=%22.9em%22 font-size=%2290%22>🎬</text></svg>">
+</head>
+
+<body>
+    <div class="fixed-nav-buttons">
+        <a class="btn btn-outline-primary" href="/" title="Home">🏠</a>
+    </div>
+
+    <div class="container">
+        <h1 class="mt-4 mb-3">🤖 Auto Deploy</h1>
+        <p class="text-muted">
+            Specify your GPU budget and workflow. The optimizer will determine the best model allocation
+            and deploy all model wrappers plus the application container.
+        </p>
+
+        <form id="auto-deploy-form">
+            <fieldset class="border rounded-3 p-3 mb-4">
+                <legend class="float-none w-auto px-2 fw-semibold">
+                    💰 GPU Budget
+                </legend>
+                <div class="row g-3 mb-3">
+                    <div class="col-md-3">
+                        <label for="auto_gpu_a100" class="form-label">A100</label>
+                        <input type="number" class="form-control" id="auto_gpu_a100" name="gpu_a100"
+                            min="0" max="64" value="0">
+                    </div>
+                    <div class="col-md-3">
+                        <label for="auto_gpu_h100" class="form-label">H100</label>
+                        <input type="number" class="form-control" id="auto_gpu_h100" name="gpu_h100"
+                            min="0" max="64" value="0">
+                    </div>
+                    <div class="col-md-3">
+                        <label for="auto_gpu_h200" class="form-label">H200</label>
+                        <input type="number" class="form-control" id="auto_gpu_h200" name="gpu_h200"
+                            min="0" max="64" value="0">
+                    </div>
+                    <div class="col-md-3">
+                        <label for="auto_gpu_gb200" class="form-label">GB200</label>
+                        <input type="number" class="form-control" id="auto_gpu_gb200" name="gpu_gb200"
+                            min="0" max="64" value="0">
+                    </div>
+                </div>
+            </fieldset>
+
+            <fieldset class="border rounded-3 p-3 mb-4">
+                <legend class="float-none w-auto px-2 fw-semibold">
+                    🎬 Workflow
+                </legend>
+                <div class="mb-3">
+                    <label for="auto_workflow" class="form-label">Application workflow</label>
+                    <select class="form-select" id="auto_workflow" name="workflow">
+                        <option value="streamcast" selected>🎙️ StreamCast (Podcast)</option>
+                        <option value="streampersona">👤 StreamPersona (Slide)</option>
+                        <option value="streamchat">💬 StreamChat (Video Chat)</option>
+                        <option value="streamshort">🎬 StreamShort (Shorts)</option>
+                        <option value="streammovie">🎬 StreamMovie (Movie)</option>
+                        <option value="streamanimate">🎞️ StreamAnimate (Story)</option>
+                        <option value="streamlecture">📚 StreamLecture (Lecture)</option>
+                        <option value="streamdub">🎤 StreamDub (Dubbing)</option>
+                        <option value="streamedit">✂️ StreamEdit (Editing)</option>
+                    </select>
+                </div>
+            </fieldset>
+
+            <div class="text-end mb-3">
+                <button type="submit" class="btn btn-warning" style="width: 200px;"
+                    id="auto-deploy-optimize-btn">
+                    🤖 Optimize
+                </button>
+            </div>
+        </form>
+
+        <div id="auto-deploy-results" style="display:none;">
+            <h4>📊 Optimized Deployment Plan</h4>
+            <div id="auto-deploy-warning" class="alert alert-warning mb-3" style="display:none;"></div>
+            <div id="auto-deploy-metrics" class="alert alert-success mb-3"></div>
+            <table class="table table-sm table-bordered" id="auto-deploy-plan-table">
+                <thead>
+                    <tr>
+                        <th>Container</th>
+                        <th>GPU</th>
+                        <th>GPU Type</th>
+                        <th>CPU</th>
+                        <th>Memory</th>
+                        <th>MIG</th>
+                    </tr>
+                </thead>
+                <tbody id="auto-deploy-plan-body"></tbody>
+            </table>
+            <div class="text-end">
+                <button type="button" class="btn btn-success" style="width: 200px;"
+                    id="auto-deploy-confirm-btn">
+                    ✅ Confirm Deploy
+                </button>
+            </div>
+        </div>
+
+        <div id="auto-deploy-error" class="alert alert-danger mt-3" style="display:none;"></div>
+    </div>
+
+    <script src="{{ url_for('static', filename='js/form-utils.js') }}"></script>
+    <script>
+        const autoDeployForm = document.getElementById('auto-deploy-form');
+        let currentPlan = null;
+
+        // Auto-populate GPU budget from cluster state
+        fetch('/api/auto_deploy/cluster_gpus', {credentials: 'same-origin'})
+            .then(r => r.json())
+            .then(data => {
+                if (data.gpu_budget) {
+                    const fieldMap = {
+                        'A100': 'auto_gpu_a100',
+                        'H100': 'auto_gpu_h100',
+                        'H200': 'auto_gpu_h200',
+                        'GB200': 'auto_gpu_gb200',
+                    };
+                    for (const [gpuType, fieldId] of Object.entries(fieldMap)) {
+                        const el = document.getElementById(fieldId);
+                        if (el && data.gpu_budget[gpuType] !== undefined) {
+                            el.value = data.gpu_budget[gpuType];
+                        }
+                    }
+                }
+            })
+            .catch(err => console.warn('Could not auto-populate GPU budget:', err));
+
+        autoDeployForm.addEventListener('submit', function(e) {
+            e.preventDefault();
+            const btn = document.getElementById('auto-deploy-optimize-btn');
+            btn.disabled = true;
+            btn.textContent = '⏳ Optimizing...';
+
+            const gpuBudget = {
+                'A100': parseInt(document.getElementById('auto_gpu_a100').value) || 0,
+                'H100': parseInt(document.getElementById('auto_gpu_h100').value) || 0,
+                'H200': parseInt(document.getElementById('auto_gpu_h200').value) || 0,
+                'GB200': parseInt(document.getElementById('auto_gpu_gb200').value) || 0,
+            };
+            const workflow = document.getElementById('auto_workflow').value;
+
+            const errorDiv = document.getElementById('auto-deploy-error');
+            const resultsDiv = document.getElementById('auto-deploy-results');
+            errorDiv.style.display = 'none';
+            resultsDiv.style.display = 'none';
+
+            fetch('/api/auto_deploy', {
+                method: 'POST',
+                headers: {'Content-Type': 'application/json'},
+                body: JSON.stringify({gpu_budget: gpuBudget, workflow: workflow}),
+                credentials: 'same-origin'
+            })
+            .then(response => response.json().then(data => ({ok: response.ok, data})))
+            .then(({ok, data}) => {
+                btn.disabled = false;
+                btn.textContent = '🤖 Optimize';
+                if (!ok) {
+                    errorDiv.textContent = data.error || 'Unknown error';
+                    errorDiv.style.display = '';
+                    return;
+                }
+                currentPlan = data;
+                // Show warnings if any
+                const warningDiv = document.getElementById('auto-deploy-warning');
+                if (data.warnings && data.warnings.length > 0) {
+                    warningDiv.innerHTML = data.warnings.map(w =>
+                        `⚠️ ${escapeHtml(w)}`).join('<br>');
+                    warningDiv.style.display = '';
+                } else {
+                    warningDiv.style.display = 'none';
+                }
+                // Show metrics
+                const metrics = data.metrics;
+                const actualGpus = metrics.actual_gpus_needed
+                    ? JSON.stringify(metrics.actual_gpus_needed) : JSON.stringify(metrics.gpus_used);
+                document.getElementById('auto-deploy-metrics').innerHTML =
+                    `<strong>Total Time:</strong> ${metrics.total_time_s}s &nbsp;|&nbsp; ` +
+                    `<strong>TTFF:</strong> ${metrics.ttff_s}s &nbsp;|&nbsp; ` +
+                    `<strong>Cost:</strong> $${metrics.cost} &nbsp;|&nbsp; ` +
+                    `<strong>GPUs Needed:</strong> ${actualGpus}` +
+                    (metrics.budget_exceeded ? ' <span class="text-danger">(exceeds budget!)</span>' : '');
+                // Show plan table
+                const tbody = document.getElementById('auto-deploy-plan-body');
+                tbody.innerHTML = '';
+                data.specs.forEach(spec => {
+                    const row = document.createElement('tr');
+                    row.innerHTML =
+                        `<td>${escapeHtml(spec.container_name)}</td>` +
+                        `<td>${spec.gpu}</td>` +
+                        `<td>${escapeHtml(spec.gpu_type || 'any')}</td>` +
+                        `<td>${spec.cpu}</td>` +
+                        `<td>${spec.memory_gib} GiB</td>` +
+                        `<td>${spec.mig_profile || '-'}</td>`;
+                    tbody.appendChild(row);
+                });
+                resultsDiv.style.display = '';
+            })
+            .catch(err => {
+                btn.disabled = false;
+                btn.textContent = '🤖 Optimize';
+                errorDiv.textContent = 'Network error: ' + err;
+                errorDiv.style.display = '';
+            });
+        });
+
+        document.getElementById('auto-deploy-confirm-btn').addEventListener('click', function() {
+            if (!currentPlan || !currentPlan.specs) return;
+
+            const btn = this;
+            btn.disabled = true;
+            btn.textContent = '⏳ Deploying...';
+
+            const workflow = document.getElementById('auto_workflow').value;
+
+            fetch('/api/auto_deploy/confirm', {
+                method: 'POST',
+                headers: {'Content-Type': 'application/json'},
+                body: JSON.stringify({specs: currentPlan.specs, workflow: workflow}),
+                credentials: 'same-origin'
+            })
+            .then(response => response.json().then(data => ({ok: response.ok, data})))
+            .then(({ok, data}) => {
+                btn.disabled = false;
+                btn.textContent = '✅ Confirm Deploy';
+                if (data.errors && data.errors.length > 0) {
+                    alert('Deployed ' + data.deployed.length + ' containers.\nErrors:\n' + data.errors.join('\n'));
+                } else {
+                    alert(data.message || 'Deployment complete!');
+                }
+                window.location.href = '/';
+            })
+            .catch(err => {
+                btn.disabled = false;
+                btn.textContent = '✅ Confirm Deploy';
+                alert('Error: ' + err);
+            });
+        });
+    </script>
+    <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.5/dist/js/bootstrap.bundle.min.js"
+        integrity="sha384-k6d4wzSIapyDyv1kpU366/PK5hCdSbCRGRCMv+eplOQJWyd1fbcAu9OCUj5zNLiq"
+        crossorigin="anonymous"></script>
+</body>
+</html>
diff --git a/streamwise/templates/index.html b/streamwise/templates/index.html
index 7fc18a6c..f2b8a80a 100644
--- a/streamwise/templates/index.html
+++ b/streamwise/templates/index.html
@@ -244,6 +244,18 @@ <h1 class="mb-4 text-center">📄📽️ StreamWise Cluster Manager 🔉🎬</h1
         </table>
         {% endmacro %}
 
+        <h2 class="mt-5 mb-3">🤖 Auto Deploy</h2>
+        <div class="text-begin mb-3">
+            <a
+                class="btn btn-outline-warning btn-lg"
+                href="{{ url_for('auto_deploy_page') }}"
+                title="Auto Deploy"
+                aria-label="Auto Deploy">
+                🤖 Auto Deploy
+            </a>
+            <span class="text-muted ms-2">Optimize and deploy all services automatically</span>
+        </div>
+
         <h2 class="mt-5 mb-3">🎯 Applications</h2>
         <div class="text-begin mb-3">
             <a

From 1ee2801116479da446cdf035e7cae0a834c83171 Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Tue, 9 Jun 2026 15:34:31 -0700
Subject: [PATCH 26/39] Fix data dir path: use _HERE instead of _REPO_ROOT for
 container compat

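In Docker the package lives at /streamwise, so its parent directory
(_REPO_ROOT) is / and the data files were looked up at the invalid
path /simulator/data/*.csv.  Building the default from _HERE instead
resolves to /streamwise/simulator/data/ inside the container.  The
SIMULATOR_DATA_DIR environment variable still overrides this default.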

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 streamwise/allocator_bridge.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/streamwise/allocator_bridge.py b/streamwise/allocator_bridge.py
index 543eb96a..68aff990 100644
--- a/streamwise/allocator_bridge.py
+++ b/streamwise/allocator_bridge.py
@@ -104,7 +104,7 @@ class DeploymentPlan:
 
 def _get_data_dir() -> str:
     """Get the path to the simulator data directory."""
-    default_path = os.path.join(_REPO_ROOT, "simulator", "data")
+    default_path = os.path.join(_HERE, "simulator", "data")
     return os.getenv("SIMULATOR_DATA_DIR", default_path)
 
 

From 5d38cf8521fd9cebf05b9eb0bddf458bafe98f0c Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Tue, 9 Jun 2026 15:38:27 -0700
Subject: [PATCH 27/39] Fix allocator: round budget up to server multiples
 before calling allocator

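The greedy allocator asserts that each GPU type's count is a multiple
of its server size, NUM_GPUS_PER_SERVER[gpu_type] (e.g. 8). We now round
each count up to the next multiple for the allocator call, then trim
the resulting specs back to the user's actual per-type budget. This
allows budgets that are not server multiples, such as 26 or 30.

The trimming step now always runs, not only when MIG is unavailable;
it still covers the MIG-unavailable overflow case as before.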

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 streamwise/allocator_bridge.py | 36 ++++++++++++++++++++--------------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/streamwise/allocator_bridge.py b/streamwise/allocator_bridge.py
index 68aff990..f585d05d 100644
--- a/streamwise/allocator_bridge.py
+++ b/streamwise/allocator_bridge.py
@@ -210,6 +210,15 @@ def run_allocator(
     if not num_gpus or sum(num_gpus.values()) < 8:
         raise ValueError("Total GPU budget must be at least 8 GPUs.")
 
+    # The allocator requires GPU counts to be multiples of NUM_GPUS_PER_SERVER (8).
+    # Round up for the allocator, then trim back to the real budget afterward.
+    import math
+    from constants import NUM_GPUS_PER_SERVER
+    allocator_gpus: dict[GPUType, int] = {}
+    for gpu_type, count in num_gpus.items():
+        server_size = NUM_GPUS_PER_SERVER[gpu_type]
+        allocator_gpus[gpu_type] = math.ceil(count / server_size) * server_size
+
     # Load latency data and run allocator
     data_dir = _get_data_dir()
     latency_data = load_latency_data(data_dir=data_dir)
@@ -220,25 +229,22 @@ def run_allocator(
         policy=STREAMWISE_POLICY,
     )
 
-    result = allocator.allocate(num_gpus=num_gpus, verbose=False)
+    result = allocator.allocate(num_gpus=allocator_gpus, verbose=False)
 
     # Convert result to deployment specs
     specs = result_to_deployment_specs(result)
 
-    # When MIG is unavailable, deployment specs may use more GPUs per type than the
-    # allocator budgeted (e.g., OTHERS allocates 1 GPU but kokoro+yolo each need a
-    # full GPU = 2). Detect per-type overflow and trim excess replicas.
-    if not MIG_AVAILABLE:
-        actual_per_type = _calc_actual_gpus_per_type(specs)
-        for gpu_type, budget_count in num_gpus.items():
-            actual = actual_per_type.get(gpu_type, 0)
-            if actual <= budget_count:
-                continue
-            # Need to trim (actual - budget_count) GPUs from this type.
-            # Remove replicas of the most-replicated scalable container on this type.
-            excess = actual - budget_count
-            gpu_type_str = GPU_TYPE_TO_POD_STR[gpu_type]
-            specs = _trim_specs_for_type(specs, gpu_type_str, excess)
+    # Trim deployment specs back to the user's actual budget.
+    # Also handles MIG-unavailable overflow (e.g., OTHERS allocates 1 GPU
+    # but kokoro+yolo each need a full GPU = 2).
+    actual_per_type = _calc_actual_gpus_per_type(specs)
+    for gpu_type, budget_count in num_gpus.items():
+        actual = actual_per_type.get(gpu_type, 0)
+        if actual <= budget_count:
+            continue
+        excess = actual - budget_count
+        gpu_type_str = GPU_TYPE_TO_POD_STR[gpu_type]
+        specs = _trim_specs_for_type(specs, gpu_type_str, excess)
 
     return DeploymentPlan(
         specs=specs,

From 6ce67364cbdea06028fa0ca691bd0973d67c24ee Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Tue, 9 Jun 2026 15:43:17 -0700
Subject: [PATCH 28/39] Add 'Delete All' button for wrapper pods on main page
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

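Adds a red 🗑️ Delete All button next to the ➕ under Wrappers; it is
shown only when at least one wrapper service exists. After a
confirmation prompt, clicking it calls DELETE /api/pods/wrappers, which
removes all non-app pods in the rtgen namespace and returns the number
of pods deleted plus any per-pod errors.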

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 streamwise/streamwise.py        | 23 +++++++++++++++++++++++
 streamwise/templates/index.html | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+)

diff --git a/streamwise/streamwise.py b/streamwise/streamwise.py
index 5662e66a..b201f3e0 100644
--- a/streamwise/streamwise.py
+++ b/streamwise/streamwise.py
@@ -566,6 +566,29 @@ async def api_remove_pod(pod_name: str) -> QuartReturn:
         k8s_cluster=k8s_cluster)
 
 
+@route("/api/pods/wrappers", methods=["DELETE"])
+async def api_delete_all_wrappers() -> QuartReturn:
+    """Delete all wrapper pods (non-app pods) in the namespace."""
+    svcs = await get_services(namespace=NAMESPACE, k8s_cluster=k8s_cluster)
+    wrapper_pods = [
+        svc["pod_name"] for svc in svcs
+        if svc.get("container_name") not in STREAMWISE_APPS and svc.get("pod_name")
+    ]
+    deleted = 0
+    errors: list[str] = []
+    for pod_name in wrapper_pods:
+        try:
+            await pod_manager.remove_pod(
+                pod_name, namespace=NAMESPACE, k8s_cluster=k8s_cluster)
+            deleted += 1
+        except Exception as e:
+            errors.append(f"{pod_name}: {e}")
+    result: dict[str, object] = {"deleted": deleted, "total": len(wrapper_pods)}
+    if errors:
+        result["errors"] = errors
+    return jsonify(result), HTTPStatus.OK
+
+
 @route("/api/services", methods=["GET"])
 async def api_get_services() -> QuartReturn:
     """API interface to get the list of services."""
diff --git a/streamwise/templates/index.html b/streamwise/templates/index.html
index f2b8a80a..c9913f55 100644
--- a/streamwise/templates/index.html
+++ b/streamwise/templates/index.html
@@ -296,6 +296,15 @@ <h2 class="mt-5 mb-3">🌐 Wrappers</h2>
                     Add Wrapper
                 {% endif %}
             </a>
+            {% if wrapper_svcs and wrapper_svcs | length > 0 %}
+            <button
+                class="btn btn-outline-danger ms-2"
+                title="Delete All Wrappers"
+                aria-label="Delete All Wrappers"
+                onclick="deleteAllWrappers()">
+                🗑️ Delete All
+            </button>
+            {% endif %}
         </div>
         {% if wrapper_svcs %}
             {{ svc_table(wrapper_svcs, 'rtgen-table') }}
@@ -586,5 +595,28 @@ <h2 class="mt-5 mb-3">🫛 Pods</h2>
             });
         });
     </script>
+    <script>
+        async function deleteAllWrappers() {
+            if (!confirm('Delete ALL wrapper pods? This cannot be undone.')) return;
+            const btn = event.currentTarget;
+            btn.disabled = true;
+            btn.textContent = '⏳ Deleting…';
+            try {
+                const resp = await fetch('/api/pods/wrappers', {method: 'DELETE'});
+                const data = await resp.json();
+                if (resp.ok) {
+                    alert(`Deleted ${data.deleted} wrapper pod(s).`);
+                    location.reload();
+                } else {
+                    alert('Error: ' + (data.error || resp.statusText));
+                }
+            } catch (e) {
+                alert('Request failed: ' + e.message);
+            } finally {
+                btn.disabled = false;
+                btn.textContent = '🗑️ Delete All';
+            }
+        }
+    </script>
 </body>
 </html>

From eb3b992e47196d2935c23ed1b9105692fbd722f6 Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Tue, 9 Jun 2026 15:54:55 -0700
Subject: [PATCH 29/39] Redesign GPU budget UI: dynamic rows from cluster state

Replace 4-column fixed GPU type inputs with dynamic rows (one per
type). Rows auto-populate from the cluster state (e.g., 32 H100s).
Users can add/remove rows; the dropdown prevents duplicate types.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 streamwise/templates/auto_deploy.html | 124 +++++++++++++++++---------
 1 file changed, 84 insertions(+), 40 deletions(-)

diff --git a/streamwise/templates/auto_deploy.html b/streamwise/templates/auto_deploy.html
index 3b141aeb..7f0398d1 100644
--- a/streamwise/templates/auto_deploy.html
+++ b/streamwise/templates/auto_deploy.html
@@ -26,28 +26,9 @@ <h1 class="mt-4 mb-3">🤖 Auto Deploy</h1>
                 <legend class="float-none w-auto px-2 fw-semibold">
                     💰 GPU Budget
                 </legend>
-                <div class="row g-3 mb-3">
-                    <div class="col-md-3">
-                        <label for="auto_gpu_a100" class="form-label">A100</label>
-                        <input type="number" class="form-control" id="auto_gpu_a100" name="gpu_a100"
-                            min="0" max="64" value="0">
-                    </div>
-                    <div class="col-md-3">
-                        <label for="auto_gpu_h100" class="form-label">H100</label>
-                        <input type="number" class="form-control" id="auto_gpu_h100" name="gpu_h100"
-                            min="0" max="64" value="0">
-                    </div>
-                    <div class="col-md-3">
-                        <label for="auto_gpu_h200" class="form-label">H200</label>
-                        <input type="number" class="form-control" id="auto_gpu_h200" name="gpu_h200"
-                            min="0" max="64" value="0">
-                    </div>
-                    <div class="col-md-3">
-                        <label for="auto_gpu_gb200" class="form-label">GB200</label>
-                        <input type="number" class="form-control" id="auto_gpu_gb200" name="gpu_gb200"
-                            min="0" max="64" value="0">
-                    </div>
-                </div>
+                <div id="gpu-budget-rows"></div>
+                <button type="button" class="btn btn-outline-success btn-sm mt-2" id="add-gpu-row-btn"
+                    title="Add GPU type">➕ Add GPU Type</button>
             </fieldset>
 
             <fieldset class="border rounded-3 p-3 mb-4">
@@ -108,29 +89,97 @@ <h4>📊 Optimized Deployment Plan</h4>
 
     <script src="{{ url_for('static', filename='js/form-utils.js') }}"></script>
     <script>
+        const ALL_GPU_TYPES = ['A100', 'H100', 'H200', 'GB200'];
+        const gpuRowsContainer = document.getElementById('gpu-budget-rows');
+        const addGpuRowBtn = document.getElementById('add-gpu-row-btn');
         const autoDeployForm = document.getElementById('auto-deploy-form');
         let currentPlan = null;
 
-        // Auto-populate GPU budget from cluster state
+        function getActiveGpuTypes() {
+            return Array.from(gpuRowsContainer.querySelectorAll('.gpu-type-select'))
+                .map(s => s.value);
+        }
+
+        function createGpuRow(gpuType, count) {
+            const row = document.createElement('div');
+            row.className = 'row g-2 mb-2 align-items-center gpu-budget-row';
+            const usedTypes = getActiveGpuTypes();
+            const options = ALL_GPU_TYPES.map(t => {
+                const disabled = (t !== gpuType && usedTypes.includes(t)) ? 'disabled' : '';
+                const selected = (t === gpuType) ? 'selected' : '';
+                return `<option value="${t}" ${selected} ${disabled}>${t}</option>`;
+            }).join('');
+            row.innerHTML = `
+                <div class="col-md-4">
+                    <select class="form-select gpu-type-select">${options}</select>
+                </div>
+                <div class="col-md-4">
+                    <input type="number" class="form-control gpu-count-input" min="0" max="256" value="${count}">
+                </div>
+                <div class="col-md-2">
+                    <button type="button" class="btn btn-outline-danger btn-sm remove-gpu-row-btn" title="Remove">✕</button>
+                </div>`;
+            gpuRowsContainer.appendChild(row);
+            row.querySelector('.remove-gpu-row-btn').addEventListener('click', () => {
+                row.remove();
+                refreshAddButton();
+                refreshSelectOptions();
+            });
+            row.querySelector('.gpu-type-select').addEventListener('change', () => {
+                refreshSelectOptions();
+            });
+            refreshAddButton();
+            refreshSelectOptions();
+        }
+
+        function refreshAddButton() {
+            const active = getActiveGpuTypes();
+            addGpuRowBtn.style.display = active.length >= ALL_GPU_TYPES.length ? 'none' : '';
+        }
+
+        function refreshSelectOptions() {
+            const active = getActiveGpuTypes();
+            gpuRowsContainer.querySelectorAll('.gpu-type-select').forEach(sel => {
+                const current = sel.value;
+                sel.querySelectorAll('option').forEach(opt => {
+                    opt.disabled = (opt.value !== current && active.includes(opt.value));
+                });
+            });
+        }
+
+        function getGpuBudget() {
+            const budget = {};
+            gpuRowsContainer.querySelectorAll('.gpu-budget-row').forEach(row => {
+                const type = row.querySelector('.gpu-type-select').value;
+                const count = parseInt(row.querySelector('.gpu-count-input').value) || 0;
+                if (count > 0) budget[type] = count;
+            });
+            return budget;
+        }
+
+        addGpuRowBtn.addEventListener('click', () => {
+            const active = getActiveGpuTypes();
+            const next = ALL_GPU_TYPES.find(t => !active.includes(t));
+            if (next) createGpuRow(next, 0);
+        });
+
+        // Auto-populate from cluster state
         fetch('/api/auto_deploy/cluster_gpus', {credentials: 'same-origin'})
             .then(r => r.json())
             .then(data => {
                 if (data.gpu_budget) {
-                    const fieldMap = {
-                        'A100': 'auto_gpu_a100',
-                        'H100': 'auto_gpu_h100',
-                        'H200': 'auto_gpu_h200',
-                        'GB200': 'auto_gpu_gb200',
-                    };
-                    for (const [gpuType, fieldId] of Object.entries(fieldMap)) {
-                        const el = document.getElementById(fieldId);
-                        if (el && data.gpu_budget[gpuType] !== undefined) {
-                            el.value = data.gpu_budget[gpuType];
+                    for (const [gpuType, count] of Object.entries(data.gpu_budget)) {
+                        if (ALL_GPU_TYPES.includes(gpuType) && count > 0) {
+                            createGpuRow(gpuType, count);
                         }
                     }
                 }
+                // If no rows from cluster, add a default empty H100 row
+                if (gpuRowsContainer.children.length === 0) {
+                    createGpuRow('H100', 0);
+                }
             })
-            .catch(err => console.warn('Could not auto-populate GPU budget:', err));
+            .catch(err => { console.warn('Could not auto-populate GPU budget:', err); if (gpuRowsContainer.children.length === 0) createGpuRow('H100', 0); });
 
         autoDeployForm.addEventListener('submit', function(e) {
             e.preventDefault();
@@ -138,12 +187,7 @@ <h4>📊 Optimized Deployment Plan</h4>
             btn.disabled = true;
             btn.textContent = '⏳ Optimizing...';
 
-            const gpuBudget = {
-                'A100': parseInt(document.getElementById('auto_gpu_a100').value) || 0,
-                'H100': parseInt(document.getElementById('auto_gpu_h100').value) || 0,
-                'H200': parseInt(document.getElementById('auto_gpu_h200').value) || 0,
-                'GB200': parseInt(document.getElementById('auto_gpu_gb200').value) || 0,
-            };
+            const gpuBudget = getGpuBudget();
             const workflow = document.getElementById('auto_workflow').value;
 
             const errorDiv = document.getElementById('auto-deploy-error');

From 9452ff3c1f55d93e51994e790c75d9a84816f428 Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Tue, 9 Jun 2026 15:58:16 -0700
Subject: [PATCH 30/39] Exclude streamwise and app pods from 'Delete All
 Wrappers'

The delete-all-wrappers endpoint and the wrapper list on the main page
now exclude the streamwise management pod and all STREAMWISE_APPS
(load balancer / application containers).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 streamwise/streamwise.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/streamwise/streamwise.py b/streamwise/streamwise.py
index b201f3e0..949fbf65 100644
--- a/streamwise/streamwise.py
+++ b/streamwise/streamwise.py
@@ -222,7 +222,8 @@ async def index() -> QuartReturn:
             svc["load_balancer"] = await get_lb_pod(pod_name)
 
     app_svcs = [svc for svc in svcs if svc.get("container_name") in STREAMWISE_APPS]
-    wrapper_svcs = [svc for svc in svcs if svc.get("container_name") not in STREAMWISE_APPS]
+    _system_containers = set(STREAMWISE_APPS) | {"streamwise"}
+    wrapper_svcs = [svc for svc in svcs if svc.get("container_name") not in _system_containers]
 
     return await render_template(
         "index.html",
@@ -568,11 +569,13 @@ async def api_remove_pod(pod_name: str) -> QuartReturn:
 
 @route("/api/pods/wrappers", methods=["DELETE"])
 async def api_delete_all_wrappers() -> QuartReturn:
-    """Delete all wrapper pods (non-app pods) in the namespace."""
+    """Delete all wrapper pods (non-app, non-system pods) in the namespace."""
     svcs = await get_services(namespace=NAMESPACE, k8s_cluster=k8s_cluster)
+    # Exclude app pods and the streamwise management pod itself
+    excluded = set(STREAMWISE_APPS) | {"streamwise"}
     wrapper_pods = [
         svc["pod_name"] for svc in svcs
-        if svc.get("container_name") not in STREAMWISE_APPS and svc.get("pod_name")
+        if svc.get("container_name") not in excluded and svc.get("pod_name")
     ]
     deleted = 0
     errors: list[str] = []

From b514fbc06857fb27cddaec3b63e7a22ca254a76b Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Tue, 9 Jun 2026 16:01:24 -0700
Subject: [PATCH 31/39] Format GPUs Needed as '30 H100s' instead of JSON

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 streamwise/templates/auto_deploy.html | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/streamwise/templates/auto_deploy.html b/streamwise/templates/auto_deploy.html
index 7f0398d1..d4b1f960 100644
--- a/streamwise/templates/auto_deploy.html
+++ b/streamwise/templates/auto_deploy.html
@@ -222,8 +222,10 @@ <h4>📊 Optimized Deployment Plan</h4>
                 }
                 // Show metrics
                 const metrics = data.metrics;
-                const actualGpus = metrics.actual_gpus_needed
-                    ? JSON.stringify(metrics.actual_gpus_needed) : JSON.stringify(metrics.gpus_used);
+                const gpuObj = metrics.actual_gpus_needed || metrics.gpus_used || {};
+                const actualGpus = Object.entries(gpuObj)
+                    .map(([type, count]) => `${count} ${escapeHtml(type.toUpperCase())}s`)
+                    .join(', ') || '0';
                 document.getElementById('auto-deploy-metrics').innerHTML =
                     `<strong>Total Time:</strong> ${metrics.total_time_s}s &nbsp;|&nbsp; ` +
                     `<strong>TTFF:</strong> ${metrics.ttff_s}s &nbsp;|&nbsp; ` +

From d2ae1700c23f4d29daa40623e456bca8badb74d9 Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Tue, 9 Jun 2026 16:09:40 -0700
Subject: [PATCH 32/39] Use services.json friendly names and uppercase GPU
 types in auto-deploy table

The /api/auto_deploy response now includes friendly_name (from
services.json friendlyName field via get_friendly_container_name)
and uppercased gpu_type for each spec. The frontend displays these
directly.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 streamwise/streamwise.py              | 10 +++++++++-
 streamwise/templates/auto_deploy.html |  2 +-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/streamwise/streamwise.py b/streamwise/streamwise.py
index 949fbf65..cccd8b34 100644
--- a/streamwise/streamwise.py
+++ b/streamwise/streamwise.py
@@ -808,7 +808,15 @@ async def api_auto_deploy() -> QuartReturn:
             gpu_budget=gpu_budget,
             workflow_name=workflow_name,
         )
-        return jsonify(allocator_bridge.deployment_plan_to_json(plan)), HTTPStatus.OK
+        result_json = allocator_bridge.deployment_plan_to_json(plan)
+
+        # Enrich specs with friendly names from services.json and uppercase GPU types
+        for spec in result_json.get("specs", []):
+            spec["friendly_name"] = await get_friendly_container_name(spec["container_name"])
+            if spec.get("gpu_type"):
+                spec["gpu_type"] = spec["gpu_type"].upper()
+
+        return jsonify(result_json), HTTPStatus.OK
 
     except ValueError as ve:
         return jsonify({"error": str(ve)}), HTTPStatus.BAD_REQUEST
diff --git a/streamwise/templates/auto_deploy.html b/streamwise/templates/auto_deploy.html
index d4b1f960..60c7ebd4 100644
--- a/streamwise/templates/auto_deploy.html
+++ b/streamwise/templates/auto_deploy.html
@@ -238,7 +238,7 @@ <h4>📊 Optimized Deployment Plan</h4>
                 data.specs.forEach(spec => {
                     const row = document.createElement('tr');
                     row.innerHTML =
-                        `<td>${escapeHtml(spec.container_name)}</td>` +
+                        `<td>${escapeHtml(spec.friendly_name || spec.container_name)}</td>` +
                         `<td>${spec.gpu}</td>` +
                         `<td>${escapeHtml(spec.gpu_type || 'any')}</td>` +
                         `<td>${spec.cpu}</td>` +

From 35af577bdfc290e5f73f84359d2b0f39dfe43879 Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Tue, 9 Jun 2026 16:12:26 -0700
Subject: [PATCH 33/39] Hide co-located containers (gpu=0) from auto-deploy
 results table

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 streamwise/templates/auto_deploy.html | 1 +
 1 file changed, 1 insertion(+)

diff --git a/streamwise/templates/auto_deploy.html b/streamwise/templates/auto_deploy.html
index 60c7ebd4..060ed4c7 100644
--- a/streamwise/templates/auto_deploy.html
+++ b/streamwise/templates/auto_deploy.html
@@ -236,6 +236,7 @@ <h4>📊 Optimized Deployment Plan</h4>
                 const tbody = document.getElementById('auto-deploy-plan-body');
                 tbody.innerHTML = '';
                 data.specs.forEach(spec => {
+                    if (spec.gpu === 0) return;  // Skip co-located containers (e.g. VAE)
                     const row = document.createElement('tr');
                     row.innerHTML =
                         `<td>${escapeHtml(spec.friendly_name || spec.container_name)}</td>` +

From e64f091654a331bb537db8881201ec8f50ac5460 Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Tue, 9 Jun 2026 16:17:16 -0700
Subject: [PATCH 34/39] Fix: allocate GPU for VAE when HF disaggregation is
 enabled

COLOCATED_CONTAINERS was unconditionally forcing hunyuanframepackvae
to gpu=0 even when the policy has disaggregation={Model.HF: True}.
Now co-location only applies when disaggregation is disabled.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 streamwise/allocator_bridge.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/streamwise/allocator_bridge.py b/streamwise/allocator_bridge.py
index f585d05d..dc3d7f51 100644
--- a/streamwise/allocator_bridge.py
+++ b/streamwise/allocator_bridge.py
@@ -280,8 +280,12 @@ def result_to_deployment_specs(result: Result) -> list[DeploymentSpec]:
                     cpu, memory_gib, ephemeral_storage_gib = resources
 
                     mig_profile: Optional[str] = None
-                    if container_name in COLOCATED_CONTAINERS:
-                        # Co-located with parent model; shares GPU on the same server
+                    # Co-locate VAE only when disaggregation is disabled
+                    is_colocated = (
+                        container_name in COLOCATED_CONTAINERS
+                        and not STREAMWISE_POLICY.disaggregation.get(Model.HF, False)
+                    )
+                    if is_colocated:
                         gpu_count = 0
                     elif MIG_AVAILABLE and container_name in MIG_CONTAINERS:
                         mig_profile = MIG_CONTAINERS[container_name]

From 9de764847654cba8d6373fa9b29e9c5d7d248398 Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Tue, 9 Jun 2026 16:23:55 -0700
Subject: [PATCH 35/39] Replace GPU count text box with range slider (8-100)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 streamwise/templates/auto_deploy.html | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/streamwise/templates/auto_deploy.html b/streamwise/templates/auto_deploy.html
index 060ed4c7..0b90277b 100644
--- a/streamwise/templates/auto_deploy.html
+++ b/streamwise/templates/auto_deploy.html
@@ -110,11 +110,14 @@ <h4>📊 Optimized Deployment Plan</h4>
                 return `<option value="${t}" ${selected} ${disabled}>${t}</option>`;
             }).join('');
             row.innerHTML = `
-                <div class="col-md-4">
+                <div class="col-md-3">
                     <select class="form-select gpu-type-select">${options}</select>
                 </div>
-                <div class="col-md-4">
-                    <input type="number" class="form-control gpu-count-input" min="0" max="256" value="${count}">
+                <div class="col-md-5">
+                    <input type="range" class="form-range gpu-count-slider" min="8" max="100" step="1" value="${Math.min(Math.max(count, 8), 100)}">
+                </div>
+                <div class="col-md-2">
+                    <span class="badge bg-primary fs-6 gpu-count-label">${Math.min(Math.max(count, 8), 100)}</span>
                 </div>
                 <div class="col-md-2">
                     <button type="button" class="btn btn-outline-danger btn-sm remove-gpu-row-btn" title="Remove">✕</button>
@@ -128,6 +131,9 @@ <h4>📊 Optimized Deployment Plan</h4>
             row.querySelector('.gpu-type-select').addEventListener('change', () => {
                 refreshSelectOptions();
             });
+            const slider = row.querySelector('.gpu-count-slider');
+            const label = row.querySelector('.gpu-count-label');
+            slider.addEventListener('input', () => { label.textContent = slider.value; });
             refreshAddButton();
             refreshSelectOptions();
         }
@@ -151,7 +157,7 @@ <h4>📊 Optimized Deployment Plan</h4>
             const budget = {};
             gpuRowsContainer.querySelectorAll('.gpu-budget-row').forEach(row => {
                 const type = row.querySelector('.gpu-type-select').value;
-                const count = parseInt(row.querySelector('.gpu-count-input').value) || 0;
+                const count = parseInt(row.querySelector('.gpu-count-slider').value) || 0;
                 if (count > 0) budget[type] = count;
             });
             return budget;

From 19b539d9e4417f5ae5e76f6be81c33ba4859ef2d Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Tue, 9 Jun 2026 16:25:42 -0700
Subject: [PATCH 36/39] Add TODOs for disaggregation config and plan customization

---
 streamwise/allocator_bridge.py | 1 +
 streamwise/streamwise.py       | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/streamwise/allocator_bridge.py b/streamwise/allocator_bridge.py
index dc3d7f51..8855526a 100644
--- a/streamwise/allocator_bridge.py
+++ b/streamwise/allocator_bridge.py
@@ -281,6 +281,7 @@ def result_to_deployment_specs(result: Result) -> list[DeploymentSpec]:
 
                     mig_profile: Optional[str] = None
                     # Co-locate VAE only when disaggregation is disabled
+                    # TODO: make disaggregation a configuration exposed to the users
                     is_colocated = (
                         container_name in COLOCATED_CONTAINERS
                         and not STREAMWISE_POLICY.disaggregation.get(Model.HF, False)
diff --git a/streamwise/streamwise.py b/streamwise/streamwise.py
index cccd8b34..7c0b4de6 100644
--- a/streamwise/streamwise.py
+++ b/streamwise/streamwise.py
@@ -551,7 +551,7 @@ async def add_pod(service_name: str) -> str:
 
 @route("/auto_deploy", methods=["GET"])
 async def auto_deploy_page() -> str:
-    """Render the standalone auto-deploy page."""
+    """Render the standalone auto-deploy page. (TODO: enable customization after auto-deploy plan is generated)"""
     return await render_template("auto_deploy.html")
 
 

From 03d7d7b4fa143855b424276b3041585c4f72a97d Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Tue, 9 Jun 2026 22:03:59 -0700
Subject: [PATCH 37/39] Remove Skills and README changes

---
 .github/streamwise-demo-skills.md | 225 ------------------------------
 deployment/aks/README.md          |  59 +-------
 2 files changed, 1 insertion(+), 283 deletions(-)
 delete mode 100644 .github/streamwise-demo-skills.md

diff --git a/.github/streamwise-demo-skills.md b/.github/streamwise-demo-skills.md
deleted file mode 100644
index 8d9c8cc0..00000000
--- a/.github/streamwise-demo-skills.md
+++ /dev/null
@@ -1,225 +0,0 @@
-# StreamWise Demo: End-to-End AKS Deployment with GPU Spot Probing
-
-This document summarizes the full deployment walkthrough performed on 2026-06-05,
-from capacity probing through to a running StreamWise instance on AKS with 32 H100 GPUs.
-
-## Overview
-
-| Step | Tool/Skill | Outcome |
-|------|-----------|---------|
-| 1. SKU Discovery | `az vm list-skus` | Found `Standard_ND96isrf_H100_v5` unrestricted in eastus2 and SwedenCentral |
-| 2. Capacity Pre-Check | CCC Kusto query | SwedenCentral AZ03 has 6 allocable Spot VMs — enough for 4 nodes |
-| 3. AKS Deployment | Bicep (`aks.bicep`) | Cluster + 4 H100 Spot nodes + networking provisioned in ~12 min |
-| 4. K8s Setup | kubectl | Namespace, secrets, PV/PVC, NVIDIA device plugin |
-| 5. StreamWise Deploy | kubectl + YAML templating | Pod running, web UI accessible at public IP:8081 |
-
-## Step 1: GPU SKU Discovery (`azure-gpu-spot-probe` skill)
-
-### What We Learned
-
-The `azure-gpu-spot-probe` skill provides a structured approach to finding GPU Spot capacity:
-
-1. **List available SKUs** with `az vm list-skus --size H100` to find all H100 variants and their restriction status.
-2. **Cross-reference with CCC data** (Kusto query on `onecapacityfollower.centralus.kusto.windows.net`) to see actual allocable Spot VMs per region/zone.
-3. **Key insight:** CCC data is fleet-wide, not per-subscription. A region may show capacity but still be `Location`-restricted for your subscription.
-
-### H100 SKU Variants
-
-| SKU | Key Difference |
-|-----|---------------|
-| `Standard_ND96isr_H100_v5` | 8× H100, InfiniBand |
-| `Standard_ND96isrf_H100_v5` | 8× H100, InfiniBand (refresh/newer) |
-| `Standard_ND96is_H100_v5` | 8× H100, no InfiniBand suffix |
-| `Standard_ND96is_noIB_H100_v5` | 8× H100, explicitly no InfiniBand |
-
-### Subscription Access Results
-
-For `Standard_ND96isrf_H100_v5`:
-- **Unrestricted:** eastus2 (zones 1,2), SwedenCentral (zones 1,2,3)
-- **Location-blocked:** eastus, centralus, northcentralus
-
-### CCC Capacity Data
-
-```bash
-Region          | SKU                          | AllocableSpotVMs
-swedencentral   | Standard_ND96isrf_H100_v5    | 6
-eastus          | Standard_ND96isr_H100_v5     | 16 (but sub is blocked)
-centralus       | Standard_ND96isr_H100_v5     | 83 (but sub is blocked)
-```
-
-**Decision:** SwedenCentral — 6 allocable Spot VMs, subscription unrestricted, zones 1/2/3.
-
-## Step 2: AKS Cluster Deployment
-
-### Bicep Template (`deployment/aks/aks.bicep`)
-
-The template provisions:
-- **System node pool:** Standard_D16s_v5 (1 node) for StreamWise/StreamCast/system pods
-- **GPU Spot node pool (`spoth100`):** Full-GPU nodes for heavy models
-- **GPU MIG Spot node pool (`spoth100mig`):** For mixed-mode (7 full GPUs + 1 MIG-partitioned)
-- **Networking:** Static public IP, NAT gateway, NSG (ports 8000–9000), VNet with disabled default outbound
-
-### Deployment Command
-
-```bash
-az deployment group create \
-  --name AKSDeployment \
-  --resource-group hqiu-streamwise-aks-cluster \
-  --template-file deployment/aks/aks.bicep \
-  --parameters \
-    clusterName=hqiu-streamwise-aks-cluster-cluster \
-    gpuNodeVmSize=Standard_ND96isrf_H100_v5 \
-    gpuNodePoolName=spoth100 \
-    gpuMigNodePoolName=spoth100mig \
-    gpuNodeCount=4 \
-    acrName=inigogrtgen \
-    acrResourceGroup=inigog-acr
-```
-
-### Gotcha: ACR Role Assignment Failure
-
-The Bicep template includes a cross-resource-group ACR role assignment. If the role already exists
-(e.g., from a prior deployment), it fails with `RoleAssignmentUpdateNotPermitted`. The cluster
-itself still succeeds — just attach ACR manually:
-
-```bash
-az aks update -g <rg> -n <cluster> --attach-acr <acrName>
-```
-
-### Gotcha: ACR Login Server Name
-
-ACR names like `inigogrtgen` may have a login server of `inigogrtgen-<hash>.azurecr.io`, NOT
-`inigogrtgen.azurecr.io`. Always verify with:
-
-```bash
-az acr show --name <acr> --query loginServer -o tsv
-```
-
-## Step 3: Kubernetes Setup
-
-```bash
-kubectl create namespace rtgen
-kubectl create secret generic hf-token -n rtgen --from-literal=token=$HF_TOKEN
-kubectl apply -f deployment/k8s/local-pv.yaml
-kubectl apply -f deployment/k8s/local-pvc.yaml -n rtgen
-kubectl create namespace gpu-resources
-kubectl apply -f deployment/k8s/nvidia-device-plugin-ds.yaml
-```
-
-## Step 4: StreamWise Deployment
-
-The `streamwise-pod.yaml` uses shell variable placeholders (`${ACR_URL}`, `${LOAD_BALANCER_IP}`,
-`${RESOURCE_GROUP_NAME}`). On Linux use `envsubst`; on Windows use PowerShell string replacement:
-
-```powershell
-$yaml = Get-Content "deployment/aks/streamwise-pod.yaml" -Raw
-$yaml = $yaml -replace '\$\{ACR_URL\}', 'inigogrtgen-cjd9f3dydte2bzbb.azurecr.io'
-$yaml = $yaml -replace '\$\{RESOURCE_GROUP_NAME\}', 'hqiu-streamwise-aks-cluster'
-$yaml = $yaml -replace '\$\{LOAD_BALANCER_IP\}', '4.223.71.250'
-$yaml | kubectl apply -f -
-```
-
-### First Pull Time
-
-The StreamWise image is ~9 GB. First pull takes 5–10 minutes (pod shows `ContainerCreating`).
-
-## Step 5: Verification
-
-```bash
-kubectl get pods -n rtgen       # Should show 1/1 Running
-kubectl get svc -n rtgen        # Should show LoadBalancer with external IP
-curl http://<IP>:8081/          # Should return HTTP 200
-```
-
-## Final Result
-
-| Property | Value |
-|----------|-------|
-| Cluster | `hqiu-streamwise-aks-cluster-cluster` |
-| Region | SwedenCentral |
-| GPU Nodes | 4 × Standard_ND96isrf_H100_v5 (32 H100 GPUs) |
-| Public IP | 4.223.71.250 |
-| StreamWise URL | http://<IP>:8081 |
-| FQDN | http://<ADDRESS>:8081 |
-
-## Auto-Deployment Feature
-
-From the StreamWise web UI at port 8081, you can deploy all GPU model services with one click,
-or via REST API:
-
-```bash
-# Deploy all services
-curl -X POST "http://4.223.71.250:8081/api/service"
-
-# Deploy individual services
-curl -X POST "http://4.223.71.250:8081/api/pod" \
-  -d "container_name=kokoro" -d "gpu=1" -d "memory=8" -d "cpu=2"
-
-# List deployed services
-curl "http://4.223.71.250:8081/api/services"
-```
-
-## Step 6: Rebuilding the Image for New Features
-
-When the deployed image is stale (e.g., missing the auto-deploy feature from a newer branch),
-rebuild and push from a local Docker install or use ACR Tasks.
-
-### Local Docker Build (recommended on Windows)
-
-```powershell
-# Prepare build context (see deployment/setup_image.sh for the full script)
-# Fix CRLF line endings in bash files before building on Windows:
-$content = [System.IO.File]::ReadAllText("deployment\streamwise\docker_files\run_httpserver.bash")
-$content = $content -replace "`r`n", "`n"
-[System.IO.File]::WriteAllText("deployment\streamwise\docker_files\run_httpserver.bash", $content, [System.Text.UTF8Encoding]::new($false))
-
-# Build and push
-docker buildx build --platform linux/amd64 `
-  --build-arg DOCKER_REPO=inigogrtgen-cjd9f3dydte2bzbb.azurecr.io `
-  --build-arg BASE_TAG=v0.5.0 `
-  -t "inigogrtgen-cjd9f3dydte2bzbb.azurecr.io/streamwise:v0.6.2-autodeploy" `
-  "deployment\streamwise\docker_files" --push
-```
-
-### ACR Cloud Build (broken on Windows due to Unicode encoding)
-
-`az acr build` streams build logs through the Azure CLI, which crashes on Windows cp1252
-terminals when pip outputs Unicode progress bars. Use local Docker build instead.
-
-### Redeploy the Pod
-
-```powershell
-kubectl delete pod streamwise -n rtgen --force --grace-period=0
-# Re-apply YAML with updated image tag
-```
-
-### Gotcha: CRLF Line Endings in Bash Scripts
-
-When building Docker images on Windows, `COPY *.bash .` preserves CRLF line endings.
-Linux containers then fail with `$'\r': command not found`. **Always convert to LF** before
-building, or add a `.dockerignore`/`dos2unix` step.
-
-### Gotcha: Simulator Data Path in Docker
-
-The Dockerfile copies the simulator into `/streamwise/simulator/`, but `allocator_bridge.py`
-originally resolved the data path as `os.path.dirname(__file__) + "/../simulator/data"` which
-evaluates to `/simulator/data/` (doesn't exist). Fixed by changing to:
-
-```python
-default_path = os.path.join(os.path.dirname(__file__), "simulator", "data")
-```
-
-This works in Docker (where cwd = `/streamwise/`) and locally (where `__file__` is in `streamwise/`).
-
-## Key Lessons Learned
-
-1. **Always check CCC capacity before probing** — saves time and money on doomed create attempts.
-2. **CCC ≠ subscription access** — fleet capacity doesn't mean your subscription can use it.
-3. **ACR login servers may have hashed suffixes** — always `az acr show --query loginServer`.
-4. **ACR role assignments are idempotent-ish** — redeployments fail but don't break the cluster.
-5. **Spot VMs can be evicted** — plan for re-scaling and MIG reconfiguration after eviction.
-6. **Image pulls are slow** — ~9 GB images take 5–10 min on first pull; be patient.
-7. **Zone mapping is opaque** — CCC AZ03 doesn't necessarily mean subscription zone 3.
-8. **CRLF kills Linux containers** — always fix line endings when building on Windows.
-9. **ACR cloud build broken on Windows** — use local Docker build + push instead.
-10. **Relative paths shift in Docker** — verify path assumptions match the container layout (`WORKDIR` + `COPY` targets).
diff --git a/deployment/aks/README.md b/deployment/aks/README.md
index 6daf9743..e72feffb 100644
--- a/deployment/aks/README.md
+++ b/deployment/aks/README.md
@@ -220,61 +220,6 @@ kubectl create namespace gpu-resources
 kubectl apply -f deployment/k8s/nvidia-device-plugin-ds.yaml
 ```
 
-### 5.0 Critical: Spot Node Toleration and GPU Labels
-
-AKS Spot node pools apply the taint `kubernetes.azure.com/scalesetpriority=spot:NoSchedule`.
-The **upstream** NVIDIA device plugin (`https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/...`) only tolerates `nvidia.com/gpu`, so **it will not schedule on Spot GPU nodes**.
-
-**Always use the local manifest** (`deployment/k8s/nvidia-device-plugin-ds.yaml`) which already includes the Spot toleration. If you already applied the upstream manifest, patch it:
-```bash
-kubectl patch daemonset nvidia-device-plugin-daemonset -n kube-system \
-  --type='json' \
-  -p='[{"op":"add","path":"/spec/template/spec/tolerations/-","value":{"key":"kubernetes.azure.com/scalesetpriority","operator":"Equal","value":"spot","effect":"NoSchedule"}}]'
-```
-
-Without this patch, `nvidia.com/gpu` will report 0 on Spot nodes and GPU pods will remain Pending.
-
-**Label GPU nodes** (required for pod scheduling with nodeAffinity):
-
-AKS does not automatically apply `nvidia.com/gpu.product` labels. Without GPU Feature Discovery (NFD+GFD), you must label nodes manually:
-```bash
-# For H100 nodes:
-kubectl label node <node-name> nvidia.com/gpu.product=NVIDIA-H100-80GB-HBM3
-
-# For A100 nodes:
-kubectl label node <node-name> nvidia.com/gpu.product=NVIDIA-A100-SXM4-80GB
-
-# For H200 nodes:
-kubectl label node <node-name> nvidia.com/gpu.product=NVIDIA-H200-141GB-HBM3
-```
-
-To label all nodes in a GPU pool at once:
-```bash
-kubectl label nodes -l agentpool=gpuh100 nvidia.com/gpu.product=NVIDIA-H100-80GB-HBM3
-```
-
-> **⚠️ These labels are lost on node eviction/recreation.** If a Spot node is evicted and a new one
-> joins, you must re-apply the label. For a permanent solution, install
-> [NVIDIA GPU Feature Discovery](https://github.com/NVIDIA/gpu-feature-discovery) which
-> automatically detects and labels GPU hardware.
-
-After patching the toleration and labeling nodes, verify GPU registration:
-```bash
-# Confirm device plugin pods are running on GPU nodes
-kubectl get pods -n kube-system -l name=nvidia-device-plugin-ds -o wide
-
-# Confirm GPUs are registered
-kubectl get nodes -o custom-columns="NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu,LABEL:.metadata.labels.nvidia\.com/gpu\.product"
-```
-
-Expected output for 4× H100 nodes:
-```bash
-NAME                            GPU   LABEL
-aks-gpuh100-xxxxx-vmss000000    8     NVIDIA-H100-80GB-HBM3
-aks-gpuh100-xxxxx-vmss000001    8     NVIDIA-H100-80GB-HBM3
-...
-```
-
 Scale the GPU spot node pool up (it starts at 0 nodes):
 
 ```bash
@@ -397,9 +342,7 @@ kubectl get events -n rtgen --sort-by='.lastTimestamp'
 Common issues:
 - **Image pull errors**: Verify ACR is attached to AKS (`az aks check-acr -g $AZ_RESOURCE_GROUP -n $AKS_CLUSTER --acr <acrName>`)
 - **Pods stuck in Pending (Insufficient cpu)**: The system node pool doesn't have enough CPU. Scale up with `az aks nodepool scale` or use a larger VM size (see Sizing note in Step 1)
-- **GPU not available (Spot nodes)**: The NVIDIA device plugin may not be running on Spot GPU nodes because it lacks the Spot toleration. See Step 5.0 for the patch command
-- **GPU not available (0 GPUs on node)**: Ensure the NVIDIA device plugin daemonset pod is Running on the GPU node. If it shows 0 GPUs, restart the node or the device plugin pod
-- **Pods stuck with "node(s) didn't match Pod's node affinity/selector"**: GPU pods use `nodeAffinity` requiring `nvidia.com/gpu.product` label. Label your GPU nodes per Step 5.0
+- **GPU not available**: Ensure the GPU node pool is scaled up and the NVIDIA device plugin is running
 - **MIG node reports 0 GPUs**: The `mixed`-strategy device plugin cannot enumerate devices until MIG mode is enabled and MIG instances are created. Complete the full [MIG Setup Guide](../k8s/MIG.md) — once instances exist the plugin will register `nvidia.com/gpu` (full GPUs) and `nvidia.com/mig-<profile>` (MIG slices) within ~30–60 seconds
 - **Spot VM evicted**: Spot VMs may be evicted at any time. Re-run `az aks nodepool scale` to restore the node. After re-scaling a MIG node you must repeat the [MIG Setup Guide](../k8s/MIG.md) since MIG state does not persist across evictions
 - **LoadBalancer stuck in Pending**: Verify the public IP exists (`az network public-ip show -g $AZ_RESOURCE_GROUP --name aks-pods-public-ip`) and the AKS identity has Network Contributor role on the resource group

From 9e45542c7f3ea58d86d12e24a225fe993363a691 Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Tue, 9 Jun 2026 22:36:29 -0700
Subject: [PATCH 38/39] Fix tests

---
 streamwise/allocator_bridge.py            | 2 +-
 tests/streamwise/test_allocator_bridge.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/streamwise/allocator_bridge.py b/streamwise/allocator_bridge.py
index 8855526a..6f892668 100644
--- a/streamwise/allocator_bridge.py
+++ b/streamwise/allocator_bridge.py
@@ -104,7 +104,7 @@ class DeploymentPlan:
 
 def _get_data_dir() -> str:
     """Get the path to the simulator data directory."""
-    default_path = os.path.join(_HERE, "simulator", "data")
+    default_path = os.path.join(_REPO_ROOT, "simulator", "data")
     return os.getenv("SIMULATOR_DATA_DIR", default_path)
 
 
diff --git a/tests/streamwise/test_allocator_bridge.py b/tests/streamwise/test_allocator_bridge.py
index 0394ad2b..53364282 100644
--- a/tests/streamwise/test_allocator_bridge.py
+++ b/tests/streamwise/test_allocator_bridge.py
@@ -148,9 +148,9 @@ def test_result_to_deployment_specs_basic() -> None:
     assert kokoro_spec.mig_profile is None
     assert kokoro_spec.gpu == 1
 
-    # Co-located container gets gpu=0
+    # With disaggregation=True for HF (HunyuanFramepack), the VAE container runs on its own GPU
     vae_spec = next(s for s in specs if s.container_name == "hunyuanframepackvae")
-    assert vae_spec.gpu == 0
+    assert vae_spec.gpu == 1
 
 
 def test_result_to_deployment_specs_skips_zero_replicas() -> None:

From df2b243b740dd805bd4044108e6912734e26d37c Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Thu, 11 Jun 2026 22:06:29 -0700
Subject: [PATCH 39/39] Update auto deploy webpage icon to robot

---
 streamwise/templates/auto_deploy.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/streamwise/templates/auto_deploy.html b/streamwise/templates/auto_deploy.html
index 0b90277b..b477d3ef 100644
--- a/streamwise/templates/auto_deploy.html
+++ b/streamwise/templates/auto_deploy.html
@@ -6,7 +6,7 @@
     <meta name="viewport" content="width=device-width, initial-scale=1">
     <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.5/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-SgOJa3DmI69IUzQ2PVdRZhwQ+dy64/BUtbMJw1MZ8t5HZApcHrRKUc4W0kG879m7" crossorigin="anonymous">
     <link rel="stylesheet" href="{{ url_for('static', filename='style.css') }}">
-    <link rel="icon" href="data:image/svg+xml,<svg xmlns=%22http://www.w3.org/2000/svg%22 viewBox=%220 0 100 100%22><text y=%22.9em%22 font-size=%2290%22>🎬</text></svg>">
+    <link rel="icon" href="data:image/svg+xml,<svg xmlns=%22http://www.w3.org/2000/svg%22 viewBox=%220 0 100 100%22><text y=%22.9em%22 font-size=%2290%22>🤖</text></svg>">
 </head>
 
 <body>