From e971662922195f0aec6f6b1eeba541b2b75cb0df Mon Sep 17 00:00:00 2001 From: Haoran Qiu Date: Fri, 15 May 2026 17:32:20 -0700 Subject: [PATCH 01/39] Refactor: move model provisioning policies to streamwise/model_provisioner/ Move the 6 policy/allocator files (greedy, milp, naive_baseline, hexgen, helix, policies) from simulator/ into streamwise/model_provisioner/ so they can be reused by both the simulator evaluation framework and the StreamWise serving system. - Create streamwise/model_provisioner/ package with __init__.py that adds simulator/ to sys.path for foundation module access - Create simulator/__init__.py that adds streamwise/ to sys.path so model_provisioner is importable from simulator code - Update all imports across simulator files and 20 test files - Switch data_loading.py to use Path instead of str for data_dir params - Fix mypy issue in wrapper/run_httpserver.py (bytearray assignment) - Add .venv to .flake8 exclude and .gitignore Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .flake8 | 1 + .gitignore | 3 +++ simulator/__init__.py | 15 ++++++++++++++ simulator/actions.py | 2 +- simulator/auto_model_allocator.py | 12 +++++------ simulator/data_loading.py | 12 ++++++----- simulator/model_allocator.py | 2 +- simulator/multirequests.py | 2 +- simulator/provisioning.py | 2 +- streamwise/model_provisioner/__init__.py | 15 ++++++++++++++ .../model_provisioner}/greedy.py | 6 +++--- .../model_provisioner}/helix.py | 6 +++--- .../model_provisioner}/hexgen.py | 8 ++++---- .../model_provisioner}/milp.py | 2 +- .../model_provisioner}/naive_baseline.py | 4 ++-- .../model_provisioner}/policies.py | 0 tests/simulator/test_auto_model_allocator.py | 20 +++++++++---------- tests/simulator/test_data_loading.py | 2 +- tests/simulator/test_evaluator.py | 4 ++-- tests/simulator/test_greedy.py | 6 +++--- tests/simulator/test_helix.py | 6 +++--- tests/simulator/test_hexgen.py | 8 ++++---- tests/simulator/test_milp.py | 6 +++--- tests/simulator/test_models.py | 6 +++--- tests/simulator/test_multirequests_derive.py | 2 +- tests/simulator/test_simulator.py | 8 ++++---- tests/simulator/test_simulator_actions.py | 2 +- tests/simulator/test_simulator_baseline.py | 12 +++++------ tests/simulator/test_simulator_energy.py | 10 +++++----- .../simulator/test_simulator_multirequests.py | 2 +- tests/simulator/test_simulator_plotutils.py | 2 +- tests/simulator/test_simulator_policies.py | 6 +++--- .../simulator/test_simulator_provisioning.py | 8 ++++---- tests/simulator/test_simulator_types.py | 4 ++-- tests/simulator/test_simulator_utils.py | 2 +- tests/simulator/test_workflows.py | 4 ++-- wrapper/run_httpserver.py | 4 ++-- 37 files changed, 126 insertions(+), 90 deletions(-) create mode 100644 streamwise/model_provisioner/__init__.py rename {simulator => streamwise/model_provisioner}/greedy.py (99%) rename {simulator => streamwise/model_provisioner}/helix.py (99%) rename {simulator => streamwise/model_provisioner}/hexgen.py (99%) rename {simulator => streamwise/model_provisioner}/milp.py (99%) rename {simulator => streamwise/model_provisioner}/naive_baseline.py (99%) rename {simulator => streamwise/model_provisioner}/policies.py (100%) diff --git a/.flake8 b/.flake8 index 13cb9ba1..b32f0349 100644 --- a/.flake8 +++ b/.flake8 @@ -3,3 +3,4 @@ max-line-length = 120 # Ignore E402: module-level import not at top of file # Ignore W503: line break before binary operator (incompatible with W504) ignore = E402,W503 +exclude = .venv diff --git a/.gitignore b/.gitignore index 51130c5b..9807bf14 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,9 @@ *.sln.docstates *.env +# Environment files +.venv/ + # User-specific files (MonoDevelop/Xamarin Studio) *.userprefs diff --git a/simulator/__init__.py b/simulator/__init__.py index e69de29b..263309ff 100644 --- a/simulator/__init__.py +++ b/simulator/__init__.py @@ -0,0 +1,15 @@ +""" +Simulator package β€” provisioning sweeps, multi-request analysis, and plotting +on top of the model_provisioner allocation policies. + +The allocation policy implementations live in ``streamwise/model_provisioner/``. +""" +import os +import sys + +# Make model_provisioner importable for simulator modules. +_STREAMWISE_DIR = os.path.normpath( + os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "streamwise") +) +if _STREAMWISE_DIR not in sys.path: + sys.path.insert(0, _STREAMWISE_DIR) diff --git a/simulator/actions.py b/simulator/actions.py index debea677..69af1618 100644 --- a/simulator/actions.py +++ b/simulator/actions.py @@ -27,7 +27,7 @@ from sim_types import Objective from sim_types import Policy -from policies import STREAMWISE_POLICY +from model_provisioner.policies import STREAMWISE_POLICY from models import get_model_allocation diff --git a/simulator/auto_model_allocator.py b/simulator/auto_model_allocator.py index ea0fda61..3ca86cb7 100644 --- a/simulator/auto_model_allocator.py +++ b/simulator/auto_model_allocator.py @@ -19,7 +19,7 @@ from sim_types import GPUType from sim_types import Result -from policies import STREAMWISE_POLICY +from model_provisioner.policies import STREAMWISE_POLICY from model_allocator import ModelAllocator @@ -47,7 +47,7 @@ def __init__( def _build_allocator(self) -> ModelAllocator: """Create concrete allocator based on configured solver.""" if self.policy.solver == Solver.GREEDY: - from greedy import GreedyAllocator + from model_provisioner.greedy import GreedyAllocator return GreedyAllocator( workflow=self.workflow, latency_data=self.latency_data, @@ -55,7 +55,7 @@ def _build_allocator(self) -> ModelAllocator: policy=self.policy, ) if self.policy.solver == Solver.NAIVE: - from naive_baseline import NaiveAllocator + from model_provisioner.naive_baseline import NaiveAllocator return NaiveAllocator( workflow=self.workflow, latency_data=self.latency_data, @@ -63,7 +63,7 @@ def _build_allocator(self) -> ModelAllocator: policy=self.policy, ) if self.policy.solver in {Solver.GUROBI, Solver.HIGHS}: - from milp import MILPAllocator + from model_provisioner.milp import MILPAllocator return MILPAllocator( workflow=self.workflow, latency_data=self.latency_data, @@ -71,7 +71,7 @@ def _build_allocator(self) -> ModelAllocator: policy=self.policy, ) if self.policy.solver == Solver.HEXGEN: - from hexgen import HexGenAllocator + from model_provisioner.hexgen import HexGenAllocator return HexGenAllocator( workflow=self.workflow, latency_data=self.latency_data, @@ -79,7 +79,7 @@ def _build_allocator(self) -> ModelAllocator: policy=self.policy, ) if self.policy.solver == Solver.HELIX: - from helix import HelixAllocator + from model_provisioner.helix import HelixAllocator return HelixAllocator( workflow=self.workflow, latency_data=self.latency_data, diff --git a/simulator/data_loading.py b/simulator/data_loading.py index 6ee59ec5..af37e5b8 100644 --- a/simulator/data_loading.py +++ b/simulator/data_loading.py @@ -28,15 +28,17 @@ from constants import POWER_GPU_IDLE from constants import POWER_GPU_TDP +_DEFAULT_DATA_DIR = Path(__file__).resolve().parent / "data" + def load_latency_data( - data_dir: str = "data/", + data_dir: str | Path = _DEFAULT_DATA_DIR, ) -> LatencyData: """ Load latency and throughput mapping data from CSV files. Args: - data_dir (str): The directory where the CSV files are stored. + data_dir: The directory where the CSV files are stored. Returns: LatencyData: An object containing all loaded latency data. """ @@ -107,13 +109,13 @@ def load_latency_data( def load_power_data( - data_dir: str = "data/" + data_dir: str | Path = _DEFAULT_DATA_DIR ) -> PowerData: """ Load power consumption data from CSV files. Args: - data_dir (str): The directory where the CSV files are stored. + data_dir: The directory where the CSV files are stored. Returns: PowerData: An object containing all loaded power consumption data. """ @@ -216,7 +218,7 @@ def load_power_data( def load_adaptive_quality_data( - data_dir: str, + data_dir: str | Path, level: QualityLevel, ) -> LatencyData: """Load latency data for adaptive quality.""" diff --git a/simulator/model_allocator.py b/simulator/model_allocator.py index ab1c7e39..0f773a51 100644 --- a/simulator/model_allocator.py +++ b/simulator/model_allocator.py @@ -27,7 +27,7 @@ from models import UpscalerModelAllocation from models import OthersModelAllocation -from policies import NAIVE_POLICY +from model_provisioner.policies import NAIVE_POLICY class ModelAllocator(ABC): diff --git a/simulator/multirequests.py b/simulator/multirequests.py index 4fee5d55..a8d87a8b 100644 --- a/simulator/multirequests.py +++ b/simulator/multirequests.py @@ -18,7 +18,7 @@ from workflows import PODCAST_WORKFLOW -from policies import STREAMWISE_POLICY +from model_provisioner.policies import STREAMWISE_POLICY from auto_model_allocator import AutoModelAllocator diff --git a/simulator/provisioning.py b/simulator/provisioning.py index 43612b53..dd4f2a89 100644 --- a/simulator/provisioning.py +++ b/simulator/provisioning.py @@ -33,7 +33,7 @@ from auto_model_allocator import AutoModelAllocator -from policies import STREAMWISE_POLICY +from model_provisioner.policies import STREAMWISE_POLICY from constants import SECONDS_IN_HOUR diff --git a/streamwise/model_provisioner/__init__.py b/streamwise/model_provisioner/__init__.py new file mode 100644 index 00000000..c79b0cde --- /dev/null +++ b/streamwise/model_provisioner/__init__.py @@ -0,0 +1,15 @@ +""" +Model Provisioner β€” allocation policy implementations for GPU resource distribution. + +Contains greedy, naive, MILP, HexGen, and Helix allocation strategies. +The foundation types (sim_types, constants, models, etc.) live in simulator/. +""" +import os +import sys + +# Add simulator/ to sys.path so policy files can import foundation modules. +_SIMULATOR_DIR = os.path.normpath( + os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "..", "simulator") +) +if _SIMULATOR_DIR not in sys.path: + sys.path.insert(0, _SIMULATOR_DIR) diff --git a/simulator/greedy.py b/streamwise/model_provisioner/greedy.py similarity index 99% rename from simulator/greedy.py rename to streamwise/model_provisioner/greedy.py index 459742e5..8c1a1dd0 100644 --- a/simulator/greedy.py +++ b/streamwise/model_provisioner/greedy.py @@ -33,9 +33,9 @@ from model_allocator import ModelAllocator -from policies import STREAMWISE_POLICY -from policies import MAX_ITERATIONS -from policies import USE_ALL_GPUS +from .policies import STREAMWISE_POLICY +from .policies import MAX_ITERATIONS +from .policies import USE_ALL_GPUS from actions import gen_actions from actions import choose_action diff --git a/simulator/helix.py b/streamwise/model_provisioner/helix.py similarity index 99% rename from simulator/helix.py rename to streamwise/model_provisioner/helix.py index 5891538f..e8fededf 100644 --- a/simulator/helix.py +++ b/streamwise/model_provisioner/helix.py @@ -43,10 +43,10 @@ from evaluator import evaluate_model_allocation -from milp import MILPAllocator +from .milp import MILPAllocator -from policies import HELIX_POLICY -from policies import MAX_DEVICES +from .policies import HELIX_POLICY +from .policies import MAX_DEVICES from constants import DEVICE_OPTIONS diff --git a/simulator/hexgen.py b/streamwise/model_provisioner/hexgen.py similarity index 99% rename from simulator/hexgen.py rename to streamwise/model_provisioner/hexgen.py index 64c64160..4f37768a 100644 --- a/simulator/hexgen.py +++ b/streamwise/model_provisioner/hexgen.py @@ -30,15 +30,15 @@ from evaluator import calc_used_gpus from evaluator import evaluate_model_allocation -from greedy import GreedyAllocator +from .greedy import GreedyAllocator from actions import gen_actions from actions import choose_action from actions import apply_action -from policies import HEXGEN_POLICY -from policies import MAX_ITERATIONS -from policies import USE_ALL_GPUS +from .policies import HEXGEN_POLICY +from .policies import MAX_ITERATIONS +from .policies import USE_ALL_GPUS def _get_model_order(workflow: WorkflowConfig) -> list[Model]: diff --git a/simulator/milp.py b/streamwise/model_provisioner/milp.py similarity index 99% rename from simulator/milp.py rename to streamwise/model_provisioner/milp.py index 7a84e754..67749258 100644 --- a/simulator/milp.py +++ b/streamwise/model_provisioner/milp.py @@ -40,7 +40,7 @@ from constants import NUM_GPUS_PER_SERVER from constants import SECONDS_IN_HOUR -from policies import STREAMWISE_MILP_POLICY +from .policies import STREAMWISE_MILP_POLICY MAX_INSTANCES = 16 diff --git a/simulator/naive_baseline.py b/streamwise/model_provisioner/naive_baseline.py similarity index 99% rename from simulator/naive_baseline.py rename to streamwise/model_provisioner/naive_baseline.py index 9f9c550c..ec95904e 100644 --- a/simulator/naive_baseline.py +++ b/streamwise/model_provisioner/naive_baseline.py @@ -31,8 +31,8 @@ from evaluator import evaluate_model_allocation -from policies import NAIVE_POLICY -from policies import MAX_DEVICES +from .policies import NAIVE_POLICY +from .policies import MAX_DEVICES from model_allocator import ModelAllocator diff --git a/simulator/policies.py b/streamwise/model_provisioner/policies.py similarity index 100% rename from simulator/policies.py rename to streamwise/model_provisioner/policies.py diff --git a/tests/simulator/test_auto_model_allocator.py b/tests/simulator/test_auto_model_allocator.py index a9aa17d6..f7550822 100644 --- a/tests/simulator/test_auto_model_allocator.py +++ b/tests/simulator/test_auto_model_allocator.py @@ -23,7 +23,7 @@ from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): +with temp_sys_path("simulator", "streamwise"): from sim_types import GPUType from sim_types import Model from sim_types import QualityLevel @@ -33,18 +33,18 @@ from data_loading import load_latency_data - from policies import STREAMWISE_POLICY - from policies import NAIVE_POLICY - from policies import HEXGEN_POLICY - from policies import HELIX_POLICY + from model_provisioner.policies import STREAMWISE_POLICY + from model_provisioner.policies import NAIVE_POLICY + from model_provisioner.policies import HEXGEN_POLICY + from model_provisioner.policies import HELIX_POLICY from auto_model_allocator import AutoModelAllocator - from greedy import GreedyAllocator - from naive_baseline import NaiveAllocator - from hexgen import HexGenAllocator - from helix import HelixAllocator - from milp import MILPAllocator + from model_provisioner.greedy import GreedyAllocator + from model_provisioner.naive_baseline import NaiveAllocator + from model_provisioner.hexgen import HexGenAllocator + from model_provisioner.helix import HelixAllocator + from model_provisioner.milp import MILPAllocator from workflows import PODCAST_WORKFLOW diff --git a/tests/simulator/test_data_loading.py b/tests/simulator/test_data_loading.py index 129a2f3b..72337375 100644 --- a/tests/simulator/test_data_loading.py +++ b/tests/simulator/test_data_loading.py @@ -11,7 +11,7 @@ from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): +with temp_sys_path("simulator", "streamwise"): from sim_types import QualityLevel from data_loading import load_latency_data diff --git a/tests/simulator/test_evaluator.py b/tests/simulator/test_evaluator.py index a162e99b..b3c37e73 100644 --- a/tests/simulator/test_evaluator.py +++ b/tests/simulator/test_evaluator.py @@ -8,7 +8,7 @@ from tests.test_utils import assert_equals_approx from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): +with temp_sys_path("simulator", "streamwise"): from constants import DEFAULT_WORKFLOW_CONFIG from constants import SECONDS_IN_HOUR @@ -20,7 +20,7 @@ from evaluator import evaluate_model_allocation - from policies import STREAMWISE_POLICY + from model_provisioner.policies import STREAMWISE_POLICY from models import FluxModelAllocation from models import GemmaModelAllocation diff --git a/tests/simulator/test_greedy.py b/tests/simulator/test_greedy.py index c33d6991..bfa2996e 100644 --- a/tests/simulator/test_greedy.py +++ b/tests/simulator/test_greedy.py @@ -8,7 +8,7 @@ from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): +with temp_sys_path("simulator", "streamwise"): from constants import DEFAULT_WORKFLOW_CONFIG from constants import SECONDS_IN_HOUR @@ -21,9 +21,9 @@ from data_loading import load_latency_data from data_loading import load_power_data - from greedy import GreedyAllocator + from model_provisioner.greedy import GreedyAllocator - from policies import STREAMWISE_POLICY + from model_provisioner.policies import STREAMWISE_POLICY def test_allocate_8A_8H() -> None: diff --git a/tests/simulator/test_helix.py b/tests/simulator/test_helix.py index a336595d..7261b902 100644 --- a/tests/simulator/test_helix.py +++ b/tests/simulator/test_helix.py @@ -12,7 +12,7 @@ from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): +with temp_sys_path("simulator", "streamwise"): from constants import DEFAULT_WORKFLOW_CONFIG from sim_types import GPUType from sim_types import Model @@ -20,8 +20,8 @@ from sim_types import Solver from data_loading import load_latency_data from data_loading import load_power_data - from helix import HelixAllocator - from policies import HELIX_POLICY + from model_provisioner.helix import HelixAllocator + from model_provisioner.policies import HELIX_POLICY def test_get_model_order() -> None: diff --git a/tests/simulator/test_hexgen.py b/tests/simulator/test_hexgen.py index 99e7eef5..3d77867b 100644 --- a/tests/simulator/test_hexgen.py +++ b/tests/simulator/test_hexgen.py @@ -7,12 +7,12 @@ from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): +with temp_sys_path("simulator", "streamwise"): from constants import DEFAULT_WORKFLOW_CONFIG from sim_types import GPUType from data_loading import load_latency_data - from hexgen import HexGenAllocator - from hexgen import _get_model_order + from model_provisioner.hexgen import HexGenAllocator + from model_provisioner.hexgen import _get_model_order from sim_types import MODEL_ORDER @@ -154,7 +154,7 @@ def test_no_gpus_error() -> None: def test_is_subclass_of_greedy() -> None: """HexGenAllocator should extend GreedyAllocator.""" - from greedy import GreedyAllocator + from model_provisioner.greedy import GreedyAllocator latency_data = load_latency_data("simulator/data/") allocator = HexGenAllocator( workflow=DEFAULT_WORKFLOW_CONFIG, diff --git a/tests/simulator/test_milp.py b/tests/simulator/test_milp.py index 70c4bfa8..52a308bd 100644 --- a/tests/simulator/test_milp.py +++ b/tests/simulator/test_milp.py @@ -13,7 +13,7 @@ from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): +with temp_sys_path("simulator", "streamwise"): from sim_types import LatencyData from sim_types import PowerData from sim_types import GPUType @@ -27,11 +27,11 @@ from constants import DEFAULT_WORKFLOW_CONFIG from constants import SECONDS_IN_HOUR - from policies import STREAMWISE_MILP_POLICY + from model_provisioner.policies import STREAMWISE_MILP_POLICY from workflows import WORKFLOWS - from milp import MILPAllocator + from model_provisioner.milp import MILPAllocator from evaluator import evaluate_model_allocation diff --git a/tests/simulator/test_models.py b/tests/simulator/test_models.py index 57e00a0a..eccb449b 100644 --- a/tests/simulator/test_models.py +++ b/tests/simulator/test_models.py @@ -16,7 +16,7 @@ from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): +with temp_sys_path("simulator", "streamwise"): from sim_types import GPUType from sim_types import Model from sim_types import ModelAllocation @@ -29,8 +29,8 @@ from data_loading import load_latency_data from data_loading import load_power_data - from policies import STREAMWISE_POLICY - from policies import NAIVE_POLICY + from model_provisioner.policies import STREAMWISE_POLICY + from model_provisioner.policies import NAIVE_POLICY from models import get_model_allocation from models import _calculate_total_time diff --git a/tests/simulator/test_multirequests_derive.py b/tests/simulator/test_multirequests_derive.py index 8e7ed798..c809ccd0 100644 --- a/tests/simulator/test_multirequests_derive.py +++ b/tests/simulator/test_multirequests_derive.py @@ -7,7 +7,7 @@ from tests.test_utils import assert_equal_dict from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): +with temp_sys_path("simulator", "streamwise"): from sim_types import GPUType from sim_types import Model from sim_types import QualityLevel diff --git a/tests/simulator/test_simulator.py b/tests/simulator/test_simulator.py index fc791151..d698bb9d 100644 --- a/tests/simulator/test_simulator.py +++ b/tests/simulator/test_simulator.py @@ -13,7 +13,7 @@ from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): +with temp_sys_path("simulator", "streamwise"): from sim_types import WorkflowConfig from sim_types import Model from sim_types import Objective @@ -26,10 +26,10 @@ from data_loading import load_power_data from auto_model_allocator import AutoModelAllocator - from greedy import GreedyAllocator + from model_provisioner.greedy import GreedyAllocator - from policies import STREAMWISE_POLICY - from policies import NAIVE_POLICY + from model_provisioner.policies import STREAMWISE_POLICY + from model_provisioner.policies import NAIVE_POLICY def test_estimate_total_time() -> None: diff --git a/tests/simulator/test_simulator_actions.py b/tests/simulator/test_simulator_actions.py index dd3bf4fd..539946c5 100644 --- a/tests/simulator/test_simulator_actions.py +++ b/tests/simulator/test_simulator_actions.py @@ -7,7 +7,7 @@ from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): +with temp_sys_path("simulator", "streamwise"): from sim_types import Action from sim_types import ActionName from sim_types import GPUType diff --git a/tests/simulator/test_simulator_baseline.py b/tests/simulator/test_simulator_baseline.py index 64282777..b195a1cf 100644 --- a/tests/simulator/test_simulator_baseline.py +++ b/tests/simulator/test_simulator_baseline.py @@ -11,7 +11,7 @@ from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): +with temp_sys_path("simulator", "streamwise"): from sim_types import GPUType from sim_types import Model @@ -24,12 +24,12 @@ from data_loading import load_power_data from auto_model_allocator import AutoModelAllocator - from naive_baseline import NaiveAllocator - from greedy import GreedyAllocator + from model_provisioner.naive_baseline import NaiveAllocator + from model_provisioner.greedy import GreedyAllocator - from policies import NAIVE_POLICY - from policies import BASELINE_POLICIES - from policies import STREAMWISE_POLICY + from model_provisioner.policies import NAIVE_POLICY + from model_provisioner.policies import BASELINE_POLICIES + from model_provisioner.policies import STREAMWISE_POLICY from workflows import SHORTS_WORKFLOW from workflows import WORKFLOWS diff --git a/tests/simulator/test_simulator_energy.py b/tests/simulator/test_simulator_energy.py index 16b6e8bf..c96fd128 100644 --- a/tests/simulator/test_simulator_energy.py +++ b/tests/simulator/test_simulator_energy.py @@ -9,7 +9,7 @@ from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): +with temp_sys_path("simulator", "streamwise"): from constants import DEFAULT_WORKFLOW_CONFIG from sim_types import GPUType @@ -21,11 +21,11 @@ from data_loading import load_power_data from auto_model_allocator import AutoModelAllocator - from greedy import GreedyAllocator - from naive_baseline import NaiveAllocator + from model_provisioner.greedy import GreedyAllocator + from model_provisioner.naive_baseline import NaiveAllocator - from policies import NAIVE_POLICY - from policies import STREAMWISE_POLICY + from model_provisioner.policies import NAIVE_POLICY + from model_provisioner.policies import STREAMWISE_POLICY def test_energy() -> None: diff --git a/tests/simulator/test_simulator_multirequests.py b/tests/simulator/test_simulator_multirequests.py index 972596ec..6403baba 100644 --- a/tests/simulator/test_simulator_multirequests.py +++ b/tests/simulator/test_simulator_multirequests.py @@ -7,7 +7,7 @@ from tests.test_utils import assert_equals_approx from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): +with temp_sys_path("simulator", "streamwise"): from multirequests import QPM_LIST from multirequests import get_replicas from multirequests import get_costs diff --git a/tests/simulator/test_simulator_plotutils.py b/tests/simulator/test_simulator_plotutils.py index cee69368..b3bdead9 100644 --- a/tests/simulator/test_simulator_plotutils.py +++ b/tests/simulator/test_simulator_plotutils.py @@ -6,7 +6,7 @@ from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): +with temp_sys_path("simulator", "streamwise"): from plot_utils import plot_ttff_vs_cost from plot_utils import plot_ttff_vs_energy from plot_utils import plot_adaptive_quality diff --git a/tests/simulator/test_simulator_policies.py b/tests/simulator/test_simulator_policies.py index ffab5ba0..d9e1421f 100644 --- a/tests/simulator/test_simulator_policies.py +++ b/tests/simulator/test_simulator_policies.py @@ -11,9 +11,9 @@ from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): - from policies import STREAMWISE_POLICY - from policies import BASELINE_POLICIES +with temp_sys_path("simulator", "streamwise"): + from model_provisioner.policies import STREAMWISE_POLICY + from model_provisioner.policies import BASELINE_POLICIES from sim_types import Objective diff --git a/tests/simulator/test_simulator_provisioning.py b/tests/simulator/test_simulator_provisioning.py index 6bd142ae..fb5d46fd 100644 --- a/tests/simulator/test_simulator_provisioning.py +++ b/tests/simulator/test_simulator_provisioning.py @@ -7,7 +7,7 @@ from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): +with temp_sys_path("simulator", "streamwise"): from constants import DEFAULT_WORKFLOW_CONFIG from provisioning import get_provisioning_results @@ -23,9 +23,9 @@ from data_loading import load_latency_data - from policies import NAIVE_POLICY - from policies import STREAMWISE_POLICY - from policies import HEXGEN_POLICY + from model_provisioner.policies import NAIVE_POLICY + from model_provisioner.policies import STREAMWISE_POLICY + from model_provisioner.policies import HEXGEN_POLICY @pytest.mark.parametrize("gpu_type", [gpu_type for gpu_type in GPUType]) diff --git a/tests/simulator/test_simulator_types.py b/tests/simulator/test_simulator_types.py index 8bfc292f..223a3260 100644 --- a/tests/simulator/test_simulator_types.py +++ b/tests/simulator/test_simulator_types.py @@ -8,7 +8,7 @@ from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): +with temp_sys_path("simulator", "streamwise"): from sim_types import Model from sim_types import GPUType @@ -20,7 +20,7 @@ from models import GemmaModelAllocation from models import FluxModelAllocation - from policies import STREAMWISE_POLICY + from model_provisioner.policies import STREAMWISE_POLICY from workflows import PODCAST_WORKFLOW diff --git a/tests/simulator/test_simulator_utils.py b/tests/simulator/test_simulator_utils.py index 9711a696..b78d675d 100644 --- a/tests/simulator/test_simulator_utils.py +++ b/tests/simulator/test_simulator_utils.py @@ -6,7 +6,7 @@ from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): +with temp_sys_path("simulator", "streamwise"): from sim_types import Model from sim_types import GPUType from sim_types import ModelAllocation diff --git a/tests/simulator/test_workflows.py b/tests/simulator/test_workflows.py index bff7ed56..b38dc2ab 100644 --- a/tests/simulator/test_workflows.py +++ b/tests/simulator/test_workflows.py @@ -15,7 +15,7 @@ from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): +with temp_sys_path("simulator", "streamwise"): from sim_types import WorkflowConfig, Model, QualityLevel, GPUType from constants import ( FPS, @@ -28,7 +28,7 @@ ) from data_loading import load_latency_data from auto_model_allocator import AutoModelAllocator - from policies import STREAMWISE_POLICY, NAIVE_POLICY + from model_provisioner.policies import STREAMWISE_POLICY, NAIVE_POLICY from workflows import ( MAX_FT_FRAMES, SUBSCENE_SECONDS, diff --git a/wrapper/run_httpserver.py b/wrapper/run_httpserver.py index a9ec16ad..6ca398fe 100644 --- a/wrapper/run_httpserver.py +++ b/wrapper/run_httpserver.py @@ -1266,8 +1266,8 @@ async def send_task(gen_task: dict) -> None: try: payload_bytes = await asyncio.to_thread(pickle.dumps, gen_task) - payload_bytes = bytearray(payload_bytes) - payload_tensor = torch.frombuffer(payload_bytes, dtype=torch.uint8).to("cuda") + payload_buffer = bytearray(payload_bytes) + payload_tensor = torch.frombuffer(payload_buffer, dtype=torch.uint8).to("cuda") payload_size = torch.tensor([payload_tensor.numel()], dtype=torch.int64, device="cuda") if payload_size.item() > MAX_PAYLOAD_BYTES: From 3c324f712d7ca3ef8be622026fea622ddff2aa87 Mon Sep 17 00:00:00 2001 From: Haoran Qiu Date: Sat, 16 May 2026 16:32:27 -0700 Subject: [PATCH 02/39] Fix model_provisioner __init__.py to support Docker layout Support both local dev (../../simulator) and Docker (../simulator) paths when resolving the simulator directory for foundation module imports. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- streamwise/model_provisioner/__init__.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/streamwise/model_provisioner/__init__.py b/streamwise/model_provisioner/__init__.py index c79b0cde..9d75609c 100644 --- a/streamwise/model_provisioner/__init__.py +++ b/streamwise/model_provisioner/__init__.py @@ -8,8 +8,13 @@ import sys # Add simulator/ to sys.path so policy files can import foundation modules. -_SIMULATOR_DIR = os.path.normpath( - os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "..", "simulator") -) -if _SIMULATOR_DIR not in sys.path: - sys.path.insert(0, _SIMULATOR_DIR) +# Supports both local dev layout (../../simulator) and Docker layout (../simulator). +_HERE = os.path.dirname(os.path.abspath(__file__)) +_CANDIDATES = [ + os.path.normpath(os.path.join(_HERE, "..", "..", "simulator")), + os.path.normpath(os.path.join(_HERE, "..", "simulator")), +] +for _path in _CANDIDATES: + if os.path.isdir(_path) and _path not in sys.path: + sys.path.insert(0, _path) + break From 8f9bc0b8fc5db35fc2a6781ef0d89f0432b081f0 Mon Sep 17 00:00:00 2001 From: Haoran Qiu Date: Fri, 15 May 2026 17:34:27 -0700 Subject: [PATCH 03/39] Add auto-deploy feature to StreamWise dashboard Add an 'Auto Deploy' button to the web dashboard that automatically optimizes GPU resource allocation across workflow components using the model provisioner's greedy allocator. - Add streamwise/allocator_bridge.py: maps allocator output to K8s deployment parameters (Model enum -> container names, GPU specs) - Add /api/auto_deploy and /api/auto_deploy/confirm routes to streamwise.py for computing and confirming deployment plans - Add auto-deploy UI section to add_pod.html with GPU budget inputs, workflow selector, and deployment plan preview - Add comprehensive tests for allocator bridge and auto-deploy API Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- streamwise/allocator_bridge.py | 250 ++++++++++++++++ streamwise/streamwise.py | 118 ++++++++ streamwise/templates/add_pod.html | 190 ++++++++++++ tests/streamwise/test_allocator_bridge.py | 282 ++++++++++++++++++ .../streamwise/test_streamwise_auto_deploy.py | 226 ++++++++++++++ 5 files changed, 1066 insertions(+) create mode 100644 streamwise/allocator_bridge.py create mode 100644 tests/streamwise/test_allocator_bridge.py create mode 100644 tests/streamwise/test_streamwise_auto_deploy.py diff --git a/streamwise/allocator_bridge.py b/streamwise/allocator_bridge.py new file mode 100644 index 00000000..b1e610d2 --- /dev/null +++ b/streamwise/allocator_bridge.py @@ -0,0 +1,250 @@ +""" +Bridge between the model provisioner's allocator output and StreamWise pod deployment. + +Translates ModelAllocation results (abstract Model enum + GPU counts) into concrete +container deployment parameters compatible with pod_manager.add_pod(). +""" + +from __future__ import annotations + +import os + +import model_provisioner # noqa: F401 β€” adds simulator/ to sys.path + +from dataclasses import dataclass +from typing import Optional + +from sim_types import GPUType +from sim_types import Model +from sim_types import Result + +from auto_model_allocator import AutoModelAllocator +from data_loading import load_latency_data +from model_provisioner.policies import STREAMWISE_POLICY +from workflows import WORKFLOWS + + +# Mapping from simulator Model enum to concrete container names used by pod_manager. +# Some Model entries map to multiple containers (e.g., OTHERS -> kokoro + yolo). +MODEL_TO_CONTAINERS: dict[Model, list[str]] = { + Model.GEMMA: ["gemma"], + Model.FLUX: ["flux"], + Model.HF: ["hunyuanframepackf1"], + Model.HF_VAE: ["hunyuanframepackvae"], + Model.FT: ["fantasytalking"], + Model.FT_VAE: [], # FT_VAE is handled within fantasytalking container + Model.UPSCALER: ["realesrgan"], + Model.OTHERS: ["kokoro", "yolo"], +} + +# Default CPU/memory/storage for each container when deployed via auto-deploy. +# Format: (cpu_cores, memory_gib, ephemeral_storage_gib) +CONTAINER_RESOURCES: dict[str, tuple[int, int, int]] = { + "gemma": (16, 192, 64), + "flux": (12, 128, 64), + "hunyuanframepackf1": (24, 128, 64), + "hunyuanframepackvae": (4, 32, 16), + "fantasytalking": (12, 192, 64), + "realesrgan": (4, 32, 16), + "kokoro": (2, 8, 16), + "yolo": (4, 8, 16), +} + +# GPU type string used by pod_manager (lowercase) +GPU_TYPE_TO_POD_STR: dict[GPUType, str] = { + GPUType.A100: "a100", + GPUType.H100: "h100", + GPUType.H200: "h200", + GPUType.GB200: "gb200", +} + +# MIG containers: these use a MIG slice instead of a full GPU +MIG_CONTAINERS: dict[str, str] = { + "kokoro": "1g.10gb", + "yolo": "1g.10gb", + "realesrgan": "1g.10gb", +} + +# Mapping from StreamWise app name to simulator workflow key +APP_TO_WORKFLOW: dict[str, str] = { + "streamcast": "podcast", + "streampersona": "slide", + "streamchat": "chat", + "streamshort": "short", + "streammovie": "movie", + "streamanimate": "story", + "streamlecture": "lecture", + "streamdub": "dubbing", + "streamedit": "editing", +} + + +@dataclass +class DeploymentSpec: + """A single container deployment specification.""" + container_name: str + cpu: int + memory_gib: int + ephemeral_storage_gib: int + gpu: int + gpu_type: Optional[str] + mig_profile: Optional[str] + + +@dataclass +class DeploymentPlan: + """Complete deployment plan produced by the auto-allocator.""" + specs: list[DeploymentSpec] + result: Result + workflow_name: str + gpu_budget: dict[str, int] + + +def _get_data_dir() -> str: + """Get the path to the simulator data directory.""" + default_path = os.path.join(os.path.dirname(__file__), "..", "simulator", "data") + return os.getenv("SIMULATOR_DATA_DIR", default_path) + + +def get_available_workflows() -> list[str]: + """Return list of available workflow names for the UI.""" + return list(APP_TO_WORKFLOW.keys()) + + +def get_available_gpu_types() -> list[str]: + """Return list of available GPU type strings for the UI.""" + return [gpu_type.value for gpu_type in GPUType] + + +def run_allocator( + gpu_budget: dict[str, int], + workflow_name: str, +) -> DeploymentPlan: + """ + Run the greedy model allocator and return a deployment plan. + + Args: + gpu_budget: GPU counts keyed by GPU type string (e.g., {"A100": 8, "H100": 0}). + workflow_name: StreamWise app name (e.g., "streamcast"). + + Returns: + DeploymentPlan with concrete container deployment specs. + + Raises: + ValueError: If workflow_name or GPU types are invalid. + """ + # Validate workflow + workflow_key = APP_TO_WORKFLOW.get(workflow_name) + if workflow_key is None: + raise ValueError( + f"Unknown workflow '{workflow_name}'. " + f"Available: {list(APP_TO_WORKFLOW.keys())}") + + workflow = WORKFLOWS[workflow_key] + + # Parse GPU budget into GPUType enum + num_gpus: dict[GPUType, int] = {} + for gpu_str, count in gpu_budget.items(): + try: + gpu_type = GPUType(gpu_str) + except ValueError: + raise ValueError( + f"Unknown GPU type '{gpu_str}'. " + f"Available: {[g.value for g in GPUType]}") + if count > 0: + num_gpus[gpu_type] = count + + if not num_gpus or sum(num_gpus.values()) < 8: + raise ValueError("Total GPU budget must be at least 8 GPUs.") + + # Load latency data and run allocator + data_dir = _get_data_dir() + latency_data = load_latency_data(data_dir=data_dir) + + allocator = AutoModelAllocator( + workflow=workflow, + latency_data=latency_data, + policy=STREAMWISE_POLICY, + ) + + result = allocator.allocate(num_gpus=num_gpus, verbose=False) + + # Convert result to deployment specs + specs = result_to_deployment_specs(result) + + return DeploymentPlan( + specs=specs, + result=result, + workflow_name=workflow_name, + gpu_budget=gpu_budget, + ) + + +def result_to_deployment_specs(result: Result) -> list[DeploymentSpec]: + """ + Convert an allocator Result into a list of DeploymentSpec objects. + + Each ModelAllocation with replicas > 0 is mapped to one or more container deployments. + """ + specs: list[DeploymentSpec] = [] + + for gpu_type, model_dict in result.models.items(): + gpu_type_str = GPU_TYPE_TO_POD_STR[gpu_type] + + for model, allocations in model_dict.items(): + containers = MODEL_TO_CONTAINERS.get(model, []) + if not containers: + continue + + for allocation in allocations: + if allocation.replicas <= 0: + continue + + for container_name in containers: + resources = CONTAINER_RESOURCES.get(container_name, (4, 16, 16)) + cpu, memory_gib, ephemeral_storage_gib = resources + + mig_profile = MIG_CONTAINERS.get(container_name) + gpu_count = allocation.devices if not mig_profile else 1 + + for _ in range(allocation.replicas): + specs.append(DeploymentSpec( + container_name=container_name, + cpu=cpu, + memory_gib=memory_gib, + ephemeral_storage_gib=ephemeral_storage_gib, + gpu=gpu_count, + gpu_type=gpu_type_str, + mig_profile=mig_profile, + )) + + return specs + + +def deployment_plan_to_json(plan: DeploymentPlan) -> dict: + """Serialize a DeploymentPlan to a JSON-friendly dict.""" + return { + "workflow_name": plan.workflow_name, + "gpu_budget": plan.gpu_budget, + "metrics": { + "total_time_s": round(plan.result.total_time_s, 2), + "ttff_s": round(plan.result.ttff_s, 2), + "cost": round(plan.result.cost, 4), + "gpus_used": { + gpu_type.value: count + for gpu_type, count in plan.result.gpus_used.items() + }, + }, + "specs": [ + { + "container_name": spec.container_name, + "cpu": spec.cpu, + "memory_gib": spec.memory_gib, + "ephemeral_storage_gib": spec.ephemeral_storage_gib, + "gpu": spec.gpu, + "gpu_type": spec.gpu_type, + "mig_profile": spec.mig_profile, + } + for spec in plan.specs + ], + } diff --git a/streamwise/streamwise.py b/streamwise/streamwise.py index 1c63eacf..0ce24ac5 100644 --- a/streamwise/streamwise.py +++ b/streamwise/streamwise.py @@ -34,6 +34,7 @@ import pod_manager import node_manager import job_manager +import allocator_bridge from service_manager import get_services from service_manager import get_service_timestamps @@ -726,6 +727,123 @@ async def api_add_pod() -> QuartReturn: return jsonify({"error": str(ex)}), HTTPStatus.INTERNAL_SERVER_ERROR +@route("/api/auto_deploy", methods=["POST"]) +async def api_auto_deploy() -> QuartReturn: + """Run the model allocator to produce an optimized deployment plan. + + Expects JSON body: + { + "gpu_budget": {"A100": 8, "H100": 0, ...}, + "workflow": "streamcast" + } + + Returns the deployment plan with estimated metrics and per-container specs. + """ + try: + data = await request.get_json() + if not data: + return jsonify({"error": "Request body must be JSON"}), HTTPStatus.BAD_REQUEST + + gpu_budget = data.get("gpu_budget") + workflow_name = data.get("workflow") + + if not gpu_budget or not isinstance(gpu_budget, dict): + return jsonify({"error": "Missing or invalid 'gpu_budget' field"}), HTTPStatus.BAD_REQUEST + if not workflow_name or not isinstance(workflow_name, str): + return jsonify({"error": "Missing or invalid 'workflow' field"}), HTTPStatus.BAD_REQUEST + + plan = allocator_bridge.run_allocator( + gpu_budget=gpu_budget, + workflow_name=workflow_name, + ) + return jsonify(allocator_bridge.deployment_plan_to_json(plan)), HTTPStatus.OK + + except ValueError as ve: + return jsonify({"error": str(ve)}), HTTPStatus.BAD_REQUEST + except Exception as ex: + logging.exception("Error in auto_deploy: %s", ex) + return jsonify({"error": str(ex)}), HTTPStatus.INTERNAL_SERVER_ERROR + + +@route("/api/auto_deploy/confirm", methods=["POST"]) +async def api_auto_deploy_confirm() -> QuartReturn: + """Execute a deployment plan produced by /api/auto_deploy. + + Expects JSON body: + { + "specs": [ + { + "container_name": "gemma", + "cpu": 16, + "memory_gib": 192, + "ephemeral_storage_gib": 64, + "gpu": 2, + "gpu_type": "a100", + "mig_profile": null + }, + ... + ] + } + + Deploys all containers in the plan. + """ + try: + data = await request.get_json() + if not data: + return jsonify({"error": "Request body must be JSON"}), HTTPStatus.BAD_REQUEST + + specs = data.get("specs") + if not specs or not isinstance(specs, list): + return jsonify({"error": "Missing or invalid 'specs' field"}), HTTPStatus.BAD_REQUEST + + deployed: List[str] = [] + errors: List[str] = [] + + for spec in specs: + container_name = spec.get("container_name") + if not container_name: + errors.append("Spec missing 'container_name'") + continue + + try: + await pod_manager.add_pod( + container_name=container_name, + cpu=int(spec.get("cpu", 4)), + memory_gib=int(spec.get("memory_gib", 16)), + ephemeral_storage_gib=int(spec.get("ephemeral_storage_gib", 16)), + gpu=int(spec.get("gpu", 0)), + gpu_type=spec.get("gpu_type"), + mig_profile=spec.get("mig_profile"), + namespace=NAMESPACE, + k8s_cluster=k8s_cluster, + ) + deployed.append(container_name) + except Exception as pod_ex: + msg = f"Failed to deploy '{container_name}': {pod_ex}" + logging.error(msg) + errors.append(msg) + + status = HTTPStatus.OK if not errors else HTTPStatus.MULTI_STATUS + return jsonify({ + "deployed": deployed, + "errors": errors, + "message": f"Deployed {len(deployed)}/{len(specs)} containers.", + }), status + + except Exception as ex: + logging.exception("Error in auto_deploy/confirm: %s", ex) + return jsonify({"error": str(ex)}), HTTPStatus.INTERNAL_SERVER_ERROR + + +@route("/api/auto_deploy/workflows", methods=["GET"]) +async def api_auto_deploy_workflows() -> QuartReturn: + """Return available workflows and GPU types for the auto-deploy UI.""" + return jsonify({ + "workflows": allocator_bridge.get_available_workflows(), + "gpu_types": allocator_bridge.get_available_gpu_types(), + }), HTTPStatus.OK + + @route("/api/node/", methods=["DELETE"]) async def api_remove_node(node_name: str) -> QuartReturn: return await node_manager.remove_node( diff --git a/streamwise/templates/add_pod.html b/streamwise/templates/add_pod.html index d61952aa..f5496e10 100644 --- a/streamwise/templates/add_pod.html +++ b/streamwise/templates/add_pod.html @@ -384,6 +384,94 @@

🧩 Applications

{% endif %} + +

πŸ€– Auto Deploy

+

Specify your GPU budget and the optimizer will determine the best allocation for each component:

+ +
+
+ + πŸ’° GPU Budget + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+
+ +
+ + 🎬 Workflow + +
+ + +
+
+ +
+ +
+
+ + + + + + + + + + diff --git a/streamwise/templates/index.html b/streamwise/templates/index.html index 7fc18a6c..f2b8a80a 100644 --- a/streamwise/templates/index.html +++ b/streamwise/templates/index.html @@ -244,6 +244,18 @@

πŸ“„πŸ“½οΈ StreamWise Cluster Manager πŸ”‰πŸŽ¬

{% endmacro %} +

πŸ€– Auto Deploy

+
+ + πŸ€– Auto Deploy + + Optimize and deploy all services automatically +
+

🎯 Applications

Date: Tue, 9 Jun 2026 15:34:31 -0700 Subject: [PATCH 26/39] Fix data dir path: use _HERE instead of _REPO_ROOT for container compat In Docker the parent of /streamwise is /, which produced an invalid path /simulator/data/*.csv. Using _HERE resolves to /streamwise/simulator/data/ inside the container. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- streamwise/allocator_bridge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/streamwise/allocator_bridge.py b/streamwise/allocator_bridge.py index 543eb96a..68aff990 100644 --- a/streamwise/allocator_bridge.py +++ b/streamwise/allocator_bridge.py @@ -104,7 +104,7 @@ class DeploymentPlan: def _get_data_dir() -> str: """Get the path to the simulator data directory.""" - default_path = os.path.join(_REPO_ROOT, "simulator", "data") + default_path = os.path.join(_HERE, "simulator", "data") return os.getenv("SIMULATOR_DATA_DIR", default_path) From 5d38cf8521fd9cebf05b9eb0bddf458bafe98f0c Mon Sep 17 00:00:00 2001 From: Haoran Qiu Date: Tue, 9 Jun 2026 15:38:27 -0700 Subject: [PATCH 27/39] Fix allocator: round budget up to server multiples before calling allocator The greedy allocator asserts GPU counts are multiples of NUM_GPUS_PER_SERVER (8). Now we round up for the allocator call, then trim specs back to the user's actual budget. This allows non-multiple budgets like 26 or 30. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- streamwise/allocator_bridge.py | 36 ++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/streamwise/allocator_bridge.py b/streamwise/allocator_bridge.py index 68aff990..f585d05d 100644 --- a/streamwise/allocator_bridge.py +++ b/streamwise/allocator_bridge.py @@ -210,6 +210,15 @@ def run_allocator( if not num_gpus or sum(num_gpus.values()) < 8: raise ValueError("Total GPU budget must be at least 8 GPUs.") + # The allocator requires GPU counts to be multiples of NUM_GPUS_PER_SERVER (8). + # Round up for the allocator, then trim back to the real budget afterward. + import math + from constants import NUM_GPUS_PER_SERVER + allocator_gpus: dict[GPUType, int] = {} + for gpu_type, count in num_gpus.items(): + server_size = NUM_GPUS_PER_SERVER[gpu_type] + allocator_gpus[gpu_type] = math.ceil(count / server_size) * server_size + # Load latency data and run allocator data_dir = _get_data_dir() latency_data = load_latency_data(data_dir=data_dir) @@ -220,25 +229,22 @@ def run_allocator( policy=STREAMWISE_POLICY, ) - result = allocator.allocate(num_gpus=num_gpus, verbose=False) + result = allocator.allocate(num_gpus=allocator_gpus, verbose=False) # Convert result to deployment specs specs = result_to_deployment_specs(result) - # When MIG is unavailable, deployment specs may use more GPUs per type than the - # allocator budgeted (e.g., OTHERS allocates 1 GPU but kokoro+yolo each need a - # full GPU = 2). Detect per-type overflow and trim excess replicas. - if not MIG_AVAILABLE: - actual_per_type = _calc_actual_gpus_per_type(specs) - for gpu_type, budget_count in num_gpus.items(): - actual = actual_per_type.get(gpu_type, 0) - if actual <= budget_count: - continue - # Need to trim (actual - budget_count) GPUs from this type. - # Remove replicas of the most-replicated scalable container on this type. - excess = actual - budget_count - gpu_type_str = GPU_TYPE_TO_POD_STR[gpu_type] - specs = _trim_specs_for_type(specs, gpu_type_str, excess) + # Trim deployment specs back to the user's actual budget. + # Also handles MIG-unavailable overflow (e.g., OTHERS allocates 1 GPU + # but kokoro+yolo each need a full GPU = 2). + actual_per_type = _calc_actual_gpus_per_type(specs) + for gpu_type, budget_count in num_gpus.items(): + actual = actual_per_type.get(gpu_type, 0) + if actual <= budget_count: + continue + excess = actual - budget_count + gpu_type_str = GPU_TYPE_TO_POD_STR[gpu_type] + specs = _trim_specs_for_type(specs, gpu_type_str, excess) return DeploymentPlan( specs=specs, From 6ce67364cbdea06028fa0ca691bd0973d67c24ee Mon Sep 17 00:00:00 2001 From: Haoran Qiu Date: Tue, 9 Jun 2026 15:43:17 -0700 Subject: [PATCH 28/39] Add 'Delete All' button for wrapper pods on main page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a red πŸ—‘οΈ Delete All button next to the βž• under Wrappers. Clicking it calls DELETE /api/pods/wrappers which removes all non-app pods in the rtgen namespace. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- streamwise/streamwise.py | 23 +++++++++++++++++++++++ streamwise/templates/index.html | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/streamwise/streamwise.py b/streamwise/streamwise.py index 5662e66a..b201f3e0 100644 --- a/streamwise/streamwise.py +++ b/streamwise/streamwise.py @@ -566,6 +566,29 @@ async def api_remove_pod(pod_name: str) -> QuartReturn: k8s_cluster=k8s_cluster) +@route("/api/pods/wrappers", methods=["DELETE"]) +async def api_delete_all_wrappers() -> QuartReturn: + """Delete all wrapper pods (non-app pods) in the namespace.""" + svcs = await get_services(namespace=NAMESPACE, k8s_cluster=k8s_cluster) + wrapper_pods = [ + svc["pod_name"] for svc in svcs + if svc.get("container_name") not in STREAMWISE_APPS and svc.get("pod_name") + ] + deleted = 0 + errors: list[str] = [] + for pod_name in wrapper_pods: + try: + await pod_manager.remove_pod( + pod_name, namespace=NAMESPACE, k8s_cluster=k8s_cluster) + deleted += 1 + except Exception as e: + errors.append(f"{pod_name}: {e}") + result: dict[str, object] = {"deleted": deleted, "total": len(wrapper_pods)} + if errors: + result["errors"] = errors + return jsonify(result), HTTPStatus.OK + + @route("/api/services", methods=["GET"]) async def api_get_services() -> QuartReturn: """API interface to get the list of services.""" diff --git a/streamwise/templates/index.html b/streamwise/templates/index.html index f2b8a80a..c9913f55 100644 --- a/streamwise/templates/index.html +++ b/streamwise/templates/index.html @@ -296,6 +296,15 @@

🌐 Wrappers

Add Wrapper {% endif %}
+ {% if wrapper_svcs and wrapper_svcs | length > 0 %} + + {% endif %}
{% if wrapper_svcs %} {{ svc_table(wrapper_svcs, 'rtgen-table') }} @@ -586,5 +595,28 @@

πŸ«› Pods

}); }); + From eb3b992e47196d2935c23ed1b9105692fbd722f6 Mon Sep 17 00:00:00 2001 From: Haoran Qiu Date: Tue, 9 Jun 2026 15:54:55 -0700 Subject: [PATCH 29/39] Redesign GPU budget UI: dynamic rows from cluster state Replace 4-column fixed GPU type inputs with dynamic rows (one per type). Rows auto-populate from the cluster state (e.g., 32 H100s). Users can add/remove rows; the dropdown prevents duplicate types. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- streamwise/templates/auto_deploy.html | 124 +++++++++++++++++--------- 1 file changed, 84 insertions(+), 40 deletions(-) diff --git a/streamwise/templates/auto_deploy.html b/streamwise/templates/auto_deploy.html index 3b141aeb..7f0398d1 100644 --- a/streamwise/templates/auto_deploy.html +++ b/streamwise/templates/auto_deploy.html @@ -26,28 +26,9 @@

πŸ€– Auto Deploy

πŸ’° GPU Budget -
-
- - -
-
- - -
-
- - -
-
- - -
-
+
+
@@ -108,29 +89,97 @@

πŸ“Š Optimized Deployment Plan