From b7e48e353f88c1427f30ed19d5463ec6e754c057 Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Mon, 13 Apr 2026 13:56:30 +0100 Subject: [PATCH 01/21] feat: Initial commit Signed-off-by: Alberto Perdomo --- .../llm_d/manifests/datasciencecluster.yaml | 22 + .../epp-approximate-prefix-cache.yaml | 15 + config/llm_d/manifests/gateway.yaml | 14 + config/llm_d/manifests/gpu-clusterpolicy.yaml | 37 + .../llm_d/manifests/llminferenceservice.yaml | 96 +++ .../manifests/nfd-nodefeaturediscovery.yaml | 6 + config/llm_d/models.yaml | 25 + config/llm_d/platform.yaml | 82 +++ config/llm_d/presets.yaml | 14 + config/llm_d/workloads.yaml | 20 + projects/llm_d/README.md | 310 +------- projects/llm_d/orchestration/ci.py | 6 +- projects/llm_d/orchestration/cli.py | 25 +- projects/llm_d/orchestration/llmd_runtime.py | 695 ++++++++++++++++++ projects/llm_d/orchestration/prepare_llmd.py | 430 ++++++++++- projects/llm_d/orchestration/test_llmd.py | 492 ++++++++++++- .../llm_d/toolbox/capture_isvc_state/main.py | 45 +- tests/llm_d/test_runtime.py | 208 ++++++ 18 files changed, 2164 insertions(+), 378 deletions(-) create mode 100644 config/llm_d/manifests/datasciencecluster.yaml create mode 100644 config/llm_d/manifests/epp-approximate-prefix-cache.yaml create mode 100644 config/llm_d/manifests/gateway.yaml create mode 100644 config/llm_d/manifests/gpu-clusterpolicy.yaml create mode 100644 config/llm_d/manifests/llminferenceservice.yaml create mode 100644 config/llm_d/manifests/nfd-nodefeaturediscovery.yaml create mode 100644 config/llm_d/models.yaml create mode 100644 config/llm_d/platform.yaml create mode 100644 config/llm_d/presets.yaml create mode 100644 config/llm_d/workloads.yaml mode change 100755 => 100644 projects/llm_d/orchestration/ci.py mode change 100755 => 100644 projects/llm_d/orchestration/cli.py create mode 100644 projects/llm_d/orchestration/llmd_runtime.py mode change 100755 => 100644 projects/llm_d/toolbox/capture_isvc_state/main.py create mode 100644 tests/llm_d/test_runtime.py diff --git a/config/llm_d/manifests/datasciencecluster.yaml b/config/llm_d/manifests/datasciencecluster.yaml new file mode 100644 index 00000000..fd45316d --- /dev/null +++ b/config/llm_d/manifests/datasciencecluster.yaml @@ -0,0 +1,22 @@ +apiVersion: datasciencecluster.opendatahub.io/v1 +kind: DataScienceCluster +metadata: + name: default-dsc + namespace: redhat-ods-applications +spec: + components: + codeflare: + managementState: Removed + dashboard: + managementState: Removed + datasciencepipelines: + managementState: Removed + kserve: + managementState: Managed + rawDeploymentServiceConfig: Headless + modelmeshserving: + managementState: Removed + ray: + managementState: Removed + workbenches: + managementState: Removed diff --git a/config/llm_d/manifests/epp-approximate-prefix-cache.yaml b/config/llm_d/manifests/epp-approximate-prefix-cache.yaml new file mode 100644 index 00000000..e584dcf2 --- /dev/null +++ b/config/llm_d/manifests/epp-approximate-prefix-cache.yaml @@ -0,0 +1,15 @@ +apiVersion: inference.networking.x-k8s.io/v1alpha1 +kind: EndpointPickerConfig +plugins: + - type: queue-scorer + - type: kv-cache-utilization-scorer + - type: prefix-cache-scorer +schedulingProfiles: + - name: default + plugins: + - pluginRef: queue-scorer + weight: 2 + - pluginRef: kv-cache-utilization-scorer + weight: 2 + - pluginRef: prefix-cache-scorer + weight: 3 diff --git a/config/llm_d/manifests/gateway.yaml b/config/llm_d/manifests/gateway.yaml new file mode 100644 index 00000000..dff0c398 --- /dev/null +++ 
b/config/llm_d/manifests/gateway.yaml @@ -0,0 +1,14 @@ +apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway +metadata: + name: openshift-ai-inference + namespace: openshift-ingress +spec: + gatewayClassName: data-science-gateway-class + listeners: + - name: http + port: 80 + protocol: HTTP + allowedRoutes: + namespaces: + from: All diff --git a/config/llm_d/manifests/gpu-clusterpolicy.yaml b/config/llm_d/manifests/gpu-clusterpolicy.yaml new file mode 100644 index 00000000..6a9ad7ee --- /dev/null +++ b/config/llm_d/manifests/gpu-clusterpolicy.yaml @@ -0,0 +1,37 @@ +apiVersion: nvidia.com/v1 +kind: ClusterPolicy +metadata: + name: gpu-cluster-policy +spec: + daemonsets: + tolerations: + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + updateStrategy: RollingUpdate + dcgm: + enabled: true + dcgmExporter: + enabled: true + devicePlugin: + enabled: true + driver: + enabled: true + kernelModuleType: auto + gfd: + enabled: true + mig: + strategy: single + nodeStatusExporter: + enabled: true + operator: + defaultRuntime: crio + runtimeClass: nvidia + toolkit: + enabled: true + installDir: /usr/local/nvidia + validator: + plugin: + env: + - name: WITH_WORKLOAD + value: "false" diff --git a/config/llm_d/manifests/llminferenceservice.yaml b/config/llm_d/manifests/llminferenceservice.yaml new file mode 100644 index 00000000..cff616f8 --- /dev/null +++ b/config/llm_d/manifests/llminferenceservice.yaml @@ -0,0 +1,96 @@ +apiVersion: serving.kserve.io/v1alpha1 +kind: LLMInferenceService +metadata: + name: llm-d + namespace: llm-d + annotations: + security.opendatahub.io/enable-auth: "false" + prometheus.io/path: /metrics + prometheus.io/port: "8000" +spec: + replicas: 1 + model: + uri: hf://Qwen/Qwen3-0.6B + name: Qwen/Qwen3-0.6B + router: + scheduler: + template: + containers: + - name: main + env: + - name: TOKENIZER_CACHE_DIR + value: /tmp/tokenizer-cache + - name: HF_HOME + value: /tmp/tokenizer-cache + - name: TRANSFORMERS_CACHE + value: /tmp/tokenizer-cache + - name: XDG_CACHE_HOME + value: /tmp + args: + - --cert-path + - /var/run/kserve/tls + - --pool-group + - inference.networking.x-k8s.io + - --pool-name + - "{{ ChildName .ObjectMeta.Name `-inference-pool` }}" + - --pool-namespace + - "{{ .ObjectMeta.Namespace }}" + - --zap-encoder + - json + - --grpc-port + - "9002" + - --grpc-health-port + - "9003" + - --secure-serving + - --model-server-metrics-scheme + - https + - --config-text + volumeMounts: + - name: tokenizer-cache + mountPath: /tmp/tokenizer-cache + - name: cachi2-cache + mountPath: /cachi2 + volumes: + - name: tokenizer-cache + emptyDir: {} + - name: cachi2-cache + emptyDir: {} + nodeSelector: + nvidia.com/gpu.present: "true" + route: {} + gateway: {} + template: + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + containers: + - name: main + resources: + requests: + cpu: "4" + memory: 16Gi + nvidia.com/gpu: "1" + limits: + cpu: "4" + memory: 16Gi + nvidia.com/gpu: "1" + livenessProbe: + httpGet: + path: /health + port: 8000 + scheme: HTTPS + initialDelaySeconds: 900 + periodSeconds: 60 + timeoutSeconds: 60 + failureThreshold: 1000 + readinessProbe: + failureThreshold: 10000 + httpGet: + path: /health + port: 8000 + scheme: HTTPS + initialDelaySeconds: 60 + periodSeconds: 30 + successThreshold: 1 + timeoutSeconds: 30 diff --git a/config/llm_d/manifests/nfd-nodefeaturediscovery.yaml b/config/llm_d/manifests/nfd-nodefeaturediscovery.yaml new file mode 100644 index 00000000..df19596f --- /dev/null +++ 
b/config/llm_d/manifests/nfd-nodefeaturediscovery.yaml @@ -0,0 +1,6 @@ +apiVersion: nfd.openshift.io/v1 +kind: NodeFeatureDiscovery +metadata: + name: nfd-instance + namespace: openshift-nfd +spec: {} diff --git a/config/llm_d/models.yaml b/config/llm_d/models.yaml new file mode 100644 index 00000000..46cf4bf4 --- /dev/null +++ b/config/llm_d/models.yaml @@ -0,0 +1,25 @@ +models: + + qwen3-0-6b: + served_model_name: Qwen/Qwen3-0.6B + uri: hf://Qwen/Qwen3-0.6B + resources: + requests: + cpu: "4" + memory: 16Gi + nvidia.com/gpu: "1" + limits: + cpu: "4" + memory: 16Gi + nvidia.com/gpu: "1" + + llama-3-1-8b-instruct-fp8: + served_model_name: llama-3-1-8b-instruct-fp8 + uri: oci://registry.redhat.io/rhelai1/modelcar-llama-3-1-8b-instruct-fp8-dynamic:1.5 + resources: + requests: + cpu: "4" + memory: 8Gi + nvidia.com/gpu: "1" + limits: + nvidia.com/gpu: "1" diff --git a/config/llm_d/platform.yaml b/config/llm_d/platform.yaml new file mode 100644 index 00000000..c5e35ea4 --- /dev/null +++ b/config/llm_d/platform.yaml @@ -0,0 +1,82 @@ +cluster: + minimum_openshift_version: "4.19.9" + namespace_prefix: llm-d + namespace_max_length: 63 + cleanup_timeout_seconds: 900 + gpu_node_label_selector: nvidia.com/gpu.present=true + nfd_gpu_detection_labels: + - feature.node.kubernetes.io/pci-10de.present + - feature.node.kubernetes.io/pci-0302_10de.present + - feature.node.kubernetes.io/pci-0300_10de.present + +operators: + - display_name: OpenShift Cert Manager + package: openshift-cert-manager-operator + namespace: openshift-cert-manager-operator + channel: stable-v1.18 + source: redhat-operators + wait_timeout_seconds: 900 + - display_name: Leader Worker Set + package: leader-worker-set + namespace: openshift-lws + channel: stable + source: redhat-operators + wait_timeout_seconds: 900 + - display_name: Node Feature Discovery + package: nfd + namespace: openshift-nfd + channel: stable + source: redhat-operators + wait_timeout_seconds: 900 + bootstrap_crd: nodefeaturediscoveries.nfd.openshift.io + bootstrap_manifest: manifests/nfd-nodefeaturediscovery.yaml + - display_name: NVIDIA GPU Operator + package: gpu-operator-certified + namespace: nvidia-gpu-operator + channel: stable + source: certified-operators + wait_timeout_seconds: 1800 + bootstrap_crd: clusterpolicies.nvidia.com + bootstrap_manifest: manifests/gpu-clusterpolicy.yaml + - display_name: Red Hat OpenShift AI + package: rhods-operator + namespace: redhat-ods-operator + channel: stable-3.x + source: redhat-operators + wait_timeout_seconds: 1800 + +rhoai: + namespace: redhat-ods-applications + datasciencecluster_name: default-dsc + datasciencecluster_template: manifests/datasciencecluster.yaml + wait_timeout_seconds: 1800 + required_crds_before_dsc: + - datascienceclusters.datasciencecluster.opendatahub.io + required_crds_after_dsc: + - llminferenceservices.serving.kserve.io + +gateway: + namespace: openshift-ingress + name: openshift-ai-inference + gateway_class_name: data-science-gateway-class + status_address_name: gateway-external + create_if_missing: true + manifest_template: manifests/gateway.yaml + wait_timeout_seconds: 600 + +inference_service: + name: llm-d + template: manifests/llminferenceservice.yaml + epp_config_template: manifests/epp-approximate-prefix-cache.yaml + workload_deployment_name_suffix: -kserve + pod_appearance_timeout_seconds: 600 + ready_timeout_seconds: 1800 + delete_timeout_seconds: 900 + +artifacts: + capture_namespace_events: true + +smoke: + endpoint_path: /v1/completions + request_retries: 30 + 
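+  # 30 retries x 10s delay (below) gives the endpoint up to ~5 minutes to
+  # start answering once the LLMInferenceService reports Ready.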
request_retry_delay_seconds: 10 diff --git a/config/llm_d/presets.yaml b/config/llm_d/presets.yaml new file mode 100644 index 00000000..9fdaae32 --- /dev/null +++ b/config/llm_d/presets.yaml @@ -0,0 +1,14 @@ +aliases: + cks: smoke + +presets: + + smoke: + model: qwen3-0-6b + smoke_request: default + benchmark: null + + benchmark-short: + model: llama-3-1-8b-instruct-fp8 + smoke_request: default + benchmark: short diff --git a/config/llm_d/workloads.yaml b/config/llm_d/workloads.yaml new file mode 100644 index 00000000..f5ebbb85 --- /dev/null +++ b/config/llm_d/workloads.yaml @@ -0,0 +1,20 @@ +smoke_requests: + default: + prompt: San Francisco is a + max_tokens: 50 + temperature: 0.7 + +benchmarks: + + short: + job_name: guidellm-benchmark + image: ghcr.io/vllm-project/guidellm:v0.5.4 + pvc_size: 1Gi + timeout_seconds: 900 + rate: 1 + args: + backend_type: openai_http + rate_type: concurrent + max_seconds: 120 + sample_requests: 20 + data: prompt_tokens=256,output_tokens=128 diff --git a/projects/llm_d/README.md b/projects/llm_d/README.md index f254277f..d76634d9 100644 --- a/projects/llm_d/README.md +++ b/projects/llm_d/README.md @@ -1,304 +1,16 @@ -# Skeleton Project +# llm_d -This is a template/skeleton project that demonstrates how to create a new project within the **FORGE** test harness framework. +`llm_d` is the Forge project for validating downstream llm-d on RHOAI. -## Overview +The current implementation is intentionally narrow: -This skeleton shows the essential structure and patterns for building projects that comply with FORGE's constitutional principles: +- target only downstream `LLMInferenceService` +- keep the public interface compatible with current Fournos phase execution +- use checked-in presets and manifests instead of a large mutable config surface -- **CI-First Testing**: Structured phases ensure consistent CI integration -- **Observable Measurements**: Command execution logging and timing -- **Reproducible Results**: Deterministic operations with clear success/failure -- **Scale-Aware Design**: Efficient synchronous operations -- **AI Platform Specificity**: OpenShift AI focused testing patterns +Main entrypoints: -## Project Structure - -``` -skeleton/ -├── orchestration/ -│ └── ci.py # Main CI script with Click-based CLI -├── README.md # This documentation -├── config.yaml # Project configuration (optional) -├── tests/ # Test scripts and data (optional) -└── scripts/ # Helper scripts (optional) -``` - -## Quick Start - -### 1. Run Individual Phases - -```bash -# From the FORGE root directory - -# Prepare environment -./run_ci skeleton ci prepare - -# Run tests -./run_ci skeleton ci test - -# Clean up -./run_ci skeleton ci cleanup -``` - -### 2. Development Options - -```bash -# Verbose output -./run_ci skeleton ci --verbose test - -# See all available commands -./run_ci skeleton ci --help -``` - -## Creating Your Own Project - -### Step 1: Copy Skeleton - -```bash -cp -r projects/skeleton projects/your-project-name -cd projects/your-project-name -``` - -### Step 2: Customize - -1. **Update `orchestration/ci.py`**: - - Change `self.project_name` to your project name - - Replace placeholder `echo` commands with actual test logic - - Update the CLI description and help text - -2. **Update `README.md`**: - - Document your project's purpose and usage - - Add specific setup instructions - -3. 
**Add configuration** (optional): - - Create `config.yaml` for project-specific settings - - Reference it in your CI script - -### Step 3: Implement Test Logic - -Replace the example `echo` commands with your actual test logic: - -#### Prepare Phase -```python -def prepare(self): - self.log("Starting prepare phase...") - - # Example: Install dependencies - if not self.execute_command( - "oc apply -f manifests/setup.yaml", - "Deploy setup resources" - ): - return 1 - - # Example: Validate environment - if not self.execute_command( - "oc get nodes", - "Check cluster nodes" - ): - return 1 - - self.log("Prepare phase completed!", "success") - return 0 -``` - -#### Test Phase -```python -def test(self): - self.log("Starting test phase...") - - # Example: Run performance tests - if not self.execute_command( - "python scripts/performance_test.py --config config.yaml", - "Running performance tests" - ): - return 1 - - # Example: Run functional tests - if not self.execute_command( - "pytest tests/ -v", - "Running functional tests" - ): - return 1 - - self.log("Test phase completed!", "success") - return 0 -``` - -#### Cleanup Phase -```python -def cleanup(self): - self.log("Starting cleanup phase...") - - # Example: Remove test resources - self.execute_command( - "oc delete -f manifests/", - "Cleanup test resources" - ) - - # Example: Generate reports - self.execute_command( - "python scripts/generate_report.py", - "Generate final report" - ) - - self.log("Cleanup phase completed!", "success") - return 0 -``` - -## Key Patterns - -### 1. Phase Structure - -Each project should implement these standard phases: -- **prepare**: Set up environment and dependencies -- **test**: Execute main testing logic -- **cleanup**: Clean up resources and finalize - -### 2. Command Execution - -Use the `execute_command` method for consistent execution and logging: - -```python -# Basic command execution -success = self.execute_command("your-command", "Description") -if not success: - return 1 # Exit with error - -# Command with complex logic -result = self.execute_command( - "kubectl get pods -o json", - "Check pod status" -) -``` - -### 3. Error Handling - -Always check command results and handle failures appropriately: - -```python -if not self.execute_command("critical-command", "Critical step"): - self.log("Critical step failed!", "error") - return 1 # Exit with error code - -# Cleanup commands can be non-critical -self.execute_command("cleanup-command", "Optional cleanup") -# Continue regardless of success -``` - -### 4. Logging - -Use the logging methods for consistent output: - -```python -self.log("Starting operation", "info") # ℹ️ [project] Starting operation -self.log("Operation completed", "success") # ✅ [project] Operation completed -self.log("Warning occurred", "warning") # ⚠️ [project] Warning occurred -self.log("Error occurred", "error") # ❌ [project] Error occurred -``` - -### 5. 
Verbose Mode - -The framework automatically handles verbose mode: - -```python -# In verbose mode, command details are automatically shown -# Your execute_command calls will show: -# - Command being executed -# - Command output (if any) -# - Execution duration -``` - -## Click CLI Structure - -The skeleton uses Click groups to organize commands: - -```python -@click.group() -@click.option('--verbose', '-v', is_flag=True, help='Enable verbose output') -@click.pass_context -def cli(ctx, verbose): - """Project CI Operations for FORGE.""" - ctx.ensure_object(types.SimpleNamespace) - ctx.obj.verbose = verbose - ctx.obj.runner = YourProjectTestRunner(verbose) - -@cli.command() -@click.pass_context -def prepare(ctx): - """Prepare phase - Set up environment and dependencies.""" - runner = ctx.obj.runner - exit_code = runner.prepare() - sys.exit(exit_code) -``` - -## Best Practices - -### 1. Constitutional Compliance - -- ✅ **CI-First**: Design for automated execution without user interaction -- ✅ **Observable**: Log important events and command execution -- ✅ **Reproducible**: Use deterministic operations and clear error codes -- ✅ **Scale-Aware**: Keep operations efficient and focused -- ✅ **AI Platform Specific**: Focus on OpenShift AI scenarios and tooling - -### 2. Error Handling - -- Always validate prerequisites in prepare phase -- Check command results and fail fast on errors -- Provide meaningful error messages with context -- Clean up resources even when tests fail (use try/except if needed) - -### 3. Command Design - -- Make commands idempotent when possible -- Use meaningful descriptions for all execute_command calls -- Test commands locally before adding to CI -- Consider timeouts for long-running operations - -### 4. Configuration - -- Keep project configuration in `config.yaml` or environment variables -- Make tests configurable for different environments -- Document all configuration options -- Use sensible defaults - -## Testing the Skeleton - -```bash -# Test individual phases -./run_ci skeleton ci prepare -./run_ci skeleton ci test -./run_ci skeleton ci cleanup - -# Test with verbose output -./run_ci skeleton ci --verbose prepare - -# See all available commands -./run_ci skeleton ci --help -``` - -## Integration with CI Systems - -The skeleton is designed for easy CI integration: - -```bash -# In your CI pipeline -./run_ci your-project ci prepare || exit 1 -./run_ci your-project ci test || exit 1 -./run_ci your-project ci cleanup # Always run cleanup -``` - -## Next Steps - -1. **Study the Code**: Review `orchestration/ci.py` to understand the patterns -2. **Copy and Customize**: Create your own project based on this skeleton -3. **Implement Tests**: Replace placeholder `echo` commands with real test logic -4. **Test Integration**: Verify your project works with the run_ci entrypoint -5. 
**Add Documentation**: Document your specific test scenarios and setup
-
-## Support
-
-- Review other projects in `projects/` for more examples
-- Check the main FORGE documentation
-- Study the run_ci entrypoint code in `projects/core/ci_entrypoint/`
+- CI phase wrapper: [ci.py](orchestration/ci.py)
+- Prepare flow: [prepare_llmd.py](orchestration/prepare_llmd.py)
+- Test flow: [test_llmd.py](orchestration/test_llmd.py)
+- Shared runtime/config loader: [llmd_runtime.py](orchestration/llmd_runtime.py)
diff --git a/projects/llm_d/orchestration/ci.py b/projects/llm_d/orchestration/ci.py
old mode 100755
new mode 100644
index 7623510f..97073e6e
--- a/projects/llm_d/orchestration/ci.py
+++ b/projects/llm_d/orchestration/ci.py
@@ -25,7 +25,7 @@ def main(ctx):
 @main.command()
 @click.pass_context
 @ci_lib.safe_ci_command
-def prepare(ctx):
+def prepare(ctx) -> int:
     """Prepare phase - Set up environment and dependencies."""
     return prepare_llmd.prepare()
 
@@ -33,7 +33,7 @@ def prepare(ctx):
 @main.command()
 @click.pass_context
 @ci_lib.safe_ci_command
-def test(ctx):
+def test(ctx) -> int:
     """Test phase - Execute the main testing logic."""
     return test_llmd.test()
 
@@ -41,7 +41,7 @@ def test(ctx):
 @main.command()
 @click.pass_context
 @ci_lib.safe_ci_command
-def pre_cleanup(ctx):
+def pre_cleanup(ctx) -> int:
     """Cleanup phase - Clean up resources and finalize."""
     return prepare_llmd.cleanup()
 
diff --git a/projects/llm_d/orchestration/cli.py b/projects/llm_d/orchestration/cli.py
old mode 100755
new mode 100644
index def09477..06ae9ef6
--- a/projects/llm_d/orchestration/cli.py
+++ b/projects/llm_d/orchestration/cli.py
@@ -1,7 +1,4 @@
 #!/usr/bin/env python3
-"""
-LLM-D Project CLI Operations
-"""
 
 import logging
 import sys
@@ -19,7 +16,7 @@
 @click.group()
 @click.pass_context
 def main(ctx):
-    """LLM-D Project CI Operations for FORGE."""
+    """LLM-D Project CLI Operations for FORGE."""
     ctx.ensure_object(types.SimpleNamespace)
 
     test_llmd.init()
@@ -27,37 +24,33 @@ def main(ctx):
 @main.command()
 @click.pass_context
 @safe_cli_command
-def prepare(ctx):
+def prepare(ctx) -> int:
     """Prepare phase - Set up environment and dependencies."""
-    exit_code = prepare_llmd.prepare()
-    sys.exit(exit_code)
+    return prepare_llmd.prepare()
 
 
 @main.command()
 @click.pass_context
 @safe_cli_command
-def test(ctx):
+def test(ctx) -> int:
     """Test phase - Execute the main testing logic."""
-    exit_code = test_llmd.test()
-    sys.exit(exit_code)
+    return test_llmd.test()
 
 
 @main.command()
 @click.pass_context
 @safe_cli_command
-def pre_cleanup(ctx):
+def pre_cleanup(ctx) -> int:
     """Cleanup phase - Clean up resources and finalize."""
-    exit_code = prepare_llmd.cleanup()
-    sys.exit(exit_code)
+    return prepare_llmd.cleanup()
 
 
 @main.command()
 @click.pass_context
 @safe_cli_command
-def post_cleanup(ctx):
+def post_cleanup(ctx) -> int:
     """Cleanup phase - Clean up resources and finalize."""
-    exit_code = prepare_llmd.cleanup()
-    sys.exit(exit_code)
+    return prepare_llmd.cleanup()
 
 
 if __name__ == "__main__":
diff --git a/projects/llm_d/orchestration/llmd_runtime.py b/projects/llm_d/orchestration/llmd_runtime.py
new file mode 100644
index 00000000..aba35fd8
--- /dev/null
+++ b/projects/llm_d/orchestration/llmd_runtime.py
@@ -0,0 +1,695 @@
+from __future__ import annotations
+
+import copy
+import json
+import logging
+import os
+import re
+import shlex +import subprocess +import sys +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Iterable + +import yaml + +FORGE_HOME = Path(__file__).resolve().parents[3] +if str(FORGE_HOME) not in sys.path: + sys.path.insert(0, str(FORGE_HOME)) + +from projects.core.library import env, run + +LOGGER = logging.getLogger(__name__) +CONFIG_DIR = FORGE_HOME / "config" / "llm_d" +ALLOWED_OVERRIDE_KEYS = frozenset({"namespace"}) + + +class CommandError(RuntimeError): + """Raised when an external command exits unsuccessfully.""" + + +@dataclass(frozen=True) +class ResolvedConfig: + artifact_dir: Path + project_root: Path + config_dir: Path + preset_name: str + preset_alias: str | None + job_name: str + namespace: str + namespace_is_managed: bool + gpu_count: int | None + platform: dict[str, Any] + model: dict[str, Any] + smoke_request: dict[str, Any] + benchmark: dict[str, Any] | None + fournos_config: dict[str, Any] + overrides: dict[str, Any] + + @property + def manifests_dir(self) -> Path: + return self.config_dir / "manifests" + + +def init() -> Path: + if not logging.getLogger().handlers: + logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + + env.init() + run.init() + ensure_artifact_directories(env.ARTIFACT_DIR) + return env.ARTIFACT_DIR + + +def ensure_artifact_directories(artifact_dir: Path) -> None: + for relative in ("src", "artifacts", "artifacts/results"): + (artifact_dir / relative).mkdir(parents=True, exist_ok=True) + + +def load_run_configuration( + *, cwd: Path | None = None, artifact_dir: Path | None = None +) -> ResolvedConfig: + cwd = cwd or Path.cwd() + artifact_dir = artifact_dir or env.ARTIFACT_DIR + if artifact_dir is None: + raise RuntimeError("ARTIFACT_DIR is not initialized") + + platform_data = load_yaml(CONFIG_DIR / "platform.yaml") + models_data = load_yaml(CONFIG_DIR / "models.yaml")["models"] + workloads_data = load_yaml(CONFIG_DIR / "workloads.yaml") + preset_data = load_yaml(CONFIG_DIR / "presets.yaml") + + fournos_config = load_fournos_config(cwd) + overrides = parse_overrides(os.environ.get("FORGE_CONFIG_OVERRIDES", "")) + + requested_preset = ( + fournos_config.get("preset") or os.environ.get("FORGE_PRESET") or "smoke" + ) + alias = ( + requested_preset if requested_preset in preset_data.get("aliases", {}) else None + ) + preset_name = preset_data.get("aliases", {}).get(requested_preset, requested_preset) + preset = preset_data["presets"].get(preset_name) + if preset is None: + raise ValueError(f"Unknown llm_d preset: {requested_preset}") + + model_name = preset["model"] + model = copy.deepcopy(models_data[model_name]) + + smoke_request_name = preset.get("smoke_request", "default") + smoke_request = copy.deepcopy(workloads_data["smoke_requests"][smoke_request_name]) + + benchmark_name = preset.get("benchmark") + benchmark = None + if benchmark_name: + benchmark = copy.deepcopy(workloads_data["benchmarks"][benchmark_name]) + + job_name = fournos_config.get("job-name") or os.environ.get("FORGE_JOB_NAME") + if not job_name: + job_name = f"local-{preset_name}" + + namespace_override = overrides.get("namespace") or fournos_config.get("namespace") + namespace = namespace_override or derive_namespace( + job_name, + platform_data["cluster"]["namespace_prefix"], + platform_data["cluster"]["namespace_max_length"], + ) + + gpu_count = normalize_gpu_count(fournos_config.get("gpu-count")) + + return ResolvedConfig( + artifact_dir=Path(artifact_dir), + project_root=FORGE_HOME, + 
config_dir=CONFIG_DIR, + preset_name=preset_name, + preset_alias=alias, + job_name=job_name, + namespace=namespace, + namespace_is_managed=namespace_override is None, + gpu_count=gpu_count, + platform=platform_data, + model=model, + smoke_request=smoke_request, + benchmark=benchmark, + fournos_config=fournos_config, + overrides=overrides, + ) + + +def load_fournos_config(cwd: Path) -> dict[str, Any]: + config_path = cwd / "fournos_config.yaml" + if not config_path.exists(): + return {} + + data = load_yaml(config_path) + if data is None: + return {} + if not isinstance(data, dict): + raise ValueError( + f"Unexpected FOURNOS config type in {config_path}: {type(data)}" + ) + return data + + +def parse_overrides(raw: str) -> dict[str, Any]: + if not raw or raw.strip() in {"", "null", "{}"}: + return {} + + try: + data = json.loads(raw) + except json.JSONDecodeError as exc: + raise ValueError(f"FORGE_CONFIG_OVERRIDES is not valid JSON: {exc}") from exc + + if not isinstance(data, dict): + raise ValueError("FORGE_CONFIG_OVERRIDES must decode to a JSON object") + + unsupported = sorted(set(data) - ALLOWED_OVERRIDE_KEYS) + if unsupported: + raise ValueError( + "Unsupported llm_d override keys: " + f"{', '.join(unsupported)}. Allowed keys: {', '.join(sorted(ALLOWED_OVERRIDE_KEYS))}" + ) + + return data + + +def normalize_gpu_count(value: Any) -> int | None: + if value in (None, ""): + return None + try: + return int(value) + except (TypeError, ValueError): + LOGGER.warning("Ignoring invalid gpu-count value: %s", value) + return None + + +def derive_namespace(job_name: str, prefix: str, max_length: int) -> str: + slug = re.sub(r"[^a-z0-9-]+", "-", job_name.lower()) + slug = re.sub(r"-{2,}", "-", slug).strip("-") + if not slug: + slug = "run" + + if slug.startswith(f"{prefix}-"): + namespace = slug + else: + namespace = f"{prefix}-{slug}" + + namespace = namespace[:max_length].rstrip("-") + if not namespace: + raise ValueError( + f"Could not derive a valid namespace from job name: {job_name}" + ) + return namespace + + +def load_yaml(path: Path) -> Any: + with path.open(encoding="utf-8") as handle: + return yaml.safe_load(handle) + + +def write_yaml(path: Path, payload: Any) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding="utf-8") as handle: + yaml.safe_dump(payload, handle, sort_keys=False) + + +def write_json(path: Path, payload: Any) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding="utf-8") as handle: + json.dump(payload, handle, indent=2, sort_keys=True) + handle.write("\n") + + +def write_text(path: Path, content: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(content, encoding="utf-8") + + +def run_command( + args: Iterable[str], + *, + check: bool = True, + capture_output: bool = True, + input_text: str | None = None, +) -> subprocess.CompletedProcess[str]: + cmd = [str(arg) for arg in args] + LOGGER.info("run: %s", " ".join(shlex.quote(arg) for arg in cmd)) + result = subprocess.run( + cmd, + check=False, + text=True, + capture_output=capture_output, + input=input_text, + ) + + if capture_output: + if result.stdout: + LOGGER.info("stdout:\n%s", result.stdout.rstrip()) + if result.stderr: + LOGGER.info("stderr:\n%s", result.stderr.rstrip()) + + if check and result.returncode != 0: + raise CommandError( + f"Command failed with exit code {result.returncode}: " + f"{' '.join(shlex.quote(arg) for arg in cmd)}" + ) + + return result + + +def oc( + *args: str, + check: bool = True, + 
capture_output: bool = True, + input_text: str | None = None, +) -> subprocess.CompletedProcess[str]: + return run_command( + ["oc", *args], + check=check, + capture_output=capture_output, + input_text=input_text, + ) + + +def apply_manifest(artifact_path: Path, manifest: dict[str, Any]) -> None: + write_yaml(artifact_path, manifest) + oc("apply", "-f", str(artifact_path)) + + +def oc_get_json( + kind: str, + *, + name: str | None = None, + namespace: str | None = None, + selector: str | None = None, + ignore_not_found: bool = False, +) -> dict[str, Any] | None: + args = ["get", kind] + if name: + args.append(name) + if namespace: + args.extend(["-n", namespace]) + if selector: + args.extend(["-l", selector]) + args.extend(["-o", "json"]) + + result = oc(*args, check=not ignore_not_found, capture_output=True) + if ignore_not_found and result.returncode != 0: + return None + return json.loads(result.stdout) + + +def resource_exists(kind: str, name: str, *, namespace: str | None = None) -> bool: + result = oc( + "get", + kind, + name, + *([] if namespace is None else ["-n", namespace]), + check=False, + capture_output=True, + ) + return result.returncode == 0 + + +def wait_until( + description: str, + *, + timeout_seconds: int, + interval_seconds: int, + predicate, +) -> Any: + deadline = time.time() + timeout_seconds + last_error: Exception | None = None + + while time.time() < deadline: + try: + value = predicate() + if value: + return value + last_error = None + except Exception as exc: # pragma: no cover - exercised in integration paths + last_error = exc + LOGGER.info("waiting for %s: %s", description, exc) + time.sleep(interval_seconds) + + if last_error: + raise RuntimeError( + f"Timed out waiting for {description}: {last_error}" + ) from last_error + raise RuntimeError(f"Timed out waiting for {description}") + + +def wait_for_namespace_deleted(namespace: str, timeout_seconds: int) -> None: + def _namespace_gone() -> bool: + return not resource_exists("namespace", namespace) + + wait_until( + f"namespace/{namespace} deletion", + timeout_seconds=timeout_seconds, + interval_seconds=10, + predicate=_namespace_gone, + ) + + +def wait_for_crd(crd_name: str, timeout_seconds: int) -> None: + wait_until( + f"crd/{crd_name}", + timeout_seconds=timeout_seconds, + interval_seconds=10, + predicate=lambda: resource_exists("crd", crd_name), + ) + + +def wait_for_operator_csv( + package: str, namespace: str, timeout_seconds: int +) -> dict[str, Any]: + selector = f"operators.coreos.com/{package}.{namespace}" + + def _csv_ready() -> dict[str, Any] | None: + data = oc_get_json( + "csv", namespace=namespace, selector=selector, ignore_not_found=True + ) + if not data: + return None + items = data.get("items", []) + if not items: + return None + csv = items[0] + if csv.get("status", {}).get("phase") == "Succeeded": + return csv + return None + + return wait_until( + f"{package} CSV in {namespace}", + timeout_seconds=timeout_seconds, + interval_seconds=15, + predicate=_csv_ready, + ) + + +def ensure_namespace(namespace: str, *, labels: dict[str, str] | None = None) -> None: + if not resource_exists("namespace", namespace): + oc("create", "namespace", namespace) + + if labels: + label_args = [f"{key}={value}" for key, value in labels.items()] + oc("label", "namespace", namespace, "--overwrite", *label_args) + + +def ensure_operator_group(namespace: str, package: str) -> None: + data = oc_get_json("operatorgroup", namespace=namespace, ignore_not_found=True) + if data and data.get("items"): + for item in 
data["items"]: + targets = item.get("spec", {}).get("targetNamespaces") or [namespace] + if namespace in targets: + return + raise RuntimeError( + f"Existing OperatorGroup objects in {namespace} do not target {namespace}" + ) + + operator_group = { + "apiVersion": "operators.coreos.com/v1", + "kind": "OperatorGroup", + "metadata": {"name": package, "namespace": namespace}, + "spec": {"targetNamespaces": [namespace]}, + } + oc("apply", "-f", "-", input_text=yaml.safe_dump(operator_group, sort_keys=False)) + + +def ensure_subscription(operator_spec: dict[str, Any]) -> None: + namespace = operator_spec["namespace"] + package = operator_spec["package"] + + ensure_namespace(namespace) + ensure_operator_group(namespace, package) + + subscription = desired_subscription(operator_spec) + current = oc_get_json( + "subscription.operators.coreos.com", + name=package, + namespace=namespace, + ignore_not_found=True, + ) + if current and not subscription_spec_matches( + current.get("spec", {}), subscription["spec"] + ): + LOGGER.info("Reconciling subscription drift for %s in %s", package, namespace) + + oc("apply", "-f", "-", input_text=yaml.safe_dump(subscription, sort_keys=False)) + + def _subscription_reconciled() -> dict[str, Any] | None: + payload = oc_get_json( + "subscription.operators.coreos.com", + name=package, + namespace=namespace, + ) + if subscription_spec_matches(payload.get("spec", {}), subscription["spec"]): + return payload + return None + + wait_until( + f"subscription/{package} reconciliation in {namespace}", + timeout_seconds=60, + interval_seconds=5, + predicate=_subscription_reconciled, + ) + + +def desired_subscription(operator_spec: dict[str, Any]) -> dict[str, Any]: + namespace = operator_spec["namespace"] + package = operator_spec["package"] + return { + "apiVersion": "operators.coreos.com/v1alpha1", + "kind": "Subscription", + "metadata": {"name": package, "namespace": namespace}, + "spec": { + "channel": operator_spec["channel"], + "installPlanApproval": "Automatic", + "name": package, + "source": operator_spec["source"], + "sourceNamespace": "openshift-marketplace", + }, + } + + +def subscription_spec_matches(actual: dict[str, Any], expected: dict[str, Any]) -> bool: + keys = ("channel", "installPlanApproval", "name", "source", "sourceNamespace") + return all(actual.get(key) == expected.get(key) for key in keys) + + +def operator_spec_by_package(platform: dict[str, Any], package: str) -> dict[str, Any]: + for operator_spec in platform["operators"]: + if operator_spec["package"] == package: + return operator_spec + raise KeyError(f"Unknown operator package in llm_d platform config: {package}") + + +def load_manifest_template( + config: ResolvedConfig, relative_path: str +) -> dict[str, Any]: + return load_yaml(config.config_dir / relative_path) + + +def version_tuple(value: str) -> tuple[int, ...]: + numbers = re.findall(r"\d+", value) + return tuple(int(number) for number in numbers[:3]) + + +def condition_status(resource: dict[str, Any], condition_type: str) -> str | None: + conditions = resource.get("status", {}).get("conditions", []) + for condition in conditions: + if condition.get("type") == condition_type: + return condition.get("status") + return None + + +def render_datasciencecluster(config: ResolvedConfig) -> dict[str, Any]: + template_path = ( + config.config_dir / config.platform["rhoai"]["datasciencecluster_template"] + ) + manifest = load_yaml(template_path) + manifest["metadata"]["name"] = config.platform["rhoai"]["datasciencecluster_name"] + 
manifest["metadata"]["namespace"] = config.platform["rhoai"]["namespace"] + return manifest + + +def render_gateway(config: ResolvedConfig) -> dict[str, Any]: + template_path = config.config_dir / config.platform["gateway"]["manifest_template"] + manifest = load_yaml(template_path) + manifest["metadata"]["name"] = config.platform["gateway"]["name"] + manifest["metadata"]["namespace"] = config.platform["gateway"]["namespace"] + manifest["spec"]["gatewayClassName"] = config.platform["gateway"][ + "gateway_class_name" + ] + return manifest + + +def render_inference_service(config: ResolvedConfig) -> dict[str, Any]: + template_path = config.config_dir / config.platform["inference_service"]["template"] + manifest = load_yaml(template_path) + + name = config.platform["inference_service"]["name"] + manifest["metadata"]["name"] = name + manifest["metadata"]["namespace"] = config.namespace + manifest["metadata"].setdefault("labels", {}) + manifest["metadata"]["labels"].update( + { + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + } + ) + + manifest["spec"]["model"]["uri"] = config.model["uri"] + manifest["spec"]["model"]["name"] = config.model["served_model_name"] + manifest["spec"]["template"]["containers"][0]["resources"] = copy.deepcopy( + config.model["resources"] + ) + + epp_path = ( + config.config_dir / config.platform["inference_service"]["epp_config_template"] + ) + epp_config = epp_path.read_text(encoding="utf-8") + router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0][ + "args" + ] + if not router_args or router_args[-1] != "--config-text": + raise ValueError("Expected llm-d router args to end with --config-text") + router_args.append(epp_config) + + return manifest + + +def render_guidellm_pvc(config: ResolvedConfig) -> dict[str, Any]: + if not config.benchmark: + raise ValueError("Benchmark configuration is not enabled for this preset") + + return { + "apiVersion": "v1", + "kind": "PersistentVolumeClaim", + "metadata": { + "name": config.benchmark["job_name"], + "namespace": config.namespace, + }, + "spec": { + "accessModes": ["ReadWriteOnce"], + "resources": {"requests": {"storage": config.benchmark["pvc_size"]}}, + }, + } + + +def render_guidellm_job(config: ResolvedConfig, endpoint_url: str) -> dict[str, Any]: + if not config.benchmark: + raise ValueError("Benchmark configuration is not enabled for this preset") + + args = [ + "benchmark", + "run", + f"--target={endpoint_url}", + f"--rate={config.benchmark['rate']}", + ] + for key, value in config.benchmark["args"].items(): + if value is None: + continue + args.append(f"--{key.replace('_', '-')}={value}") + args.append("--outputs=json") + + return { + "apiVersion": "batch/v1", + "kind": "Job", + "metadata": { + "name": config.benchmark["job_name"], + "namespace": config.namespace, + }, + "spec": { + "backoffLimit": 0, + "template": { + "spec": { + "serviceAccountName": "default", + "restartPolicy": "Never", + "containers": [ + { + "name": "guidellm", + "image": config.benchmark["image"], + "command": ["/opt/app-root/bin/guidellm"], + "args": args, + "env": [{"name": "USER", "value": "guidellm"}], + "volumeMounts": [ + {"name": "home", "mountPath": "/home/guidellm"}, + {"name": "results", "mountPath": "/results"}, + ], + } + ], + "volumes": [ + {"name": "home", "emptyDir": {}}, + { + "name": "results", + "persistentVolumeClaim": { + "claimName": config.benchmark["job_name"] + }, + }, + ], + } + }, + }, + } + + +def render_guidellm_copy_pod( + config: ResolvedConfig, 
node_name: str | None = None +) -> dict[str, Any]: + if not config.benchmark: + raise ValueError("Benchmark configuration is not enabled for this preset") + + pod = { + "apiVersion": "v1", + "kind": "Pod", + "metadata": { + "name": f"{config.benchmark['job_name']}-copy", + "namespace": config.namespace, + }, + "spec": { + "restartPolicy": "Never", + "initContainers": [ + { + "name": "permission-fixer", + "image": config.benchmark["image"], + "command": [ + "/bin/sh", + "-c", + "chmod 755 /results && chown -R 1001:1001 /results || true", + ], + "securityContext": { + "runAsUser": 0, + "allowPrivilegeEscalation": True, + }, + "volumeMounts": [{"name": "results", "mountPath": "/results"}], + } + ], + "containers": [ + { + "name": "copy-helper", + "image": config.benchmark["image"], + "command": ["/bin/sleep", "300"], + "securityContext": { + "runAsUser": 1001, + "runAsNonRoot": True, + "allowPrivilegeEscalation": False, + }, + "volumeMounts": [{"name": "results", "mountPath": "/results"}], + } + ], + "volumes": [ + { + "name": "results", + "persistentVolumeClaim": { + "claimName": config.benchmark["job_name"] + }, + } + ], + }, + } + if node_name: + pod["spec"]["nodeName"] = node_name + return pod diff --git a/projects/llm_d/orchestration/prepare_llmd.py b/projects/llm_d/orchestration/prepare_llmd.py index c28ad8c7..fdabe4b8 100644 --- a/projects/llm_d/orchestration/prepare_llmd.py +++ b/projects/llm_d/orchestration/prepare_llmd.py @@ -1,16 +1,428 @@ +from __future__ import annotations + +import json import logging +from pathlib import Path + +from projects.llm_d.orchestration import llmd_runtime + +LOGGER = logging.getLogger(__name__) + + +def prepare() -> int: + llmd_runtime.init() + config = llmd_runtime.load_run_configuration() + + LOGGER.info( + "Preparing llm_d preset=%s namespace=%s", config.preset_name, config.namespace + ) + + verify_oc_access() + verify_cluster_version(config) + prepare_cert_manager(config) + prepare_leader_worker_set(config) + prepare_nfd(config) + prepare_gpu_operator(config) + prepare_rhoai_operator(config) + apply_datasciencecluster(config) + wait_for_datasciencecluster_ready(config) + ensure_required_crds(config.platform["rhoai"]["required_crds_after_dsc"], config) + ensure_gateway(config) + ensure_test_namespace(config) + verify_gpu_nodes(config) + capture_prepare_state(config) + + return 0 + + +def cleanup() -> int: + llmd_runtime.init() + config = llmd_runtime.load_run_configuration() + + inference_service_name = config.platform["inference_service"]["name"] + benchmark_name = ( + config.benchmark["job_name"] if config.benchmark else "guidellm-benchmark" + ) + + if config.namespace_is_managed: + if llmd_runtime.resource_exists("namespace", config.namespace): + llmd_runtime.oc( + "delete", "namespace", config.namespace, "--ignore-not-found=true" + ) + llmd_runtime.wait_for_namespace_deleted( + config.namespace, + timeout_seconds=config.platform["cluster"]["cleanup_timeout_seconds"], + ) + else: + llmd_runtime.oc( + "delete", + "llminferenceservice", + inference_service_name, + "-n", + config.namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "job,pvc", + benchmark_name, + "-n", + config.namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "pod", + f"{benchmark_name}-copy", + "-n", + config.namespace, + "--ignore-not-found=true", + check=False, + ) + + return 0 + + +def verify_oc_access() -> None: + llmd_runtime.oc("whoami", capture_output=True) + + +def 
verify_cluster_version(config: llmd_runtime.ResolvedConfig) -> None: + version_info = llmd_runtime.oc("version", "-o", "json", capture_output=True) + payload = json.loads(version_info.stdout) + + openshift_version = ( + payload.get("openshiftVersion") + or payload.get("serverVersion", {}).get("gitVersion") + or payload.get("serverVersion", {}).get("platform") + ) + if not openshift_version: + raise RuntimeError( + "Could not determine OpenShift version from `oc version -o json`" + ) + + minimum = config.platform["cluster"]["minimum_openshift_version"] + if llmd_runtime.version_tuple(openshift_version) < llmd_runtime.version_tuple( + minimum + ): + raise RuntimeError( + f"Cluster version {openshift_version} is older than the llm_d minimum {minimum}" + ) + + +def ensure_operator_subscription(operator_spec: dict[str, str]) -> dict[str, object]: + llmd_runtime.ensure_subscription(operator_spec) + return llmd_runtime.wait_for_operator_csv( + operator_spec["package"], + operator_spec["namespace"], + timeout_seconds=operator_spec["wait_timeout_seconds"], + ) + + +def prepare_cert_manager(config: llmd_runtime.ResolvedConfig) -> None: + operator_spec = llmd_runtime.operator_spec_by_package( + config.platform, "openshift-cert-manager-operator" + ) + ensure_operator_subscription(operator_spec) + + +def prepare_leader_worker_set(config: llmd_runtime.ResolvedConfig) -> None: + operator_spec = llmd_runtime.operator_spec_by_package( + config.platform, "leader-worker-set" + ) + ensure_operator_subscription(operator_spec) + + +def prepare_nfd(config: llmd_runtime.ResolvedConfig) -> None: + operator_spec = llmd_runtime.operator_spec_by_package(config.platform, "nfd") + ensure_operator_subscription(operator_spec) + llmd_runtime.wait_for_crd( + operator_spec["bootstrap_crd"], + timeout_seconds=operator_spec["wait_timeout_seconds"], + ) + + manifest = llmd_runtime.load_manifest_template( + config, operator_spec["bootstrap_manifest"] + ) + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "nfd-nodefeaturediscovery.yaml", + manifest, + ) + + llmd_runtime.wait_until( + "NodeFeatureDiscovery bootstrap resource", + timeout_seconds=operator_spec["wait_timeout_seconds"], + interval_seconds=10, + predicate=lambda: llmd_runtime.resource_exists( + "nodefeaturediscovery", + manifest["metadata"]["name"], + namespace=manifest["metadata"]["namespace"], + ), + ) + + wait_for_nfd_gpu_labels( + config, timeout_seconds=operator_spec["wait_timeout_seconds"] + ) + + +def prepare_gpu_operator(config: llmd_runtime.ResolvedConfig) -> None: + operator_spec = llmd_runtime.operator_spec_by_package( + config.platform, "gpu-operator-certified" + ) + ensure_operator_subscription(operator_spec) + llmd_runtime.wait_for_crd( + operator_spec["bootstrap_crd"], + timeout_seconds=operator_spec["wait_timeout_seconds"], + ) + + manifest = llmd_runtime.load_manifest_template( + config, operator_spec["bootstrap_manifest"] + ) + clusterpolicy_name = manifest["metadata"]["name"] + if llmd_runtime.resource_exists("clusterpolicy", clusterpolicy_name): + LOGGER.info( + "ClusterPolicy/%s already exists; verifying readiness instead of applying bootstrap manifest", + clusterpolicy_name, + ) + wait_for_gpu_clusterpolicy_ready( + clusterpolicy_name, + timeout_seconds=operator_spec["wait_timeout_seconds"], + ) + return + + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "gpu-clusterpolicy.yaml", + manifest, + ) + + wait_for_gpu_clusterpolicy_ready( + clusterpolicy_name, + timeout_seconds=operator_spec["wait_timeout_seconds"], + ) + + 
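+# Readiness is driven by ClusterPolicy .status.state reaching "ready" (the
+# check below lower-cases the reported state). Roughly equivalent to polling:
+#   oc get clusterpolicy gpu-cluster-policy -o jsonpath='{.status.state}'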
+def wait_for_gpu_clusterpolicy_ready( + clusterpolicy_name: str, *, timeout_seconds: int +) -> None: + def _clusterpolicy_ready() -> bool: + payload = llmd_runtime.oc_get_json( + "clusterpolicy", + name=clusterpolicy_name, + ) + state = payload.get("status", {}).get("state", "") + return state.lower() == "ready" + + llmd_runtime.wait_until( + f"clusterpolicy/{clusterpolicy_name} ready", + timeout_seconds=timeout_seconds, + interval_seconds=15, + predicate=_clusterpolicy_ready, + ) + + +def prepare_rhoai_operator(config: llmd_runtime.ResolvedConfig) -> None: + operator_spec = llmd_runtime.operator_spec_by_package( + config.platform, "rhods-operator" + ) + ensure_operator_subscription(operator_spec) + ensure_required_crds(config.platform["rhoai"]["required_crds_before_dsc"], config) + + +def ensure_required_crds( + crd_names: list[str], config: llmd_runtime.ResolvedConfig +) -> None: + for crd_name in crd_names: + llmd_runtime.wait_for_crd( + crd_name, + timeout_seconds=config.platform["rhoai"]["wait_timeout_seconds"], + ) + + +def apply_datasciencecluster(config: llmd_runtime.ResolvedConfig) -> None: + manifest = llmd_runtime.render_datasciencecluster(config) + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "datasciencecluster.yaml", manifest + ) + llmd_runtime.oc( + "get", + "datasciencecluster", + config.platform["rhoai"]["datasciencecluster_name"], + "-n", + config.platform["rhoai"]["namespace"], + "-o", + "yaml", + capture_output=True, + ) + + +def wait_for_datasciencecluster_ready(config: llmd_runtime.ResolvedConfig) -> None: + rhoai = config.platform["rhoai"] + + def _dsc_ready() -> bool: + payload = llmd_runtime.oc_get_json( + "datasciencecluster", + name=rhoai["datasciencecluster_name"], + namespace=rhoai["namespace"], + ) + phase = payload.get("status", {}).get("phase") + if phase == "Ready": + return True + if phase in {"Failed", "Error"}: + raise RuntimeError(f"DataScienceCluster entered terminal phase {phase}") + return False + + llmd_runtime.wait_until( + f"datasciencecluster/{rhoai['datasciencecluster_name']} ready", + timeout_seconds=rhoai["wait_timeout_seconds"], + interval_seconds=10, + predicate=_dsc_ready, + ) + + +def ensure_gateway(config: llmd_runtime.ResolvedConfig) -> None: + gateway = config.platform["gateway"] + if not llmd_runtime.resource_exists( + "gateway", gateway["name"], namespace=gateway["namespace"] + ): + if not gateway["create_if_missing"]: + raise RuntimeError( + f"Required gateway {gateway['name']} does not exist in {gateway['namespace']}" + ) + manifest = llmd_runtime.render_gateway(config) + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "gateway.yaml", manifest + ) + + def _gateway_programmed() -> bool: + resource = llmd_runtime.oc_get_json( + "gateway", + name=gateway["name"], + namespace=gateway["namespace"], + ) + return llmd_runtime.condition_status(resource, "Programmed") == "True" + + llmd_runtime.wait_until( + f"gateway/{gateway['name']} programmed", + timeout_seconds=gateway["wait_timeout_seconds"], + interval_seconds=10, + predicate=_gateway_programmed, + ) + + +def ensure_test_namespace(config: llmd_runtime.ResolvedConfig) -> None: + llmd_runtime.ensure_namespace( + config.namespace, + labels={ + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + }, + ) + + +def verify_gpu_nodes(config: llmd_runtime.ResolvedConfig) -> None: + selector = config.platform["cluster"]["gpu_node_label_selector"] + data = llmd_runtime.oc_get_json("nodes", selector=selector, ignore_not_found=True) + 
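+    # ignore_not_found=True maps a failed lookup to None instead of raising
+    # CommandError, so an empty result reaches the explicit error below.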
items = data.get("items", []) if data else [] + if not items: + raise RuntimeError( + f"No GPU nodes found with selector {selector}. The llm_d smoke path requires GPUs." + ) + + +def wait_for_nfd_gpu_labels( + config: llmd_runtime.ResolvedConfig, *, timeout_seconds: int +) -> None: + selectors = config.platform["cluster"]["nfd_gpu_detection_labels"] + + def _labels_present() -> bool: + for selector in selectors: + data = llmd_runtime.oc_get_json( + "nodes", selector=selector, ignore_not_found=True + ) + if data and data.get("items"): + return True + return False + + llmd_runtime.wait_until( + "NFD GPU discovery labels on cluster nodes", + timeout_seconds=timeout_seconds, + interval_seconds=15, + predicate=_labels_present, + ) + -from projects.core.library import config +def capture_prepare_state(config: llmd_runtime.ResolvedConfig) -> None: + artifacts_dir = config.artifact_dir / "artifacts" + rhoai = config.platform["rhoai"] + gateway = config.platform["gateway"] -logger = logging.getLogger(__name__) + capture_resource_yaml( + "datasciencecluster", + rhoai["datasciencecluster_name"], + rhoai["namespace"], + artifacts_dir / "datasciencecluster.yaml", + ) + capture_resource_yaml( + "gateway", + gateway["name"], + gateway["namespace"], + artifacts_dir / "gateway.yaml", + ) + gateway_service = llmd_runtime.oc( + "get", + "service", + "-A", + "-l", + f"gateway.networking.k8s.io/gateway-name={gateway['name']}", + "-o", + "yaml", + check=False, + capture_output=True, + ) + if gateway_service.returncode == 0 and gateway_service.stdout: + llmd_runtime.write_text( + artifacts_dir / "gateway.service.yaml", gateway_service.stdout + ) + if config.platform["artifacts"]["capture_namespace_events"]: + capture_namespace_events( + config.namespace, artifacts_dir / "namespace.events.txt" + ) -def prepare(): - ns = config.project.get_config("prepare.namespace.name") - logger.warning(f"Hello prepare {ns}") - pass +def capture_resource_yaml( + kind: str, + name: str, + namespace: str, + destination: Path, + *, + check: bool = True, +) -> None: + result = llmd_runtime.oc( + "get", + kind, + name, + "-n", + namespace, + "-o", + "yaml", + check=check, + capture_output=True, + ) + if result.returncode == 0 and result.stdout: + llmd_runtime.write_text(destination, result.stdout) -def cleanup(): - logger.warning("Hello cleanup") - pass +def capture_namespace_events(namespace: str, destination: Path) -> None: + result = llmd_runtime.oc( + "get", + "events", + "-n", + namespace, + "--sort-by=.metadata.creationTimestamp", + check=False, + capture_output=True, + ) + if result.returncode == 0 and result.stdout: + llmd_runtime.write_text(destination, result.stdout) diff --git a/projects/llm_d/orchestration/test_llmd.py b/projects/llm_d/orchestration/test_llmd.py index 8290ee63..b11948d7 100644 --- a/projects/llm_d/orchestration/test_llmd.py +++ b/projects/llm_d/orchestration/test_llmd.py @@ -1,29 +1,483 @@ +from __future__ import annotations + +import json import logging -import pathlib +import time +from pathlib import Path + +from projects.llm_d.orchestration import llmd_runtime + +LOGGER = logging.getLogger(__name__) + + +def init() -> None: + llmd_runtime.init() + + +def test() -> int: + llmd_runtime.init() + config = llmd_runtime.load_run_configuration() + + name = config.platform["inference_service"]["name"] + namespace = config.namespace + artifacts_dir = config.artifact_dir / "artifacts" + + LOGGER.info("Testing llm_d preset=%s namespace=%s", config.preset_name, namespace) + + endpoint_url = None + try: + 
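+        # Happy path: deploy the service, run the smoke request, then the
+        # optional benchmark. The finally block always captures ISVC state and
+        # namespace events, and records the endpoint URL when one was resolved.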
endpoint_url = deploy_inference_service(config) + smoke_response = run_smoke_request(config, endpoint_url) + llmd_runtime.write_json(artifacts_dir / "smoke.response.json", smoke_response) + + if config.benchmark: + run_guidellm_benchmark(config, endpoint_url) + + return 0 + finally: + capture_inference_service_state(config) + if endpoint_url: + llmd_runtime.write_text(artifacts_dir / "endpoint.url", f"{endpoint_url}\n") + benchmark_name = ( + config.benchmark["job_name"] if config.benchmark else "guidellm-benchmark" + ) + llmd_runtime.oc( + "delete", + "job,pvc", + benchmark_name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "pod", + f"{benchmark_name}-copy", + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + events = llmd_runtime.oc( + "get", + "events", + "-n", + namespace, + "--sort-by=.metadata.creationTimestamp", + check=False, + capture_output=True, + ) + if events.returncode == 0 and events.stdout: + llmd_runtime.write_text( + artifacts_dir / "namespace.events.txt", events.stdout + ) + + +def deploy_inference_service(config: llmd_runtime.ResolvedConfig) -> str: + name = config.platform["inference_service"]["name"] + namespace = config.namespace + selector = f"app.kubernetes.io/name={name}" + + llmd_runtime.oc( + "delete", + "llminferenceservice", + name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + + def _old_pods_gone() -> bool: + pods = llmd_runtime.oc_get_json( + "pods", namespace=namespace, selector=selector, ignore_not_found=True + ) + return not pods or not pods.get("items") + + llmd_runtime.wait_until( + f"old llm-d pods to disappear in {namespace}", + timeout_seconds=config.platform["inference_service"]["delete_timeout_seconds"], + interval_seconds=10, + predicate=_old_pods_gone, + ) + + manifest = llmd_runtime.render_inference_service(config) + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "llminferenceservice.yaml", manifest + ) + + def _pods_present() -> bool: + pods = llmd_runtime.oc_get_json( + "pods", namespace=namespace, selector=selector, ignore_not_found=True + ) + return bool(pods and pods.get("items")) + + llmd_runtime.wait_until( + f"llm-d pods to appear in {namespace}", + timeout_seconds=config.platform["inference_service"][ + "pod_appearance_timeout_seconds" + ], + interval_seconds=5, + predicate=_pods_present, + ) + + def _service_ready() -> bool: + payload = llmd_runtime.oc_get_json( + "llminferenceservice", name=name, namespace=namespace + ) + return llmd_runtime.condition_status(payload, "Ready") == "True" + + llmd_runtime.wait_until( + f"llminferenceservice/{name} ready", + timeout_seconds=config.platform["inference_service"]["ready_timeout_seconds"], + interval_seconds=10, + predicate=_service_ready, + ) + + return llmd_runtime.wait_until( + f"gateway address for llminferenceservice/{name}", + timeout_seconds=config.platform["inference_service"]["ready_timeout_seconds"], + interval_seconds=10, + predicate=lambda: try_resolve_endpoint_url(config), + ) + + +def resolve_endpoint_url(config: llmd_runtime.ResolvedConfig) -> str: + endpoint_url = try_resolve_endpoint_url(config) + if endpoint_url: + return endpoint_url + + name = config.platform["inference_service"]["name"] + gateway_name = config.platform["gateway"]["status_address_name"] + raise RuntimeError( + f"Gateway address {gateway_name} is missing from llminferenceservice/{name} status.addresses" + ) + + +def try_resolve_endpoint_url(config: llmd_runtime.ResolvedConfig) -> str | None: + 
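+    # Scans .status.addresses on the LLMInferenceService for the entry named
+    # by gateway.status_address_name ("gateway-external" in platform.yaml)
+    # and returns None until the controller publishes that URL.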
name = config.platform["inference_service"]["name"] + namespace = config.namespace + gateway_name = config.platform["gateway"]["status_address_name"] + payload = llmd_runtime.oc_get_json( + "llminferenceservice", name=name, namespace=namespace + ) + + for address in payload.get("status", {}).get("addresses", []): + if address.get("name") == gateway_name and address.get("url"): + return address["url"] + return None + + +def run_smoke_request( + config: llmd_runtime.ResolvedConfig, endpoint_url: str +) -> dict[str, object]: + namespace = config.namespace + name = config.platform["inference_service"]["name"] + deployment_name = f"{name}{config.platform['inference_service']['workload_deployment_name_suffix']}" + + payload = { + "model": config.model["served_model_name"], + "prompt": config.smoke_request["prompt"], + "max_tokens": config.smoke_request["max_tokens"], + "temperature": config.smoke_request["temperature"], + } + llmd_runtime.write_json( + config.artifact_dir / "artifacts" / "smoke.request.json", payload + ) + + retries = config.platform["smoke"]["request_retries"] + delay = config.platform["smoke"]["request_retry_delay_seconds"] + result = None + for _ in range(retries): + result = llmd_runtime.oc( + "exec", + "-n", + namespace, + f"deployment/{deployment_name}", + "-c", + "main", + "--", + "curl", + "-k", + "-sSf", + f"{endpoint_url}{config.platform['smoke']['endpoint_path']}", + "-H", + "Content-Type: application/json", + "-d", + json.dumps(payload), + check=False, + capture_output=True, + ) + if result.returncode == 0: + break + time.sleep(delay) + + if result is None or result.returncode != 0: + raise RuntimeError("Smoke request never succeeded against the llm_d endpoint") + + response = json.loads(result.stdout) + if not response.get("choices"): + raise RuntimeError(f"Invalid smoke response payload: {result.stdout}") + return response + + +def run_guidellm_benchmark( + config: llmd_runtime.ResolvedConfig, endpoint_url: str +) -> None: + benchmark_name = config.benchmark["job_name"] + namespace = config.namespace + + llmd_runtime.oc( + "delete", + "job,pvc", + benchmark_name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "pod", + f"{benchmark_name}-copy", + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "guidellm-pvc.yaml", + llmd_runtime.render_guidellm_pvc(config), + ) + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "guidellm-job.yaml", + llmd_runtime.render_guidellm_job(config, endpoint_url), + ) + + def _job_terminal() -> dict[str, object] | None: + payload = llmd_runtime.oc_get_json( + "job", name=benchmark_name, namespace=namespace + ) + status = payload.get("status", {}) + if status.get("succeeded"): + return payload + if status.get("failed"): + raise RuntimeError(f"GuideLLM job {benchmark_name} failed") + return None + + llmd_runtime.wait_until( + f"GuideLLM job/{benchmark_name}", + timeout_seconds=config.benchmark["timeout_seconds"], + interval_seconds=10, + predicate=_job_terminal, + ) + + capture_guidellm_state(config) + copy_guidellm_results(config) + + +def copy_guidellm_results(config: llmd_runtime.ResolvedConfig) -> None: + benchmark_name = config.benchmark["job_name"] + namespace = config.namespace + pod_data = llmd_runtime.oc_get_json( + "pods", + namespace=namespace, + selector=f"job-name={benchmark_name}", + ignore_not_found=True, + ) + node_name = None + if pod_data and pod_data.get("items"): + node_name 
= pod_data["items"][0].get("spec", {}).get("nodeName") + + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "guidellm-copy-pod.yaml", + llmd_runtime.render_guidellm_copy_pod(config, node_name=node_name), + ) + + def _helper_ready() -> bool: + payload = llmd_runtime.oc_get_json( + "pod", + name=f"{benchmark_name}-copy", + namespace=namespace, + ) + conditions = payload.get("status", {}).get("conditions", []) + return any( + condition.get("type") == "Ready" and condition.get("status") == "True" + for condition in conditions + ) + + llmd_runtime.wait_until( + f"GuideLLM copy helper pod/{benchmark_name}-copy", + timeout_seconds=120, + interval_seconds=5, + predicate=_helper_ready, + ) + + result = llmd_runtime.oc( + "exec", + "-n", + namespace, + f"{benchmark_name}-copy", + "--", + "cat", + "/results/benchmarks.json", + check=False, + capture_output=True, + ) + if result.returncode == 0 and result.stdout: + llmd_runtime.write_text( + config.artifact_dir / "artifacts" / "results" / "benchmarks.json", + result.stdout, + ) + + +def capture_inference_service_state(config: llmd_runtime.ResolvedConfig) -> None: + name = config.platform["inference_service"]["name"] + namespace = config.namespace + artifacts_dir = config.artifact_dir / "artifacts" + selector = f"app.kubernetes.io/name={name}" -from projects.core.library import config, env, run -from projects.llm_d.toolbox.capture_isvc_state.main import run as capture_isvc_state + capture_get( + "llminferenceservice", + name, + namespace, + "yaml", + artifacts_dir / "llminferenceservice.yaml", + ) + capture_get( + "llminferenceservice", + name, + namespace, + "json", + artifacts_dir / "llminferenceservice.json", + ) + capture_get( + "pods", + None, + namespace, + "yaml", + artifacts_dir / "llminferenceservice.pods.yaml", + selector=selector, + ) + capture_get( + "deployments", + None, + namespace, + "yaml", + artifacts_dir / "llminferenceservice.deployments.yaml", + selector=selector, + ) + capture_get( + "replicasets", + None, + namespace, + "yaml", + artifacts_dir / "llminferenceservice.replicasets.yaml", + selector=selector, + ) + capture_get( + "pods", None, namespace, "wide", artifacts_dir / "namespace.pods.status" + ) + capture_get( + "services", None, namespace, "wide", artifacts_dir / "namespace.services.status" + ) -logger = logging.getLogger(__name__) + pod_list = llmd_runtime.oc_get_json( + "pods", namespace=namespace, selector=selector, ignore_not_found=True + ) + if pod_list: + lines = [] + previous_lines = [] + for pod in pod_list.get("items", []): + pod_name = pod["metadata"]["name"] + lines.append(f"=== {pod_name} ===") + log_result = llmd_runtime.oc( + "logs", + pod_name, + "-n", + namespace, + "--all-containers=true", + check=False, + capture_output=True, + ) + if log_result.stdout: + lines.append(log_result.stdout.rstrip()) + previous_lines.append(f"=== {pod_name} ===") + previous_result = llmd_runtime.oc( + "logs", + pod_name, + "-n", + namespace, + "--previous", + "--all-containers=true", + check=False, + capture_output=True, + ) + if previous_result.stdout: + previous_lines.append(previous_result.stdout.rstrip()) -def init(): - env.init() - run.init() - config.init(pathlib.Path(__file__).parent) + llmd_runtime.write_text( + artifacts_dir / "llminferenceservice.pods.logs", "\n".join(lines) + "\n" + ) + llmd_runtime.write_text( + artifacts_dir / "llminferenceservice.pods.previous.logs", + "\n".join(previous_lines) + "\n", + ) -@config.requires( - ns="prepare.namespace.name", - name="tests.llmd.flavors", -) -def 
test(_cfg): - logger.warning(f"Hello test {_cfg.ns}/{_cfg.name}") +def capture_guidellm_state(config: llmd_runtime.ResolvedConfig) -> None: + benchmark_name = config.benchmark["job_name"] + namespace = config.namespace + artifacts_dir = config.artifact_dir / "artifacts" - # two alternatives to query the configuration: - # @config.requires(dict) or config.project.get_config("") - # and we will define something similar for the secrets + capture_get( + "job", + benchmark_name, + namespace, + "yaml", + artifacts_dir / "guidellm_benchmark_job.yaml", + ) + capture_get( + "pods", + None, + namespace, + "yaml", + artifacts_dir / "guidellm_benchmark_job.pods.yaml", + selector=f"job-name={benchmark_name}", + ) + result = llmd_runtime.oc( + "logs", + f"job/{benchmark_name}", + "-n", + namespace, + check=False, + capture_output=True, + ) + if result.returncode == 0 and result.stdout: + llmd_runtime.write_text( + artifacts_dir / "guidellm_benchmark_job.logs", result.stdout + ) - config.project.get_config("tests.llmd.flavors") - capture_isvc_state(_cfg.name, namespace=_cfg.ns) +def capture_get( + kind: str, + name: str | None, + namespace: str, + output: str, + destination: Path, + *, + selector: str | None = None, +) -> None: + args = ["get", kind] + if name: + args.append(name) + args.extend(["-n", namespace]) + if selector: + args.extend(["-l", selector]) + args.extend(["-o", output]) + result = llmd_runtime.oc(*args, check=False, capture_output=True) + if result.returncode == 0 and result.stdout: + llmd_runtime.write_text(destination, result.stdout) diff --git a/projects/llm_d/toolbox/capture_isvc_state/main.py b/projects/llm_d/toolbox/capture_isvc_state/main.py old mode 100755 new mode 100644 index 78448e15..85d09bc8 --- a/projects/llm_d/toolbox/capture_isvc_state/main.py +++ b/projects/llm_d/toolbox/capture_isvc_state/main.py @@ -5,12 +5,7 @@ Replaces llmd_capture_isvc_state Ansible role """ -from projects.core.dsl import ( - execute_tasks, - shell, - task, - toolbox, -) +from projects.core.dsl import execute_tasks, shell, task, toolbox def run(llmisvc_name: str, *, namespace: str = ""): @@ -22,7 +17,6 @@ def run(llmisvc_name: str, *, namespace: str = ""): namespace: Namespace of the LLMInferenceService (empty string auto-detects current namespace) """ - # Execute all registered tasks in order, respecting conditions return execute_tasks(locals()) @@ -157,7 +151,6 @@ def capture_podmonitors(args, context): @task def capture_pod_logs(args, context): """Capture logs from LLMInferenceService pods""" - # Get list of pod names result = shell.run( f'oc get pods -l "app.kubernetes.io/name={args.llmisvc_name}" -n {context.target_namespace} -o jsonpath="{{.items[*].metadata.name}}"', check=False, @@ -170,19 +163,16 @@ def capture_pod_logs(args, context): log_file = args.artifact_dir / "artifacts/llminferenceservice.pods.logs" - # Capture logs for each pod - with open(log_file, "w") as f: # Start with empty file + with open(log_file, "w") as handle: for pod_name in pod_names: - f.write(f"=== Logs for pod: {pod_name} ===\n") - - # Get logs for this pod + handle.write(f"=== Logs for pod: {pod_name} ===\n") log_result = shell.run( f"oc logs {pod_name} -n {context.target_namespace} --all-containers=true", check=False, log_stdout=False, ) - f.write(log_result.stdout) - f.write("\n") + handle.write(log_result.stdout) + handle.write("\n") return f"Pod logs captured for {len(pod_names)} pods" @@ -190,7 +180,6 @@ def capture_pod_logs(args, context): @task def capture_pod_previous_logs(args, context): """Capture 
previous logs from LLMInferenceService pods if available""" - # Get list of pod names result = shell.run( f'oc get pods -l "app.kubernetes.io/name={args.llmisvc_name}" -n {context.target_namespace} -o jsonpath="{{.items[*].metadata.name}}"', check=False, @@ -202,19 +191,16 @@ def capture_pod_previous_logs(args, context): log_file = args.artifact_dir / "artifacts/llminferenceservice.pods.previous.logs" - # Capture previous logs for each pod - with open(log_file, "w") as f: # Start with empty file + with open(log_file, "w") as handle: for pod_name in pod_names: - f.write(f"=== Previous logs for pod: {pod_name} ===\n") - - # Get previous logs for this pod + handle.write(f"=== Previous logs for pod: {pod_name} ===\n") log_result = shell.run( f"oc logs {pod_name} -n {context.target_namespace} --previous --all-containers=true", check=False, log_stdout=False, ) - f.write(log_result.stdout) - f.write("\n") + handle.write(log_result.stdout) + handle.write("\n") return f"Pod previous logs captured for {len(pod_names)} pods" @@ -233,7 +219,6 @@ def capture_llminferenceservice_describe(args, context): @task def capture_pods_describe(args, context): """Capture describe output for related pods""" - # Get list of pod names result = shell.run( f'oc get pods -l "app.kubernetes.io/name={args.llmisvc_name}" -n {context.target_namespace} -o jsonpath="{{.items[*].metadata.name}}"', check=False, @@ -245,24 +230,20 @@ def capture_pods_describe(args, context): describe_file = args.artifact_dir / "artifacts/llminferenceservice.pods.describe.txt" - # Capture describe output for each pod - with open(describe_file, "w") as f: # Start with empty file + with open(describe_file, "w") as handle: for pod_name in pod_names: - f.write(f"=== Describe for pod: {pod_name} ===\n") - - # Get describe output for this pod + handle.write(f"=== Describe for pod: {pod_name} ===\n") describe_result = shell.run( f"oc describe pod {pod_name} -n {context.target_namespace}", log_stdout=False, check=False, ) - f.write(describe_result.stdout) - f.write("\n") + handle.write(describe_result.stdout) + handle.write("\n") return f"Pod describe output captured for {len(pod_names)} pods" -# Create the main function using the toolbox library main = toolbox.create_toolbox_main(run) diff --git a/tests/llm_d/test_runtime.py b/tests/llm_d/test_runtime.py new file mode 100644 index 00000000..4557de00 --- /dev/null +++ b/tests/llm_d/test_runtime.py @@ -0,0 +1,208 @@ +from __future__ import annotations + +from pathlib import Path + +import pytest + +from projects.llm_d.orchestration import llmd_runtime +from projects.llm_d.orchestration import prepare_llmd +from projects.llm_d.orchestration import test_llmd + + +def test_derive_namespace_uses_prefix_once() -> None: + namespace = llmd_runtime.derive_namespace("llm-d-nightly-smoke", "llm-d", 63) + assert namespace == "llm-d-nightly-smoke" + + +def test_parse_overrides_rejects_unknown_keys() -> None: + with pytest.raises(ValueError, match="Unsupported llm_d override keys"): + llmd_runtime.parse_overrides('{"model":"other"}') + + +def test_load_run_configuration_resolves_alias(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + + fournos_config = tmp_path / "fournos_config.yaml" + fournos_config.write_text( + "preset: cks\njob-name: llm-d-e2e\n", + encoding="utf-8", + ) + + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + + assert config.preset_name 
== "smoke" + assert config.preset_alias == "cks" + assert config.model["served_model_name"] == "Qwen/Qwen3-0.6B" + assert config.namespace == "llm-d-e2e" + assert config.namespace_is_managed is True + + +def test_namespace_override_is_not_managed(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", '{"namespace":"custom-ns"}') + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + + assert config.namespace == "custom-ns" + assert config.namespace_is_managed is False + + +def test_render_inference_service_injects_model_and_epp(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + manifest = llmd_runtime.render_inference_service(config) + + assert manifest["metadata"]["name"] == "llm-d" + assert manifest["metadata"]["namespace"] == config.namespace + assert manifest["spec"]["model"]["name"] == "Qwen/Qwen3-0.6B" + assert manifest["spec"]["model"]["uri"] == "hf://Qwen/Qwen3-0.6B" + assert manifest["spec"]["model"]["name"] == config.model["served_model_name"] + assert manifest["spec"]["model"]["uri"] == config.model["uri"] + router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0]["args"] + assert router_args[-2] == "--config-text" + assert "EndpointPickerConfig" in router_args[-1] + + +def test_render_guidellm_job_uses_target_and_rate(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + (tmp_path / "fournos_config.yaml").write_text( + "preset: benchmark-short\njob-name: llm-d-benchmark\n", + encoding="utf-8", + ) + + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + manifest = llmd_runtime.render_guidellm_job(config, "https://example.test") + + container = manifest["spec"]["template"]["spec"]["containers"][0] + assert container["image"] == "ghcr.io/vllm-project/guidellm:v0.5.4" + assert "--target=https://example.test" in container["args"] + assert "--rate=1" in container["args"] + + +def test_prepare_gpu_operator_skips_existing_clusterpolicy( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + + calls: list[str] = [] + + monkeypatch.setattr( + prepare_llmd, + "ensure_operator_subscription", + lambda operator_spec: calls.append(f"subscription:{operator_spec['package']}"), + ) + monkeypatch.setattr( + llmd_runtime, + "wait_for_crd", + lambda crd_name, *, timeout_seconds: calls.append(f"crd:{crd_name}"), + ) + monkeypatch.setattr( + llmd_runtime, + "load_manifest_template", + lambda _config, _path: { + "apiVersion": "nvidia.com/v1", + "kind": "ClusterPolicy", + "metadata": {"name": "gpu-cluster-policy"}, + "spec": {}, + }, + ) + monkeypatch.setattr(llmd_runtime, "resource_exists", lambda kind, name: True) + + def fail_apply(*_: object, **__: object) -> None: + raise AssertionError("existing ClusterPolicy must not be reapplied") + + monkeypatch.setattr(llmd_runtime, "apply_manifest", fail_apply) + monkeypatch.setattr( + llmd_runtime, + 
"oc_get_json", + lambda kind, name: {"status": {"state": "ready"}}, + ) + + prepare_llmd.prepare_gpu_operator(config) + + assert calls == ["subscription:gpu-operator-certified", "crd:clusterpolicies.nvidia.com"] + + +def test_prepare_gpu_operator_bootstraps_missing_clusterpolicy( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + + applied: list[Path] = [] + manifest = { + "apiVersion": "nvidia.com/v1", + "kind": "ClusterPolicy", + "metadata": {"name": "gpu-cluster-policy"}, + "spec": {}, + } + + monkeypatch.setattr(prepare_llmd, "ensure_operator_subscription", lambda _: None) + monkeypatch.setattr(llmd_runtime, "wait_for_crd", lambda *_, **__: None) + monkeypatch.setattr(llmd_runtime, "load_manifest_template", lambda _config, _path: manifest) + monkeypatch.setattr(llmd_runtime, "resource_exists", lambda kind, name: False) + monkeypatch.setattr( + llmd_runtime, + "apply_manifest", + lambda artifact_path, _manifest: applied.append(artifact_path), + ) + monkeypatch.setattr( + llmd_runtime, + "oc_get_json", + lambda kind, name: {"status": {"state": "ready"}}, + ) + + prepare_llmd.prepare_gpu_operator(config) + + assert applied == [artifact_dir / "src" / "gpu-clusterpolicy.yaml"] + + +def test_gpu_clusterpolicy_manifest_has_required_default_sections() -> None: + manifest = llmd_runtime.load_yaml( + llmd_runtime.CONFIG_DIR / "manifests" / "gpu-clusterpolicy.yaml" + ) + + assert manifest["kind"] == "ClusterPolicy" + assert manifest["metadata"]["name"] == "gpu-cluster-policy" + assert { + "daemonsets", + "dcgm", + "dcgmExporter", + "devicePlugin", + "driver", + "gfd", + "nodeStatusExporter", + "operator", + "toolkit", + } <= set(manifest["spec"]) + + +def test_resolve_endpoint_url_requires_gateway_address( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + + def fake_oc_get_json(kind: str, **_: object) -> dict[str, object]: + assert kind == "llminferenceservice" + return {"status": {"addresses": [{"name": "other", "url": "https://wrong"}]}} + + monkeypatch.setattr(llmd_runtime, "oc_get_json", fake_oc_get_json) + + with pytest.raises(RuntimeError, match="Gateway address"): + test_llmd.resolve_endpoint_url(config) From b575b037dc725ddacbcf3629645020e3c16b0d2b Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Wed, 15 Apr 2026 11:02:18 +0100 Subject: [PATCH 02/21] refactor: move llm_d config and phases --- config/llm_d/models.yaml | 25 - config/llm_d/presets.yaml | 14 - projects/core/library/config.py | 6 +- projects/llm_d/README.md | 19 +- .../llm_d/orchestration/config.d/models.yaml | 25 + .../orchestration/config.d}/platform.yaml | 0 .../llm_d/orchestration/config.d/runtime.yaml | 7 + .../orchestration/config.d}/workloads.yaml | 1 - projects/llm_d/orchestration/config.yaml | 233 +-------- projects/llm_d/orchestration/llmd_runtime.py | 93 ++-- .../manifests/datasciencecluster.yaml | 0 .../epp-approximate-prefix-cache.yaml | 0 .../orchestration}/manifests/gateway.yaml | 0 .../manifests/gpu-clusterpolicy.yaml | 0 .../manifests/llminferenceservice.yaml | 0 .../manifests/nfd-nodefeaturediscovery.yaml | 0 projects/llm_d/orchestration/prepare_llmd.py | 427 +-------------- 
.../llm_d/orchestration/presets.d/cks.yaml | 23 - .../orchestration/presets.d/presets.yaml | 18 +- projects/llm_d/orchestration/test_llmd.py | 477 +---------------- projects/llm_d/toolbox/cleanup/main.py | 66 +++ projects/llm_d/toolbox/prepare/main.py | 391 ++++++++++++++ projects/llm_d/toolbox/test/main.py | 492 ++++++++++++++++++ tests/llm_d/test_runtime.py | 90 +++- 24 files changed, 1149 insertions(+), 1258 deletions(-) delete mode 100644 config/llm_d/models.yaml delete mode 100644 config/llm_d/presets.yaml create mode 100644 projects/llm_d/orchestration/config.d/models.yaml rename {config/llm_d => projects/llm_d/orchestration/config.d}/platform.yaml (100%) create mode 100644 projects/llm_d/orchestration/config.d/runtime.yaml rename {config/llm_d => projects/llm_d/orchestration/config.d}/workloads.yaml (99%) rename {config/llm_d => projects/llm_d/orchestration}/manifests/datasciencecluster.yaml (100%) rename {config/llm_d => projects/llm_d/orchestration}/manifests/epp-approximate-prefix-cache.yaml (100%) rename {config/llm_d => projects/llm_d/orchestration}/manifests/gateway.yaml (100%) rename {config/llm_d => projects/llm_d/orchestration}/manifests/gpu-clusterpolicy.yaml (100%) rename {config/llm_d => projects/llm_d/orchestration}/manifests/llminferenceservice.yaml (100%) rename {config/llm_d => projects/llm_d/orchestration}/manifests/nfd-nodefeaturediscovery.yaml (100%) delete mode 100644 projects/llm_d/orchestration/presets.d/cks.yaml create mode 100644 projects/llm_d/toolbox/cleanup/main.py create mode 100644 projects/llm_d/toolbox/prepare/main.py create mode 100644 projects/llm_d/toolbox/test/main.py diff --git a/config/llm_d/models.yaml b/config/llm_d/models.yaml deleted file mode 100644 index 46cf4bf4..00000000 --- a/config/llm_d/models.yaml +++ /dev/null @@ -1,25 +0,0 @@ -models: - - qwen3-0-6b: - served_model_name: Qwen/Qwen3-0.6B - uri: hf://Qwen/Qwen3-0.6B - resources: - requests: - cpu: "4" - memory: 16Gi - nvidia.com/gpu: "1" - limits: - cpu: "4" - memory: 16Gi - nvidia.com/gpu: "1" - - llama-3-1-8b-instruct-fp8: - served_model_name: llama-3-1-8b-instruct-fp8 - uri: oci://registry.redhat.io/rhelai1/modelcar-llama-3-1-8b-instruct-fp8-dynamic:1.5 - resources: - requests: - cpu: "4" - memory: 8Gi - nvidia.com/gpu: "1" - limits: - nvidia.com/gpu: "1" diff --git a/config/llm_d/presets.yaml b/config/llm_d/presets.yaml deleted file mode 100644 index 9fdaae32..00000000 --- a/config/llm_d/presets.yaml +++ /dev/null @@ -1,14 +0,0 @@ -aliases: - cks: smoke - -presets: - - smoke: - model: qwen3-0-6b - smoke_request: default - benchmark: null - - benchmark-short: - model: llama-3-1-8b-instruct-fp8 - smoke_request: default - benchmark: short diff --git a/projects/core/library/config.py b/projects/core/library/config.py index 55d3cb24..b17b3e3a 100644 --- a/projects/core/library/config.py +++ b/projects/core/library/config.py @@ -307,7 +307,9 @@ def multi_dereference(): # --- # - new_value = simple_dereference() if value.startswith("@") else multi_dereference() + new_value = ( + simple_dereference() if value.startswith("@") else multi_dereference() + ) if not handled_secretly: logger.info(f"resolve_reference: {value} ==> '{new_value}'") @@ -435,8 +437,6 @@ def init(orchestration_dir, *, apply_config_overrides=True): project = Config(config_path) - env.ARTIFACT_DIR / VARIABLE_OVERRIDES_FILENAME - if not apply_config_overrides: logger.info( "config.init: running with 'apply_config_overrides', " diff --git a/projects/llm_d/README.md b/projects/llm_d/README.md index d76634d9..fd443121 100644 
--- a/projects/llm_d/README.md +++ b/projects/llm_d/README.md @@ -6,11 +6,20 @@ The current implementation is intentionally narrow: - target only downstream `LLMInferenceService` - keep the public interface compatible with current Fournos phase execution -- use checked-in presets and manifests instead of a large mutable config surface +- use checked-in config chunks and manifests instead of a large mutable config surface + +Configuration layout: + +- base config: [`orchestration/config.yaml`](./orchestration/config.yaml) +- config chunks: [`orchestration/config.d`](./orchestration/config.d) +- presets: [`orchestration/presets.d`](./orchestration/presets.d) +- manifests: [`orchestration/manifests`](./orchestration/manifests) Main entrypoints: -- CI phase wrapper: [ci.py](/Users/aperdomo/workspace/redhat/forge/projects/llm_d/orchestration/ci.py) -- Prepare flow: [prepare_llmd.py](/Users/aperdomo/workspace/redhat/forge/projects/llm_d/orchestration/prepare_llmd.py) -- Test flow: [test_llmd.py](/Users/aperdomo/workspace/redhat/forge/projects/llm_d/orchestration/test_llmd.py) -- Shared runtime/config loader: [llmd_runtime.py](/Users/aperdomo/workspace/redhat/forge/projects/llm_d/orchestration/llmd_runtime.py) +- CI phase wrapper: [`orchestration/ci.py`](./orchestration/ci.py) +- CLI wrapper: [`orchestration/cli.py`](./orchestration/cli.py) +- Shared runtime/config loader: [`orchestration/llmd_runtime.py`](./orchestration/llmd_runtime.py) +- Toolbox prepare command: [`toolbox/prepare/main.py`](./toolbox/prepare/main.py) +- Toolbox test command: [`toolbox/test/main.py`](./toolbox/test/main.py) +- Toolbox cleanup command: [`toolbox/cleanup/main.py`](./toolbox/cleanup/main.py) diff --git a/projects/llm_d/orchestration/config.d/models.yaml b/projects/llm_d/orchestration/config.d/models.yaml new file mode 100644 index 00000000..fd204db4 --- /dev/null +++ b/projects/llm_d/orchestration/config.d/models.yaml @@ -0,0 +1,25 @@ +qwen3-0-6b: + served_model_name: Qwen/Qwen3-0.6B + uri: hf://Qwen/Qwen3-0.6B + resources: + requests: + cpu: "4" + memory: 16Gi + nvidia.com/gpu: "1" + limits: + cpu: "4" + memory: 16Gi + nvidia.com/gpu: "1" + +llama-3-1-8b-instruct-fp8: + served_model_name: llama-3-1-8b-instruct-fp8 + uri: oci://registry.redhat.io/rhelai1/modelcar-llama-3-1-8b-instruct-fp8-dynamic:1.5 + resources: + requests: + cpu: "4" + memory: 8Gi + nvidia.com/gpu: "1" + limits: + cpu: "4" + memory: 8Gi + nvidia.com/gpu: "1" diff --git a/config/llm_d/platform.yaml b/projects/llm_d/orchestration/config.d/platform.yaml similarity index 100% rename from config/llm_d/platform.yaml rename to projects/llm_d/orchestration/config.d/platform.yaml diff --git a/projects/llm_d/orchestration/config.d/runtime.yaml b/projects/llm_d/orchestration/config.d/runtime.yaml new file mode 100644 index 00000000..982d8fd2 --- /dev/null +++ b/projects/llm_d/orchestration/config.d/runtime.yaml @@ -0,0 +1,7 @@ +default_preset: smoke +allowed_override_keys: + - namespace +selected_preset: smoke +model_key: qwen3-0-6b +smoke_request_key: default +benchmark_key: null diff --git a/config/llm_d/workloads.yaml b/projects/llm_d/orchestration/config.d/workloads.yaml similarity index 99% rename from config/llm_d/workloads.yaml rename to projects/llm_d/orchestration/config.d/workloads.yaml index f5ebbb85..1ce9bdc6 100644 --- a/config/llm_d/workloads.yaml +++ b/projects/llm_d/orchestration/config.d/workloads.yaml @@ -5,7 +5,6 @@ smoke_requests: temperature: 0.7 benchmarks: - short: job_name: guidellm-benchmark image: 
ghcr.io/vllm-project/guidellm:v0.5.4 diff --git a/projects/llm_d/orchestration/config.yaml b/projects/llm_d/orchestration/config.yaml index e7367e8f..c36dfa60 100644 --- a/projects/llm_d/orchestration/config.yaml +++ b/projects/llm_d/orchestration/config.yaml @@ -1,230 +1,3 @@ -prepare: - skip: false - namespace: - name: llm-d-project - - operators: - skip: false - list: - - name: "Red Hat Connectivity Link" - catalog: redhat-operators - operator: rhcl-operator - namespace: all - enabled: false - - - name: "OpenShift Cert Manager" - catalog: redhat-operators - operator: openshift-cert-manager-operator - namespace: openshift-cert-manager-operator - enabled: true - - - name: "Leader Worker Set" - catalog: redhat-operators - operator: leader-worker-set - namespace: openshift-lws - deploy_cr: true - enabled: true - - - name: "Node Feature Discovery" - catalog: redhat-operators - operator: nfd - namespace: openshift-nfd - deploy_cr: 1 - enabled: true - - - name: "NVIDIA GPU Operator" - catalog: certified-operators - operator: gpu-operator-certified - namespace: nvidia-gpu-operator - deploy_cr: true - enabled: true - - - name: "Grafana Operator" - catalog: community-operators - operator: grafana-operator - namespace: grafana-operator - enabled: true - extra_args: - all_namespaces: true - - cluster: - skip: false - nodes: - auto_scale: false - auto_scale_down_on_exit: false - instance_type: gx3-16x80x1l4 - count: 2 - - rhoai: - skip: false - image: "quay.io/rhoai/rhoai-fbc-fragment" - tag: "rhoai-3.3@sha256:f6e7db613cd040e53da2d47850477a9b914de18979adaaac47e15dc7c76f8a76" - channel: "stable-3.x" - datasciencecluster: - enable: "[kserve]" - extra_settings: '{"spec.components.kserve.rawDeploymentServiceConfig": "Headless"}' - - gateway: - skip: false - name: openshift-ai-inference # NOTE: Should not be changed for the time being - - grafana: - skip: false - namespace: grafana - datasources: - - grafana/datasource.yaml - dashboards_dir: grafana/dashboards - - monitoring: - skip: false - namespaces: - - "@prepare.namespace.name" - - gpu: - wait_for_readiness: false - - preload: - skip: false - extra_images: {} - node_selector_key: nvidia.com/gpu.present - node_selector_value: "true" - - pvc: - enabled: true - size: 2000Gi - name: storage - access_mode: ReadWriteOnce - storage_class: null - - model_downloader: - image: ghcr.io/opendatahub-io/rhaii-on-xks/kserve-storage-initializer:e6b5db0@sha256:b305264fe2211be2c6063500c4c11da79e8357af4b34dd8567b0d8e8dea7e1d4 - - cleanup: - skip: false - -models: - facebook-opt-125m: - name: facebook/opt-125m - source: hf://facebook/opt-125m - resources: - cpu: 2 - memory: 8Gi - - llama3-1-8b: - name: RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8-dynamic - uri: oci://registry.redhat.io/rhelai1/modelcar-llama-3-1-8b-instruct-fp8-dynamic:1.5 - # source: hf://RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8-dynamic - resources: {} - - llama3-3-70b: - name: RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic - source: hf://RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic - resources: - cpu: 4 - memory: 64Gi - - gpt-oss-120: - name: openai/gpt-oss-120b - source: hf://openai/gpt-oss-120b - resources: - cpu: 4 - memory: 64Gi - - granite4-tiny: - name: RedHatAI/granite-4.0-h-tiny-FP8-dynamic - source: hf://RedHatAI/granite-4.0-h-tiny-FP8-dynamic - resources: {} - -tests: - llmd: - skip: false - skip_prepare: false - flavors: intelligentrouting - namespace: "@prepare.namespace.name" - - inference_service: - skip_deployment: false - name: llm-d - yaml_file: llama-3-1-8b-instruct-fp8.yaml - timeout: 900 - 
do_simple_test: true - gateway: - name: gateway-external - model: llama3-1-8b - metrics: - manual_capture: true - scheduler_servicemonitor_name: kserve-llm-isvc-scheduler - vllm_podmonitor_name: kserve-llm-isvc-vllm-engine - - # vLLM arguments (always applied) - vllm_args: - - "--disable-uvicorn-access-log" - - "--enable-prefix-caching" - - "--uvicorn-log-level=debug" - - "--trust-remote-code" - - "--disable-log-requests" - - "--max-model-len=40960" # keep in 5th position or uddate the presets - - "--gpu-memory-utilization=0.92" - - kueue: - enabled: false - prefix: "kueue.x-k8s.io/" - labels: - pod-group-name: llmisvc - managed: "true" - annotations: - queue-name: perf-gpu-queue - - # Extra properties to inject into the LLMISVC YAML using dotted-key notation - extra_properties: {} - - benchmarks: - guidellm: - enabled: true - name: guidellm-benchmark - backend_type: openai_http - rate_type: concurrent - max_seconds: 120 - max_requests: null - timeout: 900 - data: prompt_tokens=256,output_tokens=128 - rate: 1 - sample_requests: 20 - - capture_prom: true - capture_prom_uwm: true - dry_mode: false - visualize: true - -export_artifacts: - enabled: false - -matbench: - enabled: true - preset: null - workload: projects.llm-d.visualizations.llmd_inference - config_file: plots.yaml - # directory to plot - lts: - generate: true - opensearch: - export: - enabled: false - enabled_on_replot: false - fail_test_on_fail: true - instance: smoke - index: forge-llm-d-cpt - index_prefix: "" - build_counter_index: "forge-llm-d-builds" # used to generate a unique ID for each build - regression_analyses: - enabled: false - enabled_on_replot: true - upload_lts_on_regression: true - # if the regression analyses fail, mark the test as failed - fail_test_on_regression: true - notification: - enabled: true - title: "llm-d CPT" - download: - mode: prefer_cache - url: null +project: + name: llm_d + args: [] diff --git a/projects/llm_d/orchestration/llmd_runtime.py b/projects/llm_d/orchestration/llmd_runtime.py index aba35fd8..8b507bd2 100644 --- a/projects/llm_d/orchestration/llmd_runtime.py +++ b/projects/llm_d/orchestration/llmd_runtime.py @@ -7,7 +7,6 @@ import re import shlex import subprocess -import sys import time from dataclasses import dataclass from pathlib import Path @@ -15,15 +14,11 @@ import yaml -FORGE_HOME = Path(__file__).resolve().parents[3] -if str(FORGE_HOME) not in sys.path: - sys.path.insert(0, str(FORGE_HOME)) - -from projects.core.library import env, run +from projects.core.library import config, env, run LOGGER = logging.getLogger(__name__) -CONFIG_DIR = FORGE_HOME / "config" / "llm_d" -ALLOWED_OVERRIDE_KEYS = frozenset({"namespace"}) +ORCHESTRATION_DIR = env.FORGE_HOME / "projects" / "llm_d" / "orchestration" +CONFIG_DIR = ORCHESTRATION_DIR class CommandError(RuntimeError): @@ -72,39 +67,42 @@ def load_run_configuration( *, cwd: Path | None = None, artifact_dir: Path | None = None ) -> ResolvedConfig: cwd = cwd or Path.cwd() - artifact_dir = artifact_dir or env.ARTIFACT_DIR - if artifact_dir is None: - raise RuntimeError("ARTIFACT_DIR is not initialized") - - platform_data = load_yaml(CONFIG_DIR / "platform.yaml") - models_data = load_yaml(CONFIG_DIR / "models.yaml")["models"] - workloads_data = load_yaml(CONFIG_DIR / "workloads.yaml") - preset_data = load_yaml(CONFIG_DIR / "presets.yaml") + if artifact_dir is not None: + os.environ["ARTIFACT_DIR"] = str(artifact_dir) + artifact_dir = init() + _reinitialize_project_config() + platform_data = 
copy.deepcopy(config.project.get_config("platform")) fournos_config = load_fournos_config(cwd) - overrides = parse_overrides(os.environ.get("FORGE_CONFIG_OVERRIDES", "")) + overrides = parse_overrides( + os.environ.get("FORGE_CONFIG_OVERRIDES", ""), + allowed_keys=config.project.get_config("runtime.allowed_override_keys", []), + ) requested_preset = ( - fournos_config.get("preset") or os.environ.get("FORGE_PRESET") or "smoke" - ) - alias = ( - requested_preset if requested_preset in preset_data.get("aliases", {}) else None + fournos_config.get("preset") + or os.environ.get("FORGE_PRESET") + or config.project.get_config("runtime.default_preset") ) - preset_name = preset_data.get("aliases", {}).get(requested_preset, requested_preset) - preset = preset_data["presets"].get(preset_name) - if preset is None: - raise ValueError(f"Unknown llm_d preset: {requested_preset}") + apply_requested_preset(requested_preset) + + preset_name = config.project.get_config("runtime.selected_preset") + preset_alias = requested_preset if requested_preset != preset_name else None - model_name = preset["model"] - model = copy.deepcopy(models_data[model_name]) + model_name = config.project.get_config("runtime.model_key") + model = copy.deepcopy(config.project.get_config(f"models.{model_name}")) - smoke_request_name = preset.get("smoke_request", "default") - smoke_request = copy.deepcopy(workloads_data["smoke_requests"][smoke_request_name]) + smoke_request_name = config.project.get_config("runtime.smoke_request_key") + smoke_request = copy.deepcopy( + config.project.get_config(f"workloads.smoke_requests.{smoke_request_name}") + ) - benchmark_name = preset.get("benchmark") + benchmark_name = config.project.get_config("runtime.benchmark_key", None) benchmark = None if benchmark_name: - benchmark = copy.deepcopy(workloads_data["benchmarks"][benchmark_name]) + benchmark = copy.deepcopy( + config.project.get_config(f"workloads.benchmarks.{benchmark_name}") + ) job_name = fournos_config.get("job-name") or os.environ.get("FORGE_JOB_NAME") if not job_name: @@ -121,10 +119,10 @@ def load_run_configuration( return ResolvedConfig( artifact_dir=Path(artifact_dir), - project_root=FORGE_HOME, - config_dir=CONFIG_DIR, + project_root=env.FORGE_HOME, + config_dir=ORCHESTRATION_DIR, preset_name=preset_name, - preset_alias=alias, + preset_alias=preset_alias, job_name=job_name, namespace=namespace, namespace_is_managed=namespace_override is None, @@ -138,6 +136,26 @@ def load_run_configuration( ) +def _reinitialize_project_config() -> None: + config.project = None + artifact_config = env.ARTIFACT_DIR / "config.yaml" + if artifact_config.exists(): + artifact_config.unlink() + + presets_applied = env.ARTIFACT_DIR / "presets_applied" + if presets_applied.exists(): + presets_applied.unlink() + + config.init(ORCHESTRATION_DIR) + + +def apply_requested_preset(requested_preset: str) -> None: + if not config.project.get_preset(requested_preset): + raise ValueError(f"Unknown llm_d preset: {requested_preset}") + + config.project.apply_preset(requested_preset) + + def load_fournos_config(cwd: Path) -> dict[str, Any]: config_path = cwd / "fournos_config.yaml" if not config_path.exists(): @@ -153,7 +171,7 @@ def load_fournos_config(cwd: Path) -> dict[str, Any]: return data -def parse_overrides(raw: str) -> dict[str, Any]: +def parse_overrides(raw: str, *, allowed_keys: Iterable[str]) -> dict[str, Any]: if not raw or raw.strip() in {"", "null", "{}"}: return {} @@ -165,11 +183,12 @@ def parse_overrides(raw: str) -> dict[str, Any]: if not 
isinstance(data, dict): raise ValueError("FORGE_CONFIG_OVERRIDES must decode to a JSON object") - unsupported = sorted(set(data) - ALLOWED_OVERRIDE_KEYS) + allowed_keys = frozenset(allowed_keys) + unsupported = sorted(set(data) - allowed_keys) if unsupported: raise ValueError( "Unsupported llm_d override keys: " - f"{', '.join(unsupported)}. Allowed keys: {', '.join(sorted(ALLOWED_OVERRIDE_KEYS))}" + f"{', '.join(unsupported)}. Allowed keys: {', '.join(sorted(allowed_keys))}" ) return data diff --git a/config/llm_d/manifests/datasciencecluster.yaml b/projects/llm_d/orchestration/manifests/datasciencecluster.yaml similarity index 100% rename from config/llm_d/manifests/datasciencecluster.yaml rename to projects/llm_d/orchestration/manifests/datasciencecluster.yaml diff --git a/config/llm_d/manifests/epp-approximate-prefix-cache.yaml b/projects/llm_d/orchestration/manifests/epp-approximate-prefix-cache.yaml similarity index 100% rename from config/llm_d/manifests/epp-approximate-prefix-cache.yaml rename to projects/llm_d/orchestration/manifests/epp-approximate-prefix-cache.yaml diff --git a/config/llm_d/manifests/gateway.yaml b/projects/llm_d/orchestration/manifests/gateway.yaml similarity index 100% rename from config/llm_d/manifests/gateway.yaml rename to projects/llm_d/orchestration/manifests/gateway.yaml diff --git a/config/llm_d/manifests/gpu-clusterpolicy.yaml b/projects/llm_d/orchestration/manifests/gpu-clusterpolicy.yaml similarity index 100% rename from config/llm_d/manifests/gpu-clusterpolicy.yaml rename to projects/llm_d/orchestration/manifests/gpu-clusterpolicy.yaml diff --git a/config/llm_d/manifests/llminferenceservice.yaml b/projects/llm_d/orchestration/manifests/llminferenceservice.yaml similarity index 100% rename from config/llm_d/manifests/llminferenceservice.yaml rename to projects/llm_d/orchestration/manifests/llminferenceservice.yaml diff --git a/config/llm_d/manifests/nfd-nodefeaturediscovery.yaml b/projects/llm_d/orchestration/manifests/nfd-nodefeaturediscovery.yaml similarity index 100% rename from config/llm_d/manifests/nfd-nodefeaturediscovery.yaml rename to projects/llm_d/orchestration/manifests/nfd-nodefeaturediscovery.yaml diff --git a/projects/llm_d/orchestration/prepare_llmd.py b/projects/llm_d/orchestration/prepare_llmd.py index fdabe4b8..d52f921a 100644 --- a/projects/llm_d/orchestration/prepare_llmd.py +++ b/projects/llm_d/orchestration/prepare_llmd.py @@ -1,428 +1,15 @@ from __future__ import annotations -import json -import logging -from pathlib import Path - -from projects.llm_d.orchestration import llmd_runtime - -LOGGER = logging.getLogger(__name__) +from projects.llm_d.toolbox.cleanup.main import run as cleanup_toolbox_run +from projects.llm_d.toolbox.cleanup.main import run_cleanup +from projects.llm_d.toolbox.prepare.main import prepare_gpu_operator +from projects.llm_d.toolbox.prepare.main import run as prepare_toolbox_run +from projects.llm_d.toolbox.prepare.main import run_prepare def prepare() -> int: - llmd_runtime.init() - config = llmd_runtime.load_run_configuration() - - LOGGER.info( - "Preparing llm_d preset=%s namespace=%s", config.preset_name, config.namespace - ) - - verify_oc_access() - verify_cluster_version(config) - prepare_cert_manager(config) - prepare_leader_worker_set(config) - prepare_nfd(config) - prepare_gpu_operator(config) - prepare_rhoai_operator(config) - apply_datasciencecluster(config) - wait_for_datasciencecluster_ready(config) - ensure_required_crds(config.platform["rhoai"]["required_crds_after_dsc"], config) - 
ensure_gateway(config) - ensure_test_namespace(config) - verify_gpu_nodes(config) - capture_prepare_state(config) - - return 0 + return prepare_toolbox_run() def cleanup() -> int: - llmd_runtime.init() - config = llmd_runtime.load_run_configuration() - - inference_service_name = config.platform["inference_service"]["name"] - benchmark_name = ( - config.benchmark["job_name"] if config.benchmark else "guidellm-benchmark" - ) - - if config.namespace_is_managed: - if llmd_runtime.resource_exists("namespace", config.namespace): - llmd_runtime.oc( - "delete", "namespace", config.namespace, "--ignore-not-found=true" - ) - llmd_runtime.wait_for_namespace_deleted( - config.namespace, - timeout_seconds=config.platform["cluster"]["cleanup_timeout_seconds"], - ) - else: - llmd_runtime.oc( - "delete", - "llminferenceservice", - inference_service_name, - "-n", - config.namespace, - "--ignore-not-found=true", - check=False, - ) - llmd_runtime.oc( - "delete", - "job,pvc", - benchmark_name, - "-n", - config.namespace, - "--ignore-not-found=true", - check=False, - ) - llmd_runtime.oc( - "delete", - "pod", - f"{benchmark_name}-copy", - "-n", - config.namespace, - "--ignore-not-found=true", - check=False, - ) - - return 0 - - -def verify_oc_access() -> None: - llmd_runtime.oc("whoami", capture_output=True) - - -def verify_cluster_version(config: llmd_runtime.ResolvedConfig) -> None: - version_info = llmd_runtime.oc("version", "-o", "json", capture_output=True) - payload = json.loads(version_info.stdout) - - openshift_version = ( - payload.get("openshiftVersion") - or payload.get("serverVersion", {}).get("gitVersion") - or payload.get("serverVersion", {}).get("platform") - ) - if not openshift_version: - raise RuntimeError( - "Could not determine OpenShift version from `oc version -o json`" - ) - - minimum = config.platform["cluster"]["minimum_openshift_version"] - if llmd_runtime.version_tuple(openshift_version) < llmd_runtime.version_tuple( - minimum - ): - raise RuntimeError( - f"Cluster version {openshift_version} is older than the llm_d minimum {minimum}" - ) - - -def ensure_operator_subscription(operator_spec: dict[str, str]) -> dict[str, object]: - llmd_runtime.ensure_subscription(operator_spec) - return llmd_runtime.wait_for_operator_csv( - operator_spec["package"], - operator_spec["namespace"], - timeout_seconds=operator_spec["wait_timeout_seconds"], - ) - - -def prepare_cert_manager(config: llmd_runtime.ResolvedConfig) -> None: - operator_spec = llmd_runtime.operator_spec_by_package( - config.platform, "openshift-cert-manager-operator" - ) - ensure_operator_subscription(operator_spec) - - -def prepare_leader_worker_set(config: llmd_runtime.ResolvedConfig) -> None: - operator_spec = llmd_runtime.operator_spec_by_package( - config.platform, "leader-worker-set" - ) - ensure_operator_subscription(operator_spec) - - -def prepare_nfd(config: llmd_runtime.ResolvedConfig) -> None: - operator_spec = llmd_runtime.operator_spec_by_package(config.platform, "nfd") - ensure_operator_subscription(operator_spec) - llmd_runtime.wait_for_crd( - operator_spec["bootstrap_crd"], - timeout_seconds=operator_spec["wait_timeout_seconds"], - ) - - manifest = llmd_runtime.load_manifest_template( - config, operator_spec["bootstrap_manifest"] - ) - llmd_runtime.apply_manifest( - config.artifact_dir / "src" / "nfd-nodefeaturediscovery.yaml", - manifest, - ) - - llmd_runtime.wait_until( - "NodeFeatureDiscovery bootstrap resource", - timeout_seconds=operator_spec["wait_timeout_seconds"], - interval_seconds=10, - predicate=lambda: 
llmd_runtime.resource_exists( - "nodefeaturediscovery", - manifest["metadata"]["name"], - namespace=manifest["metadata"]["namespace"], - ), - ) - - wait_for_nfd_gpu_labels( - config, timeout_seconds=operator_spec["wait_timeout_seconds"] - ) - - -def prepare_gpu_operator(config: llmd_runtime.ResolvedConfig) -> None: - operator_spec = llmd_runtime.operator_spec_by_package( - config.platform, "gpu-operator-certified" - ) - ensure_operator_subscription(operator_spec) - llmd_runtime.wait_for_crd( - operator_spec["bootstrap_crd"], - timeout_seconds=operator_spec["wait_timeout_seconds"], - ) - - manifest = llmd_runtime.load_manifest_template( - config, operator_spec["bootstrap_manifest"] - ) - clusterpolicy_name = manifest["metadata"]["name"] - if llmd_runtime.resource_exists("clusterpolicy", clusterpolicy_name): - LOGGER.info( - "ClusterPolicy/%s already exists; verifying readiness instead of applying bootstrap manifest", - clusterpolicy_name, - ) - wait_for_gpu_clusterpolicy_ready( - clusterpolicy_name, - timeout_seconds=operator_spec["wait_timeout_seconds"], - ) - return - - llmd_runtime.apply_manifest( - config.artifact_dir / "src" / "gpu-clusterpolicy.yaml", - manifest, - ) - - wait_for_gpu_clusterpolicy_ready( - clusterpolicy_name, - timeout_seconds=operator_spec["wait_timeout_seconds"], - ) - - -def wait_for_gpu_clusterpolicy_ready( - clusterpolicy_name: str, *, timeout_seconds: int -) -> None: - def _clusterpolicy_ready() -> bool: - payload = llmd_runtime.oc_get_json( - "clusterpolicy", - name=clusterpolicy_name, - ) - state = payload.get("status", {}).get("state", "") - return state.lower() == "ready" - - llmd_runtime.wait_until( - f"clusterpolicy/{clusterpolicy_name} ready", - timeout_seconds=timeout_seconds, - interval_seconds=15, - predicate=_clusterpolicy_ready, - ) - - -def prepare_rhoai_operator(config: llmd_runtime.ResolvedConfig) -> None: - operator_spec = llmd_runtime.operator_spec_by_package( - config.platform, "rhods-operator" - ) - ensure_operator_subscription(operator_spec) - ensure_required_crds(config.platform["rhoai"]["required_crds_before_dsc"], config) - - -def ensure_required_crds( - crd_names: list[str], config: llmd_runtime.ResolvedConfig -) -> None: - for crd_name in crd_names: - llmd_runtime.wait_for_crd( - crd_name, - timeout_seconds=config.platform["rhoai"]["wait_timeout_seconds"], - ) - - -def apply_datasciencecluster(config: llmd_runtime.ResolvedConfig) -> None: - manifest = llmd_runtime.render_datasciencecluster(config) - llmd_runtime.apply_manifest( - config.artifact_dir / "src" / "datasciencecluster.yaml", manifest - ) - llmd_runtime.oc( - "get", - "datasciencecluster", - config.platform["rhoai"]["datasciencecluster_name"], - "-n", - config.platform["rhoai"]["namespace"], - "-o", - "yaml", - capture_output=True, - ) - - -def wait_for_datasciencecluster_ready(config: llmd_runtime.ResolvedConfig) -> None: - rhoai = config.platform["rhoai"] - - def _dsc_ready() -> bool: - payload = llmd_runtime.oc_get_json( - "datasciencecluster", - name=rhoai["datasciencecluster_name"], - namespace=rhoai["namespace"], - ) - phase = payload.get("status", {}).get("phase") - if phase == "Ready": - return True - if phase in {"Failed", "Error"}: - raise RuntimeError(f"DataScienceCluster entered terminal phase {phase}") - return False - - llmd_runtime.wait_until( - f"datasciencecluster/{rhoai['datasciencecluster_name']} ready", - timeout_seconds=rhoai["wait_timeout_seconds"], - interval_seconds=10, - predicate=_dsc_ready, - ) - - -def ensure_gateway(config: 
llmd_runtime.ResolvedConfig) -> None: - gateway = config.platform["gateway"] - if not llmd_runtime.resource_exists( - "gateway", gateway["name"], namespace=gateway["namespace"] - ): - if not gateway["create_if_missing"]: - raise RuntimeError( - f"Required gateway {gateway['name']} does not exist in {gateway['namespace']}" - ) - manifest = llmd_runtime.render_gateway(config) - llmd_runtime.apply_manifest( - config.artifact_dir / "src" / "gateway.yaml", manifest - ) - - def _gateway_programmed() -> bool: - resource = llmd_runtime.oc_get_json( - "gateway", - name=gateway["name"], - namespace=gateway["namespace"], - ) - return llmd_runtime.condition_status(resource, "Programmed") == "True" - - llmd_runtime.wait_until( - f"gateway/{gateway['name']} programmed", - timeout_seconds=gateway["wait_timeout_seconds"], - interval_seconds=10, - predicate=_gateway_programmed, - ) - - -def ensure_test_namespace(config: llmd_runtime.ResolvedConfig) -> None: - llmd_runtime.ensure_namespace( - config.namespace, - labels={ - "app.kubernetes.io/managed-by": "forge", - "forge.openshift.io/project": "llm_d", - }, - ) - - -def verify_gpu_nodes(config: llmd_runtime.ResolvedConfig) -> None: - selector = config.platform["cluster"]["gpu_node_label_selector"] - data = llmd_runtime.oc_get_json("nodes", selector=selector, ignore_not_found=True) - items = data.get("items", []) if data else [] - if not items: - raise RuntimeError( - f"No GPU nodes found with selector {selector}. The llm_d smoke path requires GPUs." - ) - - -def wait_for_nfd_gpu_labels( - config: llmd_runtime.ResolvedConfig, *, timeout_seconds: int -) -> None: - selectors = config.platform["cluster"]["nfd_gpu_detection_labels"] - - def _labels_present() -> bool: - for selector in selectors: - data = llmd_runtime.oc_get_json( - "nodes", selector=selector, ignore_not_found=True - ) - if data and data.get("items"): - return True - return False - - llmd_runtime.wait_until( - "NFD GPU discovery labels on cluster nodes", - timeout_seconds=timeout_seconds, - interval_seconds=15, - predicate=_labels_present, - ) - - -def capture_prepare_state(config: llmd_runtime.ResolvedConfig) -> None: - artifacts_dir = config.artifact_dir / "artifacts" - rhoai = config.platform["rhoai"] - gateway = config.platform["gateway"] - - capture_resource_yaml( - "datasciencecluster", - rhoai["datasciencecluster_name"], - rhoai["namespace"], - artifacts_dir / "datasciencecluster.yaml", - ) - capture_resource_yaml( - "gateway", - gateway["name"], - gateway["namespace"], - artifacts_dir / "gateway.yaml", - ) - gateway_service = llmd_runtime.oc( - "get", - "service", - "-A", - "-l", - f"gateway.networking.k8s.io/gateway-name={gateway['name']}", - "-o", - "yaml", - check=False, - capture_output=True, - ) - if gateway_service.returncode == 0 and gateway_service.stdout: - llmd_runtime.write_text( - artifacts_dir / "gateway.service.yaml", gateway_service.stdout - ) - if config.platform["artifacts"]["capture_namespace_events"]: - capture_namespace_events( - config.namespace, artifacts_dir / "namespace.events.txt" - ) - - -def capture_resource_yaml( - kind: str, - name: str, - namespace: str, - destination: Path, - *, - check: bool = True, -) -> None: - result = llmd_runtime.oc( - "get", - kind, - name, - "-n", - namespace, - "-o", - "yaml", - check=check, - capture_output=True, - ) - if result.returncode == 0 and result.stdout: - llmd_runtime.write_text(destination, result.stdout) - - -def capture_namespace_events(namespace: str, destination: Path) -> None: - result = llmd_runtime.oc( - "get", - 
"events", - "-n", - namespace, - "--sort-by=.metadata.creationTimestamp", - check=False, - capture_output=True, - ) - if result.returncode == 0 and result.stdout: - llmd_runtime.write_text(destination, result.stdout) + return cleanup_toolbox_run() diff --git a/projects/llm_d/orchestration/presets.d/cks.yaml b/projects/llm_d/orchestration/presets.d/cks.yaml deleted file mode 100644 index b4f842dc..00000000 --- a/projects/llm_d/orchestration/presets.d/cks.yaml +++ /dev/null @@ -1,23 +0,0 @@ -extends: [pvc_rwx, llama-70b] - -tests.capture_prom: false -tests.capture_prom_uwm: false -tests.llmd.skip_prepare: true -prepare.namespace.name: kpouget-dev -prepare.preload.node_selector_key: gpu.nvidia.com/class -prepare.preload.node_selector_value: "H200" -tests.llmd.inference_service.extra_properties: - spec.template.affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: kubernetes.io/hostname - operator: NotIn - values: - - gf48e48 - - gf4334a -prepare.preload.extra_images: - vllm-cuda-rhel9: registry.redhat.io/rhaiis/vllm-cuda-rhel9@sha256:094db84a1da5e8a575d0c9eade114fa30f4a2061064a338e3e032f3578f8082a - llm-d-inference-scheduler: ghcr.io/opendatahub-io/rhaii-on-xks/llm-d-inference-scheduler:e6b5db0@sha256:43e8b8edc158f31535c8b23d77629f8cde111cc762a8f4ee5f2f884470566211 - guidellm: ghcr.io/vllm-project/guidellm:v0.5.4 diff --git a/projects/llm_d/orchestration/presets.d/presets.yaml b/projects/llm_d/orchestration/presets.d/presets.yaml index 3bd1e3fb..37fcc711 100644 --- a/projects/llm_d/orchestration/presets.d/presets.yaml +++ b/projects/llm_d/orchestration/presets.d/presets.yaml @@ -1,9 +1,17 @@ __multiple: true -pvc_rwx: - prepare.pvc.name: storage-rwx - prepare.pvc.access_mode: ReadWriteMany +smoke: + runtime.selected_preset: smoke + runtime.model_key: qwen3-0-6b + runtime.smoke_request_key: default + runtime.benchmark_key: null +benchmark-short: + runtime.selected_preset: benchmark-short + runtime.model_key: llama-3-1-8b-instruct-fp8 + runtime.smoke_request_key: default + runtime.benchmark_key: short -llama-70b: - tests.llmd.inference_service.model: llama3-3-70b +cks: + extends: + - smoke diff --git a/projects/llm_d/orchestration/test_llmd.py b/projects/llm_d/orchestration/test_llmd.py index b11948d7..8fc2bc40 100644 --- a/projects/llm_d/orchestration/test_llmd.py +++ b/projects/llm_d/orchestration/test_llmd.py @@ -1,13 +1,9 @@ from __future__ import annotations -import json -import logging -import time -from pathlib import Path - from projects.llm_d.orchestration import llmd_runtime - -LOGGER = logging.getLogger(__name__) +from projects.llm_d.toolbox.test.main import resolve_endpoint_url +from projects.llm_d.toolbox.test.main import run as test_toolbox_run +from projects.llm_d.toolbox.test.main import run_test def init() -> None: @@ -15,469 +11,4 @@ def init() -> None: def test() -> int: - llmd_runtime.init() - config = llmd_runtime.load_run_configuration() - - name = config.platform["inference_service"]["name"] - namespace = config.namespace - artifacts_dir = config.artifact_dir / "artifacts" - - LOGGER.info("Testing llm_d preset=%s namespace=%s", config.preset_name, namespace) - - endpoint_url = None - try: - endpoint_url = deploy_inference_service(config) - smoke_response = run_smoke_request(config, endpoint_url) - llmd_runtime.write_json(artifacts_dir / "smoke.response.json", smoke_response) - - if config.benchmark: - run_guidellm_benchmark(config, endpoint_url) - - return 0 - finally: - 
capture_inference_service_state(config) - if endpoint_url: - llmd_runtime.write_text(artifacts_dir / "endpoint.url", f"{endpoint_url}\n") - benchmark_name = ( - config.benchmark["job_name"] if config.benchmark else "guidellm-benchmark" - ) - llmd_runtime.oc( - "delete", - "job,pvc", - benchmark_name, - "-n", - namespace, - "--ignore-not-found=true", - check=False, - ) - llmd_runtime.oc( - "delete", - "pod", - f"{benchmark_name}-copy", - "-n", - namespace, - "--ignore-not-found=true", - check=False, - ) - events = llmd_runtime.oc( - "get", - "events", - "-n", - namespace, - "--sort-by=.metadata.creationTimestamp", - check=False, - capture_output=True, - ) - if events.returncode == 0 and events.stdout: - llmd_runtime.write_text( - artifacts_dir / "namespace.events.txt", events.stdout - ) - - -def deploy_inference_service(config: llmd_runtime.ResolvedConfig) -> str: - name = config.platform["inference_service"]["name"] - namespace = config.namespace - selector = f"app.kubernetes.io/name={name}" - - llmd_runtime.oc( - "delete", - "llminferenceservice", - name, - "-n", - namespace, - "--ignore-not-found=true", - check=False, - ) - - def _old_pods_gone() -> bool: - pods = llmd_runtime.oc_get_json( - "pods", namespace=namespace, selector=selector, ignore_not_found=True - ) - return not pods or not pods.get("items") - - llmd_runtime.wait_until( - f"old llm-d pods to disappear in {namespace}", - timeout_seconds=config.platform["inference_service"]["delete_timeout_seconds"], - interval_seconds=10, - predicate=_old_pods_gone, - ) - - manifest = llmd_runtime.render_inference_service(config) - llmd_runtime.apply_manifest( - config.artifact_dir / "src" / "llminferenceservice.yaml", manifest - ) - - def _pods_present() -> bool: - pods = llmd_runtime.oc_get_json( - "pods", namespace=namespace, selector=selector, ignore_not_found=True - ) - return bool(pods and pods.get("items")) - - llmd_runtime.wait_until( - f"llm-d pods to appear in {namespace}", - timeout_seconds=config.platform["inference_service"][ - "pod_appearance_timeout_seconds" - ], - interval_seconds=5, - predicate=_pods_present, - ) - - def _service_ready() -> bool: - payload = llmd_runtime.oc_get_json( - "llminferenceservice", name=name, namespace=namespace - ) - return llmd_runtime.condition_status(payload, "Ready") == "True" - - llmd_runtime.wait_until( - f"llminferenceservice/{name} ready", - timeout_seconds=config.platform["inference_service"]["ready_timeout_seconds"], - interval_seconds=10, - predicate=_service_ready, - ) - - return llmd_runtime.wait_until( - f"gateway address for llminferenceservice/{name}", - timeout_seconds=config.platform["inference_service"]["ready_timeout_seconds"], - interval_seconds=10, - predicate=lambda: try_resolve_endpoint_url(config), - ) - - -def resolve_endpoint_url(config: llmd_runtime.ResolvedConfig) -> str: - endpoint_url = try_resolve_endpoint_url(config) - if endpoint_url: - return endpoint_url - - name = config.platform["inference_service"]["name"] - gateway_name = config.platform["gateway"]["status_address_name"] - raise RuntimeError( - f"Gateway address {gateway_name} is missing from llminferenceservice/{name} status.addresses" - ) - - -def try_resolve_endpoint_url(config: llmd_runtime.ResolvedConfig) -> str | None: - name = config.platform["inference_service"]["name"] - namespace = config.namespace - gateway_name = config.platform["gateway"]["status_address_name"] - payload = llmd_runtime.oc_get_json( - "llminferenceservice", name=name, namespace=namespace - ) - - for address in 
payload.get("status", {}).get("addresses", []): - if address.get("name") == gateway_name and address.get("url"): - return address["url"] - return None - - -def run_smoke_request( - config: llmd_runtime.ResolvedConfig, endpoint_url: str -) -> dict[str, object]: - namespace = config.namespace - name = config.platform["inference_service"]["name"] - deployment_name = f"{name}{config.platform['inference_service']['workload_deployment_name_suffix']}" - - payload = { - "model": config.model["served_model_name"], - "prompt": config.smoke_request["prompt"], - "max_tokens": config.smoke_request["max_tokens"], - "temperature": config.smoke_request["temperature"], - } - llmd_runtime.write_json( - config.artifact_dir / "artifacts" / "smoke.request.json", payload - ) - - retries = config.platform["smoke"]["request_retries"] - delay = config.platform["smoke"]["request_retry_delay_seconds"] - result = None - for _ in range(retries): - result = llmd_runtime.oc( - "exec", - "-n", - namespace, - f"deployment/{deployment_name}", - "-c", - "main", - "--", - "curl", - "-k", - "-sSf", - f"{endpoint_url}{config.platform['smoke']['endpoint_path']}", - "-H", - "Content-Type: application/json", - "-d", - json.dumps(payload), - check=False, - capture_output=True, - ) - if result.returncode == 0: - break - time.sleep(delay) - - if result is None or result.returncode != 0: - raise RuntimeError("Smoke request never succeeded against the llm_d endpoint") - - response = json.loads(result.stdout) - if not response.get("choices"): - raise RuntimeError(f"Invalid smoke response payload: {result.stdout}") - return response - - -def run_guidellm_benchmark( - config: llmd_runtime.ResolvedConfig, endpoint_url: str -) -> None: - benchmark_name = config.benchmark["job_name"] - namespace = config.namespace - - llmd_runtime.oc( - "delete", - "job,pvc", - benchmark_name, - "-n", - namespace, - "--ignore-not-found=true", - check=False, - ) - llmd_runtime.oc( - "delete", - "pod", - f"{benchmark_name}-copy", - "-n", - namespace, - "--ignore-not-found=true", - check=False, - ) - - llmd_runtime.apply_manifest( - config.artifact_dir / "src" / "guidellm-pvc.yaml", - llmd_runtime.render_guidellm_pvc(config), - ) - llmd_runtime.apply_manifest( - config.artifact_dir / "src" / "guidellm-job.yaml", - llmd_runtime.render_guidellm_job(config, endpoint_url), - ) - - def _job_terminal() -> dict[str, object] | None: - payload = llmd_runtime.oc_get_json( - "job", name=benchmark_name, namespace=namespace - ) - status = payload.get("status", {}) - if status.get("succeeded"): - return payload - if status.get("failed"): - raise RuntimeError(f"GuideLLM job {benchmark_name} failed") - return None - - llmd_runtime.wait_until( - f"GuideLLM job/{benchmark_name}", - timeout_seconds=config.benchmark["timeout_seconds"], - interval_seconds=10, - predicate=_job_terminal, - ) - - capture_guidellm_state(config) - copy_guidellm_results(config) - - -def copy_guidellm_results(config: llmd_runtime.ResolvedConfig) -> None: - benchmark_name = config.benchmark["job_name"] - namespace = config.namespace - pod_data = llmd_runtime.oc_get_json( - "pods", - namespace=namespace, - selector=f"job-name={benchmark_name}", - ignore_not_found=True, - ) - node_name = None - if pod_data and pod_data.get("items"): - node_name = pod_data["items"][0].get("spec", {}).get("nodeName") - - llmd_runtime.apply_manifest( - config.artifact_dir / "src" / "guidellm-copy-pod.yaml", - llmd_runtime.render_guidellm_copy_pod(config, node_name=node_name), - ) - - def _helper_ready() -> bool: - payload = 
llmd_runtime.oc_get_json( - "pod", - name=f"{benchmark_name}-copy", - namespace=namespace, - ) - conditions = payload.get("status", {}).get("conditions", []) - return any( - condition.get("type") == "Ready" and condition.get("status") == "True" - for condition in conditions - ) - - llmd_runtime.wait_until( - f"GuideLLM copy helper pod/{benchmark_name}-copy", - timeout_seconds=120, - interval_seconds=5, - predicate=_helper_ready, - ) - - result = llmd_runtime.oc( - "exec", - "-n", - namespace, - f"{benchmark_name}-copy", - "--", - "cat", - "/results/benchmarks.json", - check=False, - capture_output=True, - ) - if result.returncode == 0 and result.stdout: - llmd_runtime.write_text( - config.artifact_dir / "artifacts" / "results" / "benchmarks.json", - result.stdout, - ) - - -def capture_inference_service_state(config: llmd_runtime.ResolvedConfig) -> None: - name = config.platform["inference_service"]["name"] - namespace = config.namespace - artifacts_dir = config.artifact_dir / "artifacts" - selector = f"app.kubernetes.io/name={name}" - - capture_get( - "llminferenceservice", - name, - namespace, - "yaml", - artifacts_dir / "llminferenceservice.yaml", - ) - capture_get( - "llminferenceservice", - name, - namespace, - "json", - artifacts_dir / "llminferenceservice.json", - ) - capture_get( - "pods", - None, - namespace, - "yaml", - artifacts_dir / "llminferenceservice.pods.yaml", - selector=selector, - ) - capture_get( - "deployments", - None, - namespace, - "yaml", - artifacts_dir / "llminferenceservice.deployments.yaml", - selector=selector, - ) - capture_get( - "replicasets", - None, - namespace, - "yaml", - artifacts_dir / "llminferenceservice.replicasets.yaml", - selector=selector, - ) - capture_get( - "pods", None, namespace, "wide", artifacts_dir / "namespace.pods.status" - ) - capture_get( - "services", None, namespace, "wide", artifacts_dir / "namespace.services.status" - ) - - pod_list = llmd_runtime.oc_get_json( - "pods", namespace=namespace, selector=selector, ignore_not_found=True - ) - if pod_list: - lines = [] - previous_lines = [] - for pod in pod_list.get("items", []): - pod_name = pod["metadata"]["name"] - lines.append(f"=== {pod_name} ===") - log_result = llmd_runtime.oc( - "logs", - pod_name, - "-n", - namespace, - "--all-containers=true", - check=False, - capture_output=True, - ) - if log_result.stdout: - lines.append(log_result.stdout.rstrip()) - - previous_lines.append(f"=== {pod_name} ===") - previous_result = llmd_runtime.oc( - "logs", - pod_name, - "-n", - namespace, - "--previous", - "--all-containers=true", - check=False, - capture_output=True, - ) - if previous_result.stdout: - previous_lines.append(previous_result.stdout.rstrip()) - - llmd_runtime.write_text( - artifacts_dir / "llminferenceservice.pods.logs", "\n".join(lines) + "\n" - ) - llmd_runtime.write_text( - artifacts_dir / "llminferenceservice.pods.previous.logs", - "\n".join(previous_lines) + "\n", - ) - - -def capture_guidellm_state(config: llmd_runtime.ResolvedConfig) -> None: - benchmark_name = config.benchmark["job_name"] - namespace = config.namespace - artifacts_dir = config.artifact_dir / "artifacts" - - capture_get( - "job", - benchmark_name, - namespace, - "yaml", - artifacts_dir / "guidellm_benchmark_job.yaml", - ) - capture_get( - "pods", - None, - namespace, - "yaml", - artifacts_dir / "guidellm_benchmark_job.pods.yaml", - selector=f"job-name={benchmark_name}", - ) - result = llmd_runtime.oc( - "logs", - f"job/{benchmark_name}", - "-n", - namespace, - check=False, - capture_output=True, - ) - 
if result.returncode == 0 and result.stdout: - llmd_runtime.write_text( - artifacts_dir / "guidellm_benchmark_job.logs", result.stdout - ) - - -def capture_get( - kind: str, - name: str | None, - namespace: str, - output: str, - destination: Path, - *, - selector: str | None = None, -) -> None: - args = ["get", kind] - if name: - args.append(name) - args.extend(["-n", namespace]) - if selector: - args.extend(["-l", selector]) - args.extend(["-o", output]) - result = llmd_runtime.oc(*args, check=False, capture_output=True) - if result.returncode == 0 and result.stdout: - llmd_runtime.write_text(destination, result.stdout) + return test_toolbox_run() diff --git a/projects/llm_d/toolbox/cleanup/main.py b/projects/llm_d/toolbox/cleanup/main.py new file mode 100644 index 00000000..46d0aedf --- /dev/null +++ b/projects/llm_d/toolbox/cleanup/main.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +from projects.core.dsl import toolbox +from projects.llm_d.orchestration import llmd_runtime + + +def run() -> int: + llmd_runtime.init() + config = llmd_runtime.load_run_configuration() + return run_cleanup(config) + + +def run_cleanup(config: llmd_runtime.ResolvedConfig) -> int: + inference_service_name = config.platform["inference_service"]["name"] + benchmark_name = ( + config.benchmark["job_name"] if config.benchmark else "guidellm-benchmark" + ) + + if config.namespace_is_managed: + if llmd_runtime.resource_exists("namespace", config.namespace): + llmd_runtime.oc( + "delete", "namespace", config.namespace, "--ignore-not-found=true" + ) + llmd_runtime.wait_for_namespace_deleted( + config.namespace, + timeout_seconds=config.platform["cluster"]["cleanup_timeout_seconds"], + ) + else: + llmd_runtime.oc( + "delete", + "llminferenceservice", + inference_service_name, + "-n", + config.namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "job,pvc", + benchmark_name, + "-n", + config.namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "pod", + f"{benchmark_name}-copy", + "-n", + config.namespace, + "--ignore-not-found=true", + check=False, + ) + + return 0 + + +main = toolbox.create_toolbox_main(run) + + +if __name__ == "__main__": + main() diff --git a/projects/llm_d/toolbox/prepare/main.py b/projects/llm_d/toolbox/prepare/main.py new file mode 100644 index 00000000..7edad1f8 --- /dev/null +++ b/projects/llm_d/toolbox/prepare/main.py @@ -0,0 +1,391 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import json +import logging +from pathlib import Path + +from projects.core.dsl import toolbox +from projects.llm_d.orchestration import llmd_runtime + +LOGGER = logging.getLogger(__name__) + + +def run() -> int: + llmd_runtime.init() + config = llmd_runtime.load_run_configuration() + return run_prepare(config) + + +def run_prepare(config: llmd_runtime.ResolvedConfig) -> int: + LOGGER.info( + "Preparing llm_d preset=%s namespace=%s", config.preset_name, config.namespace + ) + + verify_oc_access() + verify_cluster_version(config) + prepare_cert_manager(config) + prepare_leader_worker_set(config) + prepare_nfd(config) + prepare_gpu_operator(config) + prepare_rhoai_operator(config) + apply_datasciencecluster(config) + wait_for_datasciencecluster_ready(config) + ensure_required_crds(config.platform["rhoai"]["required_crds_after_dsc"], config) + ensure_gateway(config) + ensure_test_namespace(config) + verify_gpu_nodes(config) + capture_prepare_state(config) + + return 0 + + +def 
verify_oc_access() -> None: + llmd_runtime.oc("whoami", capture_output=True) + + +def verify_cluster_version(config: llmd_runtime.ResolvedConfig) -> None: + version_info = llmd_runtime.oc("version", "-o", "json", capture_output=True) + payload = json.loads(version_info.stdout) + + openshift_version = ( + payload.get("openshiftVersion") + or payload.get("serverVersion", {}).get("gitVersion") + or payload.get("serverVersion", {}).get("platform") + ) + if not openshift_version: + raise RuntimeError( + "Could not determine OpenShift version from `oc version -o json`" + ) + + minimum = config.platform["cluster"]["minimum_openshift_version"] + if llmd_runtime.version_tuple(openshift_version) < llmd_runtime.version_tuple( + minimum + ): + raise RuntimeError( + f"Cluster version {openshift_version} is older than the llm_d minimum {minimum}" + ) + + +def ensure_operator_subscription(operator_spec: dict[str, str]) -> dict[str, object]: + llmd_runtime.ensure_subscription(operator_spec) + return llmd_runtime.wait_for_operator_csv( + operator_spec["package"], + operator_spec["namespace"], + timeout_seconds=operator_spec["wait_timeout_seconds"], + ) + + +def prepare_cert_manager(config: llmd_runtime.ResolvedConfig) -> None: + operator_spec = llmd_runtime.operator_spec_by_package( + config.platform, "openshift-cert-manager-operator" + ) + ensure_operator_subscription(operator_spec) + + +def prepare_leader_worker_set(config: llmd_runtime.ResolvedConfig) -> None: + operator_spec = llmd_runtime.operator_spec_by_package( + config.platform, "leader-worker-set" + ) + ensure_operator_subscription(operator_spec) + + +def prepare_nfd(config: llmd_runtime.ResolvedConfig) -> None: + operator_spec = llmd_runtime.operator_spec_by_package(config.platform, "nfd") + ensure_operator_subscription(operator_spec) + llmd_runtime.wait_for_crd( + operator_spec["bootstrap_crd"], + timeout_seconds=operator_spec["wait_timeout_seconds"], + ) + + manifest = llmd_runtime.load_manifest_template( + config, operator_spec["bootstrap_manifest"] + ) + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "nfd-nodefeaturediscovery.yaml", + manifest, + ) + + llmd_runtime.wait_until( + "NodeFeatureDiscovery bootstrap resource", + timeout_seconds=operator_spec["wait_timeout_seconds"], + interval_seconds=10, + predicate=lambda: llmd_runtime.resource_exists( + "nodefeaturediscovery", + manifest["metadata"]["name"], + namespace=manifest["metadata"]["namespace"], + ), + ) + + wait_for_nfd_gpu_labels( + config, timeout_seconds=operator_spec["wait_timeout_seconds"] + ) + + +def prepare_gpu_operator(config: llmd_runtime.ResolvedConfig) -> None: + operator_spec = llmd_runtime.operator_spec_by_package( + config.platform, "gpu-operator-certified" + ) + ensure_operator_subscription(operator_spec) + llmd_runtime.wait_for_crd( + operator_spec["bootstrap_crd"], + timeout_seconds=operator_spec["wait_timeout_seconds"], + ) + + manifest = llmd_runtime.load_manifest_template( + config, operator_spec["bootstrap_manifest"] + ) + clusterpolicy_name = manifest["metadata"]["name"] + if llmd_runtime.resource_exists("clusterpolicy", clusterpolicy_name): + LOGGER.info( + "ClusterPolicy/%s already exists; verifying readiness instead of applying bootstrap manifest", + clusterpolicy_name, + ) + wait_for_gpu_clusterpolicy_ready( + clusterpolicy_name, + timeout_seconds=operator_spec["wait_timeout_seconds"], + ) + return + + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "gpu-clusterpolicy.yaml", + manifest, + ) + + wait_for_gpu_clusterpolicy_ready( + 
clusterpolicy_name, + timeout_seconds=operator_spec["wait_timeout_seconds"], + ) + + +def wait_for_gpu_clusterpolicy_ready( + clusterpolicy_name: str, *, timeout_seconds: int +) -> None: + def _clusterpolicy_ready() -> bool: + payload = llmd_runtime.oc_get_json( + "clusterpolicy", + name=clusterpolicy_name, + ) + state = payload.get("status", {}).get("state", "") + return state.lower() == "ready" + + llmd_runtime.wait_until( + f"clusterpolicy/{clusterpolicy_name} ready", + timeout_seconds=timeout_seconds, + interval_seconds=15, + predicate=_clusterpolicy_ready, + ) + + +def prepare_rhoai_operator(config: llmd_runtime.ResolvedConfig) -> None: + operator_spec = llmd_runtime.operator_spec_by_package( + config.platform, "rhods-operator" + ) + ensure_operator_subscription(operator_spec) + ensure_required_crds(config.platform["rhoai"]["required_crds_before_dsc"], config) + + +def ensure_required_crds( + crd_names: list[str], config: llmd_runtime.ResolvedConfig +) -> None: + for crd_name in crd_names: + llmd_runtime.wait_for_crd( + crd_name, + timeout_seconds=config.platform["rhoai"]["wait_timeout_seconds"], + ) + + +def apply_datasciencecluster(config: llmd_runtime.ResolvedConfig) -> None: + manifest = llmd_runtime.render_datasciencecluster(config) + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "datasciencecluster.yaml", manifest + ) + llmd_runtime.oc( + "get", + "datasciencecluster", + config.platform["rhoai"]["datasciencecluster_name"], + "-n", + config.platform["rhoai"]["namespace"], + "-o", + "yaml", + capture_output=True, + ) + + +def wait_for_datasciencecluster_ready(config: llmd_runtime.ResolvedConfig) -> None: + rhoai = config.platform["rhoai"] + + def _dsc_ready() -> bool: + payload = llmd_runtime.oc_get_json( + "datasciencecluster", + name=rhoai["datasciencecluster_name"], + namespace=rhoai["namespace"], + ) + phase = payload.get("status", {}).get("phase") + if phase == "Ready": + return True + if phase in {"Failed", "Error"}: + raise RuntimeError(f"DataScienceCluster entered terminal phase {phase}") + return False + + llmd_runtime.wait_until( + f"datasciencecluster/{rhoai['datasciencecluster_name']} ready", + timeout_seconds=rhoai["wait_timeout_seconds"], + interval_seconds=10, + predicate=_dsc_ready, + ) + + +def ensure_gateway(config: llmd_runtime.ResolvedConfig) -> None: + gateway = config.platform["gateway"] + if not llmd_runtime.resource_exists( + "gateway", gateway["name"], namespace=gateway["namespace"] + ): + if not gateway["create_if_missing"]: + raise RuntimeError( + f"Required gateway {gateway['name']} does not exist in {gateway['namespace']}" + ) + manifest = llmd_runtime.render_gateway(config) + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "gateway.yaml", manifest + ) + + def _gateway_programmed() -> bool: + resource = llmd_runtime.oc_get_json( + "gateway", + name=gateway["name"], + namespace=gateway["namespace"], + ) + return llmd_runtime.condition_status(resource, "Programmed") == "True" + + llmd_runtime.wait_until( + f"gateway/{gateway['name']} programmed", + timeout_seconds=gateway["wait_timeout_seconds"], + interval_seconds=10, + predicate=_gateway_programmed, + ) + + +def ensure_test_namespace(config: llmd_runtime.ResolvedConfig) -> None: + llmd_runtime.ensure_namespace( + config.namespace, + labels={ + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + }, + ) + + +def verify_gpu_nodes(config: llmd_runtime.ResolvedConfig) -> None: + selector = config.platform["cluster"]["gpu_node_label_selector"] + 
data = llmd_runtime.oc_get_json("nodes", selector=selector, ignore_not_found=True) + items = data.get("items", []) if data else [] + if not items: + raise RuntimeError( + f"No GPU nodes found with selector {selector}. The llm_d smoke path requires GPUs." + ) + + +def wait_for_nfd_gpu_labels( + config: llmd_runtime.ResolvedConfig, *, timeout_seconds: int +) -> None: + selectors = config.platform["cluster"]["nfd_gpu_detection_labels"] + + def _labels_present() -> bool: + for selector in selectors: + data = llmd_runtime.oc_get_json( + "nodes", selector=selector, ignore_not_found=True + ) + if data and data.get("items"): + return True + return False + + llmd_runtime.wait_until( + "NFD GPU discovery labels on cluster nodes", + timeout_seconds=timeout_seconds, + interval_seconds=15, + predicate=_labels_present, + ) + + +def capture_prepare_state(config: llmd_runtime.ResolvedConfig) -> None: + artifacts_dir = config.artifact_dir / "artifacts" + rhoai = config.platform["rhoai"] + gateway = config.platform["gateway"] + + capture_resource_yaml( + "datasciencecluster", + rhoai["datasciencecluster_name"], + rhoai["namespace"], + artifacts_dir / "datasciencecluster.yaml", + ) + capture_resource_yaml( + "gateway", + gateway["name"], + gateway["namespace"], + artifacts_dir / "gateway.yaml", + ) + gateway_service = llmd_runtime.oc( + "get", + "service", + "-A", + "-l", + f"gateway.networking.k8s.io/gateway-name={gateway['name']}", + "-o", + "yaml", + check=False, + capture_output=True, + ) + if gateway_service.returncode == 0 and gateway_service.stdout: + llmd_runtime.write_text( + artifacts_dir / "gateway.service.yaml", gateway_service.stdout + ) + if config.platform["artifacts"]["capture_namespace_events"]: + capture_namespace_events( + config.namespace, artifacts_dir / "namespace.events.txt" + ) + + +def capture_resource_yaml( + kind: str, + name: str, + namespace: str, + destination: Path, + *, + check: bool = True, +) -> None: + result = llmd_runtime.oc( + "get", + kind, + name, + "-n", + namespace, + "-o", + "yaml", + check=check, + capture_output=True, + ) + if result.returncode == 0 and result.stdout: + llmd_runtime.write_text(destination, result.stdout) + + +def capture_namespace_events(namespace: str, destination: Path) -> None: + result = llmd_runtime.oc( + "get", + "events", + "-n", + namespace, + "--sort-by=.metadata.creationTimestamp", + check=False, + capture_output=True, + ) + if result.returncode == 0 and result.stdout: + llmd_runtime.write_text(destination, result.stdout) + + +main = toolbox.create_toolbox_main(run) + + +if __name__ == "__main__": + main() diff --git a/projects/llm_d/toolbox/test/main.py b/projects/llm_d/toolbox/test/main.py new file mode 100644 index 00000000..d779c18c --- /dev/null +++ b/projects/llm_d/toolbox/test/main.py @@ -0,0 +1,492 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import json +import logging +import time +from pathlib import Path + +from projects.core.dsl import toolbox +from projects.llm_d.orchestration import llmd_runtime + +LOGGER = logging.getLogger(__name__) + + +def run() -> int: + llmd_runtime.init() + config = llmd_runtime.load_run_configuration() + return run_test(config) + + +def run_test(config: llmd_runtime.ResolvedConfig) -> int: + name = config.platform["inference_service"]["name"] + namespace = config.namespace + artifacts_dir = config.artifact_dir / "artifacts" + + LOGGER.info("Testing llm_d preset=%s namespace=%s", config.preset_name, namespace) + + endpoint_url = None + try: + endpoint_url = 
deploy_inference_service(config) + smoke_response = run_smoke_request(config, endpoint_url) + llmd_runtime.write_json(artifacts_dir / "smoke.response.json", smoke_response) + + if config.benchmark: + run_guidellm_benchmark(config, endpoint_url) + + return 0 + finally: + capture_inference_service_state(config) + if endpoint_url: + llmd_runtime.write_text(artifacts_dir / "endpoint.url", f"{endpoint_url}\n") + benchmark_name = ( + config.benchmark["job_name"] if config.benchmark else "guidellm-benchmark" + ) + llmd_runtime.oc( + "delete", + "job,pvc", + benchmark_name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "pod", + f"{benchmark_name}-copy", + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + events = llmd_runtime.oc( + "get", + "events", + "-n", + namespace, + "--sort-by=.metadata.creationTimestamp", + check=False, + capture_output=True, + ) + if events.returncode == 0 and events.stdout: + llmd_runtime.write_text( + artifacts_dir / "namespace.events.txt", events.stdout + ) + + +def deploy_inference_service(config: llmd_runtime.ResolvedConfig) -> str: + name = config.platform["inference_service"]["name"] + namespace = config.namespace + selector = f"app.kubernetes.io/name={name}" + + llmd_runtime.oc( + "delete", + "llminferenceservice", + name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + + def _old_pods_gone() -> bool: + pods = llmd_runtime.oc_get_json( + "pods", namespace=namespace, selector=selector, ignore_not_found=True + ) + return not pods or not pods.get("items") + + llmd_runtime.wait_until( + f"old llm-d pods to disappear in {namespace}", + timeout_seconds=config.platform["inference_service"]["delete_timeout_seconds"], + interval_seconds=10, + predicate=_old_pods_gone, + ) + + manifest = llmd_runtime.render_inference_service(config) + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "llminferenceservice.yaml", manifest + ) + + def _pods_present() -> bool: + pods = llmd_runtime.oc_get_json( + "pods", namespace=namespace, selector=selector, ignore_not_found=True + ) + return bool(pods and pods.get("items")) + + llmd_runtime.wait_until( + f"llm-d pods to appear in {namespace}", + timeout_seconds=config.platform["inference_service"][ + "pod_appearance_timeout_seconds" + ], + interval_seconds=5, + predicate=_pods_present, + ) + + def _service_ready() -> bool: + payload = llmd_runtime.oc_get_json( + "llminferenceservice", name=name, namespace=namespace + ) + return llmd_runtime.condition_status(payload, "Ready") == "True" + + llmd_runtime.wait_until( + f"llminferenceservice/{name} ready", + timeout_seconds=config.platform["inference_service"]["ready_timeout_seconds"], + interval_seconds=10, + predicate=_service_ready, + ) + + return llmd_runtime.wait_until( + f"gateway address for llminferenceservice/{name}", + timeout_seconds=config.platform["inference_service"]["ready_timeout_seconds"], + interval_seconds=10, + predicate=lambda: try_resolve_endpoint_url(config), + ) + + +def resolve_endpoint_url(config: llmd_runtime.ResolvedConfig) -> str: + endpoint_url = try_resolve_endpoint_url(config) + if endpoint_url: + return endpoint_url + + name = config.platform["inference_service"]["name"] + gateway_name = config.platform["gateway"]["status_address_name"] + raise RuntimeError( + f"Gateway address {gateway_name} is missing from llminferenceservice/{name} status.addresses" + ) + + +def try_resolve_endpoint_url(config: llmd_runtime.ResolvedConfig) -> str | None: + name = 
config.platform["inference_service"]["name"] + namespace = config.namespace + gateway_name = config.platform["gateway"]["status_address_name"] + payload = llmd_runtime.oc_get_json( + "llminferenceservice", name=name, namespace=namespace + ) + + for address in payload.get("status", {}).get("addresses", []): + if address.get("name") == gateway_name and address.get("url"): + return address["url"] + return None + + +def run_smoke_request( + config: llmd_runtime.ResolvedConfig, endpoint_url: str +) -> dict[str, object]: + namespace = config.namespace + name = config.platform["inference_service"]["name"] + deployment_name = f"{name}{config.platform['inference_service']['workload_deployment_name_suffix']}" + + payload = { + "model": config.model["served_model_name"], + "prompt": config.smoke_request["prompt"], + "max_tokens": config.smoke_request["max_tokens"], + "temperature": config.smoke_request["temperature"], + } + llmd_runtime.write_json( + config.artifact_dir / "artifacts" / "smoke.request.json", payload + ) + + retries = config.platform["smoke"]["request_retries"] + delay = config.platform["smoke"]["request_retry_delay_seconds"] + result = None + for _ in range(retries): + result = llmd_runtime.oc( + "exec", + "-n", + namespace, + f"deployment/{deployment_name}", + "-c", + "main", + "--", + "curl", + "-k", + "-sSf", + f"{endpoint_url}{config.platform['smoke']['endpoint_path']}", + "-H", + "Content-Type: application/json", + "-d", + json.dumps(payload), + check=False, + capture_output=True, + ) + if result.returncode == 0: + break + time.sleep(delay) + + if result is None or result.returncode != 0: + raise RuntimeError("Smoke request never succeeded against the llm_d endpoint") + + response = json.loads(result.stdout) + if not response.get("choices"): + raise RuntimeError(f"Invalid smoke response payload: {result.stdout}") + return response + + +def run_guidellm_benchmark( + config: llmd_runtime.ResolvedConfig, endpoint_url: str +) -> None: + benchmark_name = config.benchmark["job_name"] + namespace = config.namespace + + llmd_runtime.oc( + "delete", + "job,pvc", + benchmark_name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "pod", + f"{benchmark_name}-copy", + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "guidellm-pvc.yaml", + llmd_runtime.render_guidellm_pvc(config), + ) + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "guidellm-job.yaml", + llmd_runtime.render_guidellm_job(config, endpoint_url), + ) + + def _job_terminal() -> dict[str, object] | None: + payload = llmd_runtime.oc_get_json( + "job", name=benchmark_name, namespace=namespace + ) + status = payload.get("status", {}) + if status.get("succeeded"): + return payload + if status.get("failed"): + raise RuntimeError(f"GuideLLM job {benchmark_name} failed") + return None + + llmd_runtime.wait_until( + f"GuideLLM job/{benchmark_name}", + timeout_seconds=config.benchmark["timeout_seconds"], + interval_seconds=10, + predicate=_job_terminal, + ) + + capture_guidellm_state(config) + copy_guidellm_results(config) + + +def copy_guidellm_results(config: llmd_runtime.ResolvedConfig) -> None: + benchmark_name = config.benchmark["job_name"] + namespace = config.namespace + pod_data = llmd_runtime.oc_get_json( + "pods", + namespace=namespace, + selector=f"job-name={benchmark_name}", + ignore_not_found=True, + ) + node_name = None + if pod_data and pod_data.get("items"): + node_name = 
pod_data["items"][0].get("spec", {}).get("nodeName") + + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "guidellm-copy-pod.yaml", + llmd_runtime.render_guidellm_copy_pod(config, node_name=node_name), + ) + + def _helper_ready() -> bool: + payload = llmd_runtime.oc_get_json( + "pod", + name=f"{benchmark_name}-copy", + namespace=namespace, + ) + conditions = payload.get("status", {}).get("conditions", []) + return any( + condition.get("type") == "Ready" and condition.get("status") == "True" + for condition in conditions + ) + + llmd_runtime.wait_until( + f"GuideLLM copy helper pod/{benchmark_name}-copy", + timeout_seconds=120, + interval_seconds=5, + predicate=_helper_ready, + ) + + result = llmd_runtime.oc( + "exec", + "-n", + namespace, + f"{benchmark_name}-copy", + "--", + "cat", + "/results/benchmarks.json", + check=False, + capture_output=True, + ) + if result.returncode == 0 and result.stdout: + llmd_runtime.write_text( + config.artifact_dir / "artifacts" / "results" / "benchmarks.json", + result.stdout, + ) + + +def capture_inference_service_state(config: llmd_runtime.ResolvedConfig) -> None: + name = config.platform["inference_service"]["name"] + namespace = config.namespace + artifacts_dir = config.artifact_dir / "artifacts" + selector = f"app.kubernetes.io/name={name}" + + capture_get( + "llminferenceservice", + name, + namespace, + "yaml", + artifacts_dir / "llminferenceservice.yaml", + ) + capture_get( + "llminferenceservice", + name, + namespace, + "json", + artifacts_dir / "llminferenceservice.json", + ) + capture_get( + "pods", + None, + namespace, + "yaml", + artifacts_dir / "llminferenceservice.pods.yaml", + selector=selector, + ) + capture_get( + "deployments", + None, + namespace, + "yaml", + artifacts_dir / "llminferenceservice.deployments.yaml", + selector=selector, + ) + capture_get( + "replicasets", + None, + namespace, + "yaml", + artifacts_dir / "llminferenceservice.replicasets.yaml", + selector=selector, + ) + capture_get( + "pods", None, namespace, "wide", artifacts_dir / "namespace.pods.status" + ) + capture_get( + "services", None, namespace, "wide", artifacts_dir / "namespace.services.status" + ) + + pod_list = llmd_runtime.oc_get_json( + "pods", namespace=namespace, selector=selector, ignore_not_found=True + ) + if pod_list: + lines = [] + previous_lines = [] + for pod in pod_list.get("items", []): + pod_name = pod["metadata"]["name"] + lines.append(f"=== {pod_name} ===") + log_result = llmd_runtime.oc( + "logs", + pod_name, + "-n", + namespace, + "--all-containers=true", + check=False, + capture_output=True, + ) + if log_result.stdout: + lines.append(log_result.stdout.rstrip()) + + previous_lines.append(f"=== {pod_name} ===") + previous_result = llmd_runtime.oc( + "logs", + pod_name, + "-n", + namespace, + "--previous", + "--all-containers=true", + check=False, + capture_output=True, + ) + if previous_result.stdout: + previous_lines.append(previous_result.stdout.rstrip()) + + llmd_runtime.write_text( + artifacts_dir / "llminferenceservice.pods.logs", "\n".join(lines) + "\n" + ) + llmd_runtime.write_text( + artifacts_dir / "llminferenceservice.pods.previous.logs", + "\n".join(previous_lines) + "\n", + ) + + +def capture_guidellm_state(config: llmd_runtime.ResolvedConfig) -> None: + benchmark_name = config.benchmark["job_name"] + namespace = config.namespace + artifacts_dir = config.artifact_dir / "artifacts" + + capture_get( + "job", + benchmark_name, + namespace, + "yaml", + artifacts_dir / "guidellm_benchmark_job.yaml", + ) + capture_get( + 
"pods", + None, + namespace, + "yaml", + artifacts_dir / "guidellm_benchmark_job.pods.yaml", + selector=f"job-name={benchmark_name}", + ) + result = llmd_runtime.oc( + "logs", + f"job/{benchmark_name}", + "-n", + namespace, + check=False, + capture_output=True, + ) + if result.returncode == 0 and result.stdout: + llmd_runtime.write_text( + artifacts_dir / "guidellm_benchmark_job.logs", result.stdout + ) + + +def capture_get( + kind: str, + name: str | None, + namespace: str, + output: str, + destination: Path, + *, + selector: str | None = None, +) -> None: + args = ["get", kind] + if name: + args.append(name) + args.extend(["-n", namespace]) + if selector: + args.extend(["-l", selector]) + args.extend(["-o", output]) + result = llmd_runtime.oc(*args, check=False, capture_output=True) + if result.returncode == 0 and result.stdout: + llmd_runtime.write_text(destination, result.stdout) + + +main = toolbox.create_toolbox_main(run) + + +if __name__ == "__main__": + main() diff --git a/tests/llm_d/test_runtime.py b/tests/llm_d/test_runtime.py index 4557de00..b2bcff1d 100644 --- a/tests/llm_d/test_runtime.py +++ b/tests/llm_d/test_runtime.py @@ -5,8 +5,8 @@ import pytest from projects.llm_d.orchestration import llmd_runtime -from projects.llm_d.orchestration import prepare_llmd -from projects.llm_d.orchestration import test_llmd +from projects.llm_d.toolbox.prepare import main as prepare_toolbox +from projects.llm_d.toolbox.test import main as test_toolbox def test_derive_namespace_uses_prefix_once() -> None: @@ -16,10 +16,12 @@ def test_derive_namespace_uses_prefix_once() -> None: def test_parse_overrides_rejects_unknown_keys() -> None: with pytest.raises(ValueError, match="Unsupported llm_d override keys"): - llmd_runtime.parse_overrides('{"model":"other"}') + llmd_runtime.parse_overrides('{"model":"other"}', allowed_keys=("namespace",)) -def test_load_run_configuration_resolves_alias(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: +def test_load_run_configuration_resolves_alias( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() @@ -30,7 +32,9 @@ def test_load_run_configuration_resolves_alias(tmp_path: Path, monkeypatch: pyte encoding="utf-8", ) - config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + config = llmd_runtime.load_run_configuration( + cwd=tmp_path, artifact_dir=artifact_dir + ) assert config.preset_name == "smoke" assert config.preset_alias == "cks" @@ -39,23 +43,48 @@ def test_load_run_configuration_resolves_alias(tmp_path: Path, monkeypatch: pyte assert config.namespace_is_managed is True -def test_namespace_override_is_not_managed(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: +def test_load_run_configuration_consolidates_config_d( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + + llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + consolidated = llmd_runtime.load_yaml(artifact_dir / "config.yaml") + + assert "platform" in consolidated + assert "models" in consolidated + assert "runtime" in consolidated + assert "workloads" in consolidated + assert consolidated["runtime"]["default_preset"] == "smoke" + + +def test_namespace_override_is_not_managed( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: 
monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", '{"namespace":"custom-ns"}') artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() - config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + config = llmd_runtime.load_run_configuration( + cwd=tmp_path, artifact_dir=artifact_dir + ) assert config.namespace == "custom-ns" assert config.namespace_is_managed is False -def test_render_inference_service_injects_model_and_epp(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: +def test_render_inference_service_injects_model_and_epp( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() - config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + config = llmd_runtime.load_run_configuration( + cwd=tmp_path, artifact_dir=artifact_dir + ) manifest = llmd_runtime.render_inference_service(config) assert manifest["metadata"]["name"] == "llm-d" @@ -64,12 +93,16 @@ def test_render_inference_service_injects_model_and_epp(tmp_path: Path, monkeypa assert manifest["spec"]["model"]["uri"] == "hf://Qwen/Qwen3-0.6B" assert manifest["spec"]["model"]["name"] == config.model["served_model_name"] assert manifest["spec"]["model"]["uri"] == config.model["uri"] - router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0]["args"] + router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0][ + "args" + ] assert router_args[-2] == "--config-text" assert "EndpointPickerConfig" in router_args[-1] -def test_render_guidellm_job_uses_target_and_rate(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: +def test_render_guidellm_job_uses_target_and_rate( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() @@ -78,7 +111,9 @@ def test_render_guidellm_job_uses_target_and_rate(tmp_path: Path, monkeypatch: p encoding="utf-8", ) - config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + config = llmd_runtime.load_run_configuration( + cwd=tmp_path, artifact_dir=artifact_dir + ) manifest = llmd_runtime.render_guidellm_job(config, "https://example.test") container = manifest["spec"]["template"]["spec"]["containers"][0] @@ -93,12 +128,14 @@ def test_prepare_gpu_operator_skips_existing_clusterpolicy( monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() - config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + config = llmd_runtime.load_run_configuration( + cwd=tmp_path, artifact_dir=artifact_dir + ) calls: list[str] = [] monkeypatch.setattr( - prepare_llmd, + prepare_toolbox, "ensure_operator_subscription", lambda operator_spec: calls.append(f"subscription:{operator_spec['package']}"), ) @@ -129,9 +166,12 @@ def fail_apply(*_: object, **__: object) -> None: lambda kind, name: {"status": {"state": "ready"}}, ) - prepare_llmd.prepare_gpu_operator(config) + prepare_toolbox.prepare_gpu_operator(config) - assert calls == ["subscription:gpu-operator-certified", "crd:clusterpolicies.nvidia.com"] + assert calls == [ + "subscription:gpu-operator-certified", + "crd:clusterpolicies.nvidia.com", + ] def test_prepare_gpu_operator_bootstraps_missing_clusterpolicy( @@ -140,7 +180,9 @@ def test_prepare_gpu_operator_bootstraps_missing_clusterpolicy( 
monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() - config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + config = llmd_runtime.load_run_configuration( + cwd=tmp_path, artifact_dir=artifact_dir + ) applied: list[Path] = [] manifest = { @@ -150,9 +192,11 @@ def test_prepare_gpu_operator_bootstraps_missing_clusterpolicy( "spec": {}, } - monkeypatch.setattr(prepare_llmd, "ensure_operator_subscription", lambda _: None) + monkeypatch.setattr(prepare_toolbox, "ensure_operator_subscription", lambda _: None) monkeypatch.setattr(llmd_runtime, "wait_for_crd", lambda *_, **__: None) - monkeypatch.setattr(llmd_runtime, "load_manifest_template", lambda _config, _path: manifest) + monkeypatch.setattr( + llmd_runtime, "load_manifest_template", lambda _config, _path: manifest + ) monkeypatch.setattr(llmd_runtime, "resource_exists", lambda kind, name: False) monkeypatch.setattr( llmd_runtime, @@ -165,7 +209,7 @@ def test_prepare_gpu_operator_bootstraps_missing_clusterpolicy( lambda kind, name: {"status": {"state": "ready"}}, ) - prepare_llmd.prepare_gpu_operator(config) + prepare_toolbox.prepare_gpu_operator(config) assert applied == [artifact_dir / "src" / "gpu-clusterpolicy.yaml"] @@ -196,7 +240,9 @@ def test_resolve_endpoint_url_requires_gateway_address( monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() - config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + config = llmd_runtime.load_run_configuration( + cwd=tmp_path, artifact_dir=artifact_dir + ) def fake_oc_get_json(kind: str, **_: object) -> dict[str, object]: assert kind == "llminferenceservice" @@ -205,4 +251,4 @@ def fake_oc_get_json(kind: str, **_: object) -> dict[str, object]: monkeypatch.setattr(llmd_runtime, "oc_get_json", fake_oc_get_json) with pytest.raises(RuntimeError, match="Gateway address"): - test_llmd.resolve_endpoint_url(config) + test_toolbox.resolve_endpoint_url(config) From 1c707d6223449e496d9ac8bf54f95f1e9ee7804f Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Thu, 16 Apr 2026 08:30:42 +0100 Subject: [PATCH 03/21] feat: Add llm_d model cache --- .../orchestration/config.d/model_cache.yaml | 25 + .../llm_d/orchestration/config.d/models.yaml | 7 + .../orchestration/config.d/platform.yaml | 1 + projects/llm_d/orchestration/llmd_runtime.py | 452 +++++++++++++++++- projects/llm_d/toolbox/cleanup/main.py | 105 +++- projects/llm_d/toolbox/prepare/main.py | 4 + .../llm_d/toolbox/prepare_model_cache/main.py | 207 ++++++++ tests/llm_d/test_runtime.py | 185 ++++++- 8 files changed, 951 insertions(+), 35 deletions(-) create mode 100644 projects/llm_d/orchestration/config.d/model_cache.yaml create mode 100644 projects/llm_d/toolbox/prepare_model_cache/main.py diff --git a/projects/llm_d/orchestration/config.d/model_cache.yaml b/projects/llm_d/orchestration/config.d/model_cache.yaml new file mode 100644 index 00000000..eae01772 --- /dev/null +++ b/projects/llm_d/orchestration/config.d/model_cache.yaml @@ -0,0 +1,25 @@ +enabled: true +marker_filename: .forge-model-cache.json + +pvc: + name_prefix: llm-d-model + size: 15Gi + access_mode: ReadWriteOnce + storage_class_name: null + model_directory_name: model + +download: + wait_timeout_seconds: 7200 + poll_interval_seconds: 15 + pod_image_pull_policy: IfNotPresent + +hf: + downloader_image: registry.access.redhat.com/ubi9/python-311 + token_secret_name: null + token_secret_key: token + +oci: + 
extractor_image: registry.redhat.io/openshift4/ose-cli:v4.19 + registry_auth_secret_name: null + registry_auth_secret_key: .dockerconfigjson + image_path: / diff --git a/projects/llm_d/orchestration/config.d/models.yaml b/projects/llm_d/orchestration/config.d/models.yaml index fd204db4..4334cf4a 100644 --- a/projects/llm_d/orchestration/config.d/models.yaml +++ b/projects/llm_d/orchestration/config.d/models.yaml @@ -1,6 +1,9 @@ qwen3-0-6b: served_model_name: Qwen/Qwen3-0.6B uri: hf://Qwen/Qwen3-0.6B + cache: + pvc_size: 10Gi + access_mode: ReadWriteOnce resources: requests: cpu: "4" @@ -14,6 +17,10 @@ qwen3-0-6b: llama-3-1-8b-instruct-fp8: served_model_name: llama-3-1-8b-instruct-fp8 uri: oci://registry.redhat.io/rhelai1/modelcar-llama-3-1-8b-instruct-fp8-dynamic:1.5 + cache: + pvc_size: 40Gi + access_mode: ReadWriteOnce + oci_image_path: / resources: requests: cpu: "4" diff --git a/projects/llm_d/orchestration/config.d/platform.yaml b/projects/llm_d/orchestration/config.d/platform.yaml index c5e35ea4..9f3b9e0e 100644 --- a/projects/llm_d/orchestration/config.d/platform.yaml +++ b/projects/llm_d/orchestration/config.d/platform.yaml @@ -1,5 +1,6 @@ cluster: minimum_openshift_version: "4.19.9" + namespace_name: forge-llm-d namespace_prefix: llm-d namespace_max_length: 63 cleanup_timeout_seconds: 900 diff --git a/projects/llm_d/orchestration/llmd_runtime.py b/projects/llm_d/orchestration/llmd_runtime.py index 8b507bd2..2c961e7c 100644 --- a/projects/llm_d/orchestration/llmd_runtime.py +++ b/projects/llm_d/orchestration/llmd_runtime.py @@ -1,6 +1,7 @@ from __future__ import annotations import copy +import hashlib import json import logging import os @@ -37,7 +38,9 @@ class ResolvedConfig: namespace_is_managed: bool gpu_count: int | None platform: dict[str, Any] + model_key: str model: dict[str, Any] + model_cache: dict[str, Any] smoke_request: dict[str, Any] benchmark: dict[str, Any] | None fournos_config: dict[str, Any] @@ -48,6 +51,31 @@ def manifests_dir(self) -> Path: return self.config_dir / "manifests" +@dataclass(frozen=True) +class ModelCacheSpec: + source_uri: str + source_scheme: str + cache_key: str + namespace: str + pvc_name: str + pvc_size: str + access_mode: str + storage_class_name: str | None + model_path: str + model_uri: str + marker_filename: str + download_job_name: str + hf_token_secret_name: str | None + hf_token_secret_key: str | None + oci_image_path: str | None + oci_registry_auth_secret_name: str | None + oci_registry_auth_secret_key: str | None + + @property + def marker_path(self) -> str: + return f"/cache/{self.model_path}/{self.marker_filename}" + + def init() -> Path: if not logging.getLogger().handlers: logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") @@ -73,6 +101,7 @@ def load_run_configuration( _reinitialize_project_config() platform_data = copy.deepcopy(config.project.get_config("platform")) + model_cache = copy.deepcopy(config.project.get_config("model_cache")) fournos_config = load_fournos_config(cwd) overrides = parse_overrides( os.environ.get("FORGE_CONFIG_OVERRIDES", ""), @@ -109,10 +138,15 @@ def load_run_configuration( job_name = f"local-{preset_name}" namespace_override = overrides.get("namespace") or fournos_config.get("namespace") - namespace = namespace_override or derive_namespace( - job_name, - platform_data["cluster"]["namespace_prefix"], - platform_data["cluster"]["namespace_max_length"], + default_namespace = platform_data["cluster"].get("namespace_name") + namespace = ( + namespace_override + or default_namespace 
+ or derive_namespace( + job_name, + platform_data["cluster"]["namespace_prefix"], + platform_data["cluster"]["namespace_max_length"], + ) ) gpu_count = normalize_gpu_count(fournos_config.get("gpu-count")) @@ -125,10 +159,12 @@ def load_run_configuration( preset_alias=preset_alias, job_name=job_name, namespace=namespace, - namespace_is_managed=namespace_override is None, + namespace_is_managed=namespace_override is None and default_namespace is None, gpu_count=gpu_count, platform=platform_data, + model_key=model_name, model=model, + model_cache=model_cache, smoke_request=smoke_request, benchmark=benchmark, fournos_config=fournos_config, @@ -223,6 +259,76 @@ def derive_namespace(job_name: str, prefix: str, max_length: int) -> str: return namespace +def slugify_identifier(value: str, *, max_length: int = 63) -> str: + slug = re.sub(r"[^a-z0-9-]+", "-", value.lower()) + slug = re.sub(r"-{2,}", "-", slug).strip("-") + return slug[:max_length].rstrip("-") or "item" + + +def truncate_k8s_name(value: str, *, max_length: int = 63) -> str: + return value[:max_length].rstrip("-") + + +def resolve_model_cache(config: ResolvedConfig) -> ModelCacheSpec | None: + if not config.model_cache.get("enabled", False): + return None + + source_uri = config.model["uri"] + if source_uri.startswith(("pvc://", "pvc+hf://")): + return None + + if source_uri.startswith("hf://"): + source_scheme = "hf" + elif source_uri.startswith("oci://"): + source_scheme = "oci" + else: + raise ValueError( + f"Unsupported model cache source URI for {config.model_key}: {source_uri}" + ) + + model_cache_overrides = config.model.get("cache", {}) + pvc_defaults = config.model_cache["pvc"] + pvc_prefix = config.model_cache["pvc"]["name_prefix"] + cache_key = hashlib.sha256(source_uri.encode("utf-8")).hexdigest()[:10] + pvc_name = truncate_k8s_name( + f"{pvc_prefix}-{slugify_identifier(config.model_key, max_length=32)}-{cache_key}" + ) + model_path = pvc_defaults["model_directory_name"] + + return ModelCacheSpec( + source_uri=source_uri, + source_scheme=source_scheme, + cache_key=cache_key, + namespace=config.namespace, + pvc_name=pvc_name, + pvc_size=model_cache_overrides.get("pvc_size", pvc_defaults["size"]), + access_mode=model_cache_overrides.get( + "access_mode", pvc_defaults["access_mode"] + ), + storage_class_name=model_cache_overrides.get( + "storage_class_name", pvc_defaults.get("storage_class_name") + ), + model_path=model_path, + model_uri=f"pvc://{pvc_name}/{model_path}", + marker_filename=config.model_cache["marker_filename"], + download_job_name=truncate_k8s_name(f"{pvc_name}-download"), + hf_token_secret_name=model_cache_overrides.get( + "hf_token_secret_name", config.model_cache["hf"].get("token_secret_name") + ), + hf_token_secret_key=config.model_cache["hf"].get("token_secret_key"), + oci_image_path=model_cache_overrides.get( + "oci_image_path", config.model_cache["oci"].get("image_path") + ), + oci_registry_auth_secret_name=model_cache_overrides.get( + "oci_registry_auth_secret_name", + config.model_cache["oci"].get("registry_auth_secret_name"), + ), + oci_registry_auth_secret_key=config.model_cache["oci"].get( + "registry_auth_secret_key" + ), + ) + + def load_yaml(path: Path) -> Any: with path.open(encoding="utf-8") as handle: return yaml.safe_load(handle) @@ -524,6 +630,93 @@ def condition_status(resource: dict[str, Any], condition_type: str) -> str | Non return None +def pvc_access_mode_matches(actual_modes: list[str], expected_mode: str) -> bool: + return expected_mode in actual_modes + + +def wait_for_pvc_bound( 
+ pvc_name: str, namespace: str, *, timeout_seconds: int +) -> dict[str, Any]: + def _pvc_bound() -> dict[str, Any] | None: + payload = oc_get_json( + "persistentvolumeclaim", + name=pvc_name, + namespace=namespace, + ignore_not_found=True, + ) + if not payload: + return None + if payload.get("status", {}).get("phase") == "Bound": + return payload + return None + + return wait_until( + f"persistentvolumeclaim/{pvc_name} bound in {namespace}", + timeout_seconds=timeout_seconds, + interval_seconds=5, + predicate=_pvc_bound, + ) + + +def wait_for_job_completion( + job_name: str, namespace: str, *, timeout_seconds: int, interval_seconds: int = 10 +) -> dict[str, Any]: + def _job_completed() -> dict[str, Any] | None: + payload = oc_get_json( + "job", + name=job_name, + namespace=namespace, + ignore_not_found=True, + ) + if not payload: + return None + status = payload.get("status", {}) + if status.get("succeeded", 0): + return payload + failed_count = status.get("failed", 0) + for condition in status.get("conditions", []): + if condition.get("type") == "Failed" and condition.get("status") == "True": + raise RuntimeError( + f"job/{job_name} failed: {condition.get('reason') or 'unknown reason'}" + ) + if failed_count: + raise RuntimeError(f"job/{job_name} failed after {failed_count} attempt(s)") + return None + + return wait_until( + f"job/{job_name} completion in {namespace}", + timeout_seconds=timeout_seconds, + interval_seconds=interval_seconds, + predicate=_job_completed, + ) + + +def job_pod_names(job_name: str, namespace: str) -> list[str]: + payload = oc_get_json( + "pods", + namespace=namespace, + selector=f"job-name={job_name}", + ignore_not_found=True, + ) + if not payload: + return [] + return [item["metadata"]["name"] for item in payload.get("items", [])] + + +def resolve_default_serviceaccount_image_pull_secret(namespace: str) -> str | None: + payload = oc_get_json( + "serviceaccount", name="default", namespace=namespace, ignore_not_found=True + ) + if not payload: + return None + + for item in payload.get("imagePullSecrets", []): + name = item.get("name") + if name: + return name + return None + + def render_datasciencecluster(config: ResolvedConfig) -> dict[str, Any]: template_path = ( config.config_dir / config.platform["rhoai"]["datasciencecluster_template"] @@ -545,6 +738,230 @@ def render_gateway(config: ResolvedConfig) -> dict[str, Any]: return manifest +def render_model_cache_pvc(spec: ModelCacheSpec) -> dict[str, Any]: + manifest: dict[str, Any] = { + "apiVersion": "v1", + "kind": "PersistentVolumeClaim", + "metadata": { + "name": spec.pvc_name, + "namespace": spec.namespace, + "labels": { + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + "forge.openshift.io/model-cache": "true", + "forge.openshift.io/preserve": "true", + }, + "annotations": { + "forge.openshift.io/model-cache-key": spec.cache_key, + "forge.openshift.io/model-source-uri": spec.source_uri, + }, + }, + "spec": { + "accessModes": [spec.access_mode], + "resources": {"requests": {"storage": spec.pvc_size}}, + }, + } + if spec.storage_class_name: + manifest["spec"]["storageClassName"] = spec.storage_class_name + return manifest + + +def render_model_cache_job( + config: ResolvedConfig, spec: ModelCacheSpec +) -> dict[str, Any]: + common_env = [ + {"name": "MODEL_SOURCE", "value": spec.source_uri}, + {"name": "MODEL_TARGET_DIR", "value": f"/cache/{spec.model_path}"}, + {"name": "MARKER_FILE", "value": spec.marker_path}, + {"name": "CACHE_KEY", "value": spec.cache_key}, + ] + 
volumes: list[dict[str, Any]] = [
+        {"name": "cache", "persistentVolumeClaim": {"claimName": spec.pvc_name}}
+    ]
+
+    container: dict[str, Any]
+    if spec.source_scheme == "hf":
+        command = """
+set -euo pipefail
+mkdir -p "${MODEL_TARGET_DIR}"
+rm -rf "${MODEL_TARGET_DIR}"/*
+python -m pip install --quiet --no-cache-dir 'huggingface_hub[hf_xet]'
+python - <<'PY'
+import os
+from huggingface_hub import snapshot_download
+
+token = None
+token_file = os.environ.get("HF_TOKEN_FILE")
+if token_file and os.path.exists(token_file):
+    with open(token_file, encoding="utf-8") as handle:
+        token = handle.read().strip() or None
+
+snapshot_download(
+    repo_id=os.environ["MODEL_SOURCE"][5:],
+    local_dir=os.environ["MODEL_TARGET_DIR"],
+    local_dir_use_symlinks=False,
+    token=token,
+)
+PY
+cat > "${MARKER_FILE}" < "${MARKER_FILE}" <
+
+
+def model_cache_pvc_ready(spec: ModelCacheSpec) -> bool:
+    payload = oc_get_json(
+        "persistentvolumeclaim",
+        name=spec.pvc_name,
+        namespace=spec.namespace,
+        ignore_not_found=True,
+    )
+    if not payload:
+        return False
+
+    annotations = payload.get("metadata", {}).get("annotations", {})
+    return (
+        annotations.get("forge.openshift.io/model-cache-ready") == "true"
+        and annotations.get("forge.openshift.io/model-cache-key") == spec.cache_key
+        and annotations.get("forge.openshift.io/model-source-uri") == spec.source_uri
+    )
+
+
+def annotate_model_cache_pvc(spec: ModelCacheSpec) -> None:
+    oc(
+        "annotate",
+        "persistentvolumeclaim",
+        spec.pvc_name,
+        "-n",
+        spec.namespace,
+        "--overwrite",
+        "forge.openshift.io/model-cache-ready=true",
+        f"forge.openshift.io/model-cache-key={spec.cache_key}",
+        f"forge.openshift.io/model-source-uri={spec.source_uri}",
+        f"forge.openshift.io/model-uri={spec.model_uri}",
+    )
+
+
 def render_inference_service(config: ResolvedConfig) -> dict[str, Any]:
     template_path = config.config_dir / config.platform["inference_service"]["template"]
     manifest = load_yaml(template_path)
@@ -560,7 +977,10 @@ def render_inference_service(config: ResolvedConfig) -> dict[str, Any]:
         }
     )
 
-    manifest["spec"]["model"]["uri"] = config.model["uri"]
+    cache_spec = resolve_model_cache(config)
+    manifest["spec"]["model"]["uri"] = (
+        cache_spec.model_uri if cache_spec else config.model["uri"]
+    )
     manifest["spec"]["model"]["name"] = config.model["served_model_name"]
     manifest["spec"]["template"]["containers"][0]["resources"] = copy.deepcopy(
         config.model["resources"]
@@ -590,6 +1010,10 @@ def render_guidellm_pvc(config: ResolvedConfig) -> dict[str, Any]:
         "metadata": {
             "name": config.benchmark["job_name"],
             "namespace": config.namespace,
+            "labels": {
+                "app.kubernetes.io/managed-by": "forge",
+                "forge.openshift.io/project": "llm_d",
+            },
         },
         "spec": {
             "accessModes": ["ReadWriteOnce"],
@@ -620,10 +1044,20 @@ def render_guidellm_job(config: ResolvedConfig, endpoint_url: str) -> dict[str,
         "metadata": {
             "name": config.benchmark["job_name"],
             "namespace": config.namespace,
+            "labels": {
+                "app.kubernetes.io/managed-by": "forge",
+                "forge.openshift.io/project": "llm_d",
+            },
         },
         "spec": {
             "backoffLimit": 0,
             "template": {
+                "metadata": {
+                    "labels": {
+                        "app.kubernetes.io/managed-by": "forge",
+                        "forge.openshift.io/project": "llm_d",
+                    }
+                },
                 "spec": {
                     "serviceAccountName": "default",
                     "restartPolicy": "Never",
@@ -649,7 +1083,7 @@ def render_guidellm_job(config: ResolvedConfig, endpoint_url: str) -> dict[str,
                             },
                         },
                     ],
-                }
+                },
             },
         },
     }
@@ -667,6 +1101,10 @@ def render_guidellm_copy_pod(
         "metadata": {
             "name": f"{config.benchmark['job_name']}-copy",
             "namespace": config.namespace,
+            "labels": {
+                "app.kubernetes.io/managed-by": "forge",
+ "forge.openshift.io/project": "llm_d", + }, }, "spec": { "restartPolicy": "Never", diff --git a/projects/llm_d/toolbox/cleanup/main.py b/projects/llm_d/toolbox/cleanup/main.py index 46d0aedf..d80726ef 100644 --- a/projects/llm_d/toolbox/cleanup/main.py +++ b/projects/llm_d/toolbox/cleanup/main.py @@ -13,36 +13,38 @@ def run() -> int: def run_cleanup(config: llmd_runtime.ResolvedConfig) -> int: + delete_run_leftovers(config) + return 0 + + +def delete_run_leftovers(config: llmd_runtime.ResolvedConfig) -> None: + if not llmd_runtime.resource_exists("namespace", config.namespace): + return + inference_service_name = config.platform["inference_service"]["name"] - benchmark_name = ( - config.benchmark["job_name"] if config.benchmark else "guidellm-benchmark" + namespace = config.namespace + cleanup_timeout_seconds = config.platform["cluster"]["cleanup_timeout_seconds"] + benchmark_names = {"guidellm-benchmark"} + if config.benchmark: + benchmark_names.add(config.benchmark["job_name"]) + + llmd_runtime.oc( + "delete", + "llminferenceservice", + inference_service_name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, ) - if config.namespace_is_managed: - if llmd_runtime.resource_exists("namespace", config.namespace): - llmd_runtime.oc( - "delete", "namespace", config.namespace, "--ignore-not-found=true" - ) - llmd_runtime.wait_for_namespace_deleted( - config.namespace, - timeout_seconds=config.platform["cluster"]["cleanup_timeout_seconds"], - ) - else: - llmd_runtime.oc( - "delete", - "llminferenceservice", - inference_service_name, - "-n", - config.namespace, - "--ignore-not-found=true", - check=False, - ) + for benchmark_name in sorted(benchmark_names): llmd_runtime.oc( "delete", "job,pvc", benchmark_name, "-n", - config.namespace, + namespace, "--ignore-not-found=true", check=False, ) @@ -51,12 +53,67 @@ def run_cleanup(config: llmd_runtime.ResolvedConfig) -> int: "pod", f"{benchmark_name}-copy", "-n", - config.namespace, + namespace, "--ignore-not-found=true", check=False, ) - return 0 + llmd_runtime.oc( + "delete", + "job", + "-n", + namespace, + "-l", + "forge.openshift.io/project=llm_d", + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "pod", + "-n", + namespace, + "-l", + "forge.openshift.io/project=llm_d", + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "pvc", + "-n", + namespace, + "-l", + "forge.openshift.io/project=llm_d,forge.openshift.io/preserve!=true", + "--ignore-not-found=true", + check=False, + ) + + llmd_runtime.wait_until( + f"llminferenceservice/{inference_service_name} deletion in {namespace}", + timeout_seconds=cleanup_timeout_seconds, + interval_seconds=10, + predicate=lambda: not llmd_runtime.resource_exists( + "llminferenceservice", inference_service_name, namespace=namespace + ), + ) + + llmd_runtime.wait_until( + f"llm-d workload pods deletion in {namespace}", + timeout_seconds=cleanup_timeout_seconds, + interval_seconds=10, + predicate=lambda: _llm_d_pods_gone(namespace, inference_service_name), + ) + + +def _llm_d_pods_gone(namespace: str, inference_service_name: str) -> bool: + payload = llmd_runtime.oc_get_json( + "pods", + namespace=namespace, + selector=f"app.kubernetes.io/name={inference_service_name}", + ignore_not_found=True, + ) + return not payload or not payload.get("items") main = toolbox.create_toolbox_main(run) diff --git a/projects/llm_d/toolbox/prepare/main.py b/projects/llm_d/toolbox/prepare/main.py index 7edad1f8..3ebbaf67 100644 --- 
a/projects/llm_d/toolbox/prepare/main.py +++ b/projects/llm_d/toolbox/prepare/main.py @@ -8,6 +8,8 @@ from projects.core.dsl import toolbox from projects.llm_d.orchestration import llmd_runtime +from projects.llm_d.toolbox.cleanup import main as cleanup_toolbox +from projects.llm_d.toolbox.prepare_model_cache import main as prepare_model_cache LOGGER = logging.getLogger(__name__) @@ -35,6 +37,8 @@ def run_prepare(config: llmd_runtime.ResolvedConfig) -> int: ensure_required_crds(config.platform["rhoai"]["required_crds_after_dsc"], config) ensure_gateway(config) ensure_test_namespace(config) + cleanup_toolbox.delete_run_leftovers(config) + prepare_model_cache.run_prepare_model_cache(config) verify_gpu_nodes(config) capture_prepare_state(config) diff --git a/projects/llm_d/toolbox/prepare_model_cache/main.py b/projects/llm_d/toolbox/prepare_model_cache/main.py new file mode 100644 index 00000000..143ae77a --- /dev/null +++ b/projects/llm_d/toolbox/prepare_model_cache/main.py @@ -0,0 +1,207 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import logging + +from projects.core.dsl import toolbox +from projects.llm_d.orchestration import llmd_runtime + +LOGGER = logging.getLogger(__name__) + + +def run() -> int: + llmd_runtime.init() + config = llmd_runtime.load_run_configuration() + return run_prepare_model_cache(config) + + +def run_prepare_model_cache(config: llmd_runtime.ResolvedConfig) -> int: + cache_spec = llmd_runtime.resolve_model_cache(config) + if not cache_spec: + LOGGER.info("Model cache disabled for preset=%s", config.preset_name) + return 0 + + if config.namespace_is_managed: + LOGGER.warning( + "Model cache PVC %s lives in managed namespace %s. Namespace cleanup will remove it; cache reuse requires a stable namespace override.", + cache_spec.pvc_name, + cache_spec.namespace, + ) + + ensure_model_cache_pvc(config, cache_spec) + if llmd_runtime.model_cache_pvc_ready(cache_spec): + LOGGER.info( + "Model cache PVC %s already contains %s; skipping download", + cache_spec.pvc_name, + cache_spec.source_uri, + ) + capture_model_cache_state(config, cache_spec) + return 0 + + run_model_cache_download_job(config, cache_spec) + llmd_runtime.annotate_model_cache_pvc(cache_spec) + capture_model_cache_state(config, cache_spec) + return 0 + + +def ensure_model_cache_pvc( + config: llmd_runtime.ResolvedConfig, cache_spec: llmd_runtime.ModelCacheSpec +) -> None: + existing = llmd_runtime.oc_get_json( + "persistentvolumeclaim", + name=cache_spec.pvc_name, + namespace=cache_spec.namespace, + ignore_not_found=True, + ) + if existing: + actual_modes = existing.get("spec", {}).get("accessModes", []) + if not llmd_runtime.pvc_access_mode_matches( + actual_modes, cache_spec.access_mode + ): + raise RuntimeError( + f"PVC {cache_spec.pvc_name} exists with access modes {actual_modes}, expected {cache_spec.access_mode}" + ) + + actual_storage_class = existing.get("spec", {}).get("storageClassName") + if ( + cache_spec.storage_class_name + and actual_storage_class != cache_spec.storage_class_name + ): + raise RuntimeError( + f"PVC {cache_spec.pvc_name} exists with storageClassName={actual_storage_class}, expected {cache_spec.storage_class_name}" + ) + + llmd_runtime.wait_for_pvc_bound( + cache_spec.pvc_name, + cache_spec.namespace, + timeout_seconds=config.model_cache["download"]["wait_timeout_seconds"], + ) + return + + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "model-cache-pvc.yaml", + llmd_runtime.render_model_cache_pvc(cache_spec), + ) + 
llmd_runtime.wait_for_pvc_bound( + cache_spec.pvc_name, + cache_spec.namespace, + timeout_seconds=config.model_cache["download"]["wait_timeout_seconds"], + ) + + +def run_model_cache_download_job( + config: llmd_runtime.ResolvedConfig, cache_spec: llmd_runtime.ModelCacheSpec +) -> None: + llmd_runtime.oc( + "delete", + "job", + cache_spec.download_job_name, + "-n", + cache_spec.namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.wait_until( + f"job/{cache_spec.download_job_name} deletion in {cache_spec.namespace}", + timeout_seconds=120, + interval_seconds=5, + predicate=lambda: not llmd_runtime.resource_exists( + "job", cache_spec.download_job_name, namespace=cache_spec.namespace + ), + ) + + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "model-cache-job.yaml", + llmd_runtime.render_model_cache_job(config, cache_spec), + ) + + try: + llmd_runtime.wait_for_job_completion( + cache_spec.download_job_name, + cache_spec.namespace, + timeout_seconds=config.model_cache["download"]["wait_timeout_seconds"], + interval_seconds=config.model_cache["download"]["poll_interval_seconds"], + ) + finally: + capture_model_cache_state(config, cache_spec) + + +def capture_model_cache_state( + config: llmd_runtime.ResolvedConfig, cache_spec: llmd_runtime.ModelCacheSpec +) -> None: + artifact_dir = config.artifact_dir / "artifacts" / "model-cache" + llmd_runtime.write_json( + artifact_dir / "spec.json", + { + "pvc_name": cache_spec.pvc_name, + "model_uri": cache_spec.model_uri, + "source_uri": cache_spec.source_uri, + "source_scheme": cache_spec.source_scheme, + }, + ) + + capture_resource_yaml( + "persistentvolumeclaim", + cache_spec.pvc_name, + cache_spec.namespace, + artifact_dir / "pvc.yaml", + ) + capture_resource_yaml( + "job", + cache_spec.download_job_name, + cache_spec.namespace, + artifact_dir / "job.yaml", + check=False, + ) + + for pod_name in llmd_runtime.job_pod_names( + cache_spec.download_job_name, cache_spec.namespace + ): + capture_resource_yaml( + "pod", + pod_name, + cache_spec.namespace, + artifact_dir / f"{pod_name}.yaml", + check=False, + ) + log_result = llmd_runtime.oc( + "logs", + pod_name, + "-n", + cache_spec.namespace, + check=False, + capture_output=True, + ) + if log_result.returncode == 0 and log_result.stdout: + llmd_runtime.write_text(artifact_dir / f"{pod_name}.log", log_result.stdout) + + +def capture_resource_yaml( + kind: str, + name: str, + namespace: str, + destination, + *, + check: bool = True, +) -> None: + result = llmd_runtime.oc( + "get", + kind, + name, + "-n", + namespace, + "-o", + "yaml", + check=check, + capture_output=True, + ) + if result.returncode == 0 and result.stdout: + llmd_runtime.write_text(destination, result.stdout) + + +main = toolbox.create_toolbox_main(run) + + +if __name__ == "__main__": + main() diff --git a/tests/llm_d/test_runtime.py b/tests/llm_d/test_runtime.py index b2bcff1d..d130e781 100644 --- a/tests/llm_d/test_runtime.py +++ b/tests/llm_d/test_runtime.py @@ -5,7 +5,9 @@ import pytest from projects.llm_d.orchestration import llmd_runtime +from projects.llm_d.toolbox.cleanup import main as cleanup_toolbox from projects.llm_d.toolbox.prepare import main as prepare_toolbox +from projects.llm_d.toolbox.prepare_model_cache import main as prepare_model_cache_toolbox from projects.llm_d.toolbox.test import main as test_toolbox @@ -39,8 +41,8 @@ def test_load_run_configuration_resolves_alias( assert config.preset_name == "smoke" assert config.preset_alias == "cks" assert config.model["served_model_name"] == 
"Qwen/Qwen3-0.6B" - assert config.namespace == "llm-d-e2e" - assert config.namespace_is_managed is True + assert config.namespace == "forge-llm-d" + assert config.namespace_is_managed is False def test_load_run_configuration_consolidates_config_d( @@ -54,6 +56,7 @@ def test_load_run_configuration_consolidates_config_d( consolidated = llmd_runtime.load_yaml(artifact_dir / "config.yaml") assert "platform" in consolidated + assert "model_cache" in consolidated assert "models" in consolidated assert "runtime" in consolidated assert "workloads" in consolidated @@ -75,6 +78,25 @@ def test_namespace_override_is_not_managed( assert config.namespace_is_managed is False +def test_default_namespace_comes_from_project_config( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + (tmp_path / "fournos_config.yaml").write_text( + "job-name: llm-d-nightly\n", + encoding="utf-8", + ) + + config = llmd_runtime.load_run_configuration( + cwd=tmp_path, artifact_dir=artifact_dir + ) + + assert config.namespace == "forge-llm-d" + assert config.namespace_is_managed is False + + def test_render_inference_service_injects_model_and_epp( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: @@ -86,13 +108,13 @@ def test_render_inference_service_injects_model_and_epp( cwd=tmp_path, artifact_dir=artifact_dir ) manifest = llmd_runtime.render_inference_service(config) + cache_spec = llmd_runtime.resolve_model_cache(config) assert manifest["metadata"]["name"] == "llm-d" assert manifest["metadata"]["namespace"] == config.namespace assert manifest["spec"]["model"]["name"] == "Qwen/Qwen3-0.6B" - assert manifest["spec"]["model"]["uri"] == "hf://Qwen/Qwen3-0.6B" + assert manifest["spec"]["model"]["uri"] == cache_spec.model_uri assert manifest["spec"]["model"]["name"] == config.model["served_model_name"] - assert manifest["spec"]["model"]["uri"] == config.model["uri"] router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0][ "args" ] @@ -100,6 +122,84 @@ def test_render_inference_service_injects_model_and_epp( assert "EndpointPickerConfig" in router_args[-1] +def test_resolve_model_cache_for_hf_model( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + + config = llmd_runtime.load_run_configuration( + cwd=tmp_path, artifact_dir=artifact_dir + ) + cache_spec = llmd_runtime.resolve_model_cache(config) + + assert cache_spec is not None + assert cache_spec.source_scheme == "hf" + assert cache_spec.pvc_name.startswith("llm-d-model-qwen3-0-6b-") + assert cache_spec.model_uri == f"pvc://{cache_spec.pvc_name}/model" + assert cache_spec.pvc_size == "10Gi" + assert cache_spec.access_mode == "ReadWriteOnce" + + +def test_render_model_cache_job_for_hf_model( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + + config = llmd_runtime.load_run_configuration( + cwd=tmp_path, artifact_dir=artifact_dir + ) + cache_spec = llmd_runtime.resolve_model_cache(config) + manifest = llmd_runtime.render_model_cache_job(config, cache_spec) + + container = manifest["spec"]["template"]["spec"]["containers"][0] + assert container["name"] == "hf-model-downloader" + assert container["image"] == "registry.access.redhat.com/ubi9/python-311" + assert any( + 
env["name"] == "MODEL_SOURCE" and env["value"] == "hf://Qwen/Qwen3-0.6B" + for env in container["env"] + ) + assert "huggingface_hub" in container["command"][-1] + + +def test_render_model_cache_job_for_oci_model_uses_registry_auth_secret( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + (tmp_path / "fournos_config.yaml").write_text( + "preset: benchmark-short\njob-name: llm-d-benchmark\n", + encoding="utf-8", + ) + + config = llmd_runtime.load_run_configuration( + cwd=tmp_path, artifact_dir=artifact_dir + ) + monkeypatch.setattr( + llmd_runtime, + "resolve_default_serviceaccount_image_pull_secret", + lambda namespace: "pull-secret", + ) + cache_spec = llmd_runtime.resolve_model_cache(config) + manifest = llmd_runtime.render_model_cache_job(config, cache_spec) + + container = manifest["spec"]["template"]["spec"]["containers"][0] + volume_names = { + volume["name"] for volume in manifest["spec"]["template"]["spec"]["volumes"] + } + + assert cache_spec.source_scheme == "oci" + assert container["name"] == "oci-model-extractor" + assert container["image"] == "registry.redhat.io/openshift4/ose-cli:v4.19" + assert any(env["name"] == "OCI_IMAGE_PATH" and env["value"] == "/" for env in container["env"]) + assert "registry-auth" in volume_names + assert "oc image extract" in container["command"][-1] + + def test_render_guidellm_job_uses_target_and_rate( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: @@ -122,6 +222,83 @@ def test_render_guidellm_job_uses_target_and_rate( assert "--rate=1" in container["args"] +def test_prepare_model_cache_skips_ready_pvc( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + + config = llmd_runtime.load_run_configuration( + cwd=tmp_path, artifact_dir=artifact_dir + ) + cache_spec = llmd_runtime.resolve_model_cache(config) + calls: list[str] = [] + + monkeypatch.setattr( + prepare_model_cache_toolbox, + "ensure_model_cache_pvc", + lambda _config, _cache_spec: calls.append("ensure-pvc"), + ) + monkeypatch.setattr( + llmd_runtime, "model_cache_pvc_ready", lambda _cache_spec: True + ) + monkeypatch.setattr( + prepare_model_cache_toolbox, + "capture_model_cache_state", + lambda _config, _cache_spec: calls.append("capture"), + ) + monkeypatch.setattr( + prepare_model_cache_toolbox, + "run_model_cache_download_job", + lambda _config, _cache_spec: calls.append("download"), + ) + + prepare_model_cache_toolbox.run_prepare_model_cache(config) + + assert calls == ["ensure-pvc", "capture"] + + +def test_cleanup_deletes_leftovers_but_not_namespace_or_preserved_pvcs( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + + config = llmd_runtime.load_run_configuration( + cwd=tmp_path, artifact_dir=artifact_dir + ) + oc_calls: list[tuple[str, ...]] = [] + + def fake_resource_exists(kind: str, name: str, namespace: str | None = None) -> bool: + if kind == "namespace": + return True + return False + + monkeypatch.setattr(llmd_runtime, "resource_exists", fake_resource_exists) + monkeypatch.setattr( + llmd_runtime, + "oc", + lambda *args, **kwargs: oc_calls.append(tuple(args)), + ) + monkeypatch.setattr(llmd_runtime, "wait_until", lambda *args, **kwargs: True) + monkeypatch.setattr(cleanup_toolbox, 
"_llm_d_pods_gone", lambda *_args: True) + + cleanup_toolbox.delete_run_leftovers(config) + + assert ("delete", "namespace", config.namespace, "--ignore-not-found=true") not in oc_calls + assert ( + "delete", + "pvc", + "-n", + config.namespace, + "-l", + "forge.openshift.io/project=llm_d,forge.openshift.io/preserve!=true", + "--ignore-not-found=true", + ) in oc_calls + + def test_prepare_gpu_operator_skips_existing_clusterpolicy( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: From be5a512c3a0933f5c4dfeba7e6f2c27291d28d4a Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Thu, 16 Apr 2026 13:35:58 +0100 Subject: [PATCH 04/21] fix: Lint after rebase --- projects/core/library/config.py | 4 +- projects/llm_d/orchestration/cli.py | 1 - projects/llm_d/orchestration/llmd_runtime.py | 89 +++++-------------- projects/llm_d/orchestration/prepare_llmd.py | 3 - projects/llm_d/orchestration/test_llmd.py | 2 - projects/llm_d/toolbox/prepare/main.py | 72 ++++----------- .../llm_d/toolbox/prepare_model_cache/main.py | 13 +-- projects/llm_d/toolbox/test/main.py | 57 ++++-------- tests/llm_d/test_runtime.py | 77 ++++------------ 9 files changed, 81 insertions(+), 237 deletions(-) diff --git a/projects/core/library/config.py b/projects/core/library/config.py index b17b3e3a..740e921c 100644 --- a/projects/core/library/config.py +++ b/projects/core/library/config.py @@ -307,9 +307,7 @@ def multi_dereference(): # --- # - new_value = ( - simple_dereference() if value.startswith("@") else multi_dereference() - ) + new_value = simple_dereference() if value.startswith("@") else multi_dereference() if not handled_secretly: logger.info(f"resolve_reference: {value} ==> '{new_value}'") diff --git a/projects/llm_d/orchestration/cli.py b/projects/llm_d/orchestration/cli.py index 06ae9ef6..ca87c653 100644 --- a/projects/llm_d/orchestration/cli.py +++ b/projects/llm_d/orchestration/cli.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 import logging -import sys import types import click diff --git a/projects/llm_d/orchestration/llmd_runtime.py b/projects/llm_d/orchestration/llmd_runtime.py index 2c961e7c..48c503f3 100644 --- a/projects/llm_d/orchestration/llmd_runtime.py +++ b/projects/llm_d/orchestration/llmd_runtime.py @@ -9,9 +9,10 @@ import shlex import subprocess import time +from collections.abc import Iterable from dataclasses import dataclass from pathlib import Path -from typing import Any, Iterable +from typing import Any import yaml @@ -201,9 +202,7 @@ def load_fournos_config(cwd: Path) -> dict[str, Any]: if data is None: return {} if not isinstance(data, dict): - raise ValueError( - f"Unexpected FOURNOS config type in {config_path}: {type(data)}" - ) + raise ValueError(f"Unexpected FOURNOS config type in {config_path}: {type(data)}") return data @@ -253,9 +252,7 @@ def derive_namespace(job_name: str, prefix: str, max_length: int) -> str: namespace = namespace[:max_length].rstrip("-") if not namespace: - raise ValueError( - f"Could not derive a valid namespace from job name: {job_name}" - ) + raise ValueError(f"Could not derive a valid namespace from job name: {job_name}") return namespace @@ -282,9 +279,7 @@ def resolve_model_cache(config: ResolvedConfig) -> ModelCacheSpec | None: elif source_uri.startswith("oci://"): source_scheme = "oci" else: - raise ValueError( - f"Unsupported model cache source URI for {config.model_key}: {source_uri}" - ) + raise ValueError(f"Unsupported model cache source URI for {config.model_key}: {source_uri}") model_cache_overrides = config.model.get("cache", {}) pvc_defaults 
= config.model_cache["pvc"] @@ -302,9 +297,7 @@ def resolve_model_cache(config: ResolvedConfig) -> ModelCacheSpec | None: namespace=config.namespace, pvc_name=pvc_name, pvc_size=model_cache_overrides.get("pvc_size", pvc_defaults["size"]), - access_mode=model_cache_overrides.get( - "access_mode", pvc_defaults["access_mode"] - ), + access_mode=model_cache_overrides.get("access_mode", pvc_defaults["access_mode"]), storage_class_name=model_cache_overrides.get( "storage_class_name", pvc_defaults.get("storage_class_name") ), @@ -323,9 +316,7 @@ def resolve_model_cache(config: ResolvedConfig) -> ModelCacheSpec | None: "oci_registry_auth_secret_name", config.model_cache["oci"].get("registry_auth_secret_name"), ), - oci_registry_auth_secret_key=config.model_cache["oci"].get( - "registry_auth_secret_key" - ), + oci_registry_auth_secret_key=config.model_cache["oci"].get("registry_auth_secret_key"), ) @@ -460,9 +451,7 @@ def wait_until( time.sleep(interval_seconds) if last_error: - raise RuntimeError( - f"Timed out waiting for {description}: {last_error}" - ) from last_error + raise RuntimeError(f"Timed out waiting for {description}: {last_error}") from last_error raise RuntimeError(f"Timed out waiting for {description}") @@ -487,15 +476,11 @@ def wait_for_crd(crd_name: str, timeout_seconds: int) -> None: ) -def wait_for_operator_csv( - package: str, namespace: str, timeout_seconds: int -) -> dict[str, Any]: +def wait_for_operator_csv(package: str, namespace: str, timeout_seconds: int) -> dict[str, Any]: selector = f"operators.coreos.com/{package}.{namespace}" def _csv_ready() -> dict[str, Any] | None: - data = oc_get_json( - "csv", namespace=namespace, selector=selector, ignore_not_found=True - ) + data = oc_get_json("csv", namespace=namespace, selector=selector, ignore_not_found=True) if not data: return None items = data.get("items", []) @@ -557,9 +542,7 @@ def ensure_subscription(operator_spec: dict[str, Any]) -> None: namespace=namespace, ignore_not_found=True, ) - if current and not subscription_spec_matches( - current.get("spec", {}), subscription["spec"] - ): + if current and not subscription_spec_matches(current.get("spec", {}), subscription["spec"]): LOGGER.info("Reconciling subscription drift for %s in %s", package, namespace) oc("apply", "-f", "-", input_text=yaml.safe_dump(subscription, sort_keys=False)) @@ -611,9 +594,7 @@ def operator_spec_by_package(platform: dict[str, Any], package: str) -> dict[str raise KeyError(f"Unknown operator package in llm_d platform config: {package}") -def load_manifest_template( - config: ResolvedConfig, relative_path: str -) -> dict[str, Any]: +def load_manifest_template(config: ResolvedConfig, relative_path: str) -> dict[str, Any]: return load_yaml(config.config_dir / relative_path) @@ -634,9 +615,7 @@ def pvc_access_mode_matches(actual_modes: list[str], expected_mode: str) -> bool return expected_mode in actual_modes -def wait_for_pvc_bound( - pvc_name: str, namespace: str, *, timeout_seconds: int -) -> dict[str, Any]: +def wait_for_pvc_bound(pvc_name: str, namespace: str, *, timeout_seconds: int) -> dict[str, Any]: def _pvc_bound() -> dict[str, Any] | None: payload = oc_get_json( "persistentvolumeclaim", @@ -718,9 +697,7 @@ def resolve_default_serviceaccount_image_pull_secret(namespace: str) -> str | No def render_datasciencecluster(config: ResolvedConfig) -> dict[str, Any]: - template_path = ( - config.config_dir / config.platform["rhoai"]["datasciencecluster_template"] - ) + template_path = config.config_dir / 
config.platform["rhoai"]["datasciencecluster_template"] manifest = load_yaml(template_path) manifest["metadata"]["name"] = config.platform["rhoai"]["datasciencecluster_name"] manifest["metadata"]["namespace"] = config.platform["rhoai"]["namespace"] @@ -732,9 +709,7 @@ def render_gateway(config: ResolvedConfig) -> dict[str, Any]: manifest = load_yaml(template_path) manifest["metadata"]["name"] = config.platform["gateway"]["name"] manifest["metadata"]["namespace"] = config.platform["gateway"]["namespace"] - manifest["spec"]["gatewayClassName"] = config.platform["gateway"][ - "gateway_class_name" - ] + manifest["spec"]["gatewayClassName"] = config.platform["gateway"]["gateway_class_name"] return manifest @@ -766,9 +741,7 @@ def render_model_cache_pvc(spec: ModelCacheSpec) -> dict[str, Any]: return manifest -def render_model_cache_job( - config: ResolvedConfig, spec: ModelCacheSpec -) -> dict[str, Any]: +def render_model_cache_job(config: ResolvedConfig, spec: ModelCacheSpec) -> dict[str, Any]: common_env = [ {"name": "MODEL_SOURCE", "value": spec.source_uri}, {"name": "MODEL_TARGET_DIR", "value": f"/cache/{spec.model_path}"}, @@ -859,9 +832,7 @@ def render_model_cache_job( EOF """ volume_mounts = [{"name": "cache", "mountPath": "/cache"}] - common_env.append( - {"name": "OCI_IMAGE_PATH", "value": spec.oci_image_path or "/"} - ) + common_env.append({"name": "OCI_IMAGE_PATH", "value": spec.oci_image_path or "/"}) if registry_auth_secret_name: volumes.append( { @@ -907,9 +878,7 @@ def render_model_cache_job( }, "spec": { "backoffLimit": 0, - "activeDeadlineSeconds": config.model_cache["download"][ - "wait_timeout_seconds" - ], + "activeDeadlineSeconds": config.model_cache["download"]["wait_timeout_seconds"], "template": { "metadata": { "labels": { @@ -955,7 +924,7 @@ def annotate_model_cache_pvc(spec: ModelCacheSpec) -> None: "-n", spec.namespace, "--overwrite", - f"forge.openshift.io/model-cache-ready=true", + "forge.openshift.io/model-cache-ready=true", f"forge.openshift.io/model-cache-key={spec.cache_key}", f"forge.openshift.io/model-source-uri={spec.source_uri}", f"forge.openshift.io/model-uri={spec.model_uri}", @@ -978,21 +947,15 @@ def render_inference_service(config: ResolvedConfig) -> dict[str, Any]: ) cache_spec = resolve_model_cache(config) - manifest["spec"]["model"]["uri"] = ( - cache_spec.model_uri if cache_spec else config.model["uri"] - ) + manifest["spec"]["model"]["uri"] = cache_spec.model_uri if cache_spec else config.model["uri"] manifest["spec"]["model"]["name"] = config.model["served_model_name"] manifest["spec"]["template"]["containers"][0]["resources"] = copy.deepcopy( config.model["resources"] ) - epp_path = ( - config.config_dir / config.platform["inference_service"]["epp_config_template"] - ) + epp_path = config.config_dir / config.platform["inference_service"]["epp_config_template"] epp_config = epp_path.read_text(encoding="utf-8") - router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0][ - "args" - ] + router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0]["args"] if not router_args or router_args[-1] != "--config-text": raise ValueError("Expected llm-d router args to end with --config-text") router_args.append(epp_config) @@ -1078,9 +1041,7 @@ def render_guidellm_job(config: ResolvedConfig, endpoint_url: str) -> dict[str, {"name": "home", "emptyDir": {}}, { "name": "results", - "persistentVolumeClaim": { - "claimName": config.benchmark["job_name"] - }, + "persistentVolumeClaim": {"claimName": 
config.benchmark["job_name"]}, }, ], }, @@ -1140,9 +1101,7 @@ def render_guidellm_copy_pod( "volumes": [ { "name": "results", - "persistentVolumeClaim": { - "claimName": config.benchmark["job_name"] - }, + "persistentVolumeClaim": {"claimName": config.benchmark["job_name"]}, } ], }, diff --git a/projects/llm_d/orchestration/prepare_llmd.py b/projects/llm_d/orchestration/prepare_llmd.py index d52f921a..ba64a9dc 100644 --- a/projects/llm_d/orchestration/prepare_llmd.py +++ b/projects/llm_d/orchestration/prepare_llmd.py @@ -1,10 +1,7 @@ from __future__ import annotations from projects.llm_d.toolbox.cleanup.main import run as cleanup_toolbox_run -from projects.llm_d.toolbox.cleanup.main import run_cleanup -from projects.llm_d.toolbox.prepare.main import prepare_gpu_operator from projects.llm_d.toolbox.prepare.main import run as prepare_toolbox_run -from projects.llm_d.toolbox.prepare.main import run_prepare def prepare() -> int: diff --git a/projects/llm_d/orchestration/test_llmd.py b/projects/llm_d/orchestration/test_llmd.py index 8fc2bc40..5254cafb 100644 --- a/projects/llm_d/orchestration/test_llmd.py +++ b/projects/llm_d/orchestration/test_llmd.py @@ -1,9 +1,7 @@ from __future__ import annotations from projects.llm_d.orchestration import llmd_runtime -from projects.llm_d.toolbox.test.main import resolve_endpoint_url from projects.llm_d.toolbox.test.main import run as test_toolbox_run -from projects.llm_d.toolbox.test.main import run_test def init() -> None: diff --git a/projects/llm_d/toolbox/prepare/main.py b/projects/llm_d/toolbox/prepare/main.py index 3ebbaf67..621c34d4 100644 --- a/projects/llm_d/toolbox/prepare/main.py +++ b/projects/llm_d/toolbox/prepare/main.py @@ -21,9 +21,7 @@ def run() -> int: def run_prepare(config: llmd_runtime.ResolvedConfig) -> int: - LOGGER.info( - "Preparing llm_d preset=%s namespace=%s", config.preset_name, config.namespace - ) + LOGGER.info("Preparing llm_d preset=%s namespace=%s", config.preset_name, config.namespace) verify_oc_access() verify_cluster_version(config) @@ -59,14 +57,10 @@ def verify_cluster_version(config: llmd_runtime.ResolvedConfig) -> None: or payload.get("serverVersion", {}).get("platform") ) if not openshift_version: - raise RuntimeError( - "Could not determine OpenShift version from `oc version -o json`" - ) + raise RuntimeError("Could not determine OpenShift version from `oc version -o json`") minimum = config.platform["cluster"]["minimum_openshift_version"] - if llmd_runtime.version_tuple(openshift_version) < llmd_runtime.version_tuple( - minimum - ): + if llmd_runtime.version_tuple(openshift_version) < llmd_runtime.version_tuple(minimum): raise RuntimeError( f"Cluster version {openshift_version} is older than the llm_d minimum {minimum}" ) @@ -89,9 +83,7 @@ def prepare_cert_manager(config: llmd_runtime.ResolvedConfig) -> None: def prepare_leader_worker_set(config: llmd_runtime.ResolvedConfig) -> None: - operator_spec = llmd_runtime.operator_spec_by_package( - config.platform, "leader-worker-set" - ) + operator_spec = llmd_runtime.operator_spec_by_package(config.platform, "leader-worker-set") ensure_operator_subscription(operator_spec) @@ -103,9 +95,7 @@ def prepare_nfd(config: llmd_runtime.ResolvedConfig) -> None: timeout_seconds=operator_spec["wait_timeout_seconds"], ) - manifest = llmd_runtime.load_manifest_template( - config, operator_spec["bootstrap_manifest"] - ) + manifest = llmd_runtime.load_manifest_template(config, operator_spec["bootstrap_manifest"]) llmd_runtime.apply_manifest( config.artifact_dir / "src" / 
"nfd-nodefeaturediscovery.yaml", manifest, @@ -122,24 +112,18 @@ def prepare_nfd(config: llmd_runtime.ResolvedConfig) -> None: ), ) - wait_for_nfd_gpu_labels( - config, timeout_seconds=operator_spec["wait_timeout_seconds"] - ) + wait_for_nfd_gpu_labels(config, timeout_seconds=operator_spec["wait_timeout_seconds"]) def prepare_gpu_operator(config: llmd_runtime.ResolvedConfig) -> None: - operator_spec = llmd_runtime.operator_spec_by_package( - config.platform, "gpu-operator-certified" - ) + operator_spec = llmd_runtime.operator_spec_by_package(config.platform, "gpu-operator-certified") ensure_operator_subscription(operator_spec) llmd_runtime.wait_for_crd( operator_spec["bootstrap_crd"], timeout_seconds=operator_spec["wait_timeout_seconds"], ) - manifest = llmd_runtime.load_manifest_template( - config, operator_spec["bootstrap_manifest"] - ) + manifest = llmd_runtime.load_manifest_template(config, operator_spec["bootstrap_manifest"]) clusterpolicy_name = manifest["metadata"]["name"] if llmd_runtime.resource_exists("clusterpolicy", clusterpolicy_name): LOGGER.info( @@ -163,9 +147,7 @@ def prepare_gpu_operator(config: llmd_runtime.ResolvedConfig) -> None: ) -def wait_for_gpu_clusterpolicy_ready( - clusterpolicy_name: str, *, timeout_seconds: int -) -> None: +def wait_for_gpu_clusterpolicy_ready(clusterpolicy_name: str, *, timeout_seconds: int) -> None: def _clusterpolicy_ready() -> bool: payload = llmd_runtime.oc_get_json( "clusterpolicy", @@ -183,16 +165,12 @@ def _clusterpolicy_ready() -> bool: def prepare_rhoai_operator(config: llmd_runtime.ResolvedConfig) -> None: - operator_spec = llmd_runtime.operator_spec_by_package( - config.platform, "rhods-operator" - ) + operator_spec = llmd_runtime.operator_spec_by_package(config.platform, "rhods-operator") ensure_operator_subscription(operator_spec) ensure_required_crds(config.platform["rhoai"]["required_crds_before_dsc"], config) -def ensure_required_crds( - crd_names: list[str], config: llmd_runtime.ResolvedConfig -) -> None: +def ensure_required_crds(crd_names: list[str], config: llmd_runtime.ResolvedConfig) -> None: for crd_name in crd_names: llmd_runtime.wait_for_crd( crd_name, @@ -202,9 +180,7 @@ def ensure_required_crds( def apply_datasciencecluster(config: llmd_runtime.ResolvedConfig) -> None: manifest = llmd_runtime.render_datasciencecluster(config) - llmd_runtime.apply_manifest( - config.artifact_dir / "src" / "datasciencecluster.yaml", manifest - ) + llmd_runtime.apply_manifest(config.artifact_dir / "src" / "datasciencecluster.yaml", manifest) llmd_runtime.oc( "get", "datasciencecluster", @@ -243,17 +219,13 @@ def _dsc_ready() -> bool: def ensure_gateway(config: llmd_runtime.ResolvedConfig) -> None: gateway = config.platform["gateway"] - if not llmd_runtime.resource_exists( - "gateway", gateway["name"], namespace=gateway["namespace"] - ): + if not llmd_runtime.resource_exists("gateway", gateway["name"], namespace=gateway["namespace"]): if not gateway["create_if_missing"]: raise RuntimeError( f"Required gateway {gateway['name']} does not exist in {gateway['namespace']}" ) manifest = llmd_runtime.render_gateway(config) - llmd_runtime.apply_manifest( - config.artifact_dir / "src" / "gateway.yaml", manifest - ) + llmd_runtime.apply_manifest(config.artifact_dir / "src" / "gateway.yaml", manifest) def _gateway_programmed() -> bool: resource = llmd_runtime.oc_get_json( @@ -291,16 +263,12 @@ def verify_gpu_nodes(config: llmd_runtime.ResolvedConfig) -> None: ) -def wait_for_nfd_gpu_labels( - config: llmd_runtime.ResolvedConfig, *, 
timeout_seconds: int -) -> None: +def wait_for_nfd_gpu_labels(config: llmd_runtime.ResolvedConfig, *, timeout_seconds: int) -> None: selectors = config.platform["cluster"]["nfd_gpu_detection_labels"] def _labels_present() -> bool: for selector in selectors: - data = llmd_runtime.oc_get_json( - "nodes", selector=selector, ignore_not_found=True - ) + data = llmd_runtime.oc_get_json("nodes", selector=selector, ignore_not_found=True) if data and data.get("items"): return True return False @@ -342,13 +310,9 @@ def capture_prepare_state(config: llmd_runtime.ResolvedConfig) -> None: capture_output=True, ) if gateway_service.returncode == 0 and gateway_service.stdout: - llmd_runtime.write_text( - artifacts_dir / "gateway.service.yaml", gateway_service.stdout - ) + llmd_runtime.write_text(artifacts_dir / "gateway.service.yaml", gateway_service.stdout) if config.platform["artifacts"]["capture_namespace_events"]: - capture_namespace_events( - config.namespace, artifacts_dir / "namespace.events.txt" - ) + capture_namespace_events(config.namespace, artifacts_dir / "namespace.events.txt") def capture_resource_yaml( diff --git a/projects/llm_d/toolbox/prepare_model_cache/main.py b/projects/llm_d/toolbox/prepare_model_cache/main.py index 143ae77a..1dc50758 100644 --- a/projects/llm_d/toolbox/prepare_model_cache/main.py +++ b/projects/llm_d/toolbox/prepare_model_cache/main.py @@ -56,18 +56,13 @@ def ensure_model_cache_pvc( ) if existing: actual_modes = existing.get("spec", {}).get("accessModes", []) - if not llmd_runtime.pvc_access_mode_matches( - actual_modes, cache_spec.access_mode - ): + if not llmd_runtime.pvc_access_mode_matches(actual_modes, cache_spec.access_mode): raise RuntimeError( f"PVC {cache_spec.pvc_name} exists with access modes {actual_modes}, expected {cache_spec.access_mode}" ) actual_storage_class = existing.get("spec", {}).get("storageClassName") - if ( - cache_spec.storage_class_name - and actual_storage_class != cache_spec.storage_class_name - ): + if cache_spec.storage_class_name and actual_storage_class != cache_spec.storage_class_name: raise RuntimeError( f"PVC {cache_spec.pvc_name} exists with storageClassName={actual_storage_class}, expected {cache_spec.storage_class_name}" ) @@ -155,9 +150,7 @@ def capture_model_cache_state( check=False, ) - for pod_name in llmd_runtime.job_pod_names( - cache_spec.download_job_name, cache_spec.namespace - ): + for pod_name in llmd_runtime.job_pod_names(cache_spec.download_job_name, cache_spec.namespace): capture_resource_yaml( "pod", pod_name, diff --git a/projects/llm_d/toolbox/test/main.py b/projects/llm_d/toolbox/test/main.py index d779c18c..0ea05751 100644 --- a/projects/llm_d/toolbox/test/main.py +++ b/projects/llm_d/toolbox/test/main.py @@ -20,7 +20,6 @@ def run() -> int: def run_test(config: llmd_runtime.ResolvedConfig) -> int: - name = config.platform["inference_service"]["name"] namespace = config.namespace artifacts_dir = config.artifact_dir / "artifacts" @@ -40,9 +39,7 @@ def run_test(config: llmd_runtime.ResolvedConfig) -> int: capture_inference_service_state(config) if endpoint_url: llmd_runtime.write_text(artifacts_dir / "endpoint.url", f"{endpoint_url}\n") - benchmark_name = ( - config.benchmark["job_name"] if config.benchmark else "guidellm-benchmark" - ) + benchmark_name = config.benchmark["job_name"] if config.benchmark else "guidellm-benchmark" llmd_runtime.oc( "delete", "job,pvc", @@ -71,9 +68,7 @@ def run_test(config: llmd_runtime.ResolvedConfig) -> int: capture_output=True, ) if events.returncode == 0 and events.stdout: - 
llmd_runtime.write_text( - artifacts_dir / "namespace.events.txt", events.stdout - ) + llmd_runtime.write_text(artifacts_dir / "namespace.events.txt", events.stdout) def deploy_inference_service(config: llmd_runtime.ResolvedConfig) -> str: @@ -105,9 +100,7 @@ def _old_pods_gone() -> bool: ) manifest = llmd_runtime.render_inference_service(config) - llmd_runtime.apply_manifest( - config.artifact_dir / "src" / "llminferenceservice.yaml", manifest - ) + llmd_runtime.apply_manifest(config.artifact_dir / "src" / "llminferenceservice.yaml", manifest) def _pods_present() -> bool: pods = llmd_runtime.oc_get_json( @@ -117,17 +110,13 @@ def _pods_present() -> bool: llmd_runtime.wait_until( f"llm-d pods to appear in {namespace}", - timeout_seconds=config.platform["inference_service"][ - "pod_appearance_timeout_seconds" - ], + timeout_seconds=config.platform["inference_service"]["pod_appearance_timeout_seconds"], interval_seconds=5, predicate=_pods_present, ) def _service_ready() -> bool: - payload = llmd_runtime.oc_get_json( - "llminferenceservice", name=name, namespace=namespace - ) + payload = llmd_runtime.oc_get_json("llminferenceservice", name=name, namespace=namespace) return llmd_runtime.condition_status(payload, "Ready") == "True" llmd_runtime.wait_until( @@ -161,9 +150,7 @@ def try_resolve_endpoint_url(config: llmd_runtime.ResolvedConfig) -> str | None: name = config.platform["inference_service"]["name"] namespace = config.namespace gateway_name = config.platform["gateway"]["status_address_name"] - payload = llmd_runtime.oc_get_json( - "llminferenceservice", name=name, namespace=namespace - ) + payload = llmd_runtime.oc_get_json("llminferenceservice", name=name, namespace=namespace) for address in payload.get("status", {}).get("addresses", []): if address.get("name") == gateway_name and address.get("url"): @@ -171,12 +158,12 @@ def try_resolve_endpoint_url(config: llmd_runtime.ResolvedConfig) -> str | None: return None -def run_smoke_request( - config: llmd_runtime.ResolvedConfig, endpoint_url: str -) -> dict[str, object]: +def run_smoke_request(config: llmd_runtime.ResolvedConfig, endpoint_url: str) -> dict[str, object]: namespace = config.namespace name = config.platform["inference_service"]["name"] - deployment_name = f"{name}{config.platform['inference_service']['workload_deployment_name_suffix']}" + deployment_name = ( + f"{name}{config.platform['inference_service']['workload_deployment_name_suffix']}" + ) payload = { "model": config.model["served_model_name"], @@ -184,9 +171,7 @@ def run_smoke_request( "max_tokens": config.smoke_request["max_tokens"], "temperature": config.smoke_request["temperature"], } - llmd_runtime.write_json( - config.artifact_dir / "artifacts" / "smoke.request.json", payload - ) + llmd_runtime.write_json(config.artifact_dir / "artifacts" / "smoke.request.json", payload) retries = config.platform["smoke"]["request_retries"] delay = config.platform["smoke"]["request_retry_delay_seconds"] @@ -224,9 +209,7 @@ def run_smoke_request( return response -def run_guidellm_benchmark( - config: llmd_runtime.ResolvedConfig, endpoint_url: str -) -> None: +def run_guidellm_benchmark(config: llmd_runtime.ResolvedConfig, endpoint_url: str) -> None: benchmark_name = config.benchmark["job_name"] namespace = config.namespace @@ -259,9 +242,7 @@ def run_guidellm_benchmark( ) def _job_terminal() -> dict[str, object] | None: - payload = llmd_runtime.oc_get_json( - "job", name=benchmark_name, namespace=namespace - ) + payload = llmd_runtime.oc_get_json("job", name=benchmark_name, 
namespace=namespace) status = payload.get("status", {}) if status.get("succeeded"): return payload @@ -379,12 +360,8 @@ def capture_inference_service_state(config: llmd_runtime.ResolvedConfig) -> None artifacts_dir / "llminferenceservice.replicasets.yaml", selector=selector, ) - capture_get( - "pods", None, namespace, "wide", artifacts_dir / "namespace.pods.status" - ) - capture_get( - "services", None, namespace, "wide", artifacts_dir / "namespace.services.status" - ) + capture_get("pods", None, namespace, "wide", artifacts_dir / "namespace.pods.status") + capture_get("services", None, namespace, "wide", artifacts_dir / "namespace.services.status") pod_list = llmd_runtime.oc_get_json( "pods", namespace=namespace, selector=selector, ignore_not_found=True @@ -459,9 +436,7 @@ def capture_guidellm_state(config: llmd_runtime.ResolvedConfig) -> None: capture_output=True, ) if result.returncode == 0 and result.stdout: - llmd_runtime.write_text( - artifacts_dir / "guidellm_benchmark_job.logs", result.stdout - ) + llmd_runtime.write_text(artifacts_dir / "guidellm_benchmark_job.logs", result.stdout) def capture_get( diff --git a/tests/llm_d/test_runtime.py b/tests/llm_d/test_runtime.py index d130e781..4fe116ee 100644 --- a/tests/llm_d/test_runtime.py +++ b/tests/llm_d/test_runtime.py @@ -34,9 +34,7 @@ def test_load_run_configuration_resolves_alias( encoding="utf-8", ) - config = llmd_runtime.load_run_configuration( - cwd=tmp_path, artifact_dir=artifact_dir - ) + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) assert config.preset_name == "smoke" assert config.preset_alias == "cks" @@ -63,16 +61,12 @@ def test_load_run_configuration_consolidates_config_d( assert consolidated["runtime"]["default_preset"] == "smoke" -def test_namespace_override_is_not_managed( - tmp_path: Path, monkeypatch: pytest.MonkeyPatch -) -> None: +def test_namespace_override_is_not_managed(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", '{"namespace":"custom-ns"}') artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() - config = llmd_runtime.load_run_configuration( - cwd=tmp_path, artifact_dir=artifact_dir - ) + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) assert config.namespace == "custom-ns" assert config.namespace_is_managed is False @@ -89,9 +83,7 @@ def test_default_namespace_comes_from_project_config( encoding="utf-8", ) - config = llmd_runtime.load_run_configuration( - cwd=tmp_path, artifact_dir=artifact_dir - ) + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) assert config.namespace == "forge-llm-d" assert config.namespace_is_managed is False @@ -104,9 +96,7 @@ def test_render_inference_service_injects_model_and_epp( artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() - config = llmd_runtime.load_run_configuration( - cwd=tmp_path, artifact_dir=artifact_dir - ) + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) manifest = llmd_runtime.render_inference_service(config) cache_spec = llmd_runtime.resolve_model_cache(config) @@ -115,23 +105,17 @@ def test_render_inference_service_injects_model_and_epp( assert manifest["spec"]["model"]["name"] == "Qwen/Qwen3-0.6B" assert manifest["spec"]["model"]["uri"] == cache_spec.model_uri assert manifest["spec"]["model"]["name"] == config.model["served_model_name"] - router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0][ - "args" - ] + 
router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0]["args"] assert router_args[-2] == "--config-text" assert "EndpointPickerConfig" in router_args[-1] -def test_resolve_model_cache_for_hf_model( - tmp_path: Path, monkeypatch: pytest.MonkeyPatch -) -> None: +def test_resolve_model_cache_for_hf_model(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() - config = llmd_runtime.load_run_configuration( - cwd=tmp_path, artifact_dir=artifact_dir - ) + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) cache_spec = llmd_runtime.resolve_model_cache(config) assert cache_spec is not None @@ -149,9 +133,7 @@ def test_render_model_cache_job_for_hf_model( artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() - config = llmd_runtime.load_run_configuration( - cwd=tmp_path, artifact_dir=artifact_dir - ) + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) cache_spec = llmd_runtime.resolve_model_cache(config) manifest = llmd_runtime.render_model_cache_job(config, cache_spec) @@ -176,9 +158,7 @@ def test_render_model_cache_job_for_oci_model_uses_registry_auth_secret( encoding="utf-8", ) - config = llmd_runtime.load_run_configuration( - cwd=tmp_path, artifact_dir=artifact_dir - ) + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) monkeypatch.setattr( llmd_runtime, "resolve_default_serviceaccount_image_pull_secret", @@ -188,9 +168,7 @@ def test_render_model_cache_job_for_oci_model_uses_registry_auth_secret( manifest = llmd_runtime.render_model_cache_job(config, cache_spec) container = manifest["spec"]["template"]["spec"]["containers"][0] - volume_names = { - volume["name"] for volume in manifest["spec"]["template"]["spec"]["volumes"] - } + volume_names = {volume["name"] for volume in manifest["spec"]["template"]["spec"]["volumes"]} assert cache_spec.source_scheme == "oci" assert container["name"] == "oci-model-extractor" @@ -211,9 +189,7 @@ def test_render_guidellm_job_uses_target_and_rate( encoding="utf-8", ) - config = llmd_runtime.load_run_configuration( - cwd=tmp_path, artifact_dir=artifact_dir - ) + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) manifest = llmd_runtime.render_guidellm_job(config, "https://example.test") container = manifest["spec"]["template"]["spec"]["containers"][0] @@ -229,10 +205,7 @@ def test_prepare_model_cache_skips_ready_pvc( artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() - config = llmd_runtime.load_run_configuration( - cwd=tmp_path, artifact_dir=artifact_dir - ) - cache_spec = llmd_runtime.resolve_model_cache(config) + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) calls: list[str] = [] monkeypatch.setattr( @@ -240,9 +213,7 @@ def test_prepare_model_cache_skips_ready_pvc( "ensure_model_cache_pvc", lambda _config, _cache_spec: calls.append("ensure-pvc"), ) - monkeypatch.setattr( - llmd_runtime, "model_cache_pvc_ready", lambda _cache_spec: True - ) + monkeypatch.setattr(llmd_runtime, "model_cache_pvc_ready", lambda _cache_spec: True) monkeypatch.setattr( prepare_model_cache_toolbox, "capture_model_cache_state", @@ -266,9 +237,7 @@ def test_cleanup_deletes_leftovers_but_not_namespace_or_preserved_pvcs( artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() - config = llmd_runtime.load_run_configuration( - cwd=tmp_path, 
artifact_dir=artifact_dir - ) + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) oc_calls: list[tuple[str, ...]] = [] def fake_resource_exists(kind: str, name: str, namespace: str | None = None) -> bool: @@ -305,9 +274,7 @@ def test_prepare_gpu_operator_skips_existing_clusterpolicy( monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() - config = llmd_runtime.load_run_configuration( - cwd=tmp_path, artifact_dir=artifact_dir - ) + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) calls: list[str] = [] @@ -357,9 +324,7 @@ def test_prepare_gpu_operator_bootstraps_missing_clusterpolicy( monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() - config = llmd_runtime.load_run_configuration( - cwd=tmp_path, artifact_dir=artifact_dir - ) + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) applied: list[Path] = [] manifest = { @@ -371,9 +336,7 @@ def test_prepare_gpu_operator_bootstraps_missing_clusterpolicy( monkeypatch.setattr(prepare_toolbox, "ensure_operator_subscription", lambda _: None) monkeypatch.setattr(llmd_runtime, "wait_for_crd", lambda *_, **__: None) - monkeypatch.setattr( - llmd_runtime, "load_manifest_template", lambda _config, _path: manifest - ) + monkeypatch.setattr(llmd_runtime, "load_manifest_template", lambda _config, _path: manifest) monkeypatch.setattr(llmd_runtime, "resource_exists", lambda kind, name: False) monkeypatch.setattr( llmd_runtime, @@ -417,9 +380,7 @@ def test_resolve_endpoint_url_requires_gateway_address( monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() - config = llmd_runtime.load_run_configuration( - cwd=tmp_path, artifact_dir=artifact_dir - ) + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) def fake_oc_get_json(kind: str, **_: object) -> dict[str, object]: assert kind == "llminferenceservice" From d97dc9bf108a8255e84978f8b243599834d61481 Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Mon, 20 Apr 2026 10:43:22 +0100 Subject: [PATCH 05/21] refactor: Separate llm_d scheduler profiles --- .../llm_d/orchestration/config.d/platform.yaml | 1 - .../llm_d/orchestration/config.d/runtime.yaml | 1 + .../config.d/scheduler_profiles.yaml | 2 ++ projects/llm_d/orchestration/llmd_runtime.py | 15 ++++++++++++--- .../llm_d/orchestration/presets.d/presets.yaml | 2 ++ .../approximate-prefix-cache.yaml} | 0 tests/llm_d/test_runtime.py | 4 +++- 7 files changed, 20 insertions(+), 5 deletions(-) create mode 100644 projects/llm_d/orchestration/config.d/scheduler_profiles.yaml rename projects/llm_d/orchestration/{manifests/epp-approximate-prefix-cache.yaml => scheduler_profiles/approximate-prefix-cache.yaml} (100%) diff --git a/projects/llm_d/orchestration/config.d/platform.yaml b/projects/llm_d/orchestration/config.d/platform.yaml index 9f3b9e0e..9ef74568 100644 --- a/projects/llm_d/orchestration/config.d/platform.yaml +++ b/projects/llm_d/orchestration/config.d/platform.yaml @@ -68,7 +68,6 @@ gateway: inference_service: name: llm-d template: manifests/llminferenceservice.yaml - epp_config_template: manifests/epp-approximate-prefix-cache.yaml workload_deployment_name_suffix: -kserve pod_appearance_timeout_seconds: 600 ready_timeout_seconds: 1800 diff --git a/projects/llm_d/orchestration/config.d/runtime.yaml b/projects/llm_d/orchestration/config.d/runtime.yaml 
index 982d8fd2..4f1bfb98 100644 --- a/projects/llm_d/orchestration/config.d/runtime.yaml +++ b/projects/llm_d/orchestration/config.d/runtime.yaml @@ -3,5 +3,6 @@ allowed_override_keys: - namespace selected_preset: smoke model_key: qwen3-0-6b +scheduler_profile_key: approximate-prefix-cache smoke_request_key: default benchmark_key: null diff --git a/projects/llm_d/orchestration/config.d/scheduler_profiles.yaml b/projects/llm_d/orchestration/config.d/scheduler_profiles.yaml new file mode 100644 index 00000000..cb579d9b --- /dev/null +++ b/projects/llm_d/orchestration/config.d/scheduler_profiles.yaml @@ -0,0 +1,2 @@ +approximate-prefix-cache: + config_path: scheduler_profiles/approximate-prefix-cache.yaml diff --git a/projects/llm_d/orchestration/llmd_runtime.py b/projects/llm_d/orchestration/llmd_runtime.py index 48c503f3..69206a2d 100644 --- a/projects/llm_d/orchestration/llmd_runtime.py +++ b/projects/llm_d/orchestration/llmd_runtime.py @@ -41,6 +41,8 @@ class ResolvedConfig: platform: dict[str, Any] model_key: str model: dict[str, Any] + scheduler_profile_key: str + scheduler_profile: dict[str, Any] model_cache: dict[str, Any] smoke_request: dict[str, Any] benchmark: dict[str, Any] | None @@ -122,6 +124,11 @@ def load_run_configuration( model_name = config.project.get_config("runtime.model_key") model = copy.deepcopy(config.project.get_config(f"models.{model_name}")) + scheduler_profile_key = config.project.get_config("runtime.scheduler_profile_key") + scheduler_profile = copy.deepcopy( + config.project.get_config(f"scheduler_profiles.{scheduler_profile_key}") + ) + smoke_request_name = config.project.get_config("runtime.smoke_request_key") smoke_request = copy.deepcopy( config.project.get_config(f"workloads.smoke_requests.{smoke_request_name}") @@ -165,6 +172,8 @@ def load_run_configuration( platform=platform_data, model_key=model_name, model=model, + scheduler_profile_key=scheduler_profile_key, + scheduler_profile=scheduler_profile, model_cache=model_cache, smoke_request=smoke_request, benchmark=benchmark, @@ -953,12 +962,12 @@ def render_inference_service(config: ResolvedConfig) -> dict[str, Any]: config.model["resources"] ) - epp_path = config.config_dir / config.platform["inference_service"]["epp_config_template"] - epp_config = epp_path.read_text(encoding="utf-8") + scheduler_profile_path = config.config_dir / config.scheduler_profile["config_path"] + scheduler_profile_config = scheduler_profile_path.read_text(encoding="utf-8") router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0]["args"] if not router_args or router_args[-1] != "--config-text": raise ValueError("Expected llm-d router args to end with --config-text") - router_args.append(epp_config) + router_args.append(scheduler_profile_config) return manifest diff --git a/projects/llm_d/orchestration/presets.d/presets.yaml b/projects/llm_d/orchestration/presets.d/presets.yaml index 37fcc711..9fc1392a 100644 --- a/projects/llm_d/orchestration/presets.d/presets.yaml +++ b/projects/llm_d/orchestration/presets.d/presets.yaml @@ -3,12 +3,14 @@ __multiple: true smoke: runtime.selected_preset: smoke runtime.model_key: qwen3-0-6b + runtime.scheduler_profile_key: approximate-prefix-cache runtime.smoke_request_key: default runtime.benchmark_key: null benchmark-short: runtime.selected_preset: benchmark-short runtime.model_key: llama-3-1-8b-instruct-fp8 + runtime.scheduler_profile_key: approximate-prefix-cache runtime.smoke_request_key: default runtime.benchmark_key: short diff --git 
a/projects/llm_d/orchestration/manifests/epp-approximate-prefix-cache.yaml b/projects/llm_d/orchestration/scheduler_profiles/approximate-prefix-cache.yaml similarity index 100% rename from projects/llm_d/orchestration/manifests/epp-approximate-prefix-cache.yaml rename to projects/llm_d/orchestration/scheduler_profiles/approximate-prefix-cache.yaml diff --git a/tests/llm_d/test_runtime.py b/tests/llm_d/test_runtime.py index 4fe116ee..50f84000 100644 --- a/tests/llm_d/test_runtime.py +++ b/tests/llm_d/test_runtime.py @@ -57,6 +57,7 @@ def test_load_run_configuration_consolidates_config_d( assert "model_cache" in consolidated assert "models" in consolidated assert "runtime" in consolidated + assert "scheduler_profiles" in consolidated assert "workloads" in consolidated assert consolidated["runtime"]["default_preset"] == "smoke" @@ -89,7 +90,7 @@ def test_default_namespace_comes_from_project_config( assert config.namespace_is_managed is False -def test_render_inference_service_injects_model_and_epp( +def test_render_inference_service_injects_model_and_scheduler_profile( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") @@ -105,6 +106,7 @@ def test_render_inference_service_injects_model_and_epp( assert manifest["spec"]["model"]["name"] == "Qwen/Qwen3-0.6B" assert manifest["spec"]["model"]["uri"] == cache_spec.model_uri assert manifest["spec"]["model"]["name"] == config.model["served_model_name"] + assert config.scheduler_profile_key == "approximate-prefix-cache" router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0]["args"] assert router_args[-2] == "--config-text" assert "EndpointPickerConfig" in router_args[-1] From 58abf1157c595190c2ac60c2c6f9627bf9b792d6 Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Mon, 20 Apr 2026 11:35:56 +0100 Subject: [PATCH 06/21] refactor: Rename llm_d capture toolbox --- projects/core/dsl/log.py | 4 ++-- projects/core/dsl/runtime.py | 4 ++-- .../{capture_isvc_state => capture_llmisvc_state}/main.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) rename projects/llm_d/toolbox/{capture_isvc_state => capture_llmisvc_state}/main.py (99%) diff --git a/projects/core/dsl/log.py b/projects/core/dsl/log.py index b5c911de..b93a5076 100644 --- a/projects/core/dsl/log.py +++ b/projects/core/dsl/log.py @@ -117,6 +117,6 @@ def _get_toolbox_function_name(filename): """Extract toolbox function name from file path (parent directory name)""" filename_path = Path(filename) - # For paths like projects/llm_d/toolbox/capture_isvc_state/main.py - # Return the parent directory name: capture_isvc_state + # For paths like projects/llm_d/toolbox/capture_llmisvc_state/main.py + # Return the parent directory name: capture_llmisvc_state return filename_path.parent.name diff --git a/projects/core/dsl/runtime.py b/projects/core/dsl/runtime.py index d1afff31..c8f807db 100644 --- a/projects/core/dsl/runtime.py +++ b/projects/core/dsl/runtime.py @@ -413,6 +413,6 @@ def _get_toolbox_function_name(filename): """Extract toolbox function name from file path (parent directory name)""" filename_path = Path(filename) - # For paths like projects/llm_d/toolbox/capture_isvc_state/main.py - # Return the parent directory name: capture_isvc_state + # For paths like projects/llm_d/toolbox/capture_llmisvc_state/main.py + # Return the parent directory name: capture_llmisvc_state return filename_path.parent.name diff --git a/projects/llm_d/toolbox/capture_isvc_state/main.py 
b/projects/llm_d/toolbox/capture_llmisvc_state/main.py similarity index 99% rename from projects/llm_d/toolbox/capture_isvc_state/main.py rename to projects/llm_d/toolbox/capture_llmisvc_state/main.py index 85d09bc8..1e4577c5 100644 --- a/projects/llm_d/toolbox/capture_isvc_state/main.py +++ b/projects/llm_d/toolbox/capture_llmisvc_state/main.py @@ -2,7 +2,7 @@ """ LLMInferenceService state capture using task-based DSL -Replaces llmd_capture_isvc_state Ansible role +Replaces llmd_capture_llmisvc_state Ansible role """ from projects.core.dsl import execute_tasks, shell, task, toolbox From 6155c42591153927d355a90f4338a387bfff20c7 Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Mon, 20 Apr 2026 14:23:51 +0100 Subject: [PATCH 07/21] fix: Harden llm_d runtime command handling --- projects/llm_d/orchestration/llmd_runtime.py | 68 ++++++++++++---- tests/llm_d/test_runtime.py | 82 ++++++++++++++++++++ 2 files changed, 133 insertions(+), 17 deletions(-) diff --git a/projects/llm_d/orchestration/llmd_runtime.py b/projects/llm_d/orchestration/llmd_runtime.py index 69206a2d..53c270fa 100644 --- a/projects/llm_d/orchestration/llmd_runtime.py +++ b/projects/llm_d/orchestration/llmd_runtime.py @@ -358,16 +358,26 @@ def run_command( check: bool = True, capture_output: bool = True, input_text: str | None = None, + timeout_seconds: float | None = 300, ) -> subprocess.CompletedProcess[str]: cmd = [str(arg) for arg in args] LOGGER.info("run: %s", " ".join(shlex.quote(arg) for arg in cmd)) - result = subprocess.run( - cmd, - check=False, - text=True, - capture_output=capture_output, - input=input_text, - ) + try: + result = subprocess.run( + cmd, + check=False, + text=True, + capture_output=capture_output, + input=input_text, + timeout=timeout_seconds, + ) + except subprocess.TimeoutExpired: + LOGGER.error( + "Command timed out after %ss: %s", + timeout_seconds, + " ".join(shlex.quote(arg) for arg in cmd), + ) + raise if capture_output: if result.stdout: @@ -389,12 +399,14 @@ def oc( check: bool = True, capture_output: bool = True, input_text: str | None = None, + timeout_seconds: float | None = 300, ) -> subprocess.CompletedProcess[str]: return run_command( ["oc", *args], check=check, capture_output=capture_output, input_text=input_text, + timeout_seconds=timeout_seconds, ) @@ -421,21 +433,41 @@ def oc_get_json( args.extend(["-o", "json"]) result = oc(*args, check=not ignore_not_found, capture_output=True) - if ignore_not_found and result.returncode != 0: - return None + if result.returncode != 0: + if ignore_not_found and _is_oc_not_found_error(result.stderr): + return None + raise CommandError( + f"oc {' '.join(shlex.quote(arg) for arg in args)} failed with exit code " + f"{result.returncode}: {result.stderr.strip()}" + ) + if not result.stdout: + raise CommandError(f"oc {' '.join(shlex.quote(arg) for arg in args)} returned no output") return json.loads(result.stdout) def resource_exists(kind: str, name: str, *, namespace: str | None = None) -> bool: - result = oc( - "get", - kind, - name, - *([] if namespace is None else ["-n", namespace]), - check=False, - capture_output=True, + return ( + oc_get_json( + kind, + name=name, + namespace=namespace, + ignore_not_found=True, + ) + is not None ) - return result.returncode == 0 + + +def _is_oc_not_found_error(stderr: str | None) -> bool: + if not stderr: + return False + + normalized = stderr.lower() + if "error from server (notfound)" in normalized: + return True + if "no resources found" in normalized: + return True + + return bool(re.search(r"\bnot 
found\b", normalized)) def wait_until( @@ -455,6 +487,8 @@ def wait_until( return value last_error = None except Exception as exc: # pragma: no cover - exercised in integration paths + if isinstance(exc, RuntimeError): + raise last_error = exc LOGGER.info("waiting for %s: %s", description, exc) time.sleep(interval_seconds) diff --git a/tests/llm_d/test_runtime.py b/tests/llm_d/test_runtime.py index 50f84000..b7639712 100644 --- a/tests/llm_d/test_runtime.py +++ b/tests/llm_d/test_runtime.py @@ -1,5 +1,6 @@ from __future__ import annotations +import subprocess from pathlib import Path import pytest @@ -392,3 +393,84 @@ def fake_oc_get_json(kind: str, **_: object) -> dict[str, object]: with pytest.raises(RuntimeError, match="Gateway address"): test_toolbox.resolve_endpoint_url(config) + + +def test_wait_until_reraises_runtime_error() -> None: + with pytest.raises(RuntimeError, match="terminal failure"): + llmd_runtime.wait_until( + "test condition", + timeout_seconds=1, + interval_seconds=0, + predicate=lambda: (_ for _ in ()).throw(RuntimeError("terminal failure")), + ) + + +def test_oc_forwards_timeout_to_run_command(monkeypatch: pytest.MonkeyPatch) -> None: + captured: dict[str, object] = {} + + def fake_run_command(args, **kwargs): + captured["args"] = list(args) + captured["kwargs"] = kwargs + return subprocess.CompletedProcess(args, 0, stdout="", stderr="") + + monkeypatch.setattr(llmd_runtime, "run_command", fake_run_command) + + llmd_runtime.oc("get", "pods", timeout_seconds=42) + + assert captured["args"] == ["oc", "get", "pods"] + assert captured["kwargs"]["timeout_seconds"] == 42 + + +def test_oc_get_json_returns_none_only_for_not_found( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr( + llmd_runtime, + "oc", + lambda *args, **kwargs: subprocess.CompletedProcess( + args, + 1, + stdout="", + stderr='Error from server (NotFound): llminferenceservices.serving.kserve.io "llm-d" not found', + ), + ) + + payload = llmd_runtime.oc_get_json( + "llminferenceservice", + name="llm-d", + namespace="forge-llm-d", + ignore_not_found=True, + ) + + assert payload is None + + +def test_oc_get_json_raises_for_non_not_found_errors( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr( + llmd_runtime, + "oc", + lambda *args, **kwargs: subprocess.CompletedProcess( + args, + 1, + stdout="", + stderr='Error from server (Forbidden): pods is forbidden: User "alice" cannot list resource "pods"', + ), + ) + + with pytest.raises(llmd_runtime.CommandError, match="Forbidden"): + llmd_runtime.oc_get_json("pods", namespace="forge-llm-d", ignore_not_found=True) + + +def test_resource_exists_propagates_non_not_found_errors( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr( + llmd_runtime, + "oc_get_json", + lambda *args, **kwargs: (_ for _ in ()).throw(llmd_runtime.CommandError("boom")), + ) + + with pytest.raises(llmd_runtime.CommandError, match="boom"): + llmd_runtime.resource_exists("namespace", "forge-llm-d") From a066d986616e1a6427e4d5d5b73593034fbe6c47 Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Tue, 21 Apr 2026 10:32:41 +0100 Subject: [PATCH 08/21] fix: Make NFD prepare idempotent --- projects/llm_d/toolbox/prepare/main.py | 21 +++++++--- tests/llm_d/test_runtime.py | 53 ++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 6 deletions(-) diff --git a/projects/llm_d/toolbox/prepare/main.py b/projects/llm_d/toolbox/prepare/main.py index 621c34d4..831201d5 100644 --- a/projects/llm_d/toolbox/prepare/main.py +++ 
@@ -96,10 +96,19 @@ def prepare_nfd(config: llmd_runtime.ResolvedConfig) -> None:
     )

     manifest = llmd_runtime.load_manifest_template(config, operator_spec["bootstrap_manifest"])
-    llmd_runtime.apply_manifest(
-        config.artifact_dir / "src" / "nfd-nodefeaturediscovery.yaml",
-        manifest,
-    )
+    nfd_name = manifest["metadata"]["name"]
+    nfd_namespace = manifest["metadata"]["namespace"]
+    if llmd_runtime.resource_exists("nodefeaturediscovery", nfd_name, namespace=nfd_namespace):
+        LOGGER.info(
+            "NodeFeatureDiscovery/%s already exists in %s; verifying GPU discovery labels",
+            nfd_name,
+            nfd_namespace,
+        )
+    else:
+        llmd_runtime.apply_manifest(
+            config.artifact_dir / "src" / "nfd-nodefeaturediscovery.yaml",
+            manifest,
+        )

     llmd_runtime.wait_until(
         "NodeFeatureDiscovery bootstrap resource",
@@ -107,8 +116,8 @@ def prepare_nfd(config: llmd_runtime.ResolvedConfig) -> None:
         interval_seconds=10,
         predicate=lambda: llmd_runtime.resource_exists(
             "nodefeaturediscovery",
-            manifest["metadata"]["name"],
-            namespace=manifest["metadata"]["namespace"],
+            nfd_name,
+            namespace=nfd_namespace,
         ),
     )

diff --git a/tests/llm_d/test_runtime.py b/tests/llm_d/test_runtime.py
index b7639712..2c2ab085 100644
--- a/tests/llm_d/test_runtime.py
+++ b/tests/llm_d/test_runtime.py
@@ -357,6 +357,59 @@ def test_prepare_gpu_operator_bootstraps_missing_clusterpolicy(
     assert applied == [artifact_dir / "src" / "gpu-clusterpolicy.yaml"]


+def test_prepare_nfd_skips_existing_nodefeaturediscovery(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}")
+    artifact_dir = tmp_path / "artifacts"
+    artifact_dir.mkdir()
+    config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir)
+
+    calls: list[str] = []
+    manifest = {
+        "apiVersion": "nfd.openshift.io/v1",
+        "kind": "NodeFeatureDiscovery",
+        "metadata": {"name": "nfd-instance", "namespace": "openshift-nfd"},
+    }
+
+    monkeypatch.setattr(
+        prepare_toolbox,
+        "ensure_operator_subscription",
+        lambda operator_spec: calls.append(f"subscription:{operator_spec['package']}"),
+    )
+    monkeypatch.setattr(
+        llmd_runtime,
+        "wait_for_crd",
+        lambda crd_name, *, timeout_seconds: calls.append(f"crd:{crd_name}"),
+    )
+    monkeypatch.setattr(llmd_runtime, "load_manifest_template", lambda _config, _path: manifest)
+    monkeypatch.setattr(llmd_runtime, "resource_exists", lambda *args, **kwargs: True)
+    monkeypatch.setattr(
+        llmd_runtime,
+        "wait_until",
+        lambda *args, **kwargs: calls.append("wait-nfd"),
+    )
+    monkeypatch.setattr(
+        prepare_toolbox,
+        "wait_for_nfd_gpu_labels",
+        lambda _config, *, timeout_seconds: calls.append("wait-labels"),
+    )
+
+    def fail_apply(*_: object, **__: object) -> None:
+        raise AssertionError("existing NodeFeatureDiscovery must not be reapplied")
+
+    monkeypatch.setattr(llmd_runtime, "apply_manifest", fail_apply)
+
+    prepare_toolbox.prepare_nfd(config)
+
+    assert calls == [
+        "subscription:nfd",
+        "crd:nodefeaturediscoveries.nfd.openshift.io",
+        "wait-nfd",
+        "wait-labels",
+    ]
+
+
 def test_gpu_clusterpolicy_manifest_has_required_default_sections() -> None:
     manifest = llmd_runtime.load_yaml(
         llmd_runtime.CONFIG_DIR / "manifests" / "gpu-clusterpolicy.yaml"

From 819b46bc139820200c885c7124632ebd4611bdd7 Mon Sep 17 00:00:00 2001
From: Alberto Perdomo
Date: Tue, 21 Apr 2026 10:33:00 +0100
Subject: [PATCH 09/21] fix: Run llm_d smoke in helper job

---
 .../orchestration/config.d/platform.yaml     |   3 +
 projects/llm_d/orchestration/llmd_runtime.py |  79 ++++++++++++
 projects/llm_d/toolbox/test/main.py          | 115 +++++++++++++-----
 tests/llm_d/test_runtime.py                  |  59 +++++++++
 4 files changed, 225 insertions(+), 31 deletions(-)

diff --git a/projects/llm_d/orchestration/config.d/platform.yaml b/projects/llm_d/orchestration/config.d/platform.yaml
index 9ef74568..43092e7c 100644
--- a/projects/llm_d/orchestration/config.d/platform.yaml
+++ b/projects/llm_d/orchestration/config.d/platform.yaml
@@ -77,6 +77,9 @@ artifacts:
   capture_namespace_events: true

 smoke:
+  job_name: llm-d-smoke
+  client_image: curlimages/curl:8.11.1
   endpoint_path: /v1/completions
   request_retries: 30
   request_retry_delay_seconds: 10
+  request_timeout_seconds: 60
diff --git a/projects/llm_d/orchestration/llmd_runtime.py b/projects/llm_d/orchestration/llmd_runtime.py
index 53c270fa..733b04ec 100644
--- a/projects/llm_d/orchestration/llmd_runtime.py
+++ b/projects/llm_d/orchestration/llmd_runtime.py
@@ -1006,6 +1006,85 @@ def render_inference_service(config: ResolvedConfig) -> dict[str, Any]:
     return manifest


+def render_smoke_request_job(
+    config: ResolvedConfig, endpoint_url: str, payload: dict[str, Any]
+) -> dict[str, Any]:
+    smoke = config.platform["smoke"]
+    command = """
+set -eu
+attempt=1
+while [ "${attempt}" -le "${REQUEST_RETRIES}" ]; do
+  if curl -k -sSf --max-time "${REQUEST_TIMEOUT_SECONDS}" \
+    "${ENDPOINT_URL}${ENDPOINT_PATH}" \
+    -H "Content-Type: application/json" \
+    -d "${REQUEST_PAYLOAD}" \
+    -o /tmp/smoke-response.json \
+    2>/tmp/smoke-error.log; then
+    cat /tmp/smoke-response.json
+    exit 0
+  fi
+  attempt=$((attempt + 1))
+  sleep "${REQUEST_RETRY_DELAY_SECONDS}"
+done
+cat /tmp/smoke-error.log >&2 || true
+exit 1
+"""
+
+    return {
+        "apiVersion": "batch/v1",
+        "kind": "Job",
+        "metadata": {
+            "name": smoke["job_name"],
+            "namespace": config.namespace,
+            "labels": {
+                "app.kubernetes.io/managed-by": "forge",
+                "forge.openshift.io/project": "llm_d",
+                "forge.openshift.io/component": "smoke",
+            },
+        },
+        "spec": {
+            "backoffLimit": 0,
+            "activeDeadlineSeconds": (
+                smoke["request_retries"]
+                * (smoke["request_timeout_seconds"] + smoke["request_retry_delay_seconds"])
+            ),
+            "template": {
+                "metadata": {
+                    "labels": {
+                        "app.kubernetes.io/managed-by": "forge",
+                        "forge.openshift.io/project": "llm_d",
+                        "forge.openshift.io/component": "smoke",
+                    }
+                },
+                "spec": {
+                    "restartPolicy": "Never",
+                    "containers": [
+                        {
+                            "name": "smoke",
+                            "image": smoke["client_image"],
+                            "command": ["/bin/sh", "-ceu", command],
+                            "env": [
+                                {"name": "ENDPOINT_URL", "value": endpoint_url},
+                                {"name": "ENDPOINT_PATH", "value": smoke["endpoint_path"]},
+                                {"name": "REQUEST_PAYLOAD", "value": json.dumps(payload)},
+                                {"name": "REQUEST_RETRIES", "value": str(smoke["request_retries"])},
+                                {
+                                    "name": "REQUEST_RETRY_DELAY_SECONDS",
+                                    "value": str(smoke["request_retry_delay_seconds"]),
+                                },
+                                {
+                                    "name": "REQUEST_TIMEOUT_SECONDS",
+                                    "value": str(smoke["request_timeout_seconds"]),
+                                },
+                            ],
+                        }
+                    ],
+                },
+            },
+        },
+    }
+
+
 def render_guidellm_pvc(config: ResolvedConfig) -> dict[str, Any]:
     if not config.benchmark:
         raise ValueError("Benchmark configuration is not enabled for this preset")
diff --git a/projects/llm_d/toolbox/test/main.py b/projects/llm_d/toolbox/test/main.py
index 0ea05751..9c6242b6 100644
--- a/projects/llm_d/toolbox/test/main.py
+++ b/projects/llm_d/toolbox/test/main.py
@@ -4,7 +4,6 @@

 import json
 import logging
-import time
 from pathlib import Path

 from projects.core.dsl import toolbox
@@ -40,6 +39,16 @@ def run_test(config: llmd_runtime.ResolvedConfig) -> int:
     if endpoint_url:
         llmd_runtime.write_text(artifacts_dir / "endpoint.url", f"{endpoint_url}\n")
     benchmark_name = config.benchmark["job_name"] if config.benchmark else "guidellm-benchmark"
+    smoke_job_name = config.platform["smoke"]["job_name"]
+    llmd_runtime.oc(
+        "delete",
+        "job",
+        smoke_job_name,
+        "-n",
+        namespace,
+        "--ignore-not-found=true",
+        check=False,
+    )
     llmd_runtime.oc(
         "delete",
         "job,pvc",
@@ -160,10 +169,7 @@ def try_resolve_endpoint_url(config: llmd_runtime.ResolvedConfig) -> str | None:


 def run_smoke_request(config: llmd_runtime.ResolvedConfig, endpoint_url: str) -> dict[str, object]:
     namespace = config.namespace
-    name = config.platform["inference_service"]["name"]
-    deployment_name = (
-        f"{name}{config.platform['inference_service']['workload_deployment_name_suffix']}"
-    )
+    job_name = config.platform["smoke"]["job_name"]

     payload = {
@@ -173,35 +179,56 @@
     llmd_runtime.write_json(config.artifact_dir / "artifacts" / "smoke.request.json", payload)

-    retries = config.platform["smoke"]["request_retries"]
-    delay = config.platform["smoke"]["request_retry_delay_seconds"]
-    result = None
-    for _ in range(retries):
-        result = llmd_runtime.oc(
-            "exec",
-            "-n",
+    llmd_runtime.oc(
+        "delete",
+        "job",
+        job_name,
+        "-n",
+        namespace,
+        "--ignore-not-found=true",
+        check=False,
+    )
+    llmd_runtime.wait_until(
+        f"job/{job_name} deletion in {namespace}",
+        timeout_seconds=120,
+        interval_seconds=5,
+        predicate=lambda: not llmd_runtime.resource_exists("job", job_name, namespace=namespace),
+    )
+
+    llmd_runtime.apply_manifest(
+        config.artifact_dir / "src" / "smoke-job.yaml",
+        llmd_runtime.render_smoke_request_job(config, endpoint_url, payload),
+    )
+
+    try:
+        llmd_runtime.wait_for_job_completion(
+            job_name,
             namespace,
-            f"deployment/{deployment_name}",
-            "-c",
-            "main",
-            "--",
-            "curl",
-            "-k",
-            "-sSf",
-            f"{endpoint_url}{config.platform['smoke']['endpoint_path']}",
-            "-H",
-            "Content-Type: application/json",
-            "-d",
-            json.dumps(payload),
-            check=False,
-            capture_output=True,
+            timeout_seconds=(
+                config.platform["smoke"]["request_retries"]
+                * (
+                    config.platform["smoke"]["request_timeout_seconds"]
+                    + config.platform["smoke"]["request_retry_delay_seconds"]
+                )
+            ),
+            interval_seconds=5,
         )
-        if result.returncode == 0:
-            break
-        time.sleep(delay)
+    finally:
+        capture_smoke_state(config)

-    if result is None or result.returncode != 0:
-        raise RuntimeError("Smoke request never succeeded against the llm_d endpoint")
+    result = llmd_runtime.oc(
+        "logs",
+        f"job/{job_name}",
+        "-n",
+        namespace,
+        check=False,
+        capture_output=True,
+    )
+
+    if result.returncode != 0 or not result.stdout:
+        raise RuntimeError(
+            f"Smoke request job {job_name} completed but response logs could not be read: {result.stderr}"
+        )

     response = json.loads(result.stdout)
     if not response.get("choices"):
@@ -209,6 +236,32 @@
     return response


+def capture_smoke_state(config: llmd_runtime.ResolvedConfig) -> None:
+    job_name = config.platform["smoke"]["job_name"]
+    namespace = config.namespace
+    artifacts_dir = config.artifact_dir / "artifacts"
+
+    capture_get("job", job_name, namespace, "yaml", artifacts_dir / "smoke_job.yaml")
+    capture_get(
+        "pods",
+        None,
+        namespace,
+        "yaml",
+        artifacts_dir / "smoke_job.pods.yaml",
+        selector=f"job-name={job_name}",
+    )
+    result = llmd_runtime.oc(
+        "logs",
+        f"job/{job_name}",
+        "-n",
+        namespace,
+        check=False,
+        capture_output=True,
+    )
+    if result.returncode == 0 and result.stdout:
+        llmd_runtime.write_text(artifacts_dir / "smoke_job.logs", result.stdout)
+
+
 def run_guidellm_benchmark(config: llmd_runtime.ResolvedConfig, endpoint_url: str) -> None:
     benchmark_name = config.benchmark["job_name"]
     namespace = config.namespace
diff --git a/tests/llm_d/test_runtime.py b/tests/llm_d/test_runtime.py
index 2c2ab085..4c0c2a34 100644
--- a/tests/llm_d/test_runtime.py
+++ b/tests/llm_d/test_runtime.py
@@ -201,6 +201,28 @@ def test_render_guidellm_job_uses_target_and_rate(
     assert "--rate=1" in container["args"]


+def test_render_smoke_request_job_uses_curl_helper(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}")
+    artifact_dir = tmp_path / "artifacts"
+    artifact_dir.mkdir()
+
+    config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir)
+    payload = {"model": "Qwen/Qwen3-0.6B", "prompt": "test"}
+    manifest = llmd_runtime.render_smoke_request_job(config, "https://example.test", payload)
+
+    container = manifest["spec"]["template"]["spec"]["containers"][0]
+    env = {item["name"]: item["value"] for item in container["env"]}
+
+    assert manifest["kind"] == "Job"
+    assert manifest["metadata"]["name"] == "llm-d-smoke"
+    assert container["image"] == "curlimages/curl:8.11.1"
+    assert env["ENDPOINT_URL"] == "https://example.test"
+    assert env["ENDPOINT_PATH"] == "/v1/completions"
+    assert env["REQUEST_PAYLOAD"] == '{"model": "Qwen/Qwen3-0.6B", "prompt": "test"}'
+
+
 def test_prepare_model_cache_skips_ready_pvc(
     tmp_path: Path, monkeypatch: pytest.MonkeyPatch
 ) -> None:
@@ -448,6 +470,43 @@ def fake_oc_get_json(kind: str, **_: object) -> dict[str, object]:
         test_toolbox.resolve_endpoint_url(config)


+def test_run_smoke_request_uses_helper_job(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}")
+    artifact_dir = tmp_path / "artifacts"
+    artifact_dir.mkdir()
+    config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir)
+    oc_calls: list[tuple[str, ...]] = []
+    applied: list[Path] = []
+
+    def fake_oc(*args, **kwargs):
+        oc_calls.append(tuple(args))
+        if args[:2] == ("logs", "job/llm-d-smoke"):
+            return subprocess.CompletedProcess(
+                args,
+                0,
+                stdout='{"choices":[{"text":"ok"}]}\n',
+                stderr="",
+            )
+        return subprocess.CompletedProcess(args, 0, stdout="", stderr="")
+
+    monkeypatch.setattr(llmd_runtime, "oc", fake_oc)
+    monkeypatch.setattr(llmd_runtime, "resource_exists", lambda *args, **kwargs: False)
+    monkeypatch.setattr(llmd_runtime, "wait_until", lambda *args, **kwargs: True)
+    monkeypatch.setattr(llmd_runtime, "wait_for_job_completion", lambda *args, **kwargs: True)
+    monkeypatch.setattr(
+        llmd_runtime,
+        "apply_manifest",
+        lambda artifact_path, _manifest: applied.append(artifact_path),
+    )
+    monkeypatch.setattr(test_toolbox, "capture_smoke_state", lambda _config: None)
+
+    response = test_toolbox.run_smoke_request(config, "https://example.test")
+
+    assert response["choices"][0]["text"] == "ok"
+    assert applied == [artifact_dir / "src" / "smoke-job.yaml"]
+    assert not any(call and call[0] == "exec" for call in oc_calls)
+
+
 def test_wait_until_reraises_runtime_error() -> None:
     with pytest.raises(RuntimeError, match="terminal failure"):
         llmd_runtime.wait_until(

From 7d4b39bc89f45d1a1941973bcf1d30118778546 Mon Sep 17 00:00:00 2001
From: Alberto Perdomo
Date: Tue, 21 Apr 2026 12:59:17 +0100
Subject: [PATCH 10/21] feat: Add llm_d scheduler profiles

---
 .../llm_d/orchestration/config.d/runtime.yaml |  2 +-
 .../config.d/scheduler_profiles.yaml          |  9 +++-
 projects/llm_d/orchestration/llmd_runtime.py  | 17 ++++++--
 .../orchestration/presets.d/presets.yaml      | 16 ++++++-
 ...ate-prefix-cache.yaml => approximate.yaml} |  0
 .../scheduler_profiles/precise.yaml           | 26 +++++++++++
 tests/llm_d/test_runtime.py                   | 43 ++++++++++++++++++-
 7 files changed, 104 insertions(+), 9 deletions(-)
 rename projects/llm_d/orchestration/scheduler_profiles/{approximate-prefix-cache.yaml => approximate.yaml} (100%)
 create mode 100644 projects/llm_d/orchestration/scheduler_profiles/precise.yaml

diff --git a/projects/llm_d/orchestration/config.d/runtime.yaml b/projects/llm_d/orchestration/config.d/runtime.yaml
index 4f1bfb98..c8715ccb 100644
--- a/projects/llm_d/orchestration/config.d/runtime.yaml
+++ b/projects/llm_d/orchestration/config.d/runtime.yaml
@@ -3,6 +3,6 @@ allowed_override_keys:
   - namespace
 selected_preset: smoke
 model_key: qwen3-0-6b
-scheduler_profile_key: approximate-prefix-cache
+scheduler_profile_key: approximate
 smoke_request_key: default
 benchmark_key: null
diff --git a/projects/llm_d/orchestration/config.d/scheduler_profiles.yaml b/projects/llm_d/orchestration/config.d/scheduler_profiles.yaml
index cb579d9b..b3bca162 100644
--- a/projects/llm_d/orchestration/config.d/scheduler_profiles.yaml
+++ b/projects/llm_d/orchestration/config.d/scheduler_profiles.yaml
@@ -1,2 +1,9 @@
+approximate:
+  config_path: scheduler_profiles/approximate.yaml
+
+precise:
+  config_path: scheduler_profiles/precise.yaml
+
+# Compatibility alias for earlier llm_d presets.
 approximate-prefix-cache:
-  config_path: scheduler_profiles/approximate-prefix-cache.yaml
+  config_path: scheduler_profiles/approximate.yaml
diff --git a/projects/llm_d/orchestration/llmd_runtime.py b/projects/llm_d/orchestration/llmd_runtime.py
index 733b04ec..c35b5a31 100644
--- a/projects/llm_d/orchestration/llmd_runtime.py
+++ b/projects/llm_d/orchestration/llmd_runtime.py
@@ -42,7 +42,7 @@ class ResolvedConfig:
     model_key: str
     model: dict[str, Any]
     scheduler_profile_key: str
-    scheduler_profile: dict[str, Any]
+    scheduler_profile: dict[str, Any] | None
     model_cache: dict[str, Any]
     smoke_request: dict[str, Any]
     benchmark: dict[str, Any] | None
@@ -125,9 +125,11 @@ def load_run_configuration(
     model = copy.deepcopy(config.project.get_config(f"models.{model_name}"))

     scheduler_profile_key = config.project.get_config("runtime.scheduler_profile_key")
-    scheduler_profile = copy.deepcopy(
-        config.project.get_config(f"scheduler_profiles.{scheduler_profile_key}")
-    )
+    scheduler_profile = None
+    if scheduler_profile_key != "default":
+        scheduler_profile = copy.deepcopy(
+            config.project.get_config(f"scheduler_profiles.{scheduler_profile_key}")
+        )

     smoke_request_name = config.project.get_config("runtime.smoke_request_key")
     smoke_request = copy.deepcopy(
@@ -996,6 +998,13 @@ def render_inference_service(config: ResolvedConfig) -> dict[str, Any]:
         config.model["resources"]
     )

+    if config.scheduler_profile_key == "default":
+        manifest["spec"]["router"]["scheduler"] = {}
+        return manifest
+
+    if config.scheduler_profile is None:
+        raise ValueError(f"Missing scheduler profile config for {config.scheduler_profile_key}")
+
     scheduler_profile_path = config.config_dir / config.scheduler_profile["config_path"]
     scheduler_profile_config = scheduler_profile_path.read_text(encoding="utf-8")
     router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0]["args"]
diff --git a/projects/llm_d/orchestration/presets.d/presets.yaml b/projects/llm_d/orchestration/presets.d/presets.yaml
index 9fc1392a..0b3de3a7 100644
--- a/projects/llm_d/orchestration/presets.d/presets.yaml
+++ b/projects/llm_d/orchestration/presets.d/presets.yaml
@@ -3,14 +3,26 @@ __multiple: true

 smoke:
   runtime.selected_preset: smoke
   runtime.model_key: qwen3-0-6b
-  runtime.scheduler_profile_key: approximate-prefix-cache
+  runtime.scheduler_profile_key: approximate
   runtime.smoke_request_key: default
   runtime.benchmark_key: null

+smoke-precise:
+  extends:
+    - smoke
+  runtime.selected_preset: smoke-precise
+  runtime.scheduler_profile_key: precise
+
+smoke-default-scheduler:
+  extends:
+    - smoke
+  runtime.selected_preset: smoke-default-scheduler
+  runtime.scheduler_profile_key: default
+
 benchmark-short:
   runtime.selected_preset: benchmark-short
   runtime.model_key: llama-3-1-8b-instruct-fp8
-  runtime.scheduler_profile_key: approximate-prefix-cache
+  runtime.scheduler_profile_key: approximate
   runtime.smoke_request_key: default
   runtime.benchmark_key: short
diff --git a/projects/llm_d/orchestration/scheduler_profiles/approximate-prefix-cache.yaml b/projects/llm_d/orchestration/scheduler_profiles/approximate.yaml
similarity index 100%
rename from projects/llm_d/orchestration/scheduler_profiles/approximate-prefix-cache.yaml
rename to projects/llm_d/orchestration/scheduler_profiles/approximate.yaml
diff --git a/projects/llm_d/orchestration/scheduler_profiles/precise.yaml b/projects/llm_d/orchestration/scheduler_profiles/precise.yaml
new file mode 100644
index 00000000..707e5e0c
--- /dev/null
+++ b/projects/llm_d/orchestration/scheduler_profiles/precise.yaml
@@ -0,0 +1,26 @@
+apiVersion: inference.networking.x-k8s.io/v1alpha1
+kind: EndpointPickerConfig
+plugins:
+  - type: single-profile-handler
+  - type: precise-prefix-cache-scorer
+    parameters:
+      indexerConfig:
+        tokenProcessorConfig:
+          blockSize: 64
+          hashSeed: "42"
+        tokenizersPoolConfig:
+          hf:
+            tokenizersCacheDir: /tmp/tokenizers
+  - type: kv-cache-utilization-scorer
+  - type: queue-scorer
+  - type: max-score-picker
+schedulingProfiles:
+  - name: default
+    plugins:
+      - pluginRef: precise-prefix-cache-scorer
+        weight: 3.0
+      - pluginRef: kv-cache-utilization-scorer
+        weight: 2.0
+      - pluginRef: queue-scorer
+        weight: 2.0
+      - pluginRef: max-score-picker
diff --git a/tests/llm_d/test_runtime.py b/tests/llm_d/test_runtime.py
index 4c0c2a34..bc19284a 100644
--- a/tests/llm_d/test_runtime.py
+++ b/tests/llm_d/test_runtime.py
@@ -107,10 +107,51 @@ def test_render_inference_service_injects_model_and_scheduler_profile(
     assert manifest["spec"]["model"]["name"] == "Qwen/Qwen3-0.6B"
     assert manifest["spec"]["model"]["uri"] == cache_spec.model_uri
     assert manifest["spec"]["model"]["name"] == config.model["served_model_name"]
-    assert config.scheduler_profile_key == "approximate-prefix-cache"
+    assert config.scheduler_profile_key == "approximate"
     router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0]["args"]
     assert router_args[-2] == "--config-text"
     assert "EndpointPickerConfig" in router_args[-1]
+    assert "prefix-cache-scorer" in router_args[-1]
+
+
+def test_render_inference_service_supports_precise_scheduler_profile(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}")
+    artifact_dir = tmp_path / "artifacts"
+    artifact_dir.mkdir()
+    (tmp_path / "fournos_config.yaml").write_text(
+        "preset: smoke-precise\njob-name: llm-d-precise\n",
+        encoding="utf-8",
+    )
+
+    config = 
llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + manifest = llmd_runtime.render_inference_service(config) + + assert config.scheduler_profile_key == "precise" + router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0]["args"] + assert router_args[-2] == "--config-text" + assert "precise-prefix-cache-scorer" in router_args[-1] + assert "tokenizersCacheDir" in router_args[-1] + + +def test_render_inference_service_supports_default_scheduler( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + (tmp_path / "fournos_config.yaml").write_text( + "preset: smoke-default-scheduler\njob-name: llm-d-default\n", + encoding="utf-8", + ) + + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + manifest = llmd_runtime.render_inference_service(config) + + assert config.scheduler_profile_key == "default" + assert config.scheduler_profile is None + assert manifest["spec"]["router"]["scheduler"] == {} def test_resolve_model_cache_for_hf_model(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: From 332adc33756d05937f220746a995751570f9ffc9 Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Sun, 3 May 2026 10:14:06 +0100 Subject: [PATCH 11/21] refactor: Split llm_d runtime helpers --- projects/llm_d/orchestration/llmd_runtime.py | 769 +++--------------- .../llm_d/orchestration/runtime_config.py | 352 ++++++++ .../llm_d/orchestration/runtime_manifests.py | 327 ++++++++ 3 files changed, 775 insertions(+), 673 deletions(-) create mode 100644 projects/llm_d/orchestration/runtime_config.py create mode 100644 projects/llm_d/orchestration/runtime_manifests.py diff --git a/projects/llm_d/orchestration/llmd_runtime.py b/projects/llm_d/orchestration/llmd_runtime.py index c35b5a31..59b054e6 100644 --- a/projects/llm_d/orchestration/llmd_runtime.py +++ b/projects/llm_d/orchestration/llmd_runtime.py @@ -1,359 +1,114 @@ from __future__ import annotations -import copy -import hashlib import json import logging -import os import re import shlex import subprocess import time from collections.abc import Iterable -from dataclasses import dataclass -from pathlib import Path from typing import Any import yaml -from projects.core.library import config, env, run +from projects.llm_d.orchestration.runtime_config import ( + CONFIG_DIR, + ORCHESTRATION_DIR, + ModelCacheSpec, + ResolvedConfig, + apply_requested_preset, + derive_namespace, + ensure_artifact_directories, + init, + load_fournos_config, + load_run_configuration, + load_yaml, + normalize_gpu_count, + parse_overrides, + resolve_model_cache, + slugify_identifier, + truncate_k8s_name, + version_tuple, + write_json, + write_text, + write_yaml, +) +from projects.llm_d.orchestration.runtime_manifests import ( + load_manifest_template, + render_datasciencecluster, + render_gateway, + render_guidellm_copy_pod, + render_guidellm_job, + render_guidellm_pvc, + render_inference_service, + render_model_cache_pvc, + render_smoke_request_job, +) LOGGER = logging.getLogger(__name__) -ORCHESTRATION_DIR = env.FORGE_HOME / "projects" / "llm_d" / "orchestration" -CONFIG_DIR = ORCHESTRATION_DIR + +__all__ = [ + "CONFIG_DIR", + "ORCHESTRATION_DIR", + "CommandError", + "ModelCacheSpec", + "ResolvedConfig", + "annotate_model_cache_pvc", + "apply_manifest", + "apply_requested_preset", + "condition_status", + "derive_namespace", + "desired_subscription", + 
"ensure_artifact_directories", + "ensure_namespace", + "ensure_operator_group", + "ensure_subscription", + "init", + "job_pod_names", + "load_fournos_config", + "load_manifest_template", + "load_run_configuration", + "load_yaml", + "model_cache_pvc_ready", + "normalize_gpu_count", + "oc", + "oc_get_json", + "operator_spec_by_package", + "parse_overrides", + "pvc_access_mode_matches", + "render_datasciencecluster", + "render_gateway", + "render_guidellm_copy_pod", + "render_guidellm_job", + "render_guidellm_pvc", + "render_inference_service", + "render_model_cache_job", + "render_model_cache_pvc", + "render_smoke_request_job", + "resource_exists", + "resolve_default_serviceaccount_image_pull_secret", + "resolve_model_cache", + "run_command", + "slugify_identifier", + "subscription_spec_matches", + "truncate_k8s_name", + "version_tuple", + "wait_for_crd", + "wait_for_job_completion", + "wait_for_namespace_deleted", + "wait_for_operator_csv", + "wait_for_pvc_bound", + "wait_until", + "write_json", + "write_text", + "write_yaml", +] class CommandError(RuntimeError): """Raised when an external command exits unsuccessfully.""" -@dataclass(frozen=True) -class ResolvedConfig: - artifact_dir: Path - project_root: Path - config_dir: Path - preset_name: str - preset_alias: str | None - job_name: str - namespace: str - namespace_is_managed: bool - gpu_count: int | None - platform: dict[str, Any] - model_key: str - model: dict[str, Any] - scheduler_profile_key: str - scheduler_profile: dict[str, Any] | None - model_cache: dict[str, Any] - smoke_request: dict[str, Any] - benchmark: dict[str, Any] | None - fournos_config: dict[str, Any] - overrides: dict[str, Any] - - @property - def manifests_dir(self) -> Path: - return self.config_dir / "manifests" - - -@dataclass(frozen=True) -class ModelCacheSpec: - source_uri: str - source_scheme: str - cache_key: str - namespace: str - pvc_name: str - pvc_size: str - access_mode: str - storage_class_name: str | None - model_path: str - model_uri: str - marker_filename: str - download_job_name: str - hf_token_secret_name: str | None - hf_token_secret_key: str | None - oci_image_path: str | None - oci_registry_auth_secret_name: str | None - oci_registry_auth_secret_key: str | None - - @property - def marker_path(self) -> str: - return f"/cache/{self.model_path}/{self.marker_filename}" - - -def init() -> Path: - if not logging.getLogger().handlers: - logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") - - env.init() - run.init() - ensure_artifact_directories(env.ARTIFACT_DIR) - return env.ARTIFACT_DIR - - -def ensure_artifact_directories(artifact_dir: Path) -> None: - for relative in ("src", "artifacts", "artifacts/results"): - (artifact_dir / relative).mkdir(parents=True, exist_ok=True) - - -def load_run_configuration( - *, cwd: Path | None = None, artifact_dir: Path | None = None -) -> ResolvedConfig: - cwd = cwd or Path.cwd() - if artifact_dir is not None: - os.environ["ARTIFACT_DIR"] = str(artifact_dir) - artifact_dir = init() - _reinitialize_project_config() - - platform_data = copy.deepcopy(config.project.get_config("platform")) - model_cache = copy.deepcopy(config.project.get_config("model_cache")) - fournos_config = load_fournos_config(cwd) - overrides = parse_overrides( - os.environ.get("FORGE_CONFIG_OVERRIDES", ""), - allowed_keys=config.project.get_config("runtime.allowed_override_keys", []), - ) - - requested_preset = ( - fournos_config.get("preset") - or os.environ.get("FORGE_PRESET") - or 
config.project.get_config("runtime.default_preset") - ) - apply_requested_preset(requested_preset) - - preset_name = config.project.get_config("runtime.selected_preset") - preset_alias = requested_preset if requested_preset != preset_name else None - - model_name = config.project.get_config("runtime.model_key") - model = copy.deepcopy(config.project.get_config(f"models.{model_name}")) - - scheduler_profile_key = config.project.get_config("runtime.scheduler_profile_key") - scheduler_profile = None - if scheduler_profile_key != "default": - scheduler_profile = copy.deepcopy( - config.project.get_config(f"scheduler_profiles.{scheduler_profile_key}") - ) - - smoke_request_name = config.project.get_config("runtime.smoke_request_key") - smoke_request = copy.deepcopy( - config.project.get_config(f"workloads.smoke_requests.{smoke_request_name}") - ) - - benchmark_name = config.project.get_config("runtime.benchmark_key", None) - benchmark = None - if benchmark_name: - benchmark = copy.deepcopy( - config.project.get_config(f"workloads.benchmarks.{benchmark_name}") - ) - - job_name = fournos_config.get("job-name") or os.environ.get("FORGE_JOB_NAME") - if not job_name: - job_name = f"local-{preset_name}" - - namespace_override = overrides.get("namespace") or fournos_config.get("namespace") - default_namespace = platform_data["cluster"].get("namespace_name") - namespace = ( - namespace_override - or default_namespace - or derive_namespace( - job_name, - platform_data["cluster"]["namespace_prefix"], - platform_data["cluster"]["namespace_max_length"], - ) - ) - - gpu_count = normalize_gpu_count(fournos_config.get("gpu-count")) - - return ResolvedConfig( - artifact_dir=Path(artifact_dir), - project_root=env.FORGE_HOME, - config_dir=ORCHESTRATION_DIR, - preset_name=preset_name, - preset_alias=preset_alias, - job_name=job_name, - namespace=namespace, - namespace_is_managed=namespace_override is None and default_namespace is None, - gpu_count=gpu_count, - platform=platform_data, - model_key=model_name, - model=model, - scheduler_profile_key=scheduler_profile_key, - scheduler_profile=scheduler_profile, - model_cache=model_cache, - smoke_request=smoke_request, - benchmark=benchmark, - fournos_config=fournos_config, - overrides=overrides, - ) - - -def _reinitialize_project_config() -> None: - config.project = None - artifact_config = env.ARTIFACT_DIR / "config.yaml" - if artifact_config.exists(): - artifact_config.unlink() - - presets_applied = env.ARTIFACT_DIR / "presets_applied" - if presets_applied.exists(): - presets_applied.unlink() - - config.init(ORCHESTRATION_DIR) - - -def apply_requested_preset(requested_preset: str) -> None: - if not config.project.get_preset(requested_preset): - raise ValueError(f"Unknown llm_d preset: {requested_preset}") - - config.project.apply_preset(requested_preset) - - -def load_fournos_config(cwd: Path) -> dict[str, Any]: - config_path = cwd / "fournos_config.yaml" - if not config_path.exists(): - return {} - - data = load_yaml(config_path) - if data is None: - return {} - if not isinstance(data, dict): - raise ValueError(f"Unexpected FOURNOS config type in {config_path}: {type(data)}") - return data - - -def parse_overrides(raw: str, *, allowed_keys: Iterable[str]) -> dict[str, Any]: - if not raw or raw.strip() in {"", "null", "{}"}: - return {} - - try: - data = json.loads(raw) - except json.JSONDecodeError as exc: - raise ValueError(f"FORGE_CONFIG_OVERRIDES is not valid JSON: {exc}") from exc - - if not isinstance(data, dict): - raise ValueError("FORGE_CONFIG_OVERRIDES 
must decode to a JSON object") - - allowed_keys = frozenset(allowed_keys) - unsupported = sorted(set(data) - allowed_keys) - if unsupported: - raise ValueError( - "Unsupported llm_d override keys: " - f"{', '.join(unsupported)}. Allowed keys: {', '.join(sorted(allowed_keys))}" - ) - - return data - - -def normalize_gpu_count(value: Any) -> int | None: - if value in (None, ""): - return None - try: - return int(value) - except (TypeError, ValueError): - LOGGER.warning("Ignoring invalid gpu-count value: %s", value) - return None - - -def derive_namespace(job_name: str, prefix: str, max_length: int) -> str: - slug = re.sub(r"[^a-z0-9-]+", "-", job_name.lower()) - slug = re.sub(r"-{2,}", "-", slug).strip("-") - if not slug: - slug = "run" - - if slug.startswith(f"{prefix}-"): - namespace = slug - else: - namespace = f"{prefix}-{slug}" - - namespace = namespace[:max_length].rstrip("-") - if not namespace: - raise ValueError(f"Could not derive a valid namespace from job name: {job_name}") - return namespace - - -def slugify_identifier(value: str, *, max_length: int = 63) -> str: - slug = re.sub(r"[^a-z0-9-]+", "-", value.lower()) - slug = re.sub(r"-{2,}", "-", slug).strip("-") - return slug[:max_length].rstrip("-") or "item" - - -def truncate_k8s_name(value: str, *, max_length: int = 63) -> str: - return value[:max_length].rstrip("-") - - -def resolve_model_cache(config: ResolvedConfig) -> ModelCacheSpec | None: - if not config.model_cache.get("enabled", False): - return None - - source_uri = config.model["uri"] - if source_uri.startswith(("pvc://", "pvc+hf://")): - return None - - if source_uri.startswith("hf://"): - source_scheme = "hf" - elif source_uri.startswith("oci://"): - source_scheme = "oci" - else: - raise ValueError(f"Unsupported model cache source URI for {config.model_key}: {source_uri}") - - model_cache_overrides = config.model.get("cache", {}) - pvc_defaults = config.model_cache["pvc"] - pvc_prefix = config.model_cache["pvc"]["name_prefix"] - cache_key = hashlib.sha256(source_uri.encode("utf-8")).hexdigest()[:10] - pvc_name = truncate_k8s_name( - f"{pvc_prefix}-{slugify_identifier(config.model_key, max_length=32)}-{cache_key}" - ) - model_path = pvc_defaults["model_directory_name"] - - return ModelCacheSpec( - source_uri=source_uri, - source_scheme=source_scheme, - cache_key=cache_key, - namespace=config.namespace, - pvc_name=pvc_name, - pvc_size=model_cache_overrides.get("pvc_size", pvc_defaults["size"]), - access_mode=model_cache_overrides.get("access_mode", pvc_defaults["access_mode"]), - storage_class_name=model_cache_overrides.get( - "storage_class_name", pvc_defaults.get("storage_class_name") - ), - model_path=model_path, - model_uri=f"pvc://{pvc_name}/{model_path}", - marker_filename=config.model_cache["marker_filename"], - download_job_name=truncate_k8s_name(f"{pvc_name}-download"), - hf_token_secret_name=model_cache_overrides.get( - "hf_token_secret_name", config.model_cache["hf"].get("token_secret_name") - ), - hf_token_secret_key=config.model_cache["hf"].get("token_secret_key"), - oci_image_path=model_cache_overrides.get( - "oci_image_path", config.model_cache["oci"].get("image_path") - ), - oci_registry_auth_secret_name=model_cache_overrides.get( - "oci_registry_auth_secret_name", - config.model_cache["oci"].get("registry_auth_secret_name"), - ), - oci_registry_auth_secret_key=config.model_cache["oci"].get("registry_auth_secret_key"), - ) - - -def load_yaml(path: Path) -> Any: - with path.open(encoding="utf-8") as handle: - return yaml.safe_load(handle) - - -def 
write_yaml(path: Path, payload: Any) -> None: - path.parent.mkdir(parents=True, exist_ok=True) - with path.open("w", encoding="utf-8") as handle: - yaml.safe_dump(payload, handle, sort_keys=False) - - -def write_json(path: Path, payload: Any) -> None: - path.parent.mkdir(parents=True, exist_ok=True) - with path.open("w", encoding="utf-8") as handle: - json.dump(payload, handle, indent=2, sort_keys=True) - handle.write("\n") - - -def write_text(path: Path, content: str) -> None: - path.parent.mkdir(parents=True, exist_ok=True) - path.write_text(content, encoding="utf-8") - - def run_command( args: Iterable[str], *, @@ -412,7 +167,7 @@ def oc( ) -def apply_manifest(artifact_path: Path, manifest: dict[str, Any]) -> None: +def apply_manifest(artifact_path: Any, manifest: dict[str, Any]) -> None: write_yaml(artifact_path, manifest) oc("apply", "-f", str(artifact_path)) @@ -501,14 +256,11 @@ def wait_until( def wait_for_namespace_deleted(namespace: str, timeout_seconds: int) -> None: - def _namespace_gone() -> bool: - return not resource_exists("namespace", namespace) - wait_until( f"namespace/{namespace} deletion", timeout_seconds=timeout_seconds, interval_seconds=10, - predicate=_namespace_gone, + predicate=lambda: not resource_exists("namespace", namespace), ) @@ -549,8 +301,7 @@ def ensure_namespace(namespace: str, *, labels: dict[str, str] | None = None) -> oc("create", "namespace", namespace) if labels: - label_args = [f"{key}={value}" for key, value in labels.items()] - oc("label", "namespace", namespace, "--overwrite", *label_args) + oc("label", "namespace", namespace, "--overwrite", *[f"{k}={v}" for k, v in labels.items()]) def ensure_operator_group(namespace: str, package: str) -> None: @@ -639,18 +390,8 @@ def operator_spec_by_package(platform: dict[str, Any], package: str) -> dict[str raise KeyError(f"Unknown operator package in llm_d platform config: {package}") -def load_manifest_template(config: ResolvedConfig, relative_path: str) -> dict[str, Any]: - return load_yaml(config.config_dir / relative_path) - - -def version_tuple(value: str) -> tuple[int, ...]: - numbers = re.findall(r"\d+", value) - return tuple(int(number) for number in numbers[:3]) - - def condition_status(resource: dict[str, Any], condition_type: str) -> str | None: - conditions = resource.get("status", {}).get("conditions", []) - for condition in conditions: + for condition in resource.get("status", {}).get("conditions", []): if condition.get("type") == condition_type: return condition.get("status") return None @@ -741,51 +482,6 @@ def resolve_default_serviceaccount_image_pull_secret(namespace: str) -> str | No return None -def render_datasciencecluster(config: ResolvedConfig) -> dict[str, Any]: - template_path = config.config_dir / config.platform["rhoai"]["datasciencecluster_template"] - manifest = load_yaml(template_path) - manifest["metadata"]["name"] = config.platform["rhoai"]["datasciencecluster_name"] - manifest["metadata"]["namespace"] = config.platform["rhoai"]["namespace"] - return manifest - - -def render_gateway(config: ResolvedConfig) -> dict[str, Any]: - template_path = config.config_dir / config.platform["gateway"]["manifest_template"] - manifest = load_yaml(template_path) - manifest["metadata"]["name"] = config.platform["gateway"]["name"] - manifest["metadata"]["namespace"] = config.platform["gateway"]["namespace"] - manifest["spec"]["gatewayClassName"] = config.platform["gateway"]["gateway_class_name"] - return manifest - - -def render_model_cache_pvc(spec: ModelCacheSpec) -> dict[str, Any]: - 
manifest: dict[str, Any] = { - "apiVersion": "v1", - "kind": "PersistentVolumeClaim", - "metadata": { - "name": spec.pvc_name, - "namespace": spec.namespace, - "labels": { - "app.kubernetes.io/managed-by": "forge", - "forge.openshift.io/project": "llm_d", - "forge.openshift.io/model-cache": "true", - "forge.openshift.io/preserve": "true", - }, - "annotations": { - "forge.openshift.io/model-cache-key": spec.cache_key, - "forge.openshift.io/model-source-uri": spec.source_uri, - }, - }, - "spec": { - "accessModes": [spec.access_mode], - "resources": {"requests": {"storage": spec.pvc_size}}, - }, - } - if spec.storage_class_name: - manifest["spec"]["storageClassName"] = spec.storage_class_name - return manifest - - def render_model_cache_job(config: ResolvedConfig, spec: ModelCacheSpec) -> dict[str, Any]: common_env = [ {"name": "MODEL_SOURCE", "value": spec.source_uri}, @@ -797,7 +493,6 @@ def render_model_cache_job(config: ResolvedConfig, spec: ModelCacheSpec) -> dict {"name": "cache", "persistentVolumeClaim": {"claimName": spec.pvc_name}} ] - container: dict[str, Any] if spec.source_scheme == "hf": command = """ set -euo pipefail @@ -828,10 +523,7 @@ def render_model_cache_job(config: ResolvedConfig, spec: ModelCacheSpec) -> dict volume_mounts = [{"name": "cache", "mountPath": "/cache"}] if spec.hf_token_secret_name: volumes.append( - { - "name": "hf-token", - "secret": {"secretName": spec.hf_token_secret_name}, - } + {"name": "hf-token", "secret": {"secretName": spec.hf_token_secret_name}} ) volume_mounts.append( { @@ -880,10 +572,7 @@ def render_model_cache_job(config: ResolvedConfig, spec: ModelCacheSpec) -> dict common_env.append({"name": "OCI_IMAGE_PATH", "value": spec.oci_image_path or "/"}) if registry_auth_secret_name: volumes.append( - { - "name": "registry-auth", - "secret": {"secretName": registry_auth_secret_name}, - } + {"name": "registry-auth", "secret": {"secretName": registry_auth_secret_name}} ) volume_mounts.append( { @@ -974,269 +663,3 @@ def annotate_model_cache_pvc(spec: ModelCacheSpec) -> None: f"forge.openshift.io/model-source-uri={spec.source_uri}", f"forge.openshift.io/model-uri={spec.model_uri}", ) - - -def render_inference_service(config: ResolvedConfig) -> dict[str, Any]: - template_path = config.config_dir / config.platform["inference_service"]["template"] - manifest = load_yaml(template_path) - - name = config.platform["inference_service"]["name"] - manifest["metadata"]["name"] = name - manifest["metadata"]["namespace"] = config.namespace - manifest["metadata"].setdefault("labels", {}) - manifest["metadata"]["labels"].update( - { - "app.kubernetes.io/managed-by": "forge", - "forge.openshift.io/project": "llm_d", - } - ) - - cache_spec = resolve_model_cache(config) - manifest["spec"]["model"]["uri"] = cache_spec.model_uri if cache_spec else config.model["uri"] - manifest["spec"]["model"]["name"] = config.model["served_model_name"] - manifest["spec"]["template"]["containers"][0]["resources"] = copy.deepcopy( - config.model["resources"] - ) - - if config.scheduler_profile_key == "default": - manifest["spec"]["router"]["scheduler"] = {} - return manifest - - if config.scheduler_profile is None: - raise ValueError(f"Missing scheduler profile config for {config.scheduler_profile_key}") - - scheduler_profile_path = config.config_dir / config.scheduler_profile["config_path"] - scheduler_profile_config = scheduler_profile_path.read_text(encoding="utf-8") - router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0]["args"] - if not router_args or 
router_args[-1] != "--config-text": - raise ValueError("Expected llm-d router args to end with --config-text") - router_args.append(scheduler_profile_config) - - return manifest - - -def render_smoke_request_job( - config: ResolvedConfig, endpoint_url: str, payload: dict[str, Any] -) -> dict[str, Any]: - smoke = config.platform["smoke"] - command = """ -set -eu -attempt=1 -while [ "${attempt}" -le "${REQUEST_RETRIES}" ]; do - if curl -k -sSf --max-time "${REQUEST_TIMEOUT_SECONDS}" \ - "${ENDPOINT_URL}${ENDPOINT_PATH}" \ - -H "Content-Type: application/json" \ - -d "${REQUEST_PAYLOAD}" \ - -o /tmp/smoke-response.json \ - 2>/tmp/smoke-error.log; then - cat /tmp/smoke-response.json - exit 0 - fi - attempt=$((attempt + 1)) - sleep "${REQUEST_RETRY_DELAY_SECONDS}" -done -cat /tmp/smoke-error.log >&2 || true -exit 1 -""" - - return { - "apiVersion": "batch/v1", - "kind": "Job", - "metadata": { - "name": smoke["job_name"], - "namespace": config.namespace, - "labels": { - "app.kubernetes.io/managed-by": "forge", - "forge.openshift.io/project": "llm_d", - "forge.openshift.io/component": "smoke", - }, - }, - "spec": { - "backoffLimit": 0, - "activeDeadlineSeconds": ( - smoke["request_retries"] - * (smoke["request_timeout_seconds"] + smoke["request_retry_delay_seconds"]) - ), - "template": { - "metadata": { - "labels": { - "app.kubernetes.io/managed-by": "forge", - "forge.openshift.io/project": "llm_d", - "forge.openshift.io/component": "smoke", - } - }, - "spec": { - "restartPolicy": "Never", - "containers": [ - { - "name": "smoke", - "image": smoke["client_image"], - "command": ["/bin/sh", "-ceu", command], - "env": [ - {"name": "ENDPOINT_URL", "value": endpoint_url}, - {"name": "ENDPOINT_PATH", "value": smoke["endpoint_path"]}, - {"name": "REQUEST_PAYLOAD", "value": json.dumps(payload)}, - {"name": "REQUEST_RETRIES", "value": str(smoke["request_retries"])}, - { - "name": "REQUEST_RETRY_DELAY_SECONDS", - "value": str(smoke["request_retry_delay_seconds"]), - }, - { - "name": "REQUEST_TIMEOUT_SECONDS", - "value": str(smoke["request_timeout_seconds"]), - }, - ], - } - ], - }, - }, - }, - } - - -def render_guidellm_pvc(config: ResolvedConfig) -> dict[str, Any]: - if not config.benchmark: - raise ValueError("Benchmark configuration is not enabled for this preset") - - return { - "apiVersion": "v1", - "kind": "PersistentVolumeClaim", - "metadata": { - "name": config.benchmark["job_name"], - "namespace": config.namespace, - "labels": { - "app.kubernetes.io/managed-by": "forge", - "forge.openshift.io/project": "llm_d", - }, - }, - "spec": { - "accessModes": ["ReadWriteOnce"], - "resources": {"requests": {"storage": config.benchmark["pvc_size"]}}, - }, - } - - -def render_guidellm_job(config: ResolvedConfig, endpoint_url: str) -> dict[str, Any]: - if not config.benchmark: - raise ValueError("Benchmark configuration is not enabled for this preset") - - args = [ - "benchmark", - "run", - f"--target={endpoint_url}", - f"--rate={config.benchmark['rate']}", - ] - for key, value in config.benchmark["args"].items(): - if value is None: - continue - args.append(f"--{key.replace('_', '-')}={value}") - args.append("--outputs=json") - - return { - "apiVersion": "batch/v1", - "kind": "Job", - "metadata": { - "name": config.benchmark["job_name"], - "namespace": config.namespace, - "labels": { - "app.kubernetes.io/managed-by": "forge", - "forge.openshift.io/project": "llm_d", - }, - }, - "spec": { - "backoffLimit": 0, - "template": { - "metadata": { - "labels": { - "app.kubernetes.io/managed-by": "forge", - 
"forge.openshift.io/project": "llm_d", - } - }, - "spec": { - "serviceAccountName": "default", - "restartPolicy": "Never", - "containers": [ - { - "name": "guidellm", - "image": config.benchmark["image"], - "command": ["/opt/app-root/bin/guidellm"], - "args": args, - "env": [{"name": "USER", "value": "guidellm"}], - "volumeMounts": [ - {"name": "home", "mountPath": "/home/guidellm"}, - {"name": "results", "mountPath": "/results"}, - ], - } - ], - "volumes": [ - {"name": "home", "emptyDir": {}}, - { - "name": "results", - "persistentVolumeClaim": {"claimName": config.benchmark["job_name"]}, - }, - ], - }, - }, - }, - } - - -def render_guidellm_copy_pod( - config: ResolvedConfig, node_name: str | None = None -) -> dict[str, Any]: - if not config.benchmark: - raise ValueError("Benchmark configuration is not enabled for this preset") - - pod = { - "apiVersion": "v1", - "kind": "Pod", - "metadata": { - "name": f"{config.benchmark['job_name']}-copy", - "namespace": config.namespace, - "labels": { - "app.kubernetes.io/managed-by": "forge", - "forge.openshift.io/project": "llm_d", - }, - }, - "spec": { - "restartPolicy": "Never", - "initContainers": [ - { - "name": "permission-fixer", - "image": config.benchmark["image"], - "command": [ - "/bin/sh", - "-c", - "chmod 755 /results && chown -R 1001:1001 /results || true", - ], - "securityContext": { - "runAsUser": 0, - "allowPrivilegeEscalation": True, - }, - "volumeMounts": [{"name": "results", "mountPath": "/results"}], - } - ], - "containers": [ - { - "name": "copy-helper", - "image": config.benchmark["image"], - "command": ["/bin/sleep", "300"], - "securityContext": { - "runAsUser": 1001, - "runAsNonRoot": True, - "allowPrivilegeEscalation": False, - }, - "volumeMounts": [{"name": "results", "mountPath": "/results"}], - } - ], - "volumes": [ - { - "name": "results", - "persistentVolumeClaim": {"claimName": config.benchmark["job_name"]}, - } - ], - }, - } - if node_name: - pod["spec"]["nodeName"] = node_name - return pod diff --git a/projects/llm_d/orchestration/runtime_config.py b/projects/llm_d/orchestration/runtime_config.py new file mode 100644 index 00000000..42b5fcb1 --- /dev/null +++ b/projects/llm_d/orchestration/runtime_config.py @@ -0,0 +1,352 @@ +from __future__ import annotations + +import copy +import hashlib +import json +import logging +import os +import re +from collections.abc import Iterable +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +import yaml + +from projects.core.library import config, env, run + +LOGGER = logging.getLogger(__name__) +ORCHESTRATION_DIR = env.FORGE_HOME / "projects" / "llm_d" / "orchestration" +CONFIG_DIR = ORCHESTRATION_DIR + + +@dataclass(frozen=True) +class ResolvedConfig: + artifact_dir: Path + project_root: Path + config_dir: Path + preset_name: str + preset_alias: str | None + job_name: str + namespace: str + namespace_is_managed: bool + gpu_count: int | None + platform: dict[str, Any] + model_key: str + model: dict[str, Any] + scheduler_profile_key: str + scheduler_profile: dict[str, Any] | None + model_cache: dict[str, Any] + smoke_request: dict[str, Any] + benchmark: dict[str, Any] | None + fournos_config: dict[str, Any] + overrides: dict[str, Any] + + @property + def manifests_dir(self) -> Path: + return self.config_dir / "manifests" + + +@dataclass(frozen=True) +class ModelCacheSpec: + source_uri: str + source_scheme: str + cache_key: str + namespace: str + pvc_name: str + pvc_size: str + access_mode: str + storage_class_name: str | None + model_path: str 
+ model_uri: str + marker_filename: str + download_job_name: str + hf_token_secret_name: str | None + hf_token_secret_key: str | None + oci_image_path: str | None + oci_registry_auth_secret_name: str | None + oci_registry_auth_secret_key: str | None + + @property + def marker_path(self) -> str: + return f"/cache/{self.model_path}/{self.marker_filename}" + + +def init() -> Path: + if not logging.getLogger().handlers: + logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + + env.init() + run.init() + ensure_artifact_directories(env.ARTIFACT_DIR) + return env.ARTIFACT_DIR + + +def ensure_artifact_directories(artifact_dir: Path) -> None: + for relative in ("src", "artifacts", "artifacts/results"): + (artifact_dir / relative).mkdir(parents=True, exist_ok=True) + + +def load_run_configuration( + *, cwd: Path | None = None, artifact_dir: Path | None = None +) -> ResolvedConfig: + cwd = cwd or Path.cwd() + if artifact_dir is not None: + os.environ["ARTIFACT_DIR"] = str(artifact_dir) + artifact_dir = init() + _reinitialize_project_config() + + platform_data = copy.deepcopy(config.project.get_config("platform")) + model_cache = copy.deepcopy(config.project.get_config("model_cache")) + fournos_config = load_fournos_config(cwd) + overrides = parse_overrides( + os.environ.get("FORGE_CONFIG_OVERRIDES", ""), + allowed_keys=config.project.get_config("runtime.allowed_override_keys", []), + ) + + requested_preset = ( + fournos_config.get("preset") + or os.environ.get("FORGE_PRESET") + or config.project.get_config("runtime.default_preset") + ) + apply_requested_preset(requested_preset) + + preset_name = config.project.get_config("runtime.selected_preset") + preset_alias = requested_preset if requested_preset != preset_name else None + + model_name = config.project.get_config("runtime.model_key") + model = copy.deepcopy(config.project.get_config(f"models.{model_name}")) + + scheduler_profile_key = config.project.get_config("runtime.scheduler_profile_key") + scheduler_profile = None + if scheduler_profile_key != "default": + scheduler_profile = copy.deepcopy( + config.project.get_config(f"scheduler_profiles.{scheduler_profile_key}") + ) + + smoke_request_name = config.project.get_config("runtime.smoke_request_key") + smoke_request = copy.deepcopy( + config.project.get_config(f"workloads.smoke_requests.{smoke_request_name}") + ) + + benchmark_name = config.project.get_config("runtime.benchmark_key", None) + benchmark = None + if benchmark_name: + benchmark = copy.deepcopy( + config.project.get_config(f"workloads.benchmarks.{benchmark_name}") + ) + + job_name = fournos_config.get("job-name") or os.environ.get("FORGE_JOB_NAME") + if not job_name: + job_name = f"local-{preset_name}" + + namespace_override = overrides.get("namespace") or fournos_config.get("namespace") + default_namespace = platform_data["cluster"].get("namespace_name") + namespace = ( + namespace_override + or default_namespace + or derive_namespace( + job_name, + platform_data["cluster"]["namespace_prefix"], + platform_data["cluster"]["namespace_max_length"], + ) + ) + + gpu_count = normalize_gpu_count(fournos_config.get("gpu-count")) + + return ResolvedConfig( + artifact_dir=Path(artifact_dir), + project_root=env.FORGE_HOME, + config_dir=ORCHESTRATION_DIR, + preset_name=preset_name, + preset_alias=preset_alias, + job_name=job_name, + namespace=namespace, + namespace_is_managed=namespace_override is None and default_namespace is None, + gpu_count=gpu_count, + platform=platform_data, + model_key=model_name, + model=model, 
+ scheduler_profile_key=scheduler_profile_key, + scheduler_profile=scheduler_profile, + model_cache=model_cache, + smoke_request=smoke_request, + benchmark=benchmark, + fournos_config=fournos_config, + overrides=overrides, + ) + + +def _reinitialize_project_config() -> None: + config.project = None + artifact_config = env.ARTIFACT_DIR / "config.yaml" + if artifact_config.exists(): + artifact_config.unlink() + + presets_applied = env.ARTIFACT_DIR / "presets_applied" + if presets_applied.exists(): + presets_applied.unlink() + + config.init(ORCHESTRATION_DIR) + + +def apply_requested_preset(requested_preset: str) -> None: + if not config.project.get_preset(requested_preset): + raise ValueError(f"Unknown llm_d preset: {requested_preset}") + + config.project.apply_preset(requested_preset) + + +def load_fournos_config(cwd: Path) -> dict[str, Any]: + config_path = cwd / "fournos_config.yaml" + if not config_path.exists(): + return {} + + data = load_yaml(config_path) + if data is None: + return {} + if not isinstance(data, dict): + raise ValueError(f"Unexpected FOURNOS config type in {config_path}: {type(data)}") + return data + + +def parse_overrides(raw: str, *, allowed_keys: Iterable[str]) -> dict[str, Any]: + if not raw or raw.strip() in {"", "null", "{}"}: + return {} + + try: + data = json.loads(raw) + except json.JSONDecodeError as exc: + raise ValueError(f"FORGE_CONFIG_OVERRIDES is not valid JSON: {exc}") from exc + + if not isinstance(data, dict): + raise ValueError("FORGE_CONFIG_OVERRIDES must decode to a JSON object") + + allowed_keys = frozenset(allowed_keys) + unsupported = sorted(set(data) - allowed_keys) + if unsupported: + raise ValueError( + "Unsupported llm_d override keys: " + f"{', '.join(unsupported)}. Allowed keys: {', '.join(sorted(allowed_keys))}" + ) + + return data + + +def normalize_gpu_count(value: Any) -> int | None: + if value in (None, ""): + return None + try: + return int(value) + except (TypeError, ValueError): + LOGGER.warning("Ignoring invalid gpu-count value: %s", value) + return None + + +def derive_namespace(job_name: str, prefix: str, max_length: int) -> str: + slug = re.sub(r"[^a-z0-9-]+", "-", job_name.lower()) + slug = re.sub(r"-{2,}", "-", slug).strip("-") + if not slug: + slug = "run" + + if slug.startswith(f"{prefix}-"): + namespace = slug + else: + namespace = f"{prefix}-{slug}" + + namespace = namespace[:max_length].rstrip("-") + if not namespace: + raise ValueError(f"Could not derive a valid namespace from job name: {job_name}") + return namespace + + +def slugify_identifier(value: str, *, max_length: int = 63) -> str: + slug = re.sub(r"[^a-z0-9-]+", "-", value.lower()) + slug = re.sub(r"-{2,}", "-", slug).strip("-") + return slug[:max_length].rstrip("-") or "item" + + +def truncate_k8s_name(value: str, *, max_length: int = 63) -> str: + return value[:max_length].rstrip("-") + + +def version_tuple(value: str) -> tuple[int, ...]: + numbers = re.findall(r"\d+", value) + return tuple(int(number) for number in numbers[:3]) + + +def resolve_model_cache(config: ResolvedConfig) -> ModelCacheSpec | None: + if not config.model_cache.get("enabled", False): + return None + + source_uri = config.model["uri"] + if source_uri.startswith(("pvc://", "pvc+hf://")): + return None + + if source_uri.startswith("hf://"): + source_scheme = "hf" + elif source_uri.startswith("oci://"): + source_scheme = "oci" + else: + raise ValueError(f"Unsupported model cache source URI for {config.model_key}: {source_uri}") + + model_cache_overrides = config.model.get("cache", {}) + 
pvc_defaults = config.model_cache["pvc"] + pvc_prefix = config.model_cache["pvc"]["name_prefix"] + cache_key = hashlib.sha256(source_uri.encode("utf-8")).hexdigest()[:10] + pvc_name = truncate_k8s_name( + f"{pvc_prefix}-{slugify_identifier(config.model_key, max_length=32)}-{cache_key}" + ) + model_path = pvc_defaults["model_directory_name"] + + return ModelCacheSpec( + source_uri=source_uri, + source_scheme=source_scheme, + cache_key=cache_key, + namespace=config.namespace, + pvc_name=pvc_name, + pvc_size=model_cache_overrides.get("pvc_size", pvc_defaults["size"]), + access_mode=model_cache_overrides.get("access_mode", pvc_defaults["access_mode"]), + storage_class_name=model_cache_overrides.get( + "storage_class_name", pvc_defaults.get("storage_class_name") + ), + model_path=model_path, + model_uri=f"pvc://{pvc_name}/{model_path}", + marker_filename=config.model_cache["marker_filename"], + download_job_name=truncate_k8s_name(f"{pvc_name}-download"), + hf_token_secret_name=model_cache_overrides.get( + "hf_token_secret_name", config.model_cache["hf"].get("token_secret_name") + ), + hf_token_secret_key=config.model_cache["hf"].get("token_secret_key"), + oci_image_path=model_cache_overrides.get( + "oci_image_path", config.model_cache["oci"].get("image_path") + ), + oci_registry_auth_secret_name=model_cache_overrides.get( + "oci_registry_auth_secret_name", + config.model_cache["oci"].get("registry_auth_secret_name"), + ), + oci_registry_auth_secret_key=config.model_cache["oci"].get("registry_auth_secret_key"), + ) + + +def load_yaml(path: Path) -> Any: + with path.open(encoding="utf-8") as handle: + return yaml.safe_load(handle) + + +def write_yaml(path: Path, payload: Any) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding="utf-8") as handle: + yaml.safe_dump(payload, handle, sort_keys=False) + + +def write_json(path: Path, payload: Any) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding="utf-8") as handle: + json.dump(payload, handle, indent=2, sort_keys=True) + handle.write("\n") + + +def write_text(path: Path, content: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(content, encoding="utf-8") diff --git a/projects/llm_d/orchestration/runtime_manifests.py b/projects/llm_d/orchestration/runtime_manifests.py new file mode 100644 index 00000000..0c72a88e --- /dev/null +++ b/projects/llm_d/orchestration/runtime_manifests.py @@ -0,0 +1,327 @@ +from __future__ import annotations + +import copy +import json +from typing import Any + +from projects.llm_d.orchestration.runtime_config import ( + ModelCacheSpec, + ResolvedConfig, + load_yaml, + resolve_model_cache, +) + + +def load_manifest_template(config: ResolvedConfig, relative_path: str) -> dict[str, Any]: + return load_yaml(config.config_dir / relative_path) + + +def render_datasciencecluster(config: ResolvedConfig) -> dict[str, Any]: + template_path = config.config_dir / config.platform["rhoai"]["datasciencecluster_template"] + manifest = load_yaml(template_path) + manifest["metadata"]["name"] = config.platform["rhoai"]["datasciencecluster_name"] + manifest["metadata"]["namespace"] = config.platform["rhoai"]["namespace"] + return manifest + + +def render_gateway(config: ResolvedConfig) -> dict[str, Any]: + template_path = config.config_dir / config.platform["gateway"]["manifest_template"] + manifest = load_yaml(template_path) + manifest["metadata"]["name"] = config.platform["gateway"]["name"] + manifest["metadata"]["namespace"] = 
config.platform["gateway"]["namespace"] + manifest["spec"]["gatewayClassName"] = config.platform["gateway"]["gateway_class_name"] + return manifest + + +def render_model_cache_pvc(spec: ModelCacheSpec) -> dict[str, Any]: + manifest: dict[str, Any] = { + "apiVersion": "v1", + "kind": "PersistentVolumeClaim", + "metadata": { + "name": spec.pvc_name, + "namespace": spec.namespace, + "labels": { + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + "forge.openshift.io/model-cache": "true", + "forge.openshift.io/preserve": "true", + }, + "annotations": { + "forge.openshift.io/model-cache-key": spec.cache_key, + "forge.openshift.io/model-source-uri": spec.source_uri, + }, + }, + "spec": { + "accessModes": [spec.access_mode], + "resources": {"requests": {"storage": spec.pvc_size}}, + }, + } + if spec.storage_class_name: + manifest["spec"]["storageClassName"] = spec.storage_class_name + return manifest + + +def render_inference_service(config: ResolvedConfig) -> dict[str, Any]: + template_path = config.config_dir / config.platform["inference_service"]["template"] + manifest = load_yaml(template_path) + + name = config.platform["inference_service"]["name"] + manifest["metadata"]["name"] = name + manifest["metadata"]["namespace"] = config.namespace + manifest["metadata"].setdefault("labels", {}) + manifest["metadata"]["labels"].update( + { + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + } + ) + + cache_spec = resolve_model_cache(config) + manifest["spec"]["model"]["uri"] = cache_spec.model_uri if cache_spec else config.model["uri"] + manifest["spec"]["model"]["name"] = config.model["served_model_name"] + manifest["spec"]["template"]["containers"][0]["resources"] = copy.deepcopy( + config.model["resources"] + ) + + if config.scheduler_profile_key == "default": + manifest["spec"]["router"]["scheduler"] = {} + return manifest + + if config.scheduler_profile is None: + raise ValueError(f"Missing scheduler profile config for {config.scheduler_profile_key}") + + scheduler_profile_path = config.config_dir / config.scheduler_profile["config_path"] + scheduler_profile_config = scheduler_profile_path.read_text(encoding="utf-8") + router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0]["args"] + if not router_args or router_args[-1] != "--config-text": + raise ValueError("Expected llm-d router args to end with --config-text") + router_args.append(scheduler_profile_config) + + return manifest + + +def render_smoke_request_job( + config: ResolvedConfig, endpoint_url: str, payload: dict[str, Any] +) -> dict[str, Any]: + smoke = config.platform["smoke"] + command = """ +set -eu +attempt=1 +while [ "${attempt}" -le "${REQUEST_RETRIES}" ]; do + if curl -k -sSf --max-time "${REQUEST_TIMEOUT_SECONDS}" \ + "${ENDPOINT_URL}${ENDPOINT_PATH}" \ + -H "Content-Type: application/json" \ + -d "${REQUEST_PAYLOAD}" \ + -o /tmp/smoke-response.json \ + 2>/tmp/smoke-error.log; then + cat /tmp/smoke-response.json + exit 0 + fi + attempt=$((attempt + 1)) + sleep "${REQUEST_RETRY_DELAY_SECONDS}" +done +cat /tmp/smoke-error.log >&2 || true +exit 1 +""" + + return { + "apiVersion": "batch/v1", + "kind": "Job", + "metadata": { + "name": smoke["job_name"], + "namespace": config.namespace, + "labels": { + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + "forge.openshift.io/component": "smoke", + }, + }, + "spec": { + "backoffLimit": 0, + "activeDeadlineSeconds": ( + smoke["request_retries"] + * 
(smoke["request_timeout_seconds"] + smoke["request_retry_delay_seconds"]) + ), + "template": { + "metadata": { + "labels": { + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + "forge.openshift.io/component": "smoke", + } + }, + "spec": { + "restartPolicy": "Never", + "containers": [ + { + "name": "smoke", + "image": smoke["client_image"], + "command": ["/bin/sh", "-ceu", command], + "env": [ + {"name": "ENDPOINT_URL", "value": endpoint_url}, + {"name": "ENDPOINT_PATH", "value": smoke["endpoint_path"]}, + {"name": "REQUEST_PAYLOAD", "value": json.dumps(payload)}, + {"name": "REQUEST_RETRIES", "value": str(smoke["request_retries"])}, + { + "name": "REQUEST_RETRY_DELAY_SECONDS", + "value": str(smoke["request_retry_delay_seconds"]), + }, + { + "name": "REQUEST_TIMEOUT_SECONDS", + "value": str(smoke["request_timeout_seconds"]), + }, + ], + } + ], + }, + }, + }, + } + + +def render_guidellm_pvc(config: ResolvedConfig) -> dict[str, Any]: + if not config.benchmark: + raise ValueError("Benchmark configuration is not enabled for this preset") + + return { + "apiVersion": "v1", + "kind": "PersistentVolumeClaim", + "metadata": { + "name": config.benchmark["job_name"], + "namespace": config.namespace, + "labels": { + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + }, + }, + "spec": { + "accessModes": ["ReadWriteOnce"], + "resources": {"requests": {"storage": config.benchmark["pvc_size"]}}, + }, + } + + +def render_guidellm_job(config: ResolvedConfig, endpoint_url: str) -> dict[str, Any]: + if not config.benchmark: + raise ValueError("Benchmark configuration is not enabled for this preset") + + args = [ + "benchmark", + "run", + f"--target={endpoint_url}", + f"--rate={config.benchmark['rate']}", + ] + for key, value in config.benchmark["args"].items(): + if value is None: + continue + args.append(f"--{key.replace('_', '-')}={value}") + args.append("--outputs=json") + + return { + "apiVersion": "batch/v1", + "kind": "Job", + "metadata": { + "name": config.benchmark["job_name"], + "namespace": config.namespace, + "labels": { + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + }, + }, + "spec": { + "backoffLimit": 0, + "template": { + "metadata": { + "labels": { + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + } + }, + "spec": { + "serviceAccountName": "default", + "restartPolicy": "Never", + "containers": [ + { + "name": "guidellm", + "image": config.benchmark["image"], + "command": ["/opt/app-root/bin/guidellm"], + "args": args, + "env": [{"name": "USER", "value": "guidellm"}], + "volumeMounts": [ + {"name": "home", "mountPath": "/home/guidellm"}, + {"name": "results", "mountPath": "/results"}, + ], + } + ], + "volumes": [ + {"name": "home", "emptyDir": {}}, + { + "name": "results", + "persistentVolumeClaim": {"claimName": config.benchmark["job_name"]}, + }, + ], + }, + }, + }, + } + + +def render_guidellm_copy_pod( + config: ResolvedConfig, node_name: str | None = None +) -> dict[str, Any]: + if not config.benchmark: + raise ValueError("Benchmark configuration is not enabled for this preset") + + pod = { + "apiVersion": "v1", + "kind": "Pod", + "metadata": { + "name": f"{config.benchmark['job_name']}-copy", + "namespace": config.namespace, + "labels": { + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + }, + }, + "spec": { + "restartPolicy": "Never", + "initContainers": [ + { + "name": "permission-fixer", + "image": 
config.benchmark["image"], + "command": [ + "/bin/sh", + "-c", + "chmod 755 /results && chown -R 1001:1001 /results || true", + ], + "securityContext": { + "runAsUser": 0, + "allowPrivilegeEscalation": True, + }, + "volumeMounts": [{"name": "results", "mountPath": "/results"}], + } + ], + "containers": [ + { + "name": "copy-helper", + "image": config.benchmark["image"], + "command": ["/bin/sleep", "300"], + "securityContext": { + "runAsUser": 1001, + "runAsNonRoot": True, + "allowPrivilegeEscalation": False, + }, + "volumeMounts": [{"name": "results", "mountPath": "/results"}], + } + ], + "volumes": [ + { + "name": "results", + "persistentVolumeClaim": {"claimName": config.benchmark["job_name"]}, + } + ], + }, + } + if node_name: + pod["spec"]["nodeName"] = node_name + return pod From acc95e96fff0910036a981d94b52a0405695978d Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Sun, 3 May 2026 10:19:33 +0100 Subject: [PATCH 12/21] refactor: Add llm_d phase input boundary --- projects/llm_d/orchestration/ci.py | 36 +++- projects/llm_d/orchestration/cli.py | 38 +++- projects/llm_d/orchestration/phase_inputs.py | 207 +++++++++++++++++++ projects/llm_d/orchestration/prepare_llmd.py | 12 -- projects/llm_d/orchestration/test_llmd.py | 12 -- tests/llm_d/test_runtime.py | 141 ++++++++++++- 6 files changed, 408 insertions(+), 38 deletions(-) create mode 100644 projects/llm_d/orchestration/phase_inputs.py delete mode 100644 projects/llm_d/orchestration/prepare_llmd.py delete mode 100644 projects/llm_d/orchestration/test_llmd.py diff --git a/projects/llm_d/orchestration/ci.py b/projects/llm_d/orchestration/ci.py index 97073e6e..ed02e0b2 100644 --- a/projects/llm_d/orchestration/ci.py +++ b/projects/llm_d/orchestration/ci.py @@ -7,10 +7,34 @@ import types import click -import prepare_llmd -import test_llmd from projects.core.library import ci as ci_lib +from projects.llm_d.orchestration import llmd_runtime, phase_inputs +from projects.llm_d.toolbox.cleanup.main import run as cleanup_toolbox_run +from projects.llm_d.toolbox.prepare.main import run as prepare_toolbox_run +from projects.llm_d.toolbox.test.main import run as test_toolbox_run + + +def init_runtime() -> None: + llmd_runtime.init() + + +def run_prepare_phase() -> int: + config = llmd_runtime.load_run_configuration() + inputs_file = phase_inputs.write_prepare_inputs(config) + return prepare_toolbox_run(inputs_file=str(inputs_file)) + + +def run_test_phase() -> int: + config = llmd_runtime.load_run_configuration() + inputs_file = phase_inputs.write_test_inputs(config) + return test_toolbox_run(inputs_file=str(inputs_file)) + + +def run_cleanup_phase() -> int: + config = llmd_runtime.load_run_configuration() + inputs_file = phase_inputs.write_cleanup_inputs(config) + return cleanup_toolbox_run(inputs_file=str(inputs_file)) @click.group() @@ -19,7 +43,7 @@ def main(ctx): """LLM-D Project CI Operations for FORGE.""" ctx.ensure_object(types.SimpleNamespace) - test_llmd.init() + init_runtime() @main.command() @@ -27,7 +51,7 @@ def main(ctx): @ci_lib.safe_ci_command def prepare(ctx) -> int: """Prepare phase - Set up environment and dependencies.""" - return prepare_llmd.prepare() + return run_prepare_phase() @main.command() @@ -35,7 +59,7 @@ def prepare(ctx) -> int: @ci_lib.safe_ci_command def test(ctx) -> int: """Test phase - Execute the main testing logic.""" - return test_llmd.test() + return run_test_phase() @main.command() @@ -43,7 +67,7 @@ def test(ctx) -> int: @ci_lib.safe_ci_command def pre_cleanup(ctx) -> int: """Cleanup phase - Clean up 
resources and finalize.""" - return prepare_llmd.cleanup() + return run_cleanup_phase() if __name__ == "__main__": diff --git a/projects/llm_d/orchestration/cli.py b/projects/llm_d/orchestration/cli.py index ca87c653..02b2e549 100644 --- a/projects/llm_d/orchestration/cli.py +++ b/projects/llm_d/orchestration/cli.py @@ -4,20 +4,44 @@ import types import click -import prepare_llmd -import test_llmd from projects.core.library.cli import safe_cli_command +from projects.llm_d.orchestration import llmd_runtime, phase_inputs +from projects.llm_d.toolbox.cleanup.main import run as cleanup_toolbox_run +from projects.llm_d.toolbox.prepare.main import run as prepare_toolbox_run +from projects.llm_d.toolbox.test.main import run as test_toolbox_run logger = logging.getLogger(__name__) +def init_runtime() -> None: + llmd_runtime.init() + + +def run_prepare_phase() -> int: + config = llmd_runtime.load_run_configuration() + inputs_file = phase_inputs.write_prepare_inputs(config) + return prepare_toolbox_run(inputs_file=str(inputs_file)) + + +def run_test_phase() -> int: + config = llmd_runtime.load_run_configuration() + inputs_file = phase_inputs.write_test_inputs(config) + return test_toolbox_run(inputs_file=str(inputs_file)) + + +def run_cleanup_phase() -> int: + config = llmd_runtime.load_run_configuration() + inputs_file = phase_inputs.write_cleanup_inputs(config) + return cleanup_toolbox_run(inputs_file=str(inputs_file)) + + @click.group() @click.pass_context def main(ctx): """LLM-D Project CLI Operations for FORGE.""" ctx.ensure_object(types.SimpleNamespace) - test_llmd.init() + init_runtime() @main.command() @@ -25,7 +49,7 @@ def main(ctx): @safe_cli_command def prepare(ctx) -> int: """Prepare phase - Set up environment and dependencies.""" - return prepare_llmd.prepare() + return run_prepare_phase() @main.command() @@ -33,7 +57,7 @@ def prepare(ctx) -> int: @safe_cli_command def test(ctx) -> int: """Test phase - Execute the main testing logic.""" - return test_llmd.test() + return run_test_phase() @main.command() @@ -41,7 +65,7 @@ def test(ctx) -> int: @safe_cli_command def pre_cleanup(ctx) -> int: """Cleanup phase - Clean up resources and finalize.""" - return prepare_llmd.cleanup() + return run_cleanup_phase() @main.command() @@ -49,7 +73,7 @@ def pre_cleanup(ctx) -> int: @safe_cli_command def post_cleanup(ctx) -> int: """Cleanup phase - Clean up resources and finalize.""" - return prepare_llmd.cleanup() + return run_cleanup_phase() if __name__ == "__main__": diff --git a/projects/llm_d/orchestration/phase_inputs.py b/projects/llm_d/orchestration/phase_inputs.py new file mode 100644 index 00000000..8a195515 --- /dev/null +++ b/projects/llm_d/orchestration/phase_inputs.py @@ -0,0 +1,207 @@ +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +from projects.llm_d.orchestration.runtime_config import ResolvedConfig, load_yaml, write_yaml + + +@dataclass(frozen=True) +class CleanupInputs: + artifact_dir: Path + namespace: str + platform: dict[str, Any] + benchmark: dict[str, Any] | None + + +@dataclass(frozen=True) +class PrepareModelCacheInputs: + artifact_dir: Path + preset_name: str + namespace: str + namespace_is_managed: bool + model_key: str + model: dict[str, Any] + model_cache: dict[str, Any] + + +@dataclass(frozen=True) +class PrepareInputs: + artifact_dir: Path + config_dir: Path + preset_name: str + namespace: str + namespace_is_managed: bool + platform: dict[str, Any] + model_key: str + model: dict[str, Any] + 
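+    # model_cache is carried whole so the prepare toolbox can resolve the PVC spec without re-reading project config.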
model_cache: dict[str, Any] + benchmark: dict[str, Any] | None + + +@dataclass(frozen=True) +class TestInputs: + artifact_dir: Path + config_dir: Path + preset_name: str + namespace: str + platform: dict[str, Any] + model_key: str + model: dict[str, Any] + scheduler_profile_key: str + scheduler_profile: dict[str, Any] | None + model_cache: dict[str, Any] + smoke_request: dict[str, Any] + benchmark: dict[str, Any] | None + + +def write_cleanup_inputs(config: ResolvedConfig) -> Path: + path = config.artifact_dir / "_meta" / "cleanup.inputs.yaml" + write_yaml( + path, + { + "artifact_dir": str(config.artifact_dir), + "namespace": config.namespace, + "platform": config.platform, + "benchmark": config.benchmark, + }, + ) + return path + + +def write_prepare_model_cache_inputs(config: ResolvedConfig) -> Path: + path = config.artifact_dir / "_meta" / "prepare_model_cache.inputs.yaml" + write_yaml( + path, + { + "artifact_dir": str(config.artifact_dir), + "preset_name": config.preset_name, + "namespace": config.namespace, + "namespace_is_managed": config.namespace_is_managed, + "model_key": config.model_key, + "model": config.model, + "model_cache": config.model_cache, + }, + ) + return path + + +def write_prepare_inputs(config: ResolvedConfig) -> Path: + path = config.artifact_dir / "_meta" / "prepare.inputs.yaml" + write_yaml( + path, + { + "artifact_dir": str(config.artifact_dir), + "config_dir": str(config.config_dir), + "preset_name": config.preset_name, + "namespace": config.namespace, + "namespace_is_managed": config.namespace_is_managed, + "platform": config.platform, + "model_key": config.model_key, + "model": config.model, + "model_cache": config.model_cache, + "benchmark": config.benchmark, + }, + ) + return path + + +def write_test_inputs(config: ResolvedConfig) -> Path: + path = config.artifact_dir / "_meta" / "test.inputs.yaml" + write_yaml( + path, + { + "artifact_dir": str(config.artifact_dir), + "config_dir": str(config.config_dir), + "preset_name": config.preset_name, + "namespace": config.namespace, + "platform": config.platform, + "model_key": config.model_key, + "model": config.model, + "scheduler_profile_key": config.scheduler_profile_key, + "scheduler_profile": config.scheduler_profile, + "model_cache": config.model_cache, + "smoke_request": config.smoke_request, + "benchmark": config.benchmark, + }, + ) + return path + + +def load_cleanup_inputs(path: str | Path) -> CleanupInputs: + payload = load_yaml(Path(path)) + return CleanupInputs( + artifact_dir=Path(payload["artifact_dir"]), + namespace=payload["namespace"], + platform=payload["platform"], + benchmark=payload["benchmark"], + ) + + +def load_prepare_model_cache_inputs(path: str | Path) -> PrepareModelCacheInputs: + payload = load_yaml(Path(path)) + return PrepareModelCacheInputs( + artifact_dir=Path(payload["artifact_dir"]), + preset_name=payload["preset_name"], + namespace=payload["namespace"], + namespace_is_managed=payload["namespace_is_managed"], + model_key=payload["model_key"], + model=payload["model"], + model_cache=payload["model_cache"], + ) + + +def load_prepare_inputs(path: str | Path) -> PrepareInputs: + payload = load_yaml(Path(path)) + return PrepareInputs( + artifact_dir=Path(payload["artifact_dir"]), + config_dir=Path(payload["config_dir"]), + preset_name=payload["preset_name"], + namespace=payload["namespace"], + namespace_is_managed=payload["namespace_is_managed"], + platform=payload["platform"], + model_key=payload["model_key"], + model=payload["model"], + model_cache=payload["model_cache"], + 
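+        # payload[...] indexing (not .get) makes a missing key fail loudly if the writer and loader ever drift apart.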
benchmark=payload["benchmark"], + ) + + +def load_test_inputs(path: str | Path) -> TestInputs: + payload = load_yaml(Path(path)) + return TestInputs( + artifact_dir=Path(payload["artifact_dir"]), + config_dir=Path(payload["config_dir"]), + preset_name=payload["preset_name"], + namespace=payload["namespace"], + platform=payload["platform"], + model_key=payload["model_key"], + model=payload["model"], + scheduler_profile_key=payload["scheduler_profile_key"], + scheduler_profile=payload["scheduler_profile"], + model_cache=payload["model_cache"], + smoke_request=payload["smoke_request"], + benchmark=payload["benchmark"], + ) + + +def cleanup_inputs_from_prepare(inputs: PrepareInputs) -> CleanupInputs: + return CleanupInputs( + artifact_dir=inputs.artifact_dir, + namespace=inputs.namespace, + platform=inputs.platform, + benchmark=inputs.benchmark, + ) + + +def prepare_model_cache_inputs_from_prepare(inputs: PrepareInputs) -> PrepareModelCacheInputs: + return PrepareModelCacheInputs( + artifact_dir=inputs.artifact_dir, + preset_name=inputs.preset_name, + namespace=inputs.namespace, + namespace_is_managed=inputs.namespace_is_managed, + model_key=inputs.model_key, + model=inputs.model, + model_cache=inputs.model_cache, + ) diff --git a/projects/llm_d/orchestration/prepare_llmd.py b/projects/llm_d/orchestration/prepare_llmd.py deleted file mode 100644 index ba64a9dc..00000000 --- a/projects/llm_d/orchestration/prepare_llmd.py +++ /dev/null @@ -1,12 +0,0 @@ -from __future__ import annotations - -from projects.llm_d.toolbox.cleanup.main import run as cleanup_toolbox_run -from projects.llm_d.toolbox.prepare.main import run as prepare_toolbox_run - - -def prepare() -> int: - return prepare_toolbox_run() - - -def cleanup() -> int: - return cleanup_toolbox_run() diff --git a/projects/llm_d/orchestration/test_llmd.py b/projects/llm_d/orchestration/test_llmd.py deleted file mode 100644 index 5254cafb..00000000 --- a/projects/llm_d/orchestration/test_llmd.py +++ /dev/null @@ -1,12 +0,0 @@ -from __future__ import annotations - -from projects.llm_d.orchestration import llmd_runtime -from projects.llm_d.toolbox.test.main import run as test_toolbox_run - - -def init() -> None: - llmd_runtime.init() - - -def test() -> int: - return test_toolbox_run() diff --git a/tests/llm_d/test_runtime.py b/tests/llm_d/test_runtime.py index bc19284a..126cb9e7 100644 --- a/tests/llm_d/test_runtime.py +++ b/tests/llm_d/test_runtime.py @@ -5,7 +5,9 @@ import pytest -from projects.llm_d.orchestration import llmd_runtime +from projects.llm_d.orchestration import ci as llmd_ci +from projects.llm_d.orchestration import cli as llmd_cli +from projects.llm_d.orchestration import llmd_runtime, phase_inputs from projects.llm_d.toolbox.cleanup import main as cleanup_toolbox from projects.llm_d.toolbox.prepare import main as prepare_toolbox from projects.llm_d.toolbox.prepare_model_cache import main as prepare_model_cache_toolbox @@ -91,6 +93,143 @@ def test_default_namespace_comes_from_project_config( assert config.namespace_is_managed is False +def test_write_prepare_inputs_round_trip(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + + path = phase_inputs.write_prepare_inputs(config) + payload = llmd_runtime.load_yaml(path) + loaded = phase_inputs.load_prepare_inputs(path) + + assert set(payload) == { + "artifact_dir", + 
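+        # Must match the keys emitted by write_prepare_inputs exactly; any drift fails the round trip.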
"config_dir", + "preset_name", + "namespace", + "namespace_is_managed", + "platform", + "model_key", + "model", + "model_cache", + "benchmark", + } + assert loaded.artifact_dir == config.artifact_dir + assert loaded.config_dir == config.config_dir + assert loaded.namespace == config.namespace + assert loaded.platform == config.platform + assert loaded.model == config.model + assert loaded.model_cache == config.model_cache + assert loaded.benchmark == config.benchmark + + +def test_write_test_inputs_round_trip(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + + path = phase_inputs.write_test_inputs(config) + payload = llmd_runtime.load_yaml(path) + loaded = phase_inputs.load_test_inputs(path) + + assert set(payload) == { + "artifact_dir", + "config_dir", + "preset_name", + "namespace", + "platform", + "model_key", + "model", + "scheduler_profile_key", + "scheduler_profile", + "model_cache", + "smoke_request", + "benchmark", + } + assert loaded.namespace == config.namespace + assert loaded.scheduler_profile_key == config.scheduler_profile_key + assert loaded.smoke_request == config.smoke_request + assert loaded.benchmark == config.benchmark + + +@pytest.mark.parametrize("orchestration", [llmd_ci, llmd_cli]) +def test_orchestration_prepare_writes_inputs_and_invokes_toolbox( + orchestration, tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + captured: dict[str, str] = {} + + monkeypatch.setattr(orchestration.llmd_runtime, "load_run_configuration", lambda: config) + monkeypatch.setattr( + orchestration, + "prepare_toolbox_run", + lambda *, inputs_file: captured.setdefault("inputs_file", inputs_file) or 17, + ) + + result = orchestration.run_prepare_phase() + loaded = phase_inputs.load_prepare_inputs(captured["inputs_file"]) + + assert result == captured["inputs_file"] + assert loaded.namespace == config.namespace + + +@pytest.mark.parametrize("orchestration", [llmd_ci, llmd_cli]) +def test_orchestration_test_writes_inputs_and_invokes_toolbox( + orchestration, tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + captured: dict[str, str] = {} + + monkeypatch.setattr(orchestration.llmd_runtime, "load_run_configuration", lambda: config) + monkeypatch.setattr( + orchestration, + "test_toolbox_run", + lambda *, inputs_file: captured.setdefault("inputs_file", inputs_file) or 23, + ) + + result = orchestration.run_test_phase() + loaded = phase_inputs.load_test_inputs(captured["inputs_file"]) + + assert result == captured["inputs_file"] + assert loaded.namespace == config.namespace + assert loaded.model == config.model + + +@pytest.mark.parametrize("orchestration", [llmd_ci, llmd_cli]) +def test_orchestration_cleanup_writes_inputs_and_invokes_toolbox( + orchestration, tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + config = 
llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + captured: dict[str, str] = {} + + monkeypatch.setattr(orchestration.llmd_runtime, "load_run_configuration", lambda: config) + monkeypatch.setattr( + orchestration, + "cleanup_toolbox_run", + lambda *, inputs_file: captured.setdefault("inputs_file", inputs_file) or 29, + ) + + result = orchestration.run_cleanup_phase() + loaded = phase_inputs.load_cleanup_inputs(captured["inputs_file"]) + + assert result == captured["inputs_file"] + assert loaded.namespace == config.namespace + assert loaded.platform == config.platform + + def test_render_inference_service_injects_model_and_scheduler_profile( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: From f7d0d1a6b19364ae4c9360a66b57b256df0d2ae6 Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Sun, 3 May 2026 10:27:34 +0100 Subject: [PATCH 13/21] refactor: Convert llm_d cleanup and model-cache phases to DSL tasks --- projects/llm_d/toolbox/cleanup/main.py | 164 +++++++++++------- .../llm_d/toolbox/prepare_model_cache/main.py | 65 ++++++- tests/llm_d/test_runtime.py | 22 +-- 3 files changed, 171 insertions(+), 80 deletions(-) diff --git a/projects/llm_d/toolbox/cleanup/main.py b/projects/llm_d/toolbox/cleanup/main.py index d80726ef..abd543db 100644 --- a/projects/llm_d/toolbox/cleanup/main.py +++ b/projects/llm_d/toolbox/cleanup/main.py @@ -2,89 +2,137 @@ from __future__ import annotations -from projects.core.dsl import toolbox -from projects.llm_d.orchestration import llmd_runtime +from projects.core.dsl import execute_tasks, shell, task, toolbox +from projects.llm_d.orchestration import llmd_runtime, phase_inputs -def run() -> int: - llmd_runtime.init() - config = llmd_runtime.load_run_configuration() - return run_cleanup(config) +def run(*, inputs_file: str) -> int: + """Delete llm_d runtime leftovers from a namespace. 
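+    Orchestration invokes this as cleanup_toolbox_run(inputs_file=str(inputs_file)); see ci.py and cli.py.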
+ Args: + inputs_file: Path to the cleanup phase input file generated by orchestration + """ -def run_cleanup(config: llmd_runtime.ResolvedConfig) -> int: - delete_run_leftovers(config) + llmd_runtime.init() + execute_tasks(locals()) return 0 -def delete_run_leftovers(config: llmd_runtime.ResolvedConfig) -> None: - if not llmd_runtime.resource_exists("namespace", config.namespace): - return +@task +def load_inputs(args, ctx): + """Load the cleanup phase inputs""" + + ctx.inputs = phase_inputs.load_cleanup_inputs(args.inputs_file) + return f"Loaded cleanup inputs for namespace {ctx.inputs.namespace}" + + +@task +def delete_leftovers(args, ctx): + """Delete llm_d runtime leftovers""" - inference_service_name = config.platform["inference_service"]["name"] - namespace = config.namespace - cleanup_timeout_seconds = config.platform["cluster"]["cleanup_timeout_seconds"] + inputs = ctx.inputs + if not llmd_runtime.resource_exists("namespace", inputs.namespace): + return f"Namespace {inputs.namespace} does not exist; nothing to clean" + + inference_service_name = inputs.platform["inference_service"]["name"] + namespace = inputs.namespace + cleanup_timeout_seconds = inputs.platform["cluster"]["cleanup_timeout_seconds"] benchmark_names = {"guidellm-benchmark"} - if config.benchmark: - benchmark_names.add(config.benchmark["job_name"]) - - llmd_runtime.oc( - "delete", - "llminferenceservice", - inference_service_name, - "-n", - namespace, + if inputs.benchmark: + benchmark_names.add(inputs.benchmark["job_name"]) + + shell.run( + f"oc delete llminferenceservice {inference_service_name} " + f"-n {namespace} --ignore-not-found=true", + check=False, + ) + + for benchmark_name in sorted(benchmark_names): + shell.run( + f"oc delete job,pvc {benchmark_name} -n {namespace} --ignore-not-found=true", + check=False, + ) + shell.run( + f"oc delete pod {benchmark_name}-copy -n {namespace} --ignore-not-found=true", + check=False, + ) + + shell.run( + f'oc delete job -n {namespace} -l "forge.openshift.io/project=llm_d" ' + "--ignore-not-found=true", + check=False, + ) + shell.run( + f'oc delete pod -n {namespace} -l "forge.openshift.io/project=llm_d" ' + "--ignore-not-found=true", + check=False, + ) + shell.run( + f"oc delete pvc -n {namespace} " + '-l "forge.openshift.io/project=llm_d,forge.openshift.io/preserve!=true" ' "--ignore-not-found=true", check=False, ) + llmd_runtime.wait_until( + f"llminferenceservice/{inference_service_name} deletion in {namespace}", + timeout_seconds=cleanup_timeout_seconds, + interval_seconds=10, + predicate=lambda: not llmd_runtime.resource_exists( + "llminferenceservice", inference_service_name, namespace=namespace + ), + ) + + llmd_runtime.wait_until( + f"llm-d workload pods deletion in {namespace}", + timeout_seconds=cleanup_timeout_seconds, + interval_seconds=10, + predicate=lambda: _llm_d_pods_gone(namespace, inference_service_name), + ) + + return f"Cleanup finished for namespace {namespace}" + + +def delete_run_leftovers(inputs: phase_inputs.CleanupInputs) -> None: + if not llmd_runtime.resource_exists("namespace", inputs.namespace): + return + + inference_service_name = inputs.platform["inference_service"]["name"] + namespace = inputs.namespace + cleanup_timeout_seconds = inputs.platform["cluster"]["cleanup_timeout_seconds"] + benchmark_names = {"guidellm-benchmark"} + if inputs.benchmark: + benchmark_names.add(inputs.benchmark["job_name"]) + + shell.run( + f"oc delete llminferenceservice {inference_service_name} " + f"-n {namespace} --ignore-not-found=true", + check=False, + 
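+        # Best-effort delete: --ignore-not-found plus check=False so cleanup never aborts part-way.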
) + for benchmark_name in sorted(benchmark_names): - llmd_runtime.oc( - "delete", - "job,pvc", - benchmark_name, - "-n", - namespace, - "--ignore-not-found=true", + shell.run( + f"oc delete job,pvc {benchmark_name} -n {namespace} --ignore-not-found=true", check=False, ) - llmd_runtime.oc( - "delete", - "pod", - f"{benchmark_name}-copy", - "-n", - namespace, - "--ignore-not-found=true", + shell.run( + f"oc delete pod {benchmark_name}-copy -n {namespace} --ignore-not-found=true", check=False, ) - llmd_runtime.oc( - "delete", - "job", - "-n", - namespace, - "-l", - "forge.openshift.io/project=llm_d", + shell.run( + f'oc delete job -n {namespace} -l "forge.openshift.io/project=llm_d" ' "--ignore-not-found=true", check=False, ) - llmd_runtime.oc( - "delete", - "pod", - "-n", - namespace, - "-l", - "forge.openshift.io/project=llm_d", + shell.run( + f'oc delete pod -n {namespace} -l "forge.openshift.io/project=llm_d" ' "--ignore-not-found=true", check=False, ) - llmd_runtime.oc( - "delete", - "pvc", - "-n", - namespace, - "-l", - "forge.openshift.io/project=llm_d,forge.openshift.io/preserve!=true", + shell.run( + f"oc delete pvc -n {namespace} " + '-l "forge.openshift.io/project=llm_d,forge.openshift.io/preserve!=true" ' "--ignore-not-found=true", check=False, ) diff --git a/projects/llm_d/toolbox/prepare_model_cache/main.py b/projects/llm_d/toolbox/prepare_model_cache/main.py index 1dc50758..f698ef0c 100644 --- a/projects/llm_d/toolbox/prepare_model_cache/main.py +++ b/projects/llm_d/toolbox/prepare_model_cache/main.py @@ -4,19 +4,66 @@ import logging -from projects.core.dsl import toolbox -from projects.llm_d.orchestration import llmd_runtime +from projects.core.dsl import execute_tasks, task, toolbox +from projects.llm_d.orchestration import llmd_runtime, phase_inputs LOGGER = logging.getLogger(__name__) -def run() -> int: +def run(*, inputs_file: str) -> int: + """Prepare the shared model cache PVC and populate it when needed. + + Args: + inputs_file: Path to the prepare_model_cache phase input file generated by orchestration + """ + llmd_runtime.init() - config = llmd_runtime.load_run_configuration() - return run_prepare_model_cache(config) + execute_tasks(locals()) + return 0 + + +@task +def load_inputs(args, ctx): + """Load the model cache phase inputs""" + + ctx.inputs = phase_inputs.load_prepare_model_cache_inputs(args.inputs_file) + return f"Loaded model cache inputs for preset {ctx.inputs.preset_name}" + + +@task +def prepare_model_cache(args, ctx): + """Ensure the model cache PVC exists and is populated""" + + config = ctx.inputs + cache_spec = llmd_runtime.resolve_model_cache(config) + if not cache_spec: + LOGGER.info("Model cache disabled for preset=%s", config.preset_name) + return "Model cache disabled" + + if config.namespace_is_managed: + LOGGER.warning( + "Model cache PVC %s lives in managed namespace %s. 
Namespace cleanup will remove it; cache reuse requires a stable namespace override.", + cache_spec.pvc_name, + cache_spec.namespace, + ) + + ensure_model_cache_pvc(config, cache_spec) + if llmd_runtime.model_cache_pvc_ready(cache_spec): + LOGGER.info( + "Model cache PVC %s already contains %s; skipping download", + cache_spec.pvc_name, + cache_spec.source_uri, + ) + capture_model_cache_state(config, cache_spec) + return f"Model cache already populated in {cache_spec.pvc_name}" + + run_model_cache_download_job(config, cache_spec) + llmd_runtime.annotate_model_cache_pvc(cache_spec) + capture_model_cache_state(config, cache_spec) + return f"Model cache step finished for namespace {config.namespace}" -def run_prepare_model_cache(config: llmd_runtime.ResolvedConfig) -> int: +def run_prepare_model_cache(config: phase_inputs.PrepareModelCacheInputs) -> int: cache_spec = llmd_runtime.resolve_model_cache(config) if not cache_spec: LOGGER.info("Model cache disabled for preset=%s", config.preset_name) @@ -46,7 +93,7 @@ def run_prepare_model_cache(config: llmd_runtime.ResolvedConfig) -> int: def ensure_model_cache_pvc( - config: llmd_runtime.ResolvedConfig, cache_spec: llmd_runtime.ModelCacheSpec + config: phase_inputs.PrepareModelCacheInputs, cache_spec: llmd_runtime.ModelCacheSpec ) -> None: existing = llmd_runtime.oc_get_json( "persistentvolumeclaim", @@ -86,7 +133,7 @@ def ensure_model_cache_pvc( def run_model_cache_download_job( - config: llmd_runtime.ResolvedConfig, cache_spec: llmd_runtime.ModelCacheSpec + config: phase_inputs.PrepareModelCacheInputs, cache_spec: llmd_runtime.ModelCacheSpec ) -> None: llmd_runtime.oc( "delete", @@ -123,7 +170,7 @@ def run_model_cache_download_job( def capture_model_cache_state( - config: llmd_runtime.ResolvedConfig, cache_spec: llmd_runtime.ModelCacheSpec + config: phase_inputs.PrepareModelCacheInputs, cache_spec: llmd_runtime.ModelCacheSpec ) -> None: artifact_dir = config.artifact_dir / "artifacts" / "model-cache" llmd_runtime.write_json( diff --git a/tests/llm_d/test_runtime.py b/tests/llm_d/test_runtime.py index 126cb9e7..2171f7f4 100644 --- a/tests/llm_d/test_runtime.py +++ b/tests/llm_d/test_runtime.py @@ -443,7 +443,7 @@ def test_cleanup_deletes_leftovers_but_not_namespace_or_preserved_pvcs( artifact_dir.mkdir() config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) - oc_calls: list[tuple[str, ...]] = [] + shell_calls: list[str] = [] def fake_resource_exists(kind: str, name: str, namespace: str | None = None) -> bool: if kind == "namespace": @@ -452,25 +452,21 @@ def fake_resource_exists(kind: str, name: str, namespace: str | None = None) -> monkeypatch.setattr(llmd_runtime, "resource_exists", fake_resource_exists) monkeypatch.setattr( - llmd_runtime, - "oc", - lambda *args, **kwargs: oc_calls.append(tuple(args)), + cleanup_toolbox.shell, + "run", + lambda command, **kwargs: shell_calls.append(command), ) monkeypatch.setattr(llmd_runtime, "wait_until", lambda *args, **kwargs: True) monkeypatch.setattr(cleanup_toolbox, "_llm_d_pods_gone", lambda *_args: True) cleanup_toolbox.delete_run_leftovers(config) - assert ("delete", "namespace", config.namespace, "--ignore-not-found=true") not in oc_calls + assert f"oc delete namespace {config.namespace} --ignore-not-found=true" not in shell_calls assert ( - "delete", - "pvc", - "-n", - config.namespace, - "-l", - "forge.openshift.io/project=llm_d,forge.openshift.io/preserve!=true", - "--ignore-not-found=true", - ) in oc_calls + f"oc delete pvc -n {config.namespace} " + '-l 
"forge.openshift.io/project=llm_d,forge.openshift.io/preserve!=true" ' + "--ignore-not-found=true" + ) in shell_calls def test_prepare_gpu_operator_skips_existing_clusterpolicy( From 31c8c5ff90e6cfc3efebbd48cc4a5d807198aaa5 Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Sun, 3 May 2026 10:31:42 +0100 Subject: [PATCH 14/21] refactor: Inline llm_d prepare and test task logic --- projects/llm_d/toolbox/prepare/main.py | 477 +++++++++++++++++++++-- projects/llm_d/toolbox/test/main.py | 518 ++++++++++++++++++++++--- 2 files changed, 889 insertions(+), 106 deletions(-) diff --git a/projects/llm_d/toolbox/prepare/main.py b/projects/llm_d/toolbox/prepare/main.py index 831201d5..6851bf4b 100644 --- a/projects/llm_d/toolbox/prepare/main.py +++ b/projects/llm_d/toolbox/prepare/main.py @@ -6,48 +6,441 @@ import logging from pathlib import Path -from projects.core.dsl import toolbox -from projects.llm_d.orchestration import llmd_runtime -from projects.llm_d.toolbox.cleanup import main as cleanup_toolbox +from projects.core.dsl import execute_tasks, shell, task, toolbox +from projects.llm_d.orchestration import llmd_runtime, phase_inputs from projects.llm_d.toolbox.prepare_model_cache import main as prepare_model_cache LOGGER = logging.getLogger(__name__) -def run() -> int: - llmd_runtime.init() - config = llmd_runtime.load_run_configuration() - return run_prepare(config) - - -def run_prepare(config: llmd_runtime.ResolvedConfig) -> int: - LOGGER.info("Preparing llm_d preset=%s namespace=%s", config.preset_name, config.namespace) - - verify_oc_access() - verify_cluster_version(config) - prepare_cert_manager(config) - prepare_leader_worker_set(config) - prepare_nfd(config) - prepare_gpu_operator(config) - prepare_rhoai_operator(config) - apply_datasciencecluster(config) - wait_for_datasciencecluster_ready(config) - ensure_required_crds(config.platform["rhoai"]["required_crds_after_dsc"], config) - ensure_gateway(config) - ensure_test_namespace(config) - cleanup_toolbox.delete_run_leftovers(config) - prepare_model_cache.run_prepare_model_cache(config) - verify_gpu_nodes(config) - capture_prepare_state(config) +def run(*, inputs_file: str) -> int: + """Prepare a cluster for llm_d downstream smoke and benchmark runs. 
+ + Args: + inputs_file: Path to the prepare phase input file generated by orchestration + """ + llmd_runtime.init() + execute_tasks(locals()) return 0 +@task +def load_inputs(args, ctx): + """Load the prepare phase inputs""" + + ctx.config = phase_inputs.load_prepare_inputs(args.inputs_file) + return f"Loaded prepare inputs for preset {ctx.config.preset_name}" + + +@task +def verify_oc_access_task(args, ctx): + """Verify OpenShift CLI access""" + + llmd_runtime.oc("whoami", capture_output=True) + return "OpenShift CLI access verified" + + +@task +def verify_cluster_version_task(args, ctx): + """Validate the cluster version against llm_d requirements""" + + version_info = llmd_runtime.oc("version", "-o", "json", capture_output=True) + payload = json.loads(version_info.stdout) + + openshift_version = ( + payload.get("openshiftVersion") + or payload.get("serverVersion", {}).get("gitVersion") + or payload.get("serverVersion", {}).get("platform") + ) + if not openshift_version: + raise RuntimeError("Could not determine OpenShift version from `oc version -o json`") + + minimum = ctx.config.platform["cluster"]["minimum_openshift_version"] + if llmd_runtime.version_tuple(openshift_version) < llmd_runtime.version_tuple(minimum): + raise RuntimeError( + f"Cluster version {openshift_version} is older than the llm_d minimum {minimum}" + ) + + return f"Cluster version satisfies {minimum}" + + +@task +def prepare_cert_manager_task(args, ctx): + """Ensure the cert-manager operator is installed""" + + operator_spec = llmd_runtime.operator_spec_by_package( + ctx.config.platform, "openshift-cert-manager-operator" + ) + ensure_operator_subscription(operator_spec) + return "cert-manager operator ready" + + +@task +def prepare_leader_worker_set_task(args, ctx): + """Ensure the leader-worker-set operator is installed""" + + operator_spec = llmd_runtime.operator_spec_by_package(ctx.config.platform, "leader-worker-set") + ensure_operator_subscription(operator_spec) + return "leader-worker-set operator ready" + + +@task +def prepare_nfd_task(args, ctx): + """Ensure Node Feature Discovery is installed and reporting GPU labels""" + + config = ctx.config + operator_spec = llmd_runtime.operator_spec_by_package(config.platform, "nfd") + ensure_operator_subscription(operator_spec) + llmd_runtime.wait_for_crd( + operator_spec["bootstrap_crd"], + timeout_seconds=operator_spec["wait_timeout_seconds"], + ) + + manifest = llmd_runtime.load_manifest_template(config, operator_spec["bootstrap_manifest"]) + nfd_name = manifest["metadata"]["name"] + nfd_namespace = manifest["metadata"]["namespace"] + if llmd_runtime.resource_exists("nodefeaturediscovery", nfd_name, namespace=nfd_namespace): + LOGGER.info( + "NodeFeatureDiscovery/%s already exists in %s; verifying GPU discovery labels", + nfd_name, + nfd_namespace, + ) + else: + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "nfd-nodefeaturediscovery.yaml", + manifest, + ) + + llmd_runtime.wait_until( + "NodeFeatureDiscovery bootstrap resource", + timeout_seconds=operator_spec["wait_timeout_seconds"], + interval_seconds=10, + predicate=lambda: llmd_runtime.resource_exists( + "nodefeaturediscovery", + nfd_name, + namespace=nfd_namespace, + ), + ) + + wait_for_nfd_gpu_labels(config, timeout_seconds=operator_spec["wait_timeout_seconds"]) + return "Node Feature Discovery ready" + + +@task +def prepare_gpu_operator_task(args, ctx): + """Ensure the GPU operator is installed and ready""" + + config = ctx.config + operator_spec = 
llmd_runtime.operator_spec_by_package(config.platform, "gpu-operator-certified") + ensure_operator_subscription(operator_spec) + llmd_runtime.wait_for_crd( + operator_spec["bootstrap_crd"], + timeout_seconds=operator_spec["wait_timeout_seconds"], + ) + + manifest = llmd_runtime.load_manifest_template(config, operator_spec["bootstrap_manifest"]) + clusterpolicy_name = manifest["metadata"]["name"] + if llmd_runtime.resource_exists("clusterpolicy", clusterpolicy_name): + LOGGER.info( + "ClusterPolicy/%s already exists; verifying readiness instead of applying bootstrap manifest", + clusterpolicy_name, + ) + else: + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "gpu-clusterpolicy.yaml", + manifest, + ) + + wait_for_gpu_clusterpolicy_ready( + clusterpolicy_name, + timeout_seconds=operator_spec["wait_timeout_seconds"], + ) + return "GPU operator ready" + + +@task +def prepare_rhoai_operator_task(args, ctx): + """Ensure the RHOAI operator is installed""" + + config = ctx.config + operator_spec = llmd_runtime.operator_spec_by_package(config.platform, "rhods-operator") + ensure_operator_subscription(operator_spec) + for crd_name in config.platform["rhoai"]["required_crds_before_dsc"]: + llmd_runtime.wait_for_crd( + crd_name, + timeout_seconds=config.platform["rhoai"]["wait_timeout_seconds"], + ) + return "RHOAI operator ready" + + +@task +def apply_datasciencecluster_task(args, ctx): + """Apply the DataScienceCluster manifest""" + + config = ctx.config + manifest = llmd_runtime.render_datasciencecluster(config) + llmd_runtime.apply_manifest(config.artifact_dir / "src" / "datasciencecluster.yaml", manifest) + llmd_runtime.oc( + "get", + "datasciencecluster", + config.platform["rhoai"]["datasciencecluster_name"], + "-n", + config.platform["rhoai"]["namespace"], + "-o", + "yaml", + capture_output=True, + ) + return "DataScienceCluster applied" + + +@task +def wait_for_datasciencecluster_ready_task(args, ctx): + """Wait for the DataScienceCluster to become ready""" + + rhoai = ctx.config.platform["rhoai"] + + def _dsc_ready() -> bool: + payload = llmd_runtime.oc_get_json( + "datasciencecluster", + name=rhoai["datasciencecluster_name"], + namespace=rhoai["namespace"], + ) + phase = payload.get("status", {}).get("phase") + if phase == "Ready": + return True + if phase in {"Failed", "Error"}: + raise RuntimeError(f"DataScienceCluster entered terminal phase {phase}") + return False + + llmd_runtime.wait_until( + f"datasciencecluster/{rhoai['datasciencecluster_name']} ready", + timeout_seconds=rhoai["wait_timeout_seconds"], + interval_seconds=10, + predicate=_dsc_ready, + ) + return "DataScienceCluster ready" + + +@task +def ensure_required_crds_task(args, ctx): + """Wait for the llm_d-required CRDs to exist""" + + for crd_name in ctx.config.platform["rhoai"]["required_crds_after_dsc"]: + llmd_runtime.wait_for_crd( + crd_name, + timeout_seconds=ctx.config.platform["rhoai"]["wait_timeout_seconds"], + ) + return "Required CRDs present" + + +@task +def ensure_gateway_task(args, ctx): + """Ensure the gateway exists and is programmed""" + + config = ctx.config + gateway = config.platform["gateway"] + if not llmd_runtime.resource_exists("gateway", gateway["name"], namespace=gateway["namespace"]): + if not gateway["create_if_missing"]: + raise RuntimeError( + f"Required gateway {gateway['name']} does not exist in {gateway['namespace']}" + ) + manifest = llmd_runtime.render_gateway(config) + llmd_runtime.apply_manifest(config.artifact_dir / "src" / "gateway.yaml", manifest) + + def 
_gateway_programmed() -> bool: + resource = llmd_runtime.oc_get_json( + "gateway", + name=gateway["name"], + namespace=gateway["namespace"], + ) + return llmd_runtime.condition_status(resource, "Programmed") == "True" + + llmd_runtime.wait_until( + f"gateway/{gateway['name']} programmed", + timeout_seconds=gateway["wait_timeout_seconds"], + interval_seconds=10, + predicate=_gateway_programmed, + ) + return "Gateway ready" + + +@task +def ensure_test_namespace_task(args, ctx): + """Ensure the llm_d namespace exists""" + + llmd_runtime.ensure_namespace( + ctx.config.namespace, + labels={ + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + }, + ) + return f"Namespace {ctx.config.namespace} ready" + + +@task +def cleanup_previous_run_task(args, ctx): + """Delete leftover llm_d resources from the namespace""" + + config = ctx.config + inference_service_name = config.platform["inference_service"]["name"] + namespace = config.namespace + cleanup_timeout_seconds = config.platform["cluster"]["cleanup_timeout_seconds"] + benchmark_names = {"guidellm-benchmark"} + if config.benchmark: + benchmark_names.add(config.benchmark["job_name"]) + + shell.run( + f"oc delete llminferenceservice {inference_service_name} " + f"-n {namespace} --ignore-not-found=true", + check=False, + ) + + for benchmark_name in sorted(benchmark_names): + shell.run( + f"oc delete job,pvc {benchmark_name} -n {namespace} --ignore-not-found=true", + check=False, + ) + shell.run( + f"oc delete pod {benchmark_name}-copy -n {namespace} --ignore-not-found=true", + check=False, + ) + + shell.run( + f'oc delete job -n {namespace} -l "forge.openshift.io/project=llm_d" ' + "--ignore-not-found=true", + check=False, + ) + shell.run( + f'oc delete pod -n {namespace} -l "forge.openshift.io/project=llm_d" ' + "--ignore-not-found=true", + check=False, + ) + shell.run( + f"oc delete pvc -n {namespace} " + '-l "forge.openshift.io/project=llm_d,forge.openshift.io/preserve!=true" ' + "--ignore-not-found=true", + check=False, + ) + + llmd_runtime.wait_until( + f"llminferenceservice/{inference_service_name} deletion in {namespace}", + timeout_seconds=cleanup_timeout_seconds, + interval_seconds=10, + predicate=lambda: not llmd_runtime.resource_exists( + "llminferenceservice", inference_service_name, namespace=namespace + ), + ) + + llmd_runtime.wait_until( + f"llm-d workload pods deletion in {namespace}", + timeout_seconds=cleanup_timeout_seconds, + interval_seconds=10, + predicate=lambda: not ( + pods := llmd_runtime.oc_get_json( + "pods", + namespace=namespace, + selector=f"app.kubernetes.io/name={inference_service_name}", + ignore_not_found=True, + ) + ) + or not pods.get("items"), + ) + return f"Previous llm_d leftovers deleted from {ctx.config.namespace}" + + +@task +def prepare_model_cache_task(args, ctx): + """Prepare the shared model cache if enabled""" + + cache_inputs = phase_inputs.prepare_model_cache_inputs_from_prepare(ctx.config) + cache_spec = llmd_runtime.resolve_model_cache(cache_inputs) + if not cache_spec: + LOGGER.info("Model cache disabled for preset=%s", cache_inputs.preset_name) + return "Model cache disabled" + + if cache_inputs.namespace_is_managed: + LOGGER.warning( + "Model cache PVC %s lives in managed namespace %s. 
Namespace cleanup will remove it; cache reuse requires a stable namespace override.", + cache_spec.pvc_name, + cache_spec.namespace, + ) + + prepare_model_cache.ensure_model_cache_pvc(cache_inputs, cache_spec) + if llmd_runtime.model_cache_pvc_ready(cache_spec): + LOGGER.info( + "Model cache PVC %s already contains %s; skipping download", + cache_spec.pvc_name, + cache_spec.source_uri, + ) + prepare_model_cache.capture_model_cache_state(cache_inputs, cache_spec) + return "Model cache already populated" + + prepare_model_cache.run_model_cache_download_job(cache_inputs, cache_spec) + llmd_runtime.annotate_model_cache_pvc(cache_spec) + prepare_model_cache.capture_model_cache_state(cache_inputs, cache_spec) + return "Model cache prepared" + + +@task +def verify_gpu_nodes_task(args, ctx): + """Verify that GPU nodes are available on the cluster""" + + selector = ctx.config.platform["cluster"]["gpu_node_label_selector"] + data = llmd_runtime.oc_get_json("nodes", selector=selector, ignore_not_found=True) + items = data.get("items", []) if data else [] + if not items: + raise RuntimeError( + f"No GPU nodes found with selector {selector}. The llm_d smoke path requires GPUs." + ) + return "GPU nodes detected" + + +@task +def capture_prepare_state_task(args, ctx): + """Capture cluster state after the prepare phase""" + + config = ctx.config + artifacts_dir = config.artifact_dir / "artifacts" + rhoai = config.platform["rhoai"] + gateway = config.platform["gateway"] + + capture_resource_yaml( + "datasciencecluster", + rhoai["datasciencecluster_name"], + rhoai["namespace"], + artifacts_dir / "datasciencecluster.yaml", + ) + capture_resource_yaml( + "gateway", + gateway["name"], + gateway["namespace"], + artifacts_dir / "gateway.yaml", + ) + gateway_service = llmd_runtime.oc( + "get", + "service", + "-A", + "-l", + f"gateway.networking.k8s.io/gateway-name={gateway['name']}", + "-o", + "yaml", + check=False, + capture_output=True, + ) + if gateway_service.returncode == 0 and gateway_service.stdout: + llmd_runtime.write_text(artifacts_dir / "gateway.service.yaml", gateway_service.stdout) + if config.platform["artifacts"]["capture_namespace_events"]: + capture_namespace_events(config.namespace, artifacts_dir / "namespace.events.txt") + return "Prepare-state artifacts captured" + + def verify_oc_access() -> None: llmd_runtime.oc("whoami", capture_output=True) -def verify_cluster_version(config: llmd_runtime.ResolvedConfig) -> None: +def verify_cluster_version(config: phase_inputs.PrepareInputs) -> None: version_info = llmd_runtime.oc("version", "-o", "json", capture_output=True) payload = json.loads(version_info.stdout) @@ -75,19 +468,19 @@ def ensure_operator_subscription(operator_spec: dict[str, str]) -> dict[str, obj ) -def prepare_cert_manager(config: llmd_runtime.ResolvedConfig) -> None: +def prepare_cert_manager(config: phase_inputs.PrepareInputs) -> None: operator_spec = llmd_runtime.operator_spec_by_package( config.platform, "openshift-cert-manager-operator" ) ensure_operator_subscription(operator_spec) -def prepare_leader_worker_set(config: llmd_runtime.ResolvedConfig) -> None: +def prepare_leader_worker_set(config: phase_inputs.PrepareInputs) -> None: operator_spec = llmd_runtime.operator_spec_by_package(config.platform, "leader-worker-set") ensure_operator_subscription(operator_spec) -def prepare_nfd(config: llmd_runtime.ResolvedConfig) -> None: +def prepare_nfd(config: phase_inputs.PrepareInputs) -> None: operator_spec = llmd_runtime.operator_spec_by_package(config.platform, "nfd") 
ensure_operator_subscription(operator_spec) llmd_runtime.wait_for_crd( @@ -124,7 +517,7 @@ def prepare_nfd(config: llmd_runtime.ResolvedConfig) -> None: wait_for_nfd_gpu_labels(config, timeout_seconds=operator_spec["wait_timeout_seconds"]) -def prepare_gpu_operator(config: llmd_runtime.ResolvedConfig) -> None: +def prepare_gpu_operator(config: phase_inputs.PrepareInputs) -> None: operator_spec = llmd_runtime.operator_spec_by_package(config.platform, "gpu-operator-certified") ensure_operator_subscription(operator_spec) llmd_runtime.wait_for_crd( @@ -173,13 +566,13 @@ def _clusterpolicy_ready() -> bool: ) -def prepare_rhoai_operator(config: llmd_runtime.ResolvedConfig) -> None: +def prepare_rhoai_operator(config: phase_inputs.PrepareInputs) -> None: operator_spec = llmd_runtime.operator_spec_by_package(config.platform, "rhods-operator") ensure_operator_subscription(operator_spec) ensure_required_crds(config.platform["rhoai"]["required_crds_before_dsc"], config) -def ensure_required_crds(crd_names: list[str], config: llmd_runtime.ResolvedConfig) -> None: +def ensure_required_crds(crd_names: list[str], config: phase_inputs.PrepareInputs) -> None: for crd_name in crd_names: llmd_runtime.wait_for_crd( crd_name, @@ -187,7 +580,7 @@ def ensure_required_crds(crd_names: list[str], config: llmd_runtime.ResolvedConf ) -def apply_datasciencecluster(config: llmd_runtime.ResolvedConfig) -> None: +def apply_datasciencecluster(config: phase_inputs.PrepareInputs) -> None: manifest = llmd_runtime.render_datasciencecluster(config) llmd_runtime.apply_manifest(config.artifact_dir / "src" / "datasciencecluster.yaml", manifest) llmd_runtime.oc( @@ -202,7 +595,7 @@ def apply_datasciencecluster(config: llmd_runtime.ResolvedConfig) -> None: ) -def wait_for_datasciencecluster_ready(config: llmd_runtime.ResolvedConfig) -> None: +def wait_for_datasciencecluster_ready(config: phase_inputs.PrepareInputs) -> None: rhoai = config.platform["rhoai"] def _dsc_ready() -> bool: @@ -226,7 +619,7 @@ def _dsc_ready() -> bool: ) -def ensure_gateway(config: llmd_runtime.ResolvedConfig) -> None: +def ensure_gateway(config: phase_inputs.PrepareInputs) -> None: gateway = config.platform["gateway"] if not llmd_runtime.resource_exists("gateway", gateway["name"], namespace=gateway["namespace"]): if not gateway["create_if_missing"]: @@ -252,7 +645,7 @@ def _gateway_programmed() -> bool: ) -def ensure_test_namespace(config: llmd_runtime.ResolvedConfig) -> None: +def ensure_test_namespace(config: phase_inputs.PrepareInputs) -> None: llmd_runtime.ensure_namespace( config.namespace, labels={ @@ -262,7 +655,7 @@ def ensure_test_namespace(config: llmd_runtime.ResolvedConfig) -> None: ) -def verify_gpu_nodes(config: llmd_runtime.ResolvedConfig) -> None: +def verify_gpu_nodes(config: phase_inputs.PrepareInputs) -> None: selector = config.platform["cluster"]["gpu_node_label_selector"] data = llmd_runtime.oc_get_json("nodes", selector=selector, ignore_not_found=True) items = data.get("items", []) if data else [] @@ -272,7 +665,7 @@ def verify_gpu_nodes(config: llmd_runtime.ResolvedConfig) -> None: ) -def wait_for_nfd_gpu_labels(config: llmd_runtime.ResolvedConfig, *, timeout_seconds: int) -> None: +def wait_for_nfd_gpu_labels(config: phase_inputs.PrepareInputs, *, timeout_seconds: int) -> None: selectors = config.platform["cluster"]["nfd_gpu_detection_labels"] def _labels_present() -> bool: @@ -290,7 +683,7 @@ def _labels_present() -> bool: ) -def capture_prepare_state(config: llmd_runtime.ResolvedConfig) -> None: +def capture_prepare_state(config: 
phase_inputs.PrepareInputs) -> None: artifacts_dir = config.artifact_dir / "artifacts" rhoai = config.platform["rhoai"] gateway = config.platform["gateway"] diff --git a/projects/llm_d/toolbox/test/main.py b/projects/llm_d/toolbox/test/main.py index 9c6242b6..5941d03a 100644 --- a/projects/llm_d/toolbox/test/main.py +++ b/projects/llm_d/toolbox/test/main.py @@ -6,81 +6,471 @@ import logging from pathlib import Path -from projects.core.dsl import toolbox -from projects.llm_d.orchestration import llmd_runtime +from projects.core.dsl import always, execute_tasks, task, toolbox +from projects.llm_d.orchestration import llmd_runtime, phase_inputs LOGGER = logging.getLogger(__name__) -def run() -> int: +def run(*, inputs_file: str) -> int: + """Deploy llm_d, run the smoke request, and optionally execute GuideLLM. + + Args: + inputs_file: Path to the test phase input file generated by orchestration + """ + llmd_runtime.init() - config = llmd_runtime.load_run_configuration() - return run_test(config) + execute_tasks(locals()) + return 0 + + +@task +def load_inputs(args, ctx): + """Load the test phase inputs""" + + ctx.config = phase_inputs.load_test_inputs(args.inputs_file) + return f"Loaded test inputs for preset {ctx.config.preset_name}" + +@task +def deploy_inference_service_task(args, ctx): + """Deploy the LLMInferenceService and resolve its endpoint""" -def run_test(config: llmd_runtime.ResolvedConfig) -> int: + config = ctx.config + name = config.platform["inference_service"]["name"] namespace = config.namespace - artifacts_dir = config.artifact_dir / "artifacts" + selector = f"app.kubernetes.io/name={name}" - LOGGER.info("Testing llm_d preset=%s namespace=%s", config.preset_name, namespace) + llmd_runtime.oc( + "delete", + "llminferenceservice", + name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) - endpoint_url = None - try: - endpoint_url = deploy_inference_service(config) - smoke_response = run_smoke_request(config, endpoint_url) - llmd_runtime.write_json(artifacts_dir / "smoke.response.json", smoke_response) + def _old_pods_gone() -> bool: + pods = llmd_runtime.oc_get_json( + "pods", namespace=namespace, selector=selector, ignore_not_found=True + ) + return not pods or not pods.get("items") - if config.benchmark: - run_guidellm_benchmark(config, endpoint_url) + llmd_runtime.wait_until( + f"old llm-d pods to disappear in {namespace}", + timeout_seconds=config.platform["inference_service"]["delete_timeout_seconds"], + interval_seconds=10, + predicate=_old_pods_gone, + ) - return 0 - finally: - capture_inference_service_state(config) - if endpoint_url: - llmd_runtime.write_text(artifacts_dir / "endpoint.url", f"{endpoint_url}\n") - benchmark_name = config.benchmark["job_name"] if config.benchmark else "guidellm-benchmark" - smoke_job_name = config.platform["smoke"]["job_name"] - llmd_runtime.oc( - "delete", - "job", - smoke_job_name, - "-n", - namespace, - "--ignore-not-found=true", - check=False, + manifest = llmd_runtime.render_inference_service(config) + llmd_runtime.apply_manifest(config.artifact_dir / "src" / "llminferenceservice.yaml", manifest) + + def _pods_present() -> bool: + pods = llmd_runtime.oc_get_json( + "pods", namespace=namespace, selector=selector, ignore_not_found=True ) - llmd_runtime.oc( - "delete", - "job,pvc", - benchmark_name, - "-n", + return bool(pods and pods.get("items")) + + llmd_runtime.wait_until( + f"llm-d pods to appear in {namespace}", + timeout_seconds=config.platform["inference_service"]["pod_appearance_timeout_seconds"], + 
interval_seconds=5, + predicate=_pods_present, + ) + + def _service_ready() -> bool: + payload = llmd_runtime.oc_get_json("llminferenceservice", name=name, namespace=namespace) + return llmd_runtime.condition_status(payload, "Ready") == "True" + + llmd_runtime.wait_until( + f"llminferenceservice/{name} ready", + timeout_seconds=config.platform["inference_service"]["ready_timeout_seconds"], + interval_seconds=10, + predicate=_service_ready, + ) + + ctx.endpoint_url = llmd_runtime.wait_until( + f"gateway address for llminferenceservice/{name}", + timeout_seconds=config.platform["inference_service"]["ready_timeout_seconds"], + interval_seconds=10, + predicate=lambda: try_resolve_endpoint_url(config), + ) + return f"Endpoint resolved: {ctx.endpoint_url}" + + +@task +def run_smoke_request_task(args, ctx): + """Run the smoke request against the deployed service""" + + config = ctx.config + namespace = config.namespace + job_name = config.platform["smoke"]["job_name"] + payload = { + "model": config.model["served_model_name"], + "prompt": config.smoke_request["prompt"], + "max_tokens": config.smoke_request["max_tokens"], + "temperature": config.smoke_request["temperature"], + } + llmd_runtime.write_json(config.artifact_dir / "artifacts" / "smoke.request.json", payload) + + llmd_runtime.oc( + "delete", + "job", + job_name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.wait_until( + f"job/{job_name} deletion in {namespace}", + timeout_seconds=120, + interval_seconds=5, + predicate=lambda: not llmd_runtime.resource_exists("job", job_name, namespace=namespace), + ) + + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "smoke-job.yaml", + llmd_runtime.render_smoke_request_job(config, ctx.endpoint_url, payload), + ) + + try: + llmd_runtime.wait_for_job_completion( + job_name, namespace, - "--ignore-not-found=true", - check=False, + timeout_seconds=( + config.platform["smoke"]["request_retries"] + * ( + config.platform["smoke"]["request_timeout_seconds"] + + config.platform["smoke"]["request_retry_delay_seconds"] + ) + ), + interval_seconds=5, ) - llmd_runtime.oc( - "delete", - "pod", - f"{benchmark_name}-copy", - "-n", - namespace, - "--ignore-not-found=true", - check=False, + finally: + capture_smoke_state(config) + + result = llmd_runtime.oc( + "logs", + f"job/{job_name}", + "-n", + namespace, + check=False, + capture_output=True, + ) + + if result.returncode != 0 or not result.stdout: + raise RuntimeError( + f"Smoke request job {job_name} completed but response logs could not be read: {result.stderr}" ) - events = llmd_runtime.oc( - "get", - "events", - "-n", - namespace, - "--sort-by=.metadata.creationTimestamp", - check=False, - capture_output=True, + + response = json.loads(result.stdout) + if not response.get("choices"): + raise RuntimeError(f"Invalid smoke response payload: {result.stdout}") + + llmd_runtime.write_json(config.artifact_dir / "artifacts" / "smoke.response.json", response) + ctx.smoke_response = response + return "Smoke request completed" + + +@task +def run_guidellm_benchmark_task(args, ctx): + """Run the GuideLLM benchmark when enabled for the preset""" + + if not ctx.config.benchmark: + return "GuideLLM benchmark disabled" + + config = ctx.config + benchmark_name = config.benchmark["job_name"] + namespace = config.namespace + + llmd_runtime.oc( + "delete", + "job,pvc", + benchmark_name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "pod", + f"{benchmark_name}-copy", + "-n", 
+ namespace, + "--ignore-not-found=true", + check=False, + ) + + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "guidellm-pvc.yaml", + llmd_runtime.render_guidellm_pvc(config), + ) + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "guidellm-job.yaml", + llmd_runtime.render_guidellm_job(config, ctx.endpoint_url), + ) + + def _job_terminal() -> dict[str, object] | None: + payload = llmd_runtime.oc_get_json("job", name=benchmark_name, namespace=namespace) + status = payload.get("status", {}) + if status.get("succeeded"): + return payload + if status.get("failed"): + raise RuntimeError(f"GuideLLM job {benchmark_name} failed") + return None + + llmd_runtime.wait_until( + f"GuideLLM job/{benchmark_name}", + timeout_seconds=config.benchmark["timeout_seconds"], + interval_seconds=10, + predicate=_job_terminal, + ) + + capture_guidellm_state(config) + copy_guidellm_results(config) + return f"GuideLLM benchmark {ctx.config.benchmark['job_name']} completed" + + +@always +@task +def capture_inference_service_state_task(args, ctx): + """Capture the LLMInferenceService state and related resources""" + + config = getattr(ctx, "config", None) + if not config: + return "Test inputs unavailable; skipping state capture" + + name = config.platform["inference_service"]["name"] + namespace = config.namespace + artifacts_dir = config.artifact_dir / "artifacts" + selector = f"app.kubernetes.io/name={name}" + + capture_get( + "llminferenceservice", + name, + namespace, + "yaml", + artifacts_dir / "llminferenceservice.yaml", + ) + capture_get( + "llminferenceservice", + name, + namespace, + "json", + artifacts_dir / "llminferenceservice.json", + ) + capture_get( + "pods", + None, + namespace, + "yaml", + artifacts_dir / "llminferenceservice.pods.yaml", + selector=selector, + ) + capture_get( + "deployments", + None, + namespace, + "yaml", + artifacts_dir / "llminferenceservice.deployments.yaml", + selector=selector, + ) + capture_get( + "replicasets", + None, + namespace, + "yaml", + artifacts_dir / "llminferenceservice.replicasets.yaml", + selector=selector, + ) + capture_get("pods", None, namespace, "wide", artifacts_dir / "namespace.pods.status") + capture_get("services", None, namespace, "wide", artifacts_dir / "namespace.services.status") + + pod_list = llmd_runtime.oc_get_json( + "pods", namespace=namespace, selector=selector, ignore_not_found=True + ) + if pod_list: + lines = [] + previous_lines = [] + for pod in pod_list.get("items", []): + pod_name = pod["metadata"]["name"] + lines.append(f"=== {pod_name} ===") + log_result = llmd_runtime.oc( + "logs", + pod_name, + "-n", + namespace, + "--all-containers=true", + check=False, + capture_output=True, + ) + if log_result.stdout: + lines.append(log_result.stdout.rstrip()) + + previous_lines.append(f"=== {pod_name} ===") + previous_result = llmd_runtime.oc( + "logs", + pod_name, + "-n", + namespace, + "--previous", + "--all-containers=true", + check=False, + capture_output=True, + ) + if previous_result.stdout: + previous_lines.append(previous_result.stdout.rstrip()) + + llmd_runtime.write_text( + artifacts_dir / "llminferenceservice.pods.logs", "\n".join(lines) + "\n" + ) + llmd_runtime.write_text( + artifacts_dir / "llminferenceservice.pods.previous.logs", + "\n".join(previous_lines) + "\n", + ) + return "Inference-service artifacts captured" + + +@always +@task +def write_endpoint_url_task(args, ctx): + """Persist the resolved endpoint URL when available""" + + config = getattr(ctx, "config", None) + if not config: + return "Test 
inputs unavailable; skipping endpoint capture" + + endpoint_url = getattr(ctx, "endpoint_url", None) + if not endpoint_url: + return "Endpoint URL not available" + + llmd_runtime.write_text(config.artifact_dir / "artifacts" / "endpoint.url", f"{endpoint_url}\n") + return "Endpoint URL captured" + + +@always +@task +def cleanup_runtime_resources_task(args, ctx): + """Delete smoke and benchmark helper resources""" + + config = getattr(ctx, "config", None) + if not config: + return "Test inputs unavailable; skipping cleanup" + + benchmark_name = config.benchmark["job_name"] if config.benchmark else "guidellm-benchmark" + smoke_job_name = config.platform["smoke"]["job_name"] + namespace = config.namespace + + llmd_runtime.oc( + "delete", + "job", + smoke_job_name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "job,pvc", + benchmark_name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "pod", + f"{benchmark_name}-copy", + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + return "Test helper resources deleted" + + +@always +@task +def capture_namespace_events_task(args, ctx): + """Capture namespace events after the test run""" + + config = getattr(ctx, "config", None) + if not config: + return "Test inputs unavailable; skipping namespace events capture" + + events = llmd_runtime.oc( + "get", + "events", + "-n", + config.namespace, + "--sort-by=.metadata.creationTimestamp", + check=False, + capture_output=True, + ) + if events.returncode == 0 and events.stdout: + llmd_runtime.write_text( + config.artifact_dir / "artifacts" / "namespace.events.txt", events.stdout + ) + return "Namespace events captured" + + +def cleanup_runtime_resources(config: phase_inputs.TestInputs) -> None: + benchmark_name = config.benchmark["job_name"] if config.benchmark else "guidellm-benchmark" + smoke_job_name = config.platform["smoke"]["job_name"] + namespace = config.namespace + + llmd_runtime.oc( + "delete", + "job", + smoke_job_name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "job,pvc", + benchmark_name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "pod", + f"{benchmark_name}-copy", + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + + +def capture_namespace_events(config: phase_inputs.TestInputs) -> None: + events = llmd_runtime.oc( + "get", + "events", + "-n", + config.namespace, + "--sort-by=.metadata.creationTimestamp", + check=False, + capture_output=True, + ) + if events.returncode == 0 and events.stdout: + llmd_runtime.write_text( + config.artifact_dir / "artifacts" / "namespace.events.txt", events.stdout ) - if events.returncode == 0 and events.stdout: - llmd_runtime.write_text(artifacts_dir / "namespace.events.txt", events.stdout) -def deploy_inference_service(config: llmd_runtime.ResolvedConfig) -> str: +def deploy_inference_service(config: phase_inputs.TestInputs) -> str: name = config.platform["inference_service"]["name"] namespace = config.namespace selector = f"app.kubernetes.io/name={name}" @@ -143,7 +533,7 @@ def _service_ready() -> bool: ) -def resolve_endpoint_url(config: llmd_runtime.ResolvedConfig) -> str: +def resolve_endpoint_url(config: phase_inputs.TestInputs) -> str: endpoint_url = try_resolve_endpoint_url(config) if endpoint_url: return endpoint_url @@ -155,7 +545,7 @@ def resolve_endpoint_url(config: llmd_runtime.ResolvedConfig) 
-> str: ) -def try_resolve_endpoint_url(config: llmd_runtime.ResolvedConfig) -> str | None: +def try_resolve_endpoint_url(config: phase_inputs.TestInputs) -> str | None: name = config.platform["inference_service"]["name"] namespace = config.namespace gateway_name = config.platform["gateway"]["status_address_name"] @@ -167,7 +557,7 @@ def try_resolve_endpoint_url(config: llmd_runtime.ResolvedConfig) -> str | None: return None -def run_smoke_request(config: llmd_runtime.ResolvedConfig, endpoint_url: str) -> dict[str, object]: +def run_smoke_request(config: phase_inputs.TestInputs, endpoint_url: str) -> dict[str, object]: namespace = config.namespace job_name = config.platform["smoke"]["job_name"] @@ -236,7 +626,7 @@ def run_smoke_request(config: llmd_runtime.ResolvedConfig, endpoint_url: str) -> return response -def capture_smoke_state(config: llmd_runtime.ResolvedConfig) -> None: +def capture_smoke_state(config: phase_inputs.TestInputs) -> None: job_name = config.platform["smoke"]["job_name"] namespace = config.namespace artifacts_dir = config.artifact_dir / "artifacts" @@ -262,7 +652,7 @@ def capture_smoke_state(config: llmd_runtime.ResolvedConfig) -> None: llmd_runtime.write_text(artifacts_dir / "smoke_job.logs", result.stdout) -def run_guidellm_benchmark(config: llmd_runtime.ResolvedConfig, endpoint_url: str) -> None: +def run_guidellm_benchmark(config: phase_inputs.TestInputs, endpoint_url: str) -> None: benchmark_name = config.benchmark["job_name"] namespace = config.namespace @@ -314,7 +704,7 @@ def _job_terminal() -> dict[str, object] | None: copy_guidellm_results(config) -def copy_guidellm_results(config: llmd_runtime.ResolvedConfig) -> None: +def copy_guidellm_results(config: phase_inputs.TestInputs) -> None: benchmark_name = config.benchmark["job_name"] namespace = config.namespace pod_data = llmd_runtime.oc_get_json( @@ -369,7 +759,7 @@ def _helper_ready() -> bool: ) -def capture_inference_service_state(config: llmd_runtime.ResolvedConfig) -> None: +def capture_inference_service_state(config: phase_inputs.TestInputs) -> None: name = config.platform["inference_service"]["name"] namespace = config.namespace artifacts_dir = config.artifact_dir / "artifacts" @@ -460,7 +850,7 @@ def capture_inference_service_state(config: llmd_runtime.ResolvedConfig) -> None ) -def capture_guidellm_state(config: llmd_runtime.ResolvedConfig) -> None: +def capture_guidellm_state(config: phase_inputs.TestInputs) -> None: benchmark_name = config.benchmark["job_name"] namespace = config.namespace artifacts_dir = config.artifact_dir / "artifacts" From 2098bcd101a7dfe59424ffde4dd5811fe26a008a Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Sun, 3 May 2026 20:48:26 +0100 Subject: [PATCH 15/21] chore: Reorder tests within project Signed-off-by: Alberto Perdomo --- .github/workflows/test_toolbox_dsl.yml | 6 +++--- {tests/llm_d => projects/llm_d/tests}/test_runtime.py | 0 pyproject.toml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) rename {tests/llm_d => projects/llm_d/tests}/test_runtime.py (100%) diff --git a/.github/workflows/test_toolbox_dsl.yml b/.github/workflows/test_toolbox_dsl.yml index 0946fca0..d93faa2b 100644 --- a/.github/workflows/test_toolbox_dsl.yml +++ b/.github/workflows/test_toolbox_dsl.yml @@ -1,5 +1,5 @@ -# Unit tests for projects/core/dsl (task decorators, execute_tasks, failure/always/skip). -name: Toolbox DSL tests +# Python tests for repo-managed suites discovered via pyproject testpaths. 
+name: Python test suites on: pull_request: @@ -31,7 +31,7 @@ jobs: python -m pip install --upgrade pip python -m pip install pytest pyyaml jinja2 jsonpath_ng - - name: Run projects/core/tests + - name: Run pytest suites run: | set -o errexit # Tree + docstrings (what is being tested), then execute with one line per test + result. diff --git a/tests/llm_d/test_runtime.py b/projects/llm_d/tests/test_runtime.py similarity index 100% rename from tests/llm_d/test_runtime.py rename to projects/llm_d/tests/test_runtime.py diff --git a/pyproject.toml b/pyproject.toml index c6632bf9..b2b061f6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -125,7 +125,7 @@ ignore = [ [tool.pytest.ini_options] minversion = "7.0" addopts = "-ra -q --strict-markers --strict-config" -testpaths = ["projects/core/tests"] +testpaths = ["projects/core/tests", "projects/llm_d/tests"] python_files = ["test_*.py", "*_test.py"] python_classes = ["Test*"] python_functions = ["test_*"] From 7698ebe95fdc0345b9670325c843c6958f99a8a7 Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Mon, 4 May 2026 10:20:59 +0100 Subject: [PATCH 16/21] fix: Install Forge dependencies for pytest CI --- .github/workflows/test_toolbox_dsl.yml | 2 +- pyproject.toml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test_toolbox_dsl.yml b/.github/workflows/test_toolbox_dsl.yml index d93faa2b..73faeadb 100644 --- a/.github/workflows/test_toolbox_dsl.yml +++ b/.github/workflows/test_toolbox_dsl.yml @@ -29,7 +29,7 @@ jobs: run: | set -o errexit python -m pip install --upgrade pip - python -m pip install pytest pyyaml jinja2 jsonpath_ng + python -m pip install .[dev] - name: Run pytest suites run: | diff --git a/pyproject.toml b/pyproject.toml index b2b061f6..139c1bc7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ dependencies = [ "plotly>=5.17.0", "dash>=2.14.0", "dash-bootstrap-components>=1.5.0", + "jinja2", "pyyaml>=6.0", "jsonschema>=4.19.0", "structlog>=23.1.0", From e1f88620ac46a9c5086ca57672ff5f1b0b100405 Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Tue, 5 May 2026 08:19:40 +0100 Subject: [PATCH 17/21] refactor: Move llm_d shared runtime out of orchestration --- projects/core/library/config.py | 17 +++++ projects/llm_d/orchestration/ci.py | 17 ++++- projects/llm_d/orchestration/cli.py | 17 ++++- .../llmd_runtime.py | 76 ++++++------------- .../phase_inputs.py | 2 +- .../runtime_config.py | 60 +++++++++------ .../runtime_manifests.py | 2 +- .../runtime/scripts/download_hf_model.sh | 28 +++++++ .../runtime/scripts/extract_oci_model.sh | 18 +++++ projects/llm_d/tests/test_runtime.py | 2 +- projects/llm_d/toolbox/cleanup/main.py | 2 +- projects/llm_d/toolbox/prepare/main.py | 2 +- .../llm_d/toolbox/prepare_model_cache/main.py | 2 +- projects/llm_d/toolbox/test/main.py | 2 +- 14 files changed, 158 insertions(+), 89 deletions(-) rename projects/llm_d/{orchestration => runtime}/llmd_runtime.py (91%) rename projects/llm_d/{orchestration => runtime}/phase_inputs.py (98%) rename projects/llm_d/{orchestration => runtime}/runtime_config.py (86%) rename projects/llm_d/{orchestration => runtime}/runtime_manifests.py (99%) create mode 100644 projects/llm_d/runtime/scripts/download_hf_model.sh create mode 100644 projects/llm_d/runtime/scripts/extract_oci_model.sh diff --git a/projects/core/library/config.py b/projects/core/library/config.py index 740e921c..02c69809 100644 --- a/projects/core/library/config.py +++ b/projects/core/library/config.py @@ -450,3 +450,20 @@ def init(orchestration_dir, 
*, apply_config_overrides=True): project.apply_config_overrides() project.apply_presets_from_project_args() project.apply_config_overrides() # reapply so that the value overrides are applied last + + +def reload(orchestration_dir, *, apply_config_overrides=True): + global project + + project = None + + artifact_config = env.ARTIFACT_DIR / "config.yaml" + if artifact_config.exists(): + artifact_config.unlink() + + presets_applied = env.ARTIFACT_DIR / "presets_applied" + if presets_applied.exists(): + presets_applied.unlink() + + init(orchestration_dir, apply_config_overrides=apply_config_overrides) + return project diff --git a/projects/llm_d/orchestration/ci.py b/projects/llm_d/orchestration/ci.py index ed02e0b2..bc5ae6f4 100644 --- a/projects/llm_d/orchestration/ci.py +++ b/projects/llm_d/orchestration/ci.py @@ -4,12 +4,13 @@ """ +import os import types import click from projects.core.library import ci as ci_lib -from projects.llm_d.orchestration import llmd_runtime, phase_inputs +from projects.llm_d.runtime import llmd_runtime, phase_inputs from projects.llm_d.toolbox.cleanup.main import run as cleanup_toolbox_run from projects.llm_d.toolbox.prepare.main import run as prepare_toolbox_run from projects.llm_d.toolbox.test.main import run as test_toolbox_run @@ -19,20 +20,28 @@ def init_runtime() -> None: llmd_runtime.init() +def load_runtime_configuration(): + return llmd_runtime.load_run_configuration( + requested_preset=os.environ.get("FORGE_PRESET"), + raw_overrides=os.environ.get("FORGE_CONFIG_OVERRIDES"), + job_name=os.environ.get("FORGE_JOB_NAME"), + ) + + def run_prepare_phase() -> int: - config = llmd_runtime.load_run_configuration() + config = load_runtime_configuration() inputs_file = phase_inputs.write_prepare_inputs(config) return prepare_toolbox_run(inputs_file=str(inputs_file)) def run_test_phase() -> int: - config = llmd_runtime.load_run_configuration() + config = load_runtime_configuration() inputs_file = phase_inputs.write_test_inputs(config) return test_toolbox_run(inputs_file=str(inputs_file)) def run_cleanup_phase() -> int: - config = llmd_runtime.load_run_configuration() + config = load_runtime_configuration() inputs_file = phase_inputs.write_cleanup_inputs(config) return cleanup_toolbox_run(inputs_file=str(inputs_file)) diff --git a/projects/llm_d/orchestration/cli.py b/projects/llm_d/orchestration/cli.py index 02b2e549..fdb84fa9 100644 --- a/projects/llm_d/orchestration/cli.py +++ b/projects/llm_d/orchestration/cli.py @@ -1,12 +1,13 @@ #!/usr/bin/env python3 import logging +import os import types import click from projects.core.library.cli import safe_cli_command -from projects.llm_d.orchestration import llmd_runtime, phase_inputs +from projects.llm_d.runtime import llmd_runtime, phase_inputs from projects.llm_d.toolbox.cleanup.main import run as cleanup_toolbox_run from projects.llm_d.toolbox.prepare.main import run as prepare_toolbox_run from projects.llm_d.toolbox.test.main import run as test_toolbox_run @@ -18,20 +19,28 @@ def init_runtime() -> None: llmd_runtime.init() +def load_runtime_configuration(): + return llmd_runtime.load_run_configuration( + requested_preset=os.environ.get("FORGE_PRESET"), + raw_overrides=os.environ.get("FORGE_CONFIG_OVERRIDES"), + job_name=os.environ.get("FORGE_JOB_NAME"), + ) + + def run_prepare_phase() -> int: - config = llmd_runtime.load_run_configuration() + config = load_runtime_configuration() inputs_file = phase_inputs.write_prepare_inputs(config) return prepare_toolbox_run(inputs_file=str(inputs_file)) def run_test_phase() -> 
int: - config = llmd_runtime.load_run_configuration() + config = load_runtime_configuration() inputs_file = phase_inputs.write_test_inputs(config) return test_toolbox_run(inputs_file=str(inputs_file)) def run_cleanup_phase() -> int: - config = llmd_runtime.load_run_configuration() + config = load_runtime_configuration() inputs_file = phase_inputs.write_cleanup_inputs(config) return cleanup_toolbox_run(inputs_file=str(inputs_file)) diff --git a/projects/llm_d/orchestration/llmd_runtime.py b/projects/llm_d/runtime/llmd_runtime.py similarity index 91% rename from projects/llm_d/orchestration/llmd_runtime.py rename to projects/llm_d/runtime/llmd_runtime.py index 59b054e6..53d4662b 100644 --- a/projects/llm_d/orchestration/llmd_runtime.py +++ b/projects/llm_d/runtime/llmd_runtime.py @@ -7,11 +7,12 @@ import subprocess import time from collections.abc import Iterable +from pathlib import Path from typing import Any import yaml -from projects.llm_d.orchestration.runtime_config import ( +from projects.llm_d.runtime.runtime_config import ( CONFIG_DIR, ORCHESTRATION_DIR, ModelCacheSpec, @@ -33,7 +34,7 @@ write_text, write_yaml, ) -from projects.llm_d.orchestration.runtime_manifests import ( +from projects.llm_d.runtime.runtime_manifests import ( load_manifest_template, render_datasciencecluster, render_gateway, @@ -45,7 +46,7 @@ render_smoke_request_job, ) -LOGGER = logging.getLogger(__name__) +logger = logging.getLogger(__name__) __all__ = [ "CONFIG_DIR", @@ -118,7 +119,7 @@ def run_command( timeout_seconds: float | None = 300, ) -> subprocess.CompletedProcess[str]: cmd = [str(arg) for arg in args] - LOGGER.info("run: %s", " ".join(shlex.quote(arg) for arg in cmd)) + logger.info("run: %s", " ".join(shlex.quote(arg) for arg in cmd)) try: result = subprocess.run( cmd, @@ -129,7 +130,7 @@ def run_command( timeout=timeout_seconds, ) except subprocess.TimeoutExpired: - LOGGER.error( + logger.error( "Command timed out after %ss: %s", timeout_seconds, " ".join(shlex.quote(arg) for arg in cmd), @@ -138,9 +139,9 @@ def run_command( if capture_output: if result.stdout: - LOGGER.info("stdout:\n%s", result.stdout.rstrip()) + logger.info("stdout:\n%s", result.stdout.rstrip()) if result.stderr: - LOGGER.info("stderr:\n%s", result.stderr.rstrip()) + logger.info("stderr:\n%s", result.stderr.rstrip()) if check and result.returncode != 0: raise CommandError( @@ -247,7 +248,7 @@ def wait_until( if isinstance(exc, RuntimeError): raise last_error = exc - LOGGER.info("waiting for %s: %s", description, exc) + logger.info("waiting for %s: %s", description, exc) time.sleep(interval_seconds) if last_error: @@ -339,7 +340,7 @@ def ensure_subscription(operator_spec: dict[str, Any]) -> None: ignore_not_found=True, ) if current and not subscription_spec_matches(current.get("spec", {}), subscription["spec"]): - LOGGER.info("Reconciling subscription drift for %s in %s", package, namespace) + logger.info("Reconciling subscription drift for %s in %s", package, namespace) oc("apply", "-f", "-", input_text=yaml.safe_dump(subscription, sort_keys=False)) @@ -384,7 +385,13 @@ def subscription_spec_matches(actual: dict[str, Any], expected: dict[str, Any]) def operator_spec_by_package(platform: dict[str, Any], package: str) -> dict[str, Any]: - for operator_spec in platform["operators"]: + operators = platform["operators"] + if isinstance(operators, dict): + if package in operators: + return {"package": package, **operators[package]} + raise KeyError(f"Unknown operator package in llm_d platform config: {package}") + + for operator_spec in 
operators:
         if operator_spec["package"] == package:
             return operator_spec
     raise KeyError(f"Unknown operator package in llm_d platform config: {package}")
@@ -482,6 +489,11 @@ def resolve_default_serviceaccount_image_pull_secret(namespace: str) -> str | No
     return None
 
 
+def load_runtime_script(name: str) -> str:
+    script_path = Path(__file__).resolve().parent / "scripts" / name
+    return script_path.read_text(encoding="utf-8")
+
+
 def render_model_cache_job(config: ResolvedConfig, spec: ModelCacheSpec) -> dict[str, Any]:
     common_env = [
         {"name": "MODEL_SOURCE", "value": spec.source_uri},
@@ -494,32 +506,7 @@ def render_model_cache_job(config: ResolvedConfig, spec: ModelCacheSpec) -> dict
     ]
 
     if spec.source_scheme == "hf":
-        command = """
-set -euo pipefail
-mkdir -p "${MODEL_TARGET_DIR}"
-rm -rf "${MODEL_TARGET_DIR}"/*
-python -m pip install --quiet --no-cache-dir 'huggingface_hub[hf_xet]'
-python - <<'PY'
-import os
-from huggingface_hub import snapshot_download
-
-token = None
-token_file = os.environ.get("HF_TOKEN_FILE")
-if token_file and os.path.exists(token_file):
-    with open(token_file, encoding="utf-8") as handle:
-        token = handle.read().strip() or None
-
-snapshot_download(
-    repo_id=os.environ["MODEL_SOURCE"][5:],
-    local_dir=os.environ["MODEL_TARGET_DIR"],
-    local_dir_use_symlinks=False,
-    token=token,
-)
-PY
-cat > "${MARKER_FILE}" <<EOF
-${MODEL_SOURCE}
-EOF
-"""
+        command = load_runtime_script("download_hf_model.sh")
@@ -530,22 +517,7 @@ def render_model_cache_job(config: ResolvedConfig, spec: ModelCacheSpec) -> dict
         spec.oci_registry_auth_secret_name
         or resolve_default_serviceaccount_image_pull_secret(spec.namespace)
     )
-        command = """
-set -euo pipefail
-mkdir -p "${MODEL_TARGET_DIR}"
-rm -rf "${MODEL_TARGET_DIR}"/*
-auth_args=()
-if [[ -n "${REGISTRY_AUTH_FILE:-}" && -f "${REGISTRY_AUTH_FILE}" ]]; then
-  auth_args+=(--registry-config="${REGISTRY_AUTH_FILE}")
-fi
-oc image extract "${MODEL_SOURCE#oci://}" \
-  --path "${OCI_IMAGE_PATH}:${MODEL_TARGET_DIR}" \
-  --confirm \
-  "${auth_args[@]}"
-cat > "${MARKER_FILE}" <<EOF
-${MODEL_SOURCE}
-EOF
-"""
+        command = load_runtime_script("extract_oci_model.sh")
diff --git a/projects/llm_d/orchestration/runtime_config.py b/projects/llm_d/runtime/runtime_config.py
similarity index 86%
rename from projects/llm_d/orchestration/runtime_config.py
rename to projects/llm_d/runtime/runtime_config.py
--- a/projects/llm_d/orchestration/runtime_config.py
+++ b/projects/llm_d/runtime/runtime_config.py
@@ -96,23 +98,28 @@
 def load_run_configuration(
-    *, cwd: Path | None = None, artifact_dir: Path | None = None
+    *,
+    cwd: Path | None = None,
+    artifact_dir: Path | None = None,
+    requested_preset: str | None = None,
+    raw_overrides: str | None = None,
+    job_name: str | None = None,
 ) -> ResolvedConfig:
     cwd = cwd or Path.cwd()
     if artifact_dir is not None:
         os.environ["ARTIFACT_DIR"] = str(artifact_dir)
     artifact_dir = init()
-    _reinitialize_project_config()
+    config.reload(ORCHESTRATION_DIR)
 
-    platform_data = copy.deepcopy(config.project.get_config("platform"))
+    platform_data = normalize_platform_config(copy.deepcopy(config.project.get_config("platform")))
     model_cache = copy.deepcopy(config.project.get_config("model_cache"))
     fournos_config = load_fournos_config(cwd)
 
     overrides = parse_overrides(
-        os.environ.get("FORGE_CONFIG_OVERRIDES", ""),
+        raw_overrides or "",
         allowed_keys=config.project.get_config("runtime.allowed_override_keys", []),
     )
     requested_preset = (
-        fournos_config.get("preset")
-        or os.environ.get("FORGE_PRESET")
+        requested_preset
+        or fournos_config.get("preset")
         or config.project.get_config("runtime.default_preset")
     )
     apply_requested_preset(requested_preset)
@@ -136,19 +143,20 @@ def load_run_configuration(
         config.project.get_config(f"workloads.benchmarks.{benchmark_name}")
     )
 
-    job_name = fournos_config.get("job-name") or os.environ.get("FORGE_JOB_NAME")
+    job_name = job_name or fournos_config.get("job-name")
    if not job_name:
         job_name = f"local-{preset_name}"
 
     namespace_override = overrides.get("namespace") or fournos_config.get("namespace")
-    default_namespace = platform_data["cluster"].get("namespace_name")
+    namespace_config = platform_data["cluster"]["namespace"]
+    default_namespace = namespace_config.get("name")
     namespace = (
         namespace_override
         or default_namespace
         or derive_namespace(
             job_name,
-            platform_data["cluster"]["namespace_prefix"],
-            platform_data["cluster"]["namespace_max_length"],
+            namespace_config["prefix"],
+            namespace_config["max_length"],
         )
     )
 
@@ -177,17 +185,25 @@ def load_run_configuration(
     )
 
 
-def _reinitialize_project_config() -> None:
-    config.project = None
-    artifact_config = env.ARTIFACT_DIR / "config.yaml"
-    if artifact_config.exists():
-        artifact_config.unlink()
+def normalize_platform_config(platform_data: dict[str, Any]) -> dict[str, Any]:
+    cluster = platform_data["cluster"]
+    if "namespace" not in cluster:
+        cluster["namespace"] = {
+            "name": cluster.pop("namespace_name", None),
+            "prefix": cluster.pop("namespace_prefix"),
+            "max_length": cluster.pop("namespace_max_length"),
+        }
 
-    presets_applied = env.ARTIFACT_DIR / "presets_applied"
-    if presets_applied.exists():
-        presets_applied.unlink()
+    operators = platform_data["operators"]
+    if isinstance(operators, list):
+        platform_data["operators"] = {
+            operator_spec["package"]: {
+                key: value for key, value in operator_spec.items() if key != "package"
+            }
+            for operator_spec in operators
+        }
 
-    config.init(ORCHESTRATION_DIR)
+    return platform_data
 
 
 def apply_requested_preset(requested_preset: str) -> None:
@@ -239,7 +255,7 @@ def normalize_gpu_count(value: Any) -> int | None:
     try:
         return int(value)
     except (TypeError, ValueError):
-        LOGGER.warning("Ignoring invalid gpu-count value: %s", value)
+        logger.warning("Ignoring invalid gpu-count value: %s", value)
         return None
diff --git a/projects/llm_d/orchestration/runtime_manifests.py b/projects/llm_d/runtime/runtime_manifests.py
similarity index 99%
rename from projects/llm_d/orchestration/runtime_manifests.py
rename to projects/llm_d/runtime/runtime_manifests.py
index 0c72a88e..bc5fdca8 100644
--- a/projects/llm_d/orchestration/runtime_manifests.py
+++ b/projects/llm_d/runtime/runtime_manifests.py
@@ -4,7 +4,7 @@
 import json
 from typing import Any
 
-from projects.llm_d.orchestration.runtime_config import (
+from projects.llm_d.runtime.runtime_config import (
     ModelCacheSpec,
     ResolvedConfig,
     load_yaml,
diff --git a/projects/llm_d/runtime/scripts/download_hf_model.sh b/projects/llm_d/runtime/scripts/download_hf_model.sh
new file mode 100644
index 00000000..9623d2aa
--- /dev/null
+++ b/projects/llm_d/runtime/scripts/download_hf_model.sh
@@ -0,0 +1,28 @@
+set -euo pipefail
+
+mkdir -p "${MODEL_TARGET_DIR}"
+rm -rf "${MODEL_TARGET_DIR}"/*
+
+python -m pip install --quiet --no-cache-dir 'huggingface_hub[hf_xet]'
+python - <<'PY'
+import os
+
+from huggingface_hub import snapshot_download
+
+token = None
+token_file = os.environ.get("HF_TOKEN_FILE")
+if token_file and os.path.exists(token_file):
+    with open(token_file, encoding="utf-8") as handle:
+        token = handle.read().strip() or None
+
+snapshot_download(
+    repo_id=os.environ["MODEL_SOURCE"][5:],
+    local_dir=os.environ["MODEL_TARGET_DIR"],
+    local_dir_use_symlinks=False,
+    token=token,
+)
+PY
+
+cat > "${MARKER_FILE}" <<EOF
+${MODEL_SOURCE}
+EOF
diff --git a/projects/llm_d/runtime/scripts/extract_oci_model.sh b/projects/llm_d/runtime/scripts/extract_oci_model.sh
new file mode 100644
--- /dev/null
+++ b/projects/llm_d/runtime/scripts/extract_oci_model.sh
@@ -0,0 +1,18 @@
+set -euo pipefail
+
+mkdir -p "${MODEL_TARGET_DIR}"
+rm -rf "${MODEL_TARGET_DIR}"/*
+
+auth_args=()
+if [[ -n "${REGISTRY_AUTH_FILE:-}" && -f "${REGISTRY_AUTH_FILE}" ]]; then
+  auth_args+=(--registry-config="${REGISTRY_AUTH_FILE}")
+fi
+
+oc image extract "${MODEL_SOURCE#oci://}" \
+  --path "${OCI_IMAGE_PATH}:${MODEL_TARGET_DIR}" \
+  --confirm \
+  "${auth_args[@]}"
+
+cat > "${MARKER_FILE}" <<EOF
+${MODEL_SOURCE}
+EOF
diff --git a/projects/llm_d/toolbox/prepare/main.py b/projects/llm_d/toolbox/prepare/main.py
index 6851bf4b..34b23478 100644
--- a/projects/llm_d/toolbox/prepare/main.py
+++ b/projects/llm_d/toolbox/prepare/main.py
@@ -7,7 +7,7 @@
 from pathlib import Path
 
 from projects.core.dsl import execute_tasks, shell, task, toolbox
-from projects.llm_d.orchestration import llmd_runtime, phase_inputs
+from 
projects.llm_d.runtime import llmd_runtime, phase_inputs from projects.llm_d.toolbox.prepare_model_cache import main as prepare_model_cache LOGGER = logging.getLogger(__name__) diff --git a/projects/llm_d/toolbox/prepare_model_cache/main.py b/projects/llm_d/toolbox/prepare_model_cache/main.py index f698ef0c..73cfc24e 100644 --- a/projects/llm_d/toolbox/prepare_model_cache/main.py +++ b/projects/llm_d/toolbox/prepare_model_cache/main.py @@ -5,7 +5,7 @@ import logging from projects.core.dsl import execute_tasks, task, toolbox -from projects.llm_d.orchestration import llmd_runtime, phase_inputs +from projects.llm_d.runtime import llmd_runtime, phase_inputs LOGGER = logging.getLogger(__name__) diff --git a/projects/llm_d/toolbox/test/main.py b/projects/llm_d/toolbox/test/main.py index 5941d03a..609c9e46 100644 --- a/projects/llm_d/toolbox/test/main.py +++ b/projects/llm_d/toolbox/test/main.py @@ -7,7 +7,7 @@ from pathlib import Path from projects.core.dsl import always, execute_tasks, task, toolbox -from projects.llm_d.orchestration import llmd_runtime, phase_inputs +from projects.llm_d.runtime import llmd_runtime, phase_inputs LOGGER = logging.getLogger(__name__) From 5ca7bbdc3ffcbc6d52b0d756427664fa06645e7a Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Tue, 5 May 2026 08:20:13 +0100 Subject: [PATCH 18/21] refactor: Normalize llm_d project configuration layout --- .../orchestration/config.d/platform.yaml | 27 +++++++++--------- .../llm_d/orchestration/config.d/project.yaml | 2 ++ projects/llm_d/orchestration/config.yaml | 3 -- projects/llm_d/tests/test_runtime.py | 28 +++++++++++++++++-- 4 files changed, 42 insertions(+), 18 deletions(-) create mode 100644 projects/llm_d/orchestration/config.d/project.yaml delete mode 100644 projects/llm_d/orchestration/config.yaml diff --git a/projects/llm_d/orchestration/config.d/platform.yaml b/projects/llm_d/orchestration/config.d/platform.yaml index 43092e7c..6f823eba 100644 --- a/projects/llm_d/orchestration/config.d/platform.yaml +++ b/projects/llm_d/orchestration/config.d/platform.yaml @@ -1,8 +1,9 @@ cluster: minimum_openshift_version: "4.19.9" - namespace_name: forge-llm-d - namespace_prefix: llm-d - namespace_max_length: 63 + namespace: + name: forge-llm-d + prefix: llm-d + max_length: 63 cleanup_timeout_seconds: 900 gpu_node_label_selector: nvidia.com/gpu.present=true nfd_gpu_detection_labels: @@ -11,36 +12,36 @@ cluster: - feature.node.kubernetes.io/pci-0300_10de.present operators: - - display_name: OpenShift Cert Manager - package: openshift-cert-manager-operator + openshift-cert-manager-operator: + display_name: OpenShift Cert Manager namespace: openshift-cert-manager-operator channel: stable-v1.18 source: redhat-operators wait_timeout_seconds: 900 - - display_name: Leader Worker Set - package: leader-worker-set + leader-worker-set: + display_name: Leader Worker Set namespace: openshift-lws channel: stable source: redhat-operators wait_timeout_seconds: 900 - - display_name: Node Feature Discovery - package: nfd + nfd: + display_name: Node Feature Discovery namespace: openshift-nfd channel: stable source: redhat-operators wait_timeout_seconds: 900 bootstrap_crd: nodefeaturediscoveries.nfd.openshift.io bootstrap_manifest: manifests/nfd-nodefeaturediscovery.yaml - - display_name: NVIDIA GPU Operator - package: gpu-operator-certified + gpu-operator-certified: + display_name: NVIDIA GPU Operator namespace: nvidia-gpu-operator channel: stable source: certified-operators wait_timeout_seconds: 1800 bootstrap_crd: clusterpolicies.nvidia.com 
bootstrap_manifest: manifests/gpu-clusterpolicy.yaml - - display_name: Red Hat OpenShift AI - package: rhods-operator + rhods-operator: + display_name: Red Hat OpenShift AI namespace: redhat-ods-operator channel: stable-3.x source: redhat-operators diff --git a/projects/llm_d/orchestration/config.d/project.yaml b/projects/llm_d/orchestration/config.d/project.yaml new file mode 100644 index 00000000..f957c25d --- /dev/null +++ b/projects/llm_d/orchestration/config.d/project.yaml @@ -0,0 +1,2 @@ +name: llm_d +args: [] diff --git a/projects/llm_d/orchestration/config.yaml b/projects/llm_d/orchestration/config.yaml deleted file mode 100644 index c36dfa60..00000000 --- a/projects/llm_d/orchestration/config.yaml +++ /dev/null @@ -1,3 +0,0 @@ -project: - name: llm_d - args: [] diff --git a/projects/llm_d/tests/test_runtime.py b/projects/llm_d/tests/test_runtime.py index 32f31f8e..018f314d 100644 --- a/projects/llm_d/tests/test_runtime.py +++ b/projects/llm_d/tests/test_runtime.py @@ -62,15 +62,21 @@ def test_load_run_configuration_consolidates_config_d( assert "runtime" in consolidated assert "scheduler_profiles" in consolidated assert "workloads" in consolidated + assert consolidated["project"]["name"] == "llm_d" assert consolidated["runtime"]["default_preset"] == "smoke" + assert consolidated["platform"]["cluster"]["namespace"]["name"] == "forge-llm-d" + assert isinstance(consolidated["platform"]["operators"], dict) def test_namespace_override_is_not_managed(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", '{"namespace":"custom-ns"}') artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() - config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + config = llmd_runtime.load_run_configuration( + cwd=tmp_path, + artifact_dir=artifact_dir, + raw_overrides='{"namespace":"custom-ns"}', + ) assert config.namespace == "custom-ns" assert config.namespace_is_managed is False @@ -91,6 +97,24 @@ def test_default_namespace_comes_from_project_config( assert config.namespace == "forge-llm-d" assert config.namespace_is_managed is False + assert config.platform["cluster"]["namespace"]["prefix"] == "llm-d" + assert "rhods-operator" in config.platform["operators"] + + +def test_load_run_configuration_ignores_runtime_env_vars( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", '{"namespace":"ignored-ns"}') + monkeypatch.setenv("FORGE_PRESET", "benchmark-short") + monkeypatch.setenv("FORGE_JOB_NAME", "ignored-job") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + + assert config.preset_name == "smoke" + assert config.namespace == "forge-llm-d" + assert config.job_name == "local-smoke" def test_write_prepare_inputs_round_trip(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: From d5186a71c083d6c1ca180aadcf38a784868d40d2 Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Tue, 5 May 2026 09:03:09 +0100 Subject: [PATCH 19/21] test: Align llm_d runtime coverage with explicit inputs --- projects/llm_d/tests/test_runtime.py | 52 +++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/projects/llm_d/tests/test_runtime.py b/projects/llm_d/tests/test_runtime.py index 018f314d..6e835b4c 100644 --- a/projects/llm_d/tests/test_runtime.py +++ b/projects/llm_d/tests/test_runtime.py @@ -27,7 +27,6 @@ def 
test_parse_overrides_rejects_unknown_keys() -> None: def test_load_run_configuration_resolves_alias( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() @@ -49,7 +48,6 @@ def test_load_run_configuration_resolves_alias( def test_load_run_configuration_consolidates_config_d( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() @@ -85,7 +83,6 @@ def test_namespace_override_is_not_managed(tmp_path: Path, monkeypatch: pytest.M def test_default_namespace_comes_from_project_config( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() (tmp_path / "fournos_config.yaml").write_text( @@ -184,13 +181,16 @@ def test_write_test_inputs_round_trip(tmp_path: Path, monkeypatch: pytest.Monkey def test_orchestration_prepare_writes_inputs_and_invokes_toolbox( orchestration, tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) captured: dict[str, str] = {} - monkeypatch.setattr(orchestration.llmd_runtime, "load_run_configuration", lambda: config) + monkeypatch.setattr( + orchestration.llmd_runtime, + "load_run_configuration", + lambda **_kwargs: config, + ) monkeypatch.setattr( orchestration, "prepare_toolbox_run", @@ -208,13 +208,16 @@ def test_orchestration_prepare_writes_inputs_and_invokes_toolbox( def test_orchestration_test_writes_inputs_and_invokes_toolbox( orchestration, tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) captured: dict[str, str] = {} - monkeypatch.setattr(orchestration.llmd_runtime, "load_run_configuration", lambda: config) + monkeypatch.setattr( + orchestration.llmd_runtime, + "load_run_configuration", + lambda **_kwargs: config, + ) monkeypatch.setattr( orchestration, "test_toolbox_run", @@ -233,13 +236,16 @@ def test_orchestration_test_writes_inputs_and_invokes_toolbox( def test_orchestration_cleanup_writes_inputs_and_invokes_toolbox( orchestration, tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) captured: dict[str, str] = {} - monkeypatch.setattr(orchestration.llmd_runtime, "load_run_configuration", lambda: config) + monkeypatch.setattr( + orchestration.llmd_runtime, + "load_run_configuration", + lambda **_kwargs: config, + ) monkeypatch.setattr( orchestration, "cleanup_toolbox_run", @@ -254,6 +260,34 @@ def test_orchestration_cleanup_writes_inputs_and_invokes_toolbox( assert loaded.platform == config.platform +@pytest.mark.parametrize("orchestration", [llmd_ci, llmd_cli]) +def test_orchestration_load_runtime_configuration_reads_env( + orchestration, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_PRESET", "smoke-precise") + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", '{"namespace":"custom-ns"}') + 
monkeypatch.setenv("FORGE_JOB_NAME", "job-from-env") + captured: dict[str, str | None] = {} + sentinel = object() + + def fake_load_run_configuration(**kwargs): + captured.update(kwargs) + return sentinel + + monkeypatch.setattr( + orchestration.llmd_runtime, "load_run_configuration", fake_load_run_configuration + ) + + result = orchestration.load_runtime_configuration() + + assert result is sentinel + assert captured == { + "requested_preset": "smoke-precise", + "raw_overrides": '{"namespace":"custom-ns"}', + "job_name": "job-from-env", + } + + def test_render_inference_service_injects_model_and_scheduler_profile( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: From 9a5f3332c3c5e8e83419b284e9954438da81b022 Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Tue, 5 May 2026 09:05:07 +0100 Subject: [PATCH 20/21] refactor: Deduplicate DSL toolbox path helpers --- projects/core/dsl/log.py | 30 +++++------------------------- projects/core/dsl/runtime.py | 23 +++++++---------------- 2 files changed, 12 insertions(+), 41 deletions(-) diff --git a/projects/core/dsl/log.py b/projects/core/dsl/log.py index b93a5076..dc28ffab 100644 --- a/projects/core/dsl/log.py +++ b/projects/core/dsl/log.py @@ -16,20 +16,17 @@ def setup_clean_logger(name: str): logger = logging.getLogger(name) logger.setLevel(logging.INFO) - # Only configure if not already configured if not logger.handlers: - # Create console handler with clean format console_handler = logging.StreamHandler() console_handler.setLevel(logging.INFO) console_handler.setFormatter(logging.Formatter("%(message)s")) logger.addHandler(console_handler) - logger.propagate = False # Don't propagate to root logger + logger.propagate = False return logger -# Configure clean logging for DSL operations logger = setup_clean_logger("DSL") @@ -45,30 +42,23 @@ def log_task_header(task_name: str, task_doc: str, rel_filename: str, line_no: i def log_execution_banner(function_args: dict = None, log_file: str = None): """Log the execution banner with function info and arguments""" - # Get the caller's filename and function name for the header frame = inspect.currentframe() - caller_frame = ( - frame.f_back.f_back - ) # Go back 2 frames (this func -> execute_tasks -> actual caller) + caller_frame = frame.f_back.f_back filename = caller_frame.f_code.co_filename rel_filename = _get_forge_relative_path(filename) - - # Use parent directory name as function name for toolbox operations function_name = _get_toolbox_function_name(filename) - # Print execution header logger.info("") logger.info("===============================================================================") logger.info(f"| FILE: {rel_filename}") logger.info(f"| COMMAND: {function_name}") if function_args: - # Display arguments in YAML format logger.info("| ARGUMENTS:") for key, value in function_args.items(): - if key == "function_args": # Skip the function_args parameter itself + if key == "function_args": continue if value is None: continue @@ -83,19 +73,13 @@ def log_execution_banner(function_args: dict = None, log_file: str = None): def log_completion_banner(function_args: dict = None, status: str = "SUCCESS"): """Log the completion banner with function info and completion status""" - # Get the caller's filename and function name for the header frame = inspect.currentframe() - caller_frame = ( - frame.f_back.f_back - ) # Go back 2 frames (this func -> execute_tasks -> actual caller) + caller_frame = frame.f_back.f_back filename = caller_frame.f_code.co_filename rel_filename = 
_get_forge_relative_path(filename) - - # Use parent directory name as function name for toolbox operations function_name = _get_toolbox_function_name(filename) - # Print completion header logger.info("") logger.info("===============================================================================") logger.info(f"| {rel_filename}") @@ -115,8 +99,4 @@ def _get_forge_relative_path(filename): def _get_toolbox_function_name(filename): """Extract toolbox function name from file path (parent directory name)""" - filename_path = Path(filename) - - # For paths like projects/llm_d/toolbox/capture_llmisvc_state/main.py - # Return the parent directory name: capture_llmisvc_state - return filename_path.parent.name + return Path(filename).parent.name diff --git a/projects/core/dsl/runtime.py b/projects/core/dsl/runtime.py index c8f807db..1c40c1df 100644 --- a/projects/core/dsl/runtime.py +++ b/projects/core/dsl/runtime.py @@ -15,7 +15,13 @@ from projects.core.library.run import SignalError from .context import create_task_parameters -from .log import log_completion_banner, log_execution_banner, logger +from .log import ( + _get_forge_relative_path, + _get_toolbox_function_name, + log_completion_banner, + log_execution_banner, + logger, +) from .script_manager import get_script_manager # Import from task.py to avoid circular imports @@ -401,18 +407,3 @@ def _generate_restart_script(function_args: dict, caller_frame, meta_dir): os.chmod(restart_file, 0o755) logger.debug(f"Generated restart script: {restart_file}") - - -def _get_forge_relative_path(filename): - """Get file path relative to FORGE home directory (forge root)""" - - return Path(filename).relative_to(env.FORGE_HOME) - - -def _get_toolbox_function_name(filename): - """Extract toolbox function name from file path (parent directory name)""" - filename_path = Path(filename) - - # For paths like projects/llm_d/toolbox/capture_llmisvc_state/main.py - # Return the parent directory name: capture_llmisvc_state - return filename_path.parent.name From 629d3bcaf4a898471d4eadbab903e4cacdf9a974 Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Tue, 5 May 2026 09:07:18 +0100 Subject: [PATCH 21/21] docs: Refresh llm_d layout references --- projects/llm_d/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/llm_d/README.md b/projects/llm_d/README.md index fd443121..82a108ac 100644 --- a/projects/llm_d/README.md +++ b/projects/llm_d/README.md @@ -10,7 +10,7 @@ The current implementation is intentionally narrow: Configuration layout: -- base config: [`orchestration/config.yaml`](./orchestration/config.yaml) +- project config chunk: [`orchestration/config.d/project.yaml`](./orchestration/config.d/project.yaml) - config chunks: [`orchestration/config.d`](./orchestration/config.d) - presets: [`orchestration/presets.d`](./orchestration/presets.d) - manifests: [`orchestration/manifests`](./orchestration/manifests) @@ -19,7 +19,7 @@ Main entrypoints: - CI phase wrapper: [`orchestration/ci.py`](./orchestration/ci.py) - CLI wrapper: [`orchestration/cli.py`](./orchestration/cli.py) -- Shared runtime/config loader: [`orchestration/llmd_runtime.py`](./orchestration/llmd_runtime.py) +- Shared runtime/config loader: [`runtime/llmd_runtime.py`](./runtime/llmd_runtime.py) - Toolbox prepare command: [`toolbox/prepare/main.py`](./toolbox/prepare/main.py) - Toolbox test command: [`toolbox/test/main.py`](./toolbox/test/main.py) - Toolbox cleanup command: [`toolbox/cleanup/main.py`](./toolbox/cleanup/main.py)
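
A minimal sketch of how the explicit-parameter seam from patches 17-19 can be driven outside the Forge CI wrappers. After these patches, the FORGE_* environment reads live only in ci.py/cli.py, and load_run_configuration is a function of its keyword arguments alone. The artifact path, override value, and job name below are illustrative values, not repository defaults:

    import os
    from pathlib import Path

    from projects.llm_d.runtime import llmd_runtime, phase_inputs

    # Explicit keyword arguments replace the FORGE_* environment reads that
    # patch 17 removed from load_run_configuration; passing None falls back
    # to fournos_config.yaml and the project defaults.
    config = llmd_runtime.load_run_configuration(
        cwd=Path.cwd(),
        artifact_dir=Path("/tmp/llm-d-artifacts"),  # exported as ARTIFACT_DIR before init()
        requested_preset=os.environ.get("FORGE_PRESET"),  # e.g. "smoke"
        raw_overrides='{"namespace": "custom-ns"}',  # validated against runtime.allowed_override_keys
        job_name="local-dev",  # otherwise derived as local-<preset>
    )

    # Each toolbox phase then consumes a serialized inputs file rather than
    # the live ResolvedConfig object.
    inputs_file = phase_inputs.write_test_inputs(config)

This is also the seam the tests in patch 19 exercise: the orchestration wrappers are checked for forwarding the environment values, while load_run_configuration itself is exercised with explicit arguments only.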