From b7e48e353f88c1427f30ed19d5463ec6e754c057 Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Mon, 13 Apr 2026 13:56:30 +0100 Subject: [PATCH 01/21] feat: Initial commit Signed-off-by: Alberto Perdomo --- .../llm_d/manifests/datasciencecluster.yaml | 22 + .../epp-approximate-prefix-cache.yaml | 15 + config/llm_d/manifests/gateway.yaml | 14 + config/llm_d/manifests/gpu-clusterpolicy.yaml | 37 + .../llm_d/manifests/llminferenceservice.yaml | 96 +++ .../manifests/nfd-nodefeaturediscovery.yaml | 6 + config/llm_d/models.yaml | 25 + config/llm_d/platform.yaml | 82 +++ config/llm_d/presets.yaml | 14 + config/llm_d/workloads.yaml | 20 + projects/llm_d/README.md | 310 +------- projects/llm_d/orchestration/ci.py | 6 +- projects/llm_d/orchestration/cli.py | 25 +- projects/llm_d/orchestration/llmd_runtime.py | 695 ++++++++++++++++++ projects/llm_d/orchestration/prepare_llmd.py | 430 ++++++++++- projects/llm_d/orchestration/test_llmd.py | 492 ++++++++++++- .../llm_d/toolbox/capture_isvc_state/main.py | 45 +- tests/llm_d/test_runtime.py | 208 ++++++ 18 files changed, 2164 insertions(+), 378 deletions(-) create mode 100644 config/llm_d/manifests/datasciencecluster.yaml create mode 100644 config/llm_d/manifests/epp-approximate-prefix-cache.yaml create mode 100644 config/llm_d/manifests/gateway.yaml create mode 100644 config/llm_d/manifests/gpu-clusterpolicy.yaml create mode 100644 config/llm_d/manifests/llminferenceservice.yaml create mode 100644 config/llm_d/manifests/nfd-nodefeaturediscovery.yaml create mode 100644 config/llm_d/models.yaml create mode 100644 config/llm_d/platform.yaml create mode 100644 config/llm_d/presets.yaml create mode 100644 config/llm_d/workloads.yaml mode change 100755 => 100644 projects/llm_d/orchestration/ci.py mode change 100755 => 100644 projects/llm_d/orchestration/cli.py create mode 100644 projects/llm_d/orchestration/llmd_runtime.py mode change 100755 => 100644 projects/llm_d/toolbox/capture_isvc_state/main.py create mode 100644 tests/llm_d/test_runtime.py diff --git a/config/llm_d/manifests/datasciencecluster.yaml b/config/llm_d/manifests/datasciencecluster.yaml new file mode 100644 index 00000000..fd45316d --- /dev/null +++ b/config/llm_d/manifests/datasciencecluster.yaml @@ -0,0 +1,22 @@ +apiVersion: datasciencecluster.opendatahub.io/v1 +kind: DataScienceCluster +metadata: + name: default-dsc + namespace: redhat-ods-applications +spec: + components: + codeflare: + managementState: Removed + dashboard: + managementState: Removed + datasciencepipelines: + managementState: Removed + kserve: + managementState: Managed + rawDeploymentServiceConfig: Headless + modelmeshserving: + managementState: Removed + ray: + managementState: Removed + workbenches: + managementState: Removed diff --git a/config/llm_d/manifests/epp-approximate-prefix-cache.yaml b/config/llm_d/manifests/epp-approximate-prefix-cache.yaml new file mode 100644 index 00000000..e584dcf2 --- /dev/null +++ b/config/llm_d/manifests/epp-approximate-prefix-cache.yaml @@ -0,0 +1,15 @@ +apiVersion: inference.networking.x-k8s.io/v1alpha1 +kind: EndpointPickerConfig +plugins: + - type: queue-scorer + - type: kv-cache-utilization-scorer + - type: prefix-cache-scorer +schedulingProfiles: + - name: default + plugins: + - pluginRef: queue-scorer + weight: 2 + - pluginRef: kv-cache-utilization-scorer + weight: 2 + - pluginRef: prefix-cache-scorer + weight: 3 diff --git a/config/llm_d/manifests/gateway.yaml b/config/llm_d/manifests/gateway.yaml new file mode 100644 index 00000000..dff0c398 --- /dev/null +++ 
b/config/llm_d/manifests/gateway.yaml @@ -0,0 +1,14 @@ +apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway +metadata: + name: openshift-ai-inference + namespace: openshift-ingress +spec: + gatewayClassName: data-science-gateway-class + listeners: + - name: http + port: 80 + protocol: HTTP + allowedRoutes: + namespaces: + from: All diff --git a/config/llm_d/manifests/gpu-clusterpolicy.yaml b/config/llm_d/manifests/gpu-clusterpolicy.yaml new file mode 100644 index 00000000..6a9ad7ee --- /dev/null +++ b/config/llm_d/manifests/gpu-clusterpolicy.yaml @@ -0,0 +1,37 @@ +apiVersion: nvidia.com/v1 +kind: ClusterPolicy +metadata: + name: gpu-cluster-policy +spec: + daemonsets: + tolerations: + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + updateStrategy: RollingUpdate + dcgm: + enabled: true + dcgmExporter: + enabled: true + devicePlugin: + enabled: true + driver: + enabled: true + kernelModuleType: auto + gfd: + enabled: true + mig: + strategy: single + nodeStatusExporter: + enabled: true + operator: + defaultRuntime: crio + runtimeClass: nvidia + toolkit: + enabled: true + installDir: /usr/local/nvidia + validator: + plugin: + env: + - name: WITH_WORKLOAD + value: "false" diff --git a/config/llm_d/manifests/llminferenceservice.yaml b/config/llm_d/manifests/llminferenceservice.yaml new file mode 100644 index 00000000..cff616f8 --- /dev/null +++ b/config/llm_d/manifests/llminferenceservice.yaml @@ -0,0 +1,96 @@ +apiVersion: serving.kserve.io/v1alpha1 +kind: LLMInferenceService +metadata: + name: llm-d + namespace: llm-d + annotations: + security.opendatahub.io/enable-auth: "false" + prometheus.io/path: /metrics + prometheus.io/port: "8000" +spec: + replicas: 1 + model: + uri: hf://Qwen/Qwen3-0.6B + name: Qwen/Qwen3-0.6B + router: + scheduler: + template: + containers: + - name: main + env: + - name: TOKENIZER_CACHE_DIR + value: /tmp/tokenizer-cache + - name: HF_HOME + value: /tmp/tokenizer-cache + - name: TRANSFORMERS_CACHE + value: /tmp/tokenizer-cache + - name: XDG_CACHE_HOME + value: /tmp + args: + - --cert-path + - /var/run/kserve/tls + - --pool-group + - inference.networking.x-k8s.io + - --pool-name + - "{{ ChildName .ObjectMeta.Name `-inference-pool` }}" + - --pool-namespace + - "{{ .ObjectMeta.Namespace }}" + - --zap-encoder + - json + - --grpc-port + - "9002" + - --grpc-health-port + - "9003" + - --secure-serving + - --model-server-metrics-scheme + - https + - --config-text + volumeMounts: + - name: tokenizer-cache + mountPath: /tmp/tokenizer-cache + - name: cachi2-cache + mountPath: /cachi2 + volumes: + - name: tokenizer-cache + emptyDir: {} + - name: cachi2-cache + emptyDir: {} + nodeSelector: + nvidia.com/gpu.present: "true" + route: {} + gateway: {} + template: + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + containers: + - name: main + resources: + requests: + cpu: "4" + memory: 16Gi + nvidia.com/gpu: "1" + limits: + cpu: "4" + memory: 16Gi + nvidia.com/gpu: "1" + livenessProbe: + httpGet: + path: /health + port: 8000 + scheme: HTTPS + initialDelaySeconds: 900 + periodSeconds: 60 + timeoutSeconds: 60 + failureThreshold: 1000 + readinessProbe: + failureThreshold: 10000 + httpGet: + path: /health + port: 8000 + scheme: HTTPS + initialDelaySeconds: 60 + periodSeconds: 30 + successThreshold: 1 + timeoutSeconds: 30 diff --git a/config/llm_d/manifests/nfd-nodefeaturediscovery.yaml b/config/llm_d/manifests/nfd-nodefeaturediscovery.yaml new file mode 100644 index 00000000..df19596f --- /dev/null +++ 
b/config/llm_d/manifests/nfd-nodefeaturediscovery.yaml @@ -0,0 +1,6 @@ +apiVersion: nfd.openshift.io/v1 +kind: NodeFeatureDiscovery +metadata: + name: nfd-instance + namespace: openshift-nfd +spec: {} diff --git a/config/llm_d/models.yaml b/config/llm_d/models.yaml new file mode 100644 index 00000000..46cf4bf4 --- /dev/null +++ b/config/llm_d/models.yaml @@ -0,0 +1,25 @@ +models: + + qwen3-0-6b: + served_model_name: Qwen/Qwen3-0.6B + uri: hf://Qwen/Qwen3-0.6B + resources: + requests: + cpu: "4" + memory: 16Gi + nvidia.com/gpu: "1" + limits: + cpu: "4" + memory: 16Gi + nvidia.com/gpu: "1" + + llama-3-1-8b-instruct-fp8: + served_model_name: llama-3-1-8b-instruct-fp8 + uri: oci://registry.redhat.io/rhelai1/modelcar-llama-3-1-8b-instruct-fp8-dynamic:1.5 + resources: + requests: + cpu: "4" + memory: 8Gi + nvidia.com/gpu: "1" + limits: + nvidia.com/gpu: "1" diff --git a/config/llm_d/platform.yaml b/config/llm_d/platform.yaml new file mode 100644 index 00000000..c5e35ea4 --- /dev/null +++ b/config/llm_d/platform.yaml @@ -0,0 +1,82 @@ +cluster: + minimum_openshift_version: "4.19.9" + namespace_prefix: llm-d + namespace_max_length: 63 + cleanup_timeout_seconds: 900 + gpu_node_label_selector: nvidia.com/gpu.present=true + nfd_gpu_detection_labels: + - feature.node.kubernetes.io/pci-10de.present + - feature.node.kubernetes.io/pci-0302_10de.present + - feature.node.kubernetes.io/pci-0300_10de.present + +operators: + - display_name: OpenShift Cert Manager + package: openshift-cert-manager-operator + namespace: openshift-cert-manager-operator + channel: stable-v1.18 + source: redhat-operators + wait_timeout_seconds: 900 + - display_name: Leader Worker Set + package: leader-worker-set + namespace: openshift-lws + channel: stable + source: redhat-operators + wait_timeout_seconds: 900 + - display_name: Node Feature Discovery + package: nfd + namespace: openshift-nfd + channel: stable + source: redhat-operators + wait_timeout_seconds: 900 + bootstrap_crd: nodefeaturediscoveries.nfd.openshift.io + bootstrap_manifest: manifests/nfd-nodefeaturediscovery.yaml + - display_name: NVIDIA GPU Operator + package: gpu-operator-certified + namespace: nvidia-gpu-operator + channel: stable + source: certified-operators + wait_timeout_seconds: 1800 + bootstrap_crd: clusterpolicies.nvidia.com + bootstrap_manifest: manifests/gpu-clusterpolicy.yaml + - display_name: Red Hat OpenShift AI + package: rhods-operator + namespace: redhat-ods-operator + channel: stable-3.x + source: redhat-operators + wait_timeout_seconds: 1800 + +rhoai: + namespace: redhat-ods-applications + datasciencecluster_name: default-dsc + datasciencecluster_template: manifests/datasciencecluster.yaml + wait_timeout_seconds: 1800 + required_crds_before_dsc: + - datascienceclusters.datasciencecluster.opendatahub.io + required_crds_after_dsc: + - llminferenceservices.serving.kserve.io + +gateway: + namespace: openshift-ingress + name: openshift-ai-inference + gateway_class_name: data-science-gateway-class + status_address_name: gateway-external + create_if_missing: true + manifest_template: manifests/gateway.yaml + wait_timeout_seconds: 600 + +inference_service: + name: llm-d + template: manifests/llminferenceservice.yaml + epp_config_template: manifests/epp-approximate-prefix-cache.yaml + workload_deployment_name_suffix: -kserve + pod_appearance_timeout_seconds: 600 + ready_timeout_seconds: 1800 + delete_timeout_seconds: 900 + +artifacts: + capture_namespace_events: true + +smoke: + endpoint_path: /v1/completions + request_retries: 30 + 
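+  # 30 retries x 10s delay (below) gives the endpoint up to ~5 minutes to
+  # start answering once the LLMInferenceService reports Ready.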
request_retry_delay_seconds: 10 diff --git a/config/llm_d/presets.yaml b/config/llm_d/presets.yaml new file mode 100644 index 00000000..9fdaae32 --- /dev/null +++ b/config/llm_d/presets.yaml @@ -0,0 +1,14 @@ +aliases: + cks: smoke + +presets: + + smoke: + model: qwen3-0-6b + smoke_request: default + benchmark: null + + benchmark-short: + model: llama-3-1-8b-instruct-fp8 + smoke_request: default + benchmark: short diff --git a/config/llm_d/workloads.yaml b/config/llm_d/workloads.yaml new file mode 100644 index 00000000..f5ebbb85 --- /dev/null +++ b/config/llm_d/workloads.yaml @@ -0,0 +1,20 @@ +smoke_requests: + default: + prompt: San Francisco is a + max_tokens: 50 + temperature: 0.7 + +benchmarks: + + short: + job_name: guidellm-benchmark + image: ghcr.io/vllm-project/guidellm:v0.5.4 + pvc_size: 1Gi + timeout_seconds: 900 + rate: 1 + args: + backend_type: openai_http + rate_type: concurrent + max_seconds: 120 + sample_requests: 20 + data: prompt_tokens=256,output_tokens=128 diff --git a/projects/llm_d/README.md b/projects/llm_d/README.md index f254277f..d76634d9 100644 --- a/projects/llm_d/README.md +++ b/projects/llm_d/README.md @@ -1,304 +1,16 @@ -# Skeleton Project +# llm_d -This is a template/skeleton project that demonstrates how to create a new project within the **FORGE** test harness framework. +`llm_d` is the Forge project for validating downstream llm-d on RHOAI. -## Overview +The current implementation is intentionally narrow: -This skeleton shows the essential structure and patterns for building projects that comply with FORGE's constitutional principles: +- target only downstream `LLMInferenceService` +- keep the public interface compatible with current Fournos phase execution +- use checked-in presets and manifests instead of a large mutable config surface -- **CI-First Testing**: Structured phases ensure consistent CI integration -- **Observable Measurements**: Command execution logging and timing -- **Reproducible Results**: Deterministic operations with clear success/failure -- **Scale-Aware Design**: Efficient synchronous operations -- **AI Platform Specificity**: OpenShift AI focused testing patterns +Main entrypoints: -## Project Structure - -``` -skeleton/ -├── orchestration/ -│ └── ci.py # Main CI script with Click-based CLI -├── README.md # This documentation -├── config.yaml # Project configuration (optional) -├── tests/ # Test scripts and data (optional) -└── scripts/ # Helper scripts (optional) -``` - -## Quick Start - -### 1. Run Individual Phases - -```bash -# From the FORGE root directory - -# Prepare environment -./run_ci skeleton ci prepare - -# Run tests -./run_ci skeleton ci test - -# Clean up -./run_ci skeleton ci cleanup -``` - -### 2. Development Options - -```bash -# Verbose output -./run_ci skeleton ci --verbose test - -# See all available commands -./run_ci skeleton ci --help -``` - -## Creating Your Own Project - -### Step 1: Copy Skeleton - -```bash -cp -r projects/skeleton projects/your-project-name -cd projects/your-project-name -``` - -### Step 2: Customize - -1. **Update `orchestration/ci.py`**: - - Change `self.project_name` to your project name - - Replace placeholder `echo` commands with actual test logic - - Update the CLI description and help text - -2. **Update `README.md`**: - - Document your project's purpose and usage - - Add specific setup instructions - -3. 
**Add configuration** (optional): - - Create `config.yaml` for project-specific settings - - Reference it in your CI script - -### Step 3: Implement Test Logic - -Replace the example `echo` commands with your actual test logic: - -#### Prepare Phase -```python -def prepare(self): - self.log("Starting prepare phase...") - - # Example: Install dependencies - if not self.execute_command( - "oc apply -f manifests/setup.yaml", - "Deploy setup resources" - ): - return 1 - - # Example: Validate environment - if not self.execute_command( - "oc get nodes", - "Check cluster nodes" - ): - return 1 - - self.log("Prepare phase completed!", "success") - return 0 -``` - -#### Test Phase -```python -def test(self): - self.log("Starting test phase...") - - # Example: Run performance tests - if not self.execute_command( - "python scripts/performance_test.py --config config.yaml", - "Running performance tests" - ): - return 1 - - # Example: Run functional tests - if not self.execute_command( - "pytest tests/ -v", - "Running functional tests" - ): - return 1 - - self.log("Test phase completed!", "success") - return 0 -``` - -#### Cleanup Phase -```python -def cleanup(self): - self.log("Starting cleanup phase...") - - # Example: Remove test resources - self.execute_command( - "oc delete -f manifests/", - "Cleanup test resources" - ) - - # Example: Generate reports - self.execute_command( - "python scripts/generate_report.py", - "Generate final report" - ) - - self.log("Cleanup phase completed!", "success") - return 0 -``` - -## Key Patterns - -### 1. Phase Structure - -Each project should implement these standard phases: -- **prepare**: Set up environment and dependencies -- **test**: Execute main testing logic -- **cleanup**: Clean up resources and finalize - -### 2. Command Execution - -Use the `execute_command` method for consistent execution and logging: - -```python -# Basic command execution -success = self.execute_command("your-command", "Description") -if not success: - return 1 # Exit with error - -# Command with complex logic -result = self.execute_command( - "kubectl get pods -o json", - "Check pod status" -) -``` - -### 3. Error Handling - -Always check command results and handle failures appropriately: - -```python -if not self.execute_command("critical-command", "Critical step"): - self.log("Critical step failed!", "error") - return 1 # Exit with error code - -# Cleanup commands can be non-critical -self.execute_command("cleanup-command", "Optional cleanup") -# Continue regardless of success -``` - -### 4. Logging - -Use the logging methods for consistent output: - -```python -self.log("Starting operation", "info") # ℹ️ [project] Starting operation -self.log("Operation completed", "success") # ✅ [project] Operation completed -self.log("Warning occurred", "warning") # ⚠️ [project] Warning occurred -self.log("Error occurred", "error") # ❌ [project] Error occurred -``` - -### 5. 
Verbose Mode - -The framework automatically handles verbose mode: - -```python -# In verbose mode, command details are automatically shown -# Your execute_command calls will show: -# - Command being executed -# - Command output (if any) -# - Execution duration -``` - -## Click CLI Structure - -The skeleton uses Click groups to organize commands: - -```python -@click.group() -@click.option('--verbose', '-v', is_flag=True, help='Enable verbose output') -@click.pass_context -def cli(ctx, verbose): - """Project CI Operations for FORGE.""" - ctx.ensure_object(types.SimpleNamespace) - ctx.obj.verbose = verbose - ctx.obj.runner = YourProjectTestRunner(verbose) - -@cli.command() -@click.pass_context -def prepare(ctx): - """Prepare phase - Set up environment and dependencies.""" - runner = ctx.obj.runner - exit_code = runner.prepare() - sys.exit(exit_code) -``` - -## Best Practices - -### 1. Constitutional Compliance - -- ✅ **CI-First**: Design for automated execution without user interaction -- ✅ **Observable**: Log important events and command execution -- ✅ **Reproducible**: Use deterministic operations and clear error codes -- ✅ **Scale-Aware**: Keep operations efficient and focused -- ✅ **AI Platform Specific**: Focus on OpenShift AI scenarios and tooling - -### 2. Error Handling - -- Always validate prerequisites in prepare phase -- Check command results and fail fast on errors -- Provide meaningful error messages with context -- Clean up resources even when tests fail (use try/except if needed) - -### 3. Command Design - -- Make commands idempotent when possible -- Use meaningful descriptions for all execute_command calls -- Test commands locally before adding to CI -- Consider timeouts for long-running operations - -### 4. Configuration - -- Keep project configuration in `config.yaml` or environment variables -- Make tests configurable for different environments -- Document all configuration options -- Use sensible defaults - -## Testing the Skeleton - -```bash -# Test individual phases -./run_ci skeleton ci prepare -./run_ci skeleton ci test -./run_ci skeleton ci cleanup - -# Test with verbose output -./run_ci skeleton ci --verbose prepare - -# See all available commands -./run_ci skeleton ci --help -``` - -## Integration with CI Systems - -The skeleton is designed for easy CI integration: - -```bash -# In your CI pipeline -./run_ci your-project ci prepare || exit 1 -./run_ci your-project ci test || exit 1 -./run_ci your-project ci cleanup # Always run cleanup -``` - -## Next Steps - -1. **Study the Code**: Review `orchestration/ci.py` to understand the patterns -2. **Copy and Customize**: Create your own project based on this skeleton -3. **Implement Tests**: Replace placeholder `echo` commands with real test logic -4. **Test Integration**: Verify your project works with the run_ci entrypoint -5. 
**Add Documentation**: Document your specific test scenarios and setup
-
-## Support
-
-- Review other projects in `projects/` for more examples
-- Check the main FORGE documentation
-- Study the run_ci entrypoint code in `projects/core/ci_entrypoint/`
+- CI phase wrapper: [ci.py](orchestration/ci.py)
+- Prepare flow: [prepare_llmd.py](orchestration/prepare_llmd.py)
+- Test flow: [test_llmd.py](orchestration/test_llmd.py)
+- Shared runtime/config loader: [llmd_runtime.py](orchestration/llmd_runtime.py)
diff --git a/projects/llm_d/orchestration/ci.py b/projects/llm_d/orchestration/ci.py
old mode 100755
new mode 100644
index 7623510f..97073e6e
--- a/projects/llm_d/orchestration/ci.py
+++ b/projects/llm_d/orchestration/ci.py
@@ -25,7 +25,7 @@ def main(ctx):
 @main.command()
 @click.pass_context
 @ci_lib.safe_ci_command
-def prepare(ctx):
+def prepare(ctx) -> int:
     """Prepare phase - Set up environment and dependencies."""
     return prepare_llmd.prepare()
 
@@ -33,7 +33,7 @@ def prepare(ctx):
 @main.command()
 @click.pass_context
 @ci_lib.safe_ci_command
-def test(ctx):
+def test(ctx) -> int:
     """Test phase - Execute the main testing logic."""
     return test_llmd.test()
 
@@ -41,7 +41,7 @@ def test(ctx):
 @main.command()
 @click.pass_context
 @ci_lib.safe_ci_command
-def pre_cleanup(ctx):
+def pre_cleanup(ctx) -> int:
     """Cleanup phase - Clean up resources and finalize."""
     return prepare_llmd.cleanup()
 
diff --git a/projects/llm_d/orchestration/cli.py b/projects/llm_d/orchestration/cli.py
old mode 100755
new mode 100644
index def09477..06ae9ef6
--- a/projects/llm_d/orchestration/cli.py
+++ b/projects/llm_d/orchestration/cli.py
@@ -1,7 +1,4 @@
 #!/usr/bin/env python3
-"""
-LLM-D Project CLI Operations
-"""
 
 import logging
 import sys
@@ -19,7 +16,7 @@
 @click.group()
 @click.pass_context
 def main(ctx):
-    """LLM-D Project CI Operations for FORGE."""
+    """LLM-D Project CLI Operations for FORGE."""
     ctx.ensure_object(types.SimpleNamespace)
 
     test_llmd.init()
@@ -27,37 +24,33 @@ def main(ctx):
 @main.command()
 @click.pass_context
 @safe_cli_command
-def prepare(ctx):
+def prepare(ctx) -> int:
     """Prepare phase - Set up environment and dependencies."""
-    exit_code = prepare_llmd.prepare()
-    sys.exit(exit_code)
+    return prepare_llmd.prepare()
 
 
 @main.command()
 @click.pass_context
 @safe_cli_command
-def test(ctx):
+def test(ctx) -> int:
     """Test phase - Execute the main testing logic."""
-    exit_code = test_llmd.test()
-    sys.exit(exit_code)
+    return test_llmd.test()
 
 
 @main.command()
 @click.pass_context
 @safe_cli_command
-def pre_cleanup(ctx):
+def pre_cleanup(ctx) -> int:
     """Cleanup phase - Clean up resources and finalize."""
-    exit_code = prepare_llmd.cleanup()
-    sys.exit(exit_code)
+    return prepare_llmd.cleanup()
 
 
 @main.command()
 @click.pass_context
 @safe_cli_command
-def post_cleanup(ctx):
+def post_cleanup(ctx) -> int:
     """Cleanup phase - Clean up resources and finalize."""
-    exit_code = prepare_llmd.cleanup()
-    sys.exit(exit_code)
+    return prepare_llmd.cleanup()
 
 
 if __name__ == "__main__":
diff --git a/projects/llm_d/orchestration/llmd_runtime.py b/projects/llm_d/orchestration/llmd_runtime.py
new file mode 100644
index 00000000..aba35fd8
--- /dev/null
+++ b/projects/llm_d/orchestration/llmd_runtime.py
@@ -0,0 +1,695 @@
+from __future__ import annotations
+
+import copy
+import json
+import logging
+import os
+import re
+import shlex +import subprocess +import sys +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Iterable + +import yaml + +FORGE_HOME = Path(__file__).resolve().parents[3] +if str(FORGE_HOME) not in sys.path: + sys.path.insert(0, str(FORGE_HOME)) + +from projects.core.library import env, run + +LOGGER = logging.getLogger(__name__) +CONFIG_DIR = FORGE_HOME / "config" / "llm_d" +ALLOWED_OVERRIDE_KEYS = frozenset({"namespace"}) + + +class CommandError(RuntimeError): + """Raised when an external command exits unsuccessfully.""" + + +@dataclass(frozen=True) +class ResolvedConfig: + artifact_dir: Path + project_root: Path + config_dir: Path + preset_name: str + preset_alias: str | None + job_name: str + namespace: str + namespace_is_managed: bool + gpu_count: int | None + platform: dict[str, Any] + model: dict[str, Any] + smoke_request: dict[str, Any] + benchmark: dict[str, Any] | None + fournos_config: dict[str, Any] + overrides: dict[str, Any] + + @property + def manifests_dir(self) -> Path: + return self.config_dir / "manifests" + + +def init() -> Path: + if not logging.getLogger().handlers: + logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + + env.init() + run.init() + ensure_artifact_directories(env.ARTIFACT_DIR) + return env.ARTIFACT_DIR + + +def ensure_artifact_directories(artifact_dir: Path) -> None: + for relative in ("src", "artifacts", "artifacts/results"): + (artifact_dir / relative).mkdir(parents=True, exist_ok=True) + + +def load_run_configuration( + *, cwd: Path | None = None, artifact_dir: Path | None = None +) -> ResolvedConfig: + cwd = cwd or Path.cwd() + artifact_dir = artifact_dir or env.ARTIFACT_DIR + if artifact_dir is None: + raise RuntimeError("ARTIFACT_DIR is not initialized") + + platform_data = load_yaml(CONFIG_DIR / "platform.yaml") + models_data = load_yaml(CONFIG_DIR / "models.yaml")["models"] + workloads_data = load_yaml(CONFIG_DIR / "workloads.yaml") + preset_data = load_yaml(CONFIG_DIR / "presets.yaml") + + fournos_config = load_fournos_config(cwd) + overrides = parse_overrides(os.environ.get("FORGE_CONFIG_OVERRIDES", "")) + + requested_preset = ( + fournos_config.get("preset") or os.environ.get("FORGE_PRESET") or "smoke" + ) + alias = ( + requested_preset if requested_preset in preset_data.get("aliases", {}) else None + ) + preset_name = preset_data.get("aliases", {}).get(requested_preset, requested_preset) + preset = preset_data["presets"].get(preset_name) + if preset is None: + raise ValueError(f"Unknown llm_d preset: {requested_preset}") + + model_name = preset["model"] + model = copy.deepcopy(models_data[model_name]) + + smoke_request_name = preset.get("smoke_request", "default") + smoke_request = copy.deepcopy(workloads_data["smoke_requests"][smoke_request_name]) + + benchmark_name = preset.get("benchmark") + benchmark = None + if benchmark_name: + benchmark = copy.deepcopy(workloads_data["benchmarks"][benchmark_name]) + + job_name = fournos_config.get("job-name") or os.environ.get("FORGE_JOB_NAME") + if not job_name: + job_name = f"local-{preset_name}" + + namespace_override = overrides.get("namespace") or fournos_config.get("namespace") + namespace = namespace_override or derive_namespace( + job_name, + platform_data["cluster"]["namespace_prefix"], + platform_data["cluster"]["namespace_max_length"], + ) + + gpu_count = normalize_gpu_count(fournos_config.get("gpu-count")) + + return ResolvedConfig( + artifact_dir=Path(artifact_dir), + project_root=FORGE_HOME, + 
config_dir=CONFIG_DIR, + preset_name=preset_name, + preset_alias=alias, + job_name=job_name, + namespace=namespace, + namespace_is_managed=namespace_override is None, + gpu_count=gpu_count, + platform=platform_data, + model=model, + smoke_request=smoke_request, + benchmark=benchmark, + fournos_config=fournos_config, + overrides=overrides, + ) + + +def load_fournos_config(cwd: Path) -> dict[str, Any]: + config_path = cwd / "fournos_config.yaml" + if not config_path.exists(): + return {} + + data = load_yaml(config_path) + if data is None: + return {} + if not isinstance(data, dict): + raise ValueError( + f"Unexpected FOURNOS config type in {config_path}: {type(data)}" + ) + return data + + +def parse_overrides(raw: str) -> dict[str, Any]: + if not raw or raw.strip() in {"", "null", "{}"}: + return {} + + try: + data = json.loads(raw) + except json.JSONDecodeError as exc: + raise ValueError(f"FORGE_CONFIG_OVERRIDES is not valid JSON: {exc}") from exc + + if not isinstance(data, dict): + raise ValueError("FORGE_CONFIG_OVERRIDES must decode to a JSON object") + + unsupported = sorted(set(data) - ALLOWED_OVERRIDE_KEYS) + if unsupported: + raise ValueError( + "Unsupported llm_d override keys: " + f"{', '.join(unsupported)}. Allowed keys: {', '.join(sorted(ALLOWED_OVERRIDE_KEYS))}" + ) + + return data + + +def normalize_gpu_count(value: Any) -> int | None: + if value in (None, ""): + return None + try: + return int(value) + except (TypeError, ValueError): + LOGGER.warning("Ignoring invalid gpu-count value: %s", value) + return None + + +def derive_namespace(job_name: str, prefix: str, max_length: int) -> str: + slug = re.sub(r"[^a-z0-9-]+", "-", job_name.lower()) + slug = re.sub(r"-{2,}", "-", slug).strip("-") + if not slug: + slug = "run" + + if slug.startswith(f"{prefix}-"): + namespace = slug + else: + namespace = f"{prefix}-{slug}" + + namespace = namespace[:max_length].rstrip("-") + if not namespace: + raise ValueError( + f"Could not derive a valid namespace from job name: {job_name}" + ) + return namespace + + +def load_yaml(path: Path) -> Any: + with path.open(encoding="utf-8") as handle: + return yaml.safe_load(handle) + + +def write_yaml(path: Path, payload: Any) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding="utf-8") as handle: + yaml.safe_dump(payload, handle, sort_keys=False) + + +def write_json(path: Path, payload: Any) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding="utf-8") as handle: + json.dump(payload, handle, indent=2, sort_keys=True) + handle.write("\n") + + +def write_text(path: Path, content: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(content, encoding="utf-8") + + +def run_command( + args: Iterable[str], + *, + check: bool = True, + capture_output: bool = True, + input_text: str | None = None, +) -> subprocess.CompletedProcess[str]: + cmd = [str(arg) for arg in args] + LOGGER.info("run: %s", " ".join(shlex.quote(arg) for arg in cmd)) + result = subprocess.run( + cmd, + check=False, + text=True, + capture_output=capture_output, + input=input_text, + ) + + if capture_output: + if result.stdout: + LOGGER.info("stdout:\n%s", result.stdout.rstrip()) + if result.stderr: + LOGGER.info("stderr:\n%s", result.stderr.rstrip()) + + if check and result.returncode != 0: + raise CommandError( + f"Command failed with exit code {result.returncode}: " + f"{' '.join(shlex.quote(arg) for arg in cmd)}" + ) + + return result + + +def oc( + *args: str, + check: bool = True, + 
capture_output: bool = True, + input_text: str | None = None, +) -> subprocess.CompletedProcess[str]: + return run_command( + ["oc", *args], + check=check, + capture_output=capture_output, + input_text=input_text, + ) + + +def apply_manifest(artifact_path: Path, manifest: dict[str, Any]) -> None: + write_yaml(artifact_path, manifest) + oc("apply", "-f", str(artifact_path)) + + +def oc_get_json( + kind: str, + *, + name: str | None = None, + namespace: str | None = None, + selector: str | None = None, + ignore_not_found: bool = False, +) -> dict[str, Any] | None: + args = ["get", kind] + if name: + args.append(name) + if namespace: + args.extend(["-n", namespace]) + if selector: + args.extend(["-l", selector]) + args.extend(["-o", "json"]) + + result = oc(*args, check=not ignore_not_found, capture_output=True) + if ignore_not_found and result.returncode != 0: + return None + return json.loads(result.stdout) + + +def resource_exists(kind: str, name: str, *, namespace: str | None = None) -> bool: + result = oc( + "get", + kind, + name, + *([] if namespace is None else ["-n", namespace]), + check=False, + capture_output=True, + ) + return result.returncode == 0 + + +def wait_until( + description: str, + *, + timeout_seconds: int, + interval_seconds: int, + predicate, +) -> Any: + deadline = time.time() + timeout_seconds + last_error: Exception | None = None + + while time.time() < deadline: + try: + value = predicate() + if value: + return value + last_error = None + except Exception as exc: # pragma: no cover - exercised in integration paths + last_error = exc + LOGGER.info("waiting for %s: %s", description, exc) + time.sleep(interval_seconds) + + if last_error: + raise RuntimeError( + f"Timed out waiting for {description}: {last_error}" + ) from last_error + raise RuntimeError(f"Timed out waiting for {description}") + + +def wait_for_namespace_deleted(namespace: str, timeout_seconds: int) -> None: + def _namespace_gone() -> bool: + return not resource_exists("namespace", namespace) + + wait_until( + f"namespace/{namespace} deletion", + timeout_seconds=timeout_seconds, + interval_seconds=10, + predicate=_namespace_gone, + ) + + +def wait_for_crd(crd_name: str, timeout_seconds: int) -> None: + wait_until( + f"crd/{crd_name}", + timeout_seconds=timeout_seconds, + interval_seconds=10, + predicate=lambda: resource_exists("crd", crd_name), + ) + + +def wait_for_operator_csv( + package: str, namespace: str, timeout_seconds: int +) -> dict[str, Any]: + selector = f"operators.coreos.com/{package}.{namespace}" + + def _csv_ready() -> dict[str, Any] | None: + data = oc_get_json( + "csv", namespace=namespace, selector=selector, ignore_not_found=True + ) + if not data: + return None + items = data.get("items", []) + if not items: + return None + csv = items[0] + if csv.get("status", {}).get("phase") == "Succeeded": + return csv + return None + + return wait_until( + f"{package} CSV in {namespace}", + timeout_seconds=timeout_seconds, + interval_seconds=15, + predicate=_csv_ready, + ) + + +def ensure_namespace(namespace: str, *, labels: dict[str, str] | None = None) -> None: + if not resource_exists("namespace", namespace): + oc("create", "namespace", namespace) + + if labels: + label_args = [f"{key}={value}" for key, value in labels.items()] + oc("label", "namespace", namespace, "--overwrite", *label_args) + + +def ensure_operator_group(namespace: str, package: str) -> None: + data = oc_get_json("operatorgroup", namespace=namespace, ignore_not_found=True) + if data and data.get("items"): + for item in 
data["items"]: + targets = item.get("spec", {}).get("targetNamespaces") or [namespace] + if namespace in targets: + return + raise RuntimeError( + f"Existing OperatorGroup objects in {namespace} do not target {namespace}" + ) + + operator_group = { + "apiVersion": "operators.coreos.com/v1", + "kind": "OperatorGroup", + "metadata": {"name": package, "namespace": namespace}, + "spec": {"targetNamespaces": [namespace]}, + } + oc("apply", "-f", "-", input_text=yaml.safe_dump(operator_group, sort_keys=False)) + + +def ensure_subscription(operator_spec: dict[str, Any]) -> None: + namespace = operator_spec["namespace"] + package = operator_spec["package"] + + ensure_namespace(namespace) + ensure_operator_group(namespace, package) + + subscription = desired_subscription(operator_spec) + current = oc_get_json( + "subscription.operators.coreos.com", + name=package, + namespace=namespace, + ignore_not_found=True, + ) + if current and not subscription_spec_matches( + current.get("spec", {}), subscription["spec"] + ): + LOGGER.info("Reconciling subscription drift for %s in %s", package, namespace) + + oc("apply", "-f", "-", input_text=yaml.safe_dump(subscription, sort_keys=False)) + + def _subscription_reconciled() -> dict[str, Any] | None: + payload = oc_get_json( + "subscription.operators.coreos.com", + name=package, + namespace=namespace, + ) + if subscription_spec_matches(payload.get("spec", {}), subscription["spec"]): + return payload + return None + + wait_until( + f"subscription/{package} reconciliation in {namespace}", + timeout_seconds=60, + interval_seconds=5, + predicate=_subscription_reconciled, + ) + + +def desired_subscription(operator_spec: dict[str, Any]) -> dict[str, Any]: + namespace = operator_spec["namespace"] + package = operator_spec["package"] + return { + "apiVersion": "operators.coreos.com/v1alpha1", + "kind": "Subscription", + "metadata": {"name": package, "namespace": namespace}, + "spec": { + "channel": operator_spec["channel"], + "installPlanApproval": "Automatic", + "name": package, + "source": operator_spec["source"], + "sourceNamespace": "openshift-marketplace", + }, + } + + +def subscription_spec_matches(actual: dict[str, Any], expected: dict[str, Any]) -> bool: + keys = ("channel", "installPlanApproval", "name", "source", "sourceNamespace") + return all(actual.get(key) == expected.get(key) for key in keys) + + +def operator_spec_by_package(platform: dict[str, Any], package: str) -> dict[str, Any]: + for operator_spec in platform["operators"]: + if operator_spec["package"] == package: + return operator_spec + raise KeyError(f"Unknown operator package in llm_d platform config: {package}") + + +def load_manifest_template( + config: ResolvedConfig, relative_path: str +) -> dict[str, Any]: + return load_yaml(config.config_dir / relative_path) + + +def version_tuple(value: str) -> tuple[int, ...]: + numbers = re.findall(r"\d+", value) + return tuple(int(number) for number in numbers[:3]) + + +def condition_status(resource: dict[str, Any], condition_type: str) -> str | None: + conditions = resource.get("status", {}).get("conditions", []) + for condition in conditions: + if condition.get("type") == condition_type: + return condition.get("status") + return None + + +def render_datasciencecluster(config: ResolvedConfig) -> dict[str, Any]: + template_path = ( + config.config_dir / config.platform["rhoai"]["datasciencecluster_template"] + ) + manifest = load_yaml(template_path) + manifest["metadata"]["name"] = config.platform["rhoai"]["datasciencecluster_name"] + 
manifest["metadata"]["namespace"] = config.platform["rhoai"]["namespace"] + return manifest + + +def render_gateway(config: ResolvedConfig) -> dict[str, Any]: + template_path = config.config_dir / config.platform["gateway"]["manifest_template"] + manifest = load_yaml(template_path) + manifest["metadata"]["name"] = config.platform["gateway"]["name"] + manifest["metadata"]["namespace"] = config.platform["gateway"]["namespace"] + manifest["spec"]["gatewayClassName"] = config.platform["gateway"][ + "gateway_class_name" + ] + return manifest + + +def render_inference_service(config: ResolvedConfig) -> dict[str, Any]: + template_path = config.config_dir / config.platform["inference_service"]["template"] + manifest = load_yaml(template_path) + + name = config.platform["inference_service"]["name"] + manifest["metadata"]["name"] = name + manifest["metadata"]["namespace"] = config.namespace + manifest["metadata"].setdefault("labels", {}) + manifest["metadata"]["labels"].update( + { + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + } + ) + + manifest["spec"]["model"]["uri"] = config.model["uri"] + manifest["spec"]["model"]["name"] = config.model["served_model_name"] + manifest["spec"]["template"]["containers"][0]["resources"] = copy.deepcopy( + config.model["resources"] + ) + + epp_path = ( + config.config_dir / config.platform["inference_service"]["epp_config_template"] + ) + epp_config = epp_path.read_text(encoding="utf-8") + router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0][ + "args" + ] + if not router_args or router_args[-1] != "--config-text": + raise ValueError("Expected llm-d router args to end with --config-text") + router_args.append(epp_config) + + return manifest + + +def render_guidellm_pvc(config: ResolvedConfig) -> dict[str, Any]: + if not config.benchmark: + raise ValueError("Benchmark configuration is not enabled for this preset") + + return { + "apiVersion": "v1", + "kind": "PersistentVolumeClaim", + "metadata": { + "name": config.benchmark["job_name"], + "namespace": config.namespace, + }, + "spec": { + "accessModes": ["ReadWriteOnce"], + "resources": {"requests": {"storage": config.benchmark["pvc_size"]}}, + }, + } + + +def render_guidellm_job(config: ResolvedConfig, endpoint_url: str) -> dict[str, Any]: + if not config.benchmark: + raise ValueError("Benchmark configuration is not enabled for this preset") + + args = [ + "benchmark", + "run", + f"--target={endpoint_url}", + f"--rate={config.benchmark['rate']}", + ] + for key, value in config.benchmark["args"].items(): + if value is None: + continue + args.append(f"--{key.replace('_', '-')}={value}") + args.append("--outputs=json") + + return { + "apiVersion": "batch/v1", + "kind": "Job", + "metadata": { + "name": config.benchmark["job_name"], + "namespace": config.namespace, + }, + "spec": { + "backoffLimit": 0, + "template": { + "spec": { + "serviceAccountName": "default", + "restartPolicy": "Never", + "containers": [ + { + "name": "guidellm", + "image": config.benchmark["image"], + "command": ["/opt/app-root/bin/guidellm"], + "args": args, + "env": [{"name": "USER", "value": "guidellm"}], + "volumeMounts": [ + {"name": "home", "mountPath": "/home/guidellm"}, + {"name": "results", "mountPath": "/results"}, + ], + } + ], + "volumes": [ + {"name": "home", "emptyDir": {}}, + { + "name": "results", + "persistentVolumeClaim": { + "claimName": config.benchmark["job_name"] + }, + }, + ], + } + }, + }, + } + + +def render_guidellm_copy_pod( + config: ResolvedConfig, 
node_name: str | None = None +) -> dict[str, Any]: + if not config.benchmark: + raise ValueError("Benchmark configuration is not enabled for this preset") + + pod = { + "apiVersion": "v1", + "kind": "Pod", + "metadata": { + "name": f"{config.benchmark['job_name']}-copy", + "namespace": config.namespace, + }, + "spec": { + "restartPolicy": "Never", + "initContainers": [ + { + "name": "permission-fixer", + "image": config.benchmark["image"], + "command": [ + "/bin/sh", + "-c", + "chmod 755 /results && chown -R 1001:1001 /results || true", + ], + "securityContext": { + "runAsUser": 0, + "allowPrivilegeEscalation": True, + }, + "volumeMounts": [{"name": "results", "mountPath": "/results"}], + } + ], + "containers": [ + { + "name": "copy-helper", + "image": config.benchmark["image"], + "command": ["/bin/sleep", "300"], + "securityContext": { + "runAsUser": 1001, + "runAsNonRoot": True, + "allowPrivilegeEscalation": False, + }, + "volumeMounts": [{"name": "results", "mountPath": "/results"}], + } + ], + "volumes": [ + { + "name": "results", + "persistentVolumeClaim": { + "claimName": config.benchmark["job_name"] + }, + } + ], + }, + } + if node_name: + pod["spec"]["nodeName"] = node_name + return pod diff --git a/projects/llm_d/orchestration/prepare_llmd.py b/projects/llm_d/orchestration/prepare_llmd.py index c28ad8c7..fdabe4b8 100644 --- a/projects/llm_d/orchestration/prepare_llmd.py +++ b/projects/llm_d/orchestration/prepare_llmd.py @@ -1,16 +1,428 @@ +from __future__ import annotations + +import json import logging +from pathlib import Path + +from projects.llm_d.orchestration import llmd_runtime + +LOGGER = logging.getLogger(__name__) + + +def prepare() -> int: + llmd_runtime.init() + config = llmd_runtime.load_run_configuration() + + LOGGER.info( + "Preparing llm_d preset=%s namespace=%s", config.preset_name, config.namespace + ) + + verify_oc_access() + verify_cluster_version(config) + prepare_cert_manager(config) + prepare_leader_worker_set(config) + prepare_nfd(config) + prepare_gpu_operator(config) + prepare_rhoai_operator(config) + apply_datasciencecluster(config) + wait_for_datasciencecluster_ready(config) + ensure_required_crds(config.platform["rhoai"]["required_crds_after_dsc"], config) + ensure_gateway(config) + ensure_test_namespace(config) + verify_gpu_nodes(config) + capture_prepare_state(config) + + return 0 + + +def cleanup() -> int: + llmd_runtime.init() + config = llmd_runtime.load_run_configuration() + + inference_service_name = config.platform["inference_service"]["name"] + benchmark_name = ( + config.benchmark["job_name"] if config.benchmark else "guidellm-benchmark" + ) + + if config.namespace_is_managed: + if llmd_runtime.resource_exists("namespace", config.namespace): + llmd_runtime.oc( + "delete", "namespace", config.namespace, "--ignore-not-found=true" + ) + llmd_runtime.wait_for_namespace_deleted( + config.namespace, + timeout_seconds=config.platform["cluster"]["cleanup_timeout_seconds"], + ) + else: + llmd_runtime.oc( + "delete", + "llminferenceservice", + inference_service_name, + "-n", + config.namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "job,pvc", + benchmark_name, + "-n", + config.namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "pod", + f"{benchmark_name}-copy", + "-n", + config.namespace, + "--ignore-not-found=true", + check=False, + ) + + return 0 + + +def verify_oc_access() -> None: + llmd_runtime.oc("whoami", capture_output=True) + + +def 
verify_cluster_version(config: llmd_runtime.ResolvedConfig) -> None: + version_info = llmd_runtime.oc("version", "-o", "json", capture_output=True) + payload = json.loads(version_info.stdout) + + openshift_version = ( + payload.get("openshiftVersion") + or payload.get("serverVersion", {}).get("gitVersion") + or payload.get("serverVersion", {}).get("platform") + ) + if not openshift_version: + raise RuntimeError( + "Could not determine OpenShift version from `oc version -o json`" + ) + + minimum = config.platform["cluster"]["minimum_openshift_version"] + if llmd_runtime.version_tuple(openshift_version) < llmd_runtime.version_tuple( + minimum + ): + raise RuntimeError( + f"Cluster version {openshift_version} is older than the llm_d minimum {minimum}" + ) + + +def ensure_operator_subscription(operator_spec: dict[str, str]) -> dict[str, object]: + llmd_runtime.ensure_subscription(operator_spec) + return llmd_runtime.wait_for_operator_csv( + operator_spec["package"], + operator_spec["namespace"], + timeout_seconds=operator_spec["wait_timeout_seconds"], + ) + + +def prepare_cert_manager(config: llmd_runtime.ResolvedConfig) -> None: + operator_spec = llmd_runtime.operator_spec_by_package( + config.platform, "openshift-cert-manager-operator" + ) + ensure_operator_subscription(operator_spec) + + +def prepare_leader_worker_set(config: llmd_runtime.ResolvedConfig) -> None: + operator_spec = llmd_runtime.operator_spec_by_package( + config.platform, "leader-worker-set" + ) + ensure_operator_subscription(operator_spec) + + +def prepare_nfd(config: llmd_runtime.ResolvedConfig) -> None: + operator_spec = llmd_runtime.operator_spec_by_package(config.platform, "nfd") + ensure_operator_subscription(operator_spec) + llmd_runtime.wait_for_crd( + operator_spec["bootstrap_crd"], + timeout_seconds=operator_spec["wait_timeout_seconds"], + ) + + manifest = llmd_runtime.load_manifest_template( + config, operator_spec["bootstrap_manifest"] + ) + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "nfd-nodefeaturediscovery.yaml", + manifest, + ) + + llmd_runtime.wait_until( + "NodeFeatureDiscovery bootstrap resource", + timeout_seconds=operator_spec["wait_timeout_seconds"], + interval_seconds=10, + predicate=lambda: llmd_runtime.resource_exists( + "nodefeaturediscovery", + manifest["metadata"]["name"], + namespace=manifest["metadata"]["namespace"], + ), + ) + + wait_for_nfd_gpu_labels( + config, timeout_seconds=operator_spec["wait_timeout_seconds"] + ) + + +def prepare_gpu_operator(config: llmd_runtime.ResolvedConfig) -> None: + operator_spec = llmd_runtime.operator_spec_by_package( + config.platform, "gpu-operator-certified" + ) + ensure_operator_subscription(operator_spec) + llmd_runtime.wait_for_crd( + operator_spec["bootstrap_crd"], + timeout_seconds=operator_spec["wait_timeout_seconds"], + ) + + manifest = llmd_runtime.load_manifest_template( + config, operator_spec["bootstrap_manifest"] + ) + clusterpolicy_name = manifest["metadata"]["name"] + if llmd_runtime.resource_exists("clusterpolicy", clusterpolicy_name): + LOGGER.info( + "ClusterPolicy/%s already exists; verifying readiness instead of applying bootstrap manifest", + clusterpolicy_name, + ) + wait_for_gpu_clusterpolicy_ready( + clusterpolicy_name, + timeout_seconds=operator_spec["wait_timeout_seconds"], + ) + return + + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "gpu-clusterpolicy.yaml", + manifest, + ) + + wait_for_gpu_clusterpolicy_ready( + clusterpolicy_name, + timeout_seconds=operator_spec["wait_timeout_seconds"], + ) + + 
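+# Readiness is driven by ClusterPolicy .status.state reaching "ready" (the
+# check below lower-cases the reported state). Roughly equivalent to polling:
+#   oc get clusterpolicy gpu-cluster-policy -o jsonpath='{.status.state}'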
+def wait_for_gpu_clusterpolicy_ready( + clusterpolicy_name: str, *, timeout_seconds: int +) -> None: + def _clusterpolicy_ready() -> bool: + payload = llmd_runtime.oc_get_json( + "clusterpolicy", + name=clusterpolicy_name, + ) + state = payload.get("status", {}).get("state", "") + return state.lower() == "ready" + + llmd_runtime.wait_until( + f"clusterpolicy/{clusterpolicy_name} ready", + timeout_seconds=timeout_seconds, + interval_seconds=15, + predicate=_clusterpolicy_ready, + ) + + +def prepare_rhoai_operator(config: llmd_runtime.ResolvedConfig) -> None: + operator_spec = llmd_runtime.operator_spec_by_package( + config.platform, "rhods-operator" + ) + ensure_operator_subscription(operator_spec) + ensure_required_crds(config.platform["rhoai"]["required_crds_before_dsc"], config) + + +def ensure_required_crds( + crd_names: list[str], config: llmd_runtime.ResolvedConfig +) -> None: + for crd_name in crd_names: + llmd_runtime.wait_for_crd( + crd_name, + timeout_seconds=config.platform["rhoai"]["wait_timeout_seconds"], + ) + + +def apply_datasciencecluster(config: llmd_runtime.ResolvedConfig) -> None: + manifest = llmd_runtime.render_datasciencecluster(config) + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "datasciencecluster.yaml", manifest + ) + llmd_runtime.oc( + "get", + "datasciencecluster", + config.platform["rhoai"]["datasciencecluster_name"], + "-n", + config.platform["rhoai"]["namespace"], + "-o", + "yaml", + capture_output=True, + ) + + +def wait_for_datasciencecluster_ready(config: llmd_runtime.ResolvedConfig) -> None: + rhoai = config.platform["rhoai"] + + def _dsc_ready() -> bool: + payload = llmd_runtime.oc_get_json( + "datasciencecluster", + name=rhoai["datasciencecluster_name"], + namespace=rhoai["namespace"], + ) + phase = payload.get("status", {}).get("phase") + if phase == "Ready": + return True + if phase in {"Failed", "Error"}: + raise RuntimeError(f"DataScienceCluster entered terminal phase {phase}") + return False + + llmd_runtime.wait_until( + f"datasciencecluster/{rhoai['datasciencecluster_name']} ready", + timeout_seconds=rhoai["wait_timeout_seconds"], + interval_seconds=10, + predicate=_dsc_ready, + ) + + +def ensure_gateway(config: llmd_runtime.ResolvedConfig) -> None: + gateway = config.platform["gateway"] + if not llmd_runtime.resource_exists( + "gateway", gateway["name"], namespace=gateway["namespace"] + ): + if not gateway["create_if_missing"]: + raise RuntimeError( + f"Required gateway {gateway['name']} does not exist in {gateway['namespace']}" + ) + manifest = llmd_runtime.render_gateway(config) + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "gateway.yaml", manifest + ) + + def _gateway_programmed() -> bool: + resource = llmd_runtime.oc_get_json( + "gateway", + name=gateway["name"], + namespace=gateway["namespace"], + ) + return llmd_runtime.condition_status(resource, "Programmed") == "True" + + llmd_runtime.wait_until( + f"gateway/{gateway['name']} programmed", + timeout_seconds=gateway["wait_timeout_seconds"], + interval_seconds=10, + predicate=_gateway_programmed, + ) + + +def ensure_test_namespace(config: llmd_runtime.ResolvedConfig) -> None: + llmd_runtime.ensure_namespace( + config.namespace, + labels={ + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + }, + ) + + +def verify_gpu_nodes(config: llmd_runtime.ResolvedConfig) -> None: + selector = config.platform["cluster"]["gpu_node_label_selector"] + data = llmd_runtime.oc_get_json("nodes", selector=selector, ignore_not_found=True) + 
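+    # ignore_not_found=True maps a failed lookup to None instead of raising
+    # CommandError, so an empty result reaches the explicit error below.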
items = data.get("items", []) if data else [] + if not items: + raise RuntimeError( + f"No GPU nodes found with selector {selector}. The llm_d smoke path requires GPUs." + ) + + +def wait_for_nfd_gpu_labels( + config: llmd_runtime.ResolvedConfig, *, timeout_seconds: int +) -> None: + selectors = config.platform["cluster"]["nfd_gpu_detection_labels"] + + def _labels_present() -> bool: + for selector in selectors: + data = llmd_runtime.oc_get_json( + "nodes", selector=selector, ignore_not_found=True + ) + if data and data.get("items"): + return True + return False + + llmd_runtime.wait_until( + "NFD GPU discovery labels on cluster nodes", + timeout_seconds=timeout_seconds, + interval_seconds=15, + predicate=_labels_present, + ) + -from projects.core.library import config +def capture_prepare_state(config: llmd_runtime.ResolvedConfig) -> None: + artifacts_dir = config.artifact_dir / "artifacts" + rhoai = config.platform["rhoai"] + gateway = config.platform["gateway"] -logger = logging.getLogger(__name__) + capture_resource_yaml( + "datasciencecluster", + rhoai["datasciencecluster_name"], + rhoai["namespace"], + artifacts_dir / "datasciencecluster.yaml", + ) + capture_resource_yaml( + "gateway", + gateway["name"], + gateway["namespace"], + artifacts_dir / "gateway.yaml", + ) + gateway_service = llmd_runtime.oc( + "get", + "service", + "-A", + "-l", + f"gateway.networking.k8s.io/gateway-name={gateway['name']}", + "-o", + "yaml", + check=False, + capture_output=True, + ) + if gateway_service.returncode == 0 and gateway_service.stdout: + llmd_runtime.write_text( + artifacts_dir / "gateway.service.yaml", gateway_service.stdout + ) + if config.platform["artifacts"]["capture_namespace_events"]: + capture_namespace_events( + config.namespace, artifacts_dir / "namespace.events.txt" + ) -def prepare(): - ns = config.project.get_config("prepare.namespace.name") - logger.warning(f"Hello prepare {ns}") - pass +def capture_resource_yaml( + kind: str, + name: str, + namespace: str, + destination: Path, + *, + check: bool = True, +) -> None: + result = llmd_runtime.oc( + "get", + kind, + name, + "-n", + namespace, + "-o", + "yaml", + check=check, + capture_output=True, + ) + if result.returncode == 0 and result.stdout: + llmd_runtime.write_text(destination, result.stdout) -def cleanup(): - logger.warning("Hello cleanup") - pass +def capture_namespace_events(namespace: str, destination: Path) -> None: + result = llmd_runtime.oc( + "get", + "events", + "-n", + namespace, + "--sort-by=.metadata.creationTimestamp", + check=False, + capture_output=True, + ) + if result.returncode == 0 and result.stdout: + llmd_runtime.write_text(destination, result.stdout) diff --git a/projects/llm_d/orchestration/test_llmd.py b/projects/llm_d/orchestration/test_llmd.py index 8290ee63..b11948d7 100644 --- a/projects/llm_d/orchestration/test_llmd.py +++ b/projects/llm_d/orchestration/test_llmd.py @@ -1,29 +1,483 @@ +from __future__ import annotations + +import json import logging -import pathlib +import time +from pathlib import Path + +from projects.llm_d.orchestration import llmd_runtime + +LOGGER = logging.getLogger(__name__) + + +def init() -> None: + llmd_runtime.init() + + +def test() -> int: + llmd_runtime.init() + config = llmd_runtime.load_run_configuration() + + name = config.platform["inference_service"]["name"] + namespace = config.namespace + artifacts_dir = config.artifact_dir / "artifacts" + + LOGGER.info("Testing llm_d preset=%s namespace=%s", config.preset_name, namespace) + + endpoint_url = None + try: + 
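+        # Happy path: deploy the service, run the smoke request, then the
+        # optional benchmark. The finally block always captures ISVC state and
+        # namespace events, and records the endpoint URL when one was resolved.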
endpoint_url = deploy_inference_service(config) + smoke_response = run_smoke_request(config, endpoint_url) + llmd_runtime.write_json(artifacts_dir / "smoke.response.json", smoke_response) + + if config.benchmark: + run_guidellm_benchmark(config, endpoint_url) + + return 0 + finally: + capture_inference_service_state(config) + if endpoint_url: + llmd_runtime.write_text(artifacts_dir / "endpoint.url", f"{endpoint_url}\n") + benchmark_name = ( + config.benchmark["job_name"] if config.benchmark else "guidellm-benchmark" + ) + llmd_runtime.oc( + "delete", + "job,pvc", + benchmark_name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "pod", + f"{benchmark_name}-copy", + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + events = llmd_runtime.oc( + "get", + "events", + "-n", + namespace, + "--sort-by=.metadata.creationTimestamp", + check=False, + capture_output=True, + ) + if events.returncode == 0 and events.stdout: + llmd_runtime.write_text( + artifacts_dir / "namespace.events.txt", events.stdout + ) + + +def deploy_inference_service(config: llmd_runtime.ResolvedConfig) -> str: + name = config.platform["inference_service"]["name"] + namespace = config.namespace + selector = f"app.kubernetes.io/name={name}" + + llmd_runtime.oc( + "delete", + "llminferenceservice", + name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + + def _old_pods_gone() -> bool: + pods = llmd_runtime.oc_get_json( + "pods", namespace=namespace, selector=selector, ignore_not_found=True + ) + return not pods or not pods.get("items") + + llmd_runtime.wait_until( + f"old llm-d pods to disappear in {namespace}", + timeout_seconds=config.platform["inference_service"]["delete_timeout_seconds"], + interval_seconds=10, + predicate=_old_pods_gone, + ) + + manifest = llmd_runtime.render_inference_service(config) + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "llminferenceservice.yaml", manifest + ) + + def _pods_present() -> bool: + pods = llmd_runtime.oc_get_json( + "pods", namespace=namespace, selector=selector, ignore_not_found=True + ) + return bool(pods and pods.get("items")) + + llmd_runtime.wait_until( + f"llm-d pods to appear in {namespace}", + timeout_seconds=config.platform["inference_service"][ + "pod_appearance_timeout_seconds" + ], + interval_seconds=5, + predicate=_pods_present, + ) + + def _service_ready() -> bool: + payload = llmd_runtime.oc_get_json( + "llminferenceservice", name=name, namespace=namespace + ) + return llmd_runtime.condition_status(payload, "Ready") == "True" + + llmd_runtime.wait_until( + f"llminferenceservice/{name} ready", + timeout_seconds=config.platform["inference_service"]["ready_timeout_seconds"], + interval_seconds=10, + predicate=_service_ready, + ) + + return llmd_runtime.wait_until( + f"gateway address for llminferenceservice/{name}", + timeout_seconds=config.platform["inference_service"]["ready_timeout_seconds"], + interval_seconds=10, + predicate=lambda: try_resolve_endpoint_url(config), + ) + + +def resolve_endpoint_url(config: llmd_runtime.ResolvedConfig) -> str: + endpoint_url = try_resolve_endpoint_url(config) + if endpoint_url: + return endpoint_url + + name = config.platform["inference_service"]["name"] + gateway_name = config.platform["gateway"]["status_address_name"] + raise RuntimeError( + f"Gateway address {gateway_name} is missing from llminferenceservice/{name} status.addresses" + ) + + +def try_resolve_endpoint_url(config: llmd_runtime.ResolvedConfig) -> str | None: + 
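+    # Scans .status.addresses on the LLMInferenceService for the entry named
+    # by gateway.status_address_name ("gateway-external" in platform.yaml)
+    # and returns None until the controller publishes that URL.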
name = config.platform["inference_service"]["name"] + namespace = config.namespace + gateway_name = config.platform["gateway"]["status_address_name"] + payload = llmd_runtime.oc_get_json( + "llminferenceservice", name=name, namespace=namespace + ) + + for address in payload.get("status", {}).get("addresses", []): + if address.get("name") == gateway_name and address.get("url"): + return address["url"] + return None + + +def run_smoke_request( + config: llmd_runtime.ResolvedConfig, endpoint_url: str +) -> dict[str, object]: + namespace = config.namespace + name = config.platform["inference_service"]["name"] + deployment_name = f"{name}{config.platform['inference_service']['workload_deployment_name_suffix']}" + + payload = { + "model": config.model["served_model_name"], + "prompt": config.smoke_request["prompt"], + "max_tokens": config.smoke_request["max_tokens"], + "temperature": config.smoke_request["temperature"], + } + llmd_runtime.write_json( + config.artifact_dir / "artifacts" / "smoke.request.json", payload + ) + + retries = config.platform["smoke"]["request_retries"] + delay = config.platform["smoke"]["request_retry_delay_seconds"] + result = None + for _ in range(retries): + result = llmd_runtime.oc( + "exec", + "-n", + namespace, + f"deployment/{deployment_name}", + "-c", + "main", + "--", + "curl", + "-k", + "-sSf", + f"{endpoint_url}{config.platform['smoke']['endpoint_path']}", + "-H", + "Content-Type: application/json", + "-d", + json.dumps(payload), + check=False, + capture_output=True, + ) + if result.returncode == 0: + break + time.sleep(delay) + + if result is None or result.returncode != 0: + raise RuntimeError("Smoke request never succeeded against the llm_d endpoint") + + response = json.loads(result.stdout) + if not response.get("choices"): + raise RuntimeError(f"Invalid smoke response payload: {result.stdout}") + return response + + +def run_guidellm_benchmark( + config: llmd_runtime.ResolvedConfig, endpoint_url: str +) -> None: + benchmark_name = config.benchmark["job_name"] + namespace = config.namespace + + llmd_runtime.oc( + "delete", + "job,pvc", + benchmark_name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "pod", + f"{benchmark_name}-copy", + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "guidellm-pvc.yaml", + llmd_runtime.render_guidellm_pvc(config), + ) + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "guidellm-job.yaml", + llmd_runtime.render_guidellm_job(config, endpoint_url), + ) + + def _job_terminal() -> dict[str, object] | None: + payload = llmd_runtime.oc_get_json( + "job", name=benchmark_name, namespace=namespace + ) + status = payload.get("status", {}) + if status.get("succeeded"): + return payload + if status.get("failed"): + raise RuntimeError(f"GuideLLM job {benchmark_name} failed") + return None + + llmd_runtime.wait_until( + f"GuideLLM job/{benchmark_name}", + timeout_seconds=config.benchmark["timeout_seconds"], + interval_seconds=10, + predicate=_job_terminal, + ) + + capture_guidellm_state(config) + copy_guidellm_results(config) + + +def copy_guidellm_results(config: llmd_runtime.ResolvedConfig) -> None: + benchmark_name = config.benchmark["job_name"] + namespace = config.namespace + pod_data = llmd_runtime.oc_get_json( + "pods", + namespace=namespace, + selector=f"job-name={benchmark_name}", + ignore_not_found=True, + ) + node_name = None + if pod_data and pod_data.get("items"): + node_name 
= pod_data["items"][0].get("spec", {}).get("nodeName") + + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "guidellm-copy-pod.yaml", + llmd_runtime.render_guidellm_copy_pod(config, node_name=node_name), + ) + + def _helper_ready() -> bool: + payload = llmd_runtime.oc_get_json( + "pod", + name=f"{benchmark_name}-copy", + namespace=namespace, + ) + conditions = payload.get("status", {}).get("conditions", []) + return any( + condition.get("type") == "Ready" and condition.get("status") == "True" + for condition in conditions + ) + + llmd_runtime.wait_until( + f"GuideLLM copy helper pod/{benchmark_name}-copy", + timeout_seconds=120, + interval_seconds=5, + predicate=_helper_ready, + ) + + result = llmd_runtime.oc( + "exec", + "-n", + namespace, + f"{benchmark_name}-copy", + "--", + "cat", + "/results/benchmarks.json", + check=False, + capture_output=True, + ) + if result.returncode == 0 and result.stdout: + llmd_runtime.write_text( + config.artifact_dir / "artifacts" / "results" / "benchmarks.json", + result.stdout, + ) + + +def capture_inference_service_state(config: llmd_runtime.ResolvedConfig) -> None: + name = config.platform["inference_service"]["name"] + namespace = config.namespace + artifacts_dir = config.artifact_dir / "artifacts" + selector = f"app.kubernetes.io/name={name}" -from projects.core.library import config, env, run -from projects.llm_d.toolbox.capture_isvc_state.main import run as capture_isvc_state + capture_get( + "llminferenceservice", + name, + namespace, + "yaml", + artifacts_dir / "llminferenceservice.yaml", + ) + capture_get( + "llminferenceservice", + name, + namespace, + "json", + artifacts_dir / "llminferenceservice.json", + ) + capture_get( + "pods", + None, + namespace, + "yaml", + artifacts_dir / "llminferenceservice.pods.yaml", + selector=selector, + ) + capture_get( + "deployments", + None, + namespace, + "yaml", + artifacts_dir / "llminferenceservice.deployments.yaml", + selector=selector, + ) + capture_get( + "replicasets", + None, + namespace, + "yaml", + artifacts_dir / "llminferenceservice.replicasets.yaml", + selector=selector, + ) + capture_get( + "pods", None, namespace, "wide", artifacts_dir / "namespace.pods.status" + ) + capture_get( + "services", None, namespace, "wide", artifacts_dir / "namespace.services.status" + ) -logger = logging.getLogger(__name__) + pod_list = llmd_runtime.oc_get_json( + "pods", namespace=namespace, selector=selector, ignore_not_found=True + ) + if pod_list: + lines = [] + previous_lines = [] + for pod in pod_list.get("items", []): + pod_name = pod["metadata"]["name"] + lines.append(f"=== {pod_name} ===") + log_result = llmd_runtime.oc( + "logs", + pod_name, + "-n", + namespace, + "--all-containers=true", + check=False, + capture_output=True, + ) + if log_result.stdout: + lines.append(log_result.stdout.rstrip()) + previous_lines.append(f"=== {pod_name} ===") + previous_result = llmd_runtime.oc( + "logs", + pod_name, + "-n", + namespace, + "--previous", + "--all-containers=true", + check=False, + capture_output=True, + ) + if previous_result.stdout: + previous_lines.append(previous_result.stdout.rstrip()) -def init(): - env.init() - run.init() - config.init(pathlib.Path(__file__).parent) + llmd_runtime.write_text( + artifacts_dir / "llminferenceservice.pods.logs", "\n".join(lines) + "\n" + ) + llmd_runtime.write_text( + artifacts_dir / "llminferenceservice.pods.previous.logs", + "\n".join(previous_lines) + "\n", + ) -@config.requires( - ns="prepare.namespace.name", - name="tests.llmd.flavors", -) -def 
test(_cfg): - logger.warning(f"Hello test {_cfg.ns}/{_cfg.name}") +def capture_guidellm_state(config: llmd_runtime.ResolvedConfig) -> None: + benchmark_name = config.benchmark["job_name"] + namespace = config.namespace + artifacts_dir = config.artifact_dir / "artifacts" - # two alternatives to query the configuration: - # @config.requires(dict) or config.project.get_config("") - # and we will define something similar for the secrets + capture_get( + "job", + benchmark_name, + namespace, + "yaml", + artifacts_dir / "guidellm_benchmark_job.yaml", + ) + capture_get( + "pods", + None, + namespace, + "yaml", + artifacts_dir / "guidellm_benchmark_job.pods.yaml", + selector=f"job-name={benchmark_name}", + ) + result = llmd_runtime.oc( + "logs", + f"job/{benchmark_name}", + "-n", + namespace, + check=False, + capture_output=True, + ) + if result.returncode == 0 and result.stdout: + llmd_runtime.write_text( + artifacts_dir / "guidellm_benchmark_job.logs", result.stdout + ) - config.project.get_config("tests.llmd.flavors") - capture_isvc_state(_cfg.name, namespace=_cfg.ns) +def capture_get( + kind: str, + name: str | None, + namespace: str, + output: str, + destination: Path, + *, + selector: str | None = None, +) -> None: + args = ["get", kind] + if name: + args.append(name) + args.extend(["-n", namespace]) + if selector: + args.extend(["-l", selector]) + args.extend(["-o", output]) + result = llmd_runtime.oc(*args, check=False, capture_output=True) + if result.returncode == 0 and result.stdout: + llmd_runtime.write_text(destination, result.stdout) diff --git a/projects/llm_d/toolbox/capture_isvc_state/main.py b/projects/llm_d/toolbox/capture_isvc_state/main.py old mode 100755 new mode 100644 index 78448e15..85d09bc8 --- a/projects/llm_d/toolbox/capture_isvc_state/main.py +++ b/projects/llm_d/toolbox/capture_isvc_state/main.py @@ -5,12 +5,7 @@ Replaces llmd_capture_isvc_state Ansible role """ -from projects.core.dsl import ( - execute_tasks, - shell, - task, - toolbox, -) +from projects.core.dsl import execute_tasks, shell, task, toolbox def run(llmisvc_name: str, *, namespace: str = ""): @@ -22,7 +17,6 @@ def run(llmisvc_name: str, *, namespace: str = ""): namespace: Namespace of the LLMInferenceService (empty string auto-detects current namespace) """ - # Execute all registered tasks in order, respecting conditions return execute_tasks(locals()) @@ -157,7 +151,6 @@ def capture_podmonitors(args, context): @task def capture_pod_logs(args, context): """Capture logs from LLMInferenceService pods""" - # Get list of pod names result = shell.run( f'oc get pods -l "app.kubernetes.io/name={args.llmisvc_name}" -n {context.target_namespace} -o jsonpath="{{.items[*].metadata.name}}"', check=False, @@ -170,19 +163,16 @@ def capture_pod_logs(args, context): log_file = args.artifact_dir / "artifacts/llminferenceservice.pods.logs" - # Capture logs for each pod - with open(log_file, "w") as f: # Start with empty file + with open(log_file, "w") as handle: for pod_name in pod_names: - f.write(f"=== Logs for pod: {pod_name} ===\n") - - # Get logs for this pod + handle.write(f"=== Logs for pod: {pod_name} ===\n") log_result = shell.run( f"oc logs {pod_name} -n {context.target_namespace} --all-containers=true", check=False, log_stdout=False, ) - f.write(log_result.stdout) - f.write("\n") + handle.write(log_result.stdout) + handle.write("\n") return f"Pod logs captured for {len(pod_names)} pods" @@ -190,7 +180,6 @@ def capture_pod_logs(args, context): @task def capture_pod_previous_logs(args, context): """Capture 
previous logs from LLMInferenceService pods if available""" - # Get list of pod names result = shell.run( f'oc get pods -l "app.kubernetes.io/name={args.llmisvc_name}" -n {context.target_namespace} -o jsonpath="{{.items[*].metadata.name}}"', check=False, @@ -202,19 +191,16 @@ def capture_pod_previous_logs(args, context): log_file = args.artifact_dir / "artifacts/llminferenceservice.pods.previous.logs" - # Capture previous logs for each pod - with open(log_file, "w") as f: # Start with empty file + with open(log_file, "w") as handle: for pod_name in pod_names: - f.write(f"=== Previous logs for pod: {pod_name} ===\n") - - # Get previous logs for this pod + handle.write(f"=== Previous logs for pod: {pod_name} ===\n") log_result = shell.run( f"oc logs {pod_name} -n {context.target_namespace} --previous --all-containers=true", check=False, log_stdout=False, ) - f.write(log_result.stdout) - f.write("\n") + handle.write(log_result.stdout) + handle.write("\n") return f"Pod previous logs captured for {len(pod_names)} pods" @@ -233,7 +219,6 @@ def capture_llminferenceservice_describe(args, context): @task def capture_pods_describe(args, context): """Capture describe output for related pods""" - # Get list of pod names result = shell.run( f'oc get pods -l "app.kubernetes.io/name={args.llmisvc_name}" -n {context.target_namespace} -o jsonpath="{{.items[*].metadata.name}}"', check=False, @@ -245,24 +230,20 @@ def capture_pods_describe(args, context): describe_file = args.artifact_dir / "artifacts/llminferenceservice.pods.describe.txt" - # Capture describe output for each pod - with open(describe_file, "w") as f: # Start with empty file + with open(describe_file, "w") as handle: for pod_name in pod_names: - f.write(f"=== Describe for pod: {pod_name} ===\n") - - # Get describe output for this pod + handle.write(f"=== Describe for pod: {pod_name} ===\n") describe_result = shell.run( f"oc describe pod {pod_name} -n {context.target_namespace}", log_stdout=False, check=False, ) - f.write(describe_result.stdout) - f.write("\n") + handle.write(describe_result.stdout) + handle.write("\n") return f"Pod describe output captured for {len(pod_names)} pods" -# Create the main function using the toolbox library main = toolbox.create_toolbox_main(run) diff --git a/tests/llm_d/test_runtime.py b/tests/llm_d/test_runtime.py new file mode 100644 index 00000000..4557de00 --- /dev/null +++ b/tests/llm_d/test_runtime.py @@ -0,0 +1,208 @@ +from __future__ import annotations + +from pathlib import Path + +import pytest + +from projects.llm_d.orchestration import llmd_runtime +from projects.llm_d.orchestration import prepare_llmd +from projects.llm_d.orchestration import test_llmd + + +def test_derive_namespace_uses_prefix_once() -> None: + namespace = llmd_runtime.derive_namespace("llm-d-nightly-smoke", "llm-d", 63) + assert namespace == "llm-d-nightly-smoke" + + +def test_parse_overrides_rejects_unknown_keys() -> None: + with pytest.raises(ValueError, match="Unsupported llm_d override keys"): + llmd_runtime.parse_overrides('{"model":"other"}') + + +def test_load_run_configuration_resolves_alias(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + + fournos_config = tmp_path / "fournos_config.yaml" + fournos_config.write_text( + "preset: cks\njob-name: llm-d-e2e\n", + encoding="utf-8", + ) + + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + + assert config.preset_name 
== "smoke" + assert config.preset_alias == "cks" + assert config.model["served_model_name"] == "Qwen/Qwen3-0.6B" + assert config.namespace == "llm-d-e2e" + assert config.namespace_is_managed is True + + +def test_namespace_override_is_not_managed(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", '{"namespace":"custom-ns"}') + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + + assert config.namespace == "custom-ns" + assert config.namespace_is_managed is False + + +def test_render_inference_service_injects_model_and_epp(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + manifest = llmd_runtime.render_inference_service(config) + + assert manifest["metadata"]["name"] == "llm-d" + assert manifest["metadata"]["namespace"] == config.namespace + assert manifest["spec"]["model"]["name"] == "Qwen/Qwen3-0.6B" + assert manifest["spec"]["model"]["uri"] == "hf://Qwen/Qwen3-0.6B" + assert manifest["spec"]["model"]["name"] == config.model["served_model_name"] + assert manifest["spec"]["model"]["uri"] == config.model["uri"] + router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0]["args"] + assert router_args[-2] == "--config-text" + assert "EndpointPickerConfig" in router_args[-1] + + +def test_render_guidellm_job_uses_target_and_rate(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + (tmp_path / "fournos_config.yaml").write_text( + "preset: benchmark-short\njob-name: llm-d-benchmark\n", + encoding="utf-8", + ) + + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + manifest = llmd_runtime.render_guidellm_job(config, "https://example.test") + + container = manifest["spec"]["template"]["spec"]["containers"][0] + assert container["image"] == "ghcr.io/vllm-project/guidellm:v0.5.4" + assert "--target=https://example.test" in container["args"] + assert "--rate=1" in container["args"] + + +def test_prepare_gpu_operator_skips_existing_clusterpolicy( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + + calls: list[str] = [] + + monkeypatch.setattr( + prepare_llmd, + "ensure_operator_subscription", + lambda operator_spec: calls.append(f"subscription:{operator_spec['package']}"), + ) + monkeypatch.setattr( + llmd_runtime, + "wait_for_crd", + lambda crd_name, *, timeout_seconds: calls.append(f"crd:{crd_name}"), + ) + monkeypatch.setattr( + llmd_runtime, + "load_manifest_template", + lambda _config, _path: { + "apiVersion": "nvidia.com/v1", + "kind": "ClusterPolicy", + "metadata": {"name": "gpu-cluster-policy"}, + "spec": {}, + }, + ) + monkeypatch.setattr(llmd_runtime, "resource_exists", lambda kind, name: True) + + def fail_apply(*_: object, **__: object) -> None: + raise AssertionError("existing ClusterPolicy must not be reapplied") + + monkeypatch.setattr(llmd_runtime, "apply_manifest", fail_apply) + monkeypatch.setattr( + llmd_runtime, + 
"oc_get_json", + lambda kind, name: {"status": {"state": "ready"}}, + ) + + prepare_llmd.prepare_gpu_operator(config) + + assert calls == ["subscription:gpu-operator-certified", "crd:clusterpolicies.nvidia.com"] + + +def test_prepare_gpu_operator_bootstraps_missing_clusterpolicy( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + + applied: list[Path] = [] + manifest = { + "apiVersion": "nvidia.com/v1", + "kind": "ClusterPolicy", + "metadata": {"name": "gpu-cluster-policy"}, + "spec": {}, + } + + monkeypatch.setattr(prepare_llmd, "ensure_operator_subscription", lambda _: None) + monkeypatch.setattr(llmd_runtime, "wait_for_crd", lambda *_, **__: None) + monkeypatch.setattr(llmd_runtime, "load_manifest_template", lambda _config, _path: manifest) + monkeypatch.setattr(llmd_runtime, "resource_exists", lambda kind, name: False) + monkeypatch.setattr( + llmd_runtime, + "apply_manifest", + lambda artifact_path, _manifest: applied.append(artifact_path), + ) + monkeypatch.setattr( + llmd_runtime, + "oc_get_json", + lambda kind, name: {"status": {"state": "ready"}}, + ) + + prepare_llmd.prepare_gpu_operator(config) + + assert applied == [artifact_dir / "src" / "gpu-clusterpolicy.yaml"] + + +def test_gpu_clusterpolicy_manifest_has_required_default_sections() -> None: + manifest = llmd_runtime.load_yaml( + llmd_runtime.CONFIG_DIR / "manifests" / "gpu-clusterpolicy.yaml" + ) + + assert manifest["kind"] == "ClusterPolicy" + assert manifest["metadata"]["name"] == "gpu-cluster-policy" + assert { + "daemonsets", + "dcgm", + "dcgmExporter", + "devicePlugin", + "driver", + "gfd", + "nodeStatusExporter", + "operator", + "toolkit", + } <= set(manifest["spec"]) + + +def test_resolve_endpoint_url_requires_gateway_address( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + + def fake_oc_get_json(kind: str, **_: object) -> dict[str, object]: + assert kind == "llminferenceservice" + return {"status": {"addresses": [{"name": "other", "url": "https://wrong"}]}} + + monkeypatch.setattr(llmd_runtime, "oc_get_json", fake_oc_get_json) + + with pytest.raises(RuntimeError, match="Gateway address"): + test_llmd.resolve_endpoint_url(config) From b575b037dc725ddacbcf3629645020e3c16b0d2b Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Wed, 15 Apr 2026 11:02:18 +0100 Subject: [PATCH 02/21] refactor: move llm_d config and phases --- config/llm_d/models.yaml | 25 - config/llm_d/presets.yaml | 14 - projects/core/library/config.py | 6 +- projects/llm_d/README.md | 19 +- .../llm_d/orchestration/config.d/models.yaml | 25 + .../orchestration/config.d}/platform.yaml | 0 .../llm_d/orchestration/config.d/runtime.yaml | 7 + .../orchestration/config.d}/workloads.yaml | 1 - projects/llm_d/orchestration/config.yaml | 233 +-------- projects/llm_d/orchestration/llmd_runtime.py | 93 ++-- .../manifests/datasciencecluster.yaml | 0 .../epp-approximate-prefix-cache.yaml | 0 .../orchestration}/manifests/gateway.yaml | 0 .../manifests/gpu-clusterpolicy.yaml | 0 .../manifests/llminferenceservice.yaml | 0 .../manifests/nfd-nodefeaturediscovery.yaml | 0 projects/llm_d/orchestration/prepare_llmd.py | 427 +-------------- 
.../llm_d/orchestration/presets.d/cks.yaml | 23 - .../orchestration/presets.d/presets.yaml | 18 +- projects/llm_d/orchestration/test_llmd.py | 477 +---------------- projects/llm_d/toolbox/cleanup/main.py | 66 +++ projects/llm_d/toolbox/prepare/main.py | 391 ++++++++++++++ projects/llm_d/toolbox/test/main.py | 492 ++++++++++++++++++ tests/llm_d/test_runtime.py | 90 +++- 24 files changed, 1149 insertions(+), 1258 deletions(-) delete mode 100644 config/llm_d/models.yaml delete mode 100644 config/llm_d/presets.yaml create mode 100644 projects/llm_d/orchestration/config.d/models.yaml rename {config/llm_d => projects/llm_d/orchestration/config.d}/platform.yaml (100%) create mode 100644 projects/llm_d/orchestration/config.d/runtime.yaml rename {config/llm_d => projects/llm_d/orchestration/config.d}/workloads.yaml (99%) rename {config/llm_d => projects/llm_d/orchestration}/manifests/datasciencecluster.yaml (100%) rename {config/llm_d => projects/llm_d/orchestration}/manifests/epp-approximate-prefix-cache.yaml (100%) rename {config/llm_d => projects/llm_d/orchestration}/manifests/gateway.yaml (100%) rename {config/llm_d => projects/llm_d/orchestration}/manifests/gpu-clusterpolicy.yaml (100%) rename {config/llm_d => projects/llm_d/orchestration}/manifests/llminferenceservice.yaml (100%) rename {config/llm_d => projects/llm_d/orchestration}/manifests/nfd-nodefeaturediscovery.yaml (100%) delete mode 100644 projects/llm_d/orchestration/presets.d/cks.yaml create mode 100644 projects/llm_d/toolbox/cleanup/main.py create mode 100644 projects/llm_d/toolbox/prepare/main.py create mode 100644 projects/llm_d/toolbox/test/main.py diff --git a/config/llm_d/models.yaml b/config/llm_d/models.yaml deleted file mode 100644 index 46cf4bf4..00000000 --- a/config/llm_d/models.yaml +++ /dev/null @@ -1,25 +0,0 @@ -models: - - qwen3-0-6b: - served_model_name: Qwen/Qwen3-0.6B - uri: hf://Qwen/Qwen3-0.6B - resources: - requests: - cpu: "4" - memory: 16Gi - nvidia.com/gpu: "1" - limits: - cpu: "4" - memory: 16Gi - nvidia.com/gpu: "1" - - llama-3-1-8b-instruct-fp8: - served_model_name: llama-3-1-8b-instruct-fp8 - uri: oci://registry.redhat.io/rhelai1/modelcar-llama-3-1-8b-instruct-fp8-dynamic:1.5 - resources: - requests: - cpu: "4" - memory: 8Gi - nvidia.com/gpu: "1" - limits: - nvidia.com/gpu: "1" diff --git a/config/llm_d/presets.yaml b/config/llm_d/presets.yaml deleted file mode 100644 index 9fdaae32..00000000 --- a/config/llm_d/presets.yaml +++ /dev/null @@ -1,14 +0,0 @@ -aliases: - cks: smoke - -presets: - - smoke: - model: qwen3-0-6b - smoke_request: default - benchmark: null - - benchmark-short: - model: llama-3-1-8b-instruct-fp8 - smoke_request: default - benchmark: short diff --git a/projects/core/library/config.py b/projects/core/library/config.py index 55d3cb24..b17b3e3a 100644 --- a/projects/core/library/config.py +++ b/projects/core/library/config.py @@ -307,7 +307,9 @@ def multi_dereference(): # --- # - new_value = simple_dereference() if value.startswith("@") else multi_dereference() + new_value = ( + simple_dereference() if value.startswith("@") else multi_dereference() + ) if not handled_secretly: logger.info(f"resolve_reference: {value} ==> '{new_value}'") @@ -435,8 +437,6 @@ def init(orchestration_dir, *, apply_config_overrides=True): project = Config(config_path) - env.ARTIFACT_DIR / VARIABLE_OVERRIDES_FILENAME - if not apply_config_overrides: logger.info( "config.init: running with 'apply_config_overrides', " diff --git a/projects/llm_d/README.md b/projects/llm_d/README.md index d76634d9..fd443121 100644 
--- a/projects/llm_d/README.md +++ b/projects/llm_d/README.md @@ -6,11 +6,20 @@ The current implementation is intentionally narrow: - target only downstream `LLMInferenceService` - keep the public interface compatible with current Fournos phase execution -- use checked-in presets and manifests instead of a large mutable config surface +- use checked-in config chunks and manifests instead of a large mutable config surface + +Configuration layout: + +- base config: [`orchestration/config.yaml`](./orchestration/config.yaml) +- config chunks: [`orchestration/config.d`](./orchestration/config.d) +- presets: [`orchestration/presets.d`](./orchestration/presets.d) +- manifests: [`orchestration/manifests`](./orchestration/manifests) Main entrypoints: -- CI phase wrapper: [ci.py](/Users/aperdomo/workspace/redhat/forge/projects/llm_d/orchestration/ci.py) -- Prepare flow: [prepare_llmd.py](/Users/aperdomo/workspace/redhat/forge/projects/llm_d/orchestration/prepare_llmd.py) -- Test flow: [test_llmd.py](/Users/aperdomo/workspace/redhat/forge/projects/llm_d/orchestration/test_llmd.py) -- Shared runtime/config loader: [llmd_runtime.py](/Users/aperdomo/workspace/redhat/forge/projects/llm_d/orchestration/llmd_runtime.py) +- CI phase wrapper: [`orchestration/ci.py`](./orchestration/ci.py) +- CLI wrapper: [`orchestration/cli.py`](./orchestration/cli.py) +- Shared runtime/config loader: [`orchestration/llmd_runtime.py`](./orchestration/llmd_runtime.py) +- Toolbox prepare command: [`toolbox/prepare/main.py`](./toolbox/prepare/main.py) +- Toolbox test command: [`toolbox/test/main.py`](./toolbox/test/main.py) +- Toolbox cleanup command: [`toolbox/cleanup/main.py`](./toolbox/cleanup/main.py) diff --git a/projects/llm_d/orchestration/config.d/models.yaml b/projects/llm_d/orchestration/config.d/models.yaml new file mode 100644 index 00000000..fd204db4 --- /dev/null +++ b/projects/llm_d/orchestration/config.d/models.yaml @@ -0,0 +1,25 @@ +qwen3-0-6b: + served_model_name: Qwen/Qwen3-0.6B + uri: hf://Qwen/Qwen3-0.6B + resources: + requests: + cpu: "4" + memory: 16Gi + nvidia.com/gpu: "1" + limits: + cpu: "4" + memory: 16Gi + nvidia.com/gpu: "1" + +llama-3-1-8b-instruct-fp8: + served_model_name: llama-3-1-8b-instruct-fp8 + uri: oci://registry.redhat.io/rhelai1/modelcar-llama-3-1-8b-instruct-fp8-dynamic:1.5 + resources: + requests: + cpu: "4" + memory: 8Gi + nvidia.com/gpu: "1" + limits: + cpu: "4" + memory: 8Gi + nvidia.com/gpu: "1" diff --git a/config/llm_d/platform.yaml b/projects/llm_d/orchestration/config.d/platform.yaml similarity index 100% rename from config/llm_d/platform.yaml rename to projects/llm_d/orchestration/config.d/platform.yaml diff --git a/projects/llm_d/orchestration/config.d/runtime.yaml b/projects/llm_d/orchestration/config.d/runtime.yaml new file mode 100644 index 00000000..982d8fd2 --- /dev/null +++ b/projects/llm_d/orchestration/config.d/runtime.yaml @@ -0,0 +1,7 @@ +default_preset: smoke +allowed_override_keys: + - namespace +selected_preset: smoke +model_key: qwen3-0-6b +smoke_request_key: default +benchmark_key: null diff --git a/config/llm_d/workloads.yaml b/projects/llm_d/orchestration/config.d/workloads.yaml similarity index 99% rename from config/llm_d/workloads.yaml rename to projects/llm_d/orchestration/config.d/workloads.yaml index f5ebbb85..1ce9bdc6 100644 --- a/config/llm_d/workloads.yaml +++ b/projects/llm_d/orchestration/config.d/workloads.yaml @@ -5,7 +5,6 @@ smoke_requests: temperature: 0.7 benchmarks: - short: job_name: guidellm-benchmark image: 
ghcr.io/vllm-project/guidellm:v0.5.4 diff --git a/projects/llm_d/orchestration/config.yaml b/projects/llm_d/orchestration/config.yaml index e7367e8f..c36dfa60 100644 --- a/projects/llm_d/orchestration/config.yaml +++ b/projects/llm_d/orchestration/config.yaml @@ -1,230 +1,3 @@ -prepare: - skip: false - namespace: - name: llm-d-project - - operators: - skip: false - list: - - name: "Red Hat Connectivity Link" - catalog: redhat-operators - operator: rhcl-operator - namespace: all - enabled: false - - - name: "OpenShift Cert Manager" - catalog: redhat-operators - operator: openshift-cert-manager-operator - namespace: openshift-cert-manager-operator - enabled: true - - - name: "Leader Worker Set" - catalog: redhat-operators - operator: leader-worker-set - namespace: openshift-lws - deploy_cr: true - enabled: true - - - name: "Node Feature Discovery" - catalog: redhat-operators - operator: nfd - namespace: openshift-nfd - deploy_cr: 1 - enabled: true - - - name: "NVIDIA GPU Operator" - catalog: certified-operators - operator: gpu-operator-certified - namespace: nvidia-gpu-operator - deploy_cr: true - enabled: true - - - name: "Grafana Operator" - catalog: community-operators - operator: grafana-operator - namespace: grafana-operator - enabled: true - extra_args: - all_namespaces: true - - cluster: - skip: false - nodes: - auto_scale: false - auto_scale_down_on_exit: false - instance_type: gx3-16x80x1l4 - count: 2 - - rhoai: - skip: false - image: "quay.io/rhoai/rhoai-fbc-fragment" - tag: "rhoai-3.3@sha256:f6e7db613cd040e53da2d47850477a9b914de18979adaaac47e15dc7c76f8a76" - channel: "stable-3.x" - datasciencecluster: - enable: "[kserve]" - extra_settings: '{"spec.components.kserve.rawDeploymentServiceConfig": "Headless"}' - - gateway: - skip: false - name: openshift-ai-inference # NOTE: Should not be changed for the time being - - grafana: - skip: false - namespace: grafana - datasources: - - grafana/datasource.yaml - dashboards_dir: grafana/dashboards - - monitoring: - skip: false - namespaces: - - "@prepare.namespace.name" - - gpu: - wait_for_readiness: false - - preload: - skip: false - extra_images: {} - node_selector_key: nvidia.com/gpu.present - node_selector_value: "true" - - pvc: - enabled: true - size: 2000Gi - name: storage - access_mode: ReadWriteOnce - storage_class: null - - model_downloader: - image: ghcr.io/opendatahub-io/rhaii-on-xks/kserve-storage-initializer:e6b5db0@sha256:b305264fe2211be2c6063500c4c11da79e8357af4b34dd8567b0d8e8dea7e1d4 - - cleanup: - skip: false - -models: - facebook-opt-125m: - name: facebook/opt-125m - source: hf://facebook/opt-125m - resources: - cpu: 2 - memory: 8Gi - - llama3-1-8b: - name: RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8-dynamic - uri: oci://registry.redhat.io/rhelai1/modelcar-llama-3-1-8b-instruct-fp8-dynamic:1.5 - # source: hf://RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8-dynamic - resources: {} - - llama3-3-70b: - name: RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic - source: hf://RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic - resources: - cpu: 4 - memory: 64Gi - - gpt-oss-120: - name: openai/gpt-oss-120b - source: hf://openai/gpt-oss-120b - resources: - cpu: 4 - memory: 64Gi - - granite4-tiny: - name: RedHatAI/granite-4.0-h-tiny-FP8-dynamic - source: hf://RedHatAI/granite-4.0-h-tiny-FP8-dynamic - resources: {} - -tests: - llmd: - skip: false - skip_prepare: false - flavors: intelligentrouting - namespace: "@prepare.namespace.name" - - inference_service: - skip_deployment: false - name: llm-d - yaml_file: llama-3-1-8b-instruct-fp8.yaml - timeout: 900 - 
do_simple_test: true - gateway: - name: gateway-external - model: llama3-1-8b - metrics: - manual_capture: true - scheduler_servicemonitor_name: kserve-llm-isvc-scheduler - vllm_podmonitor_name: kserve-llm-isvc-vllm-engine - - # vLLM arguments (always applied) - vllm_args: - - "--disable-uvicorn-access-log" - - "--enable-prefix-caching" - - "--uvicorn-log-level=debug" - - "--trust-remote-code" - - "--disable-log-requests" - - "--max-model-len=40960" # keep in 5th position or uddate the presets - - "--gpu-memory-utilization=0.92" - - kueue: - enabled: false - prefix: "kueue.x-k8s.io/" - labels: - pod-group-name: llmisvc - managed: "true" - annotations: - queue-name: perf-gpu-queue - - # Extra properties to inject into the LLMISVC YAML using dotted-key notation - extra_properties: {} - - benchmarks: - guidellm: - enabled: true - name: guidellm-benchmark - backend_type: openai_http - rate_type: concurrent - max_seconds: 120 - max_requests: null - timeout: 900 - data: prompt_tokens=256,output_tokens=128 - rate: 1 - sample_requests: 20 - - capture_prom: true - capture_prom_uwm: true - dry_mode: false - visualize: true - -export_artifacts: - enabled: false - -matbench: - enabled: true - preset: null - workload: projects.llm-d.visualizations.llmd_inference - config_file: plots.yaml - # directory to plot - lts: - generate: true - opensearch: - export: - enabled: false - enabled_on_replot: false - fail_test_on_fail: true - instance: smoke - index: forge-llm-d-cpt - index_prefix: "" - build_counter_index: "forge-llm-d-builds" # used to generate a unique ID for each build - regression_analyses: - enabled: false - enabled_on_replot: true - upload_lts_on_regression: true - # if the regression analyses fail, mark the test as failed - fail_test_on_regression: true - notification: - enabled: true - title: "llm-d CPT" - download: - mode: prefer_cache - url: null +project: + name: llm_d + args: [] diff --git a/projects/llm_d/orchestration/llmd_runtime.py b/projects/llm_d/orchestration/llmd_runtime.py index aba35fd8..8b507bd2 100644 --- a/projects/llm_d/orchestration/llmd_runtime.py +++ b/projects/llm_d/orchestration/llmd_runtime.py @@ -7,7 +7,6 @@ import re import shlex import subprocess -import sys import time from dataclasses import dataclass from pathlib import Path @@ -15,15 +14,11 @@ import yaml -FORGE_HOME = Path(__file__).resolve().parents[3] -if str(FORGE_HOME) not in sys.path: - sys.path.insert(0, str(FORGE_HOME)) - -from projects.core.library import env, run +from projects.core.library import config, env, run LOGGER = logging.getLogger(__name__) -CONFIG_DIR = FORGE_HOME / "config" / "llm_d" -ALLOWED_OVERRIDE_KEYS = frozenset({"namespace"}) +ORCHESTRATION_DIR = env.FORGE_HOME / "projects" / "llm_d" / "orchestration" +CONFIG_DIR = ORCHESTRATION_DIR class CommandError(RuntimeError): @@ -72,39 +67,42 @@ def load_run_configuration( *, cwd: Path | None = None, artifact_dir: Path | None = None ) -> ResolvedConfig: cwd = cwd or Path.cwd() - artifact_dir = artifact_dir or env.ARTIFACT_DIR - if artifact_dir is None: - raise RuntimeError("ARTIFACT_DIR is not initialized") - - platform_data = load_yaml(CONFIG_DIR / "platform.yaml") - models_data = load_yaml(CONFIG_DIR / "models.yaml")["models"] - workloads_data = load_yaml(CONFIG_DIR / "workloads.yaml") - preset_data = load_yaml(CONFIG_DIR / "presets.yaml") + if artifact_dir is not None: + os.environ["ARTIFACT_DIR"] = str(artifact_dir) + artifact_dir = init() + _reinitialize_project_config() + platform_data = 
copy.deepcopy(config.project.get_config("platform")) fournos_config = load_fournos_config(cwd) - overrides = parse_overrides(os.environ.get("FORGE_CONFIG_OVERRIDES", "")) + overrides = parse_overrides( + os.environ.get("FORGE_CONFIG_OVERRIDES", ""), + allowed_keys=config.project.get_config("runtime.allowed_override_keys", []), + ) requested_preset = ( - fournos_config.get("preset") or os.environ.get("FORGE_PRESET") or "smoke" - ) - alias = ( - requested_preset if requested_preset in preset_data.get("aliases", {}) else None + fournos_config.get("preset") + or os.environ.get("FORGE_PRESET") + or config.project.get_config("runtime.default_preset") ) - preset_name = preset_data.get("aliases", {}).get(requested_preset, requested_preset) - preset = preset_data["presets"].get(preset_name) - if preset is None: - raise ValueError(f"Unknown llm_d preset: {requested_preset}") + apply_requested_preset(requested_preset) + + preset_name = config.project.get_config("runtime.selected_preset") + preset_alias = requested_preset if requested_preset != preset_name else None - model_name = preset["model"] - model = copy.deepcopy(models_data[model_name]) + model_name = config.project.get_config("runtime.model_key") + model = copy.deepcopy(config.project.get_config(f"models.{model_name}")) - smoke_request_name = preset.get("smoke_request", "default") - smoke_request = copy.deepcopy(workloads_data["smoke_requests"][smoke_request_name]) + smoke_request_name = config.project.get_config("runtime.smoke_request_key") + smoke_request = copy.deepcopy( + config.project.get_config(f"workloads.smoke_requests.{smoke_request_name}") + ) - benchmark_name = preset.get("benchmark") + benchmark_name = config.project.get_config("runtime.benchmark_key", None) benchmark = None if benchmark_name: - benchmark = copy.deepcopy(workloads_data["benchmarks"][benchmark_name]) + benchmark = copy.deepcopy( + config.project.get_config(f"workloads.benchmarks.{benchmark_name}") + ) job_name = fournos_config.get("job-name") or os.environ.get("FORGE_JOB_NAME") if not job_name: @@ -121,10 +119,10 @@ def load_run_configuration( return ResolvedConfig( artifact_dir=Path(artifact_dir), - project_root=FORGE_HOME, - config_dir=CONFIG_DIR, + project_root=env.FORGE_HOME, + config_dir=ORCHESTRATION_DIR, preset_name=preset_name, - preset_alias=alias, + preset_alias=preset_alias, job_name=job_name, namespace=namespace, namespace_is_managed=namespace_override is None, @@ -138,6 +136,26 @@ def load_run_configuration( ) +def _reinitialize_project_config() -> None: + config.project = None + artifact_config = env.ARTIFACT_DIR / "config.yaml" + if artifact_config.exists(): + artifact_config.unlink() + + presets_applied = env.ARTIFACT_DIR / "presets_applied" + if presets_applied.exists(): + presets_applied.unlink() + + config.init(ORCHESTRATION_DIR) + + +def apply_requested_preset(requested_preset: str) -> None: + if not config.project.get_preset(requested_preset): + raise ValueError(f"Unknown llm_d preset: {requested_preset}") + + config.project.apply_preset(requested_preset) + + def load_fournos_config(cwd: Path) -> dict[str, Any]: config_path = cwd / "fournos_config.yaml" if not config_path.exists(): @@ -153,7 +171,7 @@ def load_fournos_config(cwd: Path) -> dict[str, Any]: return data -def parse_overrides(raw: str) -> dict[str, Any]: +def parse_overrides(raw: str, *, allowed_keys: Iterable[str]) -> dict[str, Any]: if not raw or raw.strip() in {"", "null", "{}"}: return {} @@ -165,11 +183,12 @@ def parse_overrides(raw: str) -> dict[str, Any]: if not 
isinstance(data, dict): raise ValueError("FORGE_CONFIG_OVERRIDES must decode to a JSON object") - unsupported = sorted(set(data) - ALLOWED_OVERRIDE_KEYS) + allowed_keys = frozenset(allowed_keys) + unsupported = sorted(set(data) - allowed_keys) if unsupported: raise ValueError( "Unsupported llm_d override keys: " - f"{', '.join(unsupported)}. Allowed keys: {', '.join(sorted(ALLOWED_OVERRIDE_KEYS))}" + f"{', '.join(unsupported)}. Allowed keys: {', '.join(sorted(allowed_keys))}" ) return data diff --git a/config/llm_d/manifests/datasciencecluster.yaml b/projects/llm_d/orchestration/manifests/datasciencecluster.yaml similarity index 100% rename from config/llm_d/manifests/datasciencecluster.yaml rename to projects/llm_d/orchestration/manifests/datasciencecluster.yaml diff --git a/config/llm_d/manifests/epp-approximate-prefix-cache.yaml b/projects/llm_d/orchestration/manifests/epp-approximate-prefix-cache.yaml similarity index 100% rename from config/llm_d/manifests/epp-approximate-prefix-cache.yaml rename to projects/llm_d/orchestration/manifests/epp-approximate-prefix-cache.yaml diff --git a/config/llm_d/manifests/gateway.yaml b/projects/llm_d/orchestration/manifests/gateway.yaml similarity index 100% rename from config/llm_d/manifests/gateway.yaml rename to projects/llm_d/orchestration/manifests/gateway.yaml diff --git a/config/llm_d/manifests/gpu-clusterpolicy.yaml b/projects/llm_d/orchestration/manifests/gpu-clusterpolicy.yaml similarity index 100% rename from config/llm_d/manifests/gpu-clusterpolicy.yaml rename to projects/llm_d/orchestration/manifests/gpu-clusterpolicy.yaml diff --git a/config/llm_d/manifests/llminferenceservice.yaml b/projects/llm_d/orchestration/manifests/llminferenceservice.yaml similarity index 100% rename from config/llm_d/manifests/llminferenceservice.yaml rename to projects/llm_d/orchestration/manifests/llminferenceservice.yaml diff --git a/config/llm_d/manifests/nfd-nodefeaturediscovery.yaml b/projects/llm_d/orchestration/manifests/nfd-nodefeaturediscovery.yaml similarity index 100% rename from config/llm_d/manifests/nfd-nodefeaturediscovery.yaml rename to projects/llm_d/orchestration/manifests/nfd-nodefeaturediscovery.yaml diff --git a/projects/llm_d/orchestration/prepare_llmd.py b/projects/llm_d/orchestration/prepare_llmd.py index fdabe4b8..d52f921a 100644 --- a/projects/llm_d/orchestration/prepare_llmd.py +++ b/projects/llm_d/orchestration/prepare_llmd.py @@ -1,428 +1,15 @@ from __future__ import annotations -import json -import logging -from pathlib import Path - -from projects.llm_d.orchestration import llmd_runtime - -LOGGER = logging.getLogger(__name__) +from projects.llm_d.toolbox.cleanup.main import run as cleanup_toolbox_run +from projects.llm_d.toolbox.cleanup.main import run_cleanup +from projects.llm_d.toolbox.prepare.main import prepare_gpu_operator +from projects.llm_d.toolbox.prepare.main import run as prepare_toolbox_run +from projects.llm_d.toolbox.prepare.main import run_prepare def prepare() -> int: - llmd_runtime.init() - config = llmd_runtime.load_run_configuration() - - LOGGER.info( - "Preparing llm_d preset=%s namespace=%s", config.preset_name, config.namespace - ) - - verify_oc_access() - verify_cluster_version(config) - prepare_cert_manager(config) - prepare_leader_worker_set(config) - prepare_nfd(config) - prepare_gpu_operator(config) - prepare_rhoai_operator(config) - apply_datasciencecluster(config) - wait_for_datasciencecluster_ready(config) - ensure_required_crds(config.platform["rhoai"]["required_crds_after_dsc"], config) - 
ensure_gateway(config) - ensure_test_namespace(config) - verify_gpu_nodes(config) - capture_prepare_state(config) - - return 0 + return prepare_toolbox_run() def cleanup() -> int: - llmd_runtime.init() - config = llmd_runtime.load_run_configuration() - - inference_service_name = config.platform["inference_service"]["name"] - benchmark_name = ( - config.benchmark["job_name"] if config.benchmark else "guidellm-benchmark" - ) - - if config.namespace_is_managed: - if llmd_runtime.resource_exists("namespace", config.namespace): - llmd_runtime.oc( - "delete", "namespace", config.namespace, "--ignore-not-found=true" - ) - llmd_runtime.wait_for_namespace_deleted( - config.namespace, - timeout_seconds=config.platform["cluster"]["cleanup_timeout_seconds"], - ) - else: - llmd_runtime.oc( - "delete", - "llminferenceservice", - inference_service_name, - "-n", - config.namespace, - "--ignore-not-found=true", - check=False, - ) - llmd_runtime.oc( - "delete", - "job,pvc", - benchmark_name, - "-n", - config.namespace, - "--ignore-not-found=true", - check=False, - ) - llmd_runtime.oc( - "delete", - "pod", - f"{benchmark_name}-copy", - "-n", - config.namespace, - "--ignore-not-found=true", - check=False, - ) - - return 0 - - -def verify_oc_access() -> None: - llmd_runtime.oc("whoami", capture_output=True) - - -def verify_cluster_version(config: llmd_runtime.ResolvedConfig) -> None: - version_info = llmd_runtime.oc("version", "-o", "json", capture_output=True) - payload = json.loads(version_info.stdout) - - openshift_version = ( - payload.get("openshiftVersion") - or payload.get("serverVersion", {}).get("gitVersion") - or payload.get("serverVersion", {}).get("platform") - ) - if not openshift_version: - raise RuntimeError( - "Could not determine OpenShift version from `oc version -o json`" - ) - - minimum = config.platform["cluster"]["minimum_openshift_version"] - if llmd_runtime.version_tuple(openshift_version) < llmd_runtime.version_tuple( - minimum - ): - raise RuntimeError( - f"Cluster version {openshift_version} is older than the llm_d minimum {minimum}" - ) - - -def ensure_operator_subscription(operator_spec: dict[str, str]) -> dict[str, object]: - llmd_runtime.ensure_subscription(operator_spec) - return llmd_runtime.wait_for_operator_csv( - operator_spec["package"], - operator_spec["namespace"], - timeout_seconds=operator_spec["wait_timeout_seconds"], - ) - - -def prepare_cert_manager(config: llmd_runtime.ResolvedConfig) -> None: - operator_spec = llmd_runtime.operator_spec_by_package( - config.platform, "openshift-cert-manager-operator" - ) - ensure_operator_subscription(operator_spec) - - -def prepare_leader_worker_set(config: llmd_runtime.ResolvedConfig) -> None: - operator_spec = llmd_runtime.operator_spec_by_package( - config.platform, "leader-worker-set" - ) - ensure_operator_subscription(operator_spec) - - -def prepare_nfd(config: llmd_runtime.ResolvedConfig) -> None: - operator_spec = llmd_runtime.operator_spec_by_package(config.platform, "nfd") - ensure_operator_subscription(operator_spec) - llmd_runtime.wait_for_crd( - operator_spec["bootstrap_crd"], - timeout_seconds=operator_spec["wait_timeout_seconds"], - ) - - manifest = llmd_runtime.load_manifest_template( - config, operator_spec["bootstrap_manifest"] - ) - llmd_runtime.apply_manifest( - config.artifact_dir / "src" / "nfd-nodefeaturediscovery.yaml", - manifest, - ) - - llmd_runtime.wait_until( - "NodeFeatureDiscovery bootstrap resource", - timeout_seconds=operator_spec["wait_timeout_seconds"], - interval_seconds=10, - predicate=lambda: 
llmd_runtime.resource_exists( - "nodefeaturediscovery", - manifest["metadata"]["name"], - namespace=manifest["metadata"]["namespace"], - ), - ) - - wait_for_nfd_gpu_labels( - config, timeout_seconds=operator_spec["wait_timeout_seconds"] - ) - - -def prepare_gpu_operator(config: llmd_runtime.ResolvedConfig) -> None: - operator_spec = llmd_runtime.operator_spec_by_package( - config.platform, "gpu-operator-certified" - ) - ensure_operator_subscription(operator_spec) - llmd_runtime.wait_for_crd( - operator_spec["bootstrap_crd"], - timeout_seconds=operator_spec["wait_timeout_seconds"], - ) - - manifest = llmd_runtime.load_manifest_template( - config, operator_spec["bootstrap_manifest"] - ) - clusterpolicy_name = manifest["metadata"]["name"] - if llmd_runtime.resource_exists("clusterpolicy", clusterpolicy_name): - LOGGER.info( - "ClusterPolicy/%s already exists; verifying readiness instead of applying bootstrap manifest", - clusterpolicy_name, - ) - wait_for_gpu_clusterpolicy_ready( - clusterpolicy_name, - timeout_seconds=operator_spec["wait_timeout_seconds"], - ) - return - - llmd_runtime.apply_manifest( - config.artifact_dir / "src" / "gpu-clusterpolicy.yaml", - manifest, - ) - - wait_for_gpu_clusterpolicy_ready( - clusterpolicy_name, - timeout_seconds=operator_spec["wait_timeout_seconds"], - ) - - -def wait_for_gpu_clusterpolicy_ready( - clusterpolicy_name: str, *, timeout_seconds: int -) -> None: - def _clusterpolicy_ready() -> bool: - payload = llmd_runtime.oc_get_json( - "clusterpolicy", - name=clusterpolicy_name, - ) - state = payload.get("status", {}).get("state", "") - return state.lower() == "ready" - - llmd_runtime.wait_until( - f"clusterpolicy/{clusterpolicy_name} ready", - timeout_seconds=timeout_seconds, - interval_seconds=15, - predicate=_clusterpolicy_ready, - ) - - -def prepare_rhoai_operator(config: llmd_runtime.ResolvedConfig) -> None: - operator_spec = llmd_runtime.operator_spec_by_package( - config.platform, "rhods-operator" - ) - ensure_operator_subscription(operator_spec) - ensure_required_crds(config.platform["rhoai"]["required_crds_before_dsc"], config) - - -def ensure_required_crds( - crd_names: list[str], config: llmd_runtime.ResolvedConfig -) -> None: - for crd_name in crd_names: - llmd_runtime.wait_for_crd( - crd_name, - timeout_seconds=config.platform["rhoai"]["wait_timeout_seconds"], - ) - - -def apply_datasciencecluster(config: llmd_runtime.ResolvedConfig) -> None: - manifest = llmd_runtime.render_datasciencecluster(config) - llmd_runtime.apply_manifest( - config.artifact_dir / "src" / "datasciencecluster.yaml", manifest - ) - llmd_runtime.oc( - "get", - "datasciencecluster", - config.platform["rhoai"]["datasciencecluster_name"], - "-n", - config.platform["rhoai"]["namespace"], - "-o", - "yaml", - capture_output=True, - ) - - -def wait_for_datasciencecluster_ready(config: llmd_runtime.ResolvedConfig) -> None: - rhoai = config.platform["rhoai"] - - def _dsc_ready() -> bool: - payload = llmd_runtime.oc_get_json( - "datasciencecluster", - name=rhoai["datasciencecluster_name"], - namespace=rhoai["namespace"], - ) - phase = payload.get("status", {}).get("phase") - if phase == "Ready": - return True - if phase in {"Failed", "Error"}: - raise RuntimeError(f"DataScienceCluster entered terminal phase {phase}") - return False - - llmd_runtime.wait_until( - f"datasciencecluster/{rhoai['datasciencecluster_name']} ready", - timeout_seconds=rhoai["wait_timeout_seconds"], - interval_seconds=10, - predicate=_dsc_ready, - ) - - -def ensure_gateway(config: 
llmd_runtime.ResolvedConfig) -> None: - gateway = config.platform["gateway"] - if not llmd_runtime.resource_exists( - "gateway", gateway["name"], namespace=gateway["namespace"] - ): - if not gateway["create_if_missing"]: - raise RuntimeError( - f"Required gateway {gateway['name']} does not exist in {gateway['namespace']}" - ) - manifest = llmd_runtime.render_gateway(config) - llmd_runtime.apply_manifest( - config.artifact_dir / "src" / "gateway.yaml", manifest - ) - - def _gateway_programmed() -> bool: - resource = llmd_runtime.oc_get_json( - "gateway", - name=gateway["name"], - namespace=gateway["namespace"], - ) - return llmd_runtime.condition_status(resource, "Programmed") == "True" - - llmd_runtime.wait_until( - f"gateway/{gateway['name']} programmed", - timeout_seconds=gateway["wait_timeout_seconds"], - interval_seconds=10, - predicate=_gateway_programmed, - ) - - -def ensure_test_namespace(config: llmd_runtime.ResolvedConfig) -> None: - llmd_runtime.ensure_namespace( - config.namespace, - labels={ - "app.kubernetes.io/managed-by": "forge", - "forge.openshift.io/project": "llm_d", - }, - ) - - -def verify_gpu_nodes(config: llmd_runtime.ResolvedConfig) -> None: - selector = config.platform["cluster"]["gpu_node_label_selector"] - data = llmd_runtime.oc_get_json("nodes", selector=selector, ignore_not_found=True) - items = data.get("items", []) if data else [] - if not items: - raise RuntimeError( - f"No GPU nodes found with selector {selector}. The llm_d smoke path requires GPUs." - ) - - -def wait_for_nfd_gpu_labels( - config: llmd_runtime.ResolvedConfig, *, timeout_seconds: int -) -> None: - selectors = config.platform["cluster"]["nfd_gpu_detection_labels"] - - def _labels_present() -> bool: - for selector in selectors: - data = llmd_runtime.oc_get_json( - "nodes", selector=selector, ignore_not_found=True - ) - if data and data.get("items"): - return True - return False - - llmd_runtime.wait_until( - "NFD GPU discovery labels on cluster nodes", - timeout_seconds=timeout_seconds, - interval_seconds=15, - predicate=_labels_present, - ) - - -def capture_prepare_state(config: llmd_runtime.ResolvedConfig) -> None: - artifacts_dir = config.artifact_dir / "artifacts" - rhoai = config.platform["rhoai"] - gateway = config.platform["gateway"] - - capture_resource_yaml( - "datasciencecluster", - rhoai["datasciencecluster_name"], - rhoai["namespace"], - artifacts_dir / "datasciencecluster.yaml", - ) - capture_resource_yaml( - "gateway", - gateway["name"], - gateway["namespace"], - artifacts_dir / "gateway.yaml", - ) - gateway_service = llmd_runtime.oc( - "get", - "service", - "-A", - "-l", - f"gateway.networking.k8s.io/gateway-name={gateway['name']}", - "-o", - "yaml", - check=False, - capture_output=True, - ) - if gateway_service.returncode == 0 and gateway_service.stdout: - llmd_runtime.write_text( - artifacts_dir / "gateway.service.yaml", gateway_service.stdout - ) - if config.platform["artifacts"]["capture_namespace_events"]: - capture_namespace_events( - config.namespace, artifacts_dir / "namespace.events.txt" - ) - - -def capture_resource_yaml( - kind: str, - name: str, - namespace: str, - destination: Path, - *, - check: bool = True, -) -> None: - result = llmd_runtime.oc( - "get", - kind, - name, - "-n", - namespace, - "-o", - "yaml", - check=check, - capture_output=True, - ) - if result.returncode == 0 and result.stdout: - llmd_runtime.write_text(destination, result.stdout) - - -def capture_namespace_events(namespace: str, destination: Path) -> None: - result = llmd_runtime.oc( - "get", - 
"events", - "-n", - namespace, - "--sort-by=.metadata.creationTimestamp", - check=False, - capture_output=True, - ) - if result.returncode == 0 and result.stdout: - llmd_runtime.write_text(destination, result.stdout) + return cleanup_toolbox_run() diff --git a/projects/llm_d/orchestration/presets.d/cks.yaml b/projects/llm_d/orchestration/presets.d/cks.yaml deleted file mode 100644 index b4f842dc..00000000 --- a/projects/llm_d/orchestration/presets.d/cks.yaml +++ /dev/null @@ -1,23 +0,0 @@ -extends: [pvc_rwx, llama-70b] - -tests.capture_prom: false -tests.capture_prom_uwm: false -tests.llmd.skip_prepare: true -prepare.namespace.name: kpouget-dev -prepare.preload.node_selector_key: gpu.nvidia.com/class -prepare.preload.node_selector_value: "H200" -tests.llmd.inference_service.extra_properties: - spec.template.affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: kubernetes.io/hostname - operator: NotIn - values: - - gf48e48 - - gf4334a -prepare.preload.extra_images: - vllm-cuda-rhel9: registry.redhat.io/rhaiis/vllm-cuda-rhel9@sha256:094db84a1da5e8a575d0c9eade114fa30f4a2061064a338e3e032f3578f8082a - llm-d-inference-scheduler: ghcr.io/opendatahub-io/rhaii-on-xks/llm-d-inference-scheduler:e6b5db0@sha256:43e8b8edc158f31535c8b23d77629f8cde111cc762a8f4ee5f2f884470566211 - guidellm: ghcr.io/vllm-project/guidellm:v0.5.4 diff --git a/projects/llm_d/orchestration/presets.d/presets.yaml b/projects/llm_d/orchestration/presets.d/presets.yaml index 3bd1e3fb..37fcc711 100644 --- a/projects/llm_d/orchestration/presets.d/presets.yaml +++ b/projects/llm_d/orchestration/presets.d/presets.yaml @@ -1,9 +1,17 @@ __multiple: true -pvc_rwx: - prepare.pvc.name: storage-rwx - prepare.pvc.access_mode: ReadWriteMany +smoke: + runtime.selected_preset: smoke + runtime.model_key: qwen3-0-6b + runtime.smoke_request_key: default + runtime.benchmark_key: null +benchmark-short: + runtime.selected_preset: benchmark-short + runtime.model_key: llama-3-1-8b-instruct-fp8 + runtime.smoke_request_key: default + runtime.benchmark_key: short -llama-70b: - tests.llmd.inference_service.model: llama3-3-70b +cks: + extends: + - smoke diff --git a/projects/llm_d/orchestration/test_llmd.py b/projects/llm_d/orchestration/test_llmd.py index b11948d7..8fc2bc40 100644 --- a/projects/llm_d/orchestration/test_llmd.py +++ b/projects/llm_d/orchestration/test_llmd.py @@ -1,13 +1,9 @@ from __future__ import annotations -import json -import logging -import time -from pathlib import Path - from projects.llm_d.orchestration import llmd_runtime - -LOGGER = logging.getLogger(__name__) +from projects.llm_d.toolbox.test.main import resolve_endpoint_url +from projects.llm_d.toolbox.test.main import run as test_toolbox_run +from projects.llm_d.toolbox.test.main import run_test def init() -> None: @@ -15,469 +11,4 @@ def init() -> None: def test() -> int: - llmd_runtime.init() - config = llmd_runtime.load_run_configuration() - - name = config.platform["inference_service"]["name"] - namespace = config.namespace - artifacts_dir = config.artifact_dir / "artifacts" - - LOGGER.info("Testing llm_d preset=%s namespace=%s", config.preset_name, namespace) - - endpoint_url = None - try: - endpoint_url = deploy_inference_service(config) - smoke_response = run_smoke_request(config, endpoint_url) - llmd_runtime.write_json(artifacts_dir / "smoke.response.json", smoke_response) - - if config.benchmark: - run_guidellm_benchmark(config, endpoint_url) - - return 0 - finally: - 
capture_inference_service_state(config) - if endpoint_url: - llmd_runtime.write_text(artifacts_dir / "endpoint.url", f"{endpoint_url}\n") - benchmark_name = ( - config.benchmark["job_name"] if config.benchmark else "guidellm-benchmark" - ) - llmd_runtime.oc( - "delete", - "job,pvc", - benchmark_name, - "-n", - namespace, - "--ignore-not-found=true", - check=False, - ) - llmd_runtime.oc( - "delete", - "pod", - f"{benchmark_name}-copy", - "-n", - namespace, - "--ignore-not-found=true", - check=False, - ) - events = llmd_runtime.oc( - "get", - "events", - "-n", - namespace, - "--sort-by=.metadata.creationTimestamp", - check=False, - capture_output=True, - ) - if events.returncode == 0 and events.stdout: - llmd_runtime.write_text( - artifacts_dir / "namespace.events.txt", events.stdout - ) - - -def deploy_inference_service(config: llmd_runtime.ResolvedConfig) -> str: - name = config.platform["inference_service"]["name"] - namespace = config.namespace - selector = f"app.kubernetes.io/name={name}" - - llmd_runtime.oc( - "delete", - "llminferenceservice", - name, - "-n", - namespace, - "--ignore-not-found=true", - check=False, - ) - - def _old_pods_gone() -> bool: - pods = llmd_runtime.oc_get_json( - "pods", namespace=namespace, selector=selector, ignore_not_found=True - ) - return not pods or not pods.get("items") - - llmd_runtime.wait_until( - f"old llm-d pods to disappear in {namespace}", - timeout_seconds=config.platform["inference_service"]["delete_timeout_seconds"], - interval_seconds=10, - predicate=_old_pods_gone, - ) - - manifest = llmd_runtime.render_inference_service(config) - llmd_runtime.apply_manifest( - config.artifact_dir / "src" / "llminferenceservice.yaml", manifest - ) - - def _pods_present() -> bool: - pods = llmd_runtime.oc_get_json( - "pods", namespace=namespace, selector=selector, ignore_not_found=True - ) - return bool(pods and pods.get("items")) - - llmd_runtime.wait_until( - f"llm-d pods to appear in {namespace}", - timeout_seconds=config.platform["inference_service"][ - "pod_appearance_timeout_seconds" - ], - interval_seconds=5, - predicate=_pods_present, - ) - - def _service_ready() -> bool: - payload = llmd_runtime.oc_get_json( - "llminferenceservice", name=name, namespace=namespace - ) - return llmd_runtime.condition_status(payload, "Ready") == "True" - - llmd_runtime.wait_until( - f"llminferenceservice/{name} ready", - timeout_seconds=config.platform["inference_service"]["ready_timeout_seconds"], - interval_seconds=10, - predicate=_service_ready, - ) - - return llmd_runtime.wait_until( - f"gateway address for llminferenceservice/{name}", - timeout_seconds=config.platform["inference_service"]["ready_timeout_seconds"], - interval_seconds=10, - predicate=lambda: try_resolve_endpoint_url(config), - ) - - -def resolve_endpoint_url(config: llmd_runtime.ResolvedConfig) -> str: - endpoint_url = try_resolve_endpoint_url(config) - if endpoint_url: - return endpoint_url - - name = config.platform["inference_service"]["name"] - gateway_name = config.platform["gateway"]["status_address_name"] - raise RuntimeError( - f"Gateway address {gateway_name} is missing from llminferenceservice/{name} status.addresses" - ) - - -def try_resolve_endpoint_url(config: llmd_runtime.ResolvedConfig) -> str | None: - name = config.platform["inference_service"]["name"] - namespace = config.namespace - gateway_name = config.platform["gateway"]["status_address_name"] - payload = llmd_runtime.oc_get_json( - "llminferenceservice", name=name, namespace=namespace - ) - - for address in 
payload.get("status", {}).get("addresses", []): - if address.get("name") == gateway_name and address.get("url"): - return address["url"] - return None - - -def run_smoke_request( - config: llmd_runtime.ResolvedConfig, endpoint_url: str -) -> dict[str, object]: - namespace = config.namespace - name = config.platform["inference_service"]["name"] - deployment_name = f"{name}{config.platform['inference_service']['workload_deployment_name_suffix']}" - - payload = { - "model": config.model["served_model_name"], - "prompt": config.smoke_request["prompt"], - "max_tokens": config.smoke_request["max_tokens"], - "temperature": config.smoke_request["temperature"], - } - llmd_runtime.write_json( - config.artifact_dir / "artifacts" / "smoke.request.json", payload - ) - - retries = config.platform["smoke"]["request_retries"] - delay = config.platform["smoke"]["request_retry_delay_seconds"] - result = None - for _ in range(retries): - result = llmd_runtime.oc( - "exec", - "-n", - namespace, - f"deployment/{deployment_name}", - "-c", - "main", - "--", - "curl", - "-k", - "-sSf", - f"{endpoint_url}{config.platform['smoke']['endpoint_path']}", - "-H", - "Content-Type: application/json", - "-d", - json.dumps(payload), - check=False, - capture_output=True, - ) - if result.returncode == 0: - break - time.sleep(delay) - - if result is None or result.returncode != 0: - raise RuntimeError("Smoke request never succeeded against the llm_d endpoint") - - response = json.loads(result.stdout) - if not response.get("choices"): - raise RuntimeError(f"Invalid smoke response payload: {result.stdout}") - return response - - -def run_guidellm_benchmark( - config: llmd_runtime.ResolvedConfig, endpoint_url: str -) -> None: - benchmark_name = config.benchmark["job_name"] - namespace = config.namespace - - llmd_runtime.oc( - "delete", - "job,pvc", - benchmark_name, - "-n", - namespace, - "--ignore-not-found=true", - check=False, - ) - llmd_runtime.oc( - "delete", - "pod", - f"{benchmark_name}-copy", - "-n", - namespace, - "--ignore-not-found=true", - check=False, - ) - - llmd_runtime.apply_manifest( - config.artifact_dir / "src" / "guidellm-pvc.yaml", - llmd_runtime.render_guidellm_pvc(config), - ) - llmd_runtime.apply_manifest( - config.artifact_dir / "src" / "guidellm-job.yaml", - llmd_runtime.render_guidellm_job(config, endpoint_url), - ) - - def _job_terminal() -> dict[str, object] | None: - payload = llmd_runtime.oc_get_json( - "job", name=benchmark_name, namespace=namespace - ) - status = payload.get("status", {}) - if status.get("succeeded"): - return payload - if status.get("failed"): - raise RuntimeError(f"GuideLLM job {benchmark_name} failed") - return None - - llmd_runtime.wait_until( - f"GuideLLM job/{benchmark_name}", - timeout_seconds=config.benchmark["timeout_seconds"], - interval_seconds=10, - predicate=_job_terminal, - ) - - capture_guidellm_state(config) - copy_guidellm_results(config) - - -def copy_guidellm_results(config: llmd_runtime.ResolvedConfig) -> None: - benchmark_name = config.benchmark["job_name"] - namespace = config.namespace - pod_data = llmd_runtime.oc_get_json( - "pods", - namespace=namespace, - selector=f"job-name={benchmark_name}", - ignore_not_found=True, - ) - node_name = None - if pod_data and pod_data.get("items"): - node_name = pod_data["items"][0].get("spec", {}).get("nodeName") - - llmd_runtime.apply_manifest( - config.artifact_dir / "src" / "guidellm-copy-pod.yaml", - llmd_runtime.render_guidellm_copy_pod(config, node_name=node_name), - ) - - def _helper_ready() -> bool: - payload = 
llmd_runtime.oc_get_json( - "pod", - name=f"{benchmark_name}-copy", - namespace=namespace, - ) - conditions = payload.get("status", {}).get("conditions", []) - return any( - condition.get("type") == "Ready" and condition.get("status") == "True" - for condition in conditions - ) - - llmd_runtime.wait_until( - f"GuideLLM copy helper pod/{benchmark_name}-copy", - timeout_seconds=120, - interval_seconds=5, - predicate=_helper_ready, - ) - - result = llmd_runtime.oc( - "exec", - "-n", - namespace, - f"{benchmark_name}-copy", - "--", - "cat", - "/results/benchmarks.json", - check=False, - capture_output=True, - ) - if result.returncode == 0 and result.stdout: - llmd_runtime.write_text( - config.artifact_dir / "artifacts" / "results" / "benchmarks.json", - result.stdout, - ) - - -def capture_inference_service_state(config: llmd_runtime.ResolvedConfig) -> None: - name = config.platform["inference_service"]["name"] - namespace = config.namespace - artifacts_dir = config.artifact_dir / "artifacts" - selector = f"app.kubernetes.io/name={name}" - - capture_get( - "llminferenceservice", - name, - namespace, - "yaml", - artifacts_dir / "llminferenceservice.yaml", - ) - capture_get( - "llminferenceservice", - name, - namespace, - "json", - artifacts_dir / "llminferenceservice.json", - ) - capture_get( - "pods", - None, - namespace, - "yaml", - artifacts_dir / "llminferenceservice.pods.yaml", - selector=selector, - ) - capture_get( - "deployments", - None, - namespace, - "yaml", - artifacts_dir / "llminferenceservice.deployments.yaml", - selector=selector, - ) - capture_get( - "replicasets", - None, - namespace, - "yaml", - artifacts_dir / "llminferenceservice.replicasets.yaml", - selector=selector, - ) - capture_get( - "pods", None, namespace, "wide", artifacts_dir / "namespace.pods.status" - ) - capture_get( - "services", None, namespace, "wide", artifacts_dir / "namespace.services.status" - ) - - pod_list = llmd_runtime.oc_get_json( - "pods", namespace=namespace, selector=selector, ignore_not_found=True - ) - if pod_list: - lines = [] - previous_lines = [] - for pod in pod_list.get("items", []): - pod_name = pod["metadata"]["name"] - lines.append(f"=== {pod_name} ===") - log_result = llmd_runtime.oc( - "logs", - pod_name, - "-n", - namespace, - "--all-containers=true", - check=False, - capture_output=True, - ) - if log_result.stdout: - lines.append(log_result.stdout.rstrip()) - - previous_lines.append(f"=== {pod_name} ===") - previous_result = llmd_runtime.oc( - "logs", - pod_name, - "-n", - namespace, - "--previous", - "--all-containers=true", - check=False, - capture_output=True, - ) - if previous_result.stdout: - previous_lines.append(previous_result.stdout.rstrip()) - - llmd_runtime.write_text( - artifacts_dir / "llminferenceservice.pods.logs", "\n".join(lines) + "\n" - ) - llmd_runtime.write_text( - artifacts_dir / "llminferenceservice.pods.previous.logs", - "\n".join(previous_lines) + "\n", - ) - - -def capture_guidellm_state(config: llmd_runtime.ResolvedConfig) -> None: - benchmark_name = config.benchmark["job_name"] - namespace = config.namespace - artifacts_dir = config.artifact_dir / "artifacts" - - capture_get( - "job", - benchmark_name, - namespace, - "yaml", - artifacts_dir / "guidellm_benchmark_job.yaml", - ) - capture_get( - "pods", - None, - namespace, - "yaml", - artifacts_dir / "guidellm_benchmark_job.pods.yaml", - selector=f"job-name={benchmark_name}", - ) - result = llmd_runtime.oc( - "logs", - f"job/{benchmark_name}", - "-n", - namespace, - check=False, - capture_output=True, - ) - 
if result.returncode == 0 and result.stdout: - llmd_runtime.write_text( - artifacts_dir / "guidellm_benchmark_job.logs", result.stdout - ) - - -def capture_get( - kind: str, - name: str | None, - namespace: str, - output: str, - destination: Path, - *, - selector: str | None = None, -) -> None: - args = ["get", kind] - if name: - args.append(name) - args.extend(["-n", namespace]) - if selector: - args.extend(["-l", selector]) - args.extend(["-o", output]) - result = llmd_runtime.oc(*args, check=False, capture_output=True) - if result.returncode == 0 and result.stdout: - llmd_runtime.write_text(destination, result.stdout) + return test_toolbox_run() diff --git a/projects/llm_d/toolbox/cleanup/main.py b/projects/llm_d/toolbox/cleanup/main.py new file mode 100644 index 00000000..46d0aedf --- /dev/null +++ b/projects/llm_d/toolbox/cleanup/main.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +from projects.core.dsl import toolbox +from projects.llm_d.orchestration import llmd_runtime + + +def run() -> int: + llmd_runtime.init() + config = llmd_runtime.load_run_configuration() + return run_cleanup(config) + + +def run_cleanup(config: llmd_runtime.ResolvedConfig) -> int: + inference_service_name = config.platform["inference_service"]["name"] + benchmark_name = ( + config.benchmark["job_name"] if config.benchmark else "guidellm-benchmark" + ) + + if config.namespace_is_managed: + if llmd_runtime.resource_exists("namespace", config.namespace): + llmd_runtime.oc( + "delete", "namespace", config.namespace, "--ignore-not-found=true" + ) + llmd_runtime.wait_for_namespace_deleted( + config.namespace, + timeout_seconds=config.platform["cluster"]["cleanup_timeout_seconds"], + ) + else: + llmd_runtime.oc( + "delete", + "llminferenceservice", + inference_service_name, + "-n", + config.namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "job,pvc", + benchmark_name, + "-n", + config.namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "pod", + f"{benchmark_name}-copy", + "-n", + config.namespace, + "--ignore-not-found=true", + check=False, + ) + + return 0 + + +main = toolbox.create_toolbox_main(run) + + +if __name__ == "__main__": + main() diff --git a/projects/llm_d/toolbox/prepare/main.py b/projects/llm_d/toolbox/prepare/main.py new file mode 100644 index 00000000..7edad1f8 --- /dev/null +++ b/projects/llm_d/toolbox/prepare/main.py @@ -0,0 +1,391 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import json +import logging +from pathlib import Path + +from projects.core.dsl import toolbox +from projects.llm_d.orchestration import llmd_runtime + +LOGGER = logging.getLogger(__name__) + + +def run() -> int: + llmd_runtime.init() + config = llmd_runtime.load_run_configuration() + return run_prepare(config) + + +def run_prepare(config: llmd_runtime.ResolvedConfig) -> int: + LOGGER.info( + "Preparing llm_d preset=%s namespace=%s", config.preset_name, config.namespace + ) + + verify_oc_access() + verify_cluster_version(config) + prepare_cert_manager(config) + prepare_leader_worker_set(config) + prepare_nfd(config) + prepare_gpu_operator(config) + prepare_rhoai_operator(config) + apply_datasciencecluster(config) + wait_for_datasciencecluster_ready(config) + ensure_required_crds(config.platform["rhoai"]["required_crds_after_dsc"], config) + ensure_gateway(config) + ensure_test_namespace(config) + verify_gpu_nodes(config) + capture_prepare_state(config) + + return 0 + + +def 
verify_oc_access() -> None: + llmd_runtime.oc("whoami", capture_output=True) + + +def verify_cluster_version(config: llmd_runtime.ResolvedConfig) -> None: + version_info = llmd_runtime.oc("version", "-o", "json", capture_output=True) + payload = json.loads(version_info.stdout) + + openshift_version = ( + payload.get("openshiftVersion") + or payload.get("serverVersion", {}).get("gitVersion") + or payload.get("serverVersion", {}).get("platform") + ) + if not openshift_version: + raise RuntimeError( + "Could not determine OpenShift version from `oc version -o json`" + ) + + minimum = config.platform["cluster"]["minimum_openshift_version"] + if llmd_runtime.version_tuple(openshift_version) < llmd_runtime.version_tuple( + minimum + ): + raise RuntimeError( + f"Cluster version {openshift_version} is older than the llm_d minimum {minimum}" + ) + + +def ensure_operator_subscription(operator_spec: dict[str, str]) -> dict[str, object]: + llmd_runtime.ensure_subscription(operator_spec) + return llmd_runtime.wait_for_operator_csv( + operator_spec["package"], + operator_spec["namespace"], + timeout_seconds=operator_spec["wait_timeout_seconds"], + ) + + +def prepare_cert_manager(config: llmd_runtime.ResolvedConfig) -> None: + operator_spec = llmd_runtime.operator_spec_by_package( + config.platform, "openshift-cert-manager-operator" + ) + ensure_operator_subscription(operator_spec) + + +def prepare_leader_worker_set(config: llmd_runtime.ResolvedConfig) -> None: + operator_spec = llmd_runtime.operator_spec_by_package( + config.platform, "leader-worker-set" + ) + ensure_operator_subscription(operator_spec) + + +def prepare_nfd(config: llmd_runtime.ResolvedConfig) -> None: + operator_spec = llmd_runtime.operator_spec_by_package(config.platform, "nfd") + ensure_operator_subscription(operator_spec) + llmd_runtime.wait_for_crd( + operator_spec["bootstrap_crd"], + timeout_seconds=operator_spec["wait_timeout_seconds"], + ) + + manifest = llmd_runtime.load_manifest_template( + config, operator_spec["bootstrap_manifest"] + ) + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "nfd-nodefeaturediscovery.yaml", + manifest, + ) + + llmd_runtime.wait_until( + "NodeFeatureDiscovery bootstrap resource", + timeout_seconds=operator_spec["wait_timeout_seconds"], + interval_seconds=10, + predicate=lambda: llmd_runtime.resource_exists( + "nodefeaturediscovery", + manifest["metadata"]["name"], + namespace=manifest["metadata"]["namespace"], + ), + ) + + wait_for_nfd_gpu_labels( + config, timeout_seconds=operator_spec["wait_timeout_seconds"] + ) + + +def prepare_gpu_operator(config: llmd_runtime.ResolvedConfig) -> None: + operator_spec = llmd_runtime.operator_spec_by_package( + config.platform, "gpu-operator-certified" + ) + ensure_operator_subscription(operator_spec) + llmd_runtime.wait_for_crd( + operator_spec["bootstrap_crd"], + timeout_seconds=operator_spec["wait_timeout_seconds"], + ) + + manifest = llmd_runtime.load_manifest_template( + config, operator_spec["bootstrap_manifest"] + ) + clusterpolicy_name = manifest["metadata"]["name"] + if llmd_runtime.resource_exists("clusterpolicy", clusterpolicy_name): + LOGGER.info( + "ClusterPolicy/%s already exists; verifying readiness instead of applying bootstrap manifest", + clusterpolicy_name, + ) + wait_for_gpu_clusterpolicy_ready( + clusterpolicy_name, + timeout_seconds=operator_spec["wait_timeout_seconds"], + ) + return + + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "gpu-clusterpolicy.yaml", + manifest, + ) + + wait_for_gpu_clusterpolicy_ready( + 
clusterpolicy_name, + timeout_seconds=operator_spec["wait_timeout_seconds"], + ) + + +def wait_for_gpu_clusterpolicy_ready( + clusterpolicy_name: str, *, timeout_seconds: int +) -> None: + def _clusterpolicy_ready() -> bool: + payload = llmd_runtime.oc_get_json( + "clusterpolicy", + name=clusterpolicy_name, + ) + state = payload.get("status", {}).get("state", "") + return state.lower() == "ready" + + llmd_runtime.wait_until( + f"clusterpolicy/{clusterpolicy_name} ready", + timeout_seconds=timeout_seconds, + interval_seconds=15, + predicate=_clusterpolicy_ready, + ) + + +def prepare_rhoai_operator(config: llmd_runtime.ResolvedConfig) -> None: + operator_spec = llmd_runtime.operator_spec_by_package( + config.platform, "rhods-operator" + ) + ensure_operator_subscription(operator_spec) + ensure_required_crds(config.platform["rhoai"]["required_crds_before_dsc"], config) + + +def ensure_required_crds( + crd_names: list[str], config: llmd_runtime.ResolvedConfig +) -> None: + for crd_name in crd_names: + llmd_runtime.wait_for_crd( + crd_name, + timeout_seconds=config.platform["rhoai"]["wait_timeout_seconds"], + ) + + +def apply_datasciencecluster(config: llmd_runtime.ResolvedConfig) -> None: + manifest = llmd_runtime.render_datasciencecluster(config) + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "datasciencecluster.yaml", manifest + ) + llmd_runtime.oc( + "get", + "datasciencecluster", + config.platform["rhoai"]["datasciencecluster_name"], + "-n", + config.platform["rhoai"]["namespace"], + "-o", + "yaml", + capture_output=True, + ) + + +def wait_for_datasciencecluster_ready(config: llmd_runtime.ResolvedConfig) -> None: + rhoai = config.platform["rhoai"] + + def _dsc_ready() -> bool: + payload = llmd_runtime.oc_get_json( + "datasciencecluster", + name=rhoai["datasciencecluster_name"], + namespace=rhoai["namespace"], + ) + phase = payload.get("status", {}).get("phase") + if phase == "Ready": + return True + if phase in {"Failed", "Error"}: + raise RuntimeError(f"DataScienceCluster entered terminal phase {phase}") + return False + + llmd_runtime.wait_until( + f"datasciencecluster/{rhoai['datasciencecluster_name']} ready", + timeout_seconds=rhoai["wait_timeout_seconds"], + interval_seconds=10, + predicate=_dsc_ready, + ) + + +def ensure_gateway(config: llmd_runtime.ResolvedConfig) -> None: + gateway = config.platform["gateway"] + if not llmd_runtime.resource_exists( + "gateway", gateway["name"], namespace=gateway["namespace"] + ): + if not gateway["create_if_missing"]: + raise RuntimeError( + f"Required gateway {gateway['name']} does not exist in {gateway['namespace']}" + ) + manifest = llmd_runtime.render_gateway(config) + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "gateway.yaml", manifest + ) + + def _gateway_programmed() -> bool: + resource = llmd_runtime.oc_get_json( + "gateway", + name=gateway["name"], + namespace=gateway["namespace"], + ) + return llmd_runtime.condition_status(resource, "Programmed") == "True" + + llmd_runtime.wait_until( + f"gateway/{gateway['name']} programmed", + timeout_seconds=gateway["wait_timeout_seconds"], + interval_seconds=10, + predicate=_gateway_programmed, + ) + + +def ensure_test_namespace(config: llmd_runtime.ResolvedConfig) -> None: + llmd_runtime.ensure_namespace( + config.namespace, + labels={ + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + }, + ) + + +def verify_gpu_nodes(config: llmd_runtime.ResolvedConfig) -> None: + selector = config.platform["cluster"]["gpu_node_label_selector"] + 
data = llmd_runtime.oc_get_json("nodes", selector=selector, ignore_not_found=True) + items = data.get("items", []) if data else [] + if not items: + raise RuntimeError( + f"No GPU nodes found with selector {selector}. The llm_d smoke path requires GPUs." + ) + + +def wait_for_nfd_gpu_labels( + config: llmd_runtime.ResolvedConfig, *, timeout_seconds: int +) -> None: + selectors = config.platform["cluster"]["nfd_gpu_detection_labels"] + + def _labels_present() -> bool: + for selector in selectors: + data = llmd_runtime.oc_get_json( + "nodes", selector=selector, ignore_not_found=True + ) + if data and data.get("items"): + return True + return False + + llmd_runtime.wait_until( + "NFD GPU discovery labels on cluster nodes", + timeout_seconds=timeout_seconds, + interval_seconds=15, + predicate=_labels_present, + ) + + +def capture_prepare_state(config: llmd_runtime.ResolvedConfig) -> None: + artifacts_dir = config.artifact_dir / "artifacts" + rhoai = config.platform["rhoai"] + gateway = config.platform["gateway"] + + capture_resource_yaml( + "datasciencecluster", + rhoai["datasciencecluster_name"], + rhoai["namespace"], + artifacts_dir / "datasciencecluster.yaml", + ) + capture_resource_yaml( + "gateway", + gateway["name"], + gateway["namespace"], + artifacts_dir / "gateway.yaml", + ) + gateway_service = llmd_runtime.oc( + "get", + "service", + "-A", + "-l", + f"gateway.networking.k8s.io/gateway-name={gateway['name']}", + "-o", + "yaml", + check=False, + capture_output=True, + ) + if gateway_service.returncode == 0 and gateway_service.stdout: + llmd_runtime.write_text( + artifacts_dir / "gateway.service.yaml", gateway_service.stdout + ) + if config.platform["artifacts"]["capture_namespace_events"]: + capture_namespace_events( + config.namespace, artifacts_dir / "namespace.events.txt" + ) + + +def capture_resource_yaml( + kind: str, + name: str, + namespace: str, + destination: Path, + *, + check: bool = True, +) -> None: + result = llmd_runtime.oc( + "get", + kind, + name, + "-n", + namespace, + "-o", + "yaml", + check=check, + capture_output=True, + ) + if result.returncode == 0 and result.stdout: + llmd_runtime.write_text(destination, result.stdout) + + +def capture_namespace_events(namespace: str, destination: Path) -> None: + result = llmd_runtime.oc( + "get", + "events", + "-n", + namespace, + "--sort-by=.metadata.creationTimestamp", + check=False, + capture_output=True, + ) + if result.returncode == 0 and result.stdout: + llmd_runtime.write_text(destination, result.stdout) + + +main = toolbox.create_toolbox_main(run) + + +if __name__ == "__main__": + main() diff --git a/projects/llm_d/toolbox/test/main.py b/projects/llm_d/toolbox/test/main.py new file mode 100644 index 00000000..d779c18c --- /dev/null +++ b/projects/llm_d/toolbox/test/main.py @@ -0,0 +1,492 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import json +import logging +import time +from pathlib import Path + +from projects.core.dsl import toolbox +from projects.llm_d.orchestration import llmd_runtime + +LOGGER = logging.getLogger(__name__) + + +def run() -> int: + llmd_runtime.init() + config = llmd_runtime.load_run_configuration() + return run_test(config) + + +def run_test(config: llmd_runtime.ResolvedConfig) -> int: + name = config.platform["inference_service"]["name"] + namespace = config.namespace + artifacts_dir = config.artifact_dir / "artifacts" + + LOGGER.info("Testing llm_d preset=%s namespace=%s", config.preset_name, namespace) + + endpoint_url = None + try: + endpoint_url = 
deploy_inference_service(config) + smoke_response = run_smoke_request(config, endpoint_url) + llmd_runtime.write_json(artifacts_dir / "smoke.response.json", smoke_response) + + if config.benchmark: + run_guidellm_benchmark(config, endpoint_url) + + return 0 + finally: + capture_inference_service_state(config) + if endpoint_url: + llmd_runtime.write_text(artifacts_dir / "endpoint.url", f"{endpoint_url}\n") + benchmark_name = ( + config.benchmark["job_name"] if config.benchmark else "guidellm-benchmark" + ) + llmd_runtime.oc( + "delete", + "job,pvc", + benchmark_name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "pod", + f"{benchmark_name}-copy", + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + events = llmd_runtime.oc( + "get", + "events", + "-n", + namespace, + "--sort-by=.metadata.creationTimestamp", + check=False, + capture_output=True, + ) + if events.returncode == 0 and events.stdout: + llmd_runtime.write_text( + artifacts_dir / "namespace.events.txt", events.stdout + ) + + +def deploy_inference_service(config: llmd_runtime.ResolvedConfig) -> str: + name = config.platform["inference_service"]["name"] + namespace = config.namespace + selector = f"app.kubernetes.io/name={name}" + + llmd_runtime.oc( + "delete", + "llminferenceservice", + name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + + def _old_pods_gone() -> bool: + pods = llmd_runtime.oc_get_json( + "pods", namespace=namespace, selector=selector, ignore_not_found=True + ) + return not pods or not pods.get("items") + + llmd_runtime.wait_until( + f"old llm-d pods to disappear in {namespace}", + timeout_seconds=config.platform["inference_service"]["delete_timeout_seconds"], + interval_seconds=10, + predicate=_old_pods_gone, + ) + + manifest = llmd_runtime.render_inference_service(config) + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "llminferenceservice.yaml", manifest + ) + + def _pods_present() -> bool: + pods = llmd_runtime.oc_get_json( + "pods", namespace=namespace, selector=selector, ignore_not_found=True + ) + return bool(pods and pods.get("items")) + + llmd_runtime.wait_until( + f"llm-d pods to appear in {namespace}", + timeout_seconds=config.platform["inference_service"][ + "pod_appearance_timeout_seconds" + ], + interval_seconds=5, + predicate=_pods_present, + ) + + def _service_ready() -> bool: + payload = llmd_runtime.oc_get_json( + "llminferenceservice", name=name, namespace=namespace + ) + return llmd_runtime.condition_status(payload, "Ready") == "True" + + llmd_runtime.wait_until( + f"llminferenceservice/{name} ready", + timeout_seconds=config.platform["inference_service"]["ready_timeout_seconds"], + interval_seconds=10, + predicate=_service_ready, + ) + + return llmd_runtime.wait_until( + f"gateway address for llminferenceservice/{name}", + timeout_seconds=config.platform["inference_service"]["ready_timeout_seconds"], + interval_seconds=10, + predicate=lambda: try_resolve_endpoint_url(config), + ) + + +def resolve_endpoint_url(config: llmd_runtime.ResolvedConfig) -> str: + endpoint_url = try_resolve_endpoint_url(config) + if endpoint_url: + return endpoint_url + + name = config.platform["inference_service"]["name"] + gateway_name = config.platform["gateway"]["status_address_name"] + raise RuntimeError( + f"Gateway address {gateway_name} is missing from llminferenceservice/{name} status.addresses" + ) + + +def try_resolve_endpoint_url(config: llmd_runtime.ResolvedConfig) -> str | None: + name = 
config.platform["inference_service"]["name"] + namespace = config.namespace + gateway_name = config.platform["gateway"]["status_address_name"] + payload = llmd_runtime.oc_get_json( + "llminferenceservice", name=name, namespace=namespace + ) + + for address in payload.get("status", {}).get("addresses", []): + if address.get("name") == gateway_name and address.get("url"): + return address["url"] + return None + + +def run_smoke_request( + config: llmd_runtime.ResolvedConfig, endpoint_url: str +) -> dict[str, object]: + namespace = config.namespace + name = config.platform["inference_service"]["name"] + deployment_name = f"{name}{config.platform['inference_service']['workload_deployment_name_suffix']}" + + payload = { + "model": config.model["served_model_name"], + "prompt": config.smoke_request["prompt"], + "max_tokens": config.smoke_request["max_tokens"], + "temperature": config.smoke_request["temperature"], + } + llmd_runtime.write_json( + config.artifact_dir / "artifacts" / "smoke.request.json", payload + ) + + retries = config.platform["smoke"]["request_retries"] + delay = config.platform["smoke"]["request_retry_delay_seconds"] + result = None + for _ in range(retries): + result = llmd_runtime.oc( + "exec", + "-n", + namespace, + f"deployment/{deployment_name}", + "-c", + "main", + "--", + "curl", + "-k", + "-sSf", + f"{endpoint_url}{config.platform['smoke']['endpoint_path']}", + "-H", + "Content-Type: application/json", + "-d", + json.dumps(payload), + check=False, + capture_output=True, + ) + if result.returncode == 0: + break + time.sleep(delay) + + if result is None or result.returncode != 0: + raise RuntimeError("Smoke request never succeeded against the llm_d endpoint") + + response = json.loads(result.stdout) + if not response.get("choices"): + raise RuntimeError(f"Invalid smoke response payload: {result.stdout}") + return response + + +def run_guidellm_benchmark( + config: llmd_runtime.ResolvedConfig, endpoint_url: str +) -> None: + benchmark_name = config.benchmark["job_name"] + namespace = config.namespace + + llmd_runtime.oc( + "delete", + "job,pvc", + benchmark_name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "pod", + f"{benchmark_name}-copy", + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "guidellm-pvc.yaml", + llmd_runtime.render_guidellm_pvc(config), + ) + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "guidellm-job.yaml", + llmd_runtime.render_guidellm_job(config, endpoint_url), + ) + + def _job_terminal() -> dict[str, object] | None: + payload = llmd_runtime.oc_get_json( + "job", name=benchmark_name, namespace=namespace + ) + status = payload.get("status", {}) + if status.get("succeeded"): + return payload + if status.get("failed"): + raise RuntimeError(f"GuideLLM job {benchmark_name} failed") + return None + + llmd_runtime.wait_until( + f"GuideLLM job/{benchmark_name}", + timeout_seconds=config.benchmark["timeout_seconds"], + interval_seconds=10, + predicate=_job_terminal, + ) + + capture_guidellm_state(config) + copy_guidellm_results(config) + + +def copy_guidellm_results(config: llmd_runtime.ResolvedConfig) -> None: + benchmark_name = config.benchmark["job_name"] + namespace = config.namespace + pod_data = llmd_runtime.oc_get_json( + "pods", + namespace=namespace, + selector=f"job-name={benchmark_name}", + ignore_not_found=True, + ) + node_name = None + if pod_data and pod_data.get("items"): + node_name = 
pod_data["items"][0].get("spec", {}).get("nodeName") + + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "guidellm-copy-pod.yaml", + llmd_runtime.render_guidellm_copy_pod(config, node_name=node_name), + ) + + def _helper_ready() -> bool: + payload = llmd_runtime.oc_get_json( + "pod", + name=f"{benchmark_name}-copy", + namespace=namespace, + ) + conditions = payload.get("status", {}).get("conditions", []) + return any( + condition.get("type") == "Ready" and condition.get("status") == "True" + for condition in conditions + ) + + llmd_runtime.wait_until( + f"GuideLLM copy helper pod/{benchmark_name}-copy", + timeout_seconds=120, + interval_seconds=5, + predicate=_helper_ready, + ) + + result = llmd_runtime.oc( + "exec", + "-n", + namespace, + f"{benchmark_name}-copy", + "--", + "cat", + "/results/benchmarks.json", + check=False, + capture_output=True, + ) + if result.returncode == 0 and result.stdout: + llmd_runtime.write_text( + config.artifact_dir / "artifacts" / "results" / "benchmarks.json", + result.stdout, + ) + + +def capture_inference_service_state(config: llmd_runtime.ResolvedConfig) -> None: + name = config.platform["inference_service"]["name"] + namespace = config.namespace + artifacts_dir = config.artifact_dir / "artifacts" + selector = f"app.kubernetes.io/name={name}" + + capture_get( + "llminferenceservice", + name, + namespace, + "yaml", + artifacts_dir / "llminferenceservice.yaml", + ) + capture_get( + "llminferenceservice", + name, + namespace, + "json", + artifacts_dir / "llminferenceservice.json", + ) + capture_get( + "pods", + None, + namespace, + "yaml", + artifacts_dir / "llminferenceservice.pods.yaml", + selector=selector, + ) + capture_get( + "deployments", + None, + namespace, + "yaml", + artifacts_dir / "llminferenceservice.deployments.yaml", + selector=selector, + ) + capture_get( + "replicasets", + None, + namespace, + "yaml", + artifacts_dir / "llminferenceservice.replicasets.yaml", + selector=selector, + ) + capture_get( + "pods", None, namespace, "wide", artifacts_dir / "namespace.pods.status" + ) + capture_get( + "services", None, namespace, "wide", artifacts_dir / "namespace.services.status" + ) + + pod_list = llmd_runtime.oc_get_json( + "pods", namespace=namespace, selector=selector, ignore_not_found=True + ) + if pod_list: + lines = [] + previous_lines = [] + for pod in pod_list.get("items", []): + pod_name = pod["metadata"]["name"] + lines.append(f"=== {pod_name} ===") + log_result = llmd_runtime.oc( + "logs", + pod_name, + "-n", + namespace, + "--all-containers=true", + check=False, + capture_output=True, + ) + if log_result.stdout: + lines.append(log_result.stdout.rstrip()) + + previous_lines.append(f"=== {pod_name} ===") + previous_result = llmd_runtime.oc( + "logs", + pod_name, + "-n", + namespace, + "--previous", + "--all-containers=true", + check=False, + capture_output=True, + ) + if previous_result.stdout: + previous_lines.append(previous_result.stdout.rstrip()) + + llmd_runtime.write_text( + artifacts_dir / "llminferenceservice.pods.logs", "\n".join(lines) + "\n" + ) + llmd_runtime.write_text( + artifacts_dir / "llminferenceservice.pods.previous.logs", + "\n".join(previous_lines) + "\n", + ) + + +def capture_guidellm_state(config: llmd_runtime.ResolvedConfig) -> None: + benchmark_name = config.benchmark["job_name"] + namespace = config.namespace + artifacts_dir = config.artifact_dir / "artifacts" + + capture_get( + "job", + benchmark_name, + namespace, + "yaml", + artifacts_dir / "guidellm_benchmark_job.yaml", + ) + capture_get( + 
"pods", + None, + namespace, + "yaml", + artifacts_dir / "guidellm_benchmark_job.pods.yaml", + selector=f"job-name={benchmark_name}", + ) + result = llmd_runtime.oc( + "logs", + f"job/{benchmark_name}", + "-n", + namespace, + check=False, + capture_output=True, + ) + if result.returncode == 0 and result.stdout: + llmd_runtime.write_text( + artifacts_dir / "guidellm_benchmark_job.logs", result.stdout + ) + + +def capture_get( + kind: str, + name: str | None, + namespace: str, + output: str, + destination: Path, + *, + selector: str | None = None, +) -> None: + args = ["get", kind] + if name: + args.append(name) + args.extend(["-n", namespace]) + if selector: + args.extend(["-l", selector]) + args.extend(["-o", output]) + result = llmd_runtime.oc(*args, check=False, capture_output=True) + if result.returncode == 0 and result.stdout: + llmd_runtime.write_text(destination, result.stdout) + + +main = toolbox.create_toolbox_main(run) + + +if __name__ == "__main__": + main() diff --git a/tests/llm_d/test_runtime.py b/tests/llm_d/test_runtime.py index 4557de00..b2bcff1d 100644 --- a/tests/llm_d/test_runtime.py +++ b/tests/llm_d/test_runtime.py @@ -5,8 +5,8 @@ import pytest from projects.llm_d.orchestration import llmd_runtime -from projects.llm_d.orchestration import prepare_llmd -from projects.llm_d.orchestration import test_llmd +from projects.llm_d.toolbox.prepare import main as prepare_toolbox +from projects.llm_d.toolbox.test import main as test_toolbox def test_derive_namespace_uses_prefix_once() -> None: @@ -16,10 +16,12 @@ def test_derive_namespace_uses_prefix_once() -> None: def test_parse_overrides_rejects_unknown_keys() -> None: with pytest.raises(ValueError, match="Unsupported llm_d override keys"): - llmd_runtime.parse_overrides('{"model":"other"}') + llmd_runtime.parse_overrides('{"model":"other"}', allowed_keys=("namespace",)) -def test_load_run_configuration_resolves_alias(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: +def test_load_run_configuration_resolves_alias( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() @@ -30,7 +32,9 @@ def test_load_run_configuration_resolves_alias(tmp_path: Path, monkeypatch: pyte encoding="utf-8", ) - config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + config = llmd_runtime.load_run_configuration( + cwd=tmp_path, artifact_dir=artifact_dir + ) assert config.preset_name == "smoke" assert config.preset_alias == "cks" @@ -39,23 +43,48 @@ def test_load_run_configuration_resolves_alias(tmp_path: Path, monkeypatch: pyte assert config.namespace_is_managed is True -def test_namespace_override_is_not_managed(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: +def test_load_run_configuration_consolidates_config_d( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + + llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + consolidated = llmd_runtime.load_yaml(artifact_dir / "config.yaml") + + assert "platform" in consolidated + assert "models" in consolidated + assert "runtime" in consolidated + assert "workloads" in consolidated + assert consolidated["runtime"]["default_preset"] == "smoke" + + +def test_namespace_override_is_not_managed( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: 
monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", '{"namespace":"custom-ns"}') artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() - config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + config = llmd_runtime.load_run_configuration( + cwd=tmp_path, artifact_dir=artifact_dir + ) assert config.namespace == "custom-ns" assert config.namespace_is_managed is False -def test_render_inference_service_injects_model_and_epp(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: +def test_render_inference_service_injects_model_and_epp( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() - config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + config = llmd_runtime.load_run_configuration( + cwd=tmp_path, artifact_dir=artifact_dir + ) manifest = llmd_runtime.render_inference_service(config) assert manifest["metadata"]["name"] == "llm-d" @@ -64,12 +93,16 @@ def test_render_inference_service_injects_model_and_epp(tmp_path: Path, monkeypa assert manifest["spec"]["model"]["uri"] == "hf://Qwen/Qwen3-0.6B" assert manifest["spec"]["model"]["name"] == config.model["served_model_name"] assert manifest["spec"]["model"]["uri"] == config.model["uri"] - router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0]["args"] + router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0][ + "args" + ] assert router_args[-2] == "--config-text" assert "EndpointPickerConfig" in router_args[-1] -def test_render_guidellm_job_uses_target_and_rate(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: +def test_render_guidellm_job_uses_target_and_rate( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() @@ -78,7 +111,9 @@ def test_render_guidellm_job_uses_target_and_rate(tmp_path: Path, monkeypatch: p encoding="utf-8", ) - config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + config = llmd_runtime.load_run_configuration( + cwd=tmp_path, artifact_dir=artifact_dir + ) manifest = llmd_runtime.render_guidellm_job(config, "https://example.test") container = manifest["spec"]["template"]["spec"]["containers"][0] @@ -93,12 +128,14 @@ def test_prepare_gpu_operator_skips_existing_clusterpolicy( monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() - config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + config = llmd_runtime.load_run_configuration( + cwd=tmp_path, artifact_dir=artifact_dir + ) calls: list[str] = [] monkeypatch.setattr( - prepare_llmd, + prepare_toolbox, "ensure_operator_subscription", lambda operator_spec: calls.append(f"subscription:{operator_spec['package']}"), ) @@ -129,9 +166,12 @@ def fail_apply(*_: object, **__: object) -> None: lambda kind, name: {"status": {"state": "ready"}}, ) - prepare_llmd.prepare_gpu_operator(config) + prepare_toolbox.prepare_gpu_operator(config) - assert calls == ["subscription:gpu-operator-certified", "crd:clusterpolicies.nvidia.com"] + assert calls == [ + "subscription:gpu-operator-certified", + "crd:clusterpolicies.nvidia.com", + ] def test_prepare_gpu_operator_bootstraps_missing_clusterpolicy( @@ -140,7 +180,9 @@ def test_prepare_gpu_operator_bootstraps_missing_clusterpolicy( 
monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() - config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + config = llmd_runtime.load_run_configuration( + cwd=tmp_path, artifact_dir=artifact_dir + ) applied: list[Path] = [] manifest = { @@ -150,9 +192,11 @@ def test_prepare_gpu_operator_bootstraps_missing_clusterpolicy( "spec": {}, } - monkeypatch.setattr(prepare_llmd, "ensure_operator_subscription", lambda _: None) + monkeypatch.setattr(prepare_toolbox, "ensure_operator_subscription", lambda _: None) monkeypatch.setattr(llmd_runtime, "wait_for_crd", lambda *_, **__: None) - monkeypatch.setattr(llmd_runtime, "load_manifest_template", lambda _config, _path: manifest) + monkeypatch.setattr( + llmd_runtime, "load_manifest_template", lambda _config, _path: manifest + ) monkeypatch.setattr(llmd_runtime, "resource_exists", lambda kind, name: False) monkeypatch.setattr( llmd_runtime, @@ -165,7 +209,7 @@ def test_prepare_gpu_operator_bootstraps_missing_clusterpolicy( lambda kind, name: {"status": {"state": "ready"}}, ) - prepare_llmd.prepare_gpu_operator(config) + prepare_toolbox.prepare_gpu_operator(config) assert applied == [artifact_dir / "src" / "gpu-clusterpolicy.yaml"] @@ -196,7 +240,9 @@ def test_resolve_endpoint_url_requires_gateway_address( monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() - config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + config = llmd_runtime.load_run_configuration( + cwd=tmp_path, artifact_dir=artifact_dir + ) def fake_oc_get_json(kind: str, **_: object) -> dict[str, object]: assert kind == "llminferenceservice" @@ -205,4 +251,4 @@ def fake_oc_get_json(kind: str, **_: object) -> dict[str, object]: monkeypatch.setattr(llmd_runtime, "oc_get_json", fake_oc_get_json) with pytest.raises(RuntimeError, match="Gateway address"): - test_llmd.resolve_endpoint_url(config) + test_toolbox.resolve_endpoint_url(config) From 1c707d6223449e496d9ac8bf54f95f1e9ee7804f Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Thu, 16 Apr 2026 08:30:42 +0100 Subject: [PATCH 03/21] feat: Add llm_d model cache --- .../orchestration/config.d/model_cache.yaml | 25 + .../llm_d/orchestration/config.d/models.yaml | 7 + .../orchestration/config.d/platform.yaml | 1 + projects/llm_d/orchestration/llmd_runtime.py | 452 +++++++++++++++++- projects/llm_d/toolbox/cleanup/main.py | 105 +++- projects/llm_d/toolbox/prepare/main.py | 4 + .../llm_d/toolbox/prepare_model_cache/main.py | 207 ++++++++ tests/llm_d/test_runtime.py | 185 ++++++- 8 files changed, 951 insertions(+), 35 deletions(-) create mode 100644 projects/llm_d/orchestration/config.d/model_cache.yaml create mode 100644 projects/llm_d/toolbox/prepare_model_cache/main.py diff --git a/projects/llm_d/orchestration/config.d/model_cache.yaml b/projects/llm_d/orchestration/config.d/model_cache.yaml new file mode 100644 index 00000000..eae01772 --- /dev/null +++ b/projects/llm_d/orchestration/config.d/model_cache.yaml @@ -0,0 +1,25 @@ +enabled: true +marker_filename: .forge-model-cache.json + +pvc: + name_prefix: llm-d-model + size: 15Gi + access_mode: ReadWriteOnce + storage_class_name: null + model_directory_name: model + +download: + wait_timeout_seconds: 7200 + poll_interval_seconds: 15 + pod_image_pull_policy: IfNotPresent + +hf: + downloader_image: registry.access.redhat.com/ubi9/python-311 + token_secret_name: null + token_secret_key: token + +oci: + 
extractor_image: registry.redhat.io/openshift4/ose-cli:v4.19 + registry_auth_secret_name: null + registry_auth_secret_key: .dockerconfigjson + image_path: / diff --git a/projects/llm_d/orchestration/config.d/models.yaml b/projects/llm_d/orchestration/config.d/models.yaml index fd204db4..4334cf4a 100644 --- a/projects/llm_d/orchestration/config.d/models.yaml +++ b/projects/llm_d/orchestration/config.d/models.yaml @@ -1,6 +1,9 @@ qwen3-0-6b: served_model_name: Qwen/Qwen3-0.6B uri: hf://Qwen/Qwen3-0.6B + cache: + pvc_size: 10Gi + access_mode: ReadWriteOnce resources: requests: cpu: "4" @@ -14,6 +17,10 @@ qwen3-0-6b: llama-3-1-8b-instruct-fp8: served_model_name: llama-3-1-8b-instruct-fp8 uri: oci://registry.redhat.io/rhelai1/modelcar-llama-3-1-8b-instruct-fp8-dynamic:1.5 + cache: + pvc_size: 40Gi + access_mode: ReadWriteOnce + oci_image_path: / resources: requests: cpu: "4" diff --git a/projects/llm_d/orchestration/config.d/platform.yaml b/projects/llm_d/orchestration/config.d/platform.yaml index c5e35ea4..9f3b9e0e 100644 --- a/projects/llm_d/orchestration/config.d/platform.yaml +++ b/projects/llm_d/orchestration/config.d/platform.yaml @@ -1,5 +1,6 @@ cluster: minimum_openshift_version: "4.19.9" + namespace_name: forge-llm-d namespace_prefix: llm-d namespace_max_length: 63 cleanup_timeout_seconds: 900 diff --git a/projects/llm_d/orchestration/llmd_runtime.py b/projects/llm_d/orchestration/llmd_runtime.py index 8b507bd2..2c961e7c 100644 --- a/projects/llm_d/orchestration/llmd_runtime.py +++ b/projects/llm_d/orchestration/llmd_runtime.py @@ -1,6 +1,7 @@ from __future__ import annotations import copy +import hashlib import json import logging import os @@ -37,7 +38,9 @@ class ResolvedConfig: namespace_is_managed: bool gpu_count: int | None platform: dict[str, Any] + model_key: str model: dict[str, Any] + model_cache: dict[str, Any] smoke_request: dict[str, Any] benchmark: dict[str, Any] | None fournos_config: dict[str, Any] @@ -48,6 +51,31 @@ def manifests_dir(self) -> Path: return self.config_dir / "manifests" +@dataclass(frozen=True) +class ModelCacheSpec: + source_uri: str + source_scheme: str + cache_key: str + namespace: str + pvc_name: str + pvc_size: str + access_mode: str + storage_class_name: str | None + model_path: str + model_uri: str + marker_filename: str + download_job_name: str + hf_token_secret_name: str | None + hf_token_secret_key: str | None + oci_image_path: str | None + oci_registry_auth_secret_name: str | None + oci_registry_auth_secret_key: str | None + + @property + def marker_path(self) -> str: + return f"/cache/{self.model_path}/{self.marker_filename}" + + def init() -> Path: if not logging.getLogger().handlers: logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") @@ -73,6 +101,7 @@ def load_run_configuration( _reinitialize_project_config() platform_data = copy.deepcopy(config.project.get_config("platform")) + model_cache = copy.deepcopy(config.project.get_config("model_cache")) fournos_config = load_fournos_config(cwd) overrides = parse_overrides( os.environ.get("FORGE_CONFIG_OVERRIDES", ""), @@ -109,10 +138,15 @@ def load_run_configuration( job_name = f"local-{preset_name}" namespace_override = overrides.get("namespace") or fournos_config.get("namespace") - namespace = namespace_override or derive_namespace( - job_name, - platform_data["cluster"]["namespace_prefix"], - platform_data["cluster"]["namespace_max_length"], + default_namespace = platform_data["cluster"].get("namespace_name") + namespace = ( + namespace_override + or default_namespace 
+ or derive_namespace( + job_name, + platform_data["cluster"]["namespace_prefix"], + platform_data["cluster"]["namespace_max_length"], + ) ) gpu_count = normalize_gpu_count(fournos_config.get("gpu-count")) @@ -125,10 +159,12 @@ def load_run_configuration( preset_alias=preset_alias, job_name=job_name, namespace=namespace, - namespace_is_managed=namespace_override is None, + namespace_is_managed=namespace_override is None and default_namespace is None, gpu_count=gpu_count, platform=platform_data, + model_key=model_name, model=model, + model_cache=model_cache, smoke_request=smoke_request, benchmark=benchmark, fournos_config=fournos_config, @@ -223,6 +259,76 @@ def derive_namespace(job_name: str, prefix: str, max_length: int) -> str: return namespace +def slugify_identifier(value: str, *, max_length: int = 63) -> str: + slug = re.sub(r"[^a-z0-9-]+", "-", value.lower()) + slug = re.sub(r"-{2,}", "-", slug).strip("-") + return slug[:max_length].rstrip("-") or "item" + + +def truncate_k8s_name(value: str, *, max_length: int = 63) -> str: + return value[:max_length].rstrip("-") + + +def resolve_model_cache(config: ResolvedConfig) -> ModelCacheSpec | None: + if not config.model_cache.get("enabled", False): + return None + + source_uri = config.model["uri"] + if source_uri.startswith(("pvc://", "pvc+hf://")): + return None + + if source_uri.startswith("hf://"): + source_scheme = "hf" + elif source_uri.startswith("oci://"): + source_scheme = "oci" + else: + raise ValueError( + f"Unsupported model cache source URI for {config.model_key}: {source_uri}" + ) + + model_cache_overrides = config.model.get("cache", {}) + pvc_defaults = config.model_cache["pvc"] + pvc_prefix = config.model_cache["pvc"]["name_prefix"] + cache_key = hashlib.sha256(source_uri.encode("utf-8")).hexdigest()[:10] + pvc_name = truncate_k8s_name( + f"{pvc_prefix}-{slugify_identifier(config.model_key, max_length=32)}-{cache_key}" + ) + model_path = pvc_defaults["model_directory_name"] + + return ModelCacheSpec( + source_uri=source_uri, + source_scheme=source_scheme, + cache_key=cache_key, + namespace=config.namespace, + pvc_name=pvc_name, + pvc_size=model_cache_overrides.get("pvc_size", pvc_defaults["size"]), + access_mode=model_cache_overrides.get( + "access_mode", pvc_defaults["access_mode"] + ), + storage_class_name=model_cache_overrides.get( + "storage_class_name", pvc_defaults.get("storage_class_name") + ), + model_path=model_path, + model_uri=f"pvc://{pvc_name}/{model_path}", + marker_filename=config.model_cache["marker_filename"], + download_job_name=truncate_k8s_name(f"{pvc_name}-download"), + hf_token_secret_name=model_cache_overrides.get( + "hf_token_secret_name", config.model_cache["hf"].get("token_secret_name") + ), + hf_token_secret_key=config.model_cache["hf"].get("token_secret_key"), + oci_image_path=model_cache_overrides.get( + "oci_image_path", config.model_cache["oci"].get("image_path") + ), + oci_registry_auth_secret_name=model_cache_overrides.get( + "oci_registry_auth_secret_name", + config.model_cache["oci"].get("registry_auth_secret_name"), + ), + oci_registry_auth_secret_key=config.model_cache["oci"].get( + "registry_auth_secret_key" + ), + ) + + def load_yaml(path: Path) -> Any: with path.open(encoding="utf-8") as handle: return yaml.safe_load(handle) @@ -524,6 +630,93 @@ def condition_status(resource: dict[str, Any], condition_type: str) -> str | Non return None +def pvc_access_mode_matches(actual_modes: list[str], expected_mode: str) -> bool: + return expected_mode in actual_modes + + +def wait_for_pvc_bound( 
+ pvc_name: str, namespace: str, *, timeout_seconds: int +) -> dict[str, Any]: + def _pvc_bound() -> dict[str, Any] | None: + payload = oc_get_json( + "persistentvolumeclaim", + name=pvc_name, + namespace=namespace, + ignore_not_found=True, + ) + if not payload: + return None + if payload.get("status", {}).get("phase") == "Bound": + return payload + return None + + return wait_until( + f"persistentvolumeclaim/{pvc_name} bound in {namespace}", + timeout_seconds=timeout_seconds, + interval_seconds=5, + predicate=_pvc_bound, + ) + + +def wait_for_job_completion( + job_name: str, namespace: str, *, timeout_seconds: int, interval_seconds: int = 10 +) -> dict[str, Any]: + def _job_completed() -> dict[str, Any] | None: + payload = oc_get_json( + "job", + name=job_name, + namespace=namespace, + ignore_not_found=True, + ) + if not payload: + return None + status = payload.get("status", {}) + if status.get("succeeded", 0): + return payload + failed_count = status.get("failed", 0) + for condition in status.get("conditions", []): + if condition.get("type") == "Failed" and condition.get("status") == "True": + raise RuntimeError( + f"job/{job_name} failed: {condition.get('reason') or 'unknown reason'}" + ) + if failed_count: + raise RuntimeError(f"job/{job_name} failed after {failed_count} attempt(s)") + return None + + return wait_until( + f"job/{job_name} completion in {namespace}", + timeout_seconds=timeout_seconds, + interval_seconds=interval_seconds, + predicate=_job_completed, + ) + + +def job_pod_names(job_name: str, namespace: str) -> list[str]: + payload = oc_get_json( + "pods", + namespace=namespace, + selector=f"job-name={job_name}", + ignore_not_found=True, + ) + if not payload: + return [] + return [item["metadata"]["name"] for item in payload.get("items", [])] + + +def resolve_default_serviceaccount_image_pull_secret(namespace: str) -> str | None: + payload = oc_get_json( + "serviceaccount", name="default", namespace=namespace, ignore_not_found=True + ) + if not payload: + return None + + for item in payload.get("imagePullSecrets", []): + name = item.get("name") + if name: + return name + return None + + def render_datasciencecluster(config: ResolvedConfig) -> dict[str, Any]: template_path = ( config.config_dir / config.platform["rhoai"]["datasciencecluster_template"] @@ -545,6 +738,230 @@ def render_gateway(config: ResolvedConfig) -> dict[str, Any]: return manifest +def render_model_cache_pvc(spec: ModelCacheSpec) -> dict[str, Any]: + manifest: dict[str, Any] = { + "apiVersion": "v1", + "kind": "PersistentVolumeClaim", + "metadata": { + "name": spec.pvc_name, + "namespace": spec.namespace, + "labels": { + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + "forge.openshift.io/model-cache": "true", + "forge.openshift.io/preserve": "true", + }, + "annotations": { + "forge.openshift.io/model-cache-key": spec.cache_key, + "forge.openshift.io/model-source-uri": spec.source_uri, + }, + }, + "spec": { + "accessModes": [spec.access_mode], + "resources": {"requests": {"storage": spec.pvc_size}}, + }, + } + if spec.storage_class_name: + manifest["spec"]["storageClassName"] = spec.storage_class_name + return manifest + + +def render_model_cache_job( + config: ResolvedConfig, spec: ModelCacheSpec +) -> dict[str, Any]: + common_env = [ + {"name": "MODEL_SOURCE", "value": spec.source_uri}, + {"name": "MODEL_TARGET_DIR", "value": f"/cache/{spec.model_path}"}, + {"name": "MARKER_FILE", "value": spec.marker_path}, + {"name": "CACHE_KEY", "value": spec.cache_key}, + ] + 
volumes: list[dict[str, Any]] = [
+        {"name": "cache", "persistentVolumeClaim": {"claimName": spec.pvc_name}}
+    ]
+
+    container: dict[str, Any]
+    if spec.source_scheme == "hf":
+        command = """
+set -euo pipefail
+mkdir -p "${MODEL_TARGET_DIR}"
+rm -rf "${MODEL_TARGET_DIR}"/*
+python -m pip install --quiet --no-cache-dir 'huggingface_hub[hf_xet]'
+python - <<'PY'
+import os
+from huggingface_hub import snapshot_download
+
+token = None
+token_file = os.environ.get("HF_TOKEN_FILE")
+if token_file and os.path.exists(token_file):
+    with open(token_file, encoding="utf-8") as handle:
+        token = handle.read().strip() or None
+
+snapshot_download(
+    repo_id=os.environ["MODEL_SOURCE"][5:],
+    local_dir=os.environ["MODEL_TARGET_DIR"],
+    local_dir_use_symlinks=False,
+    token=token,
+)
+PY
+cat > "${MARKER_FILE}" < "${MARKER_FILE}" <
+
+
+def model_cache_pvc_ready(spec: ModelCacheSpec) -> bool:
+    payload = oc_get_json(
+        "persistentvolumeclaim",
+        name=spec.pvc_name,
+        namespace=spec.namespace,
+        ignore_not_found=True,
+    )
+    if not payload:
+        return False
+
+    annotations = payload.get("metadata", {}).get("annotations", {})
+    return (
+        annotations.get("forge.openshift.io/model-cache-ready") == "true"
+        and annotations.get("forge.openshift.io/model-cache-key") == spec.cache_key
+        and annotations.get("forge.openshift.io/model-source-uri") == spec.source_uri
+    )
+
+
+def annotate_model_cache_pvc(spec: ModelCacheSpec) -> None:
+    oc(
+        "annotate",
+        "persistentvolumeclaim",
+        spec.pvc_name,
+        "-n",
+        spec.namespace,
+        "--overwrite",
+        "forge.openshift.io/model-cache-ready=true",
+        f"forge.openshift.io/model-cache-key={spec.cache_key}",
+        f"forge.openshift.io/model-source-uri={spec.source_uri}",
+        f"forge.openshift.io/model-uri={spec.model_uri}",
+    )
+
+
 def render_inference_service(config: ResolvedConfig) -> dict[str, Any]:
     template_path = config.config_dir / config.platform["inference_service"]["template"]
     manifest = load_yaml(template_path)
@@ -560,7 +977,10 @@ def render_inference_service(config: ResolvedConfig) -> dict[str, Any]:
         }
     )
 
-    manifest["spec"]["model"]["uri"] = config.model["uri"]
+    cache_spec = resolve_model_cache(config)
+    manifest["spec"]["model"]["uri"] = (
+        cache_spec.model_uri if cache_spec else config.model["uri"]
+    )
     manifest["spec"]["model"]["name"] = config.model["served_model_name"]
     manifest["spec"]["template"]["containers"][0]["resources"] = copy.deepcopy(
         config.model["resources"]
@@ -590,6 +1010,10 @@ def render_guidellm_pvc(config: ResolvedConfig) -> dict[str, Any]:
         "metadata": {
             "name": config.benchmark["job_name"],
             "namespace": config.namespace,
+            "labels": {
+                "app.kubernetes.io/managed-by": "forge",
+                "forge.openshift.io/project": "llm_d",
+            },
         },
         "spec": {
             "accessModes": ["ReadWriteOnce"],
@@ -620,10 +1044,20 @@ def render_guidellm_job(config: ResolvedConfig, endpoint_url: str) -> dict[str,
         "metadata": {
             "name": config.benchmark["job_name"],
             "namespace": config.namespace,
+            "labels": {
+                "app.kubernetes.io/managed-by": "forge",
+                "forge.openshift.io/project": "llm_d",
+            },
         },
         "spec": {
             "backoffLimit": 0,
             "template": {
+                "metadata": {
+                    "labels": {
+                        "app.kubernetes.io/managed-by": "forge",
+                        "forge.openshift.io/project": "llm_d",
+                    }
+                },
                 "spec": {
                     "serviceAccountName": "default",
                     "restartPolicy": "Never",
@@ -649,7 +1083,7 @@ def render_guidellm_job(config: ResolvedConfig, endpoint_url: str) -> dict[str,
                             },
                         },
                     ],
-                }
+                },
             },
         },
     }
@@ -667,6 +1101,10 @@ def render_guidellm_copy_pod(
         "metadata": {
             "name": f"{config.benchmark['job_name']}-copy",
             "namespace": config.namespace,
+            "labels": {
+                "app.kubernetes.io/managed-by": "forge",
+ "forge.openshift.io/project": "llm_d", + }, }, "spec": { "restartPolicy": "Never", diff --git a/projects/llm_d/toolbox/cleanup/main.py b/projects/llm_d/toolbox/cleanup/main.py index 46d0aedf..d80726ef 100644 --- a/projects/llm_d/toolbox/cleanup/main.py +++ b/projects/llm_d/toolbox/cleanup/main.py @@ -13,36 +13,38 @@ def run() -> int: def run_cleanup(config: llmd_runtime.ResolvedConfig) -> int: + delete_run_leftovers(config) + return 0 + + +def delete_run_leftovers(config: llmd_runtime.ResolvedConfig) -> None: + if not llmd_runtime.resource_exists("namespace", config.namespace): + return + inference_service_name = config.platform["inference_service"]["name"] - benchmark_name = ( - config.benchmark["job_name"] if config.benchmark else "guidellm-benchmark" + namespace = config.namespace + cleanup_timeout_seconds = config.platform["cluster"]["cleanup_timeout_seconds"] + benchmark_names = {"guidellm-benchmark"} + if config.benchmark: + benchmark_names.add(config.benchmark["job_name"]) + + llmd_runtime.oc( + "delete", + "llminferenceservice", + inference_service_name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, ) - if config.namespace_is_managed: - if llmd_runtime.resource_exists("namespace", config.namespace): - llmd_runtime.oc( - "delete", "namespace", config.namespace, "--ignore-not-found=true" - ) - llmd_runtime.wait_for_namespace_deleted( - config.namespace, - timeout_seconds=config.platform["cluster"]["cleanup_timeout_seconds"], - ) - else: - llmd_runtime.oc( - "delete", - "llminferenceservice", - inference_service_name, - "-n", - config.namespace, - "--ignore-not-found=true", - check=False, - ) + for benchmark_name in sorted(benchmark_names): llmd_runtime.oc( "delete", "job,pvc", benchmark_name, "-n", - config.namespace, + namespace, "--ignore-not-found=true", check=False, ) @@ -51,12 +53,67 @@ def run_cleanup(config: llmd_runtime.ResolvedConfig) -> int: "pod", f"{benchmark_name}-copy", "-n", - config.namespace, + namespace, "--ignore-not-found=true", check=False, ) - return 0 + llmd_runtime.oc( + "delete", + "job", + "-n", + namespace, + "-l", + "forge.openshift.io/project=llm_d", + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "pod", + "-n", + namespace, + "-l", + "forge.openshift.io/project=llm_d", + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "pvc", + "-n", + namespace, + "-l", + "forge.openshift.io/project=llm_d,forge.openshift.io/preserve!=true", + "--ignore-not-found=true", + check=False, + ) + + llmd_runtime.wait_until( + f"llminferenceservice/{inference_service_name} deletion in {namespace}", + timeout_seconds=cleanup_timeout_seconds, + interval_seconds=10, + predicate=lambda: not llmd_runtime.resource_exists( + "llminferenceservice", inference_service_name, namespace=namespace + ), + ) + + llmd_runtime.wait_until( + f"llm-d workload pods deletion in {namespace}", + timeout_seconds=cleanup_timeout_seconds, + interval_seconds=10, + predicate=lambda: _llm_d_pods_gone(namespace, inference_service_name), + ) + + +def _llm_d_pods_gone(namespace: str, inference_service_name: str) -> bool: + payload = llmd_runtime.oc_get_json( + "pods", + namespace=namespace, + selector=f"app.kubernetes.io/name={inference_service_name}", + ignore_not_found=True, + ) + return not payload or not payload.get("items") main = toolbox.create_toolbox_main(run) diff --git a/projects/llm_d/toolbox/prepare/main.py b/projects/llm_d/toolbox/prepare/main.py index 7edad1f8..3ebbaf67 100644 --- 
a/projects/llm_d/toolbox/prepare/main.py +++ b/projects/llm_d/toolbox/prepare/main.py @@ -8,6 +8,8 @@ from projects.core.dsl import toolbox from projects.llm_d.orchestration import llmd_runtime +from projects.llm_d.toolbox.cleanup import main as cleanup_toolbox +from projects.llm_d.toolbox.prepare_model_cache import main as prepare_model_cache LOGGER = logging.getLogger(__name__) @@ -35,6 +37,8 @@ def run_prepare(config: llmd_runtime.ResolvedConfig) -> int: ensure_required_crds(config.platform["rhoai"]["required_crds_after_dsc"], config) ensure_gateway(config) ensure_test_namespace(config) + cleanup_toolbox.delete_run_leftovers(config) + prepare_model_cache.run_prepare_model_cache(config) verify_gpu_nodes(config) capture_prepare_state(config) diff --git a/projects/llm_d/toolbox/prepare_model_cache/main.py b/projects/llm_d/toolbox/prepare_model_cache/main.py new file mode 100644 index 00000000..143ae77a --- /dev/null +++ b/projects/llm_d/toolbox/prepare_model_cache/main.py @@ -0,0 +1,207 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import logging + +from projects.core.dsl import toolbox +from projects.llm_d.orchestration import llmd_runtime + +LOGGER = logging.getLogger(__name__) + + +def run() -> int: + llmd_runtime.init() + config = llmd_runtime.load_run_configuration() + return run_prepare_model_cache(config) + + +def run_prepare_model_cache(config: llmd_runtime.ResolvedConfig) -> int: + cache_spec = llmd_runtime.resolve_model_cache(config) + if not cache_spec: + LOGGER.info("Model cache disabled for preset=%s", config.preset_name) + return 0 + + if config.namespace_is_managed: + LOGGER.warning( + "Model cache PVC %s lives in managed namespace %s. Namespace cleanup will remove it; cache reuse requires a stable namespace override.", + cache_spec.pvc_name, + cache_spec.namespace, + ) + + ensure_model_cache_pvc(config, cache_spec) + if llmd_runtime.model_cache_pvc_ready(cache_spec): + LOGGER.info( + "Model cache PVC %s already contains %s; skipping download", + cache_spec.pvc_name, + cache_spec.source_uri, + ) + capture_model_cache_state(config, cache_spec) + return 0 + + run_model_cache_download_job(config, cache_spec) + llmd_runtime.annotate_model_cache_pvc(cache_spec) + capture_model_cache_state(config, cache_spec) + return 0 + + +def ensure_model_cache_pvc( + config: llmd_runtime.ResolvedConfig, cache_spec: llmd_runtime.ModelCacheSpec +) -> None: + existing = llmd_runtime.oc_get_json( + "persistentvolumeclaim", + name=cache_spec.pvc_name, + namespace=cache_spec.namespace, + ignore_not_found=True, + ) + if existing: + actual_modes = existing.get("spec", {}).get("accessModes", []) + if not llmd_runtime.pvc_access_mode_matches( + actual_modes, cache_spec.access_mode + ): + raise RuntimeError( + f"PVC {cache_spec.pvc_name} exists with access modes {actual_modes}, expected {cache_spec.access_mode}" + ) + + actual_storage_class = existing.get("spec", {}).get("storageClassName") + if ( + cache_spec.storage_class_name + and actual_storage_class != cache_spec.storage_class_name + ): + raise RuntimeError( + f"PVC {cache_spec.pvc_name} exists with storageClassName={actual_storage_class}, expected {cache_spec.storage_class_name}" + ) + + llmd_runtime.wait_for_pvc_bound( + cache_spec.pvc_name, + cache_spec.namespace, + timeout_seconds=config.model_cache["download"]["wait_timeout_seconds"], + ) + return + + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "model-cache-pvc.yaml", + llmd_runtime.render_model_cache_pvc(cache_spec), + ) + 
llmd_runtime.wait_for_pvc_bound( + cache_spec.pvc_name, + cache_spec.namespace, + timeout_seconds=config.model_cache["download"]["wait_timeout_seconds"], + ) + + +def run_model_cache_download_job( + config: llmd_runtime.ResolvedConfig, cache_spec: llmd_runtime.ModelCacheSpec +) -> None: + llmd_runtime.oc( + "delete", + "job", + cache_spec.download_job_name, + "-n", + cache_spec.namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.wait_until( + f"job/{cache_spec.download_job_name} deletion in {cache_spec.namespace}", + timeout_seconds=120, + interval_seconds=5, + predicate=lambda: not llmd_runtime.resource_exists( + "job", cache_spec.download_job_name, namespace=cache_spec.namespace + ), + ) + + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "model-cache-job.yaml", + llmd_runtime.render_model_cache_job(config, cache_spec), + ) + + try: + llmd_runtime.wait_for_job_completion( + cache_spec.download_job_name, + cache_spec.namespace, + timeout_seconds=config.model_cache["download"]["wait_timeout_seconds"], + interval_seconds=config.model_cache["download"]["poll_interval_seconds"], + ) + finally: + capture_model_cache_state(config, cache_spec) + + +def capture_model_cache_state( + config: llmd_runtime.ResolvedConfig, cache_spec: llmd_runtime.ModelCacheSpec +) -> None: + artifact_dir = config.artifact_dir / "artifacts" / "model-cache" + llmd_runtime.write_json( + artifact_dir / "spec.json", + { + "pvc_name": cache_spec.pvc_name, + "model_uri": cache_spec.model_uri, + "source_uri": cache_spec.source_uri, + "source_scheme": cache_spec.source_scheme, + }, + ) + + capture_resource_yaml( + "persistentvolumeclaim", + cache_spec.pvc_name, + cache_spec.namespace, + artifact_dir / "pvc.yaml", + ) + capture_resource_yaml( + "job", + cache_spec.download_job_name, + cache_spec.namespace, + artifact_dir / "job.yaml", + check=False, + ) + + for pod_name in llmd_runtime.job_pod_names( + cache_spec.download_job_name, cache_spec.namespace + ): + capture_resource_yaml( + "pod", + pod_name, + cache_spec.namespace, + artifact_dir / f"{pod_name}.yaml", + check=False, + ) + log_result = llmd_runtime.oc( + "logs", + pod_name, + "-n", + cache_spec.namespace, + check=False, + capture_output=True, + ) + if log_result.returncode == 0 and log_result.stdout: + llmd_runtime.write_text(artifact_dir / f"{pod_name}.log", log_result.stdout) + + +def capture_resource_yaml( + kind: str, + name: str, + namespace: str, + destination, + *, + check: bool = True, +) -> None: + result = llmd_runtime.oc( + "get", + kind, + name, + "-n", + namespace, + "-o", + "yaml", + check=check, + capture_output=True, + ) + if result.returncode == 0 and result.stdout: + llmd_runtime.write_text(destination, result.stdout) + + +main = toolbox.create_toolbox_main(run) + + +if __name__ == "__main__": + main() diff --git a/tests/llm_d/test_runtime.py b/tests/llm_d/test_runtime.py index b2bcff1d..d130e781 100644 --- a/tests/llm_d/test_runtime.py +++ b/tests/llm_d/test_runtime.py @@ -5,7 +5,9 @@ import pytest from projects.llm_d.orchestration import llmd_runtime +from projects.llm_d.toolbox.cleanup import main as cleanup_toolbox from projects.llm_d.toolbox.prepare import main as prepare_toolbox +from projects.llm_d.toolbox.prepare_model_cache import main as prepare_model_cache_toolbox from projects.llm_d.toolbox.test import main as test_toolbox @@ -39,8 +41,8 @@ def test_load_run_configuration_resolves_alias( assert config.preset_name == "smoke" assert config.preset_alias == "cks" assert config.model["served_model_name"] == 
"Qwen/Qwen3-0.6B" - assert config.namespace == "llm-d-e2e" - assert config.namespace_is_managed is True + assert config.namespace == "forge-llm-d" + assert config.namespace_is_managed is False def test_load_run_configuration_consolidates_config_d( @@ -54,6 +56,7 @@ def test_load_run_configuration_consolidates_config_d( consolidated = llmd_runtime.load_yaml(artifact_dir / "config.yaml") assert "platform" in consolidated + assert "model_cache" in consolidated assert "models" in consolidated assert "runtime" in consolidated assert "workloads" in consolidated @@ -75,6 +78,25 @@ def test_namespace_override_is_not_managed( assert config.namespace_is_managed is False +def test_default_namespace_comes_from_project_config( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + (tmp_path / "fournos_config.yaml").write_text( + "job-name: llm-d-nightly\n", + encoding="utf-8", + ) + + config = llmd_runtime.load_run_configuration( + cwd=tmp_path, artifact_dir=artifact_dir + ) + + assert config.namespace == "forge-llm-d" + assert config.namespace_is_managed is False + + def test_render_inference_service_injects_model_and_epp( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: @@ -86,13 +108,13 @@ def test_render_inference_service_injects_model_and_epp( cwd=tmp_path, artifact_dir=artifact_dir ) manifest = llmd_runtime.render_inference_service(config) + cache_spec = llmd_runtime.resolve_model_cache(config) assert manifest["metadata"]["name"] == "llm-d" assert manifest["metadata"]["namespace"] == config.namespace assert manifest["spec"]["model"]["name"] == "Qwen/Qwen3-0.6B" - assert manifest["spec"]["model"]["uri"] == "hf://Qwen/Qwen3-0.6B" + assert manifest["spec"]["model"]["uri"] == cache_spec.model_uri assert manifest["spec"]["model"]["name"] == config.model["served_model_name"] - assert manifest["spec"]["model"]["uri"] == config.model["uri"] router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0][ "args" ] @@ -100,6 +122,84 @@ def test_render_inference_service_injects_model_and_epp( assert "EndpointPickerConfig" in router_args[-1] +def test_resolve_model_cache_for_hf_model( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + + config = llmd_runtime.load_run_configuration( + cwd=tmp_path, artifact_dir=artifact_dir + ) + cache_spec = llmd_runtime.resolve_model_cache(config) + + assert cache_spec is not None + assert cache_spec.source_scheme == "hf" + assert cache_spec.pvc_name.startswith("llm-d-model-qwen3-0-6b-") + assert cache_spec.model_uri == f"pvc://{cache_spec.pvc_name}/model" + assert cache_spec.pvc_size == "10Gi" + assert cache_spec.access_mode == "ReadWriteOnce" + + +def test_render_model_cache_job_for_hf_model( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + + config = llmd_runtime.load_run_configuration( + cwd=tmp_path, artifact_dir=artifact_dir + ) + cache_spec = llmd_runtime.resolve_model_cache(config) + manifest = llmd_runtime.render_model_cache_job(config, cache_spec) + + container = manifest["spec"]["template"]["spec"]["containers"][0] + assert container["name"] == "hf-model-downloader" + assert container["image"] == "registry.access.redhat.com/ubi9/python-311" + assert any( + 
env["name"] == "MODEL_SOURCE" and env["value"] == "hf://Qwen/Qwen3-0.6B" + for env in container["env"] + ) + assert "huggingface_hub" in container["command"][-1] + + +def test_render_model_cache_job_for_oci_model_uses_registry_auth_secret( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + (tmp_path / "fournos_config.yaml").write_text( + "preset: benchmark-short\njob-name: llm-d-benchmark\n", + encoding="utf-8", + ) + + config = llmd_runtime.load_run_configuration( + cwd=tmp_path, artifact_dir=artifact_dir + ) + monkeypatch.setattr( + llmd_runtime, + "resolve_default_serviceaccount_image_pull_secret", + lambda namespace: "pull-secret", + ) + cache_spec = llmd_runtime.resolve_model_cache(config) + manifest = llmd_runtime.render_model_cache_job(config, cache_spec) + + container = manifest["spec"]["template"]["spec"]["containers"][0] + volume_names = { + volume["name"] for volume in manifest["spec"]["template"]["spec"]["volumes"] + } + + assert cache_spec.source_scheme == "oci" + assert container["name"] == "oci-model-extractor" + assert container["image"] == "registry.redhat.io/openshift4/ose-cli:v4.19" + assert any(env["name"] == "OCI_IMAGE_PATH" and env["value"] == "/" for env in container["env"]) + assert "registry-auth" in volume_names + assert "oc image extract" in container["command"][-1] + + def test_render_guidellm_job_uses_target_and_rate( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: @@ -122,6 +222,83 @@ def test_render_guidellm_job_uses_target_and_rate( assert "--rate=1" in container["args"] +def test_prepare_model_cache_skips_ready_pvc( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + + config = llmd_runtime.load_run_configuration( + cwd=tmp_path, artifact_dir=artifact_dir + ) + cache_spec = llmd_runtime.resolve_model_cache(config) + calls: list[str] = [] + + monkeypatch.setattr( + prepare_model_cache_toolbox, + "ensure_model_cache_pvc", + lambda _config, _cache_spec: calls.append("ensure-pvc"), + ) + monkeypatch.setattr( + llmd_runtime, "model_cache_pvc_ready", lambda _cache_spec: True + ) + monkeypatch.setattr( + prepare_model_cache_toolbox, + "capture_model_cache_state", + lambda _config, _cache_spec: calls.append("capture"), + ) + monkeypatch.setattr( + prepare_model_cache_toolbox, + "run_model_cache_download_job", + lambda _config, _cache_spec: calls.append("download"), + ) + + prepare_model_cache_toolbox.run_prepare_model_cache(config) + + assert calls == ["ensure-pvc", "capture"] + + +def test_cleanup_deletes_leftovers_but_not_namespace_or_preserved_pvcs( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + + config = llmd_runtime.load_run_configuration( + cwd=tmp_path, artifact_dir=artifact_dir + ) + oc_calls: list[tuple[str, ...]] = [] + + def fake_resource_exists(kind: str, name: str, namespace: str | None = None) -> bool: + if kind == "namespace": + return True + return False + + monkeypatch.setattr(llmd_runtime, "resource_exists", fake_resource_exists) + monkeypatch.setattr( + llmd_runtime, + "oc", + lambda *args, **kwargs: oc_calls.append(tuple(args)), + ) + monkeypatch.setattr(llmd_runtime, "wait_until", lambda *args, **kwargs: True) + monkeypatch.setattr(cleanup_toolbox, 
"_llm_d_pods_gone", lambda *_args: True) + + cleanup_toolbox.delete_run_leftovers(config) + + assert ("delete", "namespace", config.namespace, "--ignore-not-found=true") not in oc_calls + assert ( + "delete", + "pvc", + "-n", + config.namespace, + "-l", + "forge.openshift.io/project=llm_d,forge.openshift.io/preserve!=true", + "--ignore-not-found=true", + ) in oc_calls + + def test_prepare_gpu_operator_skips_existing_clusterpolicy( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: From be5a512c3a0933f5c4dfeba7e6f2c27291d28d4a Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Thu, 16 Apr 2026 13:35:58 +0100 Subject: [PATCH 04/21] fix: Lint after rebase --- projects/core/library/config.py | 4 +- projects/llm_d/orchestration/cli.py | 1 - projects/llm_d/orchestration/llmd_runtime.py | 89 +++++-------------- projects/llm_d/orchestration/prepare_llmd.py | 3 - projects/llm_d/orchestration/test_llmd.py | 2 - projects/llm_d/toolbox/prepare/main.py | 72 ++++----------- .../llm_d/toolbox/prepare_model_cache/main.py | 13 +-- projects/llm_d/toolbox/test/main.py | 57 ++++-------- tests/llm_d/test_runtime.py | 77 ++++------------ 9 files changed, 81 insertions(+), 237 deletions(-) diff --git a/projects/core/library/config.py b/projects/core/library/config.py index b17b3e3a..740e921c 100644 --- a/projects/core/library/config.py +++ b/projects/core/library/config.py @@ -307,9 +307,7 @@ def multi_dereference(): # --- # - new_value = ( - simple_dereference() if value.startswith("@") else multi_dereference() - ) + new_value = simple_dereference() if value.startswith("@") else multi_dereference() if not handled_secretly: logger.info(f"resolve_reference: {value} ==> '{new_value}'") diff --git a/projects/llm_d/orchestration/cli.py b/projects/llm_d/orchestration/cli.py index 06ae9ef6..ca87c653 100644 --- a/projects/llm_d/orchestration/cli.py +++ b/projects/llm_d/orchestration/cli.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 import logging -import sys import types import click diff --git a/projects/llm_d/orchestration/llmd_runtime.py b/projects/llm_d/orchestration/llmd_runtime.py index 2c961e7c..48c503f3 100644 --- a/projects/llm_d/orchestration/llmd_runtime.py +++ b/projects/llm_d/orchestration/llmd_runtime.py @@ -9,9 +9,10 @@ import shlex import subprocess import time +from collections.abc import Iterable from dataclasses import dataclass from pathlib import Path -from typing import Any, Iterable +from typing import Any import yaml @@ -201,9 +202,7 @@ def load_fournos_config(cwd: Path) -> dict[str, Any]: if data is None: return {} if not isinstance(data, dict): - raise ValueError( - f"Unexpected FOURNOS config type in {config_path}: {type(data)}" - ) + raise ValueError(f"Unexpected FOURNOS config type in {config_path}: {type(data)}") return data @@ -253,9 +252,7 @@ def derive_namespace(job_name: str, prefix: str, max_length: int) -> str: namespace = namespace[:max_length].rstrip("-") if not namespace: - raise ValueError( - f"Could not derive a valid namespace from job name: {job_name}" - ) + raise ValueError(f"Could not derive a valid namespace from job name: {job_name}") return namespace @@ -282,9 +279,7 @@ def resolve_model_cache(config: ResolvedConfig) -> ModelCacheSpec | None: elif source_uri.startswith("oci://"): source_scheme = "oci" else: - raise ValueError( - f"Unsupported model cache source URI for {config.model_key}: {source_uri}" - ) + raise ValueError(f"Unsupported model cache source URI for {config.model_key}: {source_uri}") model_cache_overrides = config.model.get("cache", {}) pvc_defaults 
= config.model_cache["pvc"] @@ -302,9 +297,7 @@ def resolve_model_cache(config: ResolvedConfig) -> ModelCacheSpec | None: namespace=config.namespace, pvc_name=pvc_name, pvc_size=model_cache_overrides.get("pvc_size", pvc_defaults["size"]), - access_mode=model_cache_overrides.get( - "access_mode", pvc_defaults["access_mode"] - ), + access_mode=model_cache_overrides.get("access_mode", pvc_defaults["access_mode"]), storage_class_name=model_cache_overrides.get( "storage_class_name", pvc_defaults.get("storage_class_name") ), @@ -323,9 +316,7 @@ def resolve_model_cache(config: ResolvedConfig) -> ModelCacheSpec | None: "oci_registry_auth_secret_name", config.model_cache["oci"].get("registry_auth_secret_name"), ), - oci_registry_auth_secret_key=config.model_cache["oci"].get( - "registry_auth_secret_key" - ), + oci_registry_auth_secret_key=config.model_cache["oci"].get("registry_auth_secret_key"), ) @@ -460,9 +451,7 @@ def wait_until( time.sleep(interval_seconds) if last_error: - raise RuntimeError( - f"Timed out waiting for {description}: {last_error}" - ) from last_error + raise RuntimeError(f"Timed out waiting for {description}: {last_error}") from last_error raise RuntimeError(f"Timed out waiting for {description}") @@ -487,15 +476,11 @@ def wait_for_crd(crd_name: str, timeout_seconds: int) -> None: ) -def wait_for_operator_csv( - package: str, namespace: str, timeout_seconds: int -) -> dict[str, Any]: +def wait_for_operator_csv(package: str, namespace: str, timeout_seconds: int) -> dict[str, Any]: selector = f"operators.coreos.com/{package}.{namespace}" def _csv_ready() -> dict[str, Any] | None: - data = oc_get_json( - "csv", namespace=namespace, selector=selector, ignore_not_found=True - ) + data = oc_get_json("csv", namespace=namespace, selector=selector, ignore_not_found=True) if not data: return None items = data.get("items", []) @@ -557,9 +542,7 @@ def ensure_subscription(operator_spec: dict[str, Any]) -> None: namespace=namespace, ignore_not_found=True, ) - if current and not subscription_spec_matches( - current.get("spec", {}), subscription["spec"] - ): + if current and not subscription_spec_matches(current.get("spec", {}), subscription["spec"]): LOGGER.info("Reconciling subscription drift for %s in %s", package, namespace) oc("apply", "-f", "-", input_text=yaml.safe_dump(subscription, sort_keys=False)) @@ -611,9 +594,7 @@ def operator_spec_by_package(platform: dict[str, Any], package: str) -> dict[str raise KeyError(f"Unknown operator package in llm_d platform config: {package}") -def load_manifest_template( - config: ResolvedConfig, relative_path: str -) -> dict[str, Any]: +def load_manifest_template(config: ResolvedConfig, relative_path: str) -> dict[str, Any]: return load_yaml(config.config_dir / relative_path) @@ -634,9 +615,7 @@ def pvc_access_mode_matches(actual_modes: list[str], expected_mode: str) -> bool return expected_mode in actual_modes -def wait_for_pvc_bound( - pvc_name: str, namespace: str, *, timeout_seconds: int -) -> dict[str, Any]: +def wait_for_pvc_bound(pvc_name: str, namespace: str, *, timeout_seconds: int) -> dict[str, Any]: def _pvc_bound() -> dict[str, Any] | None: payload = oc_get_json( "persistentvolumeclaim", @@ -718,9 +697,7 @@ def resolve_default_serviceaccount_image_pull_secret(namespace: str) -> str | No def render_datasciencecluster(config: ResolvedConfig) -> dict[str, Any]: - template_path = ( - config.config_dir / config.platform["rhoai"]["datasciencecluster_template"] - ) + template_path = config.config_dir / 
config.platform["rhoai"]["datasciencecluster_template"] manifest = load_yaml(template_path) manifest["metadata"]["name"] = config.platform["rhoai"]["datasciencecluster_name"] manifest["metadata"]["namespace"] = config.platform["rhoai"]["namespace"] @@ -732,9 +709,7 @@ def render_gateway(config: ResolvedConfig) -> dict[str, Any]: manifest = load_yaml(template_path) manifest["metadata"]["name"] = config.platform["gateway"]["name"] manifest["metadata"]["namespace"] = config.platform["gateway"]["namespace"] - manifest["spec"]["gatewayClassName"] = config.platform["gateway"][ - "gateway_class_name" - ] + manifest["spec"]["gatewayClassName"] = config.platform["gateway"]["gateway_class_name"] return manifest @@ -766,9 +741,7 @@ def render_model_cache_pvc(spec: ModelCacheSpec) -> dict[str, Any]: return manifest -def render_model_cache_job( - config: ResolvedConfig, spec: ModelCacheSpec -) -> dict[str, Any]: +def render_model_cache_job(config: ResolvedConfig, spec: ModelCacheSpec) -> dict[str, Any]: common_env = [ {"name": "MODEL_SOURCE", "value": spec.source_uri}, {"name": "MODEL_TARGET_DIR", "value": f"/cache/{spec.model_path}"}, @@ -859,9 +832,7 @@ def render_model_cache_job( EOF """ volume_mounts = [{"name": "cache", "mountPath": "/cache"}] - common_env.append( - {"name": "OCI_IMAGE_PATH", "value": spec.oci_image_path or "/"} - ) + common_env.append({"name": "OCI_IMAGE_PATH", "value": spec.oci_image_path or "/"}) if registry_auth_secret_name: volumes.append( { @@ -907,9 +878,7 @@ def render_model_cache_job( }, "spec": { "backoffLimit": 0, - "activeDeadlineSeconds": config.model_cache["download"][ - "wait_timeout_seconds" - ], + "activeDeadlineSeconds": config.model_cache["download"]["wait_timeout_seconds"], "template": { "metadata": { "labels": { @@ -955,7 +924,7 @@ def annotate_model_cache_pvc(spec: ModelCacheSpec) -> None: "-n", spec.namespace, "--overwrite", - f"forge.openshift.io/model-cache-ready=true", + "forge.openshift.io/model-cache-ready=true", f"forge.openshift.io/model-cache-key={spec.cache_key}", f"forge.openshift.io/model-source-uri={spec.source_uri}", f"forge.openshift.io/model-uri={spec.model_uri}", @@ -978,21 +947,15 @@ def render_inference_service(config: ResolvedConfig) -> dict[str, Any]: ) cache_spec = resolve_model_cache(config) - manifest["spec"]["model"]["uri"] = ( - cache_spec.model_uri if cache_spec else config.model["uri"] - ) + manifest["spec"]["model"]["uri"] = cache_spec.model_uri if cache_spec else config.model["uri"] manifest["spec"]["model"]["name"] = config.model["served_model_name"] manifest["spec"]["template"]["containers"][0]["resources"] = copy.deepcopy( config.model["resources"] ) - epp_path = ( - config.config_dir / config.platform["inference_service"]["epp_config_template"] - ) + epp_path = config.config_dir / config.platform["inference_service"]["epp_config_template"] epp_config = epp_path.read_text(encoding="utf-8") - router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0][ - "args" - ] + router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0]["args"] if not router_args or router_args[-1] != "--config-text": raise ValueError("Expected llm-d router args to end with --config-text") router_args.append(epp_config) @@ -1078,9 +1041,7 @@ def render_guidellm_job(config: ResolvedConfig, endpoint_url: str) -> dict[str, {"name": "home", "emptyDir": {}}, { "name": "results", - "persistentVolumeClaim": { - "claimName": config.benchmark["job_name"] - }, + "persistentVolumeClaim": {"claimName": 
config.benchmark["job_name"]}, }, ], }, @@ -1140,9 +1101,7 @@ def render_guidellm_copy_pod( "volumes": [ { "name": "results", - "persistentVolumeClaim": { - "claimName": config.benchmark["job_name"] - }, + "persistentVolumeClaim": {"claimName": config.benchmark["job_name"]}, } ], }, diff --git a/projects/llm_d/orchestration/prepare_llmd.py b/projects/llm_d/orchestration/prepare_llmd.py index d52f921a..ba64a9dc 100644 --- a/projects/llm_d/orchestration/prepare_llmd.py +++ b/projects/llm_d/orchestration/prepare_llmd.py @@ -1,10 +1,7 @@ from __future__ import annotations from projects.llm_d.toolbox.cleanup.main import run as cleanup_toolbox_run -from projects.llm_d.toolbox.cleanup.main import run_cleanup -from projects.llm_d.toolbox.prepare.main import prepare_gpu_operator from projects.llm_d.toolbox.prepare.main import run as prepare_toolbox_run -from projects.llm_d.toolbox.prepare.main import run_prepare def prepare() -> int: diff --git a/projects/llm_d/orchestration/test_llmd.py b/projects/llm_d/orchestration/test_llmd.py index 8fc2bc40..5254cafb 100644 --- a/projects/llm_d/orchestration/test_llmd.py +++ b/projects/llm_d/orchestration/test_llmd.py @@ -1,9 +1,7 @@ from __future__ import annotations from projects.llm_d.orchestration import llmd_runtime -from projects.llm_d.toolbox.test.main import resolve_endpoint_url from projects.llm_d.toolbox.test.main import run as test_toolbox_run -from projects.llm_d.toolbox.test.main import run_test def init() -> None: diff --git a/projects/llm_d/toolbox/prepare/main.py b/projects/llm_d/toolbox/prepare/main.py index 3ebbaf67..621c34d4 100644 --- a/projects/llm_d/toolbox/prepare/main.py +++ b/projects/llm_d/toolbox/prepare/main.py @@ -21,9 +21,7 @@ def run() -> int: def run_prepare(config: llmd_runtime.ResolvedConfig) -> int: - LOGGER.info( - "Preparing llm_d preset=%s namespace=%s", config.preset_name, config.namespace - ) + LOGGER.info("Preparing llm_d preset=%s namespace=%s", config.preset_name, config.namespace) verify_oc_access() verify_cluster_version(config) @@ -59,14 +57,10 @@ def verify_cluster_version(config: llmd_runtime.ResolvedConfig) -> None: or payload.get("serverVersion", {}).get("platform") ) if not openshift_version: - raise RuntimeError( - "Could not determine OpenShift version from `oc version -o json`" - ) + raise RuntimeError("Could not determine OpenShift version from `oc version -o json`") minimum = config.platform["cluster"]["minimum_openshift_version"] - if llmd_runtime.version_tuple(openshift_version) < llmd_runtime.version_tuple( - minimum - ): + if llmd_runtime.version_tuple(openshift_version) < llmd_runtime.version_tuple(minimum): raise RuntimeError( f"Cluster version {openshift_version} is older than the llm_d minimum {minimum}" ) @@ -89,9 +83,7 @@ def prepare_cert_manager(config: llmd_runtime.ResolvedConfig) -> None: def prepare_leader_worker_set(config: llmd_runtime.ResolvedConfig) -> None: - operator_spec = llmd_runtime.operator_spec_by_package( - config.platform, "leader-worker-set" - ) + operator_spec = llmd_runtime.operator_spec_by_package(config.platform, "leader-worker-set") ensure_operator_subscription(operator_spec) @@ -103,9 +95,7 @@ def prepare_nfd(config: llmd_runtime.ResolvedConfig) -> None: timeout_seconds=operator_spec["wait_timeout_seconds"], ) - manifest = llmd_runtime.load_manifest_template( - config, operator_spec["bootstrap_manifest"] - ) + manifest = llmd_runtime.load_manifest_template(config, operator_spec["bootstrap_manifest"]) llmd_runtime.apply_manifest( config.artifact_dir / "src" / 
"nfd-nodefeaturediscovery.yaml", manifest, @@ -122,24 +112,18 @@ def prepare_nfd(config: llmd_runtime.ResolvedConfig) -> None: ), ) - wait_for_nfd_gpu_labels( - config, timeout_seconds=operator_spec["wait_timeout_seconds"] - ) + wait_for_nfd_gpu_labels(config, timeout_seconds=operator_spec["wait_timeout_seconds"]) def prepare_gpu_operator(config: llmd_runtime.ResolvedConfig) -> None: - operator_spec = llmd_runtime.operator_spec_by_package( - config.platform, "gpu-operator-certified" - ) + operator_spec = llmd_runtime.operator_spec_by_package(config.platform, "gpu-operator-certified") ensure_operator_subscription(operator_spec) llmd_runtime.wait_for_crd( operator_spec["bootstrap_crd"], timeout_seconds=operator_spec["wait_timeout_seconds"], ) - manifest = llmd_runtime.load_manifest_template( - config, operator_spec["bootstrap_manifest"] - ) + manifest = llmd_runtime.load_manifest_template(config, operator_spec["bootstrap_manifest"]) clusterpolicy_name = manifest["metadata"]["name"] if llmd_runtime.resource_exists("clusterpolicy", clusterpolicy_name): LOGGER.info( @@ -163,9 +147,7 @@ def prepare_gpu_operator(config: llmd_runtime.ResolvedConfig) -> None: ) -def wait_for_gpu_clusterpolicy_ready( - clusterpolicy_name: str, *, timeout_seconds: int -) -> None: +def wait_for_gpu_clusterpolicy_ready(clusterpolicy_name: str, *, timeout_seconds: int) -> None: def _clusterpolicy_ready() -> bool: payload = llmd_runtime.oc_get_json( "clusterpolicy", @@ -183,16 +165,12 @@ def _clusterpolicy_ready() -> bool: def prepare_rhoai_operator(config: llmd_runtime.ResolvedConfig) -> None: - operator_spec = llmd_runtime.operator_spec_by_package( - config.platform, "rhods-operator" - ) + operator_spec = llmd_runtime.operator_spec_by_package(config.platform, "rhods-operator") ensure_operator_subscription(operator_spec) ensure_required_crds(config.platform["rhoai"]["required_crds_before_dsc"], config) -def ensure_required_crds( - crd_names: list[str], config: llmd_runtime.ResolvedConfig -) -> None: +def ensure_required_crds(crd_names: list[str], config: llmd_runtime.ResolvedConfig) -> None: for crd_name in crd_names: llmd_runtime.wait_for_crd( crd_name, @@ -202,9 +180,7 @@ def ensure_required_crds( def apply_datasciencecluster(config: llmd_runtime.ResolvedConfig) -> None: manifest = llmd_runtime.render_datasciencecluster(config) - llmd_runtime.apply_manifest( - config.artifact_dir / "src" / "datasciencecluster.yaml", manifest - ) + llmd_runtime.apply_manifest(config.artifact_dir / "src" / "datasciencecluster.yaml", manifest) llmd_runtime.oc( "get", "datasciencecluster", @@ -243,17 +219,13 @@ def _dsc_ready() -> bool: def ensure_gateway(config: llmd_runtime.ResolvedConfig) -> None: gateway = config.platform["gateway"] - if not llmd_runtime.resource_exists( - "gateway", gateway["name"], namespace=gateway["namespace"] - ): + if not llmd_runtime.resource_exists("gateway", gateway["name"], namespace=gateway["namespace"]): if not gateway["create_if_missing"]: raise RuntimeError( f"Required gateway {gateway['name']} does not exist in {gateway['namespace']}" ) manifest = llmd_runtime.render_gateway(config) - llmd_runtime.apply_manifest( - config.artifact_dir / "src" / "gateway.yaml", manifest - ) + llmd_runtime.apply_manifest(config.artifact_dir / "src" / "gateway.yaml", manifest) def _gateway_programmed() -> bool: resource = llmd_runtime.oc_get_json( @@ -291,16 +263,12 @@ def verify_gpu_nodes(config: llmd_runtime.ResolvedConfig) -> None: ) -def wait_for_nfd_gpu_labels( - config: llmd_runtime.ResolvedConfig, *, 
timeout_seconds: int -) -> None: +def wait_for_nfd_gpu_labels(config: llmd_runtime.ResolvedConfig, *, timeout_seconds: int) -> None: selectors = config.platform["cluster"]["nfd_gpu_detection_labels"] def _labels_present() -> bool: for selector in selectors: - data = llmd_runtime.oc_get_json( - "nodes", selector=selector, ignore_not_found=True - ) + data = llmd_runtime.oc_get_json("nodes", selector=selector, ignore_not_found=True) if data and data.get("items"): return True return False @@ -342,13 +310,9 @@ def capture_prepare_state(config: llmd_runtime.ResolvedConfig) -> None: capture_output=True, ) if gateway_service.returncode == 0 and gateway_service.stdout: - llmd_runtime.write_text( - artifacts_dir / "gateway.service.yaml", gateway_service.stdout - ) + llmd_runtime.write_text(artifacts_dir / "gateway.service.yaml", gateway_service.stdout) if config.platform["artifacts"]["capture_namespace_events"]: - capture_namespace_events( - config.namespace, artifacts_dir / "namespace.events.txt" - ) + capture_namespace_events(config.namespace, artifacts_dir / "namespace.events.txt") def capture_resource_yaml( diff --git a/projects/llm_d/toolbox/prepare_model_cache/main.py b/projects/llm_d/toolbox/prepare_model_cache/main.py index 143ae77a..1dc50758 100644 --- a/projects/llm_d/toolbox/prepare_model_cache/main.py +++ b/projects/llm_d/toolbox/prepare_model_cache/main.py @@ -56,18 +56,13 @@ def ensure_model_cache_pvc( ) if existing: actual_modes = existing.get("spec", {}).get("accessModes", []) - if not llmd_runtime.pvc_access_mode_matches( - actual_modes, cache_spec.access_mode - ): + if not llmd_runtime.pvc_access_mode_matches(actual_modes, cache_spec.access_mode): raise RuntimeError( f"PVC {cache_spec.pvc_name} exists with access modes {actual_modes}, expected {cache_spec.access_mode}" ) actual_storage_class = existing.get("spec", {}).get("storageClassName") - if ( - cache_spec.storage_class_name - and actual_storage_class != cache_spec.storage_class_name - ): + if cache_spec.storage_class_name and actual_storage_class != cache_spec.storage_class_name: raise RuntimeError( f"PVC {cache_spec.pvc_name} exists with storageClassName={actual_storage_class}, expected {cache_spec.storage_class_name}" ) @@ -155,9 +150,7 @@ def capture_model_cache_state( check=False, ) - for pod_name in llmd_runtime.job_pod_names( - cache_spec.download_job_name, cache_spec.namespace - ): + for pod_name in llmd_runtime.job_pod_names(cache_spec.download_job_name, cache_spec.namespace): capture_resource_yaml( "pod", pod_name, diff --git a/projects/llm_d/toolbox/test/main.py b/projects/llm_d/toolbox/test/main.py index d779c18c..0ea05751 100644 --- a/projects/llm_d/toolbox/test/main.py +++ b/projects/llm_d/toolbox/test/main.py @@ -20,7 +20,6 @@ def run() -> int: def run_test(config: llmd_runtime.ResolvedConfig) -> int: - name = config.platform["inference_service"]["name"] namespace = config.namespace artifacts_dir = config.artifact_dir / "artifacts" @@ -40,9 +39,7 @@ def run_test(config: llmd_runtime.ResolvedConfig) -> int: capture_inference_service_state(config) if endpoint_url: llmd_runtime.write_text(artifacts_dir / "endpoint.url", f"{endpoint_url}\n") - benchmark_name = ( - config.benchmark["job_name"] if config.benchmark else "guidellm-benchmark" - ) + benchmark_name = config.benchmark["job_name"] if config.benchmark else "guidellm-benchmark" llmd_runtime.oc( "delete", "job,pvc", @@ -71,9 +68,7 @@ def run_test(config: llmd_runtime.ResolvedConfig) -> int: capture_output=True, ) if events.returncode == 0 and events.stdout: - 
llmd_runtime.write_text( - artifacts_dir / "namespace.events.txt", events.stdout - ) + llmd_runtime.write_text(artifacts_dir / "namespace.events.txt", events.stdout) def deploy_inference_service(config: llmd_runtime.ResolvedConfig) -> str: @@ -105,9 +100,7 @@ def _old_pods_gone() -> bool: ) manifest = llmd_runtime.render_inference_service(config) - llmd_runtime.apply_manifest( - config.artifact_dir / "src" / "llminferenceservice.yaml", manifest - ) + llmd_runtime.apply_manifest(config.artifact_dir / "src" / "llminferenceservice.yaml", manifest) def _pods_present() -> bool: pods = llmd_runtime.oc_get_json( @@ -117,17 +110,13 @@ def _pods_present() -> bool: llmd_runtime.wait_until( f"llm-d pods to appear in {namespace}", - timeout_seconds=config.platform["inference_service"][ - "pod_appearance_timeout_seconds" - ], + timeout_seconds=config.platform["inference_service"]["pod_appearance_timeout_seconds"], interval_seconds=5, predicate=_pods_present, ) def _service_ready() -> bool: - payload = llmd_runtime.oc_get_json( - "llminferenceservice", name=name, namespace=namespace - ) + payload = llmd_runtime.oc_get_json("llminferenceservice", name=name, namespace=namespace) return llmd_runtime.condition_status(payload, "Ready") == "True" llmd_runtime.wait_until( @@ -161,9 +150,7 @@ def try_resolve_endpoint_url(config: llmd_runtime.ResolvedConfig) -> str | None: name = config.platform["inference_service"]["name"] namespace = config.namespace gateway_name = config.platform["gateway"]["status_address_name"] - payload = llmd_runtime.oc_get_json( - "llminferenceservice", name=name, namespace=namespace - ) + payload = llmd_runtime.oc_get_json("llminferenceservice", name=name, namespace=namespace) for address in payload.get("status", {}).get("addresses", []): if address.get("name") == gateway_name and address.get("url"): @@ -171,12 +158,12 @@ def try_resolve_endpoint_url(config: llmd_runtime.ResolvedConfig) -> str | None: return None -def run_smoke_request( - config: llmd_runtime.ResolvedConfig, endpoint_url: str -) -> dict[str, object]: +def run_smoke_request(config: llmd_runtime.ResolvedConfig, endpoint_url: str) -> dict[str, object]: namespace = config.namespace name = config.platform["inference_service"]["name"] - deployment_name = f"{name}{config.platform['inference_service']['workload_deployment_name_suffix']}" + deployment_name = ( + f"{name}{config.platform['inference_service']['workload_deployment_name_suffix']}" + ) payload = { "model": config.model["served_model_name"], @@ -184,9 +171,7 @@ def run_smoke_request( "max_tokens": config.smoke_request["max_tokens"], "temperature": config.smoke_request["temperature"], } - llmd_runtime.write_json( - config.artifact_dir / "artifacts" / "smoke.request.json", payload - ) + llmd_runtime.write_json(config.artifact_dir / "artifacts" / "smoke.request.json", payload) retries = config.platform["smoke"]["request_retries"] delay = config.platform["smoke"]["request_retry_delay_seconds"] @@ -224,9 +209,7 @@ def run_smoke_request( return response -def run_guidellm_benchmark( - config: llmd_runtime.ResolvedConfig, endpoint_url: str -) -> None: +def run_guidellm_benchmark(config: llmd_runtime.ResolvedConfig, endpoint_url: str) -> None: benchmark_name = config.benchmark["job_name"] namespace = config.namespace @@ -259,9 +242,7 @@ def run_guidellm_benchmark( ) def _job_terminal() -> dict[str, object] | None: - payload = llmd_runtime.oc_get_json( - "job", name=benchmark_name, namespace=namespace - ) + payload = llmd_runtime.oc_get_json("job", name=benchmark_name, 
namespace=namespace) status = payload.get("status", {}) if status.get("succeeded"): return payload @@ -379,12 +360,8 @@ def capture_inference_service_state(config: llmd_runtime.ResolvedConfig) -> None artifacts_dir / "llminferenceservice.replicasets.yaml", selector=selector, ) - capture_get( - "pods", None, namespace, "wide", artifacts_dir / "namespace.pods.status" - ) - capture_get( - "services", None, namespace, "wide", artifacts_dir / "namespace.services.status" - ) + capture_get("pods", None, namespace, "wide", artifacts_dir / "namespace.pods.status") + capture_get("services", None, namespace, "wide", artifacts_dir / "namespace.services.status") pod_list = llmd_runtime.oc_get_json( "pods", namespace=namespace, selector=selector, ignore_not_found=True @@ -459,9 +436,7 @@ def capture_guidellm_state(config: llmd_runtime.ResolvedConfig) -> None: capture_output=True, ) if result.returncode == 0 and result.stdout: - llmd_runtime.write_text( - artifacts_dir / "guidellm_benchmark_job.logs", result.stdout - ) + llmd_runtime.write_text(artifacts_dir / "guidellm_benchmark_job.logs", result.stdout) def capture_get( diff --git a/tests/llm_d/test_runtime.py b/tests/llm_d/test_runtime.py index d130e781..4fe116ee 100644 --- a/tests/llm_d/test_runtime.py +++ b/tests/llm_d/test_runtime.py @@ -34,9 +34,7 @@ def test_load_run_configuration_resolves_alias( encoding="utf-8", ) - config = llmd_runtime.load_run_configuration( - cwd=tmp_path, artifact_dir=artifact_dir - ) + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) assert config.preset_name == "smoke" assert config.preset_alias == "cks" @@ -63,16 +61,12 @@ def test_load_run_configuration_consolidates_config_d( assert consolidated["runtime"]["default_preset"] == "smoke" -def test_namespace_override_is_not_managed( - tmp_path: Path, monkeypatch: pytest.MonkeyPatch -) -> None: +def test_namespace_override_is_not_managed(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", '{"namespace":"custom-ns"}') artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() - config = llmd_runtime.load_run_configuration( - cwd=tmp_path, artifact_dir=artifact_dir - ) + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) assert config.namespace == "custom-ns" assert config.namespace_is_managed is False @@ -89,9 +83,7 @@ def test_default_namespace_comes_from_project_config( encoding="utf-8", ) - config = llmd_runtime.load_run_configuration( - cwd=tmp_path, artifact_dir=artifact_dir - ) + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) assert config.namespace == "forge-llm-d" assert config.namespace_is_managed is False @@ -104,9 +96,7 @@ def test_render_inference_service_injects_model_and_epp( artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() - config = llmd_runtime.load_run_configuration( - cwd=tmp_path, artifact_dir=artifact_dir - ) + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) manifest = llmd_runtime.render_inference_service(config) cache_spec = llmd_runtime.resolve_model_cache(config) @@ -115,23 +105,17 @@ def test_render_inference_service_injects_model_and_epp( assert manifest["spec"]["model"]["name"] == "Qwen/Qwen3-0.6B" assert manifest["spec"]["model"]["uri"] == cache_spec.model_uri assert manifest["spec"]["model"]["name"] == config.model["served_model_name"] - router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0][ - "args" - ] + 
router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0]["args"] assert router_args[-2] == "--config-text" assert "EndpointPickerConfig" in router_args[-1] -def test_resolve_model_cache_for_hf_model( - tmp_path: Path, monkeypatch: pytest.MonkeyPatch -) -> None: +def test_resolve_model_cache_for_hf_model(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() - config = llmd_runtime.load_run_configuration( - cwd=tmp_path, artifact_dir=artifact_dir - ) + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) cache_spec = llmd_runtime.resolve_model_cache(config) assert cache_spec is not None @@ -149,9 +133,7 @@ def test_render_model_cache_job_for_hf_model( artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() - config = llmd_runtime.load_run_configuration( - cwd=tmp_path, artifact_dir=artifact_dir - ) + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) cache_spec = llmd_runtime.resolve_model_cache(config) manifest = llmd_runtime.render_model_cache_job(config, cache_spec) @@ -176,9 +158,7 @@ def test_render_model_cache_job_for_oci_model_uses_registry_auth_secret( encoding="utf-8", ) - config = llmd_runtime.load_run_configuration( - cwd=tmp_path, artifact_dir=artifact_dir - ) + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) monkeypatch.setattr( llmd_runtime, "resolve_default_serviceaccount_image_pull_secret", @@ -188,9 +168,7 @@ def test_render_model_cache_job_for_oci_model_uses_registry_auth_secret( manifest = llmd_runtime.render_model_cache_job(config, cache_spec) container = manifest["spec"]["template"]["spec"]["containers"][0] - volume_names = { - volume["name"] for volume in manifest["spec"]["template"]["spec"]["volumes"] - } + volume_names = {volume["name"] for volume in manifest["spec"]["template"]["spec"]["volumes"]} assert cache_spec.source_scheme == "oci" assert container["name"] == "oci-model-extractor" @@ -211,9 +189,7 @@ def test_render_guidellm_job_uses_target_and_rate( encoding="utf-8", ) - config = llmd_runtime.load_run_configuration( - cwd=tmp_path, artifact_dir=artifact_dir - ) + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) manifest = llmd_runtime.render_guidellm_job(config, "https://example.test") container = manifest["spec"]["template"]["spec"]["containers"][0] @@ -229,10 +205,7 @@ def test_prepare_model_cache_skips_ready_pvc( artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() - config = llmd_runtime.load_run_configuration( - cwd=tmp_path, artifact_dir=artifact_dir - ) - cache_spec = llmd_runtime.resolve_model_cache(config) + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) calls: list[str] = [] monkeypatch.setattr( @@ -240,9 +213,7 @@ def test_prepare_model_cache_skips_ready_pvc( "ensure_model_cache_pvc", lambda _config, _cache_spec: calls.append("ensure-pvc"), ) - monkeypatch.setattr( - llmd_runtime, "model_cache_pvc_ready", lambda _cache_spec: True - ) + monkeypatch.setattr(llmd_runtime, "model_cache_pvc_ready", lambda _cache_spec: True) monkeypatch.setattr( prepare_model_cache_toolbox, "capture_model_cache_state", @@ -266,9 +237,7 @@ def test_cleanup_deletes_leftovers_but_not_namespace_or_preserved_pvcs( artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() - config = llmd_runtime.load_run_configuration( - cwd=tmp_path, 
artifact_dir=artifact_dir - ) + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) oc_calls: list[tuple[str, ...]] = [] def fake_resource_exists(kind: str, name: str, namespace: str | None = None) -> bool: @@ -305,9 +274,7 @@ def test_prepare_gpu_operator_skips_existing_clusterpolicy( monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() - config = llmd_runtime.load_run_configuration( - cwd=tmp_path, artifact_dir=artifact_dir - ) + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) calls: list[str] = [] @@ -357,9 +324,7 @@ def test_prepare_gpu_operator_bootstraps_missing_clusterpolicy( monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() - config = llmd_runtime.load_run_configuration( - cwd=tmp_path, artifact_dir=artifact_dir - ) + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) applied: list[Path] = [] manifest = { @@ -371,9 +336,7 @@ def test_prepare_gpu_operator_bootstraps_missing_clusterpolicy( monkeypatch.setattr(prepare_toolbox, "ensure_operator_subscription", lambda _: None) monkeypatch.setattr(llmd_runtime, "wait_for_crd", lambda *_, **__: None) - monkeypatch.setattr( - llmd_runtime, "load_manifest_template", lambda _config, _path: manifest - ) + monkeypatch.setattr(llmd_runtime, "load_manifest_template", lambda _config, _path: manifest) monkeypatch.setattr(llmd_runtime, "resource_exists", lambda kind, name: False) monkeypatch.setattr( llmd_runtime, @@ -417,9 +380,7 @@ def test_resolve_endpoint_url_requires_gateway_address( monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() - config = llmd_runtime.load_run_configuration( - cwd=tmp_path, artifact_dir=artifact_dir - ) + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) def fake_oc_get_json(kind: str, **_: object) -> dict[str, object]: assert kind == "llminferenceservice" From d97dc9bf108a8255e84978f8b243599834d61481 Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Mon, 20 Apr 2026 10:43:22 +0100 Subject: [PATCH 05/21] refactor: Separate llm_d scheduler profiles --- .../llm_d/orchestration/config.d/platform.yaml | 1 - .../llm_d/orchestration/config.d/runtime.yaml | 1 + .../config.d/scheduler_profiles.yaml | 2 ++ projects/llm_d/orchestration/llmd_runtime.py | 15 ++++++++++++--- .../llm_d/orchestration/presets.d/presets.yaml | 2 ++ .../approximate-prefix-cache.yaml} | 0 tests/llm_d/test_runtime.py | 4 +++- 7 files changed, 20 insertions(+), 5 deletions(-) create mode 100644 projects/llm_d/orchestration/config.d/scheduler_profiles.yaml rename projects/llm_d/orchestration/{manifests/epp-approximate-prefix-cache.yaml => scheduler_profiles/approximate-prefix-cache.yaml} (100%) diff --git a/projects/llm_d/orchestration/config.d/platform.yaml b/projects/llm_d/orchestration/config.d/platform.yaml index 9f3b9e0e..9ef74568 100644 --- a/projects/llm_d/orchestration/config.d/platform.yaml +++ b/projects/llm_d/orchestration/config.d/platform.yaml @@ -68,7 +68,6 @@ gateway: inference_service: name: llm-d template: manifests/llminferenceservice.yaml - epp_config_template: manifests/epp-approximate-prefix-cache.yaml workload_deployment_name_suffix: -kserve pod_appearance_timeout_seconds: 600 ready_timeout_seconds: 1800 diff --git a/projects/llm_d/orchestration/config.d/runtime.yaml b/projects/llm_d/orchestration/config.d/runtime.yaml 
index 982d8fd2..4f1bfb98 100644 --- a/projects/llm_d/orchestration/config.d/runtime.yaml +++ b/projects/llm_d/orchestration/config.d/runtime.yaml @@ -3,5 +3,6 @@ allowed_override_keys: - namespace selected_preset: smoke model_key: qwen3-0-6b +scheduler_profile_key: approximate-prefix-cache smoke_request_key: default benchmark_key: null diff --git a/projects/llm_d/orchestration/config.d/scheduler_profiles.yaml b/projects/llm_d/orchestration/config.d/scheduler_profiles.yaml new file mode 100644 index 00000000..cb579d9b --- /dev/null +++ b/projects/llm_d/orchestration/config.d/scheduler_profiles.yaml @@ -0,0 +1,2 @@ +approximate-prefix-cache: + config_path: scheduler_profiles/approximate-prefix-cache.yaml diff --git a/projects/llm_d/orchestration/llmd_runtime.py b/projects/llm_d/orchestration/llmd_runtime.py index 48c503f3..69206a2d 100644 --- a/projects/llm_d/orchestration/llmd_runtime.py +++ b/projects/llm_d/orchestration/llmd_runtime.py @@ -41,6 +41,8 @@ class ResolvedConfig: platform: dict[str, Any] model_key: str model: dict[str, Any] + scheduler_profile_key: str + scheduler_profile: dict[str, Any] model_cache: dict[str, Any] smoke_request: dict[str, Any] benchmark: dict[str, Any] | None @@ -122,6 +124,11 @@ def load_run_configuration( model_name = config.project.get_config("runtime.model_key") model = copy.deepcopy(config.project.get_config(f"models.{model_name}")) + scheduler_profile_key = config.project.get_config("runtime.scheduler_profile_key") + scheduler_profile = copy.deepcopy( + config.project.get_config(f"scheduler_profiles.{scheduler_profile_key}") + ) + smoke_request_name = config.project.get_config("runtime.smoke_request_key") smoke_request = copy.deepcopy( config.project.get_config(f"workloads.smoke_requests.{smoke_request_name}") @@ -165,6 +172,8 @@ def load_run_configuration( platform=platform_data, model_key=model_name, model=model, + scheduler_profile_key=scheduler_profile_key, + scheduler_profile=scheduler_profile, model_cache=model_cache, smoke_request=smoke_request, benchmark=benchmark, @@ -953,12 +962,12 @@ def render_inference_service(config: ResolvedConfig) -> dict[str, Any]: config.model["resources"] ) - epp_path = config.config_dir / config.platform["inference_service"]["epp_config_template"] - epp_config = epp_path.read_text(encoding="utf-8") + scheduler_profile_path = config.config_dir / config.scheduler_profile["config_path"] + scheduler_profile_config = scheduler_profile_path.read_text(encoding="utf-8") router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0]["args"] if not router_args or router_args[-1] != "--config-text": raise ValueError("Expected llm-d router args to end with --config-text") - router_args.append(epp_config) + router_args.append(scheduler_profile_config) return manifest diff --git a/projects/llm_d/orchestration/presets.d/presets.yaml b/projects/llm_d/orchestration/presets.d/presets.yaml index 37fcc711..9fc1392a 100644 --- a/projects/llm_d/orchestration/presets.d/presets.yaml +++ b/projects/llm_d/orchestration/presets.d/presets.yaml @@ -3,12 +3,14 @@ __multiple: true smoke: runtime.selected_preset: smoke runtime.model_key: qwen3-0-6b + runtime.scheduler_profile_key: approximate-prefix-cache runtime.smoke_request_key: default runtime.benchmark_key: null benchmark-short: runtime.selected_preset: benchmark-short runtime.model_key: llama-3-1-8b-instruct-fp8 + runtime.scheduler_profile_key: approximate-prefix-cache runtime.smoke_request_key: default runtime.benchmark_key: short diff --git 
a/projects/llm_d/orchestration/manifests/epp-approximate-prefix-cache.yaml b/projects/llm_d/orchestration/scheduler_profiles/approximate-prefix-cache.yaml similarity index 100% rename from projects/llm_d/orchestration/manifests/epp-approximate-prefix-cache.yaml rename to projects/llm_d/orchestration/scheduler_profiles/approximate-prefix-cache.yaml diff --git a/tests/llm_d/test_runtime.py b/tests/llm_d/test_runtime.py index 4fe116ee..50f84000 100644 --- a/tests/llm_d/test_runtime.py +++ b/tests/llm_d/test_runtime.py @@ -57,6 +57,7 @@ def test_load_run_configuration_consolidates_config_d( assert "model_cache" in consolidated assert "models" in consolidated assert "runtime" in consolidated + assert "scheduler_profiles" in consolidated assert "workloads" in consolidated assert consolidated["runtime"]["default_preset"] == "smoke" @@ -89,7 +90,7 @@ def test_default_namespace_comes_from_project_config( assert config.namespace_is_managed is False -def test_render_inference_service_injects_model_and_epp( +def test_render_inference_service_injects_model_and_scheduler_profile( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") @@ -105,6 +106,7 @@ def test_render_inference_service_injects_model_and_epp( assert manifest["spec"]["model"]["name"] == "Qwen/Qwen3-0.6B" assert manifest["spec"]["model"]["uri"] == cache_spec.model_uri assert manifest["spec"]["model"]["name"] == config.model["served_model_name"] + assert config.scheduler_profile_key == "approximate-prefix-cache" router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0]["args"] assert router_args[-2] == "--config-text" assert "EndpointPickerConfig" in router_args[-1] From 58abf1157c595190c2ac60c2c6f9627bf9b792d6 Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Mon, 20 Apr 2026 11:35:56 +0100 Subject: [PATCH 06/21] refactor: Rename llm_d capture toolbox --- projects/core/dsl/log.py | 4 ++-- projects/core/dsl/runtime.py | 4 ++-- .../{capture_isvc_state => capture_llmisvc_state}/main.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) rename projects/llm_d/toolbox/{capture_isvc_state => capture_llmisvc_state}/main.py (99%) diff --git a/projects/core/dsl/log.py b/projects/core/dsl/log.py index b5c911de..b93a5076 100644 --- a/projects/core/dsl/log.py +++ b/projects/core/dsl/log.py @@ -117,6 +117,6 @@ def _get_toolbox_function_name(filename): """Extract toolbox function name from file path (parent directory name)""" filename_path = Path(filename) - # For paths like projects/llm_d/toolbox/capture_isvc_state/main.py - # Return the parent directory name: capture_isvc_state + # For paths like projects/llm_d/toolbox/capture_llmisvc_state/main.py + # Return the parent directory name: capture_llmisvc_state return filename_path.parent.name diff --git a/projects/core/dsl/runtime.py b/projects/core/dsl/runtime.py index d1afff31..c8f807db 100644 --- a/projects/core/dsl/runtime.py +++ b/projects/core/dsl/runtime.py @@ -413,6 +413,6 @@ def _get_toolbox_function_name(filename): """Extract toolbox function name from file path (parent directory name)""" filename_path = Path(filename) - # For paths like projects/llm_d/toolbox/capture_isvc_state/main.py - # Return the parent directory name: capture_isvc_state + # For paths like projects/llm_d/toolbox/capture_llmisvc_state/main.py + # Return the parent directory name: capture_llmisvc_state return filename_path.parent.name diff --git a/projects/llm_d/toolbox/capture_isvc_state/main.py 
b/projects/llm_d/toolbox/capture_llmisvc_state/main.py similarity index 99% rename from projects/llm_d/toolbox/capture_isvc_state/main.py rename to projects/llm_d/toolbox/capture_llmisvc_state/main.py index 85d09bc8..1e4577c5 100644 --- a/projects/llm_d/toolbox/capture_isvc_state/main.py +++ b/projects/llm_d/toolbox/capture_llmisvc_state/main.py @@ -2,7 +2,7 @@ """ LLMInferenceService state capture using task-based DSL -Replaces llmd_capture_isvc_state Ansible role +Replaces llmd_capture_llmisvc_state Ansible role """ from projects.core.dsl import execute_tasks, shell, task, toolbox From 6155c42591153927d355a90f4338a387bfff20c7 Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Mon, 20 Apr 2026 14:23:51 +0100 Subject: [PATCH 07/21] fix: Harden llm_d runtime command handling --- projects/llm_d/orchestration/llmd_runtime.py | 68 ++++++++++++---- tests/llm_d/test_runtime.py | 82 ++++++++++++++++++++ 2 files changed, 133 insertions(+), 17 deletions(-) diff --git a/projects/llm_d/orchestration/llmd_runtime.py b/projects/llm_d/orchestration/llmd_runtime.py index 69206a2d..53c270fa 100644 --- a/projects/llm_d/orchestration/llmd_runtime.py +++ b/projects/llm_d/orchestration/llmd_runtime.py @@ -358,16 +358,26 @@ def run_command( check: bool = True, capture_output: bool = True, input_text: str | None = None, + timeout_seconds: float | None = 300, ) -> subprocess.CompletedProcess[str]: cmd = [str(arg) for arg in args] LOGGER.info("run: %s", " ".join(shlex.quote(arg) for arg in cmd)) - result = subprocess.run( - cmd, - check=False, - text=True, - capture_output=capture_output, - input=input_text, - ) + try: + result = subprocess.run( + cmd, + check=False, + text=True, + capture_output=capture_output, + input=input_text, + timeout=timeout_seconds, + ) + except subprocess.TimeoutExpired: + LOGGER.error( + "Command timed out after %ss: %s", + timeout_seconds, + " ".join(shlex.quote(arg) for arg in cmd), + ) + raise if capture_output: if result.stdout: @@ -389,12 +399,14 @@ def oc( check: bool = True, capture_output: bool = True, input_text: str | None = None, + timeout_seconds: float | None = 300, ) -> subprocess.CompletedProcess[str]: return run_command( ["oc", *args], check=check, capture_output=capture_output, input_text=input_text, + timeout_seconds=timeout_seconds, ) @@ -421,21 +433,41 @@ def oc_get_json( args.extend(["-o", "json"]) result = oc(*args, check=not ignore_not_found, capture_output=True) - if ignore_not_found and result.returncode != 0: - return None + if result.returncode != 0: + if ignore_not_found and _is_oc_not_found_error(result.stderr): + return None + raise CommandError( + f"oc {' '.join(shlex.quote(arg) for arg in args)} failed with exit code " + f"{result.returncode}: {result.stderr.strip()}" + ) + if not result.stdout: + raise CommandError(f"oc {' '.join(shlex.quote(arg) for arg in args)} returned no output") return json.loads(result.stdout) def resource_exists(kind: str, name: str, *, namespace: str | None = None) -> bool: - result = oc( - "get", - kind, - name, - *([] if namespace is None else ["-n", namespace]), - check=False, - capture_output=True, + return ( + oc_get_json( + kind, + name=name, + namespace=namespace, + ignore_not_found=True, + ) + is not None ) - return result.returncode == 0 + + +def _is_oc_not_found_error(stderr: str | None) -> bool: + if not stderr: + return False + + normalized = stderr.lower() + if "error from server (notfound)" in normalized: + return True + if "no resources found" in normalized: + return True + + return bool(re.search(r"\bnot 
found\b", normalized)) def wait_until( @@ -455,6 +487,8 @@ def wait_until( return value last_error = None except Exception as exc: # pragma: no cover - exercised in integration paths + if isinstance(exc, RuntimeError): + raise last_error = exc LOGGER.info("waiting for %s: %s", description, exc) time.sleep(interval_seconds) diff --git a/tests/llm_d/test_runtime.py b/tests/llm_d/test_runtime.py index 50f84000..b7639712 100644 --- a/tests/llm_d/test_runtime.py +++ b/tests/llm_d/test_runtime.py @@ -1,5 +1,6 @@ from __future__ import annotations +import subprocess from pathlib import Path import pytest @@ -392,3 +393,84 @@ def fake_oc_get_json(kind: str, **_: object) -> dict[str, object]: with pytest.raises(RuntimeError, match="Gateway address"): test_toolbox.resolve_endpoint_url(config) + + +def test_wait_until_reraises_runtime_error() -> None: + with pytest.raises(RuntimeError, match="terminal failure"): + llmd_runtime.wait_until( + "test condition", + timeout_seconds=1, + interval_seconds=0, + predicate=lambda: (_ for _ in ()).throw(RuntimeError("terminal failure")), + ) + + +def test_oc_forwards_timeout_to_run_command(monkeypatch: pytest.MonkeyPatch) -> None: + captured: dict[str, object] = {} + + def fake_run_command(args, **kwargs): + captured["args"] = list(args) + captured["kwargs"] = kwargs + return subprocess.CompletedProcess(args, 0, stdout="", stderr="") + + monkeypatch.setattr(llmd_runtime, "run_command", fake_run_command) + + llmd_runtime.oc("get", "pods", timeout_seconds=42) + + assert captured["args"] == ["oc", "get", "pods"] + assert captured["kwargs"]["timeout_seconds"] == 42 + + +def test_oc_get_json_returns_none_only_for_not_found( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr( + llmd_runtime, + "oc", + lambda *args, **kwargs: subprocess.CompletedProcess( + args, + 1, + stdout="", + stderr='Error from server (NotFound): llminferenceservices.serving.kserve.io "llm-d" not found', + ), + ) + + payload = llmd_runtime.oc_get_json( + "llminferenceservice", + name="llm-d", + namespace="forge-llm-d", + ignore_not_found=True, + ) + + assert payload is None + + +def test_oc_get_json_raises_for_non_not_found_errors( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr( + llmd_runtime, + "oc", + lambda *args, **kwargs: subprocess.CompletedProcess( + args, + 1, + stdout="", + stderr='Error from server (Forbidden): pods is forbidden: User "alice" cannot list resource "pods"', + ), + ) + + with pytest.raises(llmd_runtime.CommandError, match="Forbidden"): + llmd_runtime.oc_get_json("pods", namespace="forge-llm-d", ignore_not_found=True) + + +def test_resource_exists_propagates_non_not_found_errors( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr( + llmd_runtime, + "oc_get_json", + lambda *args, **kwargs: (_ for _ in ()).throw(llmd_runtime.CommandError("boom")), + ) + + with pytest.raises(llmd_runtime.CommandError, match="boom"): + llmd_runtime.resource_exists("namespace", "forge-llm-d") From a066d986616e1a6427e4d5d5b73593034fbe6c47 Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Tue, 21 Apr 2026 10:32:41 +0100 Subject: [PATCH 08/21] fix: Make NFD prepare idempotent --- projects/llm_d/toolbox/prepare/main.py | 21 +++++++--- tests/llm_d/test_runtime.py | 53 ++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 6 deletions(-) diff --git a/projects/llm_d/toolbox/prepare/main.py b/projects/llm_d/toolbox/prepare/main.py index 621c34d4..831201d5 100644 --- a/projects/llm_d/toolbox/prepare/main.py +++ 
@@ -96,10 +96,19 @@ def prepare_nfd(config: llmd_runtime.ResolvedConfig) -> None:
     )

     manifest = llmd_runtime.load_manifest_template(config, operator_spec["bootstrap_manifest"])
-    llmd_runtime.apply_manifest(
-        config.artifact_dir / "src" / "nfd-nodefeaturediscovery.yaml",
-        manifest,
-    )
+    nfd_name = manifest["metadata"]["name"]
+    nfd_namespace = manifest["metadata"]["namespace"]
+    if llmd_runtime.resource_exists("nodefeaturediscovery", nfd_name, namespace=nfd_namespace):
+        LOGGER.info(
+            "NodeFeatureDiscovery/%s already exists in %s; verifying GPU discovery labels",
+            nfd_name,
+            nfd_namespace,
+        )
+    else:
+        llmd_runtime.apply_manifest(
+            config.artifact_dir / "src" / "nfd-nodefeaturediscovery.yaml",
+            manifest,
+        )

     llmd_runtime.wait_until(
         "NodeFeatureDiscovery bootstrap resource",
@@ -107,8 +116,8 @@ def prepare_nfd(config: llmd_runtime.ResolvedConfig) -> None:
         interval_seconds=10,
         predicate=lambda: llmd_runtime.resource_exists(
             "nodefeaturediscovery",
-            manifest["metadata"]["name"],
-            namespace=manifest["metadata"]["namespace"],
+            nfd_name,
+            namespace=nfd_namespace,
         ),
     )

diff --git a/tests/llm_d/test_runtime.py b/tests/llm_d/test_runtime.py
index b7639712..2c2ab085 100644
--- a/tests/llm_d/test_runtime.py
+++ b/tests/llm_d/test_runtime.py
@@ -357,6 +357,59 @@ def test_prepare_gpu_operator_bootstraps_missing_clusterpolicy(
     assert applied == [artifact_dir / "src" / "gpu-clusterpolicy.yaml"]


+def test_prepare_nfd_skips_existing_nodefeaturediscovery(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}")
+    artifact_dir = tmp_path / "artifacts"
+    artifact_dir.mkdir()
+    config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir)
+
+    calls: list[str] = []
+    manifest = {
+        "apiVersion": "nfd.openshift.io/v1",
+        "kind": "NodeFeatureDiscovery",
+        "metadata": {"name": "nfd-instance", "namespace": "openshift-nfd"},
+    }
+
+    monkeypatch.setattr(
+        prepare_toolbox,
+        "ensure_operator_subscription",
+        lambda operator_spec: calls.append(f"subscription:{operator_spec['package']}"),
+    )
+    monkeypatch.setattr(
+        llmd_runtime,
+        "wait_for_crd",
+        lambda crd_name, *, timeout_seconds: calls.append(f"crd:{crd_name}"),
+    )
+    monkeypatch.setattr(llmd_runtime, "load_manifest_template", lambda _config, _path: manifest)
+    monkeypatch.setattr(llmd_runtime, "resource_exists", lambda *args, **kwargs: True)
+    monkeypatch.setattr(
+        llmd_runtime,
+        "wait_until",
+        lambda *args, **kwargs: calls.append("wait-nfd"),
+    )
+    monkeypatch.setattr(
+        prepare_toolbox,
+        "wait_for_nfd_gpu_labels",
+        lambda _config, *, timeout_seconds: calls.append("wait-labels"),
+    )
+
+    def fail_apply(*_: object, **__: object) -> None:
+        raise AssertionError("existing NodeFeatureDiscovery must not be reapplied")
+
+    monkeypatch.setattr(llmd_runtime, "apply_manifest", fail_apply)
+
+    prepare_toolbox.prepare_nfd(config)
+
+    assert calls == [
+        "subscription:nfd",
+        "crd:nodefeaturediscoveries.nfd.openshift.io",
+        "wait-nfd",
+        "wait-labels",
+    ]
+
+
 def test_gpu_clusterpolicy_manifest_has_required_default_sections() -> None:
     manifest = llmd_runtime.load_yaml(
         llmd_runtime.CONFIG_DIR / "manifests" / "gpu-clusterpolicy.yaml"

From 819b46bc139820200c885c7124632ebd4611bdd7 Mon Sep 17 00:00:00 2001
From: Alberto Perdomo
Date: Tue, 21 Apr 2026 10:33:00 +0100
Subject: [PATCH 09/21] fix: Run llm_d smoke in helper job

---
 .../orchestration/config.d/platform.yaml     |   3 +
 projects/llm_d/orchestration/llmd_runtime.py |  79 ++++++++++++
 projects/llm_d/toolbox/test/main.py          | 115 +++++++++++++-----
 tests/llm_d/test_runtime.py                  |  59 +++++++++
 4 files changed, 225 insertions(+), 31 deletions(-)

diff --git a/projects/llm_d/orchestration/config.d/platform.yaml b/projects/llm_d/orchestration/config.d/platform.yaml
index 9ef74568..43092e7c 100644
--- a/projects/llm_d/orchestration/config.d/platform.yaml
+++ b/projects/llm_d/orchestration/config.d/platform.yaml
@@ -77,6 +77,9 @@ artifacts:
   capture_namespace_events: true

 smoke:
+  job_name: llm-d-smoke
+  client_image: curlimages/curl:8.11.1
   endpoint_path: /v1/completions
   request_retries: 30
   request_retry_delay_seconds: 10
+  request_timeout_seconds: 60
diff --git a/projects/llm_d/orchestration/llmd_runtime.py b/projects/llm_d/orchestration/llmd_runtime.py
index 53c270fa..733b04ec 100644
--- a/projects/llm_d/orchestration/llmd_runtime.py
+++ b/projects/llm_d/orchestration/llmd_runtime.py
@@ -1006,6 +1006,85 @@ def render_inference_service(config: ResolvedConfig) -> dict[str, Any]:
     return manifest


+def render_smoke_request_job(
+    config: ResolvedConfig, endpoint_url: str, payload: dict[str, Any]
+) -> dict[str, Any]:
+    smoke = config.platform["smoke"]
+    command = """
+set -eu
+attempt=1
+while [ "${attempt}" -le "${REQUEST_RETRIES}" ]; do
+  if curl -k -sSf --max-time "${REQUEST_TIMEOUT_SECONDS}" \
+    "${ENDPOINT_URL}${ENDPOINT_PATH}" \
+    -H "Content-Type: application/json" \
+    -d "${REQUEST_PAYLOAD}" \
+    -o /tmp/smoke-response.json \
+    2>/tmp/smoke-error.log; then
+    cat /tmp/smoke-response.json
+    exit 0
+  fi
+  attempt=$((attempt + 1))
+  sleep "${REQUEST_RETRY_DELAY_SECONDS}"
+done
+cat /tmp/smoke-error.log >&2 || true
+exit 1
+"""
+
+    return {
+        "apiVersion": "batch/v1",
+        "kind": "Job",
+        "metadata": {
+            "name": smoke["job_name"],
+            "namespace": config.namespace,
+            "labels": {
+                "app.kubernetes.io/managed-by": "forge",
+                "forge.openshift.io/project": "llm_d",
+                "forge.openshift.io/component": "smoke",
+            },
+        },
+        "spec": {
+            "backoffLimit": 0,
+            "activeDeadlineSeconds": (
+                smoke["request_retries"]
+                * (smoke["request_timeout_seconds"] + smoke["request_retry_delay_seconds"])
+            ),
+            "template": {
+                "metadata": {
+                    "labels": {
+                        "app.kubernetes.io/managed-by": "forge",
+                        "forge.openshift.io/project": "llm_d",
+                        "forge.openshift.io/component": "smoke",
+                    }
+                },
+                "spec": {
+                    "restartPolicy": "Never",
+                    "containers": [
+                        {
+                            "name": "smoke",
+                            "image": smoke["client_image"],
+                            "command": ["/bin/sh", "-ceu", command],
+                            "env": [
+                                {"name": "ENDPOINT_URL", "value": endpoint_url},
+                                {"name": "ENDPOINT_PATH", "value": smoke["endpoint_path"]},
+                                {"name": "REQUEST_PAYLOAD", "value": json.dumps(payload)},
+                                {"name": "REQUEST_RETRIES", "value": str(smoke["request_retries"])},
+                                {
+                                    "name": "REQUEST_RETRY_DELAY_SECONDS",
+                                    "value": str(smoke["request_retry_delay_seconds"]),
+                                },
+                                {
+                                    "name": "REQUEST_TIMEOUT_SECONDS",
+                                    "value": str(smoke["request_timeout_seconds"]),
+                                },
+                            ],
+                        }
+                    ],
+                },
+            },
+        },
+    }
+
+
 def render_guidellm_pvc(config: ResolvedConfig) -> dict[str, Any]:
     if not config.benchmark:
         raise ValueError("Benchmark configuration is not enabled for this preset")
diff --git a/projects/llm_d/toolbox/test/main.py b/projects/llm_d/toolbox/test/main.py
index 0ea05751..9c6242b6 100644
--- a/projects/llm_d/toolbox/test/main.py
+++ b/projects/llm_d/toolbox/test/main.py
@@ -4,7 +4,6 @@

 import json
 import logging
-import time
 from pathlib import Path

 from projects.core.dsl import toolbox
@@ -40,6 +39,16 @@ def run_test(config: llmd_runtime.ResolvedConfig) -> int:
     if endpoint_url:
         llmd_runtime.write_text(artifacts_dir / "endpoint.url", f"{endpoint_url}\n")
     benchmark_name = config.benchmark["job_name"] if config.benchmark else "guidellm-benchmark"
+    smoke_job_name = config.platform["smoke"]["job_name"]
+    llmd_runtime.oc(
+        "delete",
+        "job",
+        smoke_job_name,
+        "-n",
+        namespace,
+        "--ignore-not-found=true",
+        check=False,
+    )
     llmd_runtime.oc(
         "delete",
         "job,pvc",
@@ -160,10 +169,7 @@ def try_resolve_endpoint_url(config: llmd_runtime.ResolvedConfig) -> str | None:


 def run_smoke_request(config: llmd_runtime.ResolvedConfig, endpoint_url: str) -> dict[str, object]:
     namespace = config.namespace
-    name = config.platform["inference_service"]["name"]
-    deployment_name = (
-        f"{name}{config.platform['inference_service']['workload_deployment_name_suffix']}"
-    )
+    job_name = config.platform["smoke"]["job_name"]

     payload = {
@@ -173,35 +179,56 @@
     llmd_runtime.write_json(config.artifact_dir / "artifacts" / "smoke.request.json", payload)

-    retries = config.platform["smoke"]["request_retries"]
-    delay = config.platform["smoke"]["request_retry_delay_seconds"]
-    result = None
-    for _ in range(retries):
-        result = llmd_runtime.oc(
-            "exec",
-            "-n",
+    llmd_runtime.oc(
+        "delete",
+        "job",
+        job_name,
+        "-n",
+        namespace,
+        "--ignore-not-found=true",
+        check=False,
+    )
+    llmd_runtime.wait_until(
+        f"job/{job_name} deletion in {namespace}",
+        timeout_seconds=120,
+        interval_seconds=5,
+        predicate=lambda: not llmd_runtime.resource_exists("job", job_name, namespace=namespace),
+    )
+
+    llmd_runtime.apply_manifest(
+        config.artifact_dir / "src" / "smoke-job.yaml",
+        llmd_runtime.render_smoke_request_job(config, endpoint_url, payload),
+    )
+
+    try:
+        llmd_runtime.wait_for_job_completion(
+            job_name,
             namespace,
-            f"deployment/{deployment_name}",
-            "-c",
-            "main",
-            "--",
-            "curl",
-            "-k",
-            "-sSf",
-            f"{endpoint_url}{config.platform['smoke']['endpoint_path']}",
-            "-H",
-            "Content-Type: application/json",
-            "-d",
-            json.dumps(payload),
-            check=False,
-            capture_output=True,
+            timeout_seconds=(
+                config.platform["smoke"]["request_retries"]
+                * (
+                    config.platform["smoke"]["request_timeout_seconds"]
+                    + config.platform["smoke"]["request_retry_delay_seconds"]
+                )
+            ),
+            interval_seconds=5,
         )
-        if result.returncode == 0:
-            break
-        time.sleep(delay)
+    finally:
+        capture_smoke_state(config)

-    if result is None or result.returncode != 0:
-        raise RuntimeError("Smoke request never succeeded against the llm_d endpoint")
+    result = llmd_runtime.oc(
+        "logs",
+        f"job/{job_name}",
+        "-n",
+        namespace,
+        check=False,
+        capture_output=True,
+    )
+
+    if result.returncode != 0 or not result.stdout:
+        raise RuntimeError(
+            f"Smoke request job {job_name} completed but response logs could not be read: {result.stderr}"
+        )

     response = json.loads(result.stdout)
     if not response.get("choices"):
@@ -209,6 +236,32 @@
     return response


+def capture_smoke_state(config: llmd_runtime.ResolvedConfig) -> None:
+    job_name = config.platform["smoke"]["job_name"]
+    namespace = config.namespace
+    artifacts_dir = config.artifact_dir / "artifacts"
+
+    capture_get("job", job_name, namespace, "yaml", artifacts_dir / "smoke_job.yaml")
+    capture_get(
+        "pods",
+        None,
+        namespace,
+        "yaml",
+        artifacts_dir / "smoke_job.pods.yaml",
+        selector=f"job-name={job_name}",
+    )
+    result = llmd_runtime.oc(
+        "logs",
+        f"job/{job_name}",
+        "-n",
+        namespace,
+        check=False,
+        capture_output=True,
+    )
+    if result.returncode == 0 and result.stdout:
+        llmd_runtime.write_text(artifacts_dir / "smoke_job.logs", result.stdout)
+
+
 def run_guidellm_benchmark(config: llmd_runtime.ResolvedConfig, endpoint_url: str) -> None:
     benchmark_name = config.benchmark["job_name"]
     namespace = config.namespace
diff --git a/tests/llm_d/test_runtime.py b/tests/llm_d/test_runtime.py
index 2c2ab085..4c0c2a34 100644
--- a/tests/llm_d/test_runtime.py
+++ b/tests/llm_d/test_runtime.py
@@ -201,6 +201,28 @@ def test_render_guidellm_job_uses_target_and_rate(
     assert "--rate=1" in container["args"]


+def test_render_smoke_request_job_uses_curl_helper(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}")
+    artifact_dir = tmp_path / "artifacts"
+    artifact_dir.mkdir()
+
+    config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir)
+    payload = {"model": "Qwen/Qwen3-0.6B", "prompt": "test"}
+    manifest = llmd_runtime.render_smoke_request_job(config, "https://example.test", payload)
+
+    container = manifest["spec"]["template"]["spec"]["containers"][0]
+    env = {item["name"]: item["value"] for item in container["env"]}
+
+    assert manifest["kind"] == "Job"
+    assert manifest["metadata"]["name"] == "llm-d-smoke"
+    assert container["image"] == "curlimages/curl:8.11.1"
+    assert env["ENDPOINT_URL"] == "https://example.test"
+    assert env["ENDPOINT_PATH"] == "/v1/completions"
+    assert env["REQUEST_PAYLOAD"] == '{"model": "Qwen/Qwen3-0.6B", "prompt": "test"}'
+
+
 def test_prepare_model_cache_skips_ready_pvc(
     tmp_path: Path, monkeypatch: pytest.MonkeyPatch
 ) -> None:
@@ -448,6 +470,43 @@ def fake_oc_get_json(kind: str, **_: object) -> dict[str, object]:
         test_toolbox.resolve_endpoint_url(config)


+def test_run_smoke_request_uses_helper_job(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}")
+    artifact_dir = tmp_path / "artifacts"
+    artifact_dir.mkdir()
+    config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir)
+    oc_calls: list[tuple[str, ...]] = []
+    applied: list[Path] = []
+
+    def fake_oc(*args, **kwargs):
+        oc_calls.append(tuple(args))
+        if args[:2] == ("logs", "job/llm-d-smoke"):
+            return subprocess.CompletedProcess(
+                args,
+                0,
+                stdout='{"choices":[{"text":"ok"}]}\n',
+                stderr="",
+            )
+        return subprocess.CompletedProcess(args, 0, stdout="", stderr="")
+
+    monkeypatch.setattr(llmd_runtime, "oc", fake_oc)
+    monkeypatch.setattr(llmd_runtime, "resource_exists", lambda *args, **kwargs: False)
+    monkeypatch.setattr(llmd_runtime, "wait_until", lambda *args, **kwargs: True)
+    monkeypatch.setattr(llmd_runtime, "wait_for_job_completion", lambda *args, **kwargs: True)
+    monkeypatch.setattr(
+        llmd_runtime,
+        "apply_manifest",
+        lambda artifact_path, _manifest: applied.append(artifact_path),
+    )
+    monkeypatch.setattr(test_toolbox, "capture_smoke_state", lambda _config: None)
+
+    response = test_toolbox.run_smoke_request(config, "https://example.test")
+
+    assert response["choices"][0]["text"] == "ok"
+    assert applied == [artifact_dir / "src" / "smoke-job.yaml"]
+    assert not any(call and call[0] == "exec" for call in oc_calls)
+
+
 def test_wait_until_reraises_runtime_error() -> None:
     with pytest.raises(RuntimeError, match="terminal failure"):
         llmd_runtime.wait_until(

From 7d4b39bc89f45d1a1941973bcf1d30118778546 Mon Sep 17 00:00:00 2001
From: Alberto Perdomo
Date: Tue, 21 Apr 2026 12:59:17 +0100
Subject: [PATCH 10/21] feat: Add llm_d scheduler profiles

---
 .../llm_d/orchestration/config.d/runtime.yaml |  2 +-
 .../config.d/scheduler_profiles.yaml          |  9 +++-
 projects/llm_d/orchestration/llmd_runtime.py  | 17 ++++++--
 .../orchestration/presets.d/presets.yaml      | 16 ++++++-
 ...ate-prefix-cache.yaml => approximate.yaml} |  0
 .../scheduler_profiles/precise.yaml           | 26 +++++++++++
 tests/llm_d/test_runtime.py                   | 43 ++++++++++++++++++-
 7 files changed, 104 insertions(+), 9 deletions(-)
 rename projects/llm_d/orchestration/scheduler_profiles/{approximate-prefix-cache.yaml => approximate.yaml} (100%)
 create mode 100644 projects/llm_d/orchestration/scheduler_profiles/precise.yaml

diff --git a/projects/llm_d/orchestration/config.d/runtime.yaml b/projects/llm_d/orchestration/config.d/runtime.yaml
index 4f1bfb98..c8715ccb 100644
--- a/projects/llm_d/orchestration/config.d/runtime.yaml
+++ b/projects/llm_d/orchestration/config.d/runtime.yaml
@@ -3,6 +3,6 @@ allowed_override_keys:
   - namespace
 selected_preset: smoke
 model_key: qwen3-0-6b
-scheduler_profile_key: approximate-prefix-cache
+scheduler_profile_key: approximate
 smoke_request_key: default
 benchmark_key: null
diff --git a/projects/llm_d/orchestration/config.d/scheduler_profiles.yaml b/projects/llm_d/orchestration/config.d/scheduler_profiles.yaml
index cb579d9b..b3bca162 100644
--- a/projects/llm_d/orchestration/config.d/scheduler_profiles.yaml
+++ b/projects/llm_d/orchestration/config.d/scheduler_profiles.yaml
@@ -1,2 +1,9 @@
+approximate:
+  config_path: scheduler_profiles/approximate.yaml
+
+precise:
+  config_path: scheduler_profiles/precise.yaml
+
+# Compatibility alias for earlier llm_d presets.
 approximate-prefix-cache:
-  config_path: scheduler_profiles/approximate-prefix-cache.yaml
+  config_path: scheduler_profiles/approximate.yaml
diff --git a/projects/llm_d/orchestration/llmd_runtime.py b/projects/llm_d/orchestration/llmd_runtime.py
index 733b04ec..c35b5a31 100644
--- a/projects/llm_d/orchestration/llmd_runtime.py
+++ b/projects/llm_d/orchestration/llmd_runtime.py
@@ -42,7 +42,7 @@ class ResolvedConfig:
     model_key: str
     model: dict[str, Any]
     scheduler_profile_key: str
-    scheduler_profile: dict[str, Any]
+    scheduler_profile: dict[str, Any] | None
     model_cache: dict[str, Any]
     smoke_request: dict[str, Any]
     benchmark: dict[str, Any] | None
@@ -125,9 +125,11 @@ def load_run_configuration(
     model = copy.deepcopy(config.project.get_config(f"models.{model_name}"))

     scheduler_profile_key = config.project.get_config("runtime.scheduler_profile_key")
-    scheduler_profile = copy.deepcopy(
-        config.project.get_config(f"scheduler_profiles.{scheduler_profile_key}")
-    )
+    scheduler_profile = None
+    if scheduler_profile_key != "default":
+        scheduler_profile = copy.deepcopy(
+            config.project.get_config(f"scheduler_profiles.{scheduler_profile_key}")
+        )

     smoke_request_name = config.project.get_config("runtime.smoke_request_key")
     smoke_request = copy.deepcopy(
@@ -996,6 +998,13 @@ def render_inference_service(config: ResolvedConfig) -> dict[str, Any]:
         config.model["resources"]
     )

+    if config.scheduler_profile_key == "default":
+        manifest["spec"]["router"]["scheduler"] = {}
+        return manifest
+
+    if config.scheduler_profile is None:
+        raise ValueError(f"Missing scheduler profile config for {config.scheduler_profile_key}")
+
     scheduler_profile_path = config.config_dir / config.scheduler_profile["config_path"]
     scheduler_profile_config = scheduler_profile_path.read_text(encoding="utf-8")
     router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0]["args"]
diff --git a/projects/llm_d/orchestration/presets.d/presets.yaml b/projects/llm_d/orchestration/presets.d/presets.yaml
index 9fc1392a..0b3de3a7 100644
--- a/projects/llm_d/orchestration/presets.d/presets.yaml
+++ b/projects/llm_d/orchestration/presets.d/presets.yaml
@@ -3,14 +3,26 @@ __multiple: true

 smoke:
   runtime.selected_preset: smoke
   runtime.model_key: qwen3-0-6b
-  runtime.scheduler_profile_key: approximate-prefix-cache
+  runtime.scheduler_profile_key: approximate
   runtime.smoke_request_key: default
   runtime.benchmark_key: null

+smoke-precise:
+  extends:
+    - smoke
+  runtime.selected_preset: smoke-precise
+  runtime.scheduler_profile_key: precise
+
+smoke-default-scheduler:
+  extends:
+    - smoke
+  runtime.selected_preset: smoke-default-scheduler
+  runtime.scheduler_profile_key: default
+
 benchmark-short:
   runtime.selected_preset: benchmark-short
   runtime.model_key: llama-3-1-8b-instruct-fp8
-  runtime.scheduler_profile_key: approximate-prefix-cache
+  runtime.scheduler_profile_key: approximate
   runtime.smoke_request_key: default
   runtime.benchmark_key: short
diff --git a/projects/llm_d/orchestration/scheduler_profiles/approximate-prefix-cache.yaml b/projects/llm_d/orchestration/scheduler_profiles/approximate.yaml
similarity index 100%
rename from projects/llm_d/orchestration/scheduler_profiles/approximate-prefix-cache.yaml
rename to projects/llm_d/orchestration/scheduler_profiles/approximate.yaml
diff --git a/projects/llm_d/orchestration/scheduler_profiles/precise.yaml b/projects/llm_d/orchestration/scheduler_profiles/precise.yaml
new file mode 100644
index 00000000..707e5e0c
--- /dev/null
+++ b/projects/llm_d/orchestration/scheduler_profiles/precise.yaml
@@ -0,0 +1,26 @@
+apiVersion: inference.networking.x-k8s.io/v1alpha1
+kind: EndpointPickerConfig
+plugins:
+  - type: single-profile-handler
+  - type: precise-prefix-cache-scorer
+    parameters:
+      indexerConfig:
+        tokenProcessorConfig:
+          blockSize: 64
+          hashSeed: "42"
+        tokenizersPoolConfig:
+          hf:
+            tokenizersCacheDir: /tmp/tokenizers
+  - type: kv-cache-utilization-scorer
+  - type: queue-scorer
+  - type: max-score-picker
+schedulingProfiles:
+  - name: default
+    plugins:
+      - pluginRef: precise-prefix-cache-scorer
+        weight: 3.0
+      - pluginRef: kv-cache-utilization-scorer
+        weight: 2.0
+      - pluginRef: queue-scorer
+        weight: 2.0
+      - pluginRef: max-score-picker
diff --git a/tests/llm_d/test_runtime.py b/tests/llm_d/test_runtime.py
index 4c0c2a34..bc19284a 100644
--- a/tests/llm_d/test_runtime.py
+++ b/tests/llm_d/test_runtime.py
@@ -107,10 +107,51 @@ def test_render_inference_service_injects_model_and_scheduler_profile(
     assert manifest["spec"]["model"]["name"] == "Qwen/Qwen3-0.6B"
     assert manifest["spec"]["model"]["uri"] == cache_spec.model_uri
     assert manifest["spec"]["model"]["name"] == config.model["served_model_name"]
-    assert config.scheduler_profile_key == "approximate-prefix-cache"
+    assert config.scheduler_profile_key == "approximate"
     router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0]["args"]
     assert router_args[-2] == "--config-text"
     assert "EndpointPickerConfig" in router_args[-1]
+    assert "prefix-cache-scorer" in router_args[-1]
+
+
+def test_render_inference_service_supports_precise_scheduler_profile(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}")
+    artifact_dir = tmp_path / "artifacts"
+    artifact_dir.mkdir()
+    (tmp_path / "fournos_config.yaml").write_text(
+        "preset: smoke-precise\njob-name: llm-d-precise\n",
+        encoding="utf-8",
+    )
+
+    config = 
llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + manifest = llmd_runtime.render_inference_service(config) + + assert config.scheduler_profile_key == "precise" + router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0]["args"] + assert router_args[-2] == "--config-text" + assert "precise-prefix-cache-scorer" in router_args[-1] + assert "tokenizersCacheDir" in router_args[-1] + + +def test_render_inference_service_supports_default_scheduler( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + (tmp_path / "fournos_config.yaml").write_text( + "preset: smoke-default-scheduler\njob-name: llm-d-default\n", + encoding="utf-8", + ) + + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + manifest = llmd_runtime.render_inference_service(config) + + assert config.scheduler_profile_key == "default" + assert config.scheduler_profile is None + assert manifest["spec"]["router"]["scheduler"] == {} def test_resolve_model_cache_for_hf_model(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: From 332adc33756d05937f220746a995751570f9ffc9 Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Sun, 3 May 2026 10:14:06 +0100 Subject: [PATCH 11/21] refactor: Split llm_d runtime helpers --- projects/llm_d/orchestration/llmd_runtime.py | 769 +++--------------- .../llm_d/orchestration/runtime_config.py | 352 ++++++++ .../llm_d/orchestration/runtime_manifests.py | 327 ++++++++ 3 files changed, 775 insertions(+), 673 deletions(-) create mode 100644 projects/llm_d/orchestration/runtime_config.py create mode 100644 projects/llm_d/orchestration/runtime_manifests.py diff --git a/projects/llm_d/orchestration/llmd_runtime.py b/projects/llm_d/orchestration/llmd_runtime.py index c35b5a31..59b054e6 100644 --- a/projects/llm_d/orchestration/llmd_runtime.py +++ b/projects/llm_d/orchestration/llmd_runtime.py @@ -1,359 +1,114 @@ from __future__ import annotations -import copy -import hashlib import json import logging -import os import re import shlex import subprocess import time from collections.abc import Iterable -from dataclasses import dataclass -from pathlib import Path from typing import Any import yaml -from projects.core.library import config, env, run +from projects.llm_d.orchestration.runtime_config import ( + CONFIG_DIR, + ORCHESTRATION_DIR, + ModelCacheSpec, + ResolvedConfig, + apply_requested_preset, + derive_namespace, + ensure_artifact_directories, + init, + load_fournos_config, + load_run_configuration, + load_yaml, + normalize_gpu_count, + parse_overrides, + resolve_model_cache, + slugify_identifier, + truncate_k8s_name, + version_tuple, + write_json, + write_text, + write_yaml, +) +from projects.llm_d.orchestration.runtime_manifests import ( + load_manifest_template, + render_datasciencecluster, + render_gateway, + render_guidellm_copy_pod, + render_guidellm_job, + render_guidellm_pvc, + render_inference_service, + render_model_cache_pvc, + render_smoke_request_job, +) LOGGER = logging.getLogger(__name__) -ORCHESTRATION_DIR = env.FORGE_HOME / "projects" / "llm_d" / "orchestration" -CONFIG_DIR = ORCHESTRATION_DIR + +__all__ = [ + "CONFIG_DIR", + "ORCHESTRATION_DIR", + "CommandError", + "ModelCacheSpec", + "ResolvedConfig", + "annotate_model_cache_pvc", + "apply_manifest", + "apply_requested_preset", + "condition_status", + "derive_namespace", + "desired_subscription", + 
"ensure_artifact_directories", + "ensure_namespace", + "ensure_operator_group", + "ensure_subscription", + "init", + "job_pod_names", + "load_fournos_config", + "load_manifest_template", + "load_run_configuration", + "load_yaml", + "model_cache_pvc_ready", + "normalize_gpu_count", + "oc", + "oc_get_json", + "operator_spec_by_package", + "parse_overrides", + "pvc_access_mode_matches", + "render_datasciencecluster", + "render_gateway", + "render_guidellm_copy_pod", + "render_guidellm_job", + "render_guidellm_pvc", + "render_inference_service", + "render_model_cache_job", + "render_model_cache_pvc", + "render_smoke_request_job", + "resource_exists", + "resolve_default_serviceaccount_image_pull_secret", + "resolve_model_cache", + "run_command", + "slugify_identifier", + "subscription_spec_matches", + "truncate_k8s_name", + "version_tuple", + "wait_for_crd", + "wait_for_job_completion", + "wait_for_namespace_deleted", + "wait_for_operator_csv", + "wait_for_pvc_bound", + "wait_until", + "write_json", + "write_text", + "write_yaml", +] class CommandError(RuntimeError): """Raised when an external command exits unsuccessfully.""" -@dataclass(frozen=True) -class ResolvedConfig: - artifact_dir: Path - project_root: Path - config_dir: Path - preset_name: str - preset_alias: str | None - job_name: str - namespace: str - namespace_is_managed: bool - gpu_count: int | None - platform: dict[str, Any] - model_key: str - model: dict[str, Any] - scheduler_profile_key: str - scheduler_profile: dict[str, Any] | None - model_cache: dict[str, Any] - smoke_request: dict[str, Any] - benchmark: dict[str, Any] | None - fournos_config: dict[str, Any] - overrides: dict[str, Any] - - @property - def manifests_dir(self) -> Path: - return self.config_dir / "manifests" - - -@dataclass(frozen=True) -class ModelCacheSpec: - source_uri: str - source_scheme: str - cache_key: str - namespace: str - pvc_name: str - pvc_size: str - access_mode: str - storage_class_name: str | None - model_path: str - model_uri: str - marker_filename: str - download_job_name: str - hf_token_secret_name: str | None - hf_token_secret_key: str | None - oci_image_path: str | None - oci_registry_auth_secret_name: str | None - oci_registry_auth_secret_key: str | None - - @property - def marker_path(self) -> str: - return f"/cache/{self.model_path}/{self.marker_filename}" - - -def init() -> Path: - if not logging.getLogger().handlers: - logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") - - env.init() - run.init() - ensure_artifact_directories(env.ARTIFACT_DIR) - return env.ARTIFACT_DIR - - -def ensure_artifact_directories(artifact_dir: Path) -> None: - for relative in ("src", "artifacts", "artifacts/results"): - (artifact_dir / relative).mkdir(parents=True, exist_ok=True) - - -def load_run_configuration( - *, cwd: Path | None = None, artifact_dir: Path | None = None -) -> ResolvedConfig: - cwd = cwd or Path.cwd() - if artifact_dir is not None: - os.environ["ARTIFACT_DIR"] = str(artifact_dir) - artifact_dir = init() - _reinitialize_project_config() - - platform_data = copy.deepcopy(config.project.get_config("platform")) - model_cache = copy.deepcopy(config.project.get_config("model_cache")) - fournos_config = load_fournos_config(cwd) - overrides = parse_overrides( - os.environ.get("FORGE_CONFIG_OVERRIDES", ""), - allowed_keys=config.project.get_config("runtime.allowed_override_keys", []), - ) - - requested_preset = ( - fournos_config.get("preset") - or os.environ.get("FORGE_PRESET") - or 
config.project.get_config("runtime.default_preset") - ) - apply_requested_preset(requested_preset) - - preset_name = config.project.get_config("runtime.selected_preset") - preset_alias = requested_preset if requested_preset != preset_name else None - - model_name = config.project.get_config("runtime.model_key") - model = copy.deepcopy(config.project.get_config(f"models.{model_name}")) - - scheduler_profile_key = config.project.get_config("runtime.scheduler_profile_key") - scheduler_profile = None - if scheduler_profile_key != "default": - scheduler_profile = copy.deepcopy( - config.project.get_config(f"scheduler_profiles.{scheduler_profile_key}") - ) - - smoke_request_name = config.project.get_config("runtime.smoke_request_key") - smoke_request = copy.deepcopy( - config.project.get_config(f"workloads.smoke_requests.{smoke_request_name}") - ) - - benchmark_name = config.project.get_config("runtime.benchmark_key", None) - benchmark = None - if benchmark_name: - benchmark = copy.deepcopy( - config.project.get_config(f"workloads.benchmarks.{benchmark_name}") - ) - - job_name = fournos_config.get("job-name") or os.environ.get("FORGE_JOB_NAME") - if not job_name: - job_name = f"local-{preset_name}" - - namespace_override = overrides.get("namespace") or fournos_config.get("namespace") - default_namespace = platform_data["cluster"].get("namespace_name") - namespace = ( - namespace_override - or default_namespace - or derive_namespace( - job_name, - platform_data["cluster"]["namespace_prefix"], - platform_data["cluster"]["namespace_max_length"], - ) - ) - - gpu_count = normalize_gpu_count(fournos_config.get("gpu-count")) - - return ResolvedConfig( - artifact_dir=Path(artifact_dir), - project_root=env.FORGE_HOME, - config_dir=ORCHESTRATION_DIR, - preset_name=preset_name, - preset_alias=preset_alias, - job_name=job_name, - namespace=namespace, - namespace_is_managed=namespace_override is None and default_namespace is None, - gpu_count=gpu_count, - platform=platform_data, - model_key=model_name, - model=model, - scheduler_profile_key=scheduler_profile_key, - scheduler_profile=scheduler_profile, - model_cache=model_cache, - smoke_request=smoke_request, - benchmark=benchmark, - fournos_config=fournos_config, - overrides=overrides, - ) - - -def _reinitialize_project_config() -> None: - config.project = None - artifact_config = env.ARTIFACT_DIR / "config.yaml" - if artifact_config.exists(): - artifact_config.unlink() - - presets_applied = env.ARTIFACT_DIR / "presets_applied" - if presets_applied.exists(): - presets_applied.unlink() - - config.init(ORCHESTRATION_DIR) - - -def apply_requested_preset(requested_preset: str) -> None: - if not config.project.get_preset(requested_preset): - raise ValueError(f"Unknown llm_d preset: {requested_preset}") - - config.project.apply_preset(requested_preset) - - -def load_fournos_config(cwd: Path) -> dict[str, Any]: - config_path = cwd / "fournos_config.yaml" - if not config_path.exists(): - return {} - - data = load_yaml(config_path) - if data is None: - return {} - if not isinstance(data, dict): - raise ValueError(f"Unexpected FOURNOS config type in {config_path}: {type(data)}") - return data - - -def parse_overrides(raw: str, *, allowed_keys: Iterable[str]) -> dict[str, Any]: - if not raw or raw.strip() in {"", "null", "{}"}: - return {} - - try: - data = json.loads(raw) - except json.JSONDecodeError as exc: - raise ValueError(f"FORGE_CONFIG_OVERRIDES is not valid JSON: {exc}") from exc - - if not isinstance(data, dict): - raise ValueError("FORGE_CONFIG_OVERRIDES 
must decode to a JSON object") - - allowed_keys = frozenset(allowed_keys) - unsupported = sorted(set(data) - allowed_keys) - if unsupported: - raise ValueError( - "Unsupported llm_d override keys: " - f"{', '.join(unsupported)}. Allowed keys: {', '.join(sorted(allowed_keys))}" - ) - - return data - - -def normalize_gpu_count(value: Any) -> int | None: - if value in (None, ""): - return None - try: - return int(value) - except (TypeError, ValueError): - LOGGER.warning("Ignoring invalid gpu-count value: %s", value) - return None - - -def derive_namespace(job_name: str, prefix: str, max_length: int) -> str: - slug = re.sub(r"[^a-z0-9-]+", "-", job_name.lower()) - slug = re.sub(r"-{2,}", "-", slug).strip("-") - if not slug: - slug = "run" - - if slug.startswith(f"{prefix}-"): - namespace = slug - else: - namespace = f"{prefix}-{slug}" - - namespace = namespace[:max_length].rstrip("-") - if not namespace: - raise ValueError(f"Could not derive a valid namespace from job name: {job_name}") - return namespace - - -def slugify_identifier(value: str, *, max_length: int = 63) -> str: - slug = re.sub(r"[^a-z0-9-]+", "-", value.lower()) - slug = re.sub(r"-{2,}", "-", slug).strip("-") - return slug[:max_length].rstrip("-") or "item" - - -def truncate_k8s_name(value: str, *, max_length: int = 63) -> str: - return value[:max_length].rstrip("-") - - -def resolve_model_cache(config: ResolvedConfig) -> ModelCacheSpec | None: - if not config.model_cache.get("enabled", False): - return None - - source_uri = config.model["uri"] - if source_uri.startswith(("pvc://", "pvc+hf://")): - return None - - if source_uri.startswith("hf://"): - source_scheme = "hf" - elif source_uri.startswith("oci://"): - source_scheme = "oci" - else: - raise ValueError(f"Unsupported model cache source URI for {config.model_key}: {source_uri}") - - model_cache_overrides = config.model.get("cache", {}) - pvc_defaults = config.model_cache["pvc"] - pvc_prefix = config.model_cache["pvc"]["name_prefix"] - cache_key = hashlib.sha256(source_uri.encode("utf-8")).hexdigest()[:10] - pvc_name = truncate_k8s_name( - f"{pvc_prefix}-{slugify_identifier(config.model_key, max_length=32)}-{cache_key}" - ) - model_path = pvc_defaults["model_directory_name"] - - return ModelCacheSpec( - source_uri=source_uri, - source_scheme=source_scheme, - cache_key=cache_key, - namespace=config.namespace, - pvc_name=pvc_name, - pvc_size=model_cache_overrides.get("pvc_size", pvc_defaults["size"]), - access_mode=model_cache_overrides.get("access_mode", pvc_defaults["access_mode"]), - storage_class_name=model_cache_overrides.get( - "storage_class_name", pvc_defaults.get("storage_class_name") - ), - model_path=model_path, - model_uri=f"pvc://{pvc_name}/{model_path}", - marker_filename=config.model_cache["marker_filename"], - download_job_name=truncate_k8s_name(f"{pvc_name}-download"), - hf_token_secret_name=model_cache_overrides.get( - "hf_token_secret_name", config.model_cache["hf"].get("token_secret_name") - ), - hf_token_secret_key=config.model_cache["hf"].get("token_secret_key"), - oci_image_path=model_cache_overrides.get( - "oci_image_path", config.model_cache["oci"].get("image_path") - ), - oci_registry_auth_secret_name=model_cache_overrides.get( - "oci_registry_auth_secret_name", - config.model_cache["oci"].get("registry_auth_secret_name"), - ), - oci_registry_auth_secret_key=config.model_cache["oci"].get("registry_auth_secret_key"), - ) - - -def load_yaml(path: Path) -> Any: - with path.open(encoding="utf-8") as handle: - return yaml.safe_load(handle) - - -def 
write_yaml(path: Path, payload: Any) -> None: - path.parent.mkdir(parents=True, exist_ok=True) - with path.open("w", encoding="utf-8") as handle: - yaml.safe_dump(payload, handle, sort_keys=False) - - -def write_json(path: Path, payload: Any) -> None: - path.parent.mkdir(parents=True, exist_ok=True) - with path.open("w", encoding="utf-8") as handle: - json.dump(payload, handle, indent=2, sort_keys=True) - handle.write("\n") - - -def write_text(path: Path, content: str) -> None: - path.parent.mkdir(parents=True, exist_ok=True) - path.write_text(content, encoding="utf-8") - - def run_command( args: Iterable[str], *, @@ -412,7 +167,7 @@ def oc( ) -def apply_manifest(artifact_path: Path, manifest: dict[str, Any]) -> None: +def apply_manifest(artifact_path: Any, manifest: dict[str, Any]) -> None: write_yaml(artifact_path, manifest) oc("apply", "-f", str(artifact_path)) @@ -501,14 +256,11 @@ def wait_until( def wait_for_namespace_deleted(namespace: str, timeout_seconds: int) -> None: - def _namespace_gone() -> bool: - return not resource_exists("namespace", namespace) - wait_until( f"namespace/{namespace} deletion", timeout_seconds=timeout_seconds, interval_seconds=10, - predicate=_namespace_gone, + predicate=lambda: not resource_exists("namespace", namespace), ) @@ -549,8 +301,7 @@ def ensure_namespace(namespace: str, *, labels: dict[str, str] | None = None) -> oc("create", "namespace", namespace) if labels: - label_args = [f"{key}={value}" for key, value in labels.items()] - oc("label", "namespace", namespace, "--overwrite", *label_args) + oc("label", "namespace", namespace, "--overwrite", *[f"{k}={v}" for k, v in labels.items()]) def ensure_operator_group(namespace: str, package: str) -> None: @@ -639,18 +390,8 @@ def operator_spec_by_package(platform: dict[str, Any], package: str) -> dict[str raise KeyError(f"Unknown operator package in llm_d platform config: {package}") -def load_manifest_template(config: ResolvedConfig, relative_path: str) -> dict[str, Any]: - return load_yaml(config.config_dir / relative_path) - - -def version_tuple(value: str) -> tuple[int, ...]: - numbers = re.findall(r"\d+", value) - return tuple(int(number) for number in numbers[:3]) - - def condition_status(resource: dict[str, Any], condition_type: str) -> str | None: - conditions = resource.get("status", {}).get("conditions", []) - for condition in conditions: + for condition in resource.get("status", {}).get("conditions", []): if condition.get("type") == condition_type: return condition.get("status") return None @@ -741,51 +482,6 @@ def resolve_default_serviceaccount_image_pull_secret(namespace: str) -> str | No return None -def render_datasciencecluster(config: ResolvedConfig) -> dict[str, Any]: - template_path = config.config_dir / config.platform["rhoai"]["datasciencecluster_template"] - manifest = load_yaml(template_path) - manifest["metadata"]["name"] = config.platform["rhoai"]["datasciencecluster_name"] - manifest["metadata"]["namespace"] = config.platform["rhoai"]["namespace"] - return manifest - - -def render_gateway(config: ResolvedConfig) -> dict[str, Any]: - template_path = config.config_dir / config.platform["gateway"]["manifest_template"] - manifest = load_yaml(template_path) - manifest["metadata"]["name"] = config.platform["gateway"]["name"] - manifest["metadata"]["namespace"] = config.platform["gateway"]["namespace"] - manifest["spec"]["gatewayClassName"] = config.platform["gateway"]["gateway_class_name"] - return manifest - - -def render_model_cache_pvc(spec: ModelCacheSpec) -> dict[str, Any]: - 
manifest: dict[str, Any] = { - "apiVersion": "v1", - "kind": "PersistentVolumeClaim", - "metadata": { - "name": spec.pvc_name, - "namespace": spec.namespace, - "labels": { - "app.kubernetes.io/managed-by": "forge", - "forge.openshift.io/project": "llm_d", - "forge.openshift.io/model-cache": "true", - "forge.openshift.io/preserve": "true", - }, - "annotations": { - "forge.openshift.io/model-cache-key": spec.cache_key, - "forge.openshift.io/model-source-uri": spec.source_uri, - }, - }, - "spec": { - "accessModes": [spec.access_mode], - "resources": {"requests": {"storage": spec.pvc_size}}, - }, - } - if spec.storage_class_name: - manifest["spec"]["storageClassName"] = spec.storage_class_name - return manifest - - def render_model_cache_job(config: ResolvedConfig, spec: ModelCacheSpec) -> dict[str, Any]: common_env = [ {"name": "MODEL_SOURCE", "value": spec.source_uri}, @@ -797,7 +493,6 @@ def render_model_cache_job(config: ResolvedConfig, spec: ModelCacheSpec) -> dict {"name": "cache", "persistentVolumeClaim": {"claimName": spec.pvc_name}} ] - container: dict[str, Any] if spec.source_scheme == "hf": command = """ set -euo pipefail @@ -828,10 +523,7 @@ def render_model_cache_job(config: ResolvedConfig, spec: ModelCacheSpec) -> dict volume_mounts = [{"name": "cache", "mountPath": "/cache"}] if spec.hf_token_secret_name: volumes.append( - { - "name": "hf-token", - "secret": {"secretName": spec.hf_token_secret_name}, - } + {"name": "hf-token", "secret": {"secretName": spec.hf_token_secret_name}} ) volume_mounts.append( { @@ -880,10 +572,7 @@ def render_model_cache_job(config: ResolvedConfig, spec: ModelCacheSpec) -> dict common_env.append({"name": "OCI_IMAGE_PATH", "value": spec.oci_image_path or "/"}) if registry_auth_secret_name: volumes.append( - { - "name": "registry-auth", - "secret": {"secretName": registry_auth_secret_name}, - } + {"name": "registry-auth", "secret": {"secretName": registry_auth_secret_name}} ) volume_mounts.append( { @@ -974,269 +663,3 @@ def annotate_model_cache_pvc(spec: ModelCacheSpec) -> None: f"forge.openshift.io/model-source-uri={spec.source_uri}", f"forge.openshift.io/model-uri={spec.model_uri}", ) - - -def render_inference_service(config: ResolvedConfig) -> dict[str, Any]: - template_path = config.config_dir / config.platform["inference_service"]["template"] - manifest = load_yaml(template_path) - - name = config.platform["inference_service"]["name"] - manifest["metadata"]["name"] = name - manifest["metadata"]["namespace"] = config.namespace - manifest["metadata"].setdefault("labels", {}) - manifest["metadata"]["labels"].update( - { - "app.kubernetes.io/managed-by": "forge", - "forge.openshift.io/project": "llm_d", - } - ) - - cache_spec = resolve_model_cache(config) - manifest["spec"]["model"]["uri"] = cache_spec.model_uri if cache_spec else config.model["uri"] - manifest["spec"]["model"]["name"] = config.model["served_model_name"] - manifest["spec"]["template"]["containers"][0]["resources"] = copy.deepcopy( - config.model["resources"] - ) - - if config.scheduler_profile_key == "default": - manifest["spec"]["router"]["scheduler"] = {} - return manifest - - if config.scheduler_profile is None: - raise ValueError(f"Missing scheduler profile config for {config.scheduler_profile_key}") - - scheduler_profile_path = config.config_dir / config.scheduler_profile["config_path"] - scheduler_profile_config = scheduler_profile_path.read_text(encoding="utf-8") - router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0]["args"] - if not router_args or 
router_args[-1] != "--config-text": - raise ValueError("Expected llm-d router args to end with --config-text") - router_args.append(scheduler_profile_config) - - return manifest - - -def render_smoke_request_job( - config: ResolvedConfig, endpoint_url: str, payload: dict[str, Any] -) -> dict[str, Any]: - smoke = config.platform["smoke"] - command = """ -set -eu -attempt=1 -while [ "${attempt}" -le "${REQUEST_RETRIES}" ]; do - if curl -k -sSf --max-time "${REQUEST_TIMEOUT_SECONDS}" \ - "${ENDPOINT_URL}${ENDPOINT_PATH}" \ - -H "Content-Type: application/json" \ - -d "${REQUEST_PAYLOAD}" \ - -o /tmp/smoke-response.json \ - 2>/tmp/smoke-error.log; then - cat /tmp/smoke-response.json - exit 0 - fi - attempt=$((attempt + 1)) - sleep "${REQUEST_RETRY_DELAY_SECONDS}" -done -cat /tmp/smoke-error.log >&2 || true -exit 1 -""" - - return { - "apiVersion": "batch/v1", - "kind": "Job", - "metadata": { - "name": smoke["job_name"], - "namespace": config.namespace, - "labels": { - "app.kubernetes.io/managed-by": "forge", - "forge.openshift.io/project": "llm_d", - "forge.openshift.io/component": "smoke", - }, - }, - "spec": { - "backoffLimit": 0, - "activeDeadlineSeconds": ( - smoke["request_retries"] - * (smoke["request_timeout_seconds"] + smoke["request_retry_delay_seconds"]) - ), - "template": { - "metadata": { - "labels": { - "app.kubernetes.io/managed-by": "forge", - "forge.openshift.io/project": "llm_d", - "forge.openshift.io/component": "smoke", - } - }, - "spec": { - "restartPolicy": "Never", - "containers": [ - { - "name": "smoke", - "image": smoke["client_image"], - "command": ["/bin/sh", "-ceu", command], - "env": [ - {"name": "ENDPOINT_URL", "value": endpoint_url}, - {"name": "ENDPOINT_PATH", "value": smoke["endpoint_path"]}, - {"name": "REQUEST_PAYLOAD", "value": json.dumps(payload)}, - {"name": "REQUEST_RETRIES", "value": str(smoke["request_retries"])}, - { - "name": "REQUEST_RETRY_DELAY_SECONDS", - "value": str(smoke["request_retry_delay_seconds"]), - }, - { - "name": "REQUEST_TIMEOUT_SECONDS", - "value": str(smoke["request_timeout_seconds"]), - }, - ], - } - ], - }, - }, - }, - } - - -def render_guidellm_pvc(config: ResolvedConfig) -> dict[str, Any]: - if not config.benchmark: - raise ValueError("Benchmark configuration is not enabled for this preset") - - return { - "apiVersion": "v1", - "kind": "PersistentVolumeClaim", - "metadata": { - "name": config.benchmark["job_name"], - "namespace": config.namespace, - "labels": { - "app.kubernetes.io/managed-by": "forge", - "forge.openshift.io/project": "llm_d", - }, - }, - "spec": { - "accessModes": ["ReadWriteOnce"], - "resources": {"requests": {"storage": config.benchmark["pvc_size"]}}, - }, - } - - -def render_guidellm_job(config: ResolvedConfig, endpoint_url: str) -> dict[str, Any]: - if not config.benchmark: - raise ValueError("Benchmark configuration is not enabled for this preset") - - args = [ - "benchmark", - "run", - f"--target={endpoint_url}", - f"--rate={config.benchmark['rate']}", - ] - for key, value in config.benchmark["args"].items(): - if value is None: - continue - args.append(f"--{key.replace('_', '-')}={value}") - args.append("--outputs=json") - - return { - "apiVersion": "batch/v1", - "kind": "Job", - "metadata": { - "name": config.benchmark["job_name"], - "namespace": config.namespace, - "labels": { - "app.kubernetes.io/managed-by": "forge", - "forge.openshift.io/project": "llm_d", - }, - }, - "spec": { - "backoffLimit": 0, - "template": { - "metadata": { - "labels": { - "app.kubernetes.io/managed-by": "forge", - 
"forge.openshift.io/project": "llm_d", - } - }, - "spec": { - "serviceAccountName": "default", - "restartPolicy": "Never", - "containers": [ - { - "name": "guidellm", - "image": config.benchmark["image"], - "command": ["/opt/app-root/bin/guidellm"], - "args": args, - "env": [{"name": "USER", "value": "guidellm"}], - "volumeMounts": [ - {"name": "home", "mountPath": "/home/guidellm"}, - {"name": "results", "mountPath": "/results"}, - ], - } - ], - "volumes": [ - {"name": "home", "emptyDir": {}}, - { - "name": "results", - "persistentVolumeClaim": {"claimName": config.benchmark["job_name"]}, - }, - ], - }, - }, - }, - } - - -def render_guidellm_copy_pod( - config: ResolvedConfig, node_name: str | None = None -) -> dict[str, Any]: - if not config.benchmark: - raise ValueError("Benchmark configuration is not enabled for this preset") - - pod = { - "apiVersion": "v1", - "kind": "Pod", - "metadata": { - "name": f"{config.benchmark['job_name']}-copy", - "namespace": config.namespace, - "labels": { - "app.kubernetes.io/managed-by": "forge", - "forge.openshift.io/project": "llm_d", - }, - }, - "spec": { - "restartPolicy": "Never", - "initContainers": [ - { - "name": "permission-fixer", - "image": config.benchmark["image"], - "command": [ - "/bin/sh", - "-c", - "chmod 755 /results && chown -R 1001:1001 /results || true", - ], - "securityContext": { - "runAsUser": 0, - "allowPrivilegeEscalation": True, - }, - "volumeMounts": [{"name": "results", "mountPath": "/results"}], - } - ], - "containers": [ - { - "name": "copy-helper", - "image": config.benchmark["image"], - "command": ["/bin/sleep", "300"], - "securityContext": { - "runAsUser": 1001, - "runAsNonRoot": True, - "allowPrivilegeEscalation": False, - }, - "volumeMounts": [{"name": "results", "mountPath": "/results"}], - } - ], - "volumes": [ - { - "name": "results", - "persistentVolumeClaim": {"claimName": config.benchmark["job_name"]}, - } - ], - }, - } - if node_name: - pod["spec"]["nodeName"] = node_name - return pod diff --git a/projects/llm_d/orchestration/runtime_config.py b/projects/llm_d/orchestration/runtime_config.py new file mode 100644 index 00000000..42b5fcb1 --- /dev/null +++ b/projects/llm_d/orchestration/runtime_config.py @@ -0,0 +1,352 @@ +from __future__ import annotations + +import copy +import hashlib +import json +import logging +import os +import re +from collections.abc import Iterable +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +import yaml + +from projects.core.library import config, env, run + +LOGGER = logging.getLogger(__name__) +ORCHESTRATION_DIR = env.FORGE_HOME / "projects" / "llm_d" / "orchestration" +CONFIG_DIR = ORCHESTRATION_DIR + + +@dataclass(frozen=True) +class ResolvedConfig: + artifact_dir: Path + project_root: Path + config_dir: Path + preset_name: str + preset_alias: str | None + job_name: str + namespace: str + namespace_is_managed: bool + gpu_count: int | None + platform: dict[str, Any] + model_key: str + model: dict[str, Any] + scheduler_profile_key: str + scheduler_profile: dict[str, Any] | None + model_cache: dict[str, Any] + smoke_request: dict[str, Any] + benchmark: dict[str, Any] | None + fournos_config: dict[str, Any] + overrides: dict[str, Any] + + @property + def manifests_dir(self) -> Path: + return self.config_dir / "manifests" + + +@dataclass(frozen=True) +class ModelCacheSpec: + source_uri: str + source_scheme: str + cache_key: str + namespace: str + pvc_name: str + pvc_size: str + access_mode: str + storage_class_name: str | None + model_path: str 
+ model_uri: str + marker_filename: str + download_job_name: str + hf_token_secret_name: str | None + hf_token_secret_key: str | None + oci_image_path: str | None + oci_registry_auth_secret_name: str | None + oci_registry_auth_secret_key: str | None + + @property + def marker_path(self) -> str: + return f"/cache/{self.model_path}/{self.marker_filename}" + + +def init() -> Path: + if not logging.getLogger().handlers: + logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + + env.init() + run.init() + ensure_artifact_directories(env.ARTIFACT_DIR) + return env.ARTIFACT_DIR + + +def ensure_artifact_directories(artifact_dir: Path) -> None: + for relative in ("src", "artifacts", "artifacts/results"): + (artifact_dir / relative).mkdir(parents=True, exist_ok=True) + + +def load_run_configuration( + *, cwd: Path | None = None, artifact_dir: Path | None = None +) -> ResolvedConfig: + cwd = cwd or Path.cwd() + if artifact_dir is not None: + os.environ["ARTIFACT_DIR"] = str(artifact_dir) + artifact_dir = init() + _reinitialize_project_config() + + platform_data = copy.deepcopy(config.project.get_config("platform")) + model_cache = copy.deepcopy(config.project.get_config("model_cache")) + fournos_config = load_fournos_config(cwd) + overrides = parse_overrides( + os.environ.get("FORGE_CONFIG_OVERRIDES", ""), + allowed_keys=config.project.get_config("runtime.allowed_override_keys", []), + ) + + requested_preset = ( + fournos_config.get("preset") + or os.environ.get("FORGE_PRESET") + or config.project.get_config("runtime.default_preset") + ) + apply_requested_preset(requested_preset) + + preset_name = config.project.get_config("runtime.selected_preset") + preset_alias = requested_preset if requested_preset != preset_name else None + + model_name = config.project.get_config("runtime.model_key") + model = copy.deepcopy(config.project.get_config(f"models.{model_name}")) + + scheduler_profile_key = config.project.get_config("runtime.scheduler_profile_key") + scheduler_profile = None + if scheduler_profile_key != "default": + scheduler_profile = copy.deepcopy( + config.project.get_config(f"scheduler_profiles.{scheduler_profile_key}") + ) + + smoke_request_name = config.project.get_config("runtime.smoke_request_key") + smoke_request = copy.deepcopy( + config.project.get_config(f"workloads.smoke_requests.{smoke_request_name}") + ) + + benchmark_name = config.project.get_config("runtime.benchmark_key", None) + benchmark = None + if benchmark_name: + benchmark = copy.deepcopy( + config.project.get_config(f"workloads.benchmarks.{benchmark_name}") + ) + + job_name = fournos_config.get("job-name") or os.environ.get("FORGE_JOB_NAME") + if not job_name: + job_name = f"local-{preset_name}" + + namespace_override = overrides.get("namespace") or fournos_config.get("namespace") + default_namespace = platform_data["cluster"].get("namespace_name") + namespace = ( + namespace_override + or default_namespace + or derive_namespace( + job_name, + platform_data["cluster"]["namespace_prefix"], + platform_data["cluster"]["namespace_max_length"], + ) + ) + + gpu_count = normalize_gpu_count(fournos_config.get("gpu-count")) + + return ResolvedConfig( + artifact_dir=Path(artifact_dir), + project_root=env.FORGE_HOME, + config_dir=ORCHESTRATION_DIR, + preset_name=preset_name, + preset_alias=preset_alias, + job_name=job_name, + namespace=namespace, + namespace_is_managed=namespace_override is None and default_namespace is None, + gpu_count=gpu_count, + platform=platform_data, + model_key=model_name, + model=model, 
+ scheduler_profile_key=scheduler_profile_key, + scheduler_profile=scheduler_profile, + model_cache=model_cache, + smoke_request=smoke_request, + benchmark=benchmark, + fournos_config=fournos_config, + overrides=overrides, + ) + + +def _reinitialize_project_config() -> None: + config.project = None + artifact_config = env.ARTIFACT_DIR / "config.yaml" + if artifact_config.exists(): + artifact_config.unlink() + + presets_applied = env.ARTIFACT_DIR / "presets_applied" + if presets_applied.exists(): + presets_applied.unlink() + + config.init(ORCHESTRATION_DIR) + + +def apply_requested_preset(requested_preset: str) -> None: + if not config.project.get_preset(requested_preset): + raise ValueError(f"Unknown llm_d preset: {requested_preset}") + + config.project.apply_preset(requested_preset) + + +def load_fournos_config(cwd: Path) -> dict[str, Any]: + config_path = cwd / "fournos_config.yaml" + if not config_path.exists(): + return {} + + data = load_yaml(config_path) + if data is None: + return {} + if not isinstance(data, dict): + raise ValueError(f"Unexpected FOURNOS config type in {config_path}: {type(data)}") + return data + + +def parse_overrides(raw: str, *, allowed_keys: Iterable[str]) -> dict[str, Any]: + if not raw or raw.strip() in {"", "null", "{}"}: + return {} + + try: + data = json.loads(raw) + except json.JSONDecodeError as exc: + raise ValueError(f"FORGE_CONFIG_OVERRIDES is not valid JSON: {exc}") from exc + + if not isinstance(data, dict): + raise ValueError("FORGE_CONFIG_OVERRIDES must decode to a JSON object") + + allowed_keys = frozenset(allowed_keys) + unsupported = sorted(set(data) - allowed_keys) + if unsupported: + raise ValueError( + "Unsupported llm_d override keys: " + f"{', '.join(unsupported)}. Allowed keys: {', '.join(sorted(allowed_keys))}" + ) + + return data + + +def normalize_gpu_count(value: Any) -> int | None: + if value in (None, ""): + return None + try: + return int(value) + except (TypeError, ValueError): + LOGGER.warning("Ignoring invalid gpu-count value: %s", value) + return None + + +def derive_namespace(job_name: str, prefix: str, max_length: int) -> str: + slug = re.sub(r"[^a-z0-9-]+", "-", job_name.lower()) + slug = re.sub(r"-{2,}", "-", slug).strip("-") + if not slug: + slug = "run" + + if slug.startswith(f"{prefix}-"): + namespace = slug + else: + namespace = f"{prefix}-{slug}" + + namespace = namespace[:max_length].rstrip("-") + if not namespace: + raise ValueError(f"Could not derive a valid namespace from job name: {job_name}") + return namespace + + +def slugify_identifier(value: str, *, max_length: int = 63) -> str: + slug = re.sub(r"[^a-z0-9-]+", "-", value.lower()) + slug = re.sub(r"-{2,}", "-", slug).strip("-") + return slug[:max_length].rstrip("-") or "item" + + +def truncate_k8s_name(value: str, *, max_length: int = 63) -> str: + return value[:max_length].rstrip("-") + + +def version_tuple(value: str) -> tuple[int, ...]: + numbers = re.findall(r"\d+", value) + return tuple(int(number) for number in numbers[:3]) + + +def resolve_model_cache(config: ResolvedConfig) -> ModelCacheSpec | None: + if not config.model_cache.get("enabled", False): + return None + + source_uri = config.model["uri"] + if source_uri.startswith(("pvc://", "pvc+hf://")): + return None + + if source_uri.startswith("hf://"): + source_scheme = "hf" + elif source_uri.startswith("oci://"): + source_scheme = "oci" + else: + raise ValueError(f"Unsupported model cache source URI for {config.model_key}: {source_uri}") + + model_cache_overrides = config.model.get("cache", {}) + 
pvc_defaults = config.model_cache["pvc"] + pvc_prefix = config.model_cache["pvc"]["name_prefix"] + cache_key = hashlib.sha256(source_uri.encode("utf-8")).hexdigest()[:10] + pvc_name = truncate_k8s_name( + f"{pvc_prefix}-{slugify_identifier(config.model_key, max_length=32)}-{cache_key}" + ) + model_path = pvc_defaults["model_directory_name"] + + return ModelCacheSpec( + source_uri=source_uri, + source_scheme=source_scheme, + cache_key=cache_key, + namespace=config.namespace, + pvc_name=pvc_name, + pvc_size=model_cache_overrides.get("pvc_size", pvc_defaults["size"]), + access_mode=model_cache_overrides.get("access_mode", pvc_defaults["access_mode"]), + storage_class_name=model_cache_overrides.get( + "storage_class_name", pvc_defaults.get("storage_class_name") + ), + model_path=model_path, + model_uri=f"pvc://{pvc_name}/{model_path}", + marker_filename=config.model_cache["marker_filename"], + download_job_name=truncate_k8s_name(f"{pvc_name}-download"), + hf_token_secret_name=model_cache_overrides.get( + "hf_token_secret_name", config.model_cache["hf"].get("token_secret_name") + ), + hf_token_secret_key=config.model_cache["hf"].get("token_secret_key"), + oci_image_path=model_cache_overrides.get( + "oci_image_path", config.model_cache["oci"].get("image_path") + ), + oci_registry_auth_secret_name=model_cache_overrides.get( + "oci_registry_auth_secret_name", + config.model_cache["oci"].get("registry_auth_secret_name"), + ), + oci_registry_auth_secret_key=config.model_cache["oci"].get("registry_auth_secret_key"), + ) + + +def load_yaml(path: Path) -> Any: + with path.open(encoding="utf-8") as handle: + return yaml.safe_load(handle) + + +def write_yaml(path: Path, payload: Any) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding="utf-8") as handle: + yaml.safe_dump(payload, handle, sort_keys=False) + + +def write_json(path: Path, payload: Any) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding="utf-8") as handle: + json.dump(payload, handle, indent=2, sort_keys=True) + handle.write("\n") + + +def write_text(path: Path, content: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(content, encoding="utf-8") diff --git a/projects/llm_d/orchestration/runtime_manifests.py b/projects/llm_d/orchestration/runtime_manifests.py new file mode 100644 index 00000000..0c72a88e --- /dev/null +++ b/projects/llm_d/orchestration/runtime_manifests.py @@ -0,0 +1,327 @@ +from __future__ import annotations + +import copy +import json +from typing import Any + +from projects.llm_d.orchestration.runtime_config import ( + ModelCacheSpec, + ResolvedConfig, + load_yaml, + resolve_model_cache, +) + + +def load_manifest_template(config: ResolvedConfig, relative_path: str) -> dict[str, Any]: + return load_yaml(config.config_dir / relative_path) + + +def render_datasciencecluster(config: ResolvedConfig) -> dict[str, Any]: + template_path = config.config_dir / config.platform["rhoai"]["datasciencecluster_template"] + manifest = load_yaml(template_path) + manifest["metadata"]["name"] = config.platform["rhoai"]["datasciencecluster_name"] + manifest["metadata"]["namespace"] = config.platform["rhoai"]["namespace"] + return manifest + + +def render_gateway(config: ResolvedConfig) -> dict[str, Any]: + template_path = config.config_dir / config.platform["gateway"]["manifest_template"] + manifest = load_yaml(template_path) + manifest["metadata"]["name"] = config.platform["gateway"]["name"] + manifest["metadata"]["namespace"] = 
config.platform["gateway"]["namespace"] + manifest["spec"]["gatewayClassName"] = config.platform["gateway"]["gateway_class_name"] + return manifest + + +def render_model_cache_pvc(spec: ModelCacheSpec) -> dict[str, Any]: + manifest: dict[str, Any] = { + "apiVersion": "v1", + "kind": "PersistentVolumeClaim", + "metadata": { + "name": spec.pvc_name, + "namespace": spec.namespace, + "labels": { + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + "forge.openshift.io/model-cache": "true", + "forge.openshift.io/preserve": "true", + }, + "annotations": { + "forge.openshift.io/model-cache-key": spec.cache_key, + "forge.openshift.io/model-source-uri": spec.source_uri, + }, + }, + "spec": { + "accessModes": [spec.access_mode], + "resources": {"requests": {"storage": spec.pvc_size}}, + }, + } + if spec.storage_class_name: + manifest["spec"]["storageClassName"] = spec.storage_class_name + return manifest + + +def render_inference_service(config: ResolvedConfig) -> dict[str, Any]: + template_path = config.config_dir / config.platform["inference_service"]["template"] + manifest = load_yaml(template_path) + + name = config.platform["inference_service"]["name"] + manifest["metadata"]["name"] = name + manifest["metadata"]["namespace"] = config.namespace + manifest["metadata"].setdefault("labels", {}) + manifest["metadata"]["labels"].update( + { + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + } + ) + + cache_spec = resolve_model_cache(config) + manifest["spec"]["model"]["uri"] = cache_spec.model_uri if cache_spec else config.model["uri"] + manifest["spec"]["model"]["name"] = config.model["served_model_name"] + manifest["spec"]["template"]["containers"][0]["resources"] = copy.deepcopy( + config.model["resources"] + ) + + if config.scheduler_profile_key == "default": + manifest["spec"]["router"]["scheduler"] = {} + return manifest + + if config.scheduler_profile is None: + raise ValueError(f"Missing scheduler profile config for {config.scheduler_profile_key}") + + scheduler_profile_path = config.config_dir / config.scheduler_profile["config_path"] + scheduler_profile_config = scheduler_profile_path.read_text(encoding="utf-8") + router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0]["args"] + if not router_args or router_args[-1] != "--config-text": + raise ValueError("Expected llm-d router args to end with --config-text") + router_args.append(scheduler_profile_config) + + return manifest + + +def render_smoke_request_job( + config: ResolvedConfig, endpoint_url: str, payload: dict[str, Any] +) -> dict[str, Any]: + smoke = config.platform["smoke"] + command = """ +set -eu +attempt=1 +while [ "${attempt}" -le "${REQUEST_RETRIES}" ]; do + if curl -k -sSf --max-time "${REQUEST_TIMEOUT_SECONDS}" \ + "${ENDPOINT_URL}${ENDPOINT_PATH}" \ + -H "Content-Type: application/json" \ + -d "${REQUEST_PAYLOAD}" \ + -o /tmp/smoke-response.json \ + 2>/tmp/smoke-error.log; then + cat /tmp/smoke-response.json + exit 0 + fi + attempt=$((attempt + 1)) + sleep "${REQUEST_RETRY_DELAY_SECONDS}" +done +cat /tmp/smoke-error.log >&2 || true +exit 1 +""" + + return { + "apiVersion": "batch/v1", + "kind": "Job", + "metadata": { + "name": smoke["job_name"], + "namespace": config.namespace, + "labels": { + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + "forge.openshift.io/component": "smoke", + }, + }, + "spec": { + "backoffLimit": 0, + "activeDeadlineSeconds": ( + smoke["request_retries"] + * 
(smoke["request_timeout_seconds"] + smoke["request_retry_delay_seconds"]) + ), + "template": { + "metadata": { + "labels": { + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + "forge.openshift.io/component": "smoke", + } + }, + "spec": { + "restartPolicy": "Never", + "containers": [ + { + "name": "smoke", + "image": smoke["client_image"], + "command": ["/bin/sh", "-ceu", command], + "env": [ + {"name": "ENDPOINT_URL", "value": endpoint_url}, + {"name": "ENDPOINT_PATH", "value": smoke["endpoint_path"]}, + {"name": "REQUEST_PAYLOAD", "value": json.dumps(payload)}, + {"name": "REQUEST_RETRIES", "value": str(smoke["request_retries"])}, + { + "name": "REQUEST_RETRY_DELAY_SECONDS", + "value": str(smoke["request_retry_delay_seconds"]), + }, + { + "name": "REQUEST_TIMEOUT_SECONDS", + "value": str(smoke["request_timeout_seconds"]), + }, + ], + } + ], + }, + }, + }, + } + + +def render_guidellm_pvc(config: ResolvedConfig) -> dict[str, Any]: + if not config.benchmark: + raise ValueError("Benchmark configuration is not enabled for this preset") + + return { + "apiVersion": "v1", + "kind": "PersistentVolumeClaim", + "metadata": { + "name": config.benchmark["job_name"], + "namespace": config.namespace, + "labels": { + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + }, + }, + "spec": { + "accessModes": ["ReadWriteOnce"], + "resources": {"requests": {"storage": config.benchmark["pvc_size"]}}, + }, + } + + +def render_guidellm_job(config: ResolvedConfig, endpoint_url: str) -> dict[str, Any]: + if not config.benchmark: + raise ValueError("Benchmark configuration is not enabled for this preset") + + args = [ + "benchmark", + "run", + f"--target={endpoint_url}", + f"--rate={config.benchmark['rate']}", + ] + for key, value in config.benchmark["args"].items(): + if value is None: + continue + args.append(f"--{key.replace('_', '-')}={value}") + args.append("--outputs=json") + + return { + "apiVersion": "batch/v1", + "kind": "Job", + "metadata": { + "name": config.benchmark["job_name"], + "namespace": config.namespace, + "labels": { + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + }, + }, + "spec": { + "backoffLimit": 0, + "template": { + "metadata": { + "labels": { + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + } + }, + "spec": { + "serviceAccountName": "default", + "restartPolicy": "Never", + "containers": [ + { + "name": "guidellm", + "image": config.benchmark["image"], + "command": ["/opt/app-root/bin/guidellm"], + "args": args, + "env": [{"name": "USER", "value": "guidellm"}], + "volumeMounts": [ + {"name": "home", "mountPath": "/home/guidellm"}, + {"name": "results", "mountPath": "/results"}, + ], + } + ], + "volumes": [ + {"name": "home", "emptyDir": {}}, + { + "name": "results", + "persistentVolumeClaim": {"claimName": config.benchmark["job_name"]}, + }, + ], + }, + }, + }, + } + + +def render_guidellm_copy_pod( + config: ResolvedConfig, node_name: str | None = None +) -> dict[str, Any]: + if not config.benchmark: + raise ValueError("Benchmark configuration is not enabled for this preset") + + pod = { + "apiVersion": "v1", + "kind": "Pod", + "metadata": { + "name": f"{config.benchmark['job_name']}-copy", + "namespace": config.namespace, + "labels": { + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + }, + }, + "spec": { + "restartPolicy": "Never", + "initContainers": [ + { + "name": "permission-fixer", + "image": 
config.benchmark["image"], + "command": [ + "/bin/sh", + "-c", + "chmod 755 /results && chown -R 1001:1001 /results || true", + ], + "securityContext": { + "runAsUser": 0, + "allowPrivilegeEscalation": True, + }, + "volumeMounts": [{"name": "results", "mountPath": "/results"}], + } + ], + "containers": [ + { + "name": "copy-helper", + "image": config.benchmark["image"], + "command": ["/bin/sleep", "300"], + "securityContext": { + "runAsUser": 1001, + "runAsNonRoot": True, + "allowPrivilegeEscalation": False, + }, + "volumeMounts": [{"name": "results", "mountPath": "/results"}], + } + ], + "volumes": [ + { + "name": "results", + "persistentVolumeClaim": {"claimName": config.benchmark["job_name"]}, + } + ], + }, + } + if node_name: + pod["spec"]["nodeName"] = node_name + return pod From acc95e96fff0910036a981d94b52a0405695978d Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Sun, 3 May 2026 10:19:33 +0100 Subject: [PATCH 12/21] refactor: Add llm_d phase input boundary --- projects/llm_d/orchestration/ci.py | 36 +++- projects/llm_d/orchestration/cli.py | 38 +++- projects/llm_d/orchestration/phase_inputs.py | 207 +++++++++++++++++++ projects/llm_d/orchestration/prepare_llmd.py | 12 -- projects/llm_d/orchestration/test_llmd.py | 12 -- tests/llm_d/test_runtime.py | 141 ++++++++++++- 6 files changed, 408 insertions(+), 38 deletions(-) create mode 100644 projects/llm_d/orchestration/phase_inputs.py delete mode 100644 projects/llm_d/orchestration/prepare_llmd.py delete mode 100644 projects/llm_d/orchestration/test_llmd.py diff --git a/projects/llm_d/orchestration/ci.py b/projects/llm_d/orchestration/ci.py index 97073e6e..ed02e0b2 100644 --- a/projects/llm_d/orchestration/ci.py +++ b/projects/llm_d/orchestration/ci.py @@ -7,10 +7,34 @@ import types import click -import prepare_llmd -import test_llmd from projects.core.library import ci as ci_lib +from projects.llm_d.orchestration import llmd_runtime, phase_inputs +from projects.llm_d.toolbox.cleanup.main import run as cleanup_toolbox_run +from projects.llm_d.toolbox.prepare.main import run as prepare_toolbox_run +from projects.llm_d.toolbox.test.main import run as test_toolbox_run + + +def init_runtime() -> None: + llmd_runtime.init() + + +def run_prepare_phase() -> int: + config = llmd_runtime.load_run_configuration() + inputs_file = phase_inputs.write_prepare_inputs(config) + return prepare_toolbox_run(inputs_file=str(inputs_file)) + + +def run_test_phase() -> int: + config = llmd_runtime.load_run_configuration() + inputs_file = phase_inputs.write_test_inputs(config) + return test_toolbox_run(inputs_file=str(inputs_file)) + + +def run_cleanup_phase() -> int: + config = llmd_runtime.load_run_configuration() + inputs_file = phase_inputs.write_cleanup_inputs(config) + return cleanup_toolbox_run(inputs_file=str(inputs_file)) @click.group() @@ -19,7 +43,7 @@ def main(ctx): """LLM-D Project CI Operations for FORGE.""" ctx.ensure_object(types.SimpleNamespace) - test_llmd.init() + init_runtime() @main.command() @@ -27,7 +51,7 @@ def main(ctx): @ci_lib.safe_ci_command def prepare(ctx) -> int: """Prepare phase - Set up environment and dependencies.""" - return prepare_llmd.prepare() + return run_prepare_phase() @main.command() @@ -35,7 +59,7 @@ def prepare(ctx) -> int: @ci_lib.safe_ci_command def test(ctx) -> int: """Test phase - Execute the main testing logic.""" - return test_llmd.test() + return run_test_phase() @main.command() @@ -43,7 +67,7 @@ def test(ctx) -> int: @ci_lib.safe_ci_command def pre_cleanup(ctx) -> int: """Cleanup phase - Clean up 
resources and finalize.""" - return prepare_llmd.cleanup() + return run_cleanup_phase() if __name__ == "__main__": diff --git a/projects/llm_d/orchestration/cli.py b/projects/llm_d/orchestration/cli.py index ca87c653..02b2e549 100644 --- a/projects/llm_d/orchestration/cli.py +++ b/projects/llm_d/orchestration/cli.py @@ -4,20 +4,44 @@ import types import click -import prepare_llmd -import test_llmd from projects.core.library.cli import safe_cli_command +from projects.llm_d.orchestration import llmd_runtime, phase_inputs +from projects.llm_d.toolbox.cleanup.main import run as cleanup_toolbox_run +from projects.llm_d.toolbox.prepare.main import run as prepare_toolbox_run +from projects.llm_d.toolbox.test.main import run as test_toolbox_run logger = logging.getLogger(__name__) +def init_runtime() -> None: + llmd_runtime.init() + + +def run_prepare_phase() -> int: + config = llmd_runtime.load_run_configuration() + inputs_file = phase_inputs.write_prepare_inputs(config) + return prepare_toolbox_run(inputs_file=str(inputs_file)) + + +def run_test_phase() -> int: + config = llmd_runtime.load_run_configuration() + inputs_file = phase_inputs.write_test_inputs(config) + return test_toolbox_run(inputs_file=str(inputs_file)) + + +def run_cleanup_phase() -> int: + config = llmd_runtime.load_run_configuration() + inputs_file = phase_inputs.write_cleanup_inputs(config) + return cleanup_toolbox_run(inputs_file=str(inputs_file)) + + @click.group() @click.pass_context def main(ctx): """LLM-D Project CLI Operations for FORGE.""" ctx.ensure_object(types.SimpleNamespace) - test_llmd.init() + init_runtime() @main.command() @@ -25,7 +49,7 @@ def main(ctx): @safe_cli_command def prepare(ctx) -> int: """Prepare phase - Set up environment and dependencies.""" - return prepare_llmd.prepare() + return run_prepare_phase() @main.command() @@ -33,7 +57,7 @@ def prepare(ctx) -> int: @safe_cli_command def test(ctx) -> int: """Test phase - Execute the main testing logic.""" - return test_llmd.test() + return run_test_phase() @main.command() @@ -41,7 +65,7 @@ def test(ctx) -> int: @safe_cli_command def pre_cleanup(ctx) -> int: """Cleanup phase - Clean up resources and finalize.""" - return prepare_llmd.cleanup() + return run_cleanup_phase() @main.command() @@ -49,7 +73,7 @@ def pre_cleanup(ctx) -> int: @safe_cli_command def post_cleanup(ctx) -> int: """Cleanup phase - Clean up resources and finalize.""" - return prepare_llmd.cleanup() + return run_cleanup_phase() if __name__ == "__main__": diff --git a/projects/llm_d/orchestration/phase_inputs.py b/projects/llm_d/orchestration/phase_inputs.py new file mode 100644 index 00000000..8a195515 --- /dev/null +++ b/projects/llm_d/orchestration/phase_inputs.py @@ -0,0 +1,207 @@ +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +from projects.llm_d.orchestration.runtime_config import ResolvedConfig, load_yaml, write_yaml + + +@dataclass(frozen=True) +class CleanupInputs: + artifact_dir: Path + namespace: str + platform: dict[str, Any] + benchmark: dict[str, Any] | None + + +@dataclass(frozen=True) +class PrepareModelCacheInputs: + artifact_dir: Path + preset_name: str + namespace: str + namespace_is_managed: bool + model_key: str + model: dict[str, Any] + model_cache: dict[str, Any] + + +@dataclass(frozen=True) +class PrepareInputs: + artifact_dir: Path + config_dir: Path + preset_name: str + namespace: str + namespace_is_managed: bool + platform: dict[str, Any] + model_key: str + model: dict[str, Any] + 
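+    # model_cache is carried whole so the prepare toolbox can resolve the PVC spec without re-reading project config.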
model_cache: dict[str, Any] + benchmark: dict[str, Any] | None + + +@dataclass(frozen=True) +class TestInputs: + artifact_dir: Path + config_dir: Path + preset_name: str + namespace: str + platform: dict[str, Any] + model_key: str + model: dict[str, Any] + scheduler_profile_key: str + scheduler_profile: dict[str, Any] | None + model_cache: dict[str, Any] + smoke_request: dict[str, Any] + benchmark: dict[str, Any] | None + + +def write_cleanup_inputs(config: ResolvedConfig) -> Path: + path = config.artifact_dir / "_meta" / "cleanup.inputs.yaml" + write_yaml( + path, + { + "artifact_dir": str(config.artifact_dir), + "namespace": config.namespace, + "platform": config.platform, + "benchmark": config.benchmark, + }, + ) + return path + + +def write_prepare_model_cache_inputs(config: ResolvedConfig) -> Path: + path = config.artifact_dir / "_meta" / "prepare_model_cache.inputs.yaml" + write_yaml( + path, + { + "artifact_dir": str(config.artifact_dir), + "preset_name": config.preset_name, + "namespace": config.namespace, + "namespace_is_managed": config.namespace_is_managed, + "model_key": config.model_key, + "model": config.model, + "model_cache": config.model_cache, + }, + ) + return path + + +def write_prepare_inputs(config: ResolvedConfig) -> Path: + path = config.artifact_dir / "_meta" / "prepare.inputs.yaml" + write_yaml( + path, + { + "artifact_dir": str(config.artifact_dir), + "config_dir": str(config.config_dir), + "preset_name": config.preset_name, + "namespace": config.namespace, + "namespace_is_managed": config.namespace_is_managed, + "platform": config.platform, + "model_key": config.model_key, + "model": config.model, + "model_cache": config.model_cache, + "benchmark": config.benchmark, + }, + ) + return path + + +def write_test_inputs(config: ResolvedConfig) -> Path: + path = config.artifact_dir / "_meta" / "test.inputs.yaml" + write_yaml( + path, + { + "artifact_dir": str(config.artifact_dir), + "config_dir": str(config.config_dir), + "preset_name": config.preset_name, + "namespace": config.namespace, + "platform": config.platform, + "model_key": config.model_key, + "model": config.model, + "scheduler_profile_key": config.scheduler_profile_key, + "scheduler_profile": config.scheduler_profile, + "model_cache": config.model_cache, + "smoke_request": config.smoke_request, + "benchmark": config.benchmark, + }, + ) + return path + + +def load_cleanup_inputs(path: str | Path) -> CleanupInputs: + payload = load_yaml(Path(path)) + return CleanupInputs( + artifact_dir=Path(payload["artifact_dir"]), + namespace=payload["namespace"], + platform=payload["platform"], + benchmark=payload["benchmark"], + ) + + +def load_prepare_model_cache_inputs(path: str | Path) -> PrepareModelCacheInputs: + payload = load_yaml(Path(path)) + return PrepareModelCacheInputs( + artifact_dir=Path(payload["artifact_dir"]), + preset_name=payload["preset_name"], + namespace=payload["namespace"], + namespace_is_managed=payload["namespace_is_managed"], + model_key=payload["model_key"], + model=payload["model"], + model_cache=payload["model_cache"], + ) + + +def load_prepare_inputs(path: str | Path) -> PrepareInputs: + payload = load_yaml(Path(path)) + return PrepareInputs( + artifact_dir=Path(payload["artifact_dir"]), + config_dir=Path(payload["config_dir"]), + preset_name=payload["preset_name"], + namespace=payload["namespace"], + namespace_is_managed=payload["namespace_is_managed"], + platform=payload["platform"], + model_key=payload["model_key"], + model=payload["model"], + model_cache=payload["model_cache"], + 
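+        # payload[...] indexing (not .get) makes a missing key fail loudly if the writer and loader ever drift apart.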
benchmark=payload["benchmark"], + ) + + +def load_test_inputs(path: str | Path) -> TestInputs: + payload = load_yaml(Path(path)) + return TestInputs( + artifact_dir=Path(payload["artifact_dir"]), + config_dir=Path(payload["config_dir"]), + preset_name=payload["preset_name"], + namespace=payload["namespace"], + platform=payload["platform"], + model_key=payload["model_key"], + model=payload["model"], + scheduler_profile_key=payload["scheduler_profile_key"], + scheduler_profile=payload["scheduler_profile"], + model_cache=payload["model_cache"], + smoke_request=payload["smoke_request"], + benchmark=payload["benchmark"], + ) + + +def cleanup_inputs_from_prepare(inputs: PrepareInputs) -> CleanupInputs: + return CleanupInputs( + artifact_dir=inputs.artifact_dir, + namespace=inputs.namespace, + platform=inputs.platform, + benchmark=inputs.benchmark, + ) + + +def prepare_model_cache_inputs_from_prepare(inputs: PrepareInputs) -> PrepareModelCacheInputs: + return PrepareModelCacheInputs( + artifact_dir=inputs.artifact_dir, + preset_name=inputs.preset_name, + namespace=inputs.namespace, + namespace_is_managed=inputs.namespace_is_managed, + model_key=inputs.model_key, + model=inputs.model, + model_cache=inputs.model_cache, + ) diff --git a/projects/llm_d/orchestration/prepare_llmd.py b/projects/llm_d/orchestration/prepare_llmd.py deleted file mode 100644 index ba64a9dc..00000000 --- a/projects/llm_d/orchestration/prepare_llmd.py +++ /dev/null @@ -1,12 +0,0 @@ -from __future__ import annotations - -from projects.llm_d.toolbox.cleanup.main import run as cleanup_toolbox_run -from projects.llm_d.toolbox.prepare.main import run as prepare_toolbox_run - - -def prepare() -> int: - return prepare_toolbox_run() - - -def cleanup() -> int: - return cleanup_toolbox_run() diff --git a/projects/llm_d/orchestration/test_llmd.py b/projects/llm_d/orchestration/test_llmd.py deleted file mode 100644 index 5254cafb..00000000 --- a/projects/llm_d/orchestration/test_llmd.py +++ /dev/null @@ -1,12 +0,0 @@ -from __future__ import annotations - -from projects.llm_d.orchestration import llmd_runtime -from projects.llm_d.toolbox.test.main import run as test_toolbox_run - - -def init() -> None: - llmd_runtime.init() - - -def test() -> int: - return test_toolbox_run() diff --git a/tests/llm_d/test_runtime.py b/tests/llm_d/test_runtime.py index bc19284a..126cb9e7 100644 --- a/tests/llm_d/test_runtime.py +++ b/tests/llm_d/test_runtime.py @@ -5,7 +5,9 @@ import pytest -from projects.llm_d.orchestration import llmd_runtime +from projects.llm_d.orchestration import ci as llmd_ci +from projects.llm_d.orchestration import cli as llmd_cli +from projects.llm_d.orchestration import llmd_runtime, phase_inputs from projects.llm_d.toolbox.cleanup import main as cleanup_toolbox from projects.llm_d.toolbox.prepare import main as prepare_toolbox from projects.llm_d.toolbox.prepare_model_cache import main as prepare_model_cache_toolbox @@ -91,6 +93,143 @@ def test_default_namespace_comes_from_project_config( assert config.namespace_is_managed is False +def test_write_prepare_inputs_round_trip(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + + path = phase_inputs.write_prepare_inputs(config) + payload = llmd_runtime.load_yaml(path) + loaded = phase_inputs.load_prepare_inputs(path) + + assert set(payload) == { + "artifact_dir", + 
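+        # Must match the keys emitted by write_prepare_inputs exactly; any drift fails the round trip.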
"config_dir", + "preset_name", + "namespace", + "namespace_is_managed", + "platform", + "model_key", + "model", + "model_cache", + "benchmark", + } + assert loaded.artifact_dir == config.artifact_dir + assert loaded.config_dir == config.config_dir + assert loaded.namespace == config.namespace + assert loaded.platform == config.platform + assert loaded.model == config.model + assert loaded.model_cache == config.model_cache + assert loaded.benchmark == config.benchmark + + +def test_write_test_inputs_round_trip(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + + path = phase_inputs.write_test_inputs(config) + payload = llmd_runtime.load_yaml(path) + loaded = phase_inputs.load_test_inputs(path) + + assert set(payload) == { + "artifact_dir", + "config_dir", + "preset_name", + "namespace", + "platform", + "model_key", + "model", + "scheduler_profile_key", + "scheduler_profile", + "model_cache", + "smoke_request", + "benchmark", + } + assert loaded.namespace == config.namespace + assert loaded.scheduler_profile_key == config.scheduler_profile_key + assert loaded.smoke_request == config.smoke_request + assert loaded.benchmark == config.benchmark + + +@pytest.mark.parametrize("orchestration", [llmd_ci, llmd_cli]) +def test_orchestration_prepare_writes_inputs_and_invokes_toolbox( + orchestration, tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + captured: dict[str, str] = {} + + monkeypatch.setattr(orchestration.llmd_runtime, "load_run_configuration", lambda: config) + monkeypatch.setattr( + orchestration, + "prepare_toolbox_run", + lambda *, inputs_file: captured.setdefault("inputs_file", inputs_file) or 17, + ) + + result = orchestration.run_prepare_phase() + loaded = phase_inputs.load_prepare_inputs(captured["inputs_file"]) + + assert result == captured["inputs_file"] + assert loaded.namespace == config.namespace + + +@pytest.mark.parametrize("orchestration", [llmd_ci, llmd_cli]) +def test_orchestration_test_writes_inputs_and_invokes_toolbox( + orchestration, tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + captured: dict[str, str] = {} + + monkeypatch.setattr(orchestration.llmd_runtime, "load_run_configuration", lambda: config) + monkeypatch.setattr( + orchestration, + "test_toolbox_run", + lambda *, inputs_file: captured.setdefault("inputs_file", inputs_file) or 23, + ) + + result = orchestration.run_test_phase() + loaded = phase_inputs.load_test_inputs(captured["inputs_file"]) + + assert result == captured["inputs_file"] + assert loaded.namespace == config.namespace + assert loaded.model == config.model + + +@pytest.mark.parametrize("orchestration", [llmd_ci, llmd_cli]) +def test_orchestration_cleanup_writes_inputs_and_invokes_toolbox( + orchestration, tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + config = 
llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + captured: dict[str, str] = {} + + monkeypatch.setattr(orchestration.llmd_runtime, "load_run_configuration", lambda: config) + monkeypatch.setattr( + orchestration, + "cleanup_toolbox_run", + lambda *, inputs_file: captured.setdefault("inputs_file", inputs_file) or 29, + ) + + result = orchestration.run_cleanup_phase() + loaded = phase_inputs.load_cleanup_inputs(captured["inputs_file"]) + + assert result == captured["inputs_file"] + assert loaded.namespace == config.namespace + assert loaded.platform == config.platform + + def test_render_inference_service_injects_model_and_scheduler_profile( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: From f7d0d1a6b19364ae4c9360a66b57b256df0d2ae6 Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Sun, 3 May 2026 10:27:34 +0100 Subject: [PATCH 13/21] refactor: Convert llm_d cleanup and model-cache phases to DSL tasks --- projects/llm_d/toolbox/cleanup/main.py | 164 +++++++++++------- .../llm_d/toolbox/prepare_model_cache/main.py | 65 ++++++- tests/llm_d/test_runtime.py | 22 +-- 3 files changed, 171 insertions(+), 80 deletions(-) diff --git a/projects/llm_d/toolbox/cleanup/main.py b/projects/llm_d/toolbox/cleanup/main.py index d80726ef..abd543db 100644 --- a/projects/llm_d/toolbox/cleanup/main.py +++ b/projects/llm_d/toolbox/cleanup/main.py @@ -2,89 +2,137 @@ from __future__ import annotations -from projects.core.dsl import toolbox -from projects.llm_d.orchestration import llmd_runtime +from projects.core.dsl import execute_tasks, shell, task, toolbox +from projects.llm_d.orchestration import llmd_runtime, phase_inputs -def run() -> int: - llmd_runtime.init() - config = llmd_runtime.load_run_configuration() - return run_cleanup(config) +def run(*, inputs_file: str) -> int: + """Delete llm_d runtime leftovers from a namespace. 
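+    Orchestration invokes this as cleanup_toolbox_run(inputs_file=str(inputs_file)); see ci.py and cli.py.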
+ Args: + inputs_file: Path to the cleanup phase input file generated by orchestration + """ -def run_cleanup(config: llmd_runtime.ResolvedConfig) -> int: - delete_run_leftovers(config) + llmd_runtime.init() + execute_tasks(locals()) return 0 -def delete_run_leftovers(config: llmd_runtime.ResolvedConfig) -> None: - if not llmd_runtime.resource_exists("namespace", config.namespace): - return +@task +def load_inputs(args, ctx): + """Load the cleanup phase inputs""" + + ctx.inputs = phase_inputs.load_cleanup_inputs(args.inputs_file) + return f"Loaded cleanup inputs for namespace {ctx.inputs.namespace}" + + +@task +def delete_leftovers(args, ctx): + """Delete llm_d runtime leftovers""" - inference_service_name = config.platform["inference_service"]["name"] - namespace = config.namespace - cleanup_timeout_seconds = config.platform["cluster"]["cleanup_timeout_seconds"] + inputs = ctx.inputs + if not llmd_runtime.resource_exists("namespace", inputs.namespace): + return f"Namespace {inputs.namespace} does not exist; nothing to clean" + + inference_service_name = inputs.platform["inference_service"]["name"] + namespace = inputs.namespace + cleanup_timeout_seconds = inputs.platform["cluster"]["cleanup_timeout_seconds"] benchmark_names = {"guidellm-benchmark"} - if config.benchmark: - benchmark_names.add(config.benchmark["job_name"]) - - llmd_runtime.oc( - "delete", - "llminferenceservice", - inference_service_name, - "-n", - namespace, + if inputs.benchmark: + benchmark_names.add(inputs.benchmark["job_name"]) + + shell.run( + f"oc delete llminferenceservice {inference_service_name} " + f"-n {namespace} --ignore-not-found=true", + check=False, + ) + + for benchmark_name in sorted(benchmark_names): + shell.run( + f"oc delete job,pvc {benchmark_name} -n {namespace} --ignore-not-found=true", + check=False, + ) + shell.run( + f"oc delete pod {benchmark_name}-copy -n {namespace} --ignore-not-found=true", + check=False, + ) + + shell.run( + f'oc delete job -n {namespace} -l "forge.openshift.io/project=llm_d" ' + "--ignore-not-found=true", + check=False, + ) + shell.run( + f'oc delete pod -n {namespace} -l "forge.openshift.io/project=llm_d" ' + "--ignore-not-found=true", + check=False, + ) + shell.run( + f"oc delete pvc -n {namespace} " + '-l "forge.openshift.io/project=llm_d,forge.openshift.io/preserve!=true" ' "--ignore-not-found=true", check=False, ) + llmd_runtime.wait_until( + f"llminferenceservice/{inference_service_name} deletion in {namespace}", + timeout_seconds=cleanup_timeout_seconds, + interval_seconds=10, + predicate=lambda: not llmd_runtime.resource_exists( + "llminferenceservice", inference_service_name, namespace=namespace + ), + ) + + llmd_runtime.wait_until( + f"llm-d workload pods deletion in {namespace}", + timeout_seconds=cleanup_timeout_seconds, + interval_seconds=10, + predicate=lambda: _llm_d_pods_gone(namespace, inference_service_name), + ) + + return f"Cleanup finished for namespace {namespace}" + + +def delete_run_leftovers(inputs: phase_inputs.CleanupInputs) -> None: + if not llmd_runtime.resource_exists("namespace", inputs.namespace): + return + + inference_service_name = inputs.platform["inference_service"]["name"] + namespace = inputs.namespace + cleanup_timeout_seconds = inputs.platform["cluster"]["cleanup_timeout_seconds"] + benchmark_names = {"guidellm-benchmark"} + if inputs.benchmark: + benchmark_names.add(inputs.benchmark["job_name"]) + + shell.run( + f"oc delete llminferenceservice {inference_service_name} " + f"-n {namespace} --ignore-not-found=true", + check=False, + 
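+        # Best-effort delete: --ignore-not-found plus check=False so cleanup never aborts part-way.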
) + for benchmark_name in sorted(benchmark_names): - llmd_runtime.oc( - "delete", - "job,pvc", - benchmark_name, - "-n", - namespace, - "--ignore-not-found=true", + shell.run( + f"oc delete job,pvc {benchmark_name} -n {namespace} --ignore-not-found=true", check=False, ) - llmd_runtime.oc( - "delete", - "pod", - f"{benchmark_name}-copy", - "-n", - namespace, - "--ignore-not-found=true", + shell.run( + f"oc delete pod {benchmark_name}-copy -n {namespace} --ignore-not-found=true", check=False, ) - llmd_runtime.oc( - "delete", - "job", - "-n", - namespace, - "-l", - "forge.openshift.io/project=llm_d", + shell.run( + f'oc delete job -n {namespace} -l "forge.openshift.io/project=llm_d" ' "--ignore-not-found=true", check=False, ) - llmd_runtime.oc( - "delete", - "pod", - "-n", - namespace, - "-l", - "forge.openshift.io/project=llm_d", + shell.run( + f'oc delete pod -n {namespace} -l "forge.openshift.io/project=llm_d" ' "--ignore-not-found=true", check=False, ) - llmd_runtime.oc( - "delete", - "pvc", - "-n", - namespace, - "-l", - "forge.openshift.io/project=llm_d,forge.openshift.io/preserve!=true", + shell.run( + f"oc delete pvc -n {namespace} " + '-l "forge.openshift.io/project=llm_d,forge.openshift.io/preserve!=true" ' "--ignore-not-found=true", check=False, ) diff --git a/projects/llm_d/toolbox/prepare_model_cache/main.py b/projects/llm_d/toolbox/prepare_model_cache/main.py index 1dc50758..f698ef0c 100644 --- a/projects/llm_d/toolbox/prepare_model_cache/main.py +++ b/projects/llm_d/toolbox/prepare_model_cache/main.py @@ -4,19 +4,66 @@ import logging -from projects.core.dsl import toolbox -from projects.llm_d.orchestration import llmd_runtime +from projects.core.dsl import execute_tasks, task, toolbox +from projects.llm_d.orchestration import llmd_runtime, phase_inputs LOGGER = logging.getLogger(__name__) -def run() -> int: +def run(*, inputs_file: str) -> int: + """Prepare the shared model cache PVC and populate it when needed. + + Args: + inputs_file: Path to the prepare_model_cache phase input file generated by orchestration + """ + llmd_runtime.init() - config = llmd_runtime.load_run_configuration() - return run_prepare_model_cache(config) + execute_tasks(locals()) + return 0 + + +@task +def load_inputs(args, ctx): + """Load the model cache phase inputs""" + + ctx.inputs = phase_inputs.load_prepare_model_cache_inputs(args.inputs_file) + return f"Loaded model cache inputs for preset {ctx.inputs.preset_name}" + + +@task +def prepare_model_cache(args, ctx): + """Ensure the model cache PVC exists and is populated""" + + config = ctx.inputs + cache_spec = llmd_runtime.resolve_model_cache(config) + if not cache_spec: + LOGGER.info("Model cache disabled for preset=%s", config.preset_name) + return "Model cache disabled" + + if config.namespace_is_managed: + LOGGER.warning( + "Model cache PVC %s lives in managed namespace %s. 
Namespace cleanup will remove it; cache reuse requires a stable namespace override.", + cache_spec.pvc_name, + cache_spec.namespace, + ) + + ensure_model_cache_pvc(config, cache_spec) + if llmd_runtime.model_cache_pvc_ready(cache_spec): + LOGGER.info( + "Model cache PVC %s already contains %s; skipping download", + cache_spec.pvc_name, + cache_spec.source_uri, + ) + capture_model_cache_state(config, cache_spec) + return f"Model cache already populated in {cache_spec.pvc_name}" + + run_model_cache_download_job(config, cache_spec) + llmd_runtime.annotate_model_cache_pvc(cache_spec) + capture_model_cache_state(config, cache_spec) + return f"Model cache step finished for namespace {config.namespace}" -def run_prepare_model_cache(config: llmd_runtime.ResolvedConfig) -> int: +def run_prepare_model_cache(config: phase_inputs.PrepareModelCacheInputs) -> int: cache_spec = llmd_runtime.resolve_model_cache(config) if not cache_spec: LOGGER.info("Model cache disabled for preset=%s", config.preset_name) @@ -46,7 +93,7 @@ def run_prepare_model_cache(config: llmd_runtime.ResolvedConfig) -> int: def ensure_model_cache_pvc( - config: llmd_runtime.ResolvedConfig, cache_spec: llmd_runtime.ModelCacheSpec + config: phase_inputs.PrepareModelCacheInputs, cache_spec: llmd_runtime.ModelCacheSpec ) -> None: existing = llmd_runtime.oc_get_json( "persistentvolumeclaim", @@ -86,7 +133,7 @@ def ensure_model_cache_pvc( def run_model_cache_download_job( - config: llmd_runtime.ResolvedConfig, cache_spec: llmd_runtime.ModelCacheSpec + config: phase_inputs.PrepareModelCacheInputs, cache_spec: llmd_runtime.ModelCacheSpec ) -> None: llmd_runtime.oc( "delete", @@ -123,7 +170,7 @@ def run_model_cache_download_job( def capture_model_cache_state( - config: llmd_runtime.ResolvedConfig, cache_spec: llmd_runtime.ModelCacheSpec + config: phase_inputs.PrepareModelCacheInputs, cache_spec: llmd_runtime.ModelCacheSpec ) -> None: artifact_dir = config.artifact_dir / "artifacts" / "model-cache" llmd_runtime.write_json( diff --git a/tests/llm_d/test_runtime.py b/tests/llm_d/test_runtime.py index 126cb9e7..2171f7f4 100644 --- a/tests/llm_d/test_runtime.py +++ b/tests/llm_d/test_runtime.py @@ -443,7 +443,7 @@ def test_cleanup_deletes_leftovers_but_not_namespace_or_preserved_pvcs( artifact_dir.mkdir() config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) - oc_calls: list[tuple[str, ...]] = [] + shell_calls: list[str] = [] def fake_resource_exists(kind: str, name: str, namespace: str | None = None) -> bool: if kind == "namespace": @@ -452,25 +452,21 @@ def fake_resource_exists(kind: str, name: str, namespace: str | None = None) -> monkeypatch.setattr(llmd_runtime, "resource_exists", fake_resource_exists) monkeypatch.setattr( - llmd_runtime, - "oc", - lambda *args, **kwargs: oc_calls.append(tuple(args)), + cleanup_toolbox.shell, + "run", + lambda command, **kwargs: shell_calls.append(command), ) monkeypatch.setattr(llmd_runtime, "wait_until", lambda *args, **kwargs: True) monkeypatch.setattr(cleanup_toolbox, "_llm_d_pods_gone", lambda *_args: True) cleanup_toolbox.delete_run_leftovers(config) - assert ("delete", "namespace", config.namespace, "--ignore-not-found=true") not in oc_calls + assert f"oc delete namespace {config.namespace} --ignore-not-found=true" not in shell_calls assert ( - "delete", - "pvc", - "-n", - config.namespace, - "-l", - "forge.openshift.io/project=llm_d,forge.openshift.io/preserve!=true", - "--ignore-not-found=true", - ) in oc_calls + f"oc delete pvc -n {config.namespace} " + '-l 
"forge.openshift.io/project=llm_d,forge.openshift.io/preserve!=true" ' + "--ignore-not-found=true" + ) in shell_calls def test_prepare_gpu_operator_skips_existing_clusterpolicy( From 31c8c5ff90e6cfc3efebbd48cc4a5d807198aaa5 Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Sun, 3 May 2026 10:31:42 +0100 Subject: [PATCH 14/21] refactor: Inline llm_d prepare and test task logic --- projects/llm_d/toolbox/prepare/main.py | 477 +++++++++++++++++++++-- projects/llm_d/toolbox/test/main.py | 518 ++++++++++++++++++++++--- 2 files changed, 889 insertions(+), 106 deletions(-) diff --git a/projects/llm_d/toolbox/prepare/main.py b/projects/llm_d/toolbox/prepare/main.py index 831201d5..6851bf4b 100644 --- a/projects/llm_d/toolbox/prepare/main.py +++ b/projects/llm_d/toolbox/prepare/main.py @@ -6,48 +6,441 @@ import logging from pathlib import Path -from projects.core.dsl import toolbox -from projects.llm_d.orchestration import llmd_runtime -from projects.llm_d.toolbox.cleanup import main as cleanup_toolbox +from projects.core.dsl import execute_tasks, shell, task, toolbox +from projects.llm_d.orchestration import llmd_runtime, phase_inputs from projects.llm_d.toolbox.prepare_model_cache import main as prepare_model_cache LOGGER = logging.getLogger(__name__) -def run() -> int: - llmd_runtime.init() - config = llmd_runtime.load_run_configuration() - return run_prepare(config) - - -def run_prepare(config: llmd_runtime.ResolvedConfig) -> int: - LOGGER.info("Preparing llm_d preset=%s namespace=%s", config.preset_name, config.namespace) - - verify_oc_access() - verify_cluster_version(config) - prepare_cert_manager(config) - prepare_leader_worker_set(config) - prepare_nfd(config) - prepare_gpu_operator(config) - prepare_rhoai_operator(config) - apply_datasciencecluster(config) - wait_for_datasciencecluster_ready(config) - ensure_required_crds(config.platform["rhoai"]["required_crds_after_dsc"], config) - ensure_gateway(config) - ensure_test_namespace(config) - cleanup_toolbox.delete_run_leftovers(config) - prepare_model_cache.run_prepare_model_cache(config) - verify_gpu_nodes(config) - capture_prepare_state(config) +def run(*, inputs_file: str) -> int: + """Prepare a cluster for llm_d downstream smoke and benchmark runs. 
+ + Args: + inputs_file: Path to the prepare phase input file generated by orchestration + """ + llmd_runtime.init() + execute_tasks(locals()) return 0 +@task +def load_inputs(args, ctx): + """Load the prepare phase inputs""" + + ctx.config = phase_inputs.load_prepare_inputs(args.inputs_file) + return f"Loaded prepare inputs for preset {ctx.config.preset_name}" + + +@task +def verify_oc_access_task(args, ctx): + """Verify OpenShift CLI access""" + + llmd_runtime.oc("whoami", capture_output=True) + return "OpenShift CLI access verified" + + +@task +def verify_cluster_version_task(args, ctx): + """Validate the cluster version against llm_d requirements""" + + version_info = llmd_runtime.oc("version", "-o", "json", capture_output=True) + payload = json.loads(version_info.stdout) + + openshift_version = ( + payload.get("openshiftVersion") + or payload.get("serverVersion", {}).get("gitVersion") + or payload.get("serverVersion", {}).get("platform") + ) + if not openshift_version: + raise RuntimeError("Could not determine OpenShift version from `oc version -o json`") + + minimum = ctx.config.platform["cluster"]["minimum_openshift_version"] + if llmd_runtime.version_tuple(openshift_version) < llmd_runtime.version_tuple(minimum): + raise RuntimeError( + f"Cluster version {openshift_version} is older than the llm_d minimum {minimum}" + ) + + return f"Cluster version satisfies {minimum}" + + +@task +def prepare_cert_manager_task(args, ctx): + """Ensure the cert-manager operator is installed""" + + operator_spec = llmd_runtime.operator_spec_by_package( + ctx.config.platform, "openshift-cert-manager-operator" + ) + ensure_operator_subscription(operator_spec) + return "cert-manager operator ready" + + +@task +def prepare_leader_worker_set_task(args, ctx): + """Ensure the leader-worker-set operator is installed""" + + operator_spec = llmd_runtime.operator_spec_by_package(ctx.config.platform, "leader-worker-set") + ensure_operator_subscription(operator_spec) + return "leader-worker-set operator ready" + + +@task +def prepare_nfd_task(args, ctx): + """Ensure Node Feature Discovery is installed and reporting GPU labels""" + + config = ctx.config + operator_spec = llmd_runtime.operator_spec_by_package(config.platform, "nfd") + ensure_operator_subscription(operator_spec) + llmd_runtime.wait_for_crd( + operator_spec["bootstrap_crd"], + timeout_seconds=operator_spec["wait_timeout_seconds"], + ) + + manifest = llmd_runtime.load_manifest_template(config, operator_spec["bootstrap_manifest"]) + nfd_name = manifest["metadata"]["name"] + nfd_namespace = manifest["metadata"]["namespace"] + if llmd_runtime.resource_exists("nodefeaturediscovery", nfd_name, namespace=nfd_namespace): + LOGGER.info( + "NodeFeatureDiscovery/%s already exists in %s; verifying GPU discovery labels", + nfd_name, + nfd_namespace, + ) + else: + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "nfd-nodefeaturediscovery.yaml", + manifest, + ) + + llmd_runtime.wait_until( + "NodeFeatureDiscovery bootstrap resource", + timeout_seconds=operator_spec["wait_timeout_seconds"], + interval_seconds=10, + predicate=lambda: llmd_runtime.resource_exists( + "nodefeaturediscovery", + nfd_name, + namespace=nfd_namespace, + ), + ) + + wait_for_nfd_gpu_labels(config, timeout_seconds=operator_spec["wait_timeout_seconds"]) + return "Node Feature Discovery ready" + + +@task +def prepare_gpu_operator_task(args, ctx): + """Ensure the GPU operator is installed and ready""" + + config = ctx.config + operator_spec = 
llmd_runtime.operator_spec_by_package(config.platform, "gpu-operator-certified") + ensure_operator_subscription(operator_spec) + llmd_runtime.wait_for_crd( + operator_spec["bootstrap_crd"], + timeout_seconds=operator_spec["wait_timeout_seconds"], + ) + + manifest = llmd_runtime.load_manifest_template(config, operator_spec["bootstrap_manifest"]) + clusterpolicy_name = manifest["metadata"]["name"] + if llmd_runtime.resource_exists("clusterpolicy", clusterpolicy_name): + LOGGER.info( + "ClusterPolicy/%s already exists; verifying readiness instead of applying bootstrap manifest", + clusterpolicy_name, + ) + else: + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "gpu-clusterpolicy.yaml", + manifest, + ) + + wait_for_gpu_clusterpolicy_ready( + clusterpolicy_name, + timeout_seconds=operator_spec["wait_timeout_seconds"], + ) + return "GPU operator ready" + + +@task +def prepare_rhoai_operator_task(args, ctx): + """Ensure the RHOAI operator is installed""" + + config = ctx.config + operator_spec = llmd_runtime.operator_spec_by_package(config.platform, "rhods-operator") + ensure_operator_subscription(operator_spec) + for crd_name in config.platform["rhoai"]["required_crds_before_dsc"]: + llmd_runtime.wait_for_crd( + crd_name, + timeout_seconds=config.platform["rhoai"]["wait_timeout_seconds"], + ) + return "RHOAI operator ready" + + +@task +def apply_datasciencecluster_task(args, ctx): + """Apply the DataScienceCluster manifest""" + + config = ctx.config + manifest = llmd_runtime.render_datasciencecluster(config) + llmd_runtime.apply_manifest(config.artifact_dir / "src" / "datasciencecluster.yaml", manifest) + llmd_runtime.oc( + "get", + "datasciencecluster", + config.platform["rhoai"]["datasciencecluster_name"], + "-n", + config.platform["rhoai"]["namespace"], + "-o", + "yaml", + capture_output=True, + ) + return "DataScienceCluster applied" + + +@task +def wait_for_datasciencecluster_ready_task(args, ctx): + """Wait for the DataScienceCluster to become ready""" + + rhoai = ctx.config.platform["rhoai"] + + def _dsc_ready() -> bool: + payload = llmd_runtime.oc_get_json( + "datasciencecluster", + name=rhoai["datasciencecluster_name"], + namespace=rhoai["namespace"], + ) + phase = payload.get("status", {}).get("phase") + if phase == "Ready": + return True + if phase in {"Failed", "Error"}: + raise RuntimeError(f"DataScienceCluster entered terminal phase {phase}") + return False + + llmd_runtime.wait_until( + f"datasciencecluster/{rhoai['datasciencecluster_name']} ready", + timeout_seconds=rhoai["wait_timeout_seconds"], + interval_seconds=10, + predicate=_dsc_ready, + ) + return "DataScienceCluster ready" + + +@task +def ensure_required_crds_task(args, ctx): + """Wait for the llm_d-required CRDs to exist""" + + for crd_name in ctx.config.platform["rhoai"]["required_crds_after_dsc"]: + llmd_runtime.wait_for_crd( + crd_name, + timeout_seconds=ctx.config.platform["rhoai"]["wait_timeout_seconds"], + ) + return "Required CRDs present" + + +@task +def ensure_gateway_task(args, ctx): + """Ensure the gateway exists and is programmed""" + + config = ctx.config + gateway = config.platform["gateway"] + if not llmd_runtime.resource_exists("gateway", gateway["name"], namespace=gateway["namespace"]): + if not gateway["create_if_missing"]: + raise RuntimeError( + f"Required gateway {gateway['name']} does not exist in {gateway['namespace']}" + ) + manifest = llmd_runtime.render_gateway(config) + llmd_runtime.apply_manifest(config.artifact_dir / "src" / "gateway.yaml", manifest) + + def 
_gateway_programmed() -> bool: + resource = llmd_runtime.oc_get_json( + "gateway", + name=gateway["name"], + namespace=gateway["namespace"], + ) + return llmd_runtime.condition_status(resource, "Programmed") == "True" + + llmd_runtime.wait_until( + f"gateway/{gateway['name']} programmed", + timeout_seconds=gateway["wait_timeout_seconds"], + interval_seconds=10, + predicate=_gateway_programmed, + ) + return "Gateway ready" + + +@task +def ensure_test_namespace_task(args, ctx): + """Ensure the llm_d namespace exists""" + + llmd_runtime.ensure_namespace( + ctx.config.namespace, + labels={ + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + }, + ) + return f"Namespace {ctx.config.namespace} ready" + + +@task +def cleanup_previous_run_task(args, ctx): + """Delete leftover llm_d resources from the namespace""" + + config = ctx.config + inference_service_name = config.platform["inference_service"]["name"] + namespace = config.namespace + cleanup_timeout_seconds = config.platform["cluster"]["cleanup_timeout_seconds"] + benchmark_names = {"guidellm-benchmark"} + if config.benchmark: + benchmark_names.add(config.benchmark["job_name"]) + + shell.run( + f"oc delete llminferenceservice {inference_service_name} " + f"-n {namespace} --ignore-not-found=true", + check=False, + ) + + for benchmark_name in sorted(benchmark_names): + shell.run( + f"oc delete job,pvc {benchmark_name} -n {namespace} --ignore-not-found=true", + check=False, + ) + shell.run( + f"oc delete pod {benchmark_name}-copy -n {namespace} --ignore-not-found=true", + check=False, + ) + + shell.run( + f'oc delete job -n {namespace} -l "forge.openshift.io/project=llm_d" ' + "--ignore-not-found=true", + check=False, + ) + shell.run( + f'oc delete pod -n {namespace} -l "forge.openshift.io/project=llm_d" ' + "--ignore-not-found=true", + check=False, + ) + shell.run( + f"oc delete pvc -n {namespace} " + '-l "forge.openshift.io/project=llm_d,forge.openshift.io/preserve!=true" ' + "--ignore-not-found=true", + check=False, + ) + + llmd_runtime.wait_until( + f"llminferenceservice/{inference_service_name} deletion in {namespace}", + timeout_seconds=cleanup_timeout_seconds, + interval_seconds=10, + predicate=lambda: not llmd_runtime.resource_exists( + "llminferenceservice", inference_service_name, namespace=namespace + ), + ) + + llmd_runtime.wait_until( + f"llm-d workload pods deletion in {namespace}", + timeout_seconds=cleanup_timeout_seconds, + interval_seconds=10, + predicate=lambda: not ( + pods := llmd_runtime.oc_get_json( + "pods", + namespace=namespace, + selector=f"app.kubernetes.io/name={inference_service_name}", + ignore_not_found=True, + ) + ) + or not pods.get("items"), + ) + return f"Previous llm_d leftovers deleted from {ctx.config.namespace}" + + +@task +def prepare_model_cache_task(args, ctx): + """Prepare the shared model cache if enabled""" + + cache_inputs = phase_inputs.prepare_model_cache_inputs_from_prepare(ctx.config) + cache_spec = llmd_runtime.resolve_model_cache(cache_inputs) + if not cache_spec: + LOGGER.info("Model cache disabled for preset=%s", cache_inputs.preset_name) + return "Model cache disabled" + + if cache_inputs.namespace_is_managed: + LOGGER.warning( + "Model cache PVC %s lives in managed namespace %s. 
Namespace cleanup will remove it; cache reuse requires a stable namespace override.", + cache_spec.pvc_name, + cache_spec.namespace, + ) + + prepare_model_cache.ensure_model_cache_pvc(cache_inputs, cache_spec) + if llmd_runtime.model_cache_pvc_ready(cache_spec): + LOGGER.info( + "Model cache PVC %s already contains %s; skipping download", + cache_spec.pvc_name, + cache_spec.source_uri, + ) + prepare_model_cache.capture_model_cache_state(cache_inputs, cache_spec) + return "Model cache already populated" + + prepare_model_cache.run_model_cache_download_job(cache_inputs, cache_spec) + llmd_runtime.annotate_model_cache_pvc(cache_spec) + prepare_model_cache.capture_model_cache_state(cache_inputs, cache_spec) + return "Model cache prepared" + + +@task +def verify_gpu_nodes_task(args, ctx): + """Verify that GPU nodes are available on the cluster""" + + selector = ctx.config.platform["cluster"]["gpu_node_label_selector"] + data = llmd_runtime.oc_get_json("nodes", selector=selector, ignore_not_found=True) + items = data.get("items", []) if data else [] + if not items: + raise RuntimeError( + f"No GPU nodes found with selector {selector}. The llm_d smoke path requires GPUs." + ) + return "GPU nodes detected" + + +@task +def capture_prepare_state_task(args, ctx): + """Capture cluster state after the prepare phase""" + + config = ctx.config + artifacts_dir = config.artifact_dir / "artifacts" + rhoai = config.platform["rhoai"] + gateway = config.platform["gateway"] + + capture_resource_yaml( + "datasciencecluster", + rhoai["datasciencecluster_name"], + rhoai["namespace"], + artifacts_dir / "datasciencecluster.yaml", + ) + capture_resource_yaml( + "gateway", + gateway["name"], + gateway["namespace"], + artifacts_dir / "gateway.yaml", + ) + gateway_service = llmd_runtime.oc( + "get", + "service", + "-A", + "-l", + f"gateway.networking.k8s.io/gateway-name={gateway['name']}", + "-o", + "yaml", + check=False, + capture_output=True, + ) + if gateway_service.returncode == 0 and gateway_service.stdout: + llmd_runtime.write_text(artifacts_dir / "gateway.service.yaml", gateway_service.stdout) + if config.platform["artifacts"]["capture_namespace_events"]: + capture_namespace_events(config.namespace, artifacts_dir / "namespace.events.txt") + return "Prepare-state artifacts captured" + + def verify_oc_access() -> None: llmd_runtime.oc("whoami", capture_output=True) -def verify_cluster_version(config: llmd_runtime.ResolvedConfig) -> None: +def verify_cluster_version(config: phase_inputs.PrepareInputs) -> None: version_info = llmd_runtime.oc("version", "-o", "json", capture_output=True) payload = json.loads(version_info.stdout) @@ -75,19 +468,19 @@ def ensure_operator_subscription(operator_spec: dict[str, str]) -> dict[str, obj ) -def prepare_cert_manager(config: llmd_runtime.ResolvedConfig) -> None: +def prepare_cert_manager(config: phase_inputs.PrepareInputs) -> None: operator_spec = llmd_runtime.operator_spec_by_package( config.platform, "openshift-cert-manager-operator" ) ensure_operator_subscription(operator_spec) -def prepare_leader_worker_set(config: llmd_runtime.ResolvedConfig) -> None: +def prepare_leader_worker_set(config: phase_inputs.PrepareInputs) -> None: operator_spec = llmd_runtime.operator_spec_by_package(config.platform, "leader-worker-set") ensure_operator_subscription(operator_spec) -def prepare_nfd(config: llmd_runtime.ResolvedConfig) -> None: +def prepare_nfd(config: phase_inputs.PrepareInputs) -> None: operator_spec = llmd_runtime.operator_spec_by_package(config.platform, "nfd") 
ensure_operator_subscription(operator_spec) llmd_runtime.wait_for_crd( @@ -124,7 +517,7 @@ def prepare_nfd(config: llmd_runtime.ResolvedConfig) -> None: wait_for_nfd_gpu_labels(config, timeout_seconds=operator_spec["wait_timeout_seconds"]) -def prepare_gpu_operator(config: llmd_runtime.ResolvedConfig) -> None: +def prepare_gpu_operator(config: phase_inputs.PrepareInputs) -> None: operator_spec = llmd_runtime.operator_spec_by_package(config.platform, "gpu-operator-certified") ensure_operator_subscription(operator_spec) llmd_runtime.wait_for_crd( @@ -173,13 +566,13 @@ def _clusterpolicy_ready() -> bool: ) -def prepare_rhoai_operator(config: llmd_runtime.ResolvedConfig) -> None: +def prepare_rhoai_operator(config: phase_inputs.PrepareInputs) -> None: operator_spec = llmd_runtime.operator_spec_by_package(config.platform, "rhods-operator") ensure_operator_subscription(operator_spec) ensure_required_crds(config.platform["rhoai"]["required_crds_before_dsc"], config) -def ensure_required_crds(crd_names: list[str], config: llmd_runtime.ResolvedConfig) -> None: +def ensure_required_crds(crd_names: list[str], config: phase_inputs.PrepareInputs) -> None: for crd_name in crd_names: llmd_runtime.wait_for_crd( crd_name, @@ -187,7 +580,7 @@ def ensure_required_crds(crd_names: list[str], config: llmd_runtime.ResolvedConf ) -def apply_datasciencecluster(config: llmd_runtime.ResolvedConfig) -> None: +def apply_datasciencecluster(config: phase_inputs.PrepareInputs) -> None: manifest = llmd_runtime.render_datasciencecluster(config) llmd_runtime.apply_manifest(config.artifact_dir / "src" / "datasciencecluster.yaml", manifest) llmd_runtime.oc( @@ -202,7 +595,7 @@ def apply_datasciencecluster(config: llmd_runtime.ResolvedConfig) -> None: ) -def wait_for_datasciencecluster_ready(config: llmd_runtime.ResolvedConfig) -> None: +def wait_for_datasciencecluster_ready(config: phase_inputs.PrepareInputs) -> None: rhoai = config.platform["rhoai"] def _dsc_ready() -> bool: @@ -226,7 +619,7 @@ def _dsc_ready() -> bool: ) -def ensure_gateway(config: llmd_runtime.ResolvedConfig) -> None: +def ensure_gateway(config: phase_inputs.PrepareInputs) -> None: gateway = config.platform["gateway"] if not llmd_runtime.resource_exists("gateway", gateway["name"], namespace=gateway["namespace"]): if not gateway["create_if_missing"]: @@ -252,7 +645,7 @@ def _gateway_programmed() -> bool: ) -def ensure_test_namespace(config: llmd_runtime.ResolvedConfig) -> None: +def ensure_test_namespace(config: phase_inputs.PrepareInputs) -> None: llmd_runtime.ensure_namespace( config.namespace, labels={ @@ -262,7 +655,7 @@ def ensure_test_namespace(config: llmd_runtime.ResolvedConfig) -> None: ) -def verify_gpu_nodes(config: llmd_runtime.ResolvedConfig) -> None: +def verify_gpu_nodes(config: phase_inputs.PrepareInputs) -> None: selector = config.platform["cluster"]["gpu_node_label_selector"] data = llmd_runtime.oc_get_json("nodes", selector=selector, ignore_not_found=True) items = data.get("items", []) if data else [] @@ -272,7 +665,7 @@ def verify_gpu_nodes(config: llmd_runtime.ResolvedConfig) -> None: ) -def wait_for_nfd_gpu_labels(config: llmd_runtime.ResolvedConfig, *, timeout_seconds: int) -> None: +def wait_for_nfd_gpu_labels(config: phase_inputs.PrepareInputs, *, timeout_seconds: int) -> None: selectors = config.platform["cluster"]["nfd_gpu_detection_labels"] def _labels_present() -> bool: @@ -290,7 +683,7 @@ def _labels_present() -> bool: ) -def capture_prepare_state(config: llmd_runtime.ResolvedConfig) -> None: +def capture_prepare_state(config: 
phase_inputs.PrepareInputs) -> None: artifacts_dir = config.artifact_dir / "artifacts" rhoai = config.platform["rhoai"] gateway = config.platform["gateway"] diff --git a/projects/llm_d/toolbox/test/main.py b/projects/llm_d/toolbox/test/main.py index 9c6242b6..5941d03a 100644 --- a/projects/llm_d/toolbox/test/main.py +++ b/projects/llm_d/toolbox/test/main.py @@ -6,81 +6,471 @@ import logging from pathlib import Path -from projects.core.dsl import toolbox -from projects.llm_d.orchestration import llmd_runtime +from projects.core.dsl import always, execute_tasks, task, toolbox +from projects.llm_d.orchestration import llmd_runtime, phase_inputs LOGGER = logging.getLogger(__name__) -def run() -> int: +def run(*, inputs_file: str) -> int: + """Deploy llm_d, run the smoke request, and optionally execute GuideLLM. + + Args: + inputs_file: Path to the test phase input file generated by orchestration + """ + llmd_runtime.init() - config = llmd_runtime.load_run_configuration() - return run_test(config) + execute_tasks(locals()) + return 0 + + +@task +def load_inputs(args, ctx): + """Load the test phase inputs""" + + ctx.config = phase_inputs.load_test_inputs(args.inputs_file) + return f"Loaded test inputs for preset {ctx.config.preset_name}" + +@task +def deploy_inference_service_task(args, ctx): + """Deploy the LLMInferenceService and resolve its endpoint""" -def run_test(config: llmd_runtime.ResolvedConfig) -> int: + config = ctx.config + name = config.platform["inference_service"]["name"] namespace = config.namespace - artifacts_dir = config.artifact_dir / "artifacts" + selector = f"app.kubernetes.io/name={name}" - LOGGER.info("Testing llm_d preset=%s namespace=%s", config.preset_name, namespace) + llmd_runtime.oc( + "delete", + "llminferenceservice", + name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) - endpoint_url = None - try: - endpoint_url = deploy_inference_service(config) - smoke_response = run_smoke_request(config, endpoint_url) - llmd_runtime.write_json(artifacts_dir / "smoke.response.json", smoke_response) + def _old_pods_gone() -> bool: + pods = llmd_runtime.oc_get_json( + "pods", namespace=namespace, selector=selector, ignore_not_found=True + ) + return not pods or not pods.get("items") - if config.benchmark: - run_guidellm_benchmark(config, endpoint_url) + llmd_runtime.wait_until( + f"old llm-d pods to disappear in {namespace}", + timeout_seconds=config.platform["inference_service"]["delete_timeout_seconds"], + interval_seconds=10, + predicate=_old_pods_gone, + ) - return 0 - finally: - capture_inference_service_state(config) - if endpoint_url: - llmd_runtime.write_text(artifacts_dir / "endpoint.url", f"{endpoint_url}\n") - benchmark_name = config.benchmark["job_name"] if config.benchmark else "guidellm-benchmark" - smoke_job_name = config.platform["smoke"]["job_name"] - llmd_runtime.oc( - "delete", - "job", - smoke_job_name, - "-n", - namespace, - "--ignore-not-found=true", - check=False, + manifest = llmd_runtime.render_inference_service(config) + llmd_runtime.apply_manifest(config.artifact_dir / "src" / "llminferenceservice.yaml", manifest) + + def _pods_present() -> bool: + pods = llmd_runtime.oc_get_json( + "pods", namespace=namespace, selector=selector, ignore_not_found=True ) - llmd_runtime.oc( - "delete", - "job,pvc", - benchmark_name, - "-n", + return bool(pods and pods.get("items")) + + llmd_runtime.wait_until( + f"llm-d pods to appear in {namespace}", + timeout_seconds=config.platform["inference_service"]["pod_appearance_timeout_seconds"], + 
interval_seconds=5, + predicate=_pods_present, + ) + + def _service_ready() -> bool: + payload = llmd_runtime.oc_get_json("llminferenceservice", name=name, namespace=namespace) + return llmd_runtime.condition_status(payload, "Ready") == "True" + + llmd_runtime.wait_until( + f"llminferenceservice/{name} ready", + timeout_seconds=config.platform["inference_service"]["ready_timeout_seconds"], + interval_seconds=10, + predicate=_service_ready, + ) + + ctx.endpoint_url = llmd_runtime.wait_until( + f"gateway address for llminferenceservice/{name}", + timeout_seconds=config.platform["inference_service"]["ready_timeout_seconds"], + interval_seconds=10, + predicate=lambda: try_resolve_endpoint_url(config), + ) + return f"Endpoint resolved: {ctx.endpoint_url}" + + +@task +def run_smoke_request_task(args, ctx): + """Run the smoke request against the deployed service""" + + config = ctx.config + namespace = config.namespace + job_name = config.platform["smoke"]["job_name"] + payload = { + "model": config.model["served_model_name"], + "prompt": config.smoke_request["prompt"], + "max_tokens": config.smoke_request["max_tokens"], + "temperature": config.smoke_request["temperature"], + } + llmd_runtime.write_json(config.artifact_dir / "artifacts" / "smoke.request.json", payload) + + llmd_runtime.oc( + "delete", + "job", + job_name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.wait_until( + f"job/{job_name} deletion in {namespace}", + timeout_seconds=120, + interval_seconds=5, + predicate=lambda: not llmd_runtime.resource_exists("job", job_name, namespace=namespace), + ) + + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "smoke-job.yaml", + llmd_runtime.render_smoke_request_job(config, ctx.endpoint_url, payload), + ) + + try: + llmd_runtime.wait_for_job_completion( + job_name, namespace, - "--ignore-not-found=true", - check=False, + timeout_seconds=( + config.platform["smoke"]["request_retries"] + * ( + config.platform["smoke"]["request_timeout_seconds"] + + config.platform["smoke"]["request_retry_delay_seconds"] + ) + ), + interval_seconds=5, ) - llmd_runtime.oc( - "delete", - "pod", - f"{benchmark_name}-copy", - "-n", - namespace, - "--ignore-not-found=true", - check=False, + finally: + capture_smoke_state(config) + + result = llmd_runtime.oc( + "logs", + f"job/{job_name}", + "-n", + namespace, + check=False, + capture_output=True, + ) + + if result.returncode != 0 or not result.stdout: + raise RuntimeError( + f"Smoke request job {job_name} completed but response logs could not be read: {result.stderr}" ) - events = llmd_runtime.oc( - "get", - "events", - "-n", - namespace, - "--sort-by=.metadata.creationTimestamp", - check=False, - capture_output=True, + + response = json.loads(result.stdout) + if not response.get("choices"): + raise RuntimeError(f"Invalid smoke response payload: {result.stdout}") + + llmd_runtime.write_json(config.artifact_dir / "artifacts" / "smoke.response.json", response) + ctx.smoke_response = response + return "Smoke request completed" + + +@task +def run_guidellm_benchmark_task(args, ctx): + """Run the GuideLLM benchmark when enabled for the preset""" + + if not ctx.config.benchmark: + return "GuideLLM benchmark disabled" + + config = ctx.config + benchmark_name = config.benchmark["job_name"] + namespace = config.namespace + + llmd_runtime.oc( + "delete", + "job,pvc", + benchmark_name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "pod", + f"{benchmark_name}-copy", + "-n", 
+ namespace, + "--ignore-not-found=true", + check=False, + ) + + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "guidellm-pvc.yaml", + llmd_runtime.render_guidellm_pvc(config), + ) + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "guidellm-job.yaml", + llmd_runtime.render_guidellm_job(config, ctx.endpoint_url), + ) + + def _job_terminal() -> dict[str, object] | None: + payload = llmd_runtime.oc_get_json("job", name=benchmark_name, namespace=namespace) + status = payload.get("status", {}) + if status.get("succeeded"): + return payload + if status.get("failed"): + raise RuntimeError(f"GuideLLM job {benchmark_name} failed") + return None + + llmd_runtime.wait_until( + f"GuideLLM job/{benchmark_name}", + timeout_seconds=config.benchmark["timeout_seconds"], + interval_seconds=10, + predicate=_job_terminal, + ) + + capture_guidellm_state(config) + copy_guidellm_results(config) + return f"GuideLLM benchmark {ctx.config.benchmark['job_name']} completed" + + +@always +@task +def capture_inference_service_state_task(args, ctx): + """Capture the LLMInferenceService state and related resources""" + + config = getattr(ctx, "config", None) + if not config: + return "Test inputs unavailable; skipping state capture" + + name = config.platform["inference_service"]["name"] + namespace = config.namespace + artifacts_dir = config.artifact_dir / "artifacts" + selector = f"app.kubernetes.io/name={name}" + + capture_get( + "llminferenceservice", + name, + namespace, + "yaml", + artifacts_dir / "llminferenceservice.yaml", + ) + capture_get( + "llminferenceservice", + name, + namespace, + "json", + artifacts_dir / "llminferenceservice.json", + ) + capture_get( + "pods", + None, + namespace, + "yaml", + artifacts_dir / "llminferenceservice.pods.yaml", + selector=selector, + ) + capture_get( + "deployments", + None, + namespace, + "yaml", + artifacts_dir / "llminferenceservice.deployments.yaml", + selector=selector, + ) + capture_get( + "replicasets", + None, + namespace, + "yaml", + artifacts_dir / "llminferenceservice.replicasets.yaml", + selector=selector, + ) + capture_get("pods", None, namespace, "wide", artifacts_dir / "namespace.pods.status") + capture_get("services", None, namespace, "wide", artifacts_dir / "namespace.services.status") + + pod_list = llmd_runtime.oc_get_json( + "pods", namespace=namespace, selector=selector, ignore_not_found=True + ) + if pod_list: + lines = [] + previous_lines = [] + for pod in pod_list.get("items", []): + pod_name = pod["metadata"]["name"] + lines.append(f"=== {pod_name} ===") + log_result = llmd_runtime.oc( + "logs", + pod_name, + "-n", + namespace, + "--all-containers=true", + check=False, + capture_output=True, + ) + if log_result.stdout: + lines.append(log_result.stdout.rstrip()) + + previous_lines.append(f"=== {pod_name} ===") + previous_result = llmd_runtime.oc( + "logs", + pod_name, + "-n", + namespace, + "--previous", + "--all-containers=true", + check=False, + capture_output=True, + ) + if previous_result.stdout: + previous_lines.append(previous_result.stdout.rstrip()) + + llmd_runtime.write_text( + artifacts_dir / "llminferenceservice.pods.logs", "\n".join(lines) + "\n" + ) + llmd_runtime.write_text( + artifacts_dir / "llminferenceservice.pods.previous.logs", + "\n".join(previous_lines) + "\n", + ) + return "Inference-service artifacts captured" + + +@always +@task +def write_endpoint_url_task(args, ctx): + """Persist the resolved endpoint URL when available""" + + config = getattr(ctx, "config", None) + if not config: + return "Test 
inputs unavailable; skipping endpoint capture" + + endpoint_url = getattr(ctx, "endpoint_url", None) + if not endpoint_url: + return "Endpoint URL not available" + + llmd_runtime.write_text(config.artifact_dir / "artifacts" / "endpoint.url", f"{endpoint_url}\n") + return "Endpoint URL captured" + + +@always +@task +def cleanup_runtime_resources_task(args, ctx): + """Delete smoke and benchmark helper resources""" + + config = getattr(ctx, "config", None) + if not config: + return "Test inputs unavailable; skipping cleanup" + + benchmark_name = config.benchmark["job_name"] if config.benchmark else "guidellm-benchmark" + smoke_job_name = config.platform["smoke"]["job_name"] + namespace = config.namespace + + llmd_runtime.oc( + "delete", + "job", + smoke_job_name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "job,pvc", + benchmark_name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "pod", + f"{benchmark_name}-copy", + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + return "Test helper resources deleted" + + +@always +@task +def capture_namespace_events_task(args, ctx): + """Capture namespace events after the test run""" + + config = getattr(ctx, "config", None) + if not config: + return "Test inputs unavailable; skipping namespace events capture" + + events = llmd_runtime.oc( + "get", + "events", + "-n", + config.namespace, + "--sort-by=.metadata.creationTimestamp", + check=False, + capture_output=True, + ) + if events.returncode == 0 and events.stdout: + llmd_runtime.write_text( + config.artifact_dir / "artifacts" / "namespace.events.txt", events.stdout + ) + return "Namespace events captured" + + +def cleanup_runtime_resources(config: phase_inputs.TestInputs) -> None: + benchmark_name = config.benchmark["job_name"] if config.benchmark else "guidellm-benchmark" + smoke_job_name = config.platform["smoke"]["job_name"] + namespace = config.namespace + + llmd_runtime.oc( + "delete", + "job", + smoke_job_name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "job,pvc", + benchmark_name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "pod", + f"{benchmark_name}-copy", + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + + +def capture_namespace_events(config: phase_inputs.TestInputs) -> None: + events = llmd_runtime.oc( + "get", + "events", + "-n", + config.namespace, + "--sort-by=.metadata.creationTimestamp", + check=False, + capture_output=True, + ) + if events.returncode == 0 and events.stdout: + llmd_runtime.write_text( + config.artifact_dir / "artifacts" / "namespace.events.txt", events.stdout ) - if events.returncode == 0 and events.stdout: - llmd_runtime.write_text(artifacts_dir / "namespace.events.txt", events.stdout) -def deploy_inference_service(config: llmd_runtime.ResolvedConfig) -> str: +def deploy_inference_service(config: phase_inputs.TestInputs) -> str: name = config.platform["inference_service"]["name"] namespace = config.namespace selector = f"app.kubernetes.io/name={name}" @@ -143,7 +533,7 @@ def _service_ready() -> bool: ) -def resolve_endpoint_url(config: llmd_runtime.ResolvedConfig) -> str: +def resolve_endpoint_url(config: phase_inputs.TestInputs) -> str: endpoint_url = try_resolve_endpoint_url(config) if endpoint_url: return endpoint_url @@ -155,7 +545,7 @@ def resolve_endpoint_url(config: llmd_runtime.ResolvedConfig) 
-> str: ) -def try_resolve_endpoint_url(config: llmd_runtime.ResolvedConfig) -> str | None: +def try_resolve_endpoint_url(config: phase_inputs.TestInputs) -> str | None: name = config.platform["inference_service"]["name"] namespace = config.namespace gateway_name = config.platform["gateway"]["status_address_name"] @@ -167,7 +557,7 @@ def try_resolve_endpoint_url(config: llmd_runtime.ResolvedConfig) -> str | None: return None -def run_smoke_request(config: llmd_runtime.ResolvedConfig, endpoint_url: str) -> dict[str, object]: +def run_smoke_request(config: phase_inputs.TestInputs, endpoint_url: str) -> dict[str, object]: namespace = config.namespace job_name = config.platform["smoke"]["job_name"] @@ -236,7 +626,7 @@ def run_smoke_request(config: llmd_runtime.ResolvedConfig, endpoint_url: str) -> return response -def capture_smoke_state(config: llmd_runtime.ResolvedConfig) -> None: +def capture_smoke_state(config: phase_inputs.TestInputs) -> None: job_name = config.platform["smoke"]["job_name"] namespace = config.namespace artifacts_dir = config.artifact_dir / "artifacts" @@ -262,7 +652,7 @@ def capture_smoke_state(config: llmd_runtime.ResolvedConfig) -> None: llmd_runtime.write_text(artifacts_dir / "smoke_job.logs", result.stdout) -def run_guidellm_benchmark(config: llmd_runtime.ResolvedConfig, endpoint_url: str) -> None: +def run_guidellm_benchmark(config: phase_inputs.TestInputs, endpoint_url: str) -> None: benchmark_name = config.benchmark["job_name"] namespace = config.namespace @@ -314,7 +704,7 @@ def _job_terminal() -> dict[str, object] | None: copy_guidellm_results(config) -def copy_guidellm_results(config: llmd_runtime.ResolvedConfig) -> None: +def copy_guidellm_results(config: phase_inputs.TestInputs) -> None: benchmark_name = config.benchmark["job_name"] namespace = config.namespace pod_data = llmd_runtime.oc_get_json( @@ -369,7 +759,7 @@ def _helper_ready() -> bool: ) -def capture_inference_service_state(config: llmd_runtime.ResolvedConfig) -> None: +def capture_inference_service_state(config: phase_inputs.TestInputs) -> None: name = config.platform["inference_service"]["name"] namespace = config.namespace artifacts_dir = config.artifact_dir / "artifacts" @@ -460,7 +850,7 @@ def capture_inference_service_state(config: llmd_runtime.ResolvedConfig) -> None ) -def capture_guidellm_state(config: llmd_runtime.ResolvedConfig) -> None: +def capture_guidellm_state(config: phase_inputs.TestInputs) -> None: benchmark_name = config.benchmark["job_name"] namespace = config.namespace artifacts_dir = config.artifact_dir / "artifacts" From 2098bcd101a7dfe59424ffde4dd5811fe26a008a Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Sun, 3 May 2026 20:48:26 +0100 Subject: [PATCH 15/21] chore: Reorder tests within project Signed-off-by: Alberto Perdomo --- .github/workflows/test_toolbox_dsl.yml | 6 +++--- {tests/llm_d => projects/llm_d/tests}/test_runtime.py | 0 pyproject.toml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) rename {tests/llm_d => projects/llm_d/tests}/test_runtime.py (100%) diff --git a/.github/workflows/test_toolbox_dsl.yml b/.github/workflows/test_toolbox_dsl.yml index 0946fca0..d93faa2b 100644 --- a/.github/workflows/test_toolbox_dsl.yml +++ b/.github/workflows/test_toolbox_dsl.yml @@ -1,5 +1,5 @@ -# Unit tests for projects/core/dsl (task decorators, execute_tasks, failure/always/skip). -name: Toolbox DSL tests +# Python tests for repo-managed suites discovered via pyproject testpaths. 
+name: Python test suites on: pull_request: @@ -31,7 +31,7 @@ jobs: python -m pip install --upgrade pip python -m pip install pytest pyyaml jinja2 jsonpath_ng - - name: Run projects/core/tests + - name: Run pytest suites run: | set -o errexit # Tree + docstrings (what is being tested), then execute with one line per test + result. diff --git a/tests/llm_d/test_runtime.py b/projects/llm_d/tests/test_runtime.py similarity index 100% rename from tests/llm_d/test_runtime.py rename to projects/llm_d/tests/test_runtime.py diff --git a/pyproject.toml b/pyproject.toml index c6632bf9..b2b061f6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -125,7 +125,7 @@ ignore = [ [tool.pytest.ini_options] minversion = "7.0" addopts = "-ra -q --strict-markers --strict-config" -testpaths = ["projects/core/tests"] +testpaths = ["projects/core/tests", "projects/llm_d/tests"] python_files = ["test_*.py", "*_test.py"] python_classes = ["Test*"] python_functions = ["test_*"] From 7698ebe95fdc0345b9670325c843c6958f99a8a7 Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Mon, 4 May 2026 10:20:59 +0100 Subject: [PATCH 16/21] fix: Install Forge dependencies for pytest CI --- .github/workflows/test_toolbox_dsl.yml | 2 +- pyproject.toml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test_toolbox_dsl.yml b/.github/workflows/test_toolbox_dsl.yml index d93faa2b..73faeadb 100644 --- a/.github/workflows/test_toolbox_dsl.yml +++ b/.github/workflows/test_toolbox_dsl.yml @@ -29,7 +29,7 @@ jobs: run: | set -o errexit python -m pip install --upgrade pip - python -m pip install pytest pyyaml jinja2 jsonpath_ng + python -m pip install .[dev] - name: Run pytest suites run: | diff --git a/pyproject.toml b/pyproject.toml index b2b061f6..139c1bc7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ dependencies = [ "plotly>=5.17.0", "dash>=2.14.0", "dash-bootstrap-components>=1.5.0", + "jinja2", "pyyaml>=6.0", "jsonschema>=4.19.0", "structlog>=23.1.0", From e1f88620ac46a9c5086ca57672ff5f1b0b100405 Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Tue, 5 May 2026 08:19:40 +0100 Subject: [PATCH 17/21] refactor: Move llm_d shared runtime out of orchestration --- projects/core/library/config.py | 17 +++++ projects/llm_d/orchestration/ci.py | 17 ++++- projects/llm_d/orchestration/cli.py | 17 ++++- .../llmd_runtime.py | 76 ++++++------------- .../phase_inputs.py | 2 +- .../runtime_config.py | 60 +++++++++------ .../runtime_manifests.py | 2 +- .../runtime/scripts/download_hf_model.sh | 28 +++++++ .../runtime/scripts/extract_oci_model.sh | 18 +++++ projects/llm_d/tests/test_runtime.py | 2 +- projects/llm_d/toolbox/cleanup/main.py | 2 +- projects/llm_d/toolbox/prepare/main.py | 2 +- .../llm_d/toolbox/prepare_model_cache/main.py | 2 +- projects/llm_d/toolbox/test/main.py | 2 +- 14 files changed, 158 insertions(+), 89 deletions(-) rename projects/llm_d/{orchestration => runtime}/llmd_runtime.py (91%) rename projects/llm_d/{orchestration => runtime}/phase_inputs.py (98%) rename projects/llm_d/{orchestration => runtime}/runtime_config.py (86%) rename projects/llm_d/{orchestration => runtime}/runtime_manifests.py (99%) create mode 100644 projects/llm_d/runtime/scripts/download_hf_model.sh create mode 100644 projects/llm_d/runtime/scripts/extract_oci_model.sh diff --git a/projects/core/library/config.py b/projects/core/library/config.py index 740e921c..02c69809 100644 --- a/projects/core/library/config.py +++ b/projects/core/library/config.py @@ -450,3 +450,20 @@ def init(orchestration_dir, 
*, apply_config_overrides=True): project.apply_config_overrides() project.apply_presets_from_project_args() project.apply_config_overrides() # reapply so that the value overrides are applied last + + +def reload(orchestration_dir, *, apply_config_overrides=True): + global project + + project = None + + artifact_config = env.ARTIFACT_DIR / "config.yaml" + if artifact_config.exists(): + artifact_config.unlink() + + presets_applied = env.ARTIFACT_DIR / "presets_applied" + if presets_applied.exists(): + presets_applied.unlink() + + init(orchestration_dir, apply_config_overrides=apply_config_overrides) + return project diff --git a/projects/llm_d/orchestration/ci.py b/projects/llm_d/orchestration/ci.py index ed02e0b2..bc5ae6f4 100644 --- a/projects/llm_d/orchestration/ci.py +++ b/projects/llm_d/orchestration/ci.py @@ -4,12 +4,13 @@ """ +import os import types import click from projects.core.library import ci as ci_lib -from projects.llm_d.orchestration import llmd_runtime, phase_inputs +from projects.llm_d.runtime import llmd_runtime, phase_inputs from projects.llm_d.toolbox.cleanup.main import run as cleanup_toolbox_run from projects.llm_d.toolbox.prepare.main import run as prepare_toolbox_run from projects.llm_d.toolbox.test.main import run as test_toolbox_run @@ -19,20 +20,28 @@ def init_runtime() -> None: llmd_runtime.init() +def load_runtime_configuration(): + return llmd_runtime.load_run_configuration( + requested_preset=os.environ.get("FORGE_PRESET"), + raw_overrides=os.environ.get("FORGE_CONFIG_OVERRIDES"), + job_name=os.environ.get("FORGE_JOB_NAME"), + ) + + def run_prepare_phase() -> int: - config = llmd_runtime.load_run_configuration() + config = load_runtime_configuration() inputs_file = phase_inputs.write_prepare_inputs(config) return prepare_toolbox_run(inputs_file=str(inputs_file)) def run_test_phase() -> int: - config = llmd_runtime.load_run_configuration() + config = load_runtime_configuration() inputs_file = phase_inputs.write_test_inputs(config) return test_toolbox_run(inputs_file=str(inputs_file)) def run_cleanup_phase() -> int: - config = llmd_runtime.load_run_configuration() + config = load_runtime_configuration() inputs_file = phase_inputs.write_cleanup_inputs(config) return cleanup_toolbox_run(inputs_file=str(inputs_file)) diff --git a/projects/llm_d/orchestration/cli.py b/projects/llm_d/orchestration/cli.py index 02b2e549..fdb84fa9 100644 --- a/projects/llm_d/orchestration/cli.py +++ b/projects/llm_d/orchestration/cli.py @@ -1,12 +1,13 @@ #!/usr/bin/env python3 import logging +import os import types import click from projects.core.library.cli import safe_cli_command -from projects.llm_d.orchestration import llmd_runtime, phase_inputs +from projects.llm_d.runtime import llmd_runtime, phase_inputs from projects.llm_d.toolbox.cleanup.main import run as cleanup_toolbox_run from projects.llm_d.toolbox.prepare.main import run as prepare_toolbox_run from projects.llm_d.toolbox.test.main import run as test_toolbox_run @@ -18,20 +19,28 @@ def init_runtime() -> None: llmd_runtime.init() +def load_runtime_configuration(): + return llmd_runtime.load_run_configuration( + requested_preset=os.environ.get("FORGE_PRESET"), + raw_overrides=os.environ.get("FORGE_CONFIG_OVERRIDES"), + job_name=os.environ.get("FORGE_JOB_NAME"), + ) + + def run_prepare_phase() -> int: - config = llmd_runtime.load_run_configuration() + config = load_runtime_configuration() inputs_file = phase_inputs.write_prepare_inputs(config) return prepare_toolbox_run(inputs_file=str(inputs_file)) def run_test_phase() -> 
int: - config = llmd_runtime.load_run_configuration() + config = load_runtime_configuration() inputs_file = phase_inputs.write_test_inputs(config) return test_toolbox_run(inputs_file=str(inputs_file)) def run_cleanup_phase() -> int: - config = llmd_runtime.load_run_configuration() + config = load_runtime_configuration() inputs_file = phase_inputs.write_cleanup_inputs(config) return cleanup_toolbox_run(inputs_file=str(inputs_file)) diff --git a/projects/llm_d/orchestration/llmd_runtime.py b/projects/llm_d/runtime/llmd_runtime.py similarity index 91% rename from projects/llm_d/orchestration/llmd_runtime.py rename to projects/llm_d/runtime/llmd_runtime.py index 59b054e6..53d4662b 100644 --- a/projects/llm_d/orchestration/llmd_runtime.py +++ b/projects/llm_d/runtime/llmd_runtime.py @@ -7,11 +7,12 @@ import subprocess import time from collections.abc import Iterable +from pathlib import Path from typing import Any import yaml -from projects.llm_d.orchestration.runtime_config import ( +from projects.llm_d.runtime.runtime_config import ( CONFIG_DIR, ORCHESTRATION_DIR, ModelCacheSpec, @@ -33,7 +34,7 @@ write_text, write_yaml, ) -from projects.llm_d.orchestration.runtime_manifests import ( +from projects.llm_d.runtime.runtime_manifests import ( load_manifest_template, render_datasciencecluster, render_gateway, @@ -45,7 +46,7 @@ render_smoke_request_job, ) -LOGGER = logging.getLogger(__name__) +logger = logging.getLogger(__name__) __all__ = [ "CONFIG_DIR", @@ -118,7 +119,7 @@ def run_command( timeout_seconds: float | None = 300, ) -> subprocess.CompletedProcess[str]: cmd = [str(arg) for arg in args] - LOGGER.info("run: %s", " ".join(shlex.quote(arg) for arg in cmd)) + logger.info("run: %s", " ".join(shlex.quote(arg) for arg in cmd)) try: result = subprocess.run( cmd, @@ -129,7 +130,7 @@ def run_command( timeout=timeout_seconds, ) except subprocess.TimeoutExpired: - LOGGER.error( + logger.error( "Command timed out after %ss: %s", timeout_seconds, " ".join(shlex.quote(arg) for arg in cmd), @@ -138,9 +139,9 @@ def run_command( if capture_output: if result.stdout: - LOGGER.info("stdout:\n%s", result.stdout.rstrip()) + logger.info("stdout:\n%s", result.stdout.rstrip()) if result.stderr: - LOGGER.info("stderr:\n%s", result.stderr.rstrip()) + logger.info("stderr:\n%s", result.stderr.rstrip()) if check and result.returncode != 0: raise CommandError( @@ -247,7 +248,7 @@ def wait_until( if isinstance(exc, RuntimeError): raise last_error = exc - LOGGER.info("waiting for %s: %s", description, exc) + logger.info("waiting for %s: %s", description, exc) time.sleep(interval_seconds) if last_error: @@ -339,7 +340,7 @@ def ensure_subscription(operator_spec: dict[str, Any]) -> None: ignore_not_found=True, ) if current and not subscription_spec_matches(current.get("spec", {}), subscription["spec"]): - LOGGER.info("Reconciling subscription drift for %s in %s", package, namespace) + logger.info("Reconciling subscription drift for %s in %s", package, namespace) oc("apply", "-f", "-", input_text=yaml.safe_dump(subscription, sort_keys=False)) @@ -384,7 +385,13 @@ def subscription_spec_matches(actual: dict[str, Any], expected: dict[str, Any]) def operator_spec_by_package(platform: dict[str, Any], package: str) -> dict[str, Any]: - for operator_spec in platform["operators"]: + operators = platform["operators"] + if isinstance(operators, dict): + if package in operators: + return {"package": package, **operators[package]} + raise KeyError(f"Unknown operator package in llm_d platform config: {package}") + + for operator_spec in 
operators:
         if operator_spec["package"] == package:
             return operator_spec
     raise KeyError(f"Unknown operator package in llm_d platform config: {package}")
@@ -482,6 +489,11 @@ def resolve_default_serviceaccount_image_pull_secret(namespace: str) -> str | No
     return None
 
 
+def load_runtime_script(name: str) -> str:
+    script_path = Path(__file__).resolve().parent / "scripts" / name
+    return script_path.read_text(encoding="utf-8")
+
+
 def render_model_cache_job(config: ResolvedConfig, spec: ModelCacheSpec) -> dict[str, Any]:
     common_env = [
         {"name": "MODEL_SOURCE", "value": spec.source_uri},
@@ -494,32 +506,7 @@ def render_model_cache_job(config: ResolvedConfig, spec: ModelCacheSpec) -> dict
     ]
 
     if spec.source_scheme == "hf":
-        command = """
-set -euo pipefail
-mkdir -p "${MODEL_TARGET_DIR}"
-rm -rf "${MODEL_TARGET_DIR}"/*
-python -m pip install --quiet --no-cache-dir 'huggingface_hub[hf_xet]'
-python - <<'PY'
-import os
-from huggingface_hub import snapshot_download
-
-token = None
-token_file = os.environ.get("HF_TOKEN_FILE")
-if token_file and os.path.exists(token_file):
-    with open(token_file, encoding="utf-8") as handle:
-        token = handle.read().strip() or None
-
-snapshot_download(
-    repo_id=os.environ["MODEL_SOURCE"][5:],
-    local_dir=os.environ["MODEL_TARGET_DIR"],
-    local_dir_use_symlinks=False,
-    token=token,
-)
-PY
-cat > "${MARKER_FILE}" <<EOF
-${MODEL_SOURCE}
-EOF
-"""
+        command = load_runtime_script("download_hf_model.sh")
@@ -530,22 +517,7 @@ def render_model_cache_job(config: ResolvedConfig, spec: ModelCacheSpec) -> dict
         spec.oci_registry_auth_secret_name
         or resolve_default_serviceaccount_image_pull_secret(spec.namespace)
     )
-        command = """
-set -euo pipefail
-mkdir -p "${MODEL_TARGET_DIR}"
-rm -rf "${MODEL_TARGET_DIR}"/*
-auth_args=()
-if [[ -n "${REGISTRY_AUTH_FILE:-}" && -f "${REGISTRY_AUTH_FILE}" ]]; then
-  auth_args+=(--registry-config="${REGISTRY_AUTH_FILE}")
-fi
-oc image extract "${MODEL_SOURCE#oci://}" \
-  --path "${OCI_IMAGE_PATH}:${MODEL_TARGET_DIR}" \
-  --confirm \
-  "${auth_args[@]}"
-cat > "${MARKER_FILE}" <<EOF
-${MODEL_SOURCE}
-EOF
-"""
+        command = load_runtime_script("extract_oci_model.sh")
diff --git a/projects/llm_d/orchestration/runtime_config.py b/projects/llm_d/runtime/runtime_config.py
similarity index 86%
rename from projects/llm_d/orchestration/runtime_config.py
rename to projects/llm_d/runtime/runtime_config.py
--- a/projects/llm_d/orchestration/runtime_config.py
+++ b/projects/llm_d/runtime/runtime_config.py
@@ -96,23 +98,28 @@
 def load_run_configuration(
-    *, cwd: Path | None = None, artifact_dir: Path | None = None
+    *,
+    cwd: Path | None = None,
+    artifact_dir: Path | None = None,
+    requested_preset: str | None = None,
+    raw_overrides: str | None = None,
+    job_name: str | None = None,
 ) -> ResolvedConfig:
     cwd = cwd or Path.cwd()
     if artifact_dir is not None:
         os.environ["ARTIFACT_DIR"] = str(artifact_dir)
     artifact_dir = init()
-    _reinitialize_project_config()
+    config.reload(ORCHESTRATION_DIR)
 
-    platform_data = copy.deepcopy(config.project.get_config("platform"))
+    platform_data = normalize_platform_config(copy.deepcopy(config.project.get_config("platform")))
     model_cache = copy.deepcopy(config.project.get_config("model_cache"))
     fournos_config = load_fournos_config(cwd)
 
     overrides = parse_overrides(
-        os.environ.get("FORGE_CONFIG_OVERRIDES", ""),
+        raw_overrides or "",
         allowed_keys=config.project.get_config("runtime.allowed_override_keys", []),
     )
     requested_preset = (
-        fournos_config.get("preset")
-        or os.environ.get("FORGE_PRESET")
+        requested_preset
+        or fournos_config.get("preset")
         or config.project.get_config("runtime.default_preset")
     )
     apply_requested_preset(requested_preset)
@@ -136,19 +143,20 @@ def load_run_configuration(
         config.project.get_config(f"workloads.benchmarks.{benchmark_name}")
     )
 
-    job_name = fournos_config.get("job-name") or os.environ.get("FORGE_JOB_NAME")
+    job_name = job_name or fournos_config.get("job-name")
    if not job_name:
         job_name = f"local-{preset_name}"
 
     namespace_override = overrides.get("namespace") or fournos_config.get("namespace")
-    default_namespace = platform_data["cluster"].get("namespace_name")
+    namespace_config = platform_data["cluster"]["namespace"]
+    default_namespace = namespace_config.get("name")
     namespace = (
         namespace_override
         or default_namespace
         or derive_namespace(
             job_name,
-            platform_data["cluster"]["namespace_prefix"],
-            platform_data["cluster"]["namespace_max_length"],
+            namespace_config["prefix"],
+            namespace_config["max_length"],
         )
     )
 
@@ -177,17 +185,25 @@ def load_run_configuration(
     )
 
 
-def _reinitialize_project_config() -> None:
-    config.project = None
-    artifact_config = env.ARTIFACT_DIR / "config.yaml"
-    if artifact_config.exists():
-        artifact_config.unlink()
+def normalize_platform_config(platform_data: dict[str, Any]) -> dict[str, Any]:
+    cluster = platform_data["cluster"]
+    if "namespace" not in cluster:
+        cluster["namespace"] = {
+            "name": cluster.pop("namespace_name", None),
+            "prefix": cluster.pop("namespace_prefix"),
+            "max_length": cluster.pop("namespace_max_length"),
+        }
 
-    presets_applied = env.ARTIFACT_DIR / "presets_applied"
-    if presets_applied.exists():
-        presets_applied.unlink()
+    operators = platform_data["operators"]
+    if isinstance(operators, list):
+        platform_data["operators"] = {
+            operator_spec["package"]: {
+                key: value for key, value in operator_spec.items() if key != "package"
+            }
+            for operator_spec in operators
+        }
 
-    config.init(ORCHESTRATION_DIR)
+    return platform_data
 
 
 def apply_requested_preset(requested_preset: str) -> None:
@@ -239,7 +255,7 @@ def normalize_gpu_count(value: Any) -> int | None:
     try:
         return int(value)
     except (TypeError, ValueError):
-        LOGGER.warning("Ignoring invalid gpu-count value: %s", value)
+        logger.warning("Ignoring invalid gpu-count value: %s", value)
         return None
diff --git a/projects/llm_d/orchestration/runtime_manifests.py b/projects/llm_d/runtime/runtime_manifests.py
similarity index 99%
rename from projects/llm_d/orchestration/runtime_manifests.py
rename to projects/llm_d/runtime/runtime_manifests.py
index 0c72a88e..bc5fdca8 100644
--- a/projects/llm_d/orchestration/runtime_manifests.py
+++ b/projects/llm_d/runtime/runtime_manifests.py
@@ -4,7 +4,7 @@
 import json
 from typing import Any
 
-from projects.llm_d.orchestration.runtime_config import (
+from projects.llm_d.runtime.runtime_config import (
     ModelCacheSpec,
     ResolvedConfig,
     load_yaml,
diff --git a/projects/llm_d/runtime/scripts/download_hf_model.sh b/projects/llm_d/runtime/scripts/download_hf_model.sh
new file mode 100644
index 00000000..9623d2aa
--- /dev/null
+++ b/projects/llm_d/runtime/scripts/download_hf_model.sh
@@ -0,0 +1,28 @@
+set -euo pipefail
+
+mkdir -p "${MODEL_TARGET_DIR}"
+rm -rf "${MODEL_TARGET_DIR}"/*
+
+python -m pip install --quiet --no-cache-dir 'huggingface_hub[hf_xet]'
+python - <<'PY'
+import os
+
+from huggingface_hub import snapshot_download
+
+token = None
+token_file = os.environ.get("HF_TOKEN_FILE")
+if token_file and os.path.exists(token_file):
+    with open(token_file, encoding="utf-8") as handle:
+        token = handle.read().strip() or None
+
+snapshot_download(
+    repo_id=os.environ["MODEL_SOURCE"][5:],
+    local_dir=os.environ["MODEL_TARGET_DIR"],
+    local_dir_use_symlinks=False,
+    token=token,
+)
+PY
+
+cat > "${MARKER_FILE}" <<EOF
+${MODEL_SOURCE}
+EOF
diff --git a/projects/llm_d/runtime/scripts/extract_oci_model.sh b/projects/llm_d/runtime/scripts/extract_oci_model.sh
new file mode 100644
--- /dev/null
+++ b/projects/llm_d/runtime/scripts/extract_oci_model.sh
@@ -0,0 +1,18 @@
+set -euo pipefail
+
+mkdir -p "${MODEL_TARGET_DIR}"
+rm -rf "${MODEL_TARGET_DIR}"/*
+
+auth_args=()
+if [[ -n "${REGISTRY_AUTH_FILE:-}" && -f "${REGISTRY_AUTH_FILE}" ]]; then
+  auth_args+=(--registry-config="${REGISTRY_AUTH_FILE}")
+fi
+
+oc image extract "${MODEL_SOURCE#oci://}" \
+  --path "${OCI_IMAGE_PATH}:${MODEL_TARGET_DIR}" \
+  --confirm \
+  "${auth_args[@]}"
+
+cat > "${MARKER_FILE}" <<EOF
+${MODEL_SOURCE}
+EOF
diff --git a/projects/llm_d/toolbox/prepare/main.py b/projects/llm_d/toolbox/prepare/main.py
index 6851bf4b..34b23478 100644
--- a/projects/llm_d/toolbox/prepare/main.py
+++ b/projects/llm_d/toolbox/prepare/main.py
@@ -7,7 +7,7 @@
 from pathlib import Path
 
 from projects.core.dsl import execute_tasks, shell, task, toolbox
-from projects.llm_d.orchestration import llmd_runtime, phase_inputs
+from 
projects.llm_d.runtime import llmd_runtime, phase_inputs from projects.llm_d.toolbox.prepare_model_cache import main as prepare_model_cache LOGGER = logging.getLogger(__name__) diff --git a/projects/llm_d/toolbox/prepare_model_cache/main.py b/projects/llm_d/toolbox/prepare_model_cache/main.py index f698ef0c..73cfc24e 100644 --- a/projects/llm_d/toolbox/prepare_model_cache/main.py +++ b/projects/llm_d/toolbox/prepare_model_cache/main.py @@ -5,7 +5,7 @@ import logging from projects.core.dsl import execute_tasks, task, toolbox -from projects.llm_d.orchestration import llmd_runtime, phase_inputs +from projects.llm_d.runtime import llmd_runtime, phase_inputs LOGGER = logging.getLogger(__name__) diff --git a/projects/llm_d/toolbox/test/main.py b/projects/llm_d/toolbox/test/main.py index 5941d03a..609c9e46 100644 --- a/projects/llm_d/toolbox/test/main.py +++ b/projects/llm_d/toolbox/test/main.py @@ -7,7 +7,7 @@ from pathlib import Path from projects.core.dsl import always, execute_tasks, task, toolbox -from projects.llm_d.orchestration import llmd_runtime, phase_inputs +from projects.llm_d.runtime import llmd_runtime, phase_inputs LOGGER = logging.getLogger(__name__) From 5ca7bbdc3ffcbc6d52b0d756427664fa06645e7a Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Tue, 5 May 2026 08:20:13 +0100 Subject: [PATCH 18/21] refactor: Normalize llm_d project configuration layout --- .../orchestration/config.d/platform.yaml | 27 +++++++++--------- .../llm_d/orchestration/config.d/project.yaml | 2 ++ projects/llm_d/orchestration/config.yaml | 3 -- projects/llm_d/tests/test_runtime.py | 28 +++++++++++++++++-- 4 files changed, 42 insertions(+), 18 deletions(-) create mode 100644 projects/llm_d/orchestration/config.d/project.yaml delete mode 100644 projects/llm_d/orchestration/config.yaml diff --git a/projects/llm_d/orchestration/config.d/platform.yaml b/projects/llm_d/orchestration/config.d/platform.yaml index 43092e7c..6f823eba 100644 --- a/projects/llm_d/orchestration/config.d/platform.yaml +++ b/projects/llm_d/orchestration/config.d/platform.yaml @@ -1,8 +1,9 @@ cluster: minimum_openshift_version: "4.19.9" - namespace_name: forge-llm-d - namespace_prefix: llm-d - namespace_max_length: 63 + namespace: + name: forge-llm-d + prefix: llm-d + max_length: 63 cleanup_timeout_seconds: 900 gpu_node_label_selector: nvidia.com/gpu.present=true nfd_gpu_detection_labels: @@ -11,36 +12,36 @@ cluster: - feature.node.kubernetes.io/pci-0300_10de.present operators: - - display_name: OpenShift Cert Manager - package: openshift-cert-manager-operator + openshift-cert-manager-operator: + display_name: OpenShift Cert Manager namespace: openshift-cert-manager-operator channel: stable-v1.18 source: redhat-operators wait_timeout_seconds: 900 - - display_name: Leader Worker Set - package: leader-worker-set + leader-worker-set: + display_name: Leader Worker Set namespace: openshift-lws channel: stable source: redhat-operators wait_timeout_seconds: 900 - - display_name: Node Feature Discovery - package: nfd + nfd: + display_name: Node Feature Discovery namespace: openshift-nfd channel: stable source: redhat-operators wait_timeout_seconds: 900 bootstrap_crd: nodefeaturediscoveries.nfd.openshift.io bootstrap_manifest: manifests/nfd-nodefeaturediscovery.yaml - - display_name: NVIDIA GPU Operator - package: gpu-operator-certified + gpu-operator-certified: + display_name: NVIDIA GPU Operator namespace: nvidia-gpu-operator channel: stable source: certified-operators wait_timeout_seconds: 1800 bootstrap_crd: clusterpolicies.nvidia.com 
bootstrap_manifest: manifests/gpu-clusterpolicy.yaml - - display_name: Red Hat OpenShift AI - package: rhods-operator + rhods-operator: + display_name: Red Hat OpenShift AI namespace: redhat-ods-operator channel: stable-3.x source: redhat-operators diff --git a/projects/llm_d/orchestration/config.d/project.yaml b/projects/llm_d/orchestration/config.d/project.yaml new file mode 100644 index 00000000..f957c25d --- /dev/null +++ b/projects/llm_d/orchestration/config.d/project.yaml @@ -0,0 +1,2 @@ +name: llm_d +args: [] diff --git a/projects/llm_d/orchestration/config.yaml b/projects/llm_d/orchestration/config.yaml deleted file mode 100644 index c36dfa60..00000000 --- a/projects/llm_d/orchestration/config.yaml +++ /dev/null @@ -1,3 +0,0 @@ -project: - name: llm_d - args: [] diff --git a/projects/llm_d/tests/test_runtime.py b/projects/llm_d/tests/test_runtime.py index 32f31f8e..018f314d 100644 --- a/projects/llm_d/tests/test_runtime.py +++ b/projects/llm_d/tests/test_runtime.py @@ -62,15 +62,21 @@ def test_load_run_configuration_consolidates_config_d( assert "runtime" in consolidated assert "scheduler_profiles" in consolidated assert "workloads" in consolidated + assert consolidated["project"]["name"] == "llm_d" assert consolidated["runtime"]["default_preset"] == "smoke" + assert consolidated["platform"]["cluster"]["namespace"]["name"] == "forge-llm-d" + assert isinstance(consolidated["platform"]["operators"], dict) def test_namespace_override_is_not_managed(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", '{"namespace":"custom-ns"}') artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() - config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + config = llmd_runtime.load_run_configuration( + cwd=tmp_path, + artifact_dir=artifact_dir, + raw_overrides='{"namespace":"custom-ns"}', + ) assert config.namespace == "custom-ns" assert config.namespace_is_managed is False @@ -91,6 +97,24 @@ def test_default_namespace_comes_from_project_config( assert config.namespace == "forge-llm-d" assert config.namespace_is_managed is False + assert config.platform["cluster"]["namespace"]["prefix"] == "llm-d" + assert "rhods-operator" in config.platform["operators"] + + +def test_load_run_configuration_ignores_runtime_env_vars( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", '{"namespace":"ignored-ns"}') + monkeypatch.setenv("FORGE_PRESET", "benchmark-short") + monkeypatch.setenv("FORGE_JOB_NAME", "ignored-job") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + + assert config.preset_name == "smoke" + assert config.namespace == "forge-llm-d" + assert config.job_name == "local-smoke" def test_write_prepare_inputs_round_trip(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: From d5186a71c083d6c1ca180aadcf38a784868d40d2 Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Tue, 5 May 2026 09:03:09 +0100 Subject: [PATCH 19/21] test: Align llm_d runtime coverage with explicit inputs --- projects/llm_d/tests/test_runtime.py | 52 +++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/projects/llm_d/tests/test_runtime.py b/projects/llm_d/tests/test_runtime.py index 018f314d..6e835b4c 100644 --- a/projects/llm_d/tests/test_runtime.py +++ b/projects/llm_d/tests/test_runtime.py @@ -27,7 +27,6 @@ def 
test_parse_overrides_rejects_unknown_keys() -> None: def test_load_run_configuration_resolves_alias( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() @@ -49,7 +48,6 @@ def test_load_run_configuration_resolves_alias( def test_load_run_configuration_consolidates_config_d( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() @@ -85,7 +83,6 @@ def test_namespace_override_is_not_managed(tmp_path: Path, monkeypatch: pytest.M def test_default_namespace_comes_from_project_config( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() (tmp_path / "fournos_config.yaml").write_text( @@ -184,13 +181,16 @@ def test_write_test_inputs_round_trip(tmp_path: Path, monkeypatch: pytest.Monkey def test_orchestration_prepare_writes_inputs_and_invokes_toolbox( orchestration, tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) captured: dict[str, str] = {} - monkeypatch.setattr(orchestration.llmd_runtime, "load_run_configuration", lambda: config) + monkeypatch.setattr( + orchestration.llmd_runtime, + "load_run_configuration", + lambda **_kwargs: config, + ) monkeypatch.setattr( orchestration, "prepare_toolbox_run", @@ -208,13 +208,16 @@ def test_orchestration_prepare_writes_inputs_and_invokes_toolbox( def test_orchestration_test_writes_inputs_and_invokes_toolbox( orchestration, tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) captured: dict[str, str] = {} - monkeypatch.setattr(orchestration.llmd_runtime, "load_run_configuration", lambda: config) + monkeypatch.setattr( + orchestration.llmd_runtime, + "load_run_configuration", + lambda **_kwargs: config, + ) monkeypatch.setattr( orchestration, "test_toolbox_run", @@ -233,13 +236,16 @@ def test_orchestration_test_writes_inputs_and_invokes_toolbox( def test_orchestration_cleanup_writes_inputs_and_invokes_toolbox( orchestration, tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") artifact_dir = tmp_path / "artifacts" artifact_dir.mkdir() config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) captured: dict[str, str] = {} - monkeypatch.setattr(orchestration.llmd_runtime, "load_run_configuration", lambda: config) + monkeypatch.setattr( + orchestration.llmd_runtime, + "load_run_configuration", + lambda **_kwargs: config, + ) monkeypatch.setattr( orchestration, "cleanup_toolbox_run", @@ -254,6 +260,34 @@ def test_orchestration_cleanup_writes_inputs_and_invokes_toolbox( assert loaded.platform == config.platform +@pytest.mark.parametrize("orchestration", [llmd_ci, llmd_cli]) +def test_orchestration_load_runtime_configuration_reads_env( + orchestration, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_PRESET", "smoke-precise") + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", '{"namespace":"custom-ns"}') + 
monkeypatch.setenv("FORGE_JOB_NAME", "job-from-env") + captured: dict[str, str | None] = {} + sentinel = object() + + def fake_load_run_configuration(**kwargs): + captured.update(kwargs) + return sentinel + + monkeypatch.setattr( + orchestration.llmd_runtime, "load_run_configuration", fake_load_run_configuration + ) + + result = orchestration.load_runtime_configuration() + + assert result is sentinel + assert captured == { + "requested_preset": "smoke-precise", + "raw_overrides": '{"namespace":"custom-ns"}', + "job_name": "job-from-env", + } + + def test_render_inference_service_injects_model_and_scheduler_profile( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: From 9a5f3332c3c5e8e83419b284e9954438da81b022 Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Tue, 5 May 2026 09:05:07 +0100 Subject: [PATCH 20/21] refactor: Deduplicate DSL toolbox path helpers --- projects/core/dsl/log.py | 30 +++++------------------------- projects/core/dsl/runtime.py | 23 +++++++---------------- 2 files changed, 12 insertions(+), 41 deletions(-) diff --git a/projects/core/dsl/log.py b/projects/core/dsl/log.py index b93a5076..dc28ffab 100644 --- a/projects/core/dsl/log.py +++ b/projects/core/dsl/log.py @@ -16,20 +16,17 @@ def setup_clean_logger(name: str): logger = logging.getLogger(name) logger.setLevel(logging.INFO) - # Only configure if not already configured if not logger.handlers: - # Create console handler with clean format console_handler = logging.StreamHandler() console_handler.setLevel(logging.INFO) console_handler.setFormatter(logging.Formatter("%(message)s")) logger.addHandler(console_handler) - logger.propagate = False # Don't propagate to root logger + logger.propagate = False return logger -# Configure clean logging for DSL operations logger = setup_clean_logger("DSL") @@ -45,30 +42,23 @@ def log_task_header(task_name: str, task_doc: str, rel_filename: str, line_no: i def log_execution_banner(function_args: dict = None, log_file: str = None): """Log the execution banner with function info and arguments""" - # Get the caller's filename and function name for the header frame = inspect.currentframe() - caller_frame = ( - frame.f_back.f_back - ) # Go back 2 frames (this func -> execute_tasks -> actual caller) + caller_frame = frame.f_back.f_back filename = caller_frame.f_code.co_filename rel_filename = _get_forge_relative_path(filename) - - # Use parent directory name as function name for toolbox operations function_name = _get_toolbox_function_name(filename) - # Print execution header logger.info("") logger.info("===============================================================================") logger.info(f"| FILE: {rel_filename}") logger.info(f"| COMMAND: {function_name}") if function_args: - # Display arguments in YAML format logger.info("| ARGUMENTS:") for key, value in function_args.items(): - if key == "function_args": # Skip the function_args parameter itself + if key == "function_args": continue if value is None: continue @@ -83,19 +73,13 @@ def log_execution_banner(function_args: dict = None, log_file: str = None): def log_completion_banner(function_args: dict = None, status: str = "SUCCESS"): """Log the completion banner with function info and completion status""" - # Get the caller's filename and function name for the header frame = inspect.currentframe() - caller_frame = ( - frame.f_back.f_back - ) # Go back 2 frames (this func -> execute_tasks -> actual caller) + caller_frame = frame.f_back.f_back filename = caller_frame.f_code.co_filename rel_filename = 
_get_forge_relative_path(filename) - - # Use parent directory name as function name for toolbox operations function_name = _get_toolbox_function_name(filename) - # Print completion header logger.info("") logger.info("===============================================================================") logger.info(f"| {rel_filename}") @@ -115,8 +99,4 @@ def _get_forge_relative_path(filename): def _get_toolbox_function_name(filename): """Extract toolbox function name from file path (parent directory name)""" - filename_path = Path(filename) - - # For paths like projects/llm_d/toolbox/capture_llmisvc_state/main.py - # Return the parent directory name: capture_llmisvc_state - return filename_path.parent.name + return Path(filename).parent.name diff --git a/projects/core/dsl/runtime.py b/projects/core/dsl/runtime.py index c8f807db..1c40c1df 100644 --- a/projects/core/dsl/runtime.py +++ b/projects/core/dsl/runtime.py @@ -15,7 +15,13 @@ from projects.core.library.run import SignalError from .context import create_task_parameters -from .log import log_completion_banner, log_execution_banner, logger +from .log import ( + _get_forge_relative_path, + _get_toolbox_function_name, + log_completion_banner, + log_execution_banner, + logger, +) from .script_manager import get_script_manager # Import from task.py to avoid circular imports @@ -401,18 +407,3 @@ def _generate_restart_script(function_args: dict, caller_frame, meta_dir): os.chmod(restart_file, 0o755) logger.debug(f"Generated restart script: {restart_file}") - - -def _get_forge_relative_path(filename): - """Get file path relative to FORGE home directory (forge root)""" - - return Path(filename).relative_to(env.FORGE_HOME) - - -def _get_toolbox_function_name(filename): - """Extract toolbox function name from file path (parent directory name)""" - filename_path = Path(filename) - - # For paths like projects/llm_d/toolbox/capture_llmisvc_state/main.py - # Return the parent directory name: capture_llmisvc_state - return filename_path.parent.name From 629d3bcaf4a898471d4eadbab903e4cacdf9a974 Mon Sep 17 00:00:00 2001 From: Alberto Perdomo Date: Tue, 5 May 2026 09:07:18 +0100 Subject: [PATCH 21/21] docs: Refresh llm_d layout references --- projects/llm_d/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/llm_d/README.md b/projects/llm_d/README.md index fd443121..82a108ac 100644 --- a/projects/llm_d/README.md +++ b/projects/llm_d/README.md @@ -10,7 +10,7 @@ The current implementation is intentionally narrow: Configuration layout: -- base config: [`orchestration/config.yaml`](./orchestration/config.yaml) +- project config chunk: [`orchestration/config.d/project.yaml`](./orchestration/config.d/project.yaml) - config chunks: [`orchestration/config.d`](./orchestration/config.d) - presets: [`orchestration/presets.d`](./orchestration/presets.d) - manifests: [`orchestration/manifests`](./orchestration/manifests) @@ -19,7 +19,7 @@ Main entrypoints: - CI phase wrapper: [`orchestration/ci.py`](./orchestration/ci.py) - CLI wrapper: [`orchestration/cli.py`](./orchestration/cli.py) -- Shared runtime/config loader: [`orchestration/llmd_runtime.py`](./orchestration/llmd_runtime.py) +- Shared runtime/config loader: [`runtime/llmd_runtime.py`](./runtime/llmd_runtime.py) - Toolbox prepare command: [`toolbox/prepare/main.py`](./toolbox/prepare/main.py) - Toolbox test command: [`toolbox/test/main.py`](./toolbox/test/main.py) - Toolbox cleanup command: [`toolbox/cleanup/main.py`](./toolbox/cleanup/main.py)
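
A minimal sketch of how the explicit-parameter seam from patches 17-19 can be driven outside the Forge CI wrappers. After these patches, the FORGE_* environment reads live only in ci.py/cli.py, and load_run_configuration is a function of its keyword arguments alone. The artifact path, override value, and job name below are illustrative values, not repository defaults:

    import os
    from pathlib import Path

    from projects.llm_d.runtime import llmd_runtime, phase_inputs

    # Explicit keyword arguments replace the FORGE_* environment reads that
    # patch 17 removed from load_run_configuration; passing None falls back
    # to fournos_config.yaml and the project defaults.
    config = llmd_runtime.load_run_configuration(
        cwd=Path.cwd(),
        artifact_dir=Path("/tmp/llm-d-artifacts"),  # exported as ARTIFACT_DIR before init()
        requested_preset=os.environ.get("FORGE_PRESET"),  # e.g. "smoke"
        raw_overrides='{"namespace": "custom-ns"}',  # validated against runtime.allowed_override_keys
        job_name="local-dev",  # otherwise derived as local-<preset>
    )

    # Each toolbox phase then consumes a serialized inputs file rather than
    # the live ResolvedConfig object.
    inputs_file = phase_inputs.write_test_inputs(config)

This is also the seam the tests in patch 19 exercise: the orchestration wrappers are checked for forwarding the environment values, while load_run_configuration itself is exercised with explicit arguments only.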