diff --git a/.github/workflows/test_toolbox_dsl.yml b/.github/workflows/test_toolbox_dsl.yml index 0946fca0..73faeadb 100644 --- a/.github/workflows/test_toolbox_dsl.yml +++ b/.github/workflows/test_toolbox_dsl.yml @@ -1,5 +1,5 @@ -# Unit tests for projects/core/dsl (task decorators, execute_tasks, failure/always/skip). -name: Toolbox DSL tests +# Python tests for repo-managed suites discovered via pyproject testpaths. +name: Python test suites on: pull_request: @@ -29,9 +29,9 @@ jobs: run: | set -o errexit python -m pip install --upgrade pip - python -m pip install pytest pyyaml jinja2 jsonpath_ng + python -m pip install .[dev] - - name: Run projects/core/tests + - name: Run pytest suites run: | set -o errexit # Tree + docstrings (what is being tested), then execute with one line per test + result. diff --git a/projects/core/dsl/log.py b/projects/core/dsl/log.py index b5c911de..dc28ffab 100644 --- a/projects/core/dsl/log.py +++ b/projects/core/dsl/log.py @@ -16,20 +16,17 @@ def setup_clean_logger(name: str): logger = logging.getLogger(name) logger.setLevel(logging.INFO) - # Only configure if not already configured if not logger.handlers: - # Create console handler with clean format console_handler = logging.StreamHandler() console_handler.setLevel(logging.INFO) console_handler.setFormatter(logging.Formatter("%(message)s")) logger.addHandler(console_handler) - logger.propagate = False # Don't propagate to root logger + logger.propagate = False return logger -# Configure clean logging for DSL operations logger = setup_clean_logger("DSL") @@ -45,30 +42,23 @@ def log_task_header(task_name: str, task_doc: str, rel_filename: str, line_no: i def log_execution_banner(function_args: dict = None, log_file: str = None): """Log the execution banner with function info and arguments""" - # Get the caller's filename and function name for the header frame = inspect.currentframe() - caller_frame = ( - frame.f_back.f_back - ) # Go back 2 frames (this func -> execute_tasks -> actual caller) + caller_frame = frame.f_back.f_back filename = caller_frame.f_code.co_filename rel_filename = _get_forge_relative_path(filename) - - # Use parent directory name as function name for toolbox operations function_name = _get_toolbox_function_name(filename) - # Print execution header logger.info("") logger.info("===============================================================================") logger.info(f"| FILE: {rel_filename}") logger.info(f"| COMMAND: {function_name}") if function_args: - # Display arguments in YAML format logger.info("| ARGUMENTS:") for key, value in function_args.items(): - if key == "function_args": # Skip the function_args parameter itself + if key == "function_args": continue if value is None: continue @@ -83,19 +73,13 @@ def log_execution_banner(function_args: dict = None, log_file: str = None): def log_completion_banner(function_args: dict = None, status: str = "SUCCESS"): """Log the completion banner with function info and completion status""" - # Get the caller's filename and function name for the header frame = inspect.currentframe() - caller_frame = ( - frame.f_back.f_back - ) # Go back 2 frames (this func -> execute_tasks -> actual caller) + caller_frame = frame.f_back.f_back filename = caller_frame.f_code.co_filename rel_filename = _get_forge_relative_path(filename) - - # Use parent directory name as function name for toolbox operations function_name = _get_toolbox_function_name(filename) - # Print completion header logger.info("") 
logger.info("===============================================================================") logger.info(f"| {rel_filename}") @@ -115,8 +99,4 @@ def _get_forge_relative_path(filename): def _get_toolbox_function_name(filename): """Extract toolbox function name from file path (parent directory name)""" - filename_path = Path(filename) - - # For paths like projects/llm_d/toolbox/capture_isvc_state/main.py - # Return the parent directory name: capture_isvc_state - return filename_path.parent.name + return Path(filename).parent.name diff --git a/projects/core/dsl/runtime.py b/projects/core/dsl/runtime.py index d05d91ac..4b986805 100644 --- a/projects/core/dsl/runtime.py +++ b/projects/core/dsl/runtime.py @@ -16,7 +16,13 @@ from projects.core.library.run import SignalError from .context import create_task_parameters -from .log import log_completion_banner, log_execution_banner, logger +from .log import ( + _get_forge_relative_path, + _get_toolbox_function_name, + log_completion_banner, + log_execution_banner, + logger, +) from .script_manager import get_script_manager # Import from task.py to avoid circular imports @@ -471,18 +477,3 @@ def _generate_restart_script(function_args: dict, caller_frame, meta_dir): os.chmod(restart_file, 0o755) logger.debug(f"Generated restart script: {restart_file}") - - -def _get_forge_relative_path(filename): - """Get file path relative to FORGE home directory (forge root)""" - - return Path(filename).relative_to(env.FORGE_HOME) - - -def _get_toolbox_function_name(filename): - """Extract toolbox function name from file path (parent directory name)""" - filename_path = Path(filename) - - # For paths like projects/llm_d/toolbox/capture_isvc_state/main.py - # Return the parent directory name: capture_isvc_state - return filename_path.parent.name diff --git a/projects/core/library/config.py b/projects/core/library/config.py index d84005c4..b8e2c79b 100644 --- a/projects/core/library/config.py +++ b/projects/core/library/config.py @@ -472,8 +472,6 @@ def init(orchestration_dir, *, apply_config_overrides=True): project = Config(config_path) - env.ARTIFACT_DIR / VARIABLE_OVERRIDES_FILENAME - if not apply_config_overrides: logger.info( "config.init: running with 'apply_config_overrides=False', " @@ -489,3 +487,20 @@ def init(orchestration_dir, *, apply_config_overrides=True): project.apply_config_overrides() project.apply_presets_from_project_args() project.apply_config_overrides() # reapply so that the value overrides are applied last + + +def reload(orchestration_dir, *, apply_config_overrides=True): + global project + + project = None + + artifact_config = env.ARTIFACT_DIR / "config.yaml" + if artifact_config.exists(): + artifact_config.unlink() + + presets_applied = env.ARTIFACT_DIR / "presets_applied" + if presets_applied.exists(): + presets_applied.unlink() + + init(orchestration_dir, apply_config_overrides=apply_config_overrides) + return project diff --git a/projects/llm_d/README.md b/projects/llm_d/README.md index f254277f..82a108ac 100644 --- a/projects/llm_d/README.md +++ b/projects/llm_d/README.md @@ -1,304 +1,25 @@ -# Skeleton Project +# llm_d -This is a template/skeleton project that demonstrates how to create a new project within the **FORGE** test harness framework. +`llm_d` is the Forge project for validating downstream llm-d on RHOAI. 
-## Overview +The current implementation is intentionally narrow: -This skeleton shows the essential structure and patterns for building projects that comply with FORGE's constitutional principles: +- target only downstream `LLMInferenceService` +- keep the public interface compatible with current Fournos phase execution +- use checked-in config chunks and manifests instead of a large mutable config surface -- **CI-First Testing**: Structured phases ensure consistent CI integration -- **Observable Measurements**: Command execution logging and timing -- **Reproducible Results**: Deterministic operations with clear success/failure -- **Scale-Aware Design**: Efficient synchronous operations -- **AI Platform Specificity**: OpenShift AI focused testing patterns +Configuration layout: -## Project Structure +- project config chunk: [`orchestration/config.d/project.yaml`](./orchestration/config.d/project.yaml) +- config chunks: [`orchestration/config.d`](./orchestration/config.d) +- presets: [`orchestration/presets.d`](./orchestration/presets.d) +- manifests: [`orchestration/manifests`](./orchestration/manifests) -``` -skeleton/ -├── orchestration/ -│ └── ci.py # Main CI script with Click-based CLI -├── README.md # This documentation -├── config.yaml # Project configuration (optional) -├── tests/ # Test scripts and data (optional) -└── scripts/ # Helper scripts (optional) -``` +Main entrypoints: -## Quick Start - -### 1. Run Individual Phases - -```bash -# From the FORGE root directory - -# Prepare environment -./run_ci skeleton ci prepare - -# Run tests -./run_ci skeleton ci test - -# Clean up -./run_ci skeleton ci cleanup -``` - -### 2. Development Options - -```bash -# Verbose output -./run_ci skeleton ci --verbose test - -# See all available commands -./run_ci skeleton ci --help -``` - -## Creating Your Own Project - -### Step 1: Copy Skeleton - -```bash -cp -r projects/skeleton projects/your-project-name -cd projects/your-project-name -``` - -### Step 2: Customize - -1. **Update `orchestration/ci.py`**: - - Change `self.project_name` to your project name - - Replace placeholder `echo` commands with actual test logic - - Update the CLI description and help text - -2. **Update `README.md`**: - - Document your project's purpose and usage - - Add specific setup instructions - -3. 
**Add configuration** (optional): - - Create `config.yaml` for project-specific settings - - Reference it in your CI script - -### Step 3: Implement Test Logic - -Replace the example `echo` commands with your actual test logic: - -#### Prepare Phase -```python -def prepare(self): - self.log("Starting prepare phase...") - - # Example: Install dependencies - if not self.execute_command( - "oc apply -f manifests/setup.yaml", - "Deploy setup resources" - ): - return 1 - - # Example: Validate environment - if not self.execute_command( - "oc get nodes", - "Check cluster nodes" - ): - return 1 - - self.log("Prepare phase completed!", "success") - return 0 -``` - -#### Test Phase -```python -def test(self): - self.log("Starting test phase...") - - # Example: Run performance tests - if not self.execute_command( - "python scripts/performance_test.py --config config.yaml", - "Running performance tests" - ): - return 1 - - # Example: Run functional tests - if not self.execute_command( - "pytest tests/ -v", - "Running functional tests" - ): - return 1 - - self.log("Test phase completed!", "success") - return 0 -``` - -#### Cleanup Phase -```python -def cleanup(self): - self.log("Starting cleanup phase...") - - # Example: Remove test resources - self.execute_command( - "oc delete -f manifests/", - "Cleanup test resources" - ) - - # Example: Generate reports - self.execute_command( - "python scripts/generate_report.py", - "Generate final report" - ) - - self.log("Cleanup phase completed!", "success") - return 0 -``` - -## Key Patterns - -### 1. Phase Structure - -Each project should implement these standard phases: -- **prepare**: Set up environment and dependencies -- **test**: Execute main testing logic -- **cleanup**: Clean up resources and finalize - -### 2. Command Execution - -Use the `execute_command` method for consistent execution and logging: - -```python -# Basic command execution -success = self.execute_command("your-command", "Description") -if not success: - return 1 # Exit with error - -# Command with complex logic -result = self.execute_command( - "kubectl get pods -o json", - "Check pod status" -) -``` - -### 3. Error Handling - -Always check command results and handle failures appropriately: - -```python -if not self.execute_command("critical-command", "Critical step"): - self.log("Critical step failed!", "error") - return 1 # Exit with error code - -# Cleanup commands can be non-critical -self.execute_command("cleanup-command", "Optional cleanup") -# Continue regardless of success -``` - -### 4. Logging - -Use the logging methods for consistent output: - -```python -self.log("Starting operation", "info") # ℹ️ [project] Starting operation -self.log("Operation completed", "success") # ✅ [project] Operation completed -self.log("Warning occurred", "warning") # ⚠️ [project] Warning occurred -self.log("Error occurred", "error") # ❌ [project] Error occurred -``` - -### 5. 
Verbose Mode - -The framework automatically handles verbose mode: - -```python -# In verbose mode, command details are automatically shown -# Your execute_command calls will show: -# - Command being executed -# - Command output (if any) -# - Execution duration -``` - -## Click CLI Structure - -The skeleton uses Click groups to organize commands: - -```python -@click.group() -@click.option('--verbose', '-v', is_flag=True, help='Enable verbose output') -@click.pass_context -def cli(ctx, verbose): - """Project CI Operations for FORGE.""" - ctx.ensure_object(types.SimpleNamespace) - ctx.obj.verbose = verbose - ctx.obj.runner = YourProjectTestRunner(verbose) - -@cli.command() -@click.pass_context -def prepare(ctx): - """Prepare phase - Set up environment and dependencies.""" - runner = ctx.obj.runner - exit_code = runner.prepare() - sys.exit(exit_code) -``` - -## Best Practices - -### 1. Constitutional Compliance - -- ✅ **CI-First**: Design for automated execution without user interaction -- ✅ **Observable**: Log important events and command execution -- ✅ **Reproducible**: Use deterministic operations and clear error codes -- ✅ **Scale-Aware**: Keep operations efficient and focused -- ✅ **AI Platform Specific**: Focus on OpenShift AI scenarios and tooling - -### 2. Error Handling - -- Always validate prerequisites in prepare phase -- Check command results and fail fast on errors -- Provide meaningful error messages with context -- Clean up resources even when tests fail (use try/except if needed) - -### 3. Command Design - -- Make commands idempotent when possible -- Use meaningful descriptions for all execute_command calls -- Test commands locally before adding to CI -- Consider timeouts for long-running operations - -### 4. Configuration - -- Keep project configuration in `config.yaml` or environment variables -- Make tests configurable for different environments -- Document all configuration options -- Use sensible defaults - -## Testing the Skeleton - -```bash -# Test individual phases -./run_ci skeleton ci prepare -./run_ci skeleton ci test -./run_ci skeleton ci cleanup - -# Test with verbose output -./run_ci skeleton ci --verbose prepare - -# See all available commands -./run_ci skeleton ci --help -``` - -## Integration with CI Systems - -The skeleton is designed for easy CI integration: - -```bash -# In your CI pipeline -./run_ci your-project ci prepare || exit 1 -./run_ci your-project ci test || exit 1 -./run_ci your-project ci cleanup # Always run cleanup -``` - -## Next Steps - -1. **Study the Code**: Review `orchestration/ci.py` to understand the patterns -2. **Copy and Customize**: Create your own project based on this skeleton -3. **Implement Tests**: Replace placeholder `echo` commands with real test logic -4. **Test Integration**: Verify your project works with the run_ci entrypoint -5. 
**Add Documentation**: Document your specific test scenarios and setup - -## Support - -- Review other projects in `projects/` for more examples -- Check the main FORGE documentation -- Study the run_ci entrypoint code in `projects/core/ci_entrypoint/` +- CI phase wrapper: [`orchestration/ci.py`](./orchestration/ci.py) +- CLI wrapper: [`orchestration/cli.py`](./orchestration/cli.py) +- Shared runtime/config loader: [`runtime/llmd_runtime.py`](./runtime/llmd_runtime.py) +- Toolbox prepare command: [`toolbox/prepare/main.py`](./toolbox/prepare/main.py) +- Toolbox test command: [`toolbox/test/main.py`](./toolbox/test/main.py) +- Toolbox cleanup command: [`toolbox/cleanup/main.py`](./toolbox/cleanup/main.py) diff --git a/projects/llm_d/orchestration/ci.py b/projects/llm_d/orchestration/ci.py old mode 100755 new mode 100644 index 7623510f..bc5ae6f4 --- a/projects/llm_d/orchestration/ci.py +++ b/projects/llm_d/orchestration/ci.py @@ -4,13 +4,46 @@ """ +import os import types import click -import prepare_llmd -import test_llmd from projects.core.library import ci as ci_lib +from projects.llm_d.runtime import llmd_runtime, phase_inputs +from projects.llm_d.toolbox.cleanup.main import run as cleanup_toolbox_run +from projects.llm_d.toolbox.prepare.main import run as prepare_toolbox_run +from projects.llm_d.toolbox.test.main import run as test_toolbox_run + + +def init_runtime() -> None: + llmd_runtime.init() + + +def load_runtime_configuration(): + return llmd_runtime.load_run_configuration( + requested_preset=os.environ.get("FORGE_PRESET"), + raw_overrides=os.environ.get("FORGE_CONFIG_OVERRIDES"), + job_name=os.environ.get("FORGE_JOB_NAME"), + ) + + +def run_prepare_phase() -> int: + config = load_runtime_configuration() + inputs_file = phase_inputs.write_prepare_inputs(config) + return prepare_toolbox_run(inputs_file=str(inputs_file)) + + +def run_test_phase() -> int: + config = load_runtime_configuration() + inputs_file = phase_inputs.write_test_inputs(config) + return test_toolbox_run(inputs_file=str(inputs_file)) + + +def run_cleanup_phase() -> int: + config = load_runtime_configuration() + inputs_file = phase_inputs.write_cleanup_inputs(config) + return cleanup_toolbox_run(inputs_file=str(inputs_file)) @click.group() @@ -19,31 +52,31 @@ def main(ctx): """LLM-D Project CI Operations for FORGE.""" ctx.ensure_object(types.SimpleNamespace) - test_llmd.init() + init_runtime() @main.command() @click.pass_context @ci_lib.safe_ci_command -def prepare(ctx): +def prepare(ctx) -> int: """Prepare phase - Set up environment and dependencies.""" - return prepare_llmd.prepare() + return run_prepare_phase() @main.command() @click.pass_context @ci_lib.safe_ci_command -def test(ctx): +def test(ctx) -> int: """Test phase - Execute the main testing logic.""" - return test_llmd.test() + return run_test_phase() @main.command() @click.pass_context @ci_lib.safe_ci_command -def pre_cleanup(ctx): +def pre_cleanup(ctx) -> int: """Cleanup phase - Clean up resources and finalize.""" - return prepare_llmd.cleanup() + return run_cleanup_phase() if __name__ == "__main__": diff --git a/projects/llm_d/orchestration/cli.py b/projects/llm_d/orchestration/cli.py old mode 100755 new mode 100644 index def09477..fdb84fa9 --- a/projects/llm_d/orchestration/cli.py +++ b/projects/llm_d/orchestration/cli.py @@ -1,63 +1,88 @@ #!/usr/bin/env python3 -""" -LLM-D Project CLI Operations -""" import logging -import sys +import os import types import click -import prepare_llmd -import test_llmd from projects.core.library.cli import 
safe_cli_command +from projects.llm_d.runtime import llmd_runtime, phase_inputs +from projects.llm_d.toolbox.cleanup.main import run as cleanup_toolbox_run +from projects.llm_d.toolbox.prepare.main import run as prepare_toolbox_run +from projects.llm_d.toolbox.test.main import run as test_toolbox_run logger = logging.getLogger(__name__) +def init_runtime() -> None: + llmd_runtime.init() + + +def load_runtime_configuration(): + return llmd_runtime.load_run_configuration( + requested_preset=os.environ.get("FORGE_PRESET"), + raw_overrides=os.environ.get("FORGE_CONFIG_OVERRIDES"), + job_name=os.environ.get("FORGE_JOB_NAME"), + ) + + +def run_prepare_phase() -> int: + config = load_runtime_configuration() + inputs_file = phase_inputs.write_prepare_inputs(config) + return prepare_toolbox_run(inputs_file=str(inputs_file)) + + +def run_test_phase() -> int: + config = load_runtime_configuration() + inputs_file = phase_inputs.write_test_inputs(config) + return test_toolbox_run(inputs_file=str(inputs_file)) + + +def run_cleanup_phase() -> int: + config = load_runtime_configuration() + inputs_file = phase_inputs.write_cleanup_inputs(config) + return cleanup_toolbox_run(inputs_file=str(inputs_file)) + + @click.group() @click.pass_context def main(ctx): - """LLM-D Project CI Operations for FORGE.""" + """LLM-D Project CLI Operations for FORGE.""" ctx.ensure_object(types.SimpleNamespace) - test_llmd.init() + init_runtime() @main.command() @click.pass_context @safe_cli_command -def prepare(ctx): +def prepare(ctx) -> int: """Prepare phase - Set up environment and dependencies.""" - exit_code = prepare_llmd.prepare() - sys.exit(exit_code) + return run_prepare_phase() @main.command() @click.pass_context @safe_cli_command -def test(ctx): +def test(ctx) -> int: """Test phase - Execute the main testing logic.""" - exit_code = test_llmd.test() - sys.exit(exit_code) + return run_test_phase() @main.command() @click.pass_context @safe_cli_command -def pre_cleanup(ctx): +def pre_cleanup(ctx) -> int: """Cleanup phase - Clean up resources and finalize.""" - exit_code = prepare_llmd.cleanup() - sys.exit(exit_code) + return run_cleanup_phase() @main.command() @click.pass_context @safe_cli_command -def post_cleanup(ctx): +def post_cleanup(ctx) -> int: """Cleanup phase - Clean up resources and finalize.""" - exit_code = prepare_llmd.cleanup() - sys.exit(exit_code) + return run_cleanup_phase() if __name__ == "__main__": diff --git a/projects/llm_d/orchestration/config.d/model_cache.yaml b/projects/llm_d/orchestration/config.d/model_cache.yaml new file mode 100644 index 00000000..eae01772 --- /dev/null +++ b/projects/llm_d/orchestration/config.d/model_cache.yaml @@ -0,0 +1,25 @@ +enabled: true +marker_filename: .forge-model-cache.json + +pvc: + name_prefix: llm-d-model + size: 15Gi + access_mode: ReadWriteOnce + storage_class_name: null + model_directory_name: model + +download: + wait_timeout_seconds: 7200 + poll_interval_seconds: 15 + pod_image_pull_policy: IfNotPresent + +hf: + downloader_image: registry.access.redhat.com/ubi9/python-311 + token_secret_name: null + token_secret_key: token + +oci: + extractor_image: registry.redhat.io/openshift4/ose-cli:v4.19 + registry_auth_secret_name: null + registry_auth_secret_key: .dockerconfigjson + image_path: / diff --git a/projects/llm_d/orchestration/config.d/models.yaml b/projects/llm_d/orchestration/config.d/models.yaml new file mode 100644 index 00000000..4334cf4a --- /dev/null +++ b/projects/llm_d/orchestration/config.d/models.yaml @@ -0,0 +1,32 @@ +qwen3-0-6b: + 
served_model_name: Qwen/Qwen3-0.6B + uri: hf://Qwen/Qwen3-0.6B + cache: + pvc_size: 10Gi + access_mode: ReadWriteOnce + resources: + requests: + cpu: "4" + memory: 16Gi + nvidia.com/gpu: "1" + limits: + cpu: "4" + memory: 16Gi + nvidia.com/gpu: "1" + +llama-3-1-8b-instruct-fp8: + served_model_name: llama-3-1-8b-instruct-fp8 + uri: oci://registry.redhat.io/rhelai1/modelcar-llama-3-1-8b-instruct-fp8-dynamic:1.5 + cache: + pvc_size: 40Gi + access_mode: ReadWriteOnce + oci_image_path: / + resources: + requests: + cpu: "4" + memory: 8Gi + nvidia.com/gpu: "1" + limits: + cpu: "4" + memory: 8Gi + nvidia.com/gpu: "1" diff --git a/projects/llm_d/orchestration/config.d/platform.yaml b/projects/llm_d/orchestration/config.d/platform.yaml new file mode 100644 index 00000000..6f823eba --- /dev/null +++ b/projects/llm_d/orchestration/config.d/platform.yaml @@ -0,0 +1,86 @@ +cluster: + minimum_openshift_version: "4.19.9" + namespace: + name: forge-llm-d + prefix: llm-d + max_length: 63 + cleanup_timeout_seconds: 900 + gpu_node_label_selector: nvidia.com/gpu.present=true + nfd_gpu_detection_labels: + - feature.node.kubernetes.io/pci-10de.present + - feature.node.kubernetes.io/pci-0302_10de.present + - feature.node.kubernetes.io/pci-0300_10de.present + +operators: + openshift-cert-manager-operator: + display_name: OpenShift Cert Manager + namespace: openshift-cert-manager-operator + channel: stable-v1.18 + source: redhat-operators + wait_timeout_seconds: 900 + leader-worker-set: + display_name: Leader Worker Set + namespace: openshift-lws + channel: stable + source: redhat-operators + wait_timeout_seconds: 900 + nfd: + display_name: Node Feature Discovery + namespace: openshift-nfd + channel: stable + source: redhat-operators + wait_timeout_seconds: 900 + bootstrap_crd: nodefeaturediscoveries.nfd.openshift.io + bootstrap_manifest: manifests/nfd-nodefeaturediscovery.yaml + gpu-operator-certified: + display_name: NVIDIA GPU Operator + namespace: nvidia-gpu-operator + channel: stable + source: certified-operators + wait_timeout_seconds: 1800 + bootstrap_crd: clusterpolicies.nvidia.com + bootstrap_manifest: manifests/gpu-clusterpolicy.yaml + rhods-operator: + display_name: Red Hat OpenShift AI + namespace: redhat-ods-operator + channel: stable-3.x + source: redhat-operators + wait_timeout_seconds: 1800 + +rhoai: + namespace: redhat-ods-applications + datasciencecluster_name: default-dsc + datasciencecluster_template: manifests/datasciencecluster.yaml + wait_timeout_seconds: 1800 + required_crds_before_dsc: + - datascienceclusters.datasciencecluster.opendatahub.io + required_crds_after_dsc: + - llminferenceservices.serving.kserve.io + +gateway: + namespace: openshift-ingress + name: openshift-ai-inference + gateway_class_name: data-science-gateway-class + status_address_name: gateway-external + create_if_missing: true + manifest_template: manifests/gateway.yaml + wait_timeout_seconds: 600 + +inference_service: + name: llm-d + template: manifests/llminferenceservice.yaml + workload_deployment_name_suffix: -kserve + pod_appearance_timeout_seconds: 600 + ready_timeout_seconds: 1800 + delete_timeout_seconds: 900 + +artifacts: + capture_namespace_events: true + +smoke: + job_name: llm-d-smoke + client_image: curlimages/curl:8.11.1 + endpoint_path: /v1/completions + request_retries: 30 + request_retry_delay_seconds: 10 + request_timeout_seconds: 60 diff --git a/projects/llm_d/orchestration/config.d/project.yaml b/projects/llm_d/orchestration/config.d/project.yaml new file mode 100644 index 00000000..f957c25d --- 
/dev/null +++ b/projects/llm_d/orchestration/config.d/project.yaml @@ -0,0 +1,2 @@ +name: llm_d +args: [] diff --git a/projects/llm_d/orchestration/config.d/runtime.yaml b/projects/llm_d/orchestration/config.d/runtime.yaml new file mode 100644 index 00000000..c8715ccb --- /dev/null +++ b/projects/llm_d/orchestration/config.d/runtime.yaml @@ -0,0 +1,8 @@ +default_preset: smoke +allowed_override_keys: + - namespace +selected_preset: smoke +model_key: qwen3-0-6b +scheduler_profile_key: approximate +smoke_request_key: default +benchmark_key: null diff --git a/projects/llm_d/orchestration/config.d/scheduler_profiles.yaml b/projects/llm_d/orchestration/config.d/scheduler_profiles.yaml new file mode 100644 index 00000000..b3bca162 --- /dev/null +++ b/projects/llm_d/orchestration/config.d/scheduler_profiles.yaml @@ -0,0 +1,9 @@ +approximate: + config_path: scheduler_profiles/approximate.yaml + +precise: + config_path: scheduler_profiles/precise.yaml + +# Compatibility alias for earlier llm_d presets. +approximate-prefix-cache: + config_path: scheduler_profiles/approximate.yaml diff --git a/projects/llm_d/orchestration/config.d/workloads.yaml b/projects/llm_d/orchestration/config.d/workloads.yaml new file mode 100644 index 00000000..1ce9bdc6 --- /dev/null +++ b/projects/llm_d/orchestration/config.d/workloads.yaml @@ -0,0 +1,19 @@ +smoke_requests: + default: + prompt: San Francisco is a + max_tokens: 50 + temperature: 0.7 + +benchmarks: + short: + job_name: guidellm-benchmark + image: ghcr.io/vllm-project/guidellm:v0.5.4 + pvc_size: 1Gi + timeout_seconds: 900 + rate: 1 + args: + backend_type: openai_http + rate_type: concurrent + max_seconds: 120 + sample_requests: 20 + data: prompt_tokens=256,output_tokens=128 diff --git a/projects/llm_d/orchestration/config.yaml b/projects/llm_d/orchestration/config.yaml deleted file mode 100644 index e7367e8f..00000000 --- a/projects/llm_d/orchestration/config.yaml +++ /dev/null @@ -1,230 +0,0 @@ -prepare: - skip: false - namespace: - name: llm-d-project - - operators: - skip: false - list: - - name: "Red Hat Connectivity Link" - catalog: redhat-operators - operator: rhcl-operator - namespace: all - enabled: false - - - name: "OpenShift Cert Manager" - catalog: redhat-operators - operator: openshift-cert-manager-operator - namespace: openshift-cert-manager-operator - enabled: true - - - name: "Leader Worker Set" - catalog: redhat-operators - operator: leader-worker-set - namespace: openshift-lws - deploy_cr: true - enabled: true - - - name: "Node Feature Discovery" - catalog: redhat-operators - operator: nfd - namespace: openshift-nfd - deploy_cr: 1 - enabled: true - - - name: "NVIDIA GPU Operator" - catalog: certified-operators - operator: gpu-operator-certified - namespace: nvidia-gpu-operator - deploy_cr: true - enabled: true - - - name: "Grafana Operator" - catalog: community-operators - operator: grafana-operator - namespace: grafana-operator - enabled: true - extra_args: - all_namespaces: true - - cluster: - skip: false - nodes: - auto_scale: false - auto_scale_down_on_exit: false - instance_type: gx3-16x80x1l4 - count: 2 - - rhoai: - skip: false - image: "quay.io/rhoai/rhoai-fbc-fragment" - tag: "rhoai-3.3@sha256:f6e7db613cd040e53da2d47850477a9b914de18979adaaac47e15dc7c76f8a76" - channel: "stable-3.x" - datasciencecluster: - enable: "[kserve]" - extra_settings: '{"spec.components.kserve.rawDeploymentServiceConfig": "Headless"}' - - gateway: - skip: false - name: openshift-ai-inference # NOTE: Should not be changed for the time being - - grafana: - skip: 
false - namespace: grafana - datasources: - - grafana/datasource.yaml - dashboards_dir: grafana/dashboards - - monitoring: - skip: false - namespaces: - - "@prepare.namespace.name" - - gpu: - wait_for_readiness: false - - preload: - skip: false - extra_images: {} - node_selector_key: nvidia.com/gpu.present - node_selector_value: "true" - - pvc: - enabled: true - size: 2000Gi - name: storage - access_mode: ReadWriteOnce - storage_class: null - - model_downloader: - image: ghcr.io/opendatahub-io/rhaii-on-xks/kserve-storage-initializer:e6b5db0@sha256:b305264fe2211be2c6063500c4c11da79e8357af4b34dd8567b0d8e8dea7e1d4 - - cleanup: - skip: false - -models: - facebook-opt-125m: - name: facebook/opt-125m - source: hf://facebook/opt-125m - resources: - cpu: 2 - memory: 8Gi - - llama3-1-8b: - name: RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8-dynamic - uri: oci://registry.redhat.io/rhelai1/modelcar-llama-3-1-8b-instruct-fp8-dynamic:1.5 - # source: hf://RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8-dynamic - resources: {} - - llama3-3-70b: - name: RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic - source: hf://RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic - resources: - cpu: 4 - memory: 64Gi - - gpt-oss-120: - name: openai/gpt-oss-120b - source: hf://openai/gpt-oss-120b - resources: - cpu: 4 - memory: 64Gi - - granite4-tiny: - name: RedHatAI/granite-4.0-h-tiny-FP8-dynamic - source: hf://RedHatAI/granite-4.0-h-tiny-FP8-dynamic - resources: {} - -tests: - llmd: - skip: false - skip_prepare: false - flavors: intelligentrouting - namespace: "@prepare.namespace.name" - - inference_service: - skip_deployment: false - name: llm-d - yaml_file: llama-3-1-8b-instruct-fp8.yaml - timeout: 900 - do_simple_test: true - gateway: - name: gateway-external - model: llama3-1-8b - metrics: - manual_capture: true - scheduler_servicemonitor_name: kserve-llm-isvc-scheduler - vllm_podmonitor_name: kserve-llm-isvc-vllm-engine - - # vLLM arguments (always applied) - vllm_args: - - "--disable-uvicorn-access-log" - - "--enable-prefix-caching" - - "--uvicorn-log-level=debug" - - "--trust-remote-code" - - "--disable-log-requests" - - "--max-model-len=40960" # keep in 5th position or uddate the presets - - "--gpu-memory-utilization=0.92" - - kueue: - enabled: false - prefix: "kueue.x-k8s.io/" - labels: - pod-group-name: llmisvc - managed: "true" - annotations: - queue-name: perf-gpu-queue - - # Extra properties to inject into the LLMISVC YAML using dotted-key notation - extra_properties: {} - - benchmarks: - guidellm: - enabled: true - name: guidellm-benchmark - backend_type: openai_http - rate_type: concurrent - max_seconds: 120 - max_requests: null - timeout: 900 - data: prompt_tokens=256,output_tokens=128 - rate: 1 - sample_requests: 20 - - capture_prom: true - capture_prom_uwm: true - dry_mode: false - visualize: true - -export_artifacts: - enabled: false - -matbench: - enabled: true - preset: null - workload: projects.llm-d.visualizations.llmd_inference - config_file: plots.yaml - # directory to plot - lts: - generate: true - opensearch: - export: - enabled: false - enabled_on_replot: false - fail_test_on_fail: true - instance: smoke - index: forge-llm-d-cpt - index_prefix: "" - build_counter_index: "forge-llm-d-builds" # used to generate a unique ID for each build - regression_analyses: - enabled: false - enabled_on_replot: true - upload_lts_on_regression: true - # if the regression analyses fail, mark the test as failed - fail_test_on_regression: true - notification: - enabled: true - title: "llm-d CPT" - download: - mode: prefer_cache - url: 
null diff --git a/projects/llm_d/orchestration/manifests/datasciencecluster.yaml b/projects/llm_d/orchestration/manifests/datasciencecluster.yaml new file mode 100644 index 00000000..fd45316d --- /dev/null +++ b/projects/llm_d/orchestration/manifests/datasciencecluster.yaml @@ -0,0 +1,22 @@ +apiVersion: datasciencecluster.opendatahub.io/v1 +kind: DataScienceCluster +metadata: + name: default-dsc + namespace: redhat-ods-applications +spec: + components: + codeflare: + managementState: Removed + dashboard: + managementState: Removed + datasciencepipelines: + managementState: Removed + kserve: + managementState: Managed + rawDeploymentServiceConfig: Headless + modelmeshserving: + managementState: Removed + ray: + managementState: Removed + workbenches: + managementState: Removed diff --git a/projects/llm_d/orchestration/manifests/gateway.yaml b/projects/llm_d/orchestration/manifests/gateway.yaml new file mode 100644 index 00000000..dff0c398 --- /dev/null +++ b/projects/llm_d/orchestration/manifests/gateway.yaml @@ -0,0 +1,14 @@ +apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway +metadata: + name: openshift-ai-inference + namespace: openshift-ingress +spec: + gatewayClassName: data-science-gateway-class + listeners: + - name: http + port: 80 + protocol: HTTP + allowedRoutes: + namespaces: + from: All diff --git a/projects/llm_d/orchestration/manifests/gpu-clusterpolicy.yaml b/projects/llm_d/orchestration/manifests/gpu-clusterpolicy.yaml new file mode 100644 index 00000000..6a9ad7ee --- /dev/null +++ b/projects/llm_d/orchestration/manifests/gpu-clusterpolicy.yaml @@ -0,0 +1,37 @@ +apiVersion: nvidia.com/v1 +kind: ClusterPolicy +metadata: + name: gpu-cluster-policy +spec: + daemonsets: + tolerations: + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + updateStrategy: RollingUpdate + dcgm: + enabled: true + dcgmExporter: + enabled: true + devicePlugin: + enabled: true + driver: + enabled: true + kernelModuleType: auto + gfd: + enabled: true + mig: + strategy: single + nodeStatusExporter: + enabled: true + operator: + defaultRuntime: crio + runtimeClass: nvidia + toolkit: + enabled: true + installDir: /usr/local/nvidia + validator: + plugin: + env: + - name: WITH_WORKLOAD + value: "false" diff --git a/projects/llm_d/orchestration/manifests/llminferenceservice.yaml b/projects/llm_d/orchestration/manifests/llminferenceservice.yaml new file mode 100644 index 00000000..cff616f8 --- /dev/null +++ b/projects/llm_d/orchestration/manifests/llminferenceservice.yaml @@ -0,0 +1,96 @@ +apiVersion: serving.kserve.io/v1alpha1 +kind: LLMInferenceService +metadata: + name: llm-d + namespace: llm-d + annotations: + security.opendatahub.io/enable-auth: "false" + prometheus.io/path: /metrics + prometheus.io/port: "8000" +spec: + replicas: 1 + model: + uri: hf://Qwen/Qwen3-0.6B + name: Qwen/Qwen3-0.6B + router: + scheduler: + template: + containers: + - name: main + env: + - name: TOKENIZER_CACHE_DIR + value: /tmp/tokenizer-cache + - name: HF_HOME + value: /tmp/tokenizer-cache + - name: TRANSFORMERS_CACHE + value: /tmp/tokenizer-cache + - name: XDG_CACHE_HOME + value: /tmp + args: + - --cert-path + - /var/run/kserve/tls + - --pool-group + - inference.networking.x-k8s.io + - --pool-name + - "{{ ChildName .ObjectMeta.Name `-inference-pool` }}" + - --pool-namespace + - "{{ .ObjectMeta.Namespace }}" + - --zap-encoder + - json + - --grpc-port + - "9002" + - --grpc-health-port + - "9003" + - --secure-serving + - --model-server-metrics-scheme + - https + - --config-text + volumeMounts: + - name: 
tokenizer-cache + mountPath: /tmp/tokenizer-cache + - name: cachi2-cache + mountPath: /cachi2 + volumes: + - name: tokenizer-cache + emptyDir: {} + - name: cachi2-cache + emptyDir: {} + nodeSelector: + nvidia.com/gpu.present: "true" + route: {} + gateway: {} + template: + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + containers: + - name: main + resources: + requests: + cpu: "4" + memory: 16Gi + nvidia.com/gpu: "1" + limits: + cpu: "4" + memory: 16Gi + nvidia.com/gpu: "1" + livenessProbe: + httpGet: + path: /health + port: 8000 + scheme: HTTPS + initialDelaySeconds: 900 + periodSeconds: 60 + timeoutSeconds: 60 + failureThreshold: 1000 + readinessProbe: + failureThreshold: 10000 + httpGet: + path: /health + port: 8000 + scheme: HTTPS + initialDelaySeconds: 60 + periodSeconds: 30 + successThreshold: 1 + timeoutSeconds: 30 diff --git a/projects/llm_d/orchestration/manifests/nfd-nodefeaturediscovery.yaml b/projects/llm_d/orchestration/manifests/nfd-nodefeaturediscovery.yaml new file mode 100644 index 00000000..df19596f --- /dev/null +++ b/projects/llm_d/orchestration/manifests/nfd-nodefeaturediscovery.yaml @@ -0,0 +1,6 @@ +apiVersion: nfd.openshift.io/v1 +kind: NodeFeatureDiscovery +metadata: + name: nfd-instance + namespace: openshift-nfd +spec: {} diff --git a/projects/llm_d/orchestration/prepare_llmd.py b/projects/llm_d/orchestration/prepare_llmd.py deleted file mode 100644 index c28ad8c7..00000000 --- a/projects/llm_d/orchestration/prepare_llmd.py +++ /dev/null @@ -1,16 +0,0 @@ -import logging - -from projects.core.library import config - -logger = logging.getLogger(__name__) - - -def prepare(): - ns = config.project.get_config("prepare.namespace.name") - logger.warning(f"Hello prepare {ns}") - pass - - -def cleanup(): - logger.warning("Hello cleanup") - pass diff --git a/projects/llm_d/orchestration/presets.d/cks.yaml b/projects/llm_d/orchestration/presets.d/cks.yaml deleted file mode 100644 index b4f842dc..00000000 --- a/projects/llm_d/orchestration/presets.d/cks.yaml +++ /dev/null @@ -1,23 +0,0 @@ -extends: [pvc_rwx, llama-70b] - -tests.capture_prom: false -tests.capture_prom_uwm: false -tests.llmd.skip_prepare: true -prepare.namespace.name: kpouget-dev -prepare.preload.node_selector_key: gpu.nvidia.com/class -prepare.preload.node_selector_value: "H200" -tests.llmd.inference_service.extra_properties: - spec.template.affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: kubernetes.io/hostname - operator: NotIn - values: - - gf48e48 - - gf4334a -prepare.preload.extra_images: - vllm-cuda-rhel9: registry.redhat.io/rhaiis/vllm-cuda-rhel9@sha256:094db84a1da5e8a575d0c9eade114fa30f4a2061064a338e3e032f3578f8082a - llm-d-inference-scheduler: ghcr.io/opendatahub-io/rhaii-on-xks/llm-d-inference-scheduler:e6b5db0@sha256:43e8b8edc158f31535c8b23d77629f8cde111cc762a8f4ee5f2f884470566211 - guidellm: ghcr.io/vllm-project/guidellm:v0.5.4 diff --git a/projects/llm_d/orchestration/presets.d/presets.yaml b/projects/llm_d/orchestration/presets.d/presets.yaml index 3bd1e3fb..0b3de3a7 100644 --- a/projects/llm_d/orchestration/presets.d/presets.yaml +++ b/projects/llm_d/orchestration/presets.d/presets.yaml @@ -1,9 +1,31 @@ __multiple: true -pvc_rwx: - prepare.pvc.name: storage-rwx - prepare.pvc.access_mode: ReadWriteMany +smoke: + runtime.selected_preset: smoke + runtime.model_key: qwen3-0-6b + runtime.scheduler_profile_key: approximate + runtime.smoke_request_key: default + runtime.benchmark_key: null 
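The `smoke` preset above only sets dotted `runtime.*` keys, and the derived presets that follow reuse it via `extends`. A small sketch of the effective values this layering is expected to produce, assuming the parent's keys are applied before the child's overrides (the dict merge below only illustrates that assumption, it is not the loader's implementation):

```python
# Hedged illustration of preset layering (not the actual preset loader).
smoke = {
    "runtime.selected_preset": "smoke",
    "runtime.model_key": "qwen3-0-6b",
    "runtime.scheduler_profile_key": "approximate",
    "runtime.smoke_request_key": "default",
    "runtime.benchmark_key": None,
}

# smoke-precise extends smoke and only swaps the scheduler profile.
smoke_precise = {
    **smoke,
    "runtime.selected_preset": "smoke-precise",
    "runtime.scheduler_profile_key": "precise",
}

assert smoke_precise["runtime.model_key"] == "qwen3-0-6b"  # inherited from smoke
```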
+smoke-precise: + extends: + - smoke + runtime.selected_preset: smoke-precise + runtime.scheduler_profile_key: precise -llama-70b: - tests.llmd.inference_service.model: llama3-3-70b +smoke-default-scheduler: + extends: + - smoke + runtime.selected_preset: smoke-default-scheduler + runtime.scheduler_profile_key: default + +benchmark-short: + runtime.selected_preset: benchmark-short + runtime.model_key: llama-3-1-8b-instruct-fp8 + runtime.scheduler_profile_key: approximate + runtime.smoke_request_key: default + runtime.benchmark_key: short + +cks: + extends: + - smoke diff --git a/projects/llm_d/orchestration/scheduler_profiles/approximate.yaml b/projects/llm_d/orchestration/scheduler_profiles/approximate.yaml new file mode 100644 index 00000000..e584dcf2 --- /dev/null +++ b/projects/llm_d/orchestration/scheduler_profiles/approximate.yaml @@ -0,0 +1,15 @@ +apiVersion: inference.networking.x-k8s.io/v1alpha1 +kind: EndpointPickerConfig +plugins: + - type: queue-scorer + - type: kv-cache-utilization-scorer + - type: prefix-cache-scorer +schedulingProfiles: + - name: default + plugins: + - pluginRef: queue-scorer + weight: 2 + - pluginRef: kv-cache-utilization-scorer + weight: 2 + - pluginRef: prefix-cache-scorer + weight: 3 diff --git a/projects/llm_d/orchestration/scheduler_profiles/precise.yaml b/projects/llm_d/orchestration/scheduler_profiles/precise.yaml new file mode 100644 index 00000000..707e5e0c --- /dev/null +++ b/projects/llm_d/orchestration/scheduler_profiles/precise.yaml @@ -0,0 +1,26 @@ +apiVersion: inference.networking.x-k8s.io/v1alpha1 +kind: EndpointPickerConfig +plugins: + - type: single-profile-handler + - type: precise-prefix-cache-scorer + parameters: + indexerConfig: + tokenProcessorConfig: + blockSize: 64 + hashSeed: "42" + tokenizersPoolConfig: + hf: + tokenizersCacheDir: /tmp/tokenizers + - type: kv-cache-utilization-scorer + - type: queue-scorer + - type: max-score-picker +schedulingProfiles: + - name: default + plugins: + - pluginRef: precise-prefix-cache-scorer + weight: 3.0 + - pluginRef: kv-cache-utilization-scorer + weight: 2.0 + - pluginRef: queue-scorer + weight: 2.0 + - pluginRef: max-score-picker diff --git a/projects/llm_d/orchestration/test_llmd.py b/projects/llm_d/orchestration/test_llmd.py deleted file mode 100644 index 8290ee63..00000000 --- a/projects/llm_d/orchestration/test_llmd.py +++ /dev/null @@ -1,29 +0,0 @@ -import logging -import pathlib - -from projects.core.library import config, env, run -from projects.llm_d.toolbox.capture_isvc_state.main import run as capture_isvc_state - -logger = logging.getLogger(__name__) - - -def init(): - env.init() - run.init() - config.init(pathlib.Path(__file__).parent) - - -@config.requires( - ns="prepare.namespace.name", - name="tests.llmd.flavors", -) -def test(_cfg): - logger.warning(f"Hello test {_cfg.ns}/{_cfg.name}") - - # two alternatives to query the configuration: - # @config.requires(dict) or config.project.get_config("") - # and we will define something similar for the secrets - - config.project.get_config("tests.llmd.flavors") - - capture_isvc_state(_cfg.name, namespace=_cfg.ns) diff --git a/projects/llm_d/runtime/llmd_runtime.py b/projects/llm_d/runtime/llmd_runtime.py new file mode 100644 index 00000000..53d4662b --- /dev/null +++ b/projects/llm_d/runtime/llmd_runtime.py @@ -0,0 +1,637 @@ +from __future__ import annotations + +import json +import logging +import re +import shlex +import subprocess +import time +from collections.abc import Iterable +from pathlib import Path +from typing import Any + 
+import yaml + +from projects.llm_d.runtime.runtime_config import ( + CONFIG_DIR, + ORCHESTRATION_DIR, + ModelCacheSpec, + ResolvedConfig, + apply_requested_preset, + derive_namespace, + ensure_artifact_directories, + init, + load_fournos_config, + load_run_configuration, + load_yaml, + normalize_gpu_count, + parse_overrides, + resolve_model_cache, + slugify_identifier, + truncate_k8s_name, + version_tuple, + write_json, + write_text, + write_yaml, +) +from projects.llm_d.runtime.runtime_manifests import ( + load_manifest_template, + render_datasciencecluster, + render_gateway, + render_guidellm_copy_pod, + render_guidellm_job, + render_guidellm_pvc, + render_inference_service, + render_model_cache_pvc, + render_smoke_request_job, +) + +logger = logging.getLogger(__name__) + +__all__ = [ + "CONFIG_DIR", + "ORCHESTRATION_DIR", + "CommandError", + "ModelCacheSpec", + "ResolvedConfig", + "annotate_model_cache_pvc", + "apply_manifest", + "apply_requested_preset", + "condition_status", + "derive_namespace", + "desired_subscription", + "ensure_artifact_directories", + "ensure_namespace", + "ensure_operator_group", + "ensure_subscription", + "init", + "job_pod_names", + "load_fournos_config", + "load_manifest_template", + "load_run_configuration", + "load_yaml", + "model_cache_pvc_ready", + "normalize_gpu_count", + "oc", + "oc_get_json", + "operator_spec_by_package", + "parse_overrides", + "pvc_access_mode_matches", + "render_datasciencecluster", + "render_gateway", + "render_guidellm_copy_pod", + "render_guidellm_job", + "render_guidellm_pvc", + "render_inference_service", + "render_model_cache_job", + "render_model_cache_pvc", + "render_smoke_request_job", + "resource_exists", + "resolve_default_serviceaccount_image_pull_secret", + "resolve_model_cache", + "run_command", + "slugify_identifier", + "subscription_spec_matches", + "truncate_k8s_name", + "version_tuple", + "wait_for_crd", + "wait_for_job_completion", + "wait_for_namespace_deleted", + "wait_for_operator_csv", + "wait_for_pvc_bound", + "wait_until", + "write_json", + "write_text", + "write_yaml", +] + + +class CommandError(RuntimeError): + """Raised when an external command exits unsuccessfully.""" + + +def run_command( + args: Iterable[str], + *, + check: bool = True, + capture_output: bool = True, + input_text: str | None = None, + timeout_seconds: float | None = 300, +) -> subprocess.CompletedProcess[str]: + cmd = [str(arg) for arg in args] + logger.info("run: %s", " ".join(shlex.quote(arg) for arg in cmd)) + try: + result = subprocess.run( + cmd, + check=False, + text=True, + capture_output=capture_output, + input=input_text, + timeout=timeout_seconds, + ) + except subprocess.TimeoutExpired: + logger.error( + "Command timed out after %ss: %s", + timeout_seconds, + " ".join(shlex.quote(arg) for arg in cmd), + ) + raise + + if capture_output: + if result.stdout: + logger.info("stdout:\n%s", result.stdout.rstrip()) + if result.stderr: + logger.info("stderr:\n%s", result.stderr.rstrip()) + + if check and result.returncode != 0: + raise CommandError( + f"Command failed with exit code {result.returncode}: " + f"{' '.join(shlex.quote(arg) for arg in cmd)}" + ) + + return result + + +def oc( + *args: str, + check: bool = True, + capture_output: bool = True, + input_text: str | None = None, + timeout_seconds: float | None = 300, +) -> subprocess.CompletedProcess[str]: + return run_command( + ["oc", *args], + check=check, + capture_output=capture_output, + input_text=input_text, + timeout_seconds=timeout_seconds, + ) + + +def 
apply_manifest(artifact_path: Any, manifest: dict[str, Any]) -> None: + write_yaml(artifact_path, manifest) + oc("apply", "-f", str(artifact_path)) + + +def oc_get_json( + kind: str, + *, + name: str | None = None, + namespace: str | None = None, + selector: str | None = None, + ignore_not_found: bool = False, +) -> dict[str, Any] | None: + args = ["get", kind] + if name: + args.append(name) + if namespace: + args.extend(["-n", namespace]) + if selector: + args.extend(["-l", selector]) + args.extend(["-o", "json"]) + + result = oc(*args, check=not ignore_not_found, capture_output=True) + if result.returncode != 0: + if ignore_not_found and _is_oc_not_found_error(result.stderr): + return None + raise CommandError( + f"oc {' '.join(shlex.quote(arg) for arg in args)} failed with exit code " + f"{result.returncode}: {result.stderr.strip()}" + ) + if not result.stdout: + raise CommandError(f"oc {' '.join(shlex.quote(arg) for arg in args)} returned no output") + return json.loads(result.stdout) + + +def resource_exists(kind: str, name: str, *, namespace: str | None = None) -> bool: + return ( + oc_get_json( + kind, + name=name, + namespace=namespace, + ignore_not_found=True, + ) + is not None + ) + + +def _is_oc_not_found_error(stderr: str | None) -> bool: + if not stderr: + return False + + normalized = stderr.lower() + if "error from server (notfound)" in normalized: + return True + if "no resources found" in normalized: + return True + + return bool(re.search(r"\bnot found\b", normalized)) + + +def wait_until( + description: str, + *, + timeout_seconds: int, + interval_seconds: int, + predicate, +) -> Any: + deadline = time.time() + timeout_seconds + last_error: Exception | None = None + + while time.time() < deadline: + try: + value = predicate() + if value: + return value + last_error = None + except Exception as exc: # pragma: no cover - exercised in integration paths + if isinstance(exc, RuntimeError): + raise + last_error = exc + logger.info("waiting for %s: %s", description, exc) + time.sleep(interval_seconds) + + if last_error: + raise RuntimeError(f"Timed out waiting for {description}: {last_error}") from last_error + raise RuntimeError(f"Timed out waiting for {description}") + + +def wait_for_namespace_deleted(namespace: str, timeout_seconds: int) -> None: + wait_until( + f"namespace/{namespace} deletion", + timeout_seconds=timeout_seconds, + interval_seconds=10, + predicate=lambda: not resource_exists("namespace", namespace), + ) + + +def wait_for_crd(crd_name: str, timeout_seconds: int) -> None: + wait_until( + f"crd/{crd_name}", + timeout_seconds=timeout_seconds, + interval_seconds=10, + predicate=lambda: resource_exists("crd", crd_name), + ) + + +def wait_for_operator_csv(package: str, namespace: str, timeout_seconds: int) -> dict[str, Any]: + selector = f"operators.coreos.com/{package}.{namespace}" + + def _csv_ready() -> dict[str, Any] | None: + data = oc_get_json("csv", namespace=namespace, selector=selector, ignore_not_found=True) + if not data: + return None + items = data.get("items", []) + if not items: + return None + csv = items[0] + if csv.get("status", {}).get("phase") == "Succeeded": + return csv + return None + + return wait_until( + f"{package} CSV in {namespace}", + timeout_seconds=timeout_seconds, + interval_seconds=15, + predicate=_csv_ready, + ) + + +def ensure_namespace(namespace: str, *, labels: dict[str, str] | None = None) -> None: + if not resource_exists("namespace", namespace): + oc("create", "namespace", namespace) + + if labels: + oc("label", 
"namespace", namespace, "--overwrite", *[f"{k}={v}" for k, v in labels.items()]) + + +def ensure_operator_group(namespace: str, package: str) -> None: + data = oc_get_json("operatorgroup", namespace=namespace, ignore_not_found=True) + if data and data.get("items"): + for item in data["items"]: + targets = item.get("spec", {}).get("targetNamespaces") or [namespace] + if namespace in targets: + return + raise RuntimeError( + f"Existing OperatorGroup objects in {namespace} do not target {namespace}" + ) + + operator_group = { + "apiVersion": "operators.coreos.com/v1", + "kind": "OperatorGroup", + "metadata": {"name": package, "namespace": namespace}, + "spec": {"targetNamespaces": [namespace]}, + } + oc("apply", "-f", "-", input_text=yaml.safe_dump(operator_group, sort_keys=False)) + + +def ensure_subscription(operator_spec: dict[str, Any]) -> None: + namespace = operator_spec["namespace"] + package = operator_spec["package"] + + ensure_namespace(namespace) + ensure_operator_group(namespace, package) + + subscription = desired_subscription(operator_spec) + current = oc_get_json( + "subscription.operators.coreos.com", + name=package, + namespace=namespace, + ignore_not_found=True, + ) + if current and not subscription_spec_matches(current.get("spec", {}), subscription["spec"]): + logger.info("Reconciling subscription drift for %s in %s", package, namespace) + + oc("apply", "-f", "-", input_text=yaml.safe_dump(subscription, sort_keys=False)) + + def _subscription_reconciled() -> dict[str, Any] | None: + payload = oc_get_json( + "subscription.operators.coreos.com", + name=package, + namespace=namespace, + ) + if subscription_spec_matches(payload.get("spec", {}), subscription["spec"]): + return payload + return None + + wait_until( + f"subscription/{package} reconciliation in {namespace}", + timeout_seconds=60, + interval_seconds=5, + predicate=_subscription_reconciled, + ) + + +def desired_subscription(operator_spec: dict[str, Any]) -> dict[str, Any]: + namespace = operator_spec["namespace"] + package = operator_spec["package"] + return { + "apiVersion": "operators.coreos.com/v1alpha1", + "kind": "Subscription", + "metadata": {"name": package, "namespace": namespace}, + "spec": { + "channel": operator_spec["channel"], + "installPlanApproval": "Automatic", + "name": package, + "source": operator_spec["source"], + "sourceNamespace": "openshift-marketplace", + }, + } + + +def subscription_spec_matches(actual: dict[str, Any], expected: dict[str, Any]) -> bool: + keys = ("channel", "installPlanApproval", "name", "source", "sourceNamespace") + return all(actual.get(key) == expected.get(key) for key in keys) + + +def operator_spec_by_package(platform: dict[str, Any], package: str) -> dict[str, Any]: + operators = platform["operators"] + if isinstance(operators, dict): + if package in operators: + return {"package": package, **operators[package]} + raise KeyError(f"Unknown operator package in llm_d platform config: {package}") + + for operator_spec in operators: + if operator_spec["package"] == package: + return operator_spec + raise KeyError(f"Unknown operator package in llm_d platform config: {package}") + + +def condition_status(resource: dict[str, Any], condition_type: str) -> str | None: + for condition in resource.get("status", {}).get("conditions", []): + if condition.get("type") == condition_type: + return condition.get("status") + return None + + +def pvc_access_mode_matches(actual_modes: list[str], expected_mode: str) -> bool: + return expected_mode in actual_modes + + +def 
wait_for_pvc_bound(pvc_name: str, namespace: str, *, timeout_seconds: int) -> dict[str, Any]: + def _pvc_bound() -> dict[str, Any] | None: + payload = oc_get_json( + "persistentvolumeclaim", + name=pvc_name, + namespace=namespace, + ignore_not_found=True, + ) + if not payload: + return None + if payload.get("status", {}).get("phase") == "Bound": + return payload + return None + + return wait_until( + f"persistentvolumeclaim/{pvc_name} bound in {namespace}", + timeout_seconds=timeout_seconds, + interval_seconds=5, + predicate=_pvc_bound, + ) + + +def wait_for_job_completion( + job_name: str, namespace: str, *, timeout_seconds: int, interval_seconds: int = 10 +) -> dict[str, Any]: + def _job_completed() -> dict[str, Any] | None: + payload = oc_get_json( + "job", + name=job_name, + namespace=namespace, + ignore_not_found=True, + ) + if not payload: + return None + status = payload.get("status", {}) + if status.get("succeeded", 0): + return payload + failed_count = status.get("failed", 0) + for condition in status.get("conditions", []): + if condition.get("type") == "Failed" and condition.get("status") == "True": + raise RuntimeError( + f"job/{job_name} failed: {condition.get('reason') or 'unknown reason'}" + ) + if failed_count: + raise RuntimeError(f"job/{job_name} failed after {failed_count} attempt(s)") + return None + + return wait_until( + f"job/{job_name} completion in {namespace}", + timeout_seconds=timeout_seconds, + interval_seconds=interval_seconds, + predicate=_job_completed, + ) + + +def job_pod_names(job_name: str, namespace: str) -> list[str]: + payload = oc_get_json( + "pods", + namespace=namespace, + selector=f"job-name={job_name}", + ignore_not_found=True, + ) + if not payload: + return [] + return [item["metadata"]["name"] for item in payload.get("items", [])] + + +def resolve_default_serviceaccount_image_pull_secret(namespace: str) -> str | None: + payload = oc_get_json( + "serviceaccount", name="default", namespace=namespace, ignore_not_found=True + ) + if not payload: + return None + + for item in payload.get("imagePullSecrets", []): + name = item.get("name") + if name: + return name + return None + + +def load_runtime_script(name: str) -> str: + script_path = Path(__file__).resolve().parent / "scripts" / name + return script_path.read_text(encoding="utf-8") + + +def render_model_cache_job(config: ResolvedConfig, spec: ModelCacheSpec) -> dict[str, Any]: + common_env = [ + {"name": "MODEL_SOURCE", "value": spec.source_uri}, + {"name": "MODEL_TARGET_DIR", "value": f"/cache/{spec.model_path}"}, + {"name": "MARKER_FILE", "value": spec.marker_path}, + {"name": "CACHE_KEY", "value": spec.cache_key}, + ] + volumes: list[dict[str, Any]] = [ + {"name": "cache", "persistentVolumeClaim": {"claimName": spec.pvc_name}} + ] + + if spec.source_scheme == "hf": + command = load_runtime_script("download_hf_model.sh") + volume_mounts = [{"name": "cache", "mountPath": "/cache"}] + if spec.hf_token_secret_name: + volumes.append( + {"name": "hf-token", "secret": {"secretName": spec.hf_token_secret_name}} + ) + volume_mounts.append( + { + "name": "hf-token", + "mountPath": "/var/run/forge/hf-token", + "readOnly": True, + } + ) + common_env.append( + { + "name": "HF_TOKEN_FILE", + "value": f"/var/run/forge/hf-token/{spec.hf_token_secret_key}", + } + ) + + container = { + "name": "hf-model-downloader", + "image": config.model_cache["hf"]["downloader_image"], + "imagePullPolicy": config.model_cache["download"]["pod_image_pull_policy"], + "command": ["/bin/bash", "-ceu", command], + "env": 
common_env, + "volumeMounts": volume_mounts, + } + elif spec.source_scheme == "oci": + registry_auth_secret_name = ( + spec.oci_registry_auth_secret_name + or resolve_default_serviceaccount_image_pull_secret(spec.namespace) + ) + command = load_runtime_script("extract_oci_model.sh") + volume_mounts = [{"name": "cache", "mountPath": "/cache"}] + common_env.append({"name": "OCI_IMAGE_PATH", "value": spec.oci_image_path or "/"}) + if registry_auth_secret_name: + volumes.append( + {"name": "registry-auth", "secret": {"secretName": registry_auth_secret_name}} + ) + volume_mounts.append( + { + "name": "registry-auth", + "mountPath": "/var/run/forge/registry-auth", + "readOnly": True, + } + ) + common_env.append( + { + "name": "REGISTRY_AUTH_FILE", + "value": f"/var/run/forge/registry-auth/{spec.oci_registry_auth_secret_key}", + } + ) + + container = { + "name": "oci-model-extractor", + "image": config.model_cache["oci"]["extractor_image"], + "imagePullPolicy": config.model_cache["download"]["pod_image_pull_policy"], + "command": ["/bin/bash", "-ceu", command], + "env": common_env, + "volumeMounts": volume_mounts, + } + else: # pragma: no cover - guarded by resolve_model_cache + raise ValueError(f"Unsupported model cache source scheme: {spec.source_scheme}") + + return { + "apiVersion": "batch/v1", + "kind": "Job", + "metadata": { + "name": spec.download_job_name, + "namespace": spec.namespace, + "labels": { + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + }, + }, + "spec": { + "backoffLimit": 0, + "activeDeadlineSeconds": config.model_cache["download"]["wait_timeout_seconds"], + "template": { + "metadata": { + "labels": { + "job-name": spec.download_job_name, + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + } + }, + "spec": { + "serviceAccountName": "default", + "restartPolicy": "Never", + "containers": [container], + "volumes": volumes, + }, + }, + }, + } + + +def model_cache_pvc_ready(spec: ModelCacheSpec) -> bool: + payload = oc_get_json( + "persistentvolumeclaim", + name=spec.pvc_name, + namespace=spec.namespace, + ignore_not_found=True, + ) + if not payload: + return False + + annotations = payload.get("metadata", {}).get("annotations", {}) + return ( + annotations.get("forge.openshift.io/model-cache-ready") == "true" + and annotations.get("forge.openshift.io/model-cache-key") == spec.cache_key + and annotations.get("forge.openshift.io/model-source-uri") == spec.source_uri + ) + + +def annotate_model_cache_pvc(spec: ModelCacheSpec) -> None: + oc( + "annotate", + "persistentvolumeclaim", + spec.pvc_name, + "-n", + spec.namespace, + "--overwrite", + "forge.openshift.io/model-cache-ready=true", + f"forge.openshift.io/model-cache-key={spec.cache_key}", + f"forge.openshift.io/model-source-uri={spec.source_uri}", + f"forge.openshift.io/model-uri={spec.model_uri}", + ) diff --git a/projects/llm_d/runtime/phase_inputs.py b/projects/llm_d/runtime/phase_inputs.py new file mode 100644 index 00000000..5b985737 --- /dev/null +++ b/projects/llm_d/runtime/phase_inputs.py @@ -0,0 +1,207 @@ +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +from projects.llm_d.runtime.runtime_config import ResolvedConfig, load_yaml, write_yaml + + +@dataclass(frozen=True) +class CleanupInputs: + artifact_dir: Path + namespace: str + platform: dict[str, Any] + benchmark: dict[str, Any] | None + + +@dataclass(frozen=True) +class PrepareModelCacheInputs: + artifact_dir: Path + 
preset_name: str + namespace: str + namespace_is_managed: bool + model_key: str + model: dict[str, Any] + model_cache: dict[str, Any] + + +@dataclass(frozen=True) +class PrepareInputs: + artifact_dir: Path + config_dir: Path + preset_name: str + namespace: str + namespace_is_managed: bool + platform: dict[str, Any] + model_key: str + model: dict[str, Any] + model_cache: dict[str, Any] + benchmark: dict[str, Any] | None + + +@dataclass(frozen=True) +class TestInputs: + artifact_dir: Path + config_dir: Path + preset_name: str + namespace: str + platform: dict[str, Any] + model_key: str + model: dict[str, Any] + scheduler_profile_key: str + scheduler_profile: dict[str, Any] | None + model_cache: dict[str, Any] + smoke_request: dict[str, Any] + benchmark: dict[str, Any] | None + + +def write_cleanup_inputs(config: ResolvedConfig) -> Path: + path = config.artifact_dir / "_meta" / "cleanup.inputs.yaml" + write_yaml( + path, + { + "artifact_dir": str(config.artifact_dir), + "namespace": config.namespace, + "platform": config.platform, + "benchmark": config.benchmark, + }, + ) + return path + + +def write_prepare_model_cache_inputs(config: ResolvedConfig) -> Path: + path = config.artifact_dir / "_meta" / "prepare_model_cache.inputs.yaml" + write_yaml( + path, + { + "artifact_dir": str(config.artifact_dir), + "preset_name": config.preset_name, + "namespace": config.namespace, + "namespace_is_managed": config.namespace_is_managed, + "model_key": config.model_key, + "model": config.model, + "model_cache": config.model_cache, + }, + ) + return path + + +def write_prepare_inputs(config: ResolvedConfig) -> Path: + path = config.artifact_dir / "_meta" / "prepare.inputs.yaml" + write_yaml( + path, + { + "artifact_dir": str(config.artifact_dir), + "config_dir": str(config.config_dir), + "preset_name": config.preset_name, + "namespace": config.namespace, + "namespace_is_managed": config.namespace_is_managed, + "platform": config.platform, + "model_key": config.model_key, + "model": config.model, + "model_cache": config.model_cache, + "benchmark": config.benchmark, + }, + ) + return path + + +def write_test_inputs(config: ResolvedConfig) -> Path: + path = config.artifact_dir / "_meta" / "test.inputs.yaml" + write_yaml( + path, + { + "artifact_dir": str(config.artifact_dir), + "config_dir": str(config.config_dir), + "preset_name": config.preset_name, + "namespace": config.namespace, + "platform": config.platform, + "model_key": config.model_key, + "model": config.model, + "scheduler_profile_key": config.scheduler_profile_key, + "scheduler_profile": config.scheduler_profile, + "model_cache": config.model_cache, + "smoke_request": config.smoke_request, + "benchmark": config.benchmark, + }, + ) + return path + + +def load_cleanup_inputs(path: str | Path) -> CleanupInputs: + payload = load_yaml(Path(path)) + return CleanupInputs( + artifact_dir=Path(payload["artifact_dir"]), + namespace=payload["namespace"], + platform=payload["platform"], + benchmark=payload["benchmark"], + ) + + +def load_prepare_model_cache_inputs(path: str | Path) -> PrepareModelCacheInputs: + payload = load_yaml(Path(path)) + return PrepareModelCacheInputs( + artifact_dir=Path(payload["artifact_dir"]), + preset_name=payload["preset_name"], + namespace=payload["namespace"], + namespace_is_managed=payload["namespace_is_managed"], + model_key=payload["model_key"], + model=payload["model"], + model_cache=payload["model_cache"], + ) + + +def load_prepare_inputs(path: str | Path) -> PrepareInputs: + payload = load_yaml(Path(path)) + return 
PrepareInputs( + artifact_dir=Path(payload["artifact_dir"]), + config_dir=Path(payload["config_dir"]), + preset_name=payload["preset_name"], + namespace=payload["namespace"], + namespace_is_managed=payload["namespace_is_managed"], + platform=payload["platform"], + model_key=payload["model_key"], + model=payload["model"], + model_cache=payload["model_cache"], + benchmark=payload["benchmark"], + ) + + +def load_test_inputs(path: str | Path) -> TestInputs: + payload = load_yaml(Path(path)) + return TestInputs( + artifact_dir=Path(payload["artifact_dir"]), + config_dir=Path(payload["config_dir"]), + preset_name=payload["preset_name"], + namespace=payload["namespace"], + platform=payload["platform"], + model_key=payload["model_key"], + model=payload["model"], + scheduler_profile_key=payload["scheduler_profile_key"], + scheduler_profile=payload["scheduler_profile"], + model_cache=payload["model_cache"], + smoke_request=payload["smoke_request"], + benchmark=payload["benchmark"], + ) + + +def cleanup_inputs_from_prepare(inputs: PrepareInputs) -> CleanupInputs: + return CleanupInputs( + artifact_dir=inputs.artifact_dir, + namespace=inputs.namespace, + platform=inputs.platform, + benchmark=inputs.benchmark, + ) + + +def prepare_model_cache_inputs_from_prepare(inputs: PrepareInputs) -> PrepareModelCacheInputs: + return PrepareModelCacheInputs( + artifact_dir=inputs.artifact_dir, + preset_name=inputs.preset_name, + namespace=inputs.namespace, + namespace_is_managed=inputs.namespace_is_managed, + model_key=inputs.model_key, + model=inputs.model, + model_cache=inputs.model_cache, + ) diff --git a/projects/llm_d/runtime/runtime_config.py b/projects/llm_d/runtime/runtime_config.py new file mode 100644 index 00000000..a16e1711 --- /dev/null +++ b/projects/llm_d/runtime/runtime_config.py @@ -0,0 +1,368 @@ +from __future__ import annotations + +import copy +import hashlib +import json +import logging +import os +import re +from collections.abc import Iterable +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +import yaml + +from projects.core.library import config, env, run + +logger = logging.getLogger(__name__) +RUNTIME_DIR = Path(__file__).resolve().parent +PROJECT_DIR = RUNTIME_DIR.parent +ORCHESTRATION_DIR = PROJECT_DIR / "orchestration" +CONFIG_DIR = ORCHESTRATION_DIR + + +@dataclass(frozen=True) +class ResolvedConfig: + artifact_dir: Path + project_root: Path + config_dir: Path + preset_name: str + preset_alias: str | None + job_name: str + namespace: str + namespace_is_managed: bool + gpu_count: int | None + platform: dict[str, Any] + model_key: str + model: dict[str, Any] + scheduler_profile_key: str + scheduler_profile: dict[str, Any] | None + model_cache: dict[str, Any] + smoke_request: dict[str, Any] + benchmark: dict[str, Any] | None + fournos_config: dict[str, Any] + overrides: dict[str, Any] + + @property + def manifests_dir(self) -> Path: + return self.config_dir / "manifests" + + +@dataclass(frozen=True) +class ModelCacheSpec: + source_uri: str + source_scheme: str + cache_key: str + namespace: str + pvc_name: str + pvc_size: str + access_mode: str + storage_class_name: str | None + model_path: str + model_uri: str + marker_filename: str + download_job_name: str + hf_token_secret_name: str | None + hf_token_secret_key: str | None + oci_image_path: str | None + oci_registry_auth_secret_name: str | None + oci_registry_auth_secret_key: str | None + + @property + def marker_path(self) -> str: + return f"/cache/{self.model_path}/{self.marker_filename}" + + 
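+# Typical entrypoint flow (illustrative sketch; the concrete preset, overrides and
+# namespace are supplied by the orchestration layer / fournos_config.yaml, so the
+# literal values below are assumptions, not fixed behaviour):
+#
+#     config = load_run_configuration(cwd=Path.cwd(), artifact_dir=Path("artifacts"))
+#     cache_spec = resolve_model_cache(config)  # None when caching is disabled or the
+#                                               # model uri already points at a PVC
+#
+# With the default smoke preset (exercised by the runtime tests in this change), an
+# "hf://Qwen/Qwen3-0.6B" source hashes to a 10-character cache key and resolves to a
+# PVC named like "llm-d-model-qwen3-0-6b-<cache_key>", a served model uri of
+# "pvc://<pvc_name>/model", and a marker_path of "/cache/model/<marker_filename>".
+
+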
+def init() -> Path: + if not logging.getLogger().handlers: + logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + + env.init() + run.init() + ensure_artifact_directories(env.ARTIFACT_DIR) + return env.ARTIFACT_DIR + + +def ensure_artifact_directories(artifact_dir: Path) -> None: + for relative in ("src", "artifacts", "artifacts/results"): + (artifact_dir / relative).mkdir(parents=True, exist_ok=True) + + +def load_run_configuration( + *, + cwd: Path | None = None, + artifact_dir: Path | None = None, + requested_preset: str | None = None, + raw_overrides: str | None = None, + job_name: str | None = None, +) -> ResolvedConfig: + cwd = cwd or Path.cwd() + if artifact_dir is not None: + os.environ["ARTIFACT_DIR"] = str(artifact_dir) + artifact_dir = init() + config.reload(ORCHESTRATION_DIR) + + platform_data = normalize_platform_config(copy.deepcopy(config.project.get_config("platform"))) + model_cache = copy.deepcopy(config.project.get_config("model_cache")) + fournos_config = load_fournos_config(cwd) + overrides = parse_overrides( + raw_overrides or "", + allowed_keys=config.project.get_config("runtime.allowed_override_keys", []), + ) + + requested_preset = ( + requested_preset + or fournos_config.get("preset") + or config.project.get_config("runtime.default_preset") + ) + apply_requested_preset(requested_preset) + + preset_name = config.project.get_config("runtime.selected_preset") + preset_alias = requested_preset if requested_preset != preset_name else None + + model_name = config.project.get_config("runtime.model_key") + model = copy.deepcopy(config.project.get_config(f"models.{model_name}")) + + scheduler_profile_key = config.project.get_config("runtime.scheduler_profile_key") + scheduler_profile = None + if scheduler_profile_key != "default": + scheduler_profile = copy.deepcopy( + config.project.get_config(f"scheduler_profiles.{scheduler_profile_key}") + ) + + smoke_request_name = config.project.get_config("runtime.smoke_request_key") + smoke_request = copy.deepcopy( + config.project.get_config(f"workloads.smoke_requests.{smoke_request_name}") + ) + + benchmark_name = config.project.get_config("runtime.benchmark_key", None) + benchmark = None + if benchmark_name: + benchmark = copy.deepcopy( + config.project.get_config(f"workloads.benchmarks.{benchmark_name}") + ) + + job_name = job_name or fournos_config.get("job-name") + if not job_name: + job_name = f"local-{preset_name}" + + namespace_override = overrides.get("namespace") or fournos_config.get("namespace") + namespace_config = platform_data["cluster"]["namespace"] + default_namespace = namespace_config.get("name") + namespace = ( + namespace_override + or default_namespace + or derive_namespace( + job_name, + namespace_config["prefix"], + namespace_config["max_length"], + ) + ) + + gpu_count = normalize_gpu_count(fournos_config.get("gpu-count")) + + return ResolvedConfig( + artifact_dir=Path(artifact_dir), + project_root=env.FORGE_HOME, + config_dir=ORCHESTRATION_DIR, + preset_name=preset_name, + preset_alias=preset_alias, + job_name=job_name, + namespace=namespace, + namespace_is_managed=namespace_override is None and default_namespace is None, + gpu_count=gpu_count, + platform=platform_data, + model_key=model_name, + model=model, + scheduler_profile_key=scheduler_profile_key, + scheduler_profile=scheduler_profile, + model_cache=model_cache, + smoke_request=smoke_request, + benchmark=benchmark, + fournos_config=fournos_config, + overrides=overrides, + ) + + +def normalize_platform_config(platform_data: 
dict[str, Any]) -> dict[str, Any]: + cluster = platform_data["cluster"] + if "namespace" not in cluster: + cluster["namespace"] = { + "name": cluster.pop("namespace_name", None), + "prefix": cluster.pop("namespace_prefix"), + "max_length": cluster.pop("namespace_max_length"), + } + + operators = platform_data["operators"] + if isinstance(operators, list): + platform_data["operators"] = { + operator_spec["package"]: { + key: value for key, value in operator_spec.items() if key != "package" + } + for operator_spec in operators + } + + return platform_data + + +def apply_requested_preset(requested_preset: str) -> None: + if not config.project.get_preset(requested_preset): + raise ValueError(f"Unknown llm_d preset: {requested_preset}") + + config.project.apply_preset(requested_preset) + + +def load_fournos_config(cwd: Path) -> dict[str, Any]: + config_path = cwd / "fournos_config.yaml" + if not config_path.exists(): + return {} + + data = load_yaml(config_path) + if data is None: + return {} + if not isinstance(data, dict): + raise ValueError(f"Unexpected FOURNOS config type in {config_path}: {type(data)}") + return data + + +def parse_overrides(raw: str, *, allowed_keys: Iterable[str]) -> dict[str, Any]: + if not raw or raw.strip() in {"", "null", "{}"}: + return {} + + try: + data = json.loads(raw) + except json.JSONDecodeError as exc: + raise ValueError(f"FORGE_CONFIG_OVERRIDES is not valid JSON: {exc}") from exc + + if not isinstance(data, dict): + raise ValueError("FORGE_CONFIG_OVERRIDES must decode to a JSON object") + + allowed_keys = frozenset(allowed_keys) + unsupported = sorted(set(data) - allowed_keys) + if unsupported: + raise ValueError( + "Unsupported llm_d override keys: " + f"{', '.join(unsupported)}. Allowed keys: {', '.join(sorted(allowed_keys))}" + ) + + return data + + +def normalize_gpu_count(value: Any) -> int | None: + if value in (None, ""): + return None + try: + return int(value) + except (TypeError, ValueError): + logger.warning("Ignoring invalid gpu-count value: %s", value) + return None + + +def derive_namespace(job_name: str, prefix: str, max_length: int) -> str: + slug = re.sub(r"[^a-z0-9-]+", "-", job_name.lower()) + slug = re.sub(r"-{2,}", "-", slug).strip("-") + if not slug: + slug = "run" + + if slug.startswith(f"{prefix}-"): + namespace = slug + else: + namespace = f"{prefix}-{slug}" + + namespace = namespace[:max_length].rstrip("-") + if not namespace: + raise ValueError(f"Could not derive a valid namespace from job name: {job_name}") + return namespace + + +def slugify_identifier(value: str, *, max_length: int = 63) -> str: + slug = re.sub(r"[^a-z0-9-]+", "-", value.lower()) + slug = re.sub(r"-{2,}", "-", slug).strip("-") + return slug[:max_length].rstrip("-") or "item" + + +def truncate_k8s_name(value: str, *, max_length: int = 63) -> str: + return value[:max_length].rstrip("-") + + +def version_tuple(value: str) -> tuple[int, ...]: + numbers = re.findall(r"\d+", value) + return tuple(int(number) for number in numbers[:3]) + + +def resolve_model_cache(config: ResolvedConfig) -> ModelCacheSpec | None: + if not config.model_cache.get("enabled", False): + return None + + source_uri = config.model["uri"] + if source_uri.startswith(("pvc://", "pvc+hf://")): + return None + + if source_uri.startswith("hf://"): + source_scheme = "hf" + elif source_uri.startswith("oci://"): + source_scheme = "oci" + else: + raise ValueError(f"Unsupported model cache source URI for {config.model_key}: {source_uri}") + + model_cache_overrides = config.model.get("cache", {}) + 
pvc_defaults = config.model_cache["pvc"] + pvc_prefix = config.model_cache["pvc"]["name_prefix"] + cache_key = hashlib.sha256(source_uri.encode("utf-8")).hexdigest()[:10] + pvc_name = truncate_k8s_name( + f"{pvc_prefix}-{slugify_identifier(config.model_key, max_length=32)}-{cache_key}" + ) + model_path = pvc_defaults["model_directory_name"] + + return ModelCacheSpec( + source_uri=source_uri, + source_scheme=source_scheme, + cache_key=cache_key, + namespace=config.namespace, + pvc_name=pvc_name, + pvc_size=model_cache_overrides.get("pvc_size", pvc_defaults["size"]), + access_mode=model_cache_overrides.get("access_mode", pvc_defaults["access_mode"]), + storage_class_name=model_cache_overrides.get( + "storage_class_name", pvc_defaults.get("storage_class_name") + ), + model_path=model_path, + model_uri=f"pvc://{pvc_name}/{model_path}", + marker_filename=config.model_cache["marker_filename"], + download_job_name=truncate_k8s_name(f"{pvc_name}-download"), + hf_token_secret_name=model_cache_overrides.get( + "hf_token_secret_name", config.model_cache["hf"].get("token_secret_name") + ), + hf_token_secret_key=config.model_cache["hf"].get("token_secret_key"), + oci_image_path=model_cache_overrides.get( + "oci_image_path", config.model_cache["oci"].get("image_path") + ), + oci_registry_auth_secret_name=model_cache_overrides.get( + "oci_registry_auth_secret_name", + config.model_cache["oci"].get("registry_auth_secret_name"), + ), + oci_registry_auth_secret_key=config.model_cache["oci"].get("registry_auth_secret_key"), + ) + + +def load_yaml(path: Path) -> Any: + with path.open(encoding="utf-8") as handle: + return yaml.safe_load(handle) + + +def write_yaml(path: Path, payload: Any) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding="utf-8") as handle: + yaml.safe_dump(payload, handle, sort_keys=False) + + +def write_json(path: Path, payload: Any) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding="utf-8") as handle: + json.dump(payload, handle, indent=2, sort_keys=True) + handle.write("\n") + + +def write_text(path: Path, content: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(content, encoding="utf-8") diff --git a/projects/llm_d/runtime/runtime_manifests.py b/projects/llm_d/runtime/runtime_manifests.py new file mode 100644 index 00000000..bc5fdca8 --- /dev/null +++ b/projects/llm_d/runtime/runtime_manifests.py @@ -0,0 +1,327 @@ +from __future__ import annotations + +import copy +import json +from typing import Any + +from projects.llm_d.runtime.runtime_config import ( + ModelCacheSpec, + ResolvedConfig, + load_yaml, + resolve_model_cache, +) + + +def load_manifest_template(config: ResolvedConfig, relative_path: str) -> dict[str, Any]: + return load_yaml(config.config_dir / relative_path) + + +def render_datasciencecluster(config: ResolvedConfig) -> dict[str, Any]: + template_path = config.config_dir / config.platform["rhoai"]["datasciencecluster_template"] + manifest = load_yaml(template_path) + manifest["metadata"]["name"] = config.platform["rhoai"]["datasciencecluster_name"] + manifest["metadata"]["namespace"] = config.platform["rhoai"]["namespace"] + return manifest + + +def render_gateway(config: ResolvedConfig) -> dict[str, Any]: + template_path = config.config_dir / config.platform["gateway"]["manifest_template"] + manifest = load_yaml(template_path) + manifest["metadata"]["name"] = config.platform["gateway"]["name"] + manifest["metadata"]["namespace"] = 
config.platform["gateway"]["namespace"] + manifest["spec"]["gatewayClassName"] = config.platform["gateway"]["gateway_class_name"] + return manifest + + +def render_model_cache_pvc(spec: ModelCacheSpec) -> dict[str, Any]: + manifest: dict[str, Any] = { + "apiVersion": "v1", + "kind": "PersistentVolumeClaim", + "metadata": { + "name": spec.pvc_name, + "namespace": spec.namespace, + "labels": { + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + "forge.openshift.io/model-cache": "true", + "forge.openshift.io/preserve": "true", + }, + "annotations": { + "forge.openshift.io/model-cache-key": spec.cache_key, + "forge.openshift.io/model-source-uri": spec.source_uri, + }, + }, + "spec": { + "accessModes": [spec.access_mode], + "resources": {"requests": {"storage": spec.pvc_size}}, + }, + } + if spec.storage_class_name: + manifest["spec"]["storageClassName"] = spec.storage_class_name + return manifest + + +def render_inference_service(config: ResolvedConfig) -> dict[str, Any]: + template_path = config.config_dir / config.platform["inference_service"]["template"] + manifest = load_yaml(template_path) + + name = config.platform["inference_service"]["name"] + manifest["metadata"]["name"] = name + manifest["metadata"]["namespace"] = config.namespace + manifest["metadata"].setdefault("labels", {}) + manifest["metadata"]["labels"].update( + { + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + } + ) + + cache_spec = resolve_model_cache(config) + manifest["spec"]["model"]["uri"] = cache_spec.model_uri if cache_spec else config.model["uri"] + manifest["spec"]["model"]["name"] = config.model["served_model_name"] + manifest["spec"]["template"]["containers"][0]["resources"] = copy.deepcopy( + config.model["resources"] + ) + + if config.scheduler_profile_key == "default": + manifest["spec"]["router"]["scheduler"] = {} + return manifest + + if config.scheduler_profile is None: + raise ValueError(f"Missing scheduler profile config for {config.scheduler_profile_key}") + + scheduler_profile_path = config.config_dir / config.scheduler_profile["config_path"] + scheduler_profile_config = scheduler_profile_path.read_text(encoding="utf-8") + router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0]["args"] + if not router_args or router_args[-1] != "--config-text": + raise ValueError("Expected llm-d router args to end with --config-text") + router_args.append(scheduler_profile_config) + + return manifest + + +def render_smoke_request_job( + config: ResolvedConfig, endpoint_url: str, payload: dict[str, Any] +) -> dict[str, Any]: + smoke = config.platform["smoke"] + command = """ +set -eu +attempt=1 +while [ "${attempt}" -le "${REQUEST_RETRIES}" ]; do + if curl -k -sSf --max-time "${REQUEST_TIMEOUT_SECONDS}" \ + "${ENDPOINT_URL}${ENDPOINT_PATH}" \ + -H "Content-Type: application/json" \ + -d "${REQUEST_PAYLOAD}" \ + -o /tmp/smoke-response.json \ + 2>/tmp/smoke-error.log; then + cat /tmp/smoke-response.json + exit 0 + fi + attempt=$((attempt + 1)) + sleep "${REQUEST_RETRY_DELAY_SECONDS}" +done +cat /tmp/smoke-error.log >&2 || true +exit 1 +""" + + return { + "apiVersion": "batch/v1", + "kind": "Job", + "metadata": { + "name": smoke["job_name"], + "namespace": config.namespace, + "labels": { + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + "forge.openshift.io/component": "smoke", + }, + }, + "spec": { + "backoffLimit": 0, + "activeDeadlineSeconds": ( + smoke["request_retries"] + * 
(smoke["request_timeout_seconds"] + smoke["request_retry_delay_seconds"]) + ), + "template": { + "metadata": { + "labels": { + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + "forge.openshift.io/component": "smoke", + } + }, + "spec": { + "restartPolicy": "Never", + "containers": [ + { + "name": "smoke", + "image": smoke["client_image"], + "command": ["/bin/sh", "-ceu", command], + "env": [ + {"name": "ENDPOINT_URL", "value": endpoint_url}, + {"name": "ENDPOINT_PATH", "value": smoke["endpoint_path"]}, + {"name": "REQUEST_PAYLOAD", "value": json.dumps(payload)}, + {"name": "REQUEST_RETRIES", "value": str(smoke["request_retries"])}, + { + "name": "REQUEST_RETRY_DELAY_SECONDS", + "value": str(smoke["request_retry_delay_seconds"]), + }, + { + "name": "REQUEST_TIMEOUT_SECONDS", + "value": str(smoke["request_timeout_seconds"]), + }, + ], + } + ], + }, + }, + }, + } + + +def render_guidellm_pvc(config: ResolvedConfig) -> dict[str, Any]: + if not config.benchmark: + raise ValueError("Benchmark configuration is not enabled for this preset") + + return { + "apiVersion": "v1", + "kind": "PersistentVolumeClaim", + "metadata": { + "name": config.benchmark["job_name"], + "namespace": config.namespace, + "labels": { + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + }, + }, + "spec": { + "accessModes": ["ReadWriteOnce"], + "resources": {"requests": {"storage": config.benchmark["pvc_size"]}}, + }, + } + + +def render_guidellm_job(config: ResolvedConfig, endpoint_url: str) -> dict[str, Any]: + if not config.benchmark: + raise ValueError("Benchmark configuration is not enabled for this preset") + + args = [ + "benchmark", + "run", + f"--target={endpoint_url}", + f"--rate={config.benchmark['rate']}", + ] + for key, value in config.benchmark["args"].items(): + if value is None: + continue + args.append(f"--{key.replace('_', '-')}={value}") + args.append("--outputs=json") + + return { + "apiVersion": "batch/v1", + "kind": "Job", + "metadata": { + "name": config.benchmark["job_name"], + "namespace": config.namespace, + "labels": { + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + }, + }, + "spec": { + "backoffLimit": 0, + "template": { + "metadata": { + "labels": { + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + } + }, + "spec": { + "serviceAccountName": "default", + "restartPolicy": "Never", + "containers": [ + { + "name": "guidellm", + "image": config.benchmark["image"], + "command": ["/opt/app-root/bin/guidellm"], + "args": args, + "env": [{"name": "USER", "value": "guidellm"}], + "volumeMounts": [ + {"name": "home", "mountPath": "/home/guidellm"}, + {"name": "results", "mountPath": "/results"}, + ], + } + ], + "volumes": [ + {"name": "home", "emptyDir": {}}, + { + "name": "results", + "persistentVolumeClaim": {"claimName": config.benchmark["job_name"]}, + }, + ], + }, + }, + }, + } + + +def render_guidellm_copy_pod( + config: ResolvedConfig, node_name: str | None = None +) -> dict[str, Any]: + if not config.benchmark: + raise ValueError("Benchmark configuration is not enabled for this preset") + + pod = { + "apiVersion": "v1", + "kind": "Pod", + "metadata": { + "name": f"{config.benchmark['job_name']}-copy", + "namespace": config.namespace, + "labels": { + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + }, + }, + "spec": { + "restartPolicy": "Never", + "initContainers": [ + { + "name": "permission-fixer", + "image": 
config.benchmark["image"], + "command": [ + "/bin/sh", + "-c", + "chmod 755 /results && chown -R 1001:1001 /results || true", + ], + "securityContext": { + "runAsUser": 0, + "allowPrivilegeEscalation": True, + }, + "volumeMounts": [{"name": "results", "mountPath": "/results"}], + } + ], + "containers": [ + { + "name": "copy-helper", + "image": config.benchmark["image"], + "command": ["/bin/sleep", "300"], + "securityContext": { + "runAsUser": 1001, + "runAsNonRoot": True, + "allowPrivilegeEscalation": False, + }, + "volumeMounts": [{"name": "results", "mountPath": "/results"}], + } + ], + "volumes": [ + { + "name": "results", + "persistentVolumeClaim": {"claimName": config.benchmark["job_name"]}, + } + ], + }, + } + if node_name: + pod["spec"]["nodeName"] = node_name + return pod diff --git a/projects/llm_d/runtime/scripts/download_hf_model.sh b/projects/llm_d/runtime/scripts/download_hf_model.sh new file mode 100644 index 00000000..9623d2aa --- /dev/null +++ b/projects/llm_d/runtime/scripts/download_hf_model.sh @@ -0,0 +1,28 @@ +set -euo pipefail + +mkdir -p "${MODEL_TARGET_DIR}" +rm -rf "${MODEL_TARGET_DIR}"/* + +python -m pip install --quiet --no-cache-dir 'huggingface_hub[hf_xet]' +python - <<'PY' +import os + +from huggingface_hub import snapshot_download + +token = None +token_file = os.environ.get("HF_TOKEN_FILE") +if token_file and os.path.exists(token_file): + with open(token_file, encoding="utf-8") as handle: + token = handle.read().strip() or None + +snapshot_download( + repo_id=os.environ["MODEL_SOURCE"][5:], + local_dir=os.environ["MODEL_TARGET_DIR"], + local_dir_use_symlinks=False, + token=token, +) +PY + +cat > "${MARKER_FILE}" < "${MARKER_FILE}" < None: + namespace = llmd_runtime.derive_namespace("llm-d-nightly-smoke", "llm-d", 63) + assert namespace == "llm-d-nightly-smoke" + + +def test_parse_overrides_rejects_unknown_keys() -> None: + with pytest.raises(ValueError, match="Unsupported llm_d override keys"): + llmd_runtime.parse_overrides('{"model":"other"}', allowed_keys=("namespace",)) + + +def test_load_run_configuration_resolves_alias( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + + fournos_config = tmp_path / "fournos_config.yaml" + fournos_config.write_text( + "preset: cks\njob-name: llm-d-e2e\n", + encoding="utf-8", + ) + + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + + assert config.preset_name == "smoke" + assert config.preset_alias == "cks" + assert config.model["served_model_name"] == "Qwen/Qwen3-0.6B" + assert config.namespace == "forge-llm-d" + assert config.namespace_is_managed is False + + +def test_load_run_configuration_consolidates_config_d( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + + llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + consolidated = llmd_runtime.load_yaml(artifact_dir / "config.yaml") + + assert "platform" in consolidated + assert "model_cache" in consolidated + assert "models" in consolidated + assert "runtime" in consolidated + assert "scheduler_profiles" in consolidated + assert "workloads" in consolidated + assert consolidated["project"]["name"] == "llm_d" + assert consolidated["runtime"]["default_preset"] == "smoke" + assert consolidated["platform"]["cluster"]["namespace"]["name"] == "forge-llm-d" + assert isinstance(consolidated["platform"]["operators"], dict) + + +def 
test_namespace_override_is_not_managed(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + + config = llmd_runtime.load_run_configuration( + cwd=tmp_path, + artifact_dir=artifact_dir, + raw_overrides='{"namespace":"custom-ns"}', + ) + + assert config.namespace == "custom-ns" + assert config.namespace_is_managed is False + + +def test_default_namespace_comes_from_project_config( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + (tmp_path / "fournos_config.yaml").write_text( + "job-name: llm-d-nightly\n", + encoding="utf-8", + ) + + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + + assert config.namespace == "forge-llm-d" + assert config.namespace_is_managed is False + assert config.platform["cluster"]["namespace"]["prefix"] == "llm-d" + assert "rhods-operator" in config.platform["operators"] + + +def test_load_run_configuration_ignores_runtime_env_vars( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", '{"namespace":"ignored-ns"}') + monkeypatch.setenv("FORGE_PRESET", "benchmark-short") + monkeypatch.setenv("FORGE_JOB_NAME", "ignored-job") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + + assert config.preset_name == "smoke" + assert config.namespace == "forge-llm-d" + assert config.job_name == "local-smoke" + + +def test_write_prepare_inputs_round_trip(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + + path = phase_inputs.write_prepare_inputs(config) + payload = llmd_runtime.load_yaml(path) + loaded = phase_inputs.load_prepare_inputs(path) + + assert set(payload) == { + "artifact_dir", + "config_dir", + "preset_name", + "namespace", + "namespace_is_managed", + "platform", + "model_key", + "model", + "model_cache", + "benchmark", + } + assert loaded.artifact_dir == config.artifact_dir + assert loaded.config_dir == config.config_dir + assert loaded.namespace == config.namespace + assert loaded.platform == config.platform + assert loaded.model == config.model + assert loaded.model_cache == config.model_cache + assert loaded.benchmark == config.benchmark + + +def test_write_test_inputs_round_trip(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + + path = phase_inputs.write_test_inputs(config) + payload = llmd_runtime.load_yaml(path) + loaded = phase_inputs.load_test_inputs(path) + + assert set(payload) == { + "artifact_dir", + "config_dir", + "preset_name", + "namespace", + "platform", + "model_key", + "model", + "scheduler_profile_key", + "scheduler_profile", + "model_cache", + "smoke_request", + "benchmark", + } + assert loaded.namespace == config.namespace + assert loaded.scheduler_profile_key == config.scheduler_profile_key + assert loaded.smoke_request == config.smoke_request + assert loaded.benchmark == config.benchmark + + +@pytest.mark.parametrize("orchestration", [llmd_ci, llmd_cli]) +def 
test_orchestration_prepare_writes_inputs_and_invokes_toolbox( + orchestration, tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + captured: dict[str, str] = {} + + monkeypatch.setattr( + orchestration.llmd_runtime, + "load_run_configuration", + lambda **_kwargs: config, + ) + monkeypatch.setattr( + orchestration, + "prepare_toolbox_run", + lambda *, inputs_file: captured.setdefault("inputs_file", inputs_file) or 17, + ) + + result = orchestration.run_prepare_phase() + loaded = phase_inputs.load_prepare_inputs(captured["inputs_file"]) + + assert result == captured["inputs_file"] + assert loaded.namespace == config.namespace + + +@pytest.mark.parametrize("orchestration", [llmd_ci, llmd_cli]) +def test_orchestration_test_writes_inputs_and_invokes_toolbox( + orchestration, tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + captured: dict[str, str] = {} + + monkeypatch.setattr( + orchestration.llmd_runtime, + "load_run_configuration", + lambda **_kwargs: config, + ) + monkeypatch.setattr( + orchestration, + "test_toolbox_run", + lambda *, inputs_file: captured.setdefault("inputs_file", inputs_file) or 23, + ) + + result = orchestration.run_test_phase() + loaded = phase_inputs.load_test_inputs(captured["inputs_file"]) + + assert result == captured["inputs_file"] + assert loaded.namespace == config.namespace + assert loaded.model == config.model + + +@pytest.mark.parametrize("orchestration", [llmd_ci, llmd_cli]) +def test_orchestration_cleanup_writes_inputs_and_invokes_toolbox( + orchestration, tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + captured: dict[str, str] = {} + + monkeypatch.setattr( + orchestration.llmd_runtime, + "load_run_configuration", + lambda **_kwargs: config, + ) + monkeypatch.setattr( + orchestration, + "cleanup_toolbox_run", + lambda *, inputs_file: captured.setdefault("inputs_file", inputs_file) or 29, + ) + + result = orchestration.run_cleanup_phase() + loaded = phase_inputs.load_cleanup_inputs(captured["inputs_file"]) + + assert result == captured["inputs_file"] + assert loaded.namespace == config.namespace + assert loaded.platform == config.platform + + +@pytest.mark.parametrize("orchestration", [llmd_ci, llmd_cli]) +def test_orchestration_load_runtime_configuration_reads_env( + orchestration, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_PRESET", "smoke-precise") + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", '{"namespace":"custom-ns"}') + monkeypatch.setenv("FORGE_JOB_NAME", "job-from-env") + captured: dict[str, str | None] = {} + sentinel = object() + + def fake_load_run_configuration(**kwargs): + captured.update(kwargs) + return sentinel + + monkeypatch.setattr( + orchestration.llmd_runtime, "load_run_configuration", fake_load_run_configuration + ) + + result = orchestration.load_runtime_configuration() + + assert result is sentinel + assert captured == { + "requested_preset": "smoke-precise", + "raw_overrides": '{"namespace":"custom-ns"}', + "job_name": "job-from-env", + } + + +def test_render_inference_service_injects_model_and_scheduler_profile( + tmp_path: 
Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + manifest = llmd_runtime.render_inference_service(config) + cache_spec = llmd_runtime.resolve_model_cache(config) + + assert manifest["metadata"]["name"] == "llm-d" + assert manifest["metadata"]["namespace"] == config.namespace + assert manifest["spec"]["model"]["name"] == "Qwen/Qwen3-0.6B" + assert manifest["spec"]["model"]["uri"] == cache_spec.model_uri + assert manifest["spec"]["model"]["name"] == config.model["served_model_name"] + assert config.scheduler_profile_key == "approximate" + router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0]["args"] + assert router_args[-2] == "--config-text" + assert "EndpointPickerConfig" in router_args[-1] + assert "prefix-cache-scorer" in router_args[-1] + + +def test_render_inference_service_supports_precise_scheduler_profile( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + (tmp_path / "fournos_config.yaml").write_text( + "preset: smoke-precise\njob-name: llm-d-precise\n", + encoding="utf-8", + ) + + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + manifest = llmd_runtime.render_inference_service(config) + + assert config.scheduler_profile_key == "precise" + router_args = manifest["spec"]["router"]["scheduler"]["template"]["containers"][0]["args"] + assert router_args[-2] == "--config-text" + assert "precise-prefix-cache-scorer" in router_args[-1] + assert "tokenizersCacheDir" in router_args[-1] + + +def test_render_inference_service_supports_default_scheduler( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + (tmp_path / "fournos_config.yaml").write_text( + "preset: smoke-default-scheduler\njob-name: llm-d-default\n", + encoding="utf-8", + ) + + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + manifest = llmd_runtime.render_inference_service(config) + + assert config.scheduler_profile_key == "default" + assert config.scheduler_profile is None + assert manifest["spec"]["router"]["scheduler"] == {} + + +def test_resolve_model_cache_for_hf_model(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + cache_spec = llmd_runtime.resolve_model_cache(config) + + assert cache_spec is not None + assert cache_spec.source_scheme == "hf" + assert cache_spec.pvc_name.startswith("llm-d-model-qwen3-0-6b-") + assert cache_spec.model_uri == f"pvc://{cache_spec.pvc_name}/model" + assert cache_spec.pvc_size == "10Gi" + assert cache_spec.access_mode == "ReadWriteOnce" + + +def test_render_model_cache_job_for_hf_model( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + cache_spec = llmd_runtime.resolve_model_cache(config) + manifest = 
llmd_runtime.render_model_cache_job(config, cache_spec) + + container = manifest["spec"]["template"]["spec"]["containers"][0] + assert container["name"] == "hf-model-downloader" + assert container["image"] == "registry.access.redhat.com/ubi9/python-311" + assert any( + env["name"] == "MODEL_SOURCE" and env["value"] == "hf://Qwen/Qwen3-0.6B" + for env in container["env"] + ) + assert "huggingface_hub" in container["command"][-1] + + +def test_render_model_cache_job_for_oci_model_uses_registry_auth_secret( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + (tmp_path / "fournos_config.yaml").write_text( + "preset: benchmark-short\njob-name: llm-d-benchmark\n", + encoding="utf-8", + ) + + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + monkeypatch.setattr( + llmd_runtime, + "resolve_default_serviceaccount_image_pull_secret", + lambda namespace: "pull-secret", + ) + cache_spec = llmd_runtime.resolve_model_cache(config) + manifest = llmd_runtime.render_model_cache_job(config, cache_spec) + + container = manifest["spec"]["template"]["spec"]["containers"][0] + volume_names = {volume["name"] for volume in manifest["spec"]["template"]["spec"]["volumes"]} + + assert cache_spec.source_scheme == "oci" + assert container["name"] == "oci-model-extractor" + assert container["image"] == "registry.redhat.io/openshift4/ose-cli:v4.19" + assert any(env["name"] == "OCI_IMAGE_PATH" and env["value"] == "/" for env in container["env"]) + assert "registry-auth" in volume_names + assert "oc image extract" in container["command"][-1] + + +def test_render_guidellm_job_uses_target_and_rate( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + (tmp_path / "fournos_config.yaml").write_text( + "preset: benchmark-short\njob-name: llm-d-benchmark\n", + encoding="utf-8", + ) + + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + manifest = llmd_runtime.render_guidellm_job(config, "https://example.test") + + container = manifest["spec"]["template"]["spec"]["containers"][0] + assert container["image"] == "ghcr.io/vllm-project/guidellm:v0.5.4" + assert "--target=https://example.test" in container["args"] + assert "--rate=1" in container["args"] + + +def test_render_smoke_request_job_uses_curl_helper( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + payload = {"model": "Qwen/Qwen3-0.6B", "prompt": "test"} + manifest = llmd_runtime.render_smoke_request_job(config, "https://example.test", payload) + + container = manifest["spec"]["template"]["spec"]["containers"][0] + env = {item["name"]: item["value"] for item in container["env"]} + + assert manifest["kind"] == "Job" + assert manifest["metadata"]["name"] == "llm-d-smoke" + assert container["image"] == "curlimages/curl:8.11.1" + assert env["ENDPOINT_URL"] == "https://example.test" + assert env["ENDPOINT_PATH"] == "/v1/completions" + assert env["REQUEST_PAYLOAD"] == '{"model": "Qwen/Qwen3-0.6B", "prompt": "test"}' + + +def test_prepare_model_cache_skips_ready_pvc( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + 
monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + calls: list[str] = [] + + monkeypatch.setattr( + prepare_model_cache_toolbox, + "ensure_model_cache_pvc", + lambda _config, _cache_spec: calls.append("ensure-pvc"), + ) + monkeypatch.setattr(llmd_runtime, "model_cache_pvc_ready", lambda _cache_spec: True) + monkeypatch.setattr( + prepare_model_cache_toolbox, + "capture_model_cache_state", + lambda _config, _cache_spec: calls.append("capture"), + ) + monkeypatch.setattr( + prepare_model_cache_toolbox, + "run_model_cache_download_job", + lambda _config, _cache_spec: calls.append("download"), + ) + + prepare_model_cache_toolbox.run_prepare_model_cache(config) + + assert calls == ["ensure-pvc", "capture"] + + +def test_cleanup_deletes_leftovers_but_not_namespace_or_preserved_pvcs( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + shell_calls: list[str] = [] + + def fake_resource_exists(kind: str, name: str, namespace: str | None = None) -> bool: + if kind == "namespace": + return True + return False + + monkeypatch.setattr(llmd_runtime, "resource_exists", fake_resource_exists) + monkeypatch.setattr( + cleanup_toolbox.shell, + "run", + lambda command, **kwargs: shell_calls.append(command), + ) + monkeypatch.setattr(llmd_runtime, "wait_until", lambda *args, **kwargs: True) + monkeypatch.setattr(cleanup_toolbox, "_llm_d_pods_gone", lambda *_args: True) + + cleanup_toolbox.delete_run_leftovers(config) + + assert f"oc delete namespace {config.namespace} --ignore-not-found=true" not in shell_calls + assert ( + f"oc delete pvc -n {config.namespace} " + '-l "forge.openshift.io/project=llm_d,forge.openshift.io/preserve!=true" ' + "--ignore-not-found=true" + ) in shell_calls + + +def test_prepare_gpu_operator_skips_existing_clusterpolicy( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + + calls: list[str] = [] + + monkeypatch.setattr( + prepare_toolbox, + "ensure_operator_subscription", + lambda operator_spec: calls.append(f"subscription:{operator_spec['package']}"), + ) + monkeypatch.setattr( + llmd_runtime, + "wait_for_crd", + lambda crd_name, *, timeout_seconds: calls.append(f"crd:{crd_name}"), + ) + monkeypatch.setattr( + llmd_runtime, + "load_manifest_template", + lambda _config, _path: { + "apiVersion": "nvidia.com/v1", + "kind": "ClusterPolicy", + "metadata": {"name": "gpu-cluster-policy"}, + "spec": {}, + }, + ) + monkeypatch.setattr(llmd_runtime, "resource_exists", lambda kind, name: True) + + def fail_apply(*_: object, **__: object) -> None: + raise AssertionError("existing ClusterPolicy must not be reapplied") + + monkeypatch.setattr(llmd_runtime, "apply_manifest", fail_apply) + monkeypatch.setattr( + llmd_runtime, + "oc_get_json", + lambda kind, name: {"status": {"state": "ready"}}, + ) + + prepare_toolbox.prepare_gpu_operator(config) + + assert calls == [ + "subscription:gpu-operator-certified", + "crd:clusterpolicies.nvidia.com", + ] + + +def test_prepare_gpu_operator_bootstraps_missing_clusterpolicy( + 
tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + + applied: list[Path] = [] + manifest = { + "apiVersion": "nvidia.com/v1", + "kind": "ClusterPolicy", + "metadata": {"name": "gpu-cluster-policy"}, + "spec": {}, + } + + monkeypatch.setattr(prepare_toolbox, "ensure_operator_subscription", lambda _: None) + monkeypatch.setattr(llmd_runtime, "wait_for_crd", lambda *_, **__: None) + monkeypatch.setattr(llmd_runtime, "load_manifest_template", lambda _config, _path: manifest) + monkeypatch.setattr(llmd_runtime, "resource_exists", lambda kind, name: False) + monkeypatch.setattr( + llmd_runtime, + "apply_manifest", + lambda artifact_path, _manifest: applied.append(artifact_path), + ) + monkeypatch.setattr( + llmd_runtime, + "oc_get_json", + lambda kind, name: {"status": {"state": "ready"}}, + ) + + prepare_toolbox.prepare_gpu_operator(config) + + assert applied == [artifact_dir / "src" / "gpu-clusterpolicy.yaml"] + + +def test_prepare_nfd_skips_existing_nodefeaturediscovery( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + + calls: list[str] = [] + manifest = { + "apiVersion": "nfd.openshift.io/v1", + "kind": "NodeFeatureDiscovery", + "metadata": {"name": "nfd-instance", "namespace": "openshift-nfd"}, + } + + monkeypatch.setattr( + prepare_toolbox, + "ensure_operator_subscription", + lambda operator_spec: calls.append(f"subscription:{operator_spec['package']}"), + ) + monkeypatch.setattr( + llmd_runtime, + "wait_for_crd", + lambda crd_name, *, timeout_seconds: calls.append(f"crd:{crd_name}"), + ) + monkeypatch.setattr(llmd_runtime, "load_manifest_template", lambda _config, _path: manifest) + monkeypatch.setattr(llmd_runtime, "resource_exists", lambda *args, **kwargs: True) + monkeypatch.setattr( + llmd_runtime, + "wait_until", + lambda *args, **kwargs: calls.append("wait-nfd"), + ) + monkeypatch.setattr( + prepare_toolbox, + "wait_for_nfd_gpu_labels", + lambda _config, *, timeout_seconds: calls.append("wait-labels"), + ) + + def fail_apply(*_: object, **__: object) -> None: + raise AssertionError("existing NodeFeatureDiscovery must not be reapplied") + + monkeypatch.setattr(llmd_runtime, "apply_manifest", fail_apply) + + prepare_toolbox.prepare_nfd(config) + + assert calls == [ + "subscription:nfd", + "crd:nodefeaturediscoveries.nfd.openshift.io", + "wait-nfd", + "wait-labels", + ] + + +def test_gpu_clusterpolicy_manifest_has_required_default_sections() -> None: + manifest = llmd_runtime.load_yaml( + llmd_runtime.CONFIG_DIR / "manifests" / "gpu-clusterpolicy.yaml" + ) + + assert manifest["kind"] == "ClusterPolicy" + assert manifest["metadata"]["name"] == "gpu-cluster-policy" + assert { + "daemonsets", + "dcgm", + "dcgmExporter", + "devicePlugin", + "driver", + "gfd", + "nodeStatusExporter", + "operator", + "toolkit", + } <= set(manifest["spec"]) + + +def test_resolve_endpoint_url_requires_gateway_address( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + + 
def fake_oc_get_json(kind: str, **_: object) -> dict[str, object]: + assert kind == "llminferenceservice" + return {"status": {"addresses": [{"name": "other", "url": "https://wrong"}]}} + + monkeypatch.setattr(llmd_runtime, "oc_get_json", fake_oc_get_json) + + with pytest.raises(RuntimeError, match="Gateway address"): + test_toolbox.resolve_endpoint_url(config) + + +def test_run_smoke_request_uses_helper_job(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("FORGE_CONFIG_OVERRIDES", "{}") + artifact_dir = tmp_path / "artifacts" + artifact_dir.mkdir() + config = llmd_runtime.load_run_configuration(cwd=tmp_path, artifact_dir=artifact_dir) + oc_calls: list[tuple[str, ...]] = [] + applied: list[Path] = [] + + def fake_oc(*args, **kwargs): + oc_calls.append(tuple(args)) + if args[:2] == ("logs", "job/llm-d-smoke"): + return subprocess.CompletedProcess( + args, + 0, + stdout='{"choices":[{"text":"ok"}]}\n', + stderr="", + ) + return subprocess.CompletedProcess(args, 0, stdout="", stderr="") + + monkeypatch.setattr(llmd_runtime, "oc", fake_oc) + monkeypatch.setattr(llmd_runtime, "resource_exists", lambda *args, **kwargs: False) + monkeypatch.setattr(llmd_runtime, "wait_until", lambda *args, **kwargs: True) + monkeypatch.setattr(llmd_runtime, "wait_for_job_completion", lambda *args, **kwargs: True) + monkeypatch.setattr( + llmd_runtime, + "apply_manifest", + lambda artifact_path, _manifest: applied.append(artifact_path), + ) + monkeypatch.setattr(test_toolbox, "capture_smoke_state", lambda _config: None) + + response = test_toolbox.run_smoke_request(config, "https://example.test") + + assert response["choices"][0]["text"] == "ok" + assert applied == [artifact_dir / "src" / "smoke-job.yaml"] + assert not any(call and call[0] == "exec" for call in oc_calls) + + +def test_wait_until_reraises_runtime_error() -> None: + with pytest.raises(RuntimeError, match="terminal failure"): + llmd_runtime.wait_until( + "test condition", + timeout_seconds=1, + interval_seconds=0, + predicate=lambda: (_ for _ in ()).throw(RuntimeError("terminal failure")), + ) + + +def test_oc_forwards_timeout_to_run_command(monkeypatch: pytest.MonkeyPatch) -> None: + captured: dict[str, object] = {} + + def fake_run_command(args, **kwargs): + captured["args"] = list(args) + captured["kwargs"] = kwargs + return subprocess.CompletedProcess(args, 0, stdout="", stderr="") + + monkeypatch.setattr(llmd_runtime, "run_command", fake_run_command) + + llmd_runtime.oc("get", "pods", timeout_seconds=42) + + assert captured["args"] == ["oc", "get", "pods"] + assert captured["kwargs"]["timeout_seconds"] == 42 + + +def test_oc_get_json_returns_none_only_for_not_found( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr( + llmd_runtime, + "oc", + lambda *args, **kwargs: subprocess.CompletedProcess( + args, + 1, + stdout="", + stderr='Error from server (NotFound): llminferenceservices.serving.kserve.io "llm-d" not found', + ), + ) + + payload = llmd_runtime.oc_get_json( + "llminferenceservice", + name="llm-d", + namespace="forge-llm-d", + ignore_not_found=True, + ) + + assert payload is None + + +def test_oc_get_json_raises_for_non_not_found_errors( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr( + llmd_runtime, + "oc", + lambda *args, **kwargs: subprocess.CompletedProcess( + args, + 1, + stdout="", + stderr='Error from server (Forbidden): pods is forbidden: User "alice" cannot list resource "pods"', + ), + ) + + with pytest.raises(llmd_runtime.CommandError, match="Forbidden"): + 
llmd_runtime.oc_get_json("pods", namespace="forge-llm-d", ignore_not_found=True) + + +def test_resource_exists_propagates_non_not_found_errors( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr( + llmd_runtime, + "oc_get_json", + lambda *args, **kwargs: (_ for _ in ()).throw(llmd_runtime.CommandError("boom")), + ) + + with pytest.raises(llmd_runtime.CommandError, match="boom"): + llmd_runtime.resource_exists("namespace", "forge-llm-d") diff --git a/projects/llm_d/toolbox/capture_isvc_state/main.py b/projects/llm_d/toolbox/capture_llmisvc_state/main.py old mode 100755 new mode 100644 similarity index 86% rename from projects/llm_d/toolbox/capture_isvc_state/main.py rename to projects/llm_d/toolbox/capture_llmisvc_state/main.py index 78448e15..1e4577c5 --- a/projects/llm_d/toolbox/capture_isvc_state/main.py +++ b/projects/llm_d/toolbox/capture_llmisvc_state/main.py @@ -2,15 +2,10 @@ """ LLMInferenceService state capture using task-based DSL -Replaces llmd_capture_isvc_state Ansible role +Replaces llmd_capture_llmisvc_state Ansible role """ -from projects.core.dsl import ( - execute_tasks, - shell, - task, - toolbox, -) +from projects.core.dsl import execute_tasks, shell, task, toolbox def run(llmisvc_name: str, *, namespace: str = ""): @@ -22,7 +17,6 @@ def run(llmisvc_name: str, *, namespace: str = ""): namespace: Namespace of the LLMInferenceService (empty string auto-detects current namespace) """ - # Execute all registered tasks in order, respecting conditions return execute_tasks(locals()) @@ -157,7 +151,6 @@ def capture_podmonitors(args, context): @task def capture_pod_logs(args, context): """Capture logs from LLMInferenceService pods""" - # Get list of pod names result = shell.run( f'oc get pods -l "app.kubernetes.io/name={args.llmisvc_name}" -n {context.target_namespace} -o jsonpath="{{.items[*].metadata.name}}"', check=False, @@ -170,19 +163,16 @@ def capture_pod_logs(args, context): log_file = args.artifact_dir / "artifacts/llminferenceservice.pods.logs" - # Capture logs for each pod - with open(log_file, "w") as f: # Start with empty file + with open(log_file, "w") as handle: for pod_name in pod_names: - f.write(f"=== Logs for pod: {pod_name} ===\n") - - # Get logs for this pod + handle.write(f"=== Logs for pod: {pod_name} ===\n") log_result = shell.run( f"oc logs {pod_name} -n {context.target_namespace} --all-containers=true", check=False, log_stdout=False, ) - f.write(log_result.stdout) - f.write("\n") + handle.write(log_result.stdout) + handle.write("\n") return f"Pod logs captured for {len(pod_names)} pods" @@ -190,7 +180,6 @@ def capture_pod_logs(args, context): @task def capture_pod_previous_logs(args, context): """Capture previous logs from LLMInferenceService pods if available""" - # Get list of pod names result = shell.run( f'oc get pods -l "app.kubernetes.io/name={args.llmisvc_name}" -n {context.target_namespace} -o jsonpath="{{.items[*].metadata.name}}"', check=False, @@ -202,19 +191,16 @@ def capture_pod_previous_logs(args, context): log_file = args.artifact_dir / "artifacts/llminferenceservice.pods.previous.logs" - # Capture previous logs for each pod - with open(log_file, "w") as f: # Start with empty file + with open(log_file, "w") as handle: for pod_name in pod_names: - f.write(f"=== Previous logs for pod: {pod_name} ===\n") - - # Get previous logs for this pod + handle.write(f"=== Previous logs for pod: {pod_name} ===\n") log_result = shell.run( f"oc logs {pod_name} -n {context.target_namespace} --previous --all-containers=true", check=False, 
log_stdout=False, ) - f.write(log_result.stdout) - f.write("\n") + handle.write(log_result.stdout) + handle.write("\n") return f"Pod previous logs captured for {len(pod_names)} pods" @@ -233,7 +219,6 @@ def capture_llminferenceservice_describe(args, context): @task def capture_pods_describe(args, context): """Capture describe output for related pods""" - # Get list of pod names result = shell.run( f'oc get pods -l "app.kubernetes.io/name={args.llmisvc_name}" -n {context.target_namespace} -o jsonpath="{{.items[*].metadata.name}}"', check=False, @@ -245,24 +230,20 @@ def capture_pods_describe(args, context): describe_file = args.artifact_dir / "artifacts/llminferenceservice.pods.describe.txt" - # Capture describe output for each pod - with open(describe_file, "w") as f: # Start with empty file + with open(describe_file, "w") as handle: for pod_name in pod_names: - f.write(f"=== Describe for pod: {pod_name} ===\n") - - # Get describe output for this pod + handle.write(f"=== Describe for pod: {pod_name} ===\n") describe_result = shell.run( f"oc describe pod {pod_name} -n {context.target_namespace}", log_stdout=False, check=False, ) - f.write(describe_result.stdout) - f.write("\n") + handle.write(describe_result.stdout) + handle.write("\n") return f"Pod describe output captured for {len(pod_names)} pods" -# Create the main function using the toolbox library main = toolbox.create_toolbox_main(run) diff --git a/projects/llm_d/toolbox/cleanup/main.py b/projects/llm_d/toolbox/cleanup/main.py new file mode 100644 index 00000000..a32dbd6d --- /dev/null +++ b/projects/llm_d/toolbox/cleanup/main.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +from projects.core.dsl import execute_tasks, shell, task, toolbox +from projects.llm_d.runtime import llmd_runtime, phase_inputs + + +def run(*, inputs_file: str) -> int: + """Delete llm_d runtime leftovers from a namespace. 
+ + Args: + inputs_file: Path to the cleanup phase input file generated by orchestration + """ + + llmd_runtime.init() + execute_tasks(locals()) + return 0 + + +@task +def load_inputs(args, ctx): + """Load the cleanup phase inputs""" + + ctx.inputs = phase_inputs.load_cleanup_inputs(args.inputs_file) + return f"Loaded cleanup inputs for namespace {ctx.inputs.namespace}" + + +@task +def delete_leftovers(args, ctx): + """Delete llm_d runtime leftovers""" + + inputs = ctx.inputs + if not llmd_runtime.resource_exists("namespace", inputs.namespace): + return f"Namespace {inputs.namespace} does not exist; nothing to clean" + + inference_service_name = inputs.platform["inference_service"]["name"] + namespace = inputs.namespace + cleanup_timeout_seconds = inputs.platform["cluster"]["cleanup_timeout_seconds"] + benchmark_names = {"guidellm-benchmark"} + if inputs.benchmark: + benchmark_names.add(inputs.benchmark["job_name"]) + + shell.run( + f"oc delete llminferenceservice {inference_service_name} " + f"-n {namespace} --ignore-not-found=true", + check=False, + ) + + for benchmark_name in sorted(benchmark_names): + shell.run( + f"oc delete job,pvc {benchmark_name} -n {namespace} --ignore-not-found=true", + check=False, + ) + shell.run( + f"oc delete pod {benchmark_name}-copy -n {namespace} --ignore-not-found=true", + check=False, + ) + + shell.run( + f'oc delete job -n {namespace} -l "forge.openshift.io/project=llm_d" ' + "--ignore-not-found=true", + check=False, + ) + shell.run( + f'oc delete pod -n {namespace} -l "forge.openshift.io/project=llm_d" ' + "--ignore-not-found=true", + check=False, + ) + shell.run( + f"oc delete pvc -n {namespace} " + '-l "forge.openshift.io/project=llm_d,forge.openshift.io/preserve!=true" ' + "--ignore-not-found=true", + check=False, + ) + + llmd_runtime.wait_until( + f"llminferenceservice/{inference_service_name} deletion in {namespace}", + timeout_seconds=cleanup_timeout_seconds, + interval_seconds=10, + predicate=lambda: not llmd_runtime.resource_exists( + "llminferenceservice", inference_service_name, namespace=namespace + ), + ) + + llmd_runtime.wait_until( + f"llm-d workload pods deletion in {namespace}", + timeout_seconds=cleanup_timeout_seconds, + interval_seconds=10, + predicate=lambda: _llm_d_pods_gone(namespace, inference_service_name), + ) + + return f"Cleanup finished for namespace {namespace}" + + +def delete_run_leftovers(inputs: phase_inputs.CleanupInputs) -> None: + if not llmd_runtime.resource_exists("namespace", inputs.namespace): + return + + inference_service_name = inputs.platform["inference_service"]["name"] + namespace = inputs.namespace + cleanup_timeout_seconds = inputs.platform["cluster"]["cleanup_timeout_seconds"] + benchmark_names = {"guidellm-benchmark"} + if inputs.benchmark: + benchmark_names.add(inputs.benchmark["job_name"]) + + shell.run( + f"oc delete llminferenceservice {inference_service_name} " + f"-n {namespace} --ignore-not-found=true", + check=False, + ) + + for benchmark_name in sorted(benchmark_names): + shell.run( + f"oc delete job,pvc {benchmark_name} -n {namespace} --ignore-not-found=true", + check=False, + ) + shell.run( + f"oc delete pod {benchmark_name}-copy -n {namespace} --ignore-not-found=true", + check=False, + ) + + shell.run( + f'oc delete job -n {namespace} -l "forge.openshift.io/project=llm_d" ' + "--ignore-not-found=true", + check=False, + ) + shell.run( + f'oc delete pod -n {namespace} -l "forge.openshift.io/project=llm_d" ' + "--ignore-not-found=true", + check=False, + ) + shell.run( + f"oc delete pvc -n 
{namespace} " + '-l "forge.openshift.io/project=llm_d,forge.openshift.io/preserve!=true" ' + "--ignore-not-found=true", + check=False, + ) + + llmd_runtime.wait_until( + f"llminferenceservice/{inference_service_name} deletion in {namespace}", + timeout_seconds=cleanup_timeout_seconds, + interval_seconds=10, + predicate=lambda: not llmd_runtime.resource_exists( + "llminferenceservice", inference_service_name, namespace=namespace + ), + ) + + llmd_runtime.wait_until( + f"llm-d workload pods deletion in {namespace}", + timeout_seconds=cleanup_timeout_seconds, + interval_seconds=10, + predicate=lambda: _llm_d_pods_gone(namespace, inference_service_name), + ) + + +def _llm_d_pods_gone(namespace: str, inference_service_name: str) -> bool: + payload = llmd_runtime.oc_get_json( + "pods", + namespace=namespace, + selector=f"app.kubernetes.io/name={inference_service_name}", + ignore_not_found=True, + ) + return not payload or not payload.get("items") + + +main = toolbox.create_toolbox_main(run) + + +if __name__ == "__main__": + main() diff --git a/projects/llm_d/toolbox/prepare/main.py b/projects/llm_d/toolbox/prepare/main.py new file mode 100644 index 00000000..34b23478 --- /dev/null +++ b/projects/llm_d/toolbox/prepare/main.py @@ -0,0 +1,761 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import json +import logging +from pathlib import Path + +from projects.core.dsl import execute_tasks, shell, task, toolbox +from projects.llm_d.runtime import llmd_runtime, phase_inputs +from projects.llm_d.toolbox.prepare_model_cache import main as prepare_model_cache + +LOGGER = logging.getLogger(__name__) + + +def run(*, inputs_file: str) -> int: + """Prepare a cluster for llm_d downstream smoke and benchmark runs. + + Args: + inputs_file: Path to the prepare phase input file generated by orchestration + """ + + llmd_runtime.init() + execute_tasks(locals()) + return 0 + + +@task +def load_inputs(args, ctx): + """Load the prepare phase inputs""" + + ctx.config = phase_inputs.load_prepare_inputs(args.inputs_file) + return f"Loaded prepare inputs for preset {ctx.config.preset_name}" + + +@task +def verify_oc_access_task(args, ctx): + """Verify OpenShift CLI access""" + + llmd_runtime.oc("whoami", capture_output=True) + return "OpenShift CLI access verified" + + +@task +def verify_cluster_version_task(args, ctx): + """Validate the cluster version against llm_d requirements""" + + version_info = llmd_runtime.oc("version", "-o", "json", capture_output=True) + payload = json.loads(version_info.stdout) + + openshift_version = ( + payload.get("openshiftVersion") + or payload.get("serverVersion", {}).get("gitVersion") + or payload.get("serverVersion", {}).get("platform") + ) + if not openshift_version: + raise RuntimeError("Could not determine OpenShift version from `oc version -o json`") + + minimum = ctx.config.platform["cluster"]["minimum_openshift_version"] + if llmd_runtime.version_tuple(openshift_version) < llmd_runtime.version_tuple(minimum): + raise RuntimeError( + f"Cluster version {openshift_version} is older than the llm_d minimum {minimum}" + ) + + return f"Cluster version satisfies {minimum}" + + +@task +def prepare_cert_manager_task(args, ctx): + """Ensure the cert-manager operator is installed""" + + operator_spec = llmd_runtime.operator_spec_by_package( + ctx.config.platform, "openshift-cert-manager-operator" + ) + ensure_operator_subscription(operator_spec) + return "cert-manager operator ready" + + +@task +def prepare_leader_worker_set_task(args, ctx): + """Ensure the 
leader-worker-set operator is installed""" + + operator_spec = llmd_runtime.operator_spec_by_package(ctx.config.platform, "leader-worker-set") + ensure_operator_subscription(operator_spec) + return "leader-worker-set operator ready" + + +@task +def prepare_nfd_task(args, ctx): + """Ensure Node Feature Discovery is installed and reporting GPU labels""" + + config = ctx.config + operator_spec = llmd_runtime.operator_spec_by_package(config.platform, "nfd") + ensure_operator_subscription(operator_spec) + llmd_runtime.wait_for_crd( + operator_spec["bootstrap_crd"], + timeout_seconds=operator_spec["wait_timeout_seconds"], + ) + + manifest = llmd_runtime.load_manifest_template(config, operator_spec["bootstrap_manifest"]) + nfd_name = manifest["metadata"]["name"] + nfd_namespace = manifest["metadata"]["namespace"] + if llmd_runtime.resource_exists("nodefeaturediscovery", nfd_name, namespace=nfd_namespace): + LOGGER.info( + "NodeFeatureDiscovery/%s already exists in %s; verifying GPU discovery labels", + nfd_name, + nfd_namespace, + ) + else: + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "nfd-nodefeaturediscovery.yaml", + manifest, + ) + + llmd_runtime.wait_until( + "NodeFeatureDiscovery bootstrap resource", + timeout_seconds=operator_spec["wait_timeout_seconds"], + interval_seconds=10, + predicate=lambda: llmd_runtime.resource_exists( + "nodefeaturediscovery", + nfd_name, + namespace=nfd_namespace, + ), + ) + + wait_for_nfd_gpu_labels(config, timeout_seconds=operator_spec["wait_timeout_seconds"]) + return "Node Feature Discovery ready" + + +@task +def prepare_gpu_operator_task(args, ctx): + """Ensure the GPU operator is installed and ready""" + + config = ctx.config + operator_spec = llmd_runtime.operator_spec_by_package(config.platform, "gpu-operator-certified") + ensure_operator_subscription(operator_spec) + llmd_runtime.wait_for_crd( + operator_spec["bootstrap_crd"], + timeout_seconds=operator_spec["wait_timeout_seconds"], + ) + + manifest = llmd_runtime.load_manifest_template(config, operator_spec["bootstrap_manifest"]) + clusterpolicy_name = manifest["metadata"]["name"] + if llmd_runtime.resource_exists("clusterpolicy", clusterpolicy_name): + LOGGER.info( + "ClusterPolicy/%s already exists; verifying readiness instead of applying bootstrap manifest", + clusterpolicy_name, + ) + else: + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "gpu-clusterpolicy.yaml", + manifest, + ) + + wait_for_gpu_clusterpolicy_ready( + clusterpolicy_name, + timeout_seconds=operator_spec["wait_timeout_seconds"], + ) + return "GPU operator ready" + + +@task +def prepare_rhoai_operator_task(args, ctx): + """Ensure the RHOAI operator is installed""" + + config = ctx.config + operator_spec = llmd_runtime.operator_spec_by_package(config.platform, "rhods-operator") + ensure_operator_subscription(operator_spec) + for crd_name in config.platform["rhoai"]["required_crds_before_dsc"]: + llmd_runtime.wait_for_crd( + crd_name, + timeout_seconds=config.platform["rhoai"]["wait_timeout_seconds"], + ) + return "RHOAI operator ready" + + +@task +def apply_datasciencecluster_task(args, ctx): + """Apply the DataScienceCluster manifest""" + + config = ctx.config + manifest = llmd_runtime.render_datasciencecluster(config) + llmd_runtime.apply_manifest(config.artifact_dir / "src" / "datasciencecluster.yaml", manifest) + llmd_runtime.oc( + "get", + "datasciencecluster", + config.platform["rhoai"]["datasciencecluster_name"], + "-n", + config.platform["rhoai"]["namespace"], + "-o", + "yaml", + 
capture_output=True, + ) + return "DataScienceCluster applied" + + +@task +def wait_for_datasciencecluster_ready_task(args, ctx): + """Wait for the DataScienceCluster to become ready""" + + rhoai = ctx.config.platform["rhoai"] + + def _dsc_ready() -> bool: + payload = llmd_runtime.oc_get_json( + "datasciencecluster", + name=rhoai["datasciencecluster_name"], + namespace=rhoai["namespace"], + ) + phase = payload.get("status", {}).get("phase") + if phase == "Ready": + return True + if phase in {"Failed", "Error"}: + raise RuntimeError(f"DataScienceCluster entered terminal phase {phase}") + return False + + llmd_runtime.wait_until( + f"datasciencecluster/{rhoai['datasciencecluster_name']} ready", + timeout_seconds=rhoai["wait_timeout_seconds"], + interval_seconds=10, + predicate=_dsc_ready, + ) + return "DataScienceCluster ready" + + +@task +def ensure_required_crds_task(args, ctx): + """Wait for the llm_d-required CRDs to exist""" + + for crd_name in ctx.config.platform["rhoai"]["required_crds_after_dsc"]: + llmd_runtime.wait_for_crd( + crd_name, + timeout_seconds=ctx.config.platform["rhoai"]["wait_timeout_seconds"], + ) + return "Required CRDs present" + + +@task +def ensure_gateway_task(args, ctx): + """Ensure the gateway exists and is programmed""" + + config = ctx.config + gateway = config.platform["gateway"] + if not llmd_runtime.resource_exists("gateway", gateway["name"], namespace=gateway["namespace"]): + if not gateway["create_if_missing"]: + raise RuntimeError( + f"Required gateway {gateway['name']} does not exist in {gateway['namespace']}" + ) + manifest = llmd_runtime.render_gateway(config) + llmd_runtime.apply_manifest(config.artifact_dir / "src" / "gateway.yaml", manifest) + + def _gateway_programmed() -> bool: + resource = llmd_runtime.oc_get_json( + "gateway", + name=gateway["name"], + namespace=gateway["namespace"], + ) + return llmd_runtime.condition_status(resource, "Programmed") == "True" + + llmd_runtime.wait_until( + f"gateway/{gateway['name']} programmed", + timeout_seconds=gateway["wait_timeout_seconds"], + interval_seconds=10, + predicate=_gateway_programmed, + ) + return "Gateway ready" + + +@task +def ensure_test_namespace_task(args, ctx): + """Ensure the llm_d namespace exists""" + + llmd_runtime.ensure_namespace( + ctx.config.namespace, + labels={ + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + }, + ) + return f"Namespace {ctx.config.namespace} ready" + + +@task +def cleanup_previous_run_task(args, ctx): + """Delete leftover llm_d resources from the namespace""" + + config = ctx.config + inference_service_name = config.platform["inference_service"]["name"] + namespace = config.namespace + cleanup_timeout_seconds = config.platform["cluster"]["cleanup_timeout_seconds"] + benchmark_names = {"guidellm-benchmark"} + if config.benchmark: + benchmark_names.add(config.benchmark["job_name"]) + + shell.run( + f"oc delete llminferenceservice {inference_service_name} " + f"-n {namespace} --ignore-not-found=true", + check=False, + ) + + for benchmark_name in sorted(benchmark_names): + shell.run( + f"oc delete job,pvc {benchmark_name} -n {namespace} --ignore-not-found=true", + check=False, + ) + shell.run( + f"oc delete pod {benchmark_name}-copy -n {namespace} --ignore-not-found=true", + check=False, + ) + + shell.run( + f'oc delete job -n {namespace} -l "forge.openshift.io/project=llm_d" ' + "--ignore-not-found=true", + check=False, + ) + shell.run( + f'oc delete pod -n {namespace} -l "forge.openshift.io/project=llm_d" ' + 
"--ignore-not-found=true", + check=False, + ) + shell.run( + f"oc delete pvc -n {namespace} " + '-l "forge.openshift.io/project=llm_d,forge.openshift.io/preserve!=true" ' + "--ignore-not-found=true", + check=False, + ) + + llmd_runtime.wait_until( + f"llminferenceservice/{inference_service_name} deletion in {namespace}", + timeout_seconds=cleanup_timeout_seconds, + interval_seconds=10, + predicate=lambda: not llmd_runtime.resource_exists( + "llminferenceservice", inference_service_name, namespace=namespace + ), + ) + + llmd_runtime.wait_until( + f"llm-d workload pods deletion in {namespace}", + timeout_seconds=cleanup_timeout_seconds, + interval_seconds=10, + predicate=lambda: not ( + pods := llmd_runtime.oc_get_json( + "pods", + namespace=namespace, + selector=f"app.kubernetes.io/name={inference_service_name}", + ignore_not_found=True, + ) + ) + or not pods.get("items"), + ) + return f"Previous llm_d leftovers deleted from {ctx.config.namespace}" + + +@task +def prepare_model_cache_task(args, ctx): + """Prepare the shared model cache if enabled""" + + cache_inputs = phase_inputs.prepare_model_cache_inputs_from_prepare(ctx.config) + cache_spec = llmd_runtime.resolve_model_cache(cache_inputs) + if not cache_spec: + LOGGER.info("Model cache disabled for preset=%s", cache_inputs.preset_name) + return "Model cache disabled" + + if cache_inputs.namespace_is_managed: + LOGGER.warning( + "Model cache PVC %s lives in managed namespace %s. Namespace cleanup will remove it; cache reuse requires a stable namespace override.", + cache_spec.pvc_name, + cache_spec.namespace, + ) + + prepare_model_cache.ensure_model_cache_pvc(cache_inputs, cache_spec) + if llmd_runtime.model_cache_pvc_ready(cache_spec): + LOGGER.info( + "Model cache PVC %s already contains %s; skipping download", + cache_spec.pvc_name, + cache_spec.source_uri, + ) + prepare_model_cache.capture_model_cache_state(cache_inputs, cache_spec) + return "Model cache already populated" + + prepare_model_cache.run_model_cache_download_job(cache_inputs, cache_spec) + llmd_runtime.annotate_model_cache_pvc(cache_spec) + prepare_model_cache.capture_model_cache_state(cache_inputs, cache_spec) + return "Model cache prepared" + + +@task +def verify_gpu_nodes_task(args, ctx): + """Verify that GPU nodes are available on the cluster""" + + selector = ctx.config.platform["cluster"]["gpu_node_label_selector"] + data = llmd_runtime.oc_get_json("nodes", selector=selector, ignore_not_found=True) + items = data.get("items", []) if data else [] + if not items: + raise RuntimeError( + f"No GPU nodes found with selector {selector}. The llm_d smoke path requires GPUs." 
+ ) + return "GPU nodes detected" + + +@task +def capture_prepare_state_task(args, ctx): + """Capture cluster state after the prepare phase""" + + config = ctx.config + artifacts_dir = config.artifact_dir / "artifacts" + rhoai = config.platform["rhoai"] + gateway = config.platform["gateway"] + + capture_resource_yaml( + "datasciencecluster", + rhoai["datasciencecluster_name"], + rhoai["namespace"], + artifacts_dir / "datasciencecluster.yaml", + ) + capture_resource_yaml( + "gateway", + gateway["name"], + gateway["namespace"], + artifacts_dir / "gateway.yaml", + ) + gateway_service = llmd_runtime.oc( + "get", + "service", + "-A", + "-l", + f"gateway.networking.k8s.io/gateway-name={gateway['name']}", + "-o", + "yaml", + check=False, + capture_output=True, + ) + if gateway_service.returncode == 0 and gateway_service.stdout: + llmd_runtime.write_text(artifacts_dir / "gateway.service.yaml", gateway_service.stdout) + if config.platform["artifacts"]["capture_namespace_events"]: + capture_namespace_events(config.namespace, artifacts_dir / "namespace.events.txt") + return "Prepare-state artifacts captured" + + +def verify_oc_access() -> None: + llmd_runtime.oc("whoami", capture_output=True) + + +def verify_cluster_version(config: phase_inputs.PrepareInputs) -> None: + version_info = llmd_runtime.oc("version", "-o", "json", capture_output=True) + payload = json.loads(version_info.stdout) + + openshift_version = ( + payload.get("openshiftVersion") + or payload.get("serverVersion", {}).get("gitVersion") + or payload.get("serverVersion", {}).get("platform") + ) + if not openshift_version: + raise RuntimeError("Could not determine OpenShift version from `oc version -o json`") + + minimum = config.platform["cluster"]["minimum_openshift_version"] + if llmd_runtime.version_tuple(openshift_version) < llmd_runtime.version_tuple(minimum): + raise RuntimeError( + f"Cluster version {openshift_version} is older than the llm_d minimum {minimum}" + ) + + +def ensure_operator_subscription(operator_spec: dict[str, str]) -> dict[str, object]: + llmd_runtime.ensure_subscription(operator_spec) + return llmd_runtime.wait_for_operator_csv( + operator_spec["package"], + operator_spec["namespace"], + timeout_seconds=operator_spec["wait_timeout_seconds"], + ) + + +def prepare_cert_manager(config: phase_inputs.PrepareInputs) -> None: + operator_spec = llmd_runtime.operator_spec_by_package( + config.platform, "openshift-cert-manager-operator" + ) + ensure_operator_subscription(operator_spec) + + +def prepare_leader_worker_set(config: phase_inputs.PrepareInputs) -> None: + operator_spec = llmd_runtime.operator_spec_by_package(config.platform, "leader-worker-set") + ensure_operator_subscription(operator_spec) + + +def prepare_nfd(config: phase_inputs.PrepareInputs) -> None: + operator_spec = llmd_runtime.operator_spec_by_package(config.platform, "nfd") + ensure_operator_subscription(operator_spec) + llmd_runtime.wait_for_crd( + operator_spec["bootstrap_crd"], + timeout_seconds=operator_spec["wait_timeout_seconds"], + ) + + manifest = llmd_runtime.load_manifest_template(config, operator_spec["bootstrap_manifest"]) + nfd_name = manifest["metadata"]["name"] + nfd_namespace = manifest["metadata"]["namespace"] + if llmd_runtime.resource_exists("nodefeaturediscovery", nfd_name, namespace=nfd_namespace): + LOGGER.info( + "NodeFeatureDiscovery/%s already exists in %s; verifying GPU discovery labels", + nfd_name, + nfd_namespace, + ) + else: + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "nfd-nodefeaturediscovery.yaml", 
+ manifest, + ) + + llmd_runtime.wait_until( + "NodeFeatureDiscovery bootstrap resource", + timeout_seconds=operator_spec["wait_timeout_seconds"], + interval_seconds=10, + predicate=lambda: llmd_runtime.resource_exists( + "nodefeaturediscovery", + nfd_name, + namespace=nfd_namespace, + ), + ) + + wait_for_nfd_gpu_labels(config, timeout_seconds=operator_spec["wait_timeout_seconds"]) + + +def prepare_gpu_operator(config: phase_inputs.PrepareInputs) -> None: + operator_spec = llmd_runtime.operator_spec_by_package(config.platform, "gpu-operator-certified") + ensure_operator_subscription(operator_spec) + llmd_runtime.wait_for_crd( + operator_spec["bootstrap_crd"], + timeout_seconds=operator_spec["wait_timeout_seconds"], + ) + + manifest = llmd_runtime.load_manifest_template(config, operator_spec["bootstrap_manifest"]) + clusterpolicy_name = manifest["metadata"]["name"] + if llmd_runtime.resource_exists("clusterpolicy", clusterpolicy_name): + LOGGER.info( + "ClusterPolicy/%s already exists; verifying readiness instead of applying bootstrap manifest", + clusterpolicy_name, + ) + wait_for_gpu_clusterpolicy_ready( + clusterpolicy_name, + timeout_seconds=operator_spec["wait_timeout_seconds"], + ) + return + + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "gpu-clusterpolicy.yaml", + manifest, + ) + + wait_for_gpu_clusterpolicy_ready( + clusterpolicy_name, + timeout_seconds=operator_spec["wait_timeout_seconds"], + ) + + +def wait_for_gpu_clusterpolicy_ready(clusterpolicy_name: str, *, timeout_seconds: int) -> None: + def _clusterpolicy_ready() -> bool: + payload = llmd_runtime.oc_get_json( + "clusterpolicy", + name=clusterpolicy_name, + ) + state = payload.get("status", {}).get("state", "") + return state.lower() == "ready" + + llmd_runtime.wait_until( + f"clusterpolicy/{clusterpolicy_name} ready", + timeout_seconds=timeout_seconds, + interval_seconds=15, + predicate=_clusterpolicy_ready, + ) + + +def prepare_rhoai_operator(config: phase_inputs.PrepareInputs) -> None: + operator_spec = llmd_runtime.operator_spec_by_package(config.platform, "rhods-operator") + ensure_operator_subscription(operator_spec) + ensure_required_crds(config.platform["rhoai"]["required_crds_before_dsc"], config) + + +def ensure_required_crds(crd_names: list[str], config: phase_inputs.PrepareInputs) -> None: + for crd_name in crd_names: + llmd_runtime.wait_for_crd( + crd_name, + timeout_seconds=config.platform["rhoai"]["wait_timeout_seconds"], + ) + + +def apply_datasciencecluster(config: phase_inputs.PrepareInputs) -> None: + manifest = llmd_runtime.render_datasciencecluster(config) + llmd_runtime.apply_manifest(config.artifact_dir / "src" / "datasciencecluster.yaml", manifest) + llmd_runtime.oc( + "get", + "datasciencecluster", + config.platform["rhoai"]["datasciencecluster_name"], + "-n", + config.platform["rhoai"]["namespace"], + "-o", + "yaml", + capture_output=True, + ) + + +def wait_for_datasciencecluster_ready(config: phase_inputs.PrepareInputs) -> None: + rhoai = config.platform["rhoai"] + + def _dsc_ready() -> bool: + payload = llmd_runtime.oc_get_json( + "datasciencecluster", + name=rhoai["datasciencecluster_name"], + namespace=rhoai["namespace"], + ) + phase = payload.get("status", {}).get("phase") + if phase == "Ready": + return True + if phase in {"Failed", "Error"}: + raise RuntimeError(f"DataScienceCluster entered terminal phase {phase}") + return False + + llmd_runtime.wait_until( + f"datasciencecluster/{rhoai['datasciencecluster_name']} ready", + timeout_seconds=rhoai["wait_timeout_seconds"], + 
interval_seconds=10, + predicate=_dsc_ready, + ) + + +def ensure_gateway(config: phase_inputs.PrepareInputs) -> None: + gateway = config.platform["gateway"] + if not llmd_runtime.resource_exists("gateway", gateway["name"], namespace=gateway["namespace"]): + if not gateway["create_if_missing"]: + raise RuntimeError( + f"Required gateway {gateway['name']} does not exist in {gateway['namespace']}" + ) + manifest = llmd_runtime.render_gateway(config) + llmd_runtime.apply_manifest(config.artifact_dir / "src" / "gateway.yaml", manifest) + + def _gateway_programmed() -> bool: + resource = llmd_runtime.oc_get_json( + "gateway", + name=gateway["name"], + namespace=gateway["namespace"], + ) + return llmd_runtime.condition_status(resource, "Programmed") == "True" + + llmd_runtime.wait_until( + f"gateway/{gateway['name']} programmed", + timeout_seconds=gateway["wait_timeout_seconds"], + interval_seconds=10, + predicate=_gateway_programmed, + ) + + +def ensure_test_namespace(config: phase_inputs.PrepareInputs) -> None: + llmd_runtime.ensure_namespace( + config.namespace, + labels={ + "app.kubernetes.io/managed-by": "forge", + "forge.openshift.io/project": "llm_d", + }, + ) + + +def verify_gpu_nodes(config: phase_inputs.PrepareInputs) -> None: + selector = config.platform["cluster"]["gpu_node_label_selector"] + data = llmd_runtime.oc_get_json("nodes", selector=selector, ignore_not_found=True) + items = data.get("items", []) if data else [] + if not items: + raise RuntimeError( + f"No GPU nodes found with selector {selector}. The llm_d smoke path requires GPUs." + ) + + +def wait_for_nfd_gpu_labels(config: phase_inputs.PrepareInputs, *, timeout_seconds: int) -> None: + selectors = config.platform["cluster"]["nfd_gpu_detection_labels"] + + def _labels_present() -> bool: + for selector in selectors: + data = llmd_runtime.oc_get_json("nodes", selector=selector, ignore_not_found=True) + if data and data.get("items"): + return True + return False + + llmd_runtime.wait_until( + "NFD GPU discovery labels on cluster nodes", + timeout_seconds=timeout_seconds, + interval_seconds=15, + predicate=_labels_present, + ) + + +def capture_prepare_state(config: phase_inputs.PrepareInputs) -> None: + artifacts_dir = config.artifact_dir / "artifacts" + rhoai = config.platform["rhoai"] + gateway = config.platform["gateway"] + + capture_resource_yaml( + "datasciencecluster", + rhoai["datasciencecluster_name"], + rhoai["namespace"], + artifacts_dir / "datasciencecluster.yaml", + ) + capture_resource_yaml( + "gateway", + gateway["name"], + gateway["namespace"], + artifacts_dir / "gateway.yaml", + ) + gateway_service = llmd_runtime.oc( + "get", + "service", + "-A", + "-l", + f"gateway.networking.k8s.io/gateway-name={gateway['name']}", + "-o", + "yaml", + check=False, + capture_output=True, + ) + if gateway_service.returncode == 0 and gateway_service.stdout: + llmd_runtime.write_text(artifacts_dir / "gateway.service.yaml", gateway_service.stdout) + if config.platform["artifacts"]["capture_namespace_events"]: + capture_namespace_events(config.namespace, artifacts_dir / "namespace.events.txt") + + +def capture_resource_yaml( + kind: str, + name: str, + namespace: str, + destination: Path, + *, + check: bool = True, +) -> None: + result = llmd_runtime.oc( + "get", + kind, + name, + "-n", + namespace, + "-o", + "yaml", + check=check, + capture_output=True, + ) + if result.returncode == 0 and result.stdout: + llmd_runtime.write_text(destination, result.stdout) + + +def capture_namespace_events(namespace: str, destination: Path) -> 
None: + result = llmd_runtime.oc( + "get", + "events", + "-n", + namespace, + "--sort-by=.metadata.creationTimestamp", + check=False, + capture_output=True, + ) + if result.returncode == 0 and result.stdout: + llmd_runtime.write_text(destination, result.stdout) + + +main = toolbox.create_toolbox_main(run) + + +if __name__ == "__main__": + main() diff --git a/projects/llm_d/toolbox/prepare_model_cache/main.py b/projects/llm_d/toolbox/prepare_model_cache/main.py new file mode 100644 index 00000000..73cfc24e --- /dev/null +++ b/projects/llm_d/toolbox/prepare_model_cache/main.py @@ -0,0 +1,247 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import logging + +from projects.core.dsl import execute_tasks, task, toolbox +from projects.llm_d.runtime import llmd_runtime, phase_inputs + +LOGGER = logging.getLogger(__name__) + + +def run(*, inputs_file: str) -> int: + """Prepare the shared model cache PVC and populate it when needed. + + Args: + inputs_file: Path to the prepare_model_cache phase input file generated by orchestration + """ + + llmd_runtime.init() + execute_tasks(locals()) + return 0 + + +@task +def load_inputs(args, ctx): + """Load the model cache phase inputs""" + + ctx.inputs = phase_inputs.load_prepare_model_cache_inputs(args.inputs_file) + return f"Loaded model cache inputs for preset {ctx.inputs.preset_name}" + + +@task +def prepare_model_cache(args, ctx): + """Ensure the model cache PVC exists and is populated""" + + config = ctx.inputs + cache_spec = llmd_runtime.resolve_model_cache(config) + if not cache_spec: + LOGGER.info("Model cache disabled for preset=%s", config.preset_name) + return "Model cache disabled" + + if config.namespace_is_managed: + LOGGER.warning( + "Model cache PVC %s lives in managed namespace %s. Namespace cleanup will remove it; cache reuse requires a stable namespace override.", + cache_spec.pvc_name, + cache_spec.namespace, + ) + + ensure_model_cache_pvc(config, cache_spec) + if llmd_runtime.model_cache_pvc_ready(cache_spec): + LOGGER.info( + "Model cache PVC %s already contains %s; skipping download", + cache_spec.pvc_name, + cache_spec.source_uri, + ) + capture_model_cache_state(config, cache_spec) + return f"Model cache already populated in {cache_spec.pvc_name}" + + run_model_cache_download_job(config, cache_spec) + llmd_runtime.annotate_model_cache_pvc(cache_spec) + capture_model_cache_state(config, cache_spec) + return f"Model cache step finished for namespace {config.namespace}" + + +def run_prepare_model_cache(config: phase_inputs.PrepareModelCacheInputs) -> int: + cache_spec = llmd_runtime.resolve_model_cache(config) + if not cache_spec: + LOGGER.info("Model cache disabled for preset=%s", config.preset_name) + return 0 + + if config.namespace_is_managed: + LOGGER.warning( + "Model cache PVC %s lives in managed namespace %s. 
Namespace cleanup will remove it; cache reuse requires a stable namespace override.", + cache_spec.pvc_name, + cache_spec.namespace, + ) + + ensure_model_cache_pvc(config, cache_spec) + if llmd_runtime.model_cache_pvc_ready(cache_spec): + LOGGER.info( + "Model cache PVC %s already contains %s; skipping download", + cache_spec.pvc_name, + cache_spec.source_uri, + ) + capture_model_cache_state(config, cache_spec) + return 0 + + run_model_cache_download_job(config, cache_spec) + llmd_runtime.annotate_model_cache_pvc(cache_spec) + capture_model_cache_state(config, cache_spec) + return 0 + + +def ensure_model_cache_pvc( + config: phase_inputs.PrepareModelCacheInputs, cache_spec: llmd_runtime.ModelCacheSpec +) -> None: + existing = llmd_runtime.oc_get_json( + "persistentvolumeclaim", + name=cache_spec.pvc_name, + namespace=cache_spec.namespace, + ignore_not_found=True, + ) + if existing: + actual_modes = existing.get("spec", {}).get("accessModes", []) + if not llmd_runtime.pvc_access_mode_matches(actual_modes, cache_spec.access_mode): + raise RuntimeError( + f"PVC {cache_spec.pvc_name} exists with access modes {actual_modes}, expected {cache_spec.access_mode}" + ) + + actual_storage_class = existing.get("spec", {}).get("storageClassName") + if cache_spec.storage_class_name and actual_storage_class != cache_spec.storage_class_name: + raise RuntimeError( + f"PVC {cache_spec.pvc_name} exists with storageClassName={actual_storage_class}, expected {cache_spec.storage_class_name}" + ) + + llmd_runtime.wait_for_pvc_bound( + cache_spec.pvc_name, + cache_spec.namespace, + timeout_seconds=config.model_cache["download"]["wait_timeout_seconds"], + ) + return + + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "model-cache-pvc.yaml", + llmd_runtime.render_model_cache_pvc(cache_spec), + ) + llmd_runtime.wait_for_pvc_bound( + cache_spec.pvc_name, + cache_spec.namespace, + timeout_seconds=config.model_cache["download"]["wait_timeout_seconds"], + ) + + +def run_model_cache_download_job( + config: phase_inputs.PrepareModelCacheInputs, cache_spec: llmd_runtime.ModelCacheSpec +) -> None: + llmd_runtime.oc( + "delete", + "job", + cache_spec.download_job_name, + "-n", + cache_spec.namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.wait_until( + f"job/{cache_spec.download_job_name} deletion in {cache_spec.namespace}", + timeout_seconds=120, + interval_seconds=5, + predicate=lambda: not llmd_runtime.resource_exists( + "job", cache_spec.download_job_name, namespace=cache_spec.namespace + ), + ) + + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "model-cache-job.yaml", + llmd_runtime.render_model_cache_job(config, cache_spec), + ) + + try: + llmd_runtime.wait_for_job_completion( + cache_spec.download_job_name, + cache_spec.namespace, + timeout_seconds=config.model_cache["download"]["wait_timeout_seconds"], + interval_seconds=config.model_cache["download"]["poll_interval_seconds"], + ) + finally: + capture_model_cache_state(config, cache_spec) + + +def capture_model_cache_state( + config: phase_inputs.PrepareModelCacheInputs, cache_spec: llmd_runtime.ModelCacheSpec +) -> None: + artifact_dir = config.artifact_dir / "artifacts" / "model-cache" + llmd_runtime.write_json( + artifact_dir / "spec.json", + { + "pvc_name": cache_spec.pvc_name, + "model_uri": cache_spec.model_uri, + "source_uri": cache_spec.source_uri, + "source_scheme": cache_spec.source_scheme, + }, + ) + + capture_resource_yaml( + "persistentvolumeclaim", + cache_spec.pvc_name, + cache_spec.namespace, + 
artifact_dir / "pvc.yaml", + ) + capture_resource_yaml( + "job", + cache_spec.download_job_name, + cache_spec.namespace, + artifact_dir / "job.yaml", + check=False, + ) + + for pod_name in llmd_runtime.job_pod_names(cache_spec.download_job_name, cache_spec.namespace): + capture_resource_yaml( + "pod", + pod_name, + cache_spec.namespace, + artifact_dir / f"{pod_name}.yaml", + check=False, + ) + log_result = llmd_runtime.oc( + "logs", + pod_name, + "-n", + cache_spec.namespace, + check=False, + capture_output=True, + ) + if log_result.returncode == 0 and log_result.stdout: + llmd_runtime.write_text(artifact_dir / f"{pod_name}.log", log_result.stdout) + + +def capture_resource_yaml( + kind: str, + name: str, + namespace: str, + destination, + *, + check: bool = True, +) -> None: + result = llmd_runtime.oc( + "get", + kind, + name, + "-n", + namespace, + "-o", + "yaml", + check=check, + capture_output=True, + ) + if result.returncode == 0 and result.stdout: + llmd_runtime.write_text(destination, result.stdout) + + +main = toolbox.create_toolbox_main(run) + + +if __name__ == "__main__": + main() diff --git a/projects/llm_d/toolbox/test/main.py b/projects/llm_d/toolbox/test/main.py new file mode 100644 index 00000000..609c9e46 --- /dev/null +++ b/projects/llm_d/toolbox/test/main.py @@ -0,0 +1,910 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import json +import logging +from pathlib import Path + +from projects.core.dsl import always, execute_tasks, task, toolbox +from projects.llm_d.runtime import llmd_runtime, phase_inputs + +LOGGER = logging.getLogger(__name__) + + +def run(*, inputs_file: str) -> int: + """Deploy llm_d, run the smoke request, and optionally execute GuideLLM. + + Args: + inputs_file: Path to the test phase input file generated by orchestration + """ + + llmd_runtime.init() + execute_tasks(locals()) + return 0 + + +@task +def load_inputs(args, ctx): + """Load the test phase inputs""" + + ctx.config = phase_inputs.load_test_inputs(args.inputs_file) + return f"Loaded test inputs for preset {ctx.config.preset_name}" + + +@task +def deploy_inference_service_task(args, ctx): + """Deploy the LLMInferenceService and resolve its endpoint""" + + config = ctx.config + name = config.platform["inference_service"]["name"] + namespace = config.namespace + selector = f"app.kubernetes.io/name={name}" + + llmd_runtime.oc( + "delete", + "llminferenceservice", + name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + + def _old_pods_gone() -> bool: + pods = llmd_runtime.oc_get_json( + "pods", namespace=namespace, selector=selector, ignore_not_found=True + ) + return not pods or not pods.get("items") + + llmd_runtime.wait_until( + f"old llm-d pods to disappear in {namespace}", + timeout_seconds=config.platform["inference_service"]["delete_timeout_seconds"], + interval_seconds=10, + predicate=_old_pods_gone, + ) + + manifest = llmd_runtime.render_inference_service(config) + llmd_runtime.apply_manifest(config.artifact_dir / "src" / "llminferenceservice.yaml", manifest) + + def _pods_present() -> bool: + pods = llmd_runtime.oc_get_json( + "pods", namespace=namespace, selector=selector, ignore_not_found=True + ) + return bool(pods and pods.get("items")) + + llmd_runtime.wait_until( + f"llm-d pods to appear in {namespace}", + timeout_seconds=config.platform["inference_service"]["pod_appearance_timeout_seconds"], + interval_seconds=5, + predicate=_pods_present, + ) + + def _service_ready() -> bool: + payload = llmd_runtime.oc_get_json("llminferenceservice", 
name=name, namespace=namespace) + return llmd_runtime.condition_status(payload, "Ready") == "True" + + llmd_runtime.wait_until( + f"llminferenceservice/{name} ready", + timeout_seconds=config.platform["inference_service"]["ready_timeout_seconds"], + interval_seconds=10, + predicate=_service_ready, + ) + + ctx.endpoint_url = llmd_runtime.wait_until( + f"gateway address for llminferenceservice/{name}", + timeout_seconds=config.platform["inference_service"]["ready_timeout_seconds"], + interval_seconds=10, + predicate=lambda: try_resolve_endpoint_url(config), + ) + return f"Endpoint resolved: {ctx.endpoint_url}" + + +@task +def run_smoke_request_task(args, ctx): + """Run the smoke request against the deployed service""" + + config = ctx.config + namespace = config.namespace + job_name = config.platform["smoke"]["job_name"] + payload = { + "model": config.model["served_model_name"], + "prompt": config.smoke_request["prompt"], + "max_tokens": config.smoke_request["max_tokens"], + "temperature": config.smoke_request["temperature"], + } + llmd_runtime.write_json(config.artifact_dir / "artifacts" / "smoke.request.json", payload) + + llmd_runtime.oc( + "delete", + "job", + job_name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.wait_until( + f"job/{job_name} deletion in {namespace}", + timeout_seconds=120, + interval_seconds=5, + predicate=lambda: not llmd_runtime.resource_exists("job", job_name, namespace=namespace), + ) + + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "smoke-job.yaml", + llmd_runtime.render_smoke_request_job(config, ctx.endpoint_url, payload), + ) + + try: + llmd_runtime.wait_for_job_completion( + job_name, + namespace, + timeout_seconds=( + config.platform["smoke"]["request_retries"] + * ( + config.platform["smoke"]["request_timeout_seconds"] + + config.platform["smoke"]["request_retry_delay_seconds"] + ) + ), + interval_seconds=5, + ) + finally: + capture_smoke_state(config) + + result = llmd_runtime.oc( + "logs", + f"job/{job_name}", + "-n", + namespace, + check=False, + capture_output=True, + ) + + if result.returncode != 0 or not result.stdout: + raise RuntimeError( + f"Smoke request job {job_name} completed but response logs could not be read: {result.stderr}" + ) + + response = json.loads(result.stdout) + if not response.get("choices"): + raise RuntimeError(f"Invalid smoke response payload: {result.stdout}") + + llmd_runtime.write_json(config.artifact_dir / "artifacts" / "smoke.response.json", response) + ctx.smoke_response = response + return "Smoke request completed" + + +@task +def run_guidellm_benchmark_task(args, ctx): + """Run the GuideLLM benchmark when enabled for the preset""" + + if not ctx.config.benchmark: + return "GuideLLM benchmark disabled" + + config = ctx.config + benchmark_name = config.benchmark["job_name"] + namespace = config.namespace + + llmd_runtime.oc( + "delete", + "job,pvc", + benchmark_name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "pod", + f"{benchmark_name}-copy", + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "guidellm-pvc.yaml", + llmd_runtime.render_guidellm_pvc(config), + ) + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "guidellm-job.yaml", + llmd_runtime.render_guidellm_job(config, ctx.endpoint_url), + ) + + def _job_terminal() -> dict[str, object] | None: + payload = llmd_runtime.oc_get_json("job", name=benchmark_name, 
namespace=namespace) + status = payload.get("status", {}) + if status.get("succeeded"): + return payload + if status.get("failed"): + raise RuntimeError(f"GuideLLM job {benchmark_name} failed") + return None + + llmd_runtime.wait_until( + f"GuideLLM job/{benchmark_name}", + timeout_seconds=config.benchmark["timeout_seconds"], + interval_seconds=10, + predicate=_job_terminal, + ) + + capture_guidellm_state(config) + copy_guidellm_results(config) + return f"GuideLLM benchmark {ctx.config.benchmark['job_name']} completed" + + +@always +@task +def capture_inference_service_state_task(args, ctx): + """Capture the LLMInferenceService state and related resources""" + + config = getattr(ctx, "config", None) + if not config: + return "Test inputs unavailable; skipping state capture" + + name = config.platform["inference_service"]["name"] + namespace = config.namespace + artifacts_dir = config.artifact_dir / "artifacts" + selector = f"app.kubernetes.io/name={name}" + + capture_get( + "llminferenceservice", + name, + namespace, + "yaml", + artifacts_dir / "llminferenceservice.yaml", + ) + capture_get( + "llminferenceservice", + name, + namespace, + "json", + artifacts_dir / "llminferenceservice.json", + ) + capture_get( + "pods", + None, + namespace, + "yaml", + artifacts_dir / "llminferenceservice.pods.yaml", + selector=selector, + ) + capture_get( + "deployments", + None, + namespace, + "yaml", + artifacts_dir / "llminferenceservice.deployments.yaml", + selector=selector, + ) + capture_get( + "replicasets", + None, + namespace, + "yaml", + artifacts_dir / "llminferenceservice.replicasets.yaml", + selector=selector, + ) + capture_get("pods", None, namespace, "wide", artifacts_dir / "namespace.pods.status") + capture_get("services", None, namespace, "wide", artifacts_dir / "namespace.services.status") + + pod_list = llmd_runtime.oc_get_json( + "pods", namespace=namespace, selector=selector, ignore_not_found=True + ) + if pod_list: + lines = [] + previous_lines = [] + for pod in pod_list.get("items", []): + pod_name = pod["metadata"]["name"] + lines.append(f"=== {pod_name} ===") + log_result = llmd_runtime.oc( + "logs", + pod_name, + "-n", + namespace, + "--all-containers=true", + check=False, + capture_output=True, + ) + if log_result.stdout: + lines.append(log_result.stdout.rstrip()) + + previous_lines.append(f"=== {pod_name} ===") + previous_result = llmd_runtime.oc( + "logs", + pod_name, + "-n", + namespace, + "--previous", + "--all-containers=true", + check=False, + capture_output=True, + ) + if previous_result.stdout: + previous_lines.append(previous_result.stdout.rstrip()) + + llmd_runtime.write_text( + artifacts_dir / "llminferenceservice.pods.logs", "\n".join(lines) + "\n" + ) + llmd_runtime.write_text( + artifacts_dir / "llminferenceservice.pods.previous.logs", + "\n".join(previous_lines) + "\n", + ) + return "Inference-service artifacts captured" + + +@always +@task +def write_endpoint_url_task(args, ctx): + """Persist the resolved endpoint URL when available""" + + config = getattr(ctx, "config", None) + if not config: + return "Test inputs unavailable; skipping endpoint capture" + + endpoint_url = getattr(ctx, "endpoint_url", None) + if not endpoint_url: + return "Endpoint URL not available" + + llmd_runtime.write_text(config.artifact_dir / "artifacts" / "endpoint.url", f"{endpoint_url}\n") + return "Endpoint URL captured" + + +@always +@task +def cleanup_runtime_resources_task(args, ctx): + """Delete smoke and benchmark helper resources""" + + config = getattr(ctx, "config", None) + if 
not config: + return "Test inputs unavailable; skipping cleanup" + + benchmark_name = config.benchmark["job_name"] if config.benchmark else "guidellm-benchmark" + smoke_job_name = config.platform["smoke"]["job_name"] + namespace = config.namespace + + llmd_runtime.oc( + "delete", + "job", + smoke_job_name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "job,pvc", + benchmark_name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "pod", + f"{benchmark_name}-copy", + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + return "Test helper resources deleted" + + +@always +@task +def capture_namespace_events_task(args, ctx): + """Capture namespace events after the test run""" + + config = getattr(ctx, "config", None) + if not config: + return "Test inputs unavailable; skipping namespace events capture" + + events = llmd_runtime.oc( + "get", + "events", + "-n", + config.namespace, + "--sort-by=.metadata.creationTimestamp", + check=False, + capture_output=True, + ) + if events.returncode == 0 and events.stdout: + llmd_runtime.write_text( + config.artifact_dir / "artifacts" / "namespace.events.txt", events.stdout + ) + return "Namespace events captured" + + +def cleanup_runtime_resources(config: phase_inputs.TestInputs) -> None: + benchmark_name = config.benchmark["job_name"] if config.benchmark else "guidellm-benchmark" + smoke_job_name = config.platform["smoke"]["job_name"] + namespace = config.namespace + + llmd_runtime.oc( + "delete", + "job", + smoke_job_name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "job,pvc", + benchmark_name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "pod", + f"{benchmark_name}-copy", + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + + +def capture_namespace_events(config: phase_inputs.TestInputs) -> None: + events = llmd_runtime.oc( + "get", + "events", + "-n", + config.namespace, + "--sort-by=.metadata.creationTimestamp", + check=False, + capture_output=True, + ) + if events.returncode == 0 and events.stdout: + llmd_runtime.write_text( + config.artifact_dir / "artifacts" / "namespace.events.txt", events.stdout + ) + + +def deploy_inference_service(config: phase_inputs.TestInputs) -> str: + name = config.platform["inference_service"]["name"] + namespace = config.namespace + selector = f"app.kubernetes.io/name={name}" + + llmd_runtime.oc( + "delete", + "llminferenceservice", + name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + + def _old_pods_gone() -> bool: + pods = llmd_runtime.oc_get_json( + "pods", namespace=namespace, selector=selector, ignore_not_found=True + ) + return not pods or not pods.get("items") + + llmd_runtime.wait_until( + f"old llm-d pods to disappear in {namespace}", + timeout_seconds=config.platform["inference_service"]["delete_timeout_seconds"], + interval_seconds=10, + predicate=_old_pods_gone, + ) + + manifest = llmd_runtime.render_inference_service(config) + llmd_runtime.apply_manifest(config.artifact_dir / "src" / "llminferenceservice.yaml", manifest) + + def _pods_present() -> bool: + pods = llmd_runtime.oc_get_json( + "pods", namespace=namespace, selector=selector, ignore_not_found=True + ) + return bool(pods and pods.get("items")) + + llmd_runtime.wait_until( + f"llm-d pods to appear in {namespace}", + 
timeout_seconds=config.platform["inference_service"]["pod_appearance_timeout_seconds"], + interval_seconds=5, + predicate=_pods_present, + ) + + def _service_ready() -> bool: + payload = llmd_runtime.oc_get_json("llminferenceservice", name=name, namespace=namespace) + return llmd_runtime.condition_status(payload, "Ready") == "True" + + llmd_runtime.wait_until( + f"llminferenceservice/{name} ready", + timeout_seconds=config.platform["inference_service"]["ready_timeout_seconds"], + interval_seconds=10, + predicate=_service_ready, + ) + + return llmd_runtime.wait_until( + f"gateway address for llminferenceservice/{name}", + timeout_seconds=config.platform["inference_service"]["ready_timeout_seconds"], + interval_seconds=10, + predicate=lambda: try_resolve_endpoint_url(config), + ) + + +def resolve_endpoint_url(config: phase_inputs.TestInputs) -> str: + endpoint_url = try_resolve_endpoint_url(config) + if endpoint_url: + return endpoint_url + + name = config.platform["inference_service"]["name"] + gateway_name = config.platform["gateway"]["status_address_name"] + raise RuntimeError( + f"Gateway address {gateway_name} is missing from llminferenceservice/{name} status.addresses" + ) + + +def try_resolve_endpoint_url(config: phase_inputs.TestInputs) -> str | None: + name = config.platform["inference_service"]["name"] + namespace = config.namespace + gateway_name = config.platform["gateway"]["status_address_name"] + payload = llmd_runtime.oc_get_json("llminferenceservice", name=name, namespace=namespace) + + for address in payload.get("status", {}).get("addresses", []): + if address.get("name") == gateway_name and address.get("url"): + return address["url"] + return None + + +def run_smoke_request(config: phase_inputs.TestInputs, endpoint_url: str) -> dict[str, object]: + namespace = config.namespace + job_name = config.platform["smoke"]["job_name"] + + payload = { + "model": config.model["served_model_name"], + "prompt": config.smoke_request["prompt"], + "max_tokens": config.smoke_request["max_tokens"], + "temperature": config.smoke_request["temperature"], + } + llmd_runtime.write_json(config.artifact_dir / "artifacts" / "smoke.request.json", payload) + + llmd_runtime.oc( + "delete", + "job", + job_name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.wait_until( + f"job/{job_name} deletion in {namespace}", + timeout_seconds=120, + interval_seconds=5, + predicate=lambda: not llmd_runtime.resource_exists("job", job_name, namespace=namespace), + ) + + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "smoke-job.yaml", + llmd_runtime.render_smoke_request_job(config, endpoint_url, payload), + ) + + try: + llmd_runtime.wait_for_job_completion( + job_name, + namespace, + timeout_seconds=( + config.platform["smoke"]["request_retries"] + * ( + config.platform["smoke"]["request_timeout_seconds"] + + config.platform["smoke"]["request_retry_delay_seconds"] + ) + ), + interval_seconds=5, + ) + finally: + capture_smoke_state(config) + + result = llmd_runtime.oc( + "logs", + f"job/{job_name}", + "-n", + namespace, + check=False, + capture_output=True, + ) + + if result.returncode != 0 or not result.stdout: + raise RuntimeError( + f"Smoke request job {job_name} completed but response logs could not be read: {result.stderr}" + ) + + response = json.loads(result.stdout) + if not response.get("choices"): + raise RuntimeError(f"Invalid smoke response payload: {result.stdout}") + return response + + +def capture_smoke_state(config: phase_inputs.TestInputs) -> None: + 
job_name = config.platform["smoke"]["job_name"] + namespace = config.namespace + artifacts_dir = config.artifact_dir / "artifacts" + + capture_get("job", job_name, namespace, "yaml", artifacts_dir / "smoke_job.yaml") + capture_get( + "pods", + None, + namespace, + "yaml", + artifacts_dir / "smoke_job.pods.yaml", + selector=f"job-name={job_name}", + ) + result = llmd_runtime.oc( + "logs", + f"job/{job_name}", + "-n", + namespace, + check=False, + capture_output=True, + ) + if result.returncode == 0 and result.stdout: + llmd_runtime.write_text(artifacts_dir / "smoke_job.logs", result.stdout) + + +def run_guidellm_benchmark(config: phase_inputs.TestInputs, endpoint_url: str) -> None: + benchmark_name = config.benchmark["job_name"] + namespace = config.namespace + + llmd_runtime.oc( + "delete", + "job,pvc", + benchmark_name, + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + llmd_runtime.oc( + "delete", + "pod", + f"{benchmark_name}-copy", + "-n", + namespace, + "--ignore-not-found=true", + check=False, + ) + + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "guidellm-pvc.yaml", + llmd_runtime.render_guidellm_pvc(config), + ) + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "guidellm-job.yaml", + llmd_runtime.render_guidellm_job(config, endpoint_url), + ) + + def _job_terminal() -> dict[str, object] | None: + payload = llmd_runtime.oc_get_json("job", name=benchmark_name, namespace=namespace) + status = payload.get("status", {}) + if status.get("succeeded"): + return payload + if status.get("failed"): + raise RuntimeError(f"GuideLLM job {benchmark_name} failed") + return None + + llmd_runtime.wait_until( + f"GuideLLM job/{benchmark_name}", + timeout_seconds=config.benchmark["timeout_seconds"], + interval_seconds=10, + predicate=_job_terminal, + ) + + capture_guidellm_state(config) + copy_guidellm_results(config) + + +def copy_guidellm_results(config: phase_inputs.TestInputs) -> None: + benchmark_name = config.benchmark["job_name"] + namespace = config.namespace + pod_data = llmd_runtime.oc_get_json( + "pods", + namespace=namespace, + selector=f"job-name={benchmark_name}", + ignore_not_found=True, + ) + node_name = None + if pod_data and pod_data.get("items"): + node_name = pod_data["items"][0].get("spec", {}).get("nodeName") + + llmd_runtime.apply_manifest( + config.artifact_dir / "src" / "guidellm-copy-pod.yaml", + llmd_runtime.render_guidellm_copy_pod(config, node_name=node_name), + ) + + def _helper_ready() -> bool: + payload = llmd_runtime.oc_get_json( + "pod", + name=f"{benchmark_name}-copy", + namespace=namespace, + ) + conditions = payload.get("status", {}).get("conditions", []) + return any( + condition.get("type") == "Ready" and condition.get("status") == "True" + for condition in conditions + ) + + llmd_runtime.wait_until( + f"GuideLLM copy helper pod/{benchmark_name}-copy", + timeout_seconds=120, + interval_seconds=5, + predicate=_helper_ready, + ) + + result = llmd_runtime.oc( + "exec", + "-n", + namespace, + f"{benchmark_name}-copy", + "--", + "cat", + "/results/benchmarks.json", + check=False, + capture_output=True, + ) + if result.returncode == 0 and result.stdout: + llmd_runtime.write_text( + config.artifact_dir / "artifacts" / "results" / "benchmarks.json", + result.stdout, + ) + + +def capture_inference_service_state(config: phase_inputs.TestInputs) -> None: + name = config.platform["inference_service"]["name"] + namespace = config.namespace + artifacts_dir = config.artifact_dir / "artifacts" + selector = 
f"app.kubernetes.io/name={name}" + + capture_get( + "llminferenceservice", + name, + namespace, + "yaml", + artifacts_dir / "llminferenceservice.yaml", + ) + capture_get( + "llminferenceservice", + name, + namespace, + "json", + artifacts_dir / "llminferenceservice.json", + ) + capture_get( + "pods", + None, + namespace, + "yaml", + artifacts_dir / "llminferenceservice.pods.yaml", + selector=selector, + ) + capture_get( + "deployments", + None, + namespace, + "yaml", + artifacts_dir / "llminferenceservice.deployments.yaml", + selector=selector, + ) + capture_get( + "replicasets", + None, + namespace, + "yaml", + artifacts_dir / "llminferenceservice.replicasets.yaml", + selector=selector, + ) + capture_get("pods", None, namespace, "wide", artifacts_dir / "namespace.pods.status") + capture_get("services", None, namespace, "wide", artifacts_dir / "namespace.services.status") + + pod_list = llmd_runtime.oc_get_json( + "pods", namespace=namespace, selector=selector, ignore_not_found=True + ) + if pod_list: + lines = [] + previous_lines = [] + for pod in pod_list.get("items", []): + pod_name = pod["metadata"]["name"] + lines.append(f"=== {pod_name} ===") + log_result = llmd_runtime.oc( + "logs", + pod_name, + "-n", + namespace, + "--all-containers=true", + check=False, + capture_output=True, + ) + if log_result.stdout: + lines.append(log_result.stdout.rstrip()) + + previous_lines.append(f"=== {pod_name} ===") + previous_result = llmd_runtime.oc( + "logs", + pod_name, + "-n", + namespace, + "--previous", + "--all-containers=true", + check=False, + capture_output=True, + ) + if previous_result.stdout: + previous_lines.append(previous_result.stdout.rstrip()) + + llmd_runtime.write_text( + artifacts_dir / "llminferenceservice.pods.logs", "\n".join(lines) + "\n" + ) + llmd_runtime.write_text( + artifacts_dir / "llminferenceservice.pods.previous.logs", + "\n".join(previous_lines) + "\n", + ) + + +def capture_guidellm_state(config: phase_inputs.TestInputs) -> None: + benchmark_name = config.benchmark["job_name"] + namespace = config.namespace + artifacts_dir = config.artifact_dir / "artifacts" + + capture_get( + "job", + benchmark_name, + namespace, + "yaml", + artifacts_dir / "guidellm_benchmark_job.yaml", + ) + capture_get( + "pods", + None, + namespace, + "yaml", + artifacts_dir / "guidellm_benchmark_job.pods.yaml", + selector=f"job-name={benchmark_name}", + ) + result = llmd_runtime.oc( + "logs", + f"job/{benchmark_name}", + "-n", + namespace, + check=False, + capture_output=True, + ) + if result.returncode == 0 and result.stdout: + llmd_runtime.write_text(artifacts_dir / "guidellm_benchmark_job.logs", result.stdout) + + +def capture_get( + kind: str, + name: str | None, + namespace: str, + output: str, + destination: Path, + *, + selector: str | None = None, +) -> None: + args = ["get", kind] + if name: + args.append(name) + args.extend(["-n", namespace]) + if selector: + args.extend(["-l", selector]) + args.extend(["-o", output]) + result = llmd_runtime.oc(*args, check=False, capture_output=True) + if result.returncode == 0 and result.stdout: + llmd_runtime.write_text(destination, result.stdout) + + +main = toolbox.create_toolbox_main(run) + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index c6632bf9..139c1bc7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ dependencies = [ "plotly>=5.17.0", "dash>=2.14.0", "dash-bootstrap-components>=1.5.0", + "jinja2", "pyyaml>=6.0", "jsonschema>=4.19.0", "structlog>=23.1.0", @@ -125,7 +126,7 @@ ignore = 
 [tool.pytest.ini_options]
 minversion = "7.0"
 addopts = "-ra -q --strict-markers --strict-config"
-testpaths = ["projects/core/tests"]
+testpaths = ["projects/core/tests", "projects/llm_d/tests"]
 python_files = ["test_*.py", "*_test.py"]
 python_classes = ["Test*"]
 python_functions = ["test_*"]
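
Note (illustrative, not part of the patch): the `capture_get` helper added above only shells out to `oc get` and persists stdout when the call succeeds (`check=False`, write on `returncode == 0`). A minimal sketch of the command it assembles for the label-selector case; the namespace and selector values below are made-up placeholders, not taken from the patch:

# Sketch of capture_get's argument assembly, run standalone to show the
# resulting `oc` invocation. All concrete values here are hypothetical.
kind, name, namespace, output = "pods", None, "example-namespace", "yaml"
selector = "job-name=smoke-request"  # hypothetical job label

args = ["get", kind]
if name:  # no resource name given -> list the whole kind
    args.append(name)
args.extend(["-n", namespace])
if selector:
    args.extend(["-l", selector])
args.extend(["-o", output])

print("oc", " ".join(args))
# -> oc get pods -n example-namespace -l job-name=smoke-request -o yaml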