class SkillsConfig(BaseModel):
    """Configuration for skill selection using hybrid RAG retrieval.

    If this config is present, skill selection is enabled. If absent, no skills are used.
    """

    # Root directory whose immediate subdirectories each define one skill
    # (each subdirectory is expected to contain a skill.md with YAML frontmatter).
    skills_dir: str = Field(
        default="skills",
        description="Path to directory containing skill subdirectories",
    )

    # When None, the consumer falls back to its default embedding model.
    embed_model_path: Optional[str] = Field(
        default=None,
        description="Path to sentence transformer model for embeddings",
    )

    # Fusion weight between dense (vector) and sparse (BM25) retrieval scores.
    alpha: float = Field(
        default=0.8,
        ge=0.0,
        le=1.0,
        description="Weight for dense vs sparse retrieval (1.0 = full dense, 0.0 = full sparse)",
    )

    # Number of candidates pulled from each retriever before score fusion.
    top_k: int = Field(
        default=3,
        ge=1,
        le=20,
        description="Number of candidate skills to consider during retrieval",
    )

    # Matches scoring below this fused value are rejected (no skill selected).
    threshold: float = Field(
        default=0.3,
        ge=0.0,
        le=1.0,
        description="Minimum similarity score to accept a skill match",
    )
"""Hybrid Skills RAG implementation for skill selection."""

import logging
from collections.abc import Callable
from dataclasses import dataclass
from pathlib import Path

import frontmatter
from rank_bm25 import BM25Okapi

from ols.src.tools.tools_rag.hybrid_tools_rag import QdrantStore

logger = logging.getLogger(__name__)

# Canonical name of the skill definition file (matched case-insensitively).
_SKILL_MD = "skill.md"


@dataclass(frozen=True, slots=True)
class Skill:
    """A loaded skill artifact with parsed metadata and directory path."""

    name: str
    description: str
    # Path to the skill *directory* (not the skill.md file itself).
    source_path: str

    def load_content(self) -> str:
        """Read all files in the skill directory tree and concatenate on demand.

        The main skill file body (everything after frontmatter) is returned
        first, followed by the contents of any additional text files found
        anywhere in the skill directory tree, each separated by a header
        showing the relative path.

        Returns:
            Combined content of all files in the skill directory tree.

        Raises:
            OSError: If the skill directory or its files cannot be read.
        """
        skill_dir = Path(self.source_path)
        main_body: str | None = None
        extras: list[str] = []

        for entry in sorted(skill_dir.rglob("*")):
            if not entry.is_file():
                continue
            try:
                raw = entry.read_text(encoding="utf-8").strip()
            except (UnicodeDecodeError, ValueError):
                # Skip binary or otherwise undecodable files.
                continue
            is_skill_md = entry.name.lower() == _SKILL_MD
            # Strip YAML frontmatter from any skill definition file.
            text = frontmatter.loads(raw).content.strip() if is_skill_md else raw
            if is_skill_md and entry.parent == skill_dir:
                # Fix: only the directory's own skill.md is the primary body.
                # Previously every skill.md found anywhere in the tree was
                # insert(0)-ed, so nested skill.md files displaced the main
                # body, lost their path headers, and appeared in reverse order.
                main_body = text
            else:
                rel = entry.relative_to(skill_dir)
                extras.append(f"## {rel}\n\n{text}")

        parts = ([main_body] if main_body is not None else []) + extras
        return "\n\n".join(parts)


def _find_skill_file(directory: Path) -> Path | None:
    """Find the skill definition file in a directory (case-insensitive)."""
    for child in directory.iterdir():
        if child.is_file() and child.name.lower() == _SKILL_MD:
            return child
    return None


def load_skills_from_directory(skills_dir: str | Path) -> list[Skill]:
    """Load all skill definitions from a directory of skill subdirectories.

    Each immediate subdirectory of ``skills_dir`` is treated as a skill.
    The subdirectory must contain a ``skill.md`` or ``SKILL.md`` file with
    YAML frontmatter.
    The subdirectory path is stored as ``source_path`` so that ``load_content``
    can read all files in it on demand.

    Args:
        skills_dir: Root directory containing skill subdirectories.

    Returns:
        List of parsed Skill objects (empty if the directory does not exist).
    """
    skills_path = Path(skills_dir)
    if not skills_path.is_dir():
        logger.warning("Skills directory does not exist: %s", skills_dir)
        return []

    skills: list[Skill] = []
    for child in sorted(skills_path.iterdir()):
        if not child.is_dir():
            continue
        skill_file = _find_skill_file(child)
        if skill_file is None:
            logger.debug("Skipping directory without skill.md: %s", child)
            continue
        skill = _parse_skill_directory(child, skill_file)
        if skill is not None:
            skills.append(skill)

    logger.info("Loaded %d skills from %s", len(skills), skills_dir)
    return skills


def _parse_skill_directory(skill_dir: Path, skill_file: Path) -> Skill | None:
    """Parse frontmatter from a skill directory's skill definition file.

    Args:
        skill_dir: Path to the skill directory (stored as source_path).
        skill_file: Path to the skill definition file within the directory.

    Returns:
        Parsed Skill with source_path pointing to the directory,
        or None if the file is malformed.
    """
    try:
        post = frontmatter.load(str(skill_file))
    except Exception:
        # Deliberate best-effort: a malformed skill must not abort loading
        # of the remaining skills.
        logger.warning("Cannot read or parse skill file: %s", skill_file)
        return None

    name = post.metadata.get("name")
    description = post.metadata.get("description", "")
    if not name:
        logger.warning("Skill file missing 'name' in frontmatter: %s", skill_file)
        return None

    return Skill(
        name=name,
        description=description,
        source_path=str(skill_dir),
    )


class SkillsRAG:
    """Hybrid RAG system for skill selection using dense and sparse retrieval."""

    def __init__(
        self,
        encode_fn: Callable[[str], list[float]],
        alpha: float = 0.8,
        top_k: int = 3,
        threshold: float = 0.3,
    ) -> None:
        """Initialize the SkillsRAG system.

        Args:
            encode_fn: Function that encodes text into an embedding vector.
            alpha: Weight for dense vs sparse (1.0 = full dense, 0.0 = full sparse).
            top_k: Number of candidate skills to consider during retrieval.
            threshold: Minimum fused score to accept a skill match.
        """
        self.alpha = alpha
        self.top_k = top_k
        self.threshold = threshold
        self._encode = encode_fn
        # Rebuilt from the store's documents on every populate_skills call.
        self.bm25: BM25Okapi | None = None
        self.store = QdrantStore()
        # Keyed by skill source_path (also used as the vector-store id).
        self._skills: dict[str, Skill] = {}

    def populate_skills(self, skills: list[Skill]) -> None:
        """Index skills for hybrid retrieval.

        Args:
            skills: List of Skill objects to index.
        """
        ids: list[str] = []
        docs: list[str] = []
        vectors: list[list[float]] = []

        for skill in skills:
            # Only name + description are indexed; full content is lazy-loaded.
            text = f"{skill.name} {skill.description}"
            vector = self._encode(text)

            ids.append(skill.source_path)
            docs.append(text)
            vectors.append(vector)
            self._skills[skill.source_path] = skill

        self.store.upsert(ids, docs, vectors)
        self._rebuild_bm25()
        logger.info("Indexed %d skills for retrieval", len(skills))

    def retrieve_skill(self, query: str) -> Skill | None:
        """Retrieve the best matching skill for a query.

        Args:
            query: User query to match against indexed skills.

        Returns:
            The best matching Skill, or None if no skill exceeds the threshold
            (caller should fall back to default behavior).
        """
        if not self._skills:
            return None

        q_vec = self._encode(query)

        # Dense side: rank-based scores in (0, 1], 1.0 for the top hit.
        dense_ids, _, _ = self.store.search_with_scores(q_vec, self.top_k)
        dense_scores = {t: 1.0 - i / self.top_k for i, t in enumerate(dense_ids)}

        # Sparse side: BM25 scores normalized to [0, 1].
        sparse_scores = self._retrieve_sparse_scores(query)
        sparse_ids = sorted(sparse_scores, key=sparse_scores.get, reverse=True)[
            : self.top_k
        ]

        # Weighted fusion over the union of both candidate sets.
        fused: dict[str, float] = {}
        for t in set(list(dense_ids) + sparse_ids):
            d = dense_scores.get(t, 0.0)
            s = sparse_scores.get(t, 0.0)
            fused[t] = self.alpha * d + (1 - self.alpha) * s

        if not fused:
            return None

        best_id = max(fused, key=fused.get)  # type: ignore[arg-type]
        best_score = fused[best_id]

        if best_score < self.threshold:
            logger.debug(
                "Best skill '%s' scored %.3f, below threshold %.3f",
                best_id,
                best_score,
                self.threshold,
            )
            return None

        logger.info(
            "Selected skill '%s' with score %.3f for query: %s",
            best_id,
            best_score,
            query[:80],
        )
        return self._skills.get(best_id)

    def _rebuild_bm25(self) -> None:
        """Rebuild BM25 index from stored documents."""
        all_data = self.store.get_all()
        if not all_data["documents"]:
            self.bm25 = None
            return
        sparse_docs = [doc.split() for doc in all_data["documents"]]
        self.bm25 = BM25Okapi(sparse_docs)

    def _retrieve_sparse_scores(self, query: str) -> dict[str, float]:
        """Retrieve BM25 scores normalized to 0-1 range.

        Args:
            query: The query string.

        Returns:
            Dictionary mapping skill IDs to normalized BM25 scores.
        """
        if self.bm25 is None:
            return {}

        all_data = self.store.get_all()
        skill_ids = all_data["ids"]

        raw_scores = self.bm25.get_scores(query.split())
        # Fix: compute max once instead of twice; float fallback of 1.0
        # keeps the normalized values consistently float-typed.
        peak = max(raw_scores, default=0.0)
        max_score = peak if peak > 0 else 1.0

        return {sid: score / max_score for sid, score in zip(skill_ids, raw_scores)}
+ """ + skills_config = self.config.ols_config.skills + if skills_config is None: + return None + + skills_dir = Path(skills_config.skills_dir) + if not skills_dir.is_dir(): + logger.warning("Skills directory does not exist: %s", skills_dir) + return None + + skills = load_skills_from_directory(skills_dir) + if not skills: + logger.warning("No skills found in %s", skills_dir) + return None + + embed_model = self.rag_index_loader.embed_model + if embed_model is None or isinstance(embed_model, str): + from llama_index.embeddings.huggingface import ( # pylint: disable=import-outside-toplevel + HuggingFaceEmbedding, + ) + + model_path = ( + skills_config.embed_model_path + or "sentence-transformers/all-mpnet-base-v2" + ) + embed_model = HuggingFaceEmbedding(model_name=model_path) + + rag = SkillsRAG( + encode_fn=embed_model.get_text_embedding, + alpha=skills_config.alpha, + top_k=skills_config.top_k, + threshold=skills_config.threshold, + ) + rag.populate_skills(skills) + + return rag + @property def proxy_config(self) -> Optional[config_model.ProxyConfig]: """Return the proxy configuration.""" @@ -229,6 +277,8 @@ def reload_from_yaml_file( del self.__dict__["mcp_servers_dict"] if "tools_rag" in self.__dict__: del self.__dict__["tools_rag"] + if "skills_rag" in self.__dict__: + del self.__dict__["skills_rag"] except Exception as e: print(f"Failed to load config file {config_file}: {e!s}") print(traceback.format_exc()) diff --git a/pyproject.toml b/pyproject.toml index df03c172a..44d4ffe92 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -125,6 +125,7 @@ dependencies = [ "mcp>=1.23.0", "qdrant-client>=1.13.3", # For ToolsRAG vector storage (pure Python, no Rust/Cargo) "rank-bm25>=0.2.2", # For ToolsRAG sparse retrieval + "python-frontmatter>=1.1.0", # For parsing skill YAML frontmatter ] requires-python = ">=3.11.1,<=3.12.10" readme = "README.md" diff --git a/skills/degraded-operator-recovery/skill.md b/skills/degraded-operator-recovery/skill.md new file mode 100644 
index 000000000..44ce37ea6 --- /dev/null +++ b/skills/degraded-operator-recovery/skill.md @@ -0,0 +1,74 @@ +--- +name: degraded-operator-recovery +description: Diagnose and recover OpenShift cluster operators in Degraded, Unavailable, or not Progressing state. Use when a ClusterOperator is broken, failing, or showing errors. +--- + +# Degraded Operator Recovery + +When a user reports unhealthy cluster operators or a stuck upgrade, follow this structured approach to identify the blocking condition and provide recovery steps. + +## 1. Assess Cluster Operator Health + +Start by listing all cluster operators and their status conditions: + +- Identify operators with `Degraded=True` or `Available=False`. +- Note operators with `Progressing=True` — these may be mid-reconciliation and need time, not intervention. +- If multiple operators are degraded, identify dependencies. For example, if `kube-apiserver` is degraded, other operators that depend on the API server will also report issues. + +Focus on the **root cause operator** — the one whose degradation is not explained by another operator's failure. + +## 2. Read the Blocking Condition + +For each degraded operator: + +1. Read the operator's status conditions — the `message` field on the `Degraded` condition usually contains the specific error. +2. Check the `lastTransitionTime` to understand how long the operator has been in this state. +3. Look for common patterns in the condition message: + - Certificate expiry or rotation failures + - Webhook configuration errors + - Failed rollout of an operand deployment + - Resource contention (CPU/memory on control plane nodes) + - Quorum loss (etcd-specific) + +Do not skip this step. The condition message is the single most informative piece of data. + +## 3. Inspect Managed Operands + +If the condition message is not sufficient: + +1. Identify the operator's managed deployments, daemonsets, or statefulsets (usually in `openshift-*` namespaces). +2. 
Check if any operand pods are in CrashLoopBackOff, Pending, or Error state. +3. If operand pods are failing, triage them using the same approach as pod failure diagnosis — check logs and events. +4. For control plane operators (etcd, kube-apiserver, kube-controller-manager, kube-scheduler): check static pod status on control plane nodes. + +## 4. Check for Upgrade-Related Issues + +If this is happening during or after a cluster upgrade: + +1. Check `oc get clusterversion` for the upgrade status and any reported failures. +2. Determine if the operator is stuck waiting for a node reboot (MachineConfigPool not updated). +3. Check if pending CSRs need approval — node certificate renewal during upgrade can block operators. +4. For transient `Progressing=True` during upgrade: advise waiting (up to the operator's expected rollout window) before intervening. + +Distinguish between "upgrade in progress" (normal) and "upgrade stuck" (needs intervention). + +## 5. Provide Recovery Steps + +Once the blocking condition is identified: + +1. State which operator is degraded and what the blocking condition is. +2. Provide specific recovery actions: + - **Pending CSRs**: approve them with `oc adm certificate approve`. + - **Failed operand pod**: follow pod triage to fix the underlying issue. + - **Certificate issues**: check if cert rotation can be triggered or if manual renewal is needed. + - **Resource contention**: identify the pressure source on control plane nodes. + - **Webhook errors**: check if the webhook service is available and the CA bundle is correct. +3. If the issue is internal to the operator and cannot be resolved by the user, recommend opening a support case with the specific condition message. + +## Quality Standards + +- Always start with `oc get clusteroperators` before diving deeper — this prevents chasing symptoms of a different root cause. 
+- For etcd, kube-apiserver, and kube-controller-manager: warn about control plane impact before suggesting any restart or remediation. +- Never suggest force-deleting operator-managed resources without explicit warning — operators reconcile state and force-deletion can cause split-brain. +- If the operator is `Progressing=True`, advise waiting before intervening — premature action can make the situation worse. +- Distinguish upgrade-related transient degradation from persistent failures that need action. diff --git a/skills/namespace-troubleshooting/skill.md b/skills/namespace-troubleshooting/skill.md new file mode 100644 index 000000000..4971e1474 --- /dev/null +++ b/skills/namespace-troubleshooting/skill.md @@ -0,0 +1,85 @@ +--- +name: namespace-troubleshooting +description: Diagnose namespace-level issues including stuck termination, quota exhaustion, and RBAC misconfigurations. Use when a namespace is stuck deleting, resources cannot be created, permission denied, or forbidden errors occur. +--- + +# Namespace Troubleshooting + +When a user reports issues at the namespace/project level — stuck deletion, inability to create resources, or permission errors — follow this structured approach to identify the specific blocker and provide a targeted fix. + +## 1. Check Namespace Status + +Start by confirming the namespace exists and its phase: + +1. If the namespace is **Active**: the issue is likely quota, LimitRange, or RBAC. Proceed to the relevant section based on the user's symptom. +2. If the namespace is **Terminating**: proceed to the stuck termination triage. +3. If the namespace does not exist: confirm with the user whether it was recently deleted or never created. + +## 2. Stuck Terminating Namespace + +If the namespace is stuck in Terminating state: + +1. Check the namespace's `metadata.finalizers` — these are the hooks preventing deletion. +2. List all resources still remaining in the namespace. 
Kubernetes cannot remove the namespace until all resources with finalizers are cleaned up. +3. Identify which specific resources are blocking: + - **Custom Resources with finalizers**: the operator managing them may be absent or broken. Check if the CRD's controller is running. + - **PersistentVolumeClaims**: the underlying PV may be stuck in Released state or the storage provisioner is not cleaning up. + - **Pods with long graceful termination**: check if pods have unusually long `terminationGracePeriodSeconds` or are stuck in pre-stop hooks. +4. For each blocking resource, explain what the finalizer protects and the consequences of removing it. +5. If a finalizer must be removed manually, provide the specific command but warn that this skips cleanup — the external resource (PV, cloud resource, etc.) may be orphaned. + +Never suggest blanket removal of namespace finalizers ("just patch the namespace to remove all finalizers"). This skips all cleanup and leaks resources. Always identify and address the specific blocking resources first. + +## 3. Quota Exhaustion + +If deployments or pods fail to create with quota-related errors: + +1. List the ResourceQuota objects in the namespace and their current usage vs. limits. +2. Show the arithmetic: what the new pod/deployment requests + what is already consumed vs. the quota limit. +3. Identify which specific resource is exhausted (cpu, memory, pods count, services, configmaps, etc.). +4. Recommend the appropriate fix: + - If the quota is legitimately too low: increase it (and note who has permission to do so). + - If existing resources are consuming more than expected: identify the top consumers. + - If completed Jobs or failed pods are consuming quota: suggest cleaning them up. + +Always show the numbers. "Quota exceeded" without showing the actual usage vs. limit is not actionable. + +## 4. LimitRange Violations + +If pod creation fails with LimitRange-related errors: + +1. List the LimitRange objects in the namespace. 
+2. Compare the pod's resource requests and limits against the LimitRange constraints: + - **Min/Max violations**: the pod requests less than the minimum or more than the maximum allowed. + - **Default applied unexpectedly**: if the pod has no resource requests, the LimitRange default is applied — this may conflict with the application's actual needs. + - **MaxLimitRequestRatio**: the ratio between the pod's limit and request exceeds the allowed ratio. +3. Show exactly which constraint is violated and by how much. +4. Recommend adjusting either the pod's resources or the LimitRange, depending on which is the source of truth. + +## 5. RBAC / Permission Issues + +If the user gets `Forbidden` errors when operating in the namespace: + +1. Ask for or identify the exact error message — it contains the verb, resource, and API group being denied. +2. Check the user's RoleBindings in the namespace and their ClusterRoleBindings. +3. Identify whether the user has a Role that includes the required verb + resource combination. +4. If the permission is missing: recommend creating a specific RoleBinding. Specify the exact Role or ClusterRole to bind — do not default to "give them admin." +5. If the user should not have the permission: confirm this is expected and explain who does have it. + +Always specify the exact verb+resource the user is missing (e.g., "create deployments.apps") rather than vaguely saying "insufficient permissions." + +## 6. Project-Specific Considerations + +OpenShift Projects are namespaces with additional metadata. If the issue involves Projects specifically: + +1. Check if a ProjectRequest template is in use — it may inject default quotas, LimitRanges, or NetworkPolicies that cause unexpected restrictions. +2. Check if the user's ability to create Projects is controlled by a `self-provisioner` ClusterRoleBinding — removal of this binding prevents users from creating their own Projects. +3. 
For multi-tenant clusters: check if NetworkPolicies in the namespace are blocking expected traffic between namespaces. + +## Quality Standards + +- For stuck Terminating: always list the specific resources and finalizers blocking deletion. Never suggest removing namespace finalizers without first identifying what they protect. +- For quota: show the arithmetic (requested + existing usage vs. limit). An error message without numbers is not actionable. +- For RBAC: specify the exact verb+resource the user is missing, not just "add admin role." Least-privilege matters. +- For LimitRange: show which constraint is violated and by how much, not just that a violation exists. +- If the issue spans multiple categories (e.g., quota is full AND RBAC prevents the user from adjusting it), address both and clarify the dependency. diff --git a/skills/node-not-ready/skill.md b/skills/node-not-ready/skill.md new file mode 100644 index 000000000..bf092f2ea --- /dev/null +++ b/skills/node-not-ready/skill.md @@ -0,0 +1,72 @@ +--- +name: node-not-ready +description: Diagnose nodes in NotReady or SchedulingDisabled state. Use when a node is down, unschedulable, not accepting pods, or needs to be drained and restored. +--- + +# Node Not Ready Diagnosis + +When a user reports a node that is NotReady, flapping, or has evicted workloads unexpectedly, follow this structured approach to identify the cause and restore the node. + +## 1. Identify Affected Nodes + +List all nodes and their statuses: + +1. Note which nodes are NotReady and how long they have been in that state. Recent (minutes) vs. prolonged (hours/days) requires different urgency. +2. Determine if the affected node is a **control plane node** or a **worker**. Control plane nodes are higher priority — losing quorum affects the entire cluster. +3. Check if the node is `SchedulingDisabled` (cordoned) — this may be intentional (maintenance) or a side effect of a MachineHealthCheck remediation. + +## 2. 
Read Node Conditions + +Describe the affected node and examine its conditions: + +1. **MemoryPressure=True**: the node is running low on memory. Identify the top memory-consuming pods and recommend eviction or resource limit adjustments. +2. **DiskPressure=True**: the node is running low on disk. Check for large container logs, unused images, or full persistent volumes. Recommend cleanup actions. +3. **PIDPressure=True**: the node is exhausting process IDs. Identify pods with excessive process creation (fork bombs, misconfigured worker pools). +4. **NetworkUnavailable=True**: the node's network plugin is not ready. Check SDN/OVN-Kubernetes pod health on that specific node. +5. **Ready=False with no other pressure conditions**: kubelet is not posting status. This usually means the kubelet process is down or the node is unreachable. + +The specific condition tells you exactly where to look next — do not run through all possibilities if the condition is clear. + +## 3. Check Infrastructure Layer + +If the node conditions suggest the node itself is unreachable or the kubelet is down: + +1. Check the Machine and MachineSet objects — is the machine reported as running by the infrastructure provider? +2. Check if a MachineHealthCheck has already detected the issue and is remediating (creating a replacement machine). If so, advise waiting for the remediation to complete before manual intervention. +3. For bare-metal or user-provisioned infrastructure: the user needs to check the host directly (SSH, console, BMC). OLS cannot diagnose host-level issues without cluster-side evidence. + +## 4. Check Certificate Issues + +Node certificate problems are a common cause of NotReady, especially after extended downtime: + +1. Check for pending Certificate Signing Requests (CSRs) — nodes that were offline during certificate rotation may need their CSRs manually approved. +2. If CSRs are pending: approve them and verify the node returns to Ready state. +3. 
If no CSRs are pending but the node was offline for a long time: the kubelet certificates may have expired entirely and the node may need to be re-joined. + +## 5. Check Network Plugin Health + +If the condition is NetworkUnavailable: + +1. Identify which network plugin the cluster uses (OpenShift SDN or OVN-Kubernetes). +2. Check the network plugin pods running on the affected node — are they in CrashLoopBackOff or not scheduled? +3. Check the node's network plugin logs for specific errors (failed to configure interface, VXLAN/Geneve tunnel issues, OVS database corruption). + +## 6. Provide Recovery Steps + +Once the cause is identified: + +1. State the specific condition and evidence. +2. Provide the targeted fix: + - **Resource pressure**: identify consumers, suggest eviction or resource adjustments. + - **Kubelet down**: check machine status, recommend restart or machine replacement. + - **Pending CSRs**: approve them. + - **Network plugin failure**: restart the network plugin pod on the affected node or investigate the specific error. +3. After remediation, verify the node transitions back to Ready. + +## Quality Standards + +- Always report how long the node has been NotReady — this affects triage urgency and likely causes. +- Check whether a MachineHealthCheck is already remediating before suggesting manual action — do not compete with automated remediation. +- Warn before suggesting cordon/drain if the cluster is already capacity-constrained — draining a node when others are also unhealthy can cause cascading evictions. +- Distinguish control plane nodes from workers in all recommendations. Restarting a control plane node has different risk than restarting a worker. +- If the node is flapping (oscillating between Ready and NotReady), focus on the transition events rather than the current state. 
diff --git a/skills/pod-failure-diagnosis/skill.md b/skills/pod-failure-diagnosis/skill.md new file mode 100644 index 000000000..b79056270 --- /dev/null +++ b/skills/pod-failure-diagnosis/skill.md @@ -0,0 +1,73 @@ +--- +name: pod-failure-diagnosis +description: Diagnose pods stuck in CrashLoopBackOff, ImagePullBackOff, Pending, Error, or OOMKilled states. Use when a pod is not running, keeps restarting, fails to start, or containers are crashing. +--- + +# Pod Failure Diagnosis + +When a user reports a pod that is not running, follow this structured triage to identify root cause and provide remediation. + +## 1. Classify the Failure Mode + +Get the pod status and recent events to determine which failure category applies: + +- **CrashLoopBackOff** — container starts but exits repeatedly +- **ImagePullBackOff** — container image cannot be pulled +- **Pending** — pod is not scheduled to any node +- **Error / Init:Error** — init container or main container failed on startup +- **Terminating (stuck)** — pod is not cleaning up + +Ask the user for namespace and pod/deployment name if not provided. If they describe symptoms instead ("my app keeps restarting"), map to the correct category before proceeding. + +## 2. CrashLoopBackOff Triage + +If the pod is crash-looping: + +1. Retrieve container logs for the **current** attempt to see the latest error. +2. Retrieve container logs for the **previous** attempt (`--previous`) — the crash reason is often clearer there. +3. Check pod events for OOMKilled signals — this means the container exceeded its memory limit. +4. If OOMKilled: compare the container's memory limit against actual usage and recommend increasing it or fixing the memory leak. +5. If application error: cite the specific log line and recommend the fix (config error, missing dependency, failed health check, etc.). + +Do not suggest "just increase resources" unless OOMKilled is confirmed. + +## 3. ImagePullBackOff Triage + +If the image cannot be pulled: + +1. 
Check the exact image reference in the pod spec — look for typos, wrong tags, or missing registry prefix. +2. Check pod events for the specific pull error message (authentication required, not found, timeout). +3. For authentication errors: verify the imagePullSecrets on the pod and the referenced Secret's content. +4. For "not found" errors: verify the image exists in the registry with the specified tag. +5. For registry connectivity: check if the node can reach the registry (relevant for air-gapped or proxy environments). + +Report the exact error message from events — do not guess which sub-case applies. + +## 4. Pending Pod Triage + +If the pod is stuck in Pending: + +1. Check pod events for scheduling failure reasons. +2. **Insufficient resources**: compare the pod's resource requests against available node capacity. Show the arithmetic. +3. **Taints/tolerations**: identify which nodes have taints the pod does not tolerate. +4. **Node selectors / affinity**: verify the pod's constraints match at least one available node. +5. **PVC binding**: if the pod mounts a PersistentVolumeClaim, check if the PVC is Bound. If Pending, diagnose the PVC (missing StorageClass, insufficient capacity, zone mismatch). + +For each sub-case, provide the specific fix — do not list all possibilities when the events already tell you which one applies. + +## 5. Provide Remediation + +Once root cause is identified: + +1. State the root cause in one sentence with the supporting evidence (event message, log line, or metric). +2. Provide one or two actionable commands or manifest changes the user can apply. +3. If the fix involves deleting or force-replacing a resource, warn explicitly before suggesting it. +4. If the issue is an application bug (not a platform issue), say so clearly and recommend the user check their application code/config. 
+ +## Quality Standards + +- Always cite the specific event message or log line that reveals the cause — do not provide generic troubleshooting checklists. +- Distinguish between application errors and platform/infrastructure issues. The remediation path is different. +- If multiple pods are failing, prioritize the earliest failure — cascading failures often share a root cause. +- Do not suggest destructive actions (delete pod, force delete, remove finalizers) without explicit warning about consequences. +- If evidence is insufficient to determine root cause, ask one focused clarification question rather than speculating. diff --git a/skills/route-ingress-troubleshooting/skill.md b/skills/route-ingress-troubleshooting/skill.md new file mode 100644 index 000000000..5d7431e84 --- /dev/null +++ b/skills/route-ingress-troubleshooting/skill.md @@ -0,0 +1,83 @@ +--- +name: route-ingress-troubleshooting +description: Diagnose why an application is not reachable through its OpenShift Route or Ingress. Use when an app is not accessible, returning 502 or 503 errors, connection refused, or the service is not exposed externally. +--- + +# Route and Ingress Troubleshooting + +When a user reports that their application URL returns errors (503, connection refused, TLS errors) or is completely unreachable, follow this structured approach to trace the request path and identify the broken layer. + +## 1. Verify the Route Exists and Is Admitted + +Check the Route object first: + +1. Confirm the Route exists in the expected namespace. +2. Check the Route's status — is it `Admitted` by the ingress controller? If not admitted, the ingress controller has rejected it. Check the rejection reason (conflicting hostname, invalid TLS config, etc.). +3. Verify the hostname — does it match what the user is trying to reach? Typos in the hostname are common. +4. If using a custom domain: confirm DNS resolves to the cluster's ingress VIP or load balancer. 
+ +Do not proceed to service/endpoint debugging until the Route itself is confirmed as Admitted. + +## 2. Verify the Target Service + +Check that the Route points to a valid Service: + +1. Confirm the Service named in the Route's `spec.to` exists in the same namespace. +2. Check the Service's `selector` — does it match the labels on the running pods? +3. Check the Service's `targetPort` — does it match the port the application is actually listening on inside the container? + +If the Route references a Service that does not exist, that is the root cause. If the Service exists but has the wrong selector or port, report the specific mismatch. + +## 3. Check Endpoints + +Verify that the Service has backend pod IPs: + +1. Check the Endpoints object for the Service — are there IP addresses listed? +2. If **endpoints are empty**: the Service selector does not match any **running and ready** pods. Report the exact selector and the labels on available pods so the user can see the mismatch. +3. If **endpoints exist but the application returns 503**: the pods are registered but may be failing readiness probes. Check the pod readiness probe configuration and recent probe failure events. + +Empty endpoints are the most common cause of 503 errors on Routes. Always check this before investigating the ingress controller. + +## 4. Verify Pod Readiness + +If endpoints exist but the application is still unreachable: + +1. Check if the pods are `Ready` (all readiness probes passing). +2. If pods are not ready: check which readiness probe is failing and why (wrong path, wrong port, application not fully started). +3. Verify the container is listening on the expected port — a mismatch between the declared containerPort and the actual listening port causes silent failures. +4. If pods are ready and the port is correct: the issue may be application-level (the app returns errors for the specific request path). + +## 5. 
Diagnose TLS Issues + +If the error is TLS-related (certificate errors, HTTPS not working): + +1. Identify the Route's TLS termination type: **edge**, **reencrypt**, or **passthrough**. +2. For **edge** termination: + - The ingress controller terminates TLS. Check if the Route has a custom certificate or uses the default wildcard certificate. + - If custom certificate: verify the certificate matches the hostname and is not expired. +3. For **reencrypt** termination: + - The ingress controller terminates and re-encrypts to the backend. Check the `destinationCACertificate` — it must trust the backend pod's certificate. + - Verify the backend pod is serving TLS on the target port. +4. For **passthrough** termination: + - TLS is terminated by the application pod. The ingress controller does not inspect the certificate. + - Verify the pod is serving valid TLS on the expected port. + +Report which TLS layer has the issue — do not suggest regenerating all certificates when only one is wrong. + +## 6. Check the Ingress Controller + +If the Route is Admitted, the Service has endpoints, pods are ready, and TLS is correct, the issue may be at the ingress controller level: + +1. Check the IngressController/router pods in `openshift-ingress` — are they running and ready? +2. Check the IngressController's status conditions for errors. +3. If using a non-default IngressController: verify the Route is exposed by the correct one (check `routeSelector` and `namespaceSelector`). + +Ingress controller issues are rare compared to Service/Endpoint issues. Only investigate here after ruling out the common causes. + +## Quality Standards + +- Always trace the full chain: Route → Service → Endpoints → Pod. Do not skip layers — report which specific layer is broken. +- Report the exact selector mismatch if endpoints are empty — show both the Service selector and the pod labels side by side. +- For TLS issues: specify which termination type is in use and which side has the problem. 
Generic "check your certificates" is not helpful. +- Do not suggest creating a new Route if the existing one has a fixable misconfiguration. Fix what exists first. +- If the issue is application-level (the app itself returns errors), say so clearly rather than continuing to troubleshoot infrastructure. diff --git a/tests/test_skills/test-triage/skill.md b/tests/test_skills/test-triage/skill.md new file mode 100644 index 000000000..0f6036b6c --- /dev/null +++ b/tests/test_skills/test-triage/skill.md @@ -0,0 +1,46 @@ +--- +name: triage +description: Diagnose and triage incident reports involving data loss, corruption, slow performance, timeouts, or cosmetic issues. Use when the user reports a production incident or outage. +--- + +# Incident Report Triage + +When a user provides an incident report, follow these steps in order. + +## Step 1: Classify Severity + +Read the incident text and classify: +- Contains "data loss" or "corruption" → Critical → go to Step 2 +- Contains "slow" or "timeout" → Degraded → go to Step 3 +- Contains "cosmetic" or "typo" → Low → go to Step 4 + +State the severity classification and transition reason. + +## Step 2: Critical Incident Response + +1. State: "This is a Critical incident requiring immediate action." +2. Identify the affected component from the report text. +3. Recommend: rollback to last known good state. +4. Recommend: notify the on-call team. +5. Proceed to Step 5. + +## Step 3: Degraded Incident Response + +1. State: "This is a Degraded incident requiring investigation." +2. Identify the affected component from the report text. +3. Recommend: capture diagnostics. +4. Recommend: scale up if load-related. +5. Proceed to Step 5. + +## Step 4: Low Incident Response + +1. State: "This is a Low priority incident." +2. Recommend: file a backlog ticket. +3. Proceed to Step 5. 
+ +## Step 5: Summary + +Provide a structured summary: +- Severity: (Critical/Degraded/Low) +- Component: (identified from text) +- Actions: (list from the corresponding step) diff --git a/tests/unit/app/models/test_config.py b/tests/unit/app/models/test_config.py index 727d137c1..2b04437c5 100644 --- a/tests/unit/app/models/test_config.py +++ b/tests/unit/app/models/test_config.py @@ -31,6 +31,7 @@ QuotaHandlersConfig, ReferenceContent, ReferenceContentIndex, + SkillsConfig, TLSConfig, TLSSecurityProfile, UserDataCollection, @@ -4065,3 +4066,33 @@ def test_proxy_config_no_proxy_env_var_with_certificates(monkeypatch): assert proxy_config.no_proxy_hosts == no_proxy.split(",") assert str(proxy_config.proxy_ca_cert_path) == "tests/config/empty_cert.crt" assert str(proxy_config.proxy_url) == "http://proxy.example.com:1234" + + +def test_skills_config_defaults(): + """Test SkillsConfig with default values.""" + cfg = SkillsConfig() + assert cfg.skills_dir == "skills" + assert cfg.alpha == 0.8 + assert cfg.top_k == 3 + assert cfg.threshold == 0.3 + + +def test_skills_config_custom_values(): + """Test SkillsConfig with custom values.""" + cfg = SkillsConfig(skills_dir="/opt/skills", alpha=0.5, top_k=5, threshold=0.4) + assert cfg.skills_dir == "/opt/skills" + assert cfg.alpha == 0.5 + assert cfg.top_k == 5 + assert cfg.threshold == 0.4 + + +def test_skills_config_validation(): + """Test SkillsConfig field validation boundaries.""" + with pytest.raises(ValidationError, match="greater than or equal to 0"): + SkillsConfig(alpha=-0.1) + with pytest.raises(ValidationError, match="less than or equal to 1"): + SkillsConfig(alpha=1.1) + with pytest.raises(ValidationError, match="greater than or equal to 1"): + SkillsConfig(top_k=0) + with pytest.raises(ValidationError, match="less than or equal to 20"): + SkillsConfig(top_k=21) diff --git a/tests/unit/skills/__init__.py b/tests/unit/skills/__init__.py new file mode 100644 index 000000000..3ea7f9cd3 --- /dev/null +++ 
b/tests/unit/skills/__init__.py @@ -0,0 +1 @@ +"""Unit tests for ols.src.skills.""" diff --git a/tests/unit/skills/test_skills_rag.py b/tests/unit/skills/test_skills_rag.py new file mode 100644 index 000000000..e3843e48f --- /dev/null +++ b/tests/unit/skills/test_skills_rag.py @@ -0,0 +1,380 @@ +"""Unit tests for skills_rag module.""" + +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + +from ols.src.skills.skills_rag import Skill, SkillsRAG, load_skills_from_directory + +DIMENSION = 8 + + +def _fake_encode(text: str) -> list[float]: + """Deterministic encode: sum of char ordinals spread across DIMENSION dims.""" + total = sum(ord(c) for c in text) + return [(total + i) / 1000.0 for i in range(DIMENSION)] + + +def _make_skill(name: str, description: str, source_path: str = "") -> Skill: + """Create a Skill with minimal fields for testing.""" + return Skill( + name=name, + description=description, + source_path=source_path or f"skills/{name}", + ) + + +def _make_rag(**kwargs: object) -> SkillsRAG: + """Create a SkillsRAG with fake encode and sensible test defaults.""" + defaults: dict = { + "encode_fn": _fake_encode, + "alpha": 0.5, + "top_k": 5, + "threshold": 0.0, + } + defaults.update(kwargs) + return SkillsRAG(**defaults) + + +def _sample_skills() -> list[Skill]: + """Return a fixed set of skills for testing.""" + return [ + _make_skill( + "pod-failure-diagnosis", + "Diagnose pods stuck in CrashLoopBackOff, ImagePullBackOff, Pending, or Error states", + ), + _make_skill( + "degraded-operator-recovery", + "Diagnose and recover OpenShift cluster operators in Degraded or Unavailable state", + ), + _make_skill( + "node-not-ready", + "Diagnose nodes in NotReady or SchedulingDisabled state and restore them", + ), + _make_skill( + "route-ingress-troubleshooting", + "Diagnose why an application is not reachable through its OpenShift Route or Ingress", + ), + _make_skill( + "namespace-troubleshooting", + "Diagnose namespace-level issues 
including stuck termination and quota exhaustion", + ), + ] + + +class TestSkillDataclass: + """Tests for the Skill dataclass.""" + + def test_skill_is_frozen(self) -> None: + """Verify Skill instances are immutable.""" + skill = _make_skill("test", "desc") + with pytest.raises(AttributeError): + skill.name = "changed" # type: ignore[misc] + + def test_skill_fields(self) -> None: + """Verify all fields are accessible.""" + skill = _make_skill("test-skill", "A test skill") + assert skill.name == "test-skill" + assert skill.description == "A test skill" + assert skill.source_path == "skills/test-skill" + + +class TestLoadSkillsFromDirectory: + """Tests for load_skills_from_directory.""" + + def test_loads_skills_from_valid_directory(self, tmp_path: Path) -> None: + """Verify skills are loaded from a directory with valid skill files.""" + skill_dir = tmp_path / "pod-diagnosis" + skill_dir.mkdir() + (skill_dir / "skill.md").write_text( + "---\nname: pod-diagnosis\ndescription: Diagnose pods\n" + "---\n\n# Pod Diagnosis\n\nWorkflow here.", + encoding="utf-8", + ) + skills = load_skills_from_directory(tmp_path) + assert len(skills) == 1 + assert skills[0].name == "pod-diagnosis" + assert skills[0].description == "Diagnose pods" + assert skills[0].source_path == str(skill_dir) + + def test_loads_skill_with_uppercase_filename(self, tmp_path: Path) -> None: + """Verify SKILL.md (uppercase) is recognised as a valid skill file.""" + skill_dir = tmp_path / "my-skill" + skill_dir.mkdir() + (skill_dir / "SKILL.md").write_text( + "---\nname: my-skill\ndescription: Upper case\n" + "---\n\n# My Skill\n\nContent.", + encoding="utf-8", + ) + skills = load_skills_from_directory(tmp_path) + assert len(skills) == 1 + assert skills[0].name == "my-skill" + + def test_returns_empty_for_nonexistent_directory(self) -> None: + """Verify empty list for missing directory.""" + skills = load_skills_from_directory("/nonexistent/path") + assert skills == [] + + def 
test_skips_dirs_without_skill_md(self, tmp_path: Path) -> None: + """Verify directories without skill.md are skipped.""" + empty_dir = tmp_path / "no-skill-file" + empty_dir.mkdir() + (empty_dir / "readme.md").write_text("Not a skill.", encoding="utf-8") + skills = load_skills_from_directory(tmp_path) + assert skills == [] + + def test_skips_files_without_frontmatter(self, tmp_path: Path) -> None: + """Verify skill.md without YAML frontmatter is skipped.""" + skill_dir = tmp_path / "bad-skill" + skill_dir.mkdir() + (skill_dir / "skill.md").write_text( + "# No Frontmatter\n\nJust markdown.", + encoding="utf-8", + ) + skills = load_skills_from_directory(tmp_path) + assert skills == [] + + def test_skips_files_with_missing_name(self, tmp_path: Path) -> None: + """Verify skill.md without 'name' in frontmatter is skipped.""" + skill_dir = tmp_path / "no-name" + skill_dir.mkdir() + (skill_dir / "skill.md").write_text( + "---\ndescription: Missing name field\n---\n\n# Content", + encoding="utf-8", + ) + skills = load_skills_from_directory(tmp_path) + assert skills == [] + + def test_skips_invalid_yaml(self, tmp_path: Path) -> None: + """Verify skill.md with broken YAML frontmatter is skipped.""" + skill_dir = tmp_path / "bad-yaml" + skill_dir.mkdir() + (skill_dir / "skill.md").write_text( + "---\n: [invalid yaml\n---\n\n# Content", + encoding="utf-8", + ) + skills = load_skills_from_directory(tmp_path) + assert skills == [] + + def test_loads_multiple_skills_sorted(self, tmp_path: Path) -> None: + """Verify multiple skills are loaded and sorted by path.""" + for name in ["beta-skill", "alpha-skill"]: + d = tmp_path / name + d.mkdir() + (d / "skill.md").write_text( + f"---\nname: {name}\ndescription: Skill {name}\n---\n\n# {name}", + encoding="utf-8", + ) + skills = load_skills_from_directory(tmp_path) + assert len(skills) == 2 + assert skills[0].name == "alpha-skill" + assert skills[1].name == "beta-skill" + + def test_description_defaults_to_empty(self, tmp_path: Path) 
-> None: + """Verify missing description defaults to empty string.""" + skill_dir = tmp_path / "no-desc" + skill_dir.mkdir() + (skill_dir / "skill.md").write_text( + "---\nname: no-desc\n---\n\n# No Description Skill", + encoding="utf-8", + ) + skills = load_skills_from_directory(tmp_path) + assert len(skills) == 1 + assert skills[0].description == "" + + def test_load_content_reads_skill_md_body(self, tmp_path: Path) -> None: + """Verify load_content reads the skill.md body on demand.""" + skill_dir = tmp_path / "test-skill" + skill_dir.mkdir() + (skill_dir / "skill.md").write_text( + "---\nname: test-skill\ndescription: Test\n" + "---\n\n# Test Skill\n\nBody content here.", + encoding="utf-8", + ) + skills = load_skills_from_directory(tmp_path) + assert len(skills) == 1 + content = skills[0].load_content() + assert "Body content here." in content + assert "# Test Skill" in content + + def test_load_content_includes_extra_files(self, tmp_path: Path) -> None: + """Verify load_content concatenates additional files from the directory.""" + skill_dir = tmp_path / "multi-file-skill" + skill_dir.mkdir() + (skill_dir / "skill.md").write_text( + "---\nname: multi-file\ndescription: Test\n" "---\n\n# Main Skill Body", + encoding="utf-8", + ) + (skill_dir / "checklist.md").write_text( + "- Step 1\n- Step 2", + encoding="utf-8", + ) + (skill_dir / "examples.yaml").write_text( + "example: value", + encoding="utf-8", + ) + skills = load_skills_from_directory(tmp_path) + content = skills[0].load_content() + assert "# Main Skill Body" in content + assert "## checklist.md" in content + assert "- Step 1" in content + assert "## examples.yaml" in content + assert "example: value" in content + + def test_load_content_ignores_non_text_files(self, tmp_path: Path) -> None: + """Verify load_content skips files with unsupported extensions.""" + skill_dir = tmp_path / "skip-binary" + skill_dir.mkdir() + (skill_dir / "skill.md").write_text( + "---\nname: skip-binary\ndescription: Test\n" 
"---\n\n# Skill Body", + encoding="utf-8", + ) + (skill_dir / "image.png").write_bytes(b"\x89PNG") + skills = load_skills_from_directory(tmp_path) + content = skills[0].load_content() + assert "image.png" not in content + + def test_load_content_recurses_into_subdirectories(self, tmp_path: Path) -> None: + """Verify load_content includes files from subdirectories.""" + skill_dir = tmp_path / "full-skill" + skill_dir.mkdir() + (skill_dir / "skill.md").write_text( + "---\nname: full-skill\ndescription: Test\n---\n\n# Main Skill Body", + encoding="utf-8", + ) + scripts_dir = skill_dir / "scripts" + scripts_dir.mkdir() + (scripts_dir / "deploy.sh").write_text( + "#!/bin/bash\necho deploy", + encoding="utf-8", + ) + refs_dir = skill_dir / "references" + refs_dir.mkdir() + (refs_dir / "guide.md").write_text( + "# Reference Guide\n\nDetailed steps.", + encoding="utf-8", + ) + (refs_dir / "data.bin").write_bytes(b"\x80\x81\x82\xff\xfe") + skills = load_skills_from_directory(tmp_path) + content = skills[0].load_content() + assert "# Main Skill Body" in content + assert "scripts/deploy.sh" in content + assert "echo deploy" in content + assert "references/guide.md" in content + assert "Reference Guide" in content + assert "data.bin" not in content + + +class TestSkillsRAGPopulate: + """Tests for SkillsRAG.populate_skills.""" + + def test_populate_stores_skills(self) -> None: + """Verify populate_skills stores all skills in the index.""" + rag = _make_rag() + skills = _sample_skills() + rag.populate_skills(skills) + + data = rag.store.get_all() + assert len(data["ids"]) == 5 + + def test_populate_builds_bm25_index(self) -> None: + """Verify BM25 index is built after population.""" + rag = _make_rag() + assert rag.bm25 is None + rag.populate_skills(_sample_skills()) + assert rag.bm25 is not None + + def test_populate_keeps_skill_references(self) -> None: + """Verify populated skills are retrievable by source_path.""" + rag = _make_rag() + skills = _sample_skills() + 
rag.populate_skills(skills) + assert skills[0].source_path in rag._skills + assert rag._skills[skills[0].source_path].name == skills[0].name + + def test_populate_calls_encode_fn(self) -> None: + """Verify the encode function is called for each skill.""" + mock_encode = MagicMock(return_value=[0.1] * DIMENSION) + rag = _make_rag(encode_fn=mock_encode) + rag.populate_skills(_sample_skills()) + assert mock_encode.call_count == 5 + + def test_populate_upsert_updates_existing(self) -> None: + """Verify re-populating with same skills does not duplicate.""" + rag = _make_rag() + skills = _sample_skills() + rag.populate_skills(skills) + rag.populate_skills(skills) + data = rag.store.get_all() + assert len(data["ids"]) == 5 + + +class TestSkillsRAGRetrieve: + """Tests for SkillsRAG.retrieve_skill.""" + + def _populated_rag(self, **kwargs: object) -> SkillsRAG: + """Create and populate a SkillsRAG with sample skills.""" + rag = _make_rag(**kwargs) + rag.populate_skills(_sample_skills()) + return rag + + def test_retrieve_returns_skill_object(self) -> None: + """Verify retrieve returns a Skill instance.""" + rag = self._populated_rag() + result = rag.retrieve_skill("my pod is crashing") + assert result is not None + assert isinstance(result, Skill) + assert result.name # has a name + + def test_retrieve_returns_none_when_empty(self) -> None: + """Verify None returned when no skills are populated.""" + rag = _make_rag() + result = rag.retrieve_skill("anything") + assert result is None + + def test_retrieve_returns_none_below_threshold(self) -> None: + """Verify None returned when best score is below threshold.""" + rag = self._populated_rag(threshold=0.99) + result = rag.retrieve_skill("completely unrelated query xyz abc") + assert result is None + + def test_retrieve_returns_one_of_indexed_skills(self) -> None: + """Verify retrieval returns a skill that was indexed.""" + rag = self._populated_rag() + result = rag.retrieve_skill("operator is degraded and unavailable") + 
assert result is not None + indexed_names = {s.name for s in _sample_skills()} + assert result.name in indexed_names + + def test_retrieve_skill_has_source_path(self) -> None: + """Verify returned skill has a source_path for on-demand content loading.""" + rag = self._populated_rag() + result = rag.retrieve_skill("node not ready") + assert result is not None + assert result.source_path + + +class TestSkillsRAGSparseScores: + """Tests for SkillsRAG._retrieve_sparse_scores.""" + + def test_returns_empty_when_no_bm25(self) -> None: + """Verify empty result when BM25 index not built.""" + rag = _make_rag() + scores = rag._retrieve_sparse_scores("query") + assert scores == {} + + def test_returns_normalized_scores(self) -> None: + """Verify BM25 scores are normalized between 0 and 1.""" + rag = _make_rag() + rag.populate_skills(_sample_skills()) + scores = rag._retrieve_sparse_scores("pod crash") + for score in scores.values(): + assert 0.0 <= score <= 1.0 + + def test_scores_contain_all_skills(self) -> None: + """Verify all indexed skills have a score.""" + rag = _make_rag() + rag.populate_skills(_sample_skills()) + scores = rag._retrieve_sparse_scores("troubleshooting") + assert len(scores) == 5 diff --git a/uv.lock b/uv.lock index 6b424e347..fc7f309ec 100644 --- a/uv.lock +++ b/uv.lock @@ -3226,6 +3226,7 @@ dependencies = [ { name = "prometheus-client" }, { name = "psycopg2-binary" }, { name = "pydantic" }, + { name = "python-frontmatter" }, { name = "qdrant-client" }, { name = "rank-bm25" }, { name = "requests" }, @@ -3312,6 +3313,7 @@ requires-dist = [ { name = "prometheus-client", specifier = ">=0.20.0" }, { name = "psycopg2-binary", specifier = ">=2.9.9" }, { name = "pydantic", specifier = ">=2.9.2" }, + { name = "python-frontmatter", specifier = ">=1.1.0" }, { name = "qdrant-client", specifier = ">=1.13.3" }, { name = "ragas", marker = "extra == 'evaluation'", specifier = ">=0.2.14" }, { name = "rank-bm25", specifier = ">=0.2.2" }, @@ -4200,6 +4202,18 @@ wheels 
= [ { url = "https://files.pythonhosted.org/packages/0b/d7/1959b9648791274998a9c3526f6d0ec8fd2233e4d4acce81bbae76b44b2a/python_dotenv-1.2.2-py3-none-any.whl", hash = "sha256:1d8214789a24de455a8b8bd8ae6fe3c6b69a5e3d64aa8a8e5d68e694bbcb285a", size = 22101, upload-time = "2026-03-01T16:00:25.09Z" }, ] +[[package]] +name = "python-frontmatter" +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyyaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/96/de/910fa208120314a12f9a88ea63e03707261692af782c99283f1a2c8a5e6f/python-frontmatter-1.1.0.tar.gz", hash = "sha256:7118d2bd56af9149625745c58c9b51fb67e8d1294a0c76796dafdc72c36e5f6d", size = 16256, upload-time = "2024-01-16T18:50:04.052Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/49/87/3c8da047b3ec5f99511d1b4d7a5bc72d4b98751c7e78492d14dc736319c5/python_frontmatter-1.1.0-py3-none-any.whl", hash = "sha256:335465556358d9d0e6c98bbeb69b1c969f2a4a21360587b9873bfc3b213407c1", size = 9834, upload-time = "2024-01-16T18:50:00.911Z" }, +] + [[package]] name = "python-multipart" version = "0.0.22" @@ -5076,6 +5090,8 @@ dependencies = [ wheels = [ { url = "https://files.pythonhosted.org/packages/0f/8b/4b61d6e13f7108f36910df9ab4b58fd389cc2520d54d81b88660804aad99/torch-2.10.0-2-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:418997cb02d0a0f1497cf6a09f63166f9f5df9f3e16c8a716ab76a72127c714f", size = 79423467, upload-time = "2026-02-10T21:44:48.711Z" }, { url = "https://files.pythonhosted.org/packages/d3/54/a2ba279afcca44bbd320d4e73675b282fcee3d81400ea1b53934efca6462/torch-2.10.0-2-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:13ec4add8c3faaed8d13e0574f5cd4a323c11655546f91fbe6afa77b57423574", size = 79498202, upload-time = "2026-02-10T21:44:52.603Z" }, + { url = "https://files.pythonhosted.org/packages/36/ab/7b562f1808d3f65414cd80a4f7d4bb00979d9355616c034c171249e1a303/torch-2.10.0-3-cp311-cp311-manylinux_2_28_x86_64.whl", hash = 
"sha256:ac5bdcbb074384c66fa160c15b1ead77839e3fe7ed117d667249afce0acabfac", size = 915518691, upload-time = "2026-03-11T14:15:43.147Z" }, + { url = "https://files.pythonhosted.org/packages/b3/7a/abada41517ce0011775f0f4eacc79659bc9bc6c361e6bfe6f7052a6b9363/torch-2.10.0-3-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:98c01b8bb5e3240426dcde1446eed6f40c778091c8544767ef1168fc663a05a6", size = 915622781, upload-time = "2026-03-11T14:17:11.354Z" }, { url = "https://files.pythonhosted.org/packages/78/89/f5554b13ebd71e05c0b002f95148033e730d3f7067f67423026cc9c69410/torch-2.10.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:3282d9febd1e4e476630a099692b44fdc214ee9bf8ee5377732d9d9dfe5712e4", size = 145992610, upload-time = "2026-01-21T16:25:26.327Z" }, { url = "https://files.pythonhosted.org/packages/ae/30/a3a2120621bf9c17779b169fc17e3dc29b230c29d0f8222f499f5e159aa8/torch-2.10.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:a2f9edd8dbc99f62bc4dfb78af7bf89499bca3d753423ac1b4e06592e467b763", size = 915607863, upload-time = "2026-01-21T16:25:06.696Z" }, { url = "https://files.pythonhosted.org/packages/6f/3d/c87b33c5f260a2a8ad68da7147e105f05868c281c63d65ed85aa4da98c66/torch-2.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:29b7009dba4b7a1c960260fc8ac85022c784250af43af9fb0ebafc9883782ebd", size = 113723116, upload-time = "2026-01-21T16:25:21.916Z" },