diff --git a/cc-registry-v2/README.md b/cc-registry-v2/README.md index 6657eca96ff0..b25be7bb151a 100644 --- a/cc-registry-v2/README.md +++ b/cc-registry-v2/README.md @@ -37,9 +37,9 @@ A modern, scalable registry for RunWhen CodeCollections with AI-powered enhancem | Service | Stack | Port | Purpose | |---------|-------|------|---------| | **frontend** | React 19 + TypeScript + MUI v7 | 3000 | SPA for browsing and managing CodeBundles | -| **backend** | FastAPI + SQLAlchemy 2.0 | 8001 | REST API (`/api/v1/`), business logic, AI enhancement | -| **mcp-server** | FastAPI (separate repo: `../mcp-server`) | 8000 | Stateless MCP tool server, delegates to backend API | -| **worker** | Celery (shares backend image) | -- | Background task processing | +| **backend** | FastAPI + SQLAlchemy 2.0 + pgvector | 8001 | REST API, business logic, AI enhancement, embedding generation, vector search | +| **mcp-server** | FastAPI (separate repo: `../mcp-server`) | 8000 | Stateless MCP tool server, delegates all queries to backend API | +| **worker** | Celery (shares backend image) | -- | Background tasks: sync, parse, enhance, embed | | **scheduler** | Celery Beat (shares backend image) | -- | Cron-driven task scheduling | | **database** | PostgreSQL 15 + pgvector | 5432 | Primary data store with vector extension | | **redis** | Redis 7 Alpine | 6379 | Celery broker and result backend | diff --git a/cc-registry-v2/backend/app/core/config.py b/cc-registry-v2/backend/app/core/config.py index c6da813ef7ed..073e28160117 100644 --- a/cc-registry-v2/backend/app/core/config.py +++ b/cc-registry-v2/backend/app/core/config.py @@ -35,12 +35,19 @@ class Settings(BaseSettings): AI_MODEL: str = "gpt-4" AI_ENHANCEMENT_ENABLED: bool = False - # Azure OpenAI Configuration + # Azure OpenAI Configuration (GPT / chat completions) AZURE_OPENAI_API_KEY: Optional[str] = None AZURE_OPENAI_ENDPOINT: Optional[str] = None AZURE_OPENAI_DEPLOYMENT_NAME: Optional[str] = None AZURE_OPENAI_API_VERSION: str = "2024-02-15-preview" + # Azure OpenAI Embedding Configuration (separate endpoint supported) + AZURE_OPENAI_EMBEDDING_ENDPOINT: Optional[str] = None + AZURE_OPENAI_EMBEDDING_API_KEY: Optional[str] = None + AZURE_OPENAI_EMBEDDING_API_VERSION: Optional[str] = None + AZURE_OPENAI_EMBEDDING_DEPLOYMENT: str = "text-embedding-3-small" + EMBEDDING_BATCH_SIZE: int = 100 + # AI Service Provider (openai, azure-openai) AI_SERVICE_PROVIDER: str = "openai" diff --git a/cc-registry-v2/backend/app/main.py b/cc-registry-v2/backend/app/main.py index 560ea649f838..dcd7f24c1177 100644 --- a/cc-registry-v2/backend/app/main.py +++ b/cc-registry-v2/backend/app/main.py @@ -99,22 +99,22 @@ async def health_check(): } # Include routers -from app.routers import admin, tasks, raw_data, admin_crud, task_execution_admin, versions, task_management, admin_inventory, helm_charts, mcp_chat, chat_debug, github_issues, schedule_config, analytics +from app.routers import admin, tasks, raw_data, admin_crud, task_execution_admin, versions, task_management, admin_inventory, helm_charts, mcp_chat, chat_debug, github_issues, schedule_config, analytics, vector_search app.include_router(admin.router) app.include_router(tasks.router) app.include_router(raw_data.router) app.include_router(admin_crud.router) -# AI config removed - now uses env vars only (AZURE_OPENAI_* in az.secret) app.include_router(task_execution_admin.router, prefix="/api/v1") app.include_router(admin_inventory.router) app.include_router(versions.router, prefix="/api/v1/registry") 
app.include_router(task_management.router) app.include_router(schedule_config.router) app.include_router(helm_charts.router, prefix="/api/v1", tags=["helm-charts"]) -app.include_router(mcp_chat.router) # MCP-powered chat (replaces legacy chat + simple_chat) -app.include_router(chat_debug.router) # Debug tools for chat quality analysis +app.include_router(mcp_chat.router) +app.include_router(chat_debug.router) app.include_router(github_issues.router, prefix="/api/v1") -app.include_router(analytics.router) # Analytics charts and metrics +app.include_router(analytics.router) +app.include_router(vector_search.router) @app.get("/api/v1/registry/collections") async def list_collections(): diff --git a/cc-registry-v2/backend/app/models/__init__.py b/cc-registry-v2/backend/app/models/__init__.py index 2feb6d8333a2..aaa2a2e5bfa7 100644 --- a/cc-registry-v2/backend/app/models/__init__.py +++ b/cc-registry-v2/backend/app/models/__init__.py @@ -8,5 +8,12 @@ from .task_execution import TaskExecution from .helm_chart import HelmChart, HelmChartVersion, HelmChartTemplate from .analytics import TaskGrowthMetric +from .vector_models import VectorCodebundle, VectorCodecollection, VectorLibrary, VectorDocumentation -__all__ = ["CodeCollection", "Codebundle", "RawYamlData", "RawRepositoryData", "CodeCollectionMetrics", "SystemMetrics", "AIConfiguration", "AIEnhancementLog", "CodeCollectionVersion", "VersionCodebundle", "TaskExecution", "HelmChart", "HelmChartVersion", "HelmChartTemplate", "TaskGrowthMetric"] +__all__ = [ + "CodeCollection", "Codebundle", "RawYamlData", "RawRepositoryData", + "CodeCollectionMetrics", "SystemMetrics", "AIConfiguration", "AIEnhancementLog", + "CodeCollectionVersion", "VersionCodebundle", "TaskExecution", + "HelmChart", "HelmChartVersion", "HelmChartTemplate", "TaskGrowthMetric", + "VectorCodebundle", "VectorCodecollection", "VectorLibrary", "VectorDocumentation", +] diff --git a/cc-registry-v2/backend/app/models/vector_models.py b/cc-registry-v2/backend/app/models/vector_models.py new file mode 100644 index 000000000000..3f53cd02d799 --- /dev/null +++ b/cc-registry-v2/backend/app/models/vector_models.py @@ -0,0 +1,61 @@ +""" +SQLAlchemy models for pgvector tables. + +Maps to the tables created by database/migrations/006_add_pgvector.sql. +""" +from sqlalchemy import Column, String, Text, DateTime, func, text +from sqlalchemy.dialects.postgresql import JSONB +from pgvector.sqlalchemy import Vector + +from app.core.database import Base + +# Must match the migration (006_add_pgvector.sql) and the Azure OpenAI +# text-embedding-3-small model output. Do NOT change without also +# altering the migration and rebuilding all vector tables. 
+EMBEDDING_DIMENSIONS = 1536 + +_JSONB_EMPTY = text("'{}'::jsonb") + + +class VectorCodebundle(Base): + __tablename__ = "vector_codebundles" + + id = Column(String, primary_key=True) + embedding = Column(Vector(EMBEDDING_DIMENSIONS)) + document = Column(Text) + metadata_ = Column("metadata", JSONB, nullable=False, server_default=_JSONB_EMPTY) + created_at = Column(DateTime(timezone=True), server_default=func.now()) + updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now()) + + +class VectorCodecollection(Base): + __tablename__ = "vector_codecollections" + + id = Column(String, primary_key=True) + embedding = Column(Vector(EMBEDDING_DIMENSIONS)) + document = Column(Text) + metadata_ = Column("metadata", JSONB, nullable=False, server_default=_JSONB_EMPTY) + created_at = Column(DateTime(timezone=True), server_default=func.now()) + updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now()) + + +class VectorLibrary(Base): + __tablename__ = "vector_libraries" + + id = Column(String, primary_key=True) + embedding = Column(Vector(EMBEDDING_DIMENSIONS)) + document = Column(Text) + metadata_ = Column("metadata", JSONB, nullable=False, server_default=_JSONB_EMPTY) + created_at = Column(DateTime(timezone=True), server_default=func.now()) + updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now()) + + +class VectorDocumentation(Base): + __tablename__ = "vector_documentation" + + id = Column(String, primary_key=True) + embedding = Column(Vector(EMBEDDING_DIMENSIONS)) + document = Column(Text) + metadata_ = Column("metadata", JSONB, nullable=False, server_default=_JSONB_EMPTY) + created_at = Column(DateTime(timezone=True), server_default=func.now()) + updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now()) diff --git a/cc-registry-v2/backend/app/routers/vector_search.py b/cc-registry-v2/backend/app/routers/vector_search.py new file mode 100644 index 000000000000..464dda23a7a1 --- /dev/null +++ b/cc-registry-v2/backend/app/routers/vector_search.py @@ -0,0 +1,211 @@ +""" +Vector search API endpoints. + +Exposes semantic (embedding-based) search over codebundles, codecollections, +libraries, and documentation. Used by the MCP server and the frontend chat. +""" +import logging +from typing import Any, Dict, List, Optional + +from fastapi import APIRouter, Depends, HTTPException, Query +from sqlalchemy.orm import Session + +from app.core.database import get_db +from app.services.embedding_service import get_embedding_service +from app.services.vector_service import VectorSearchResult, get_vector_service + +logger = logging.getLogger(__name__) + +router = APIRouter(prefix="/api/v1/vector", tags=["vector-search"]) + + +def _result_to_dict(r: VectorSearchResult) -> Dict[str, Any]: + return { + "id": r.id, + "document": r.document[:500], + "metadata": r.metadata, + "score": round(r.score, 4), + "distance": round(r.distance, 4), + } + + +# -------------------------------------------------------------------------- +# Unified semantic search +# -------------------------------------------------------------------------- + +@router.get("/search") +def semantic_search( + query: str, + tables: Optional[str] = Query( + None, + description="Comma-separated table keys to search (codebundles,codecollections,libraries,documentation). 
Default: all.", + ), + max_results: int = Query(10, ge=1, le=50), + platform: Optional[str] = None, + category: Optional[str] = None, + db: Session = Depends(get_db), +): + """Run a semantic similarity search across one or more vector tables.""" + embed_svc = get_embedding_service() + vec_svc = get_vector_service() + + if not embed_svc.available: + raise HTTPException( + status_code=503, + detail="Embedding service is not configured. Set AZURE_OPENAI_EMBEDDING_* environment variables.", + ) + + query_embedding = embed_svc.embed_text(query) + if not query_embedding: + raise HTTPException(status_code=500, detail="Failed to generate query embedding") + + table_keys = [t.strip() for t in tables.split(",")] if tables else None + + if table_keys: + valid_keys = {"codebundles", "codecollections", "libraries", "documentation"} + invalid = set(table_keys) - valid_keys + if invalid: + raise HTTPException(status_code=400, detail=f"Invalid table keys: {invalid}") + + filters: Optional[Dict[str, str]] = {} + if platform: + filters["platform"] = platform + if category: + filters["category"] = category + + results_map = vec_svc.search_all( + query_embedding, n_results=max_results, table_keys=table_keys, + metadata_filters=filters or None, db=db, + ) + + output: Dict[str, Any] = {} + for key, results in results_map.items(): + output[key] = [_result_to_dict(r) for r in results] + + return output + + +# -------------------------------------------------------------------------- +# Per-table endpoints +# -------------------------------------------------------------------------- + +@router.get("/search/codebundles") +def search_codebundles( + query: str, + max_results: int = Query(10, ge=1, le=50), + platform: Optional[str] = None, + collection_slug: Optional[str] = None, + db: Session = Depends(get_db), +): + """Semantic search over codebundles.""" + embed_svc = get_embedding_service() + vec_svc = get_vector_service() + + if not embed_svc.available: + raise HTTPException(status_code=503, detail="Embedding service not configured") + + query_embedding = embed_svc.embed_text(query) + if not query_embedding: + raise HTTPException(status_code=500, detail="Embedding generation failed") + + filters: Optional[Dict[str, str]] = {} + if platform: + filters["platform"] = platform + if collection_slug: + filters["collection_slug"] = collection_slug + + results = vec_svc.search( + "codebundles", query_embedding, n_results=max_results, + metadata_filters=filters or None, db=db, + ) + return {"results": [_result_to_dict(r) for r in results], "query": query} + + +@router.get("/search/documentation") +def search_documentation( + query: str, + max_results: int = Query(10, ge=1, le=50), + category: Optional[str] = None, + db: Session = Depends(get_db), +): + """Semantic search over documentation.""" + embed_svc = get_embedding_service() + vec_svc = get_vector_service() + + if not embed_svc.available: + raise HTTPException(status_code=503, detail="Embedding service not configured") + + query_embedding = embed_svc.embed_text(query) + if not query_embedding: + raise HTTPException(status_code=500, detail="Embedding generation failed") + + filters = {"category": category} if category else None + results = vec_svc.search( + "documentation", query_embedding, n_results=max_results, + metadata_filters=filters, db=db, + ) + return {"results": [_result_to_dict(r) for r in results], "query": query} + + +@router.get("/search/libraries") +def search_libraries( + query: str, + max_results: int = Query(10, ge=1, le=50), + category: 
Optional[str] = None, + db: Session = Depends(get_db), +): + """Semantic search over libraries.""" + embed_svc = get_embedding_service() + vec_svc = get_vector_service() + + if not embed_svc.available: + raise HTTPException(status_code=503, detail="Embedding service not configured") + + query_embedding = embed_svc.embed_text(query) + if not query_embedding: + raise HTTPException(status_code=500, detail="Embedding generation failed") + + filters = {"category": category} if category else None + results = vec_svc.search( + "libraries", query_embedding, n_results=max_results, + metadata_filters=filters, db=db, + ) + return {"results": [_result_to_dict(r) for r in results], "query": query} + + +# -------------------------------------------------------------------------- +# Stats / health +# -------------------------------------------------------------------------- + +@router.get("/stats") +def vector_stats(db: Session = Depends(get_db)): + """Return row counts for each vector table.""" + vec_svc = get_vector_service() + return vec_svc.get_stats(db=db) + + +@router.post("/reindex") +async def trigger_reindex(): + """Trigger a full reindex (async Celery task).""" + from app.tasks.indexing_tasks import reindex_all_task + + task = reindex_all_task.apply_async() + return {"task_id": task.id, "status": "queued"} + + +@router.post("/reindex/codebundles") +async def trigger_reindex_codebundles(): + """Trigger codebundle reindexing.""" + from app.tasks.indexing_tasks import index_codebundles_task + + task = index_codebundles_task.apply_async() + return {"task_id": task.id, "status": "queued"} + + +@router.post("/reindex/documentation") +async def trigger_reindex_documentation(): + """Trigger documentation reindexing.""" + from app.tasks.indexing_tasks import index_documentation_task + + task = index_documentation_task.apply_async() + return {"task_id": task.id, "status": "queued"} diff --git a/cc-registry-v2/backend/app/services/documentation_source_loader.py b/cc-registry-v2/backend/app/services/documentation_source_loader.py new file mode 100644 index 000000000000..7ac51336354a --- /dev/null +++ b/cc-registry-v2/backend/app/services/documentation_source_loader.py @@ -0,0 +1,105 @@ +""" +Load documentation sources from sources.yaml and crawl their content. + +sources.yaml lives in cc-registry-v2/ alongside schedules.yaml and is +mounted into the backend container at /app/sources.yaml. 
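+
+The parser below expects a top-level ``sources`` mapping of category name
+to a list of entries. Illustrative shape (field names are taken from the
+parsing code in this module; the values are hypothetical):
+
+    sources:
+      platform_docs:
+        - name: Example Doc
+          url: https://example.com/docs
+          description: What this page covers
+          topics: [kubernetes, troubleshooting]
+          priority: high
+      faq:
+        - question: How do I do X?
+          answer: Use Y, then Z.
+
+Entries may also carry optional ``usage_examples`` and ``key_points`` lists.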
+""" +import logging +from pathlib import Path +from typing import Any, Dict, List, Optional + +import yaml + +from app.services.web_crawler import WebCrawler + +logger = logging.getLogger(__name__) + +SOURCES_PATHS = [ + Path("/app/sources.yaml"), + Path("/workspaces/codecollection-registry/cc-registry-v2/sources.yaml"), +] + + +def _find_sources_file() -> Optional[Path]: + for p in SOURCES_PATHS: + if p.exists(): + return p + return None + + +class DocumentationSourceLoader: + """Parse sources.yaml and optionally crawl linked pages.""" + + def __init__(self, sources_file: Optional[str] = None): + if sources_file: + self._path = Path(sources_file) + else: + self._path = _find_sources_file() + self._raw: Optional[Dict] = None + + def _load(self): + if self._raw is not None: + return + if not self._path or not self._path.exists(): + logger.warning(f"sources.yaml not found (tried {SOURCES_PATHS})") + self._raw = {} + return + with open(self._path) as f: + self._raw = yaml.safe_load(f) or {} + logger.info(f"Loaded sources from {self._path}") + + def get_all_docs(self, crawl: bool = True) -> List[Dict[str, Any]]: + """Return a flat list of documentation entries, optionally with crawled content. + + Each dict contains at minimum: name, url, description, category, topics. + If *crawl* is True, the ``crawled_content`` key is populated from the URL. + """ + self._load() + sources = self._raw.get("sources", {}) + docs: List[Dict[str, Any]] = [] + + for category, items in sources.items(): + if not isinstance(items, list): + continue + for item in items: + entry: Dict[str, Any] = {} + + if "name" in item: + entry["name"] = item["name"] + entry["url"] = item.get("url", "") + entry["description"] = item.get("description", "") + entry["topics"] = item.get("topics", []) + entry["usage_examples"] = item.get("usage_examples", []) + entry["key_points"] = item.get("key_points", []) + entry["priority"] = item.get("priority", "medium") + elif "question" in item: + entry["name"] = item["question"] + entry["url"] = "" + entry["description"] = item.get("answer", "") + entry["topics"] = item.get("topics", []) + else: + continue + + entry["category"] = category + docs.append(entry) + + if crawl: + self._crawl_docs(docs) + + logger.info(f"Loaded {len(docs)} documentation entries ({sum(1 for d in docs if d.get('crawled_content'))} crawled)") + return docs + + @staticmethod + def _crawl_docs(docs: List[Dict[str, Any]]): + crawler = WebCrawler() + for doc in docs: + url = doc.get("url") + if not url: + continue + result = crawler.crawl_url(url) + if result: + doc["crawled_content"] = result.get("content", "") + doc["crawled_headings"] = [ + h["text"] for h in result.get("headings", []) + ] + doc["crawled_code"] = result.get("code_blocks", []) diff --git a/cc-registry-v2/backend/app/services/embedding_service.py b/cc-registry-v2/backend/app/services/embedding_service.py new file mode 100644 index 000000000000..9a9728b0e3c7 --- /dev/null +++ b/cc-registry-v2/backend/app/services/embedding_service.py @@ -0,0 +1,112 @@ +""" +Embedding generation service using Azure OpenAI. + +Generates text embeddings for vector search. Supports separate embedding +endpoint configuration via AZURE_OPENAI_EMBEDDING_* env vars, falling back +to the main AZURE_OPENAI_* credentials. 
+""" +import logging +import threading +from typing import List, Optional + +from app.core.config import settings +from app.models.vector_models import EMBEDDING_DIMENSIONS + +logger = logging.getLogger(__name__) + + +class EmbeddingService: + """Generate embeddings via Azure OpenAI (or OpenAI directly).""" + + def __init__(self): + self._client = None + self._deployment: str = settings.AZURE_OPENAI_EMBEDDING_DEPLOYMENT + self._batch_size: int = settings.EMBEDDING_BATCH_SIZE + self._init_client() + + def _init_client(self): + endpoint = ( + settings.AZURE_OPENAI_EMBEDDING_ENDPOINT + or settings.AZURE_OPENAI_ENDPOINT + ) + api_key = ( + settings.AZURE_OPENAI_EMBEDDING_API_KEY + or settings.AZURE_OPENAI_API_KEY + ) + api_version = ( + settings.AZURE_OPENAI_EMBEDDING_API_VERSION + or settings.AZURE_OPENAI_API_VERSION + ) + + if not endpoint or not api_key: + logger.warning( + "Azure OpenAI embedding credentials not configured — " + "embedding generation will be unavailable" + ) + return + + try: + from openai import AzureOpenAI + + self._client = AzureOpenAI( + azure_endpoint=endpoint, + api_key=api_key, + api_version=api_version, + ) + is_dedicated = bool(settings.AZURE_OPENAI_EMBEDDING_ENDPOINT) + label = "dedicated embedding" if is_dedicated else "shared" + logger.info( + f"Embedding service initialised ({label} endpoint: {endpoint})" + ) + except Exception as e: + logger.error(f"Failed to initialise embedding client: {e}") + + @property + def available(self) -> bool: + return self._client is not None + + def embed_texts(self, texts: List[str]) -> List[List[float]]: + """Generate embeddings for a batch of texts. + + Returns a list of embedding vectors in the same order as *texts*. + Failed items are represented by empty lists. + """ + if not texts: + return [] + if not self._client: + logger.error("Embedding client not available") + return [[] for _ in texts] + + all_embeddings: List[List[float]] = [] + for start in range(0, len(texts), self._batch_size): + batch = texts[start : start + self._batch_size] + try: + response = self._client.embeddings.create( + input=batch, model=self._deployment, + dimensions=EMBEDDING_DIMENSIONS, + ) + for item in sorted(response.data, key=lambda x: x.index): + all_embeddings.append(item.embedding) + except Exception as e: + logger.error(f"Embedding batch {start // self._batch_size} failed: {e}") + all_embeddings.extend([[] for _ in batch]) + + return all_embeddings + + def embed_text(self, text: str) -> List[float]: + """Generate a single embedding vector.""" + results = self.embed_texts([text]) + return results[0] if results else [] + + +_instance: Optional[EmbeddingService] = None +_lock = threading.Lock() + + +def get_embedding_service() -> EmbeddingService: + global _instance + if _instance is None: + with _lock: + if _instance is None: + _instance = EmbeddingService() + return _instance diff --git a/cc-registry-v2/backend/app/services/vector_service.py b/cc-registry-v2/backend/app/services/vector_service.py new file mode 100644 index 000000000000..554bf77f86bb --- /dev/null +++ b/cc-registry-v2/backend/app/services/vector_service.py @@ -0,0 +1,360 @@ +""" +Vector storage and similarity search service backed by pgvector. + +Provides CRUD operations and cosine-similarity search across all four +vector tables (codebundles, codecollections, libraries, documentation). 
+""" +import json +import logging +import re +import threading +from typing import Any, Dict, List, Optional +from dataclasses import dataclass + +from sqlalchemy import text, delete +from sqlalchemy.orm import Session + +from app.core.database import SessionLocal +from app.models.vector_models import ( + VectorCodebundle, + VectorCodecollection, + VectorDocumentation, + VectorLibrary, +) + +logger = logging.getLogger(__name__) + +TABLE_MAP = { + "codebundles": VectorCodebundle, + "codecollections": VectorCodecollection, + "libraries": VectorLibrary, + "documentation": VectorDocumentation, +} + +_VALID_METADATA_KEYS = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*$") + +# Per-table filter keys that the unified search endpoint maps automatically. +_TABLE_FILTER_KEYS: Dict[str, Dict[str, str]] = { + "codebundles": {"platform": "platform", "collection_slug": "collection_slug"}, + "documentation": {"category": "category"}, + "libraries": {"category": "category"}, +} + + +@dataclass +class VectorSearchResult: + id: str + document: str + metadata: Dict[str, Any] + distance: float + + @property + def score(self) -> float: + return 1.0 / (1.0 + self.distance) + + +class VectorService: + """Read / write pgvector tables and run similarity search.""" + + # ------------------------------------------------------------------ + # Upsert helpers + # ------------------------------------------------------------------ + + def upsert_vectors( + self, + table_key: str, + ids: List[str], + embeddings: List[List[float]], + documents: List[str], + metadatas: List[Dict[str, Any]], + clear_existing: bool = True, + db: Session = None, + ) -> int: + """Upsert embedding rows into a vector table. + + If *clear_existing* is True the table is truncated first (full rebuild). + Returns the number of rows actually written (skips empty embeddings). + + Raises ``ValueError`` if *clear_existing* is True but every embedding + in *embeddings* is empty — this prevents a transient API outage from + silently wiping all vector data. 
+ """ + n = len(ids) + if len(embeddings) != n or len(documents) != n or len(metadatas) != n: + raise ValueError( + f"List length mismatch: ids={n}, embeddings={len(embeddings)}, " + f"documents={len(documents)}, metadatas={len(metadatas)}" + ) + + valid_count = sum(1 for e in embeddings if e) + if clear_existing and n > 0: + if valid_count == 0: + raise ValueError( + f"Refusing to truncate {table_key}: all {n} embeddings are empty " + "(possible upstream embedding failure)" + ) + ratio = valid_count / n + if ratio < 0.5: + raise ValueError( + f"Refusing to truncate {table_key}: only {valid_count}/{n} " + f"embeddings valid ({ratio:.0%}), likely partial API failure" + ) + + own_session = db is None + if own_session: + db = SessionLocal() + try: + model = TABLE_MAP[table_key] + + if clear_existing: + db.execute(delete(model)) + db.flush() + + table_name = model.__tablename__ + stmt = text( + f"INSERT INTO {table_name} (id, embedding, document, metadata, updated_at) " + f"VALUES (:id, CAST(:emb AS vector), :doc, CAST(:meta AS jsonb), NOW()) " + f"ON CONFLICT (id) DO UPDATE SET " + f" embedding = EXCLUDED.embedding, " + f" document = EXCLUDED.document, " + f" metadata = EXCLUDED.metadata, " + f" updated_at = NOW()" + ) + written = 0 + for i in range(len(ids)): + emb = embeddings[i] + if not emb: + continue + emb_literal = "[" + ",".join(str(v) for v in emb) + "]" + meta_json = json.dumps(metadatas[i]) if metadatas[i] else "{}" + db.execute( + stmt, + {"id": ids[i], "emb": emb_literal, "doc": documents[i], "meta": meta_json}, + ) + written += 1 + + db.commit() + logger.info(f"Upserted {written}/{len(ids)} rows into {table_name}") + return written + except Exception: + if own_session: + db.rollback() + raise + finally: + if own_session: + db.close() + + # ------------------------------------------------------------------ + # Similarity search + # ------------------------------------------------------------------ + + def search( + self, + table_key: str, + query_embedding: List[float], + n_results: int = 10, + metadata_filters: Optional[Dict[str, str]] = None, + db: Session = None, + ) -> List[VectorSearchResult]: + """Cosine-similarity search against a vector table. + + *metadata_filters* is a dict of key-value pairs that are ANDed + together, e.g. ``{"platform": "kubernetes"}``. + Keys must be alphanumeric/underscore identifiers. 
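+
+        Results are ordered by pgvector cosine distance (the ``<=>``
+        operator); ``VectorSearchResult.score`` converts that distance to
+        a descending similarity score via ``1 / (1 + distance)``.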
+ """ + own_session = db is None + if own_session: + db = SessionLocal() + try: + model = TABLE_MAP[table_key] + table_name = model.__tablename__ + emb_literal = "[" + ",".join(str(v) for v in query_embedding) + "]" + + where_clauses = ["embedding IS NOT NULL"] + params: Dict[str, Any] = { + "emb": emb_literal, + "limit": n_results, + } + + if metadata_filters: + for idx, (key, value) in enumerate(metadata_filters.items()): + if not _VALID_METADATA_KEYS.match(key): + raise ValueError(f"Invalid metadata filter key: {key!r}") + param_name = f"mf_{idx}" + where_clauses.append(f"metadata->>'{key}' = :{param_name}") + params[param_name] = value + + where_sql = " AND ".join(where_clauses) + + sql = text( + f"SELECT id, document, metadata, " + f" (embedding <=> CAST(:emb AS vector)) AS distance " + f"FROM {table_name} " + f"WHERE {where_sql} " + f"ORDER BY distance " + f"LIMIT :limit" + ) + + rows = db.execute(sql, params).fetchall() + results = [] + for row in rows: + meta = row[2] if isinstance(row[2], dict) else json.loads(row[2]) + results.append( + VectorSearchResult( + id=row[0], + document=row[1] or "", + metadata=meta, + distance=float(row[3]), + ) + ) + return results + finally: + if own_session: + db.close() + + # ------------------------------------------------------------------ + # Multi-table search (unified semantic search) + # ------------------------------------------------------------------ + + def search_all( + self, + query_embedding: List[float], + n_results: int = 10, + table_keys: Optional[List[str]] = None, + metadata_filters: Optional[Dict[str, str]] = None, + db: Session = None, + ) -> Dict[str, List[VectorSearchResult]]: + """Run similarity search across multiple vector tables. + + *metadata_filters* are applied on a per-table basis: only filter + keys that are relevant to a given table are forwarded (e.g. + ``platform`` only applies to codebundles, ``category`` only to + documentation and libraries). 
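+
+        Returns a dict keyed by table name, e.g.
+        ``{"codebundles": [VectorSearchResult, ...], "documentation": [...]}``.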
+ """ + keys = table_keys or list(TABLE_MAP.keys()) + results: Dict[str, List[VectorSearchResult]] = {} + for key in keys: + table_filters: Optional[Dict[str, str]] = None + if metadata_filters: + relevant = _TABLE_FILTER_KEYS.get(key, {}) + table_filters = { + v: metadata_filters[k] + for k, v in relevant.items() + if k in metadata_filters + } + if not table_filters: + table_filters = None + results[key] = self.search( + key, query_embedding, n_results=n_results, + metadata_filters=table_filters, db=db, + ) + return results + + # ------------------------------------------------------------------ + # Stats + # ------------------------------------------------------------------ + + def get_stats(self, db: Session = None) -> Dict[str, int]: + own_session = db is None + if own_session: + db = SessionLocal() + try: + stats = {} + for key, model in TABLE_MAP.items(): + count = db.query(model).count() + stats[key] = count + return stats + finally: + if own_session: + db.close() + + # ------------------------------------------------------------------ + # Document builders (shared logic, mirrors mcp-server helpers) + # ------------------------------------------------------------------ + + @staticmethod + def codebundle_to_document(cb: Dict[str, Any]) -> str: + parts = [] + for field in ("display_name", "name"): + if cb.get(field): + parts.append(cb[field]) + if cb.get("description"): + parts.append(cb["description"]) + if cb.get("ai_enhanced_description"): + parts.append(cb["ai_enhanced_description"]) + if cb.get("discovery_platform"): + parts.append(f"Platform: {cb['discovery_platform']}") + tags = cb.get("support_tags") or [] + if tags: + parts.append(f"Tags: {', '.join(tags[:15])}") + tasks = cb.get("tasks") or [] + if tasks: + parts.append(f"Tasks: {', '.join(tasks[:20])}") + readme = cb.get("readme") or "" + if readme: + parts.append(readme[:2000]) + return "\n".join(parts) + + @staticmethod + def codebundle_metadata(cb: Dict[str, Any]) -> Dict[str, Any]: + return { + "slug": cb.get("slug", ""), + "collection_slug": cb.get("collection_slug", ""), + "name": cb.get("name", ""), + "display_name": cb.get("display_name", ""), + "description": (cb.get("description") or "")[:500], + "platform": cb.get("discovery_platform") or cb.get("platform", ""), + "tags": ",".join((cb.get("support_tags") or [])[:10]), + } + + @staticmethod + def collection_to_document(cc: Dict[str, Any]) -> str: + return f"{cc.get('name', '')} — {cc.get('description', '')}" + + @staticmethod + def collection_metadata(cc: Dict[str, Any]) -> Dict[str, Any]: + return { + "slug": cc.get("slug", ""), + "name": cc.get("name", ""), + "description": (cc.get("description") or "")[:500], + "git_url": cc.get("git_url", ""), + "owner": cc.get("owner", ""), + } + + @staticmethod + def doc_to_document(doc: Dict[str, Any]) -> str: + parts = [f"# {doc.get('name', '')}"] + if doc.get("description"): + parts.append(doc["description"]) + if doc.get("crawled_content"): + parts.append(doc["crawled_content"][:12000]) + else: + for field in ("topics", "key_points", "usage_examples"): + items = doc.get(field) or [] + if items: + parts.append(f"{field.replace('_', ' ').title()}: {', '.join(items)}") + return "\n\n".join(parts) + + @staticmethod + def doc_metadata(doc: Dict[str, Any]) -> Dict[str, Any]: + return { + "name": doc.get("name", doc.get("question", "")), + "description": (doc.get("description", doc.get("answer", "")) or "")[:500], + "url": doc.get("url", ""), + "category": doc.get("category", "general"), + "topics": ",".join(doc.get("topics") 
or []), + "has_crawled_content": "true" if doc.get("crawled_content") else "false", + } + + +_instance: Optional[VectorService] = None +_lock = threading.Lock() + + +def get_vector_service() -> VectorService: + global _instance + if _instance is None: + with _lock: + if _instance is None: + _instance = VectorService() + return _instance diff --git a/cc-registry-v2/backend/app/services/web_crawler.py b/cc-registry-v2/backend/app/services/web_crawler.py new file mode 100644 index 000000000000..7b7b4f8b29ce --- /dev/null +++ b/cc-registry-v2/backend/app/services/web_crawler.py @@ -0,0 +1,117 @@ +""" +Lightweight web crawler for documentation indexing. + +Fetches page content with httpx + BeautifulSoup. Runs inside the backend +worker, so no headless-browser dependency is needed. +""" +import logging +import re +import time +from typing import Dict, List, Optional + +import httpx +from bs4 import BeautifulSoup + +logger = logging.getLogger(__name__) + +TIMEOUT = 30.0 +MAX_CONTENT_LENGTH = 50_000 +RATE_LIMIT_DELAY = 1.0 +USER_AGENT = "RunWhen-Registry-Indexer/2.0 (Documentation Indexer)" + + +class WebCrawler: + def __init__( + self, + timeout: float = TIMEOUT, + max_content_length: int = MAX_CONTENT_LENGTH, + rate_limit_delay: float = RATE_LIMIT_DELAY, + ): + self._timeout = timeout + self._max_content_length = max_content_length + self._rate_limit_delay = rate_limit_delay + self._last_request_time = 0.0 + + def _rate_limit(self): + elapsed = time.time() - self._last_request_time + if elapsed < self._rate_limit_delay: + time.sleep(self._rate_limit_delay - elapsed) + self._last_request_time = time.time() + + def crawl_url(self, url: str) -> Optional[Dict[str, str]]: + """Fetch a URL and extract structured content.""" + logger.info(f"Crawling: {url}") + self._rate_limit() + + try: + with httpx.Client(timeout=self._timeout, follow_redirects=True) as client: + resp = client.get(url, headers={"User-Agent": USER_AGENT}) + resp.raise_for_status() + html = resp.text + except Exception as e: + logger.error(f"Failed to fetch {url}: {e}") + return None + + try: + return self._extract(html, url) + except Exception as e: + logger.error(f"Failed to parse {url}: {e}") + return None + + def crawl_urls(self, urls: List[str]) -> List[Dict[str, str]]: + results = [] + for url in urls: + content = self.crawl_url(url) + if content: + results.append(content) + return results + + def _extract(self, html: str, url: str) -> Optional[Dict[str, str]]: + soup = BeautifulSoup(html, "lxml") + for tag in soup(["script", "style", "nav", "footer", "header"]): + tag.decompose() + + main = None + for sel in ( + "article", "main", ".content", ".article-content", + "#content", "#main-content", ".scroll-content", + ): + main = soup.select_one(sel) + if main: + break + if not main: + main = soup.body or soup + + title = "" + title_el = soup.find("h1") or soup.find("title") + if title_el: + title = title_el.get_text(strip=True) + + headings = [] + for h in main.find_all(["h1", "h2", "h3", "h4"]): + txt = h.get_text(strip=True) + if txt: + headings.append({"level": int(h.name[1]), "text": txt}) + + code_blocks = [] + for code in main.find_all(["code", "pre"]): + code_text = code.get_text(strip=True) + if code_text and len(code_text) > 10: + code_blocks.append(code_text[:2000]) + + text = main.get_text(separator=" ", strip=True) + text = re.sub(r"\s+", " ", text) + if len(text) > self._max_content_length: + text = text[: self._max_content_length] + "..." 
+ + if not text.strip(): + return None + + logger.info(f" Extracted {len(text)} chars, {len(headings)} headings") + return { + "title": title, + "content": text, + "code_blocks": code_blocks[:10], + "headings": headings, + "url": url, + } diff --git a/cc-registry-v2/backend/app/tasks/celery_app.py b/cc-registry-v2/backend/app/tasks/celery_app.py index faee5d4959cd..69d371b74b32 100644 --- a/cc-registry-v2/backend/app/tasks/celery_app.py +++ b/cc-registry-v2/backend/app/tasks/celery_app.py @@ -69,7 +69,8 @@ def _configure_broker_url(): "app.tasks.data_population_tasks", "app.tasks.task_monitoring", "app.tasks.workflow_tasks", - "app.tasks.analytics_tasks" # Analytics computation tasks + "app.tasks.analytics_tasks", + "app.tasks.indexing_tasks", ] ) diff --git a/cc-registry-v2/backend/app/tasks/indexing_tasks.py b/cc-registry-v2/backend/app/tasks/indexing_tasks.py new file mode 100644 index 000000000000..1e552e94818f --- /dev/null +++ b/cc-registry-v2/backend/app/tasks/indexing_tasks.py @@ -0,0 +1,238 @@ +""" +Vector indexing tasks — native Celery tasks that generate embeddings +and store them in pgvector. + +Replaces the old mcp_tasks.py which shelled out to the MCP server's +indexer.py subprocess. All indexing now runs inside the backend worker. +""" +import logging +from typing import Any, Dict, List + +from app.tasks.celery_app import celery_app +from app.core.database import SessionLocal +from app.services.embedding_service import get_embedding_service +from app.services.vector_service import get_vector_service + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _rows_to_dicts(rows, collection_slug_map: Dict[int, str]) -> List[Dict[str, Any]]: + """Convert SQLAlchemy Codebundle rows to plain dicts for the vector service. + + Codebundles whose ``codecollection_id`` doesn't resolve to an active + collection slug are skipped — they're orphaned rows that would produce + non-unique vector IDs (e.g. ``/``). 
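+    Vector IDs are built later as ``{collection_slug}/{slug}`` in
+    index_codebundles_task, so rows without a resolvable collection slug
+    would all collapse onto ``/<slug>``-style IDs.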
+ """ + results = [] + skipped = 0 + for cb in rows: + coll_slug = collection_slug_map.get(cb.codecollection_id) + if not coll_slug: + skipped += 1 + continue + d = { + "id": cb.id, + "slug": cb.slug, + "name": cb.name, + "display_name": cb.display_name, + "description": cb.description, + "ai_enhanced_description": cb.ai_enhanced_description, + "readme": cb.readme, + "support_tags": cb.support_tags or [], + "tasks": cb.tasks or [], + "discovery_platform": cb.discovery_platform, + "collection_slug": coll_slug, + } + results.append(d) + if skipped: + logger.warning(f"Skipped {skipped} codebundles with no active collection") + return results + + +def _count_valid_embeddings(embeddings: List[List[float]]) -> int: + return sum(1 for e in embeddings if e) + + +# --------------------------------------------------------------------------- +# Index codebundles + codecollections +# --------------------------------------------------------------------------- + +@celery_app.task(bind=True, name="app.tasks.indexing_tasks.index_codebundles_task") +def index_codebundles_task(self) -> Dict[str, Any]: + """Generate embeddings for all codebundles and codecollections, store in pgvector.""" + try: + logger.info(f"Starting codebundle embedding indexing (task {self.request.id})") + embed_svc = get_embedding_service() + vec_svc = get_vector_service() + + if not embed_svc.available: + return {"status": "skipped", "reason": "embedding service unavailable"} + + db = SessionLocal() + try: + from app.models.code_collection import CodeCollection + from app.models.codebundle import Codebundle + + collections = db.query(CodeCollection).filter(CodeCollection.is_active == True).all() + slug_map = {c.id: c.slug for c in collections} + + codebundles = db.query(Codebundle).filter(Codebundle.is_active == True).all() + cb_dicts = _rows_to_dicts(codebundles, slug_map) + finally: + db.close() + + if not cb_dicts: + logger.info("No codebundles to index") + return {"status": "success", "codebundles": 0, "codecollections": 0} + + # --- Codebundle embeddings --- + documents = [vec_svc.codebundle_to_document(cb) for cb in cb_dicts] + embeddings = embed_svc.embed_texts(documents) + ids = [f"{cb['collection_slug']}/{cb['slug']}" for cb in cb_dicts] + if len(set(ids)) != len(ids): + dupes = [vid for vid in ids if ids.count(vid) > 1] + logger.error(f"Duplicate vector IDs detected: {set(dupes)}") + return {"status": "failed", "error": f"duplicate vector IDs: {set(dupes)}"} + metadatas = [vec_svc.codebundle_metadata(cb) for cb in cb_dicts] + + valid = _count_valid_embeddings(embeddings) + if valid == 0: + logger.error(f"All {len(embeddings)} codebundle embeddings are empty — skipping upsert to preserve existing data") + return {"status": "failed", "error": "all embeddings empty (upstream API failure)"} + + vec_svc.upsert_vectors( + "codebundles", ids, embeddings, documents, metadatas, clear_existing=True + ) + + # --- Codecollection embeddings --- + db2 = SessionLocal() + try: + cc_rows = db2.query(CodeCollection).filter(CodeCollection.is_active == True).all() + cc_dicts = [ + {"slug": c.slug, "name": c.name, "description": c.description, + "git_url": c.git_url, "owner": c.owner} + for c in cc_rows + ] + finally: + db2.close() + + cc_written = 0 + if cc_dicts: + cc_docs = [vec_svc.collection_to_document(cc) for cc in cc_dicts] + cc_embs = embed_svc.embed_texts(cc_docs) + cc_valid = _count_valid_embeddings(cc_embs) + if cc_valid == 0: + logger.error("All codecollection embeddings empty — skipping upsert") + else: + cc_ids = [cc["slug"] for 
cc in cc_dicts] + cc_metas = [vec_svc.collection_metadata(cc) for cc in cc_dicts] + cc_written = vec_svc.upsert_vectors( + "codecollections", cc_ids, cc_embs, cc_docs, cc_metas, clear_existing=True + ) + + logger.info(f"Codebundle indexing complete: {valid} codebundles, {cc_written} collections") + return { + "status": "success", + "codebundles": valid, + "codecollections": cc_written, + } + + except Exception as e: + logger.error(f"Codebundle indexing failed: {e}", exc_info=True) + return {"status": "failed", "error": str(e)} + + +# --------------------------------------------------------------------------- +# Index documentation from sources.yaml +# --------------------------------------------------------------------------- + +@celery_app.task(bind=True, name="app.tasks.indexing_tasks.index_documentation_task") +def index_documentation_task(self, crawl: bool = True) -> Dict[str, Any]: + """Crawl documentation URLs from sources.yaml, embed, and store in pgvector.""" + try: + logger.info(f"Starting documentation indexing (task {self.request.id})") + embed_svc = get_embedding_service() + vec_svc = get_vector_service() + + if not embed_svc.available: + return {"status": "skipped", "reason": "embedding service unavailable"} + + from app.services.documentation_source_loader import DocumentationSourceLoader + loader = DocumentationSourceLoader() + docs = loader.get_all_docs(crawl=crawl) + + if not docs: + logger.info("No documentation sources found") + return {"status": "success", "documentation": 0} + + documents = [vec_svc.doc_to_document(d) for d in docs] + embeddings = embed_svc.embed_texts(documents) + + valid = _count_valid_embeddings(embeddings) + if valid == 0: + logger.error(f"All {len(embeddings)} documentation embeddings are empty — skipping upsert to preserve existing data") + return {"status": "failed", "error": "all embeddings empty (upstream API failure)"} + + seen: set = set() + ids: List[str] = [] + for doc in docs: + name = doc.get("name", "unknown") + base_id = f"{doc.get('category', 'general')}/{name}".lower().replace(" ", "-") + final_id = base_id + counter = 1 + while final_id in seen: + final_id = f"{base_id}_{counter}" + counter += 1 + seen.add(final_id) + ids.append(final_id) + + metadatas = [vec_svc.doc_metadata(d) for d in docs] + written = vec_svc.upsert_vectors( + "documentation", ids, embeddings, documents, metadatas, clear_existing=True + ) + + logger.info(f"Documentation indexing complete: {written} entries") + return {"status": "success", "documentation": written} + + except Exception as e: + logger.error(f"Documentation indexing failed: {e}", exc_info=True) + return {"status": "failed", "error": str(e)} + + +# --------------------------------------------------------------------------- +# Full reindex (all tables) +# --------------------------------------------------------------------------- + +@celery_app.task(bind=True, name="app.tasks.indexing_tasks.reindex_all_task") +def reindex_all_task(self) -> Dict[str, Any]: + """Full reindex: codebundles + codecollections + documentation.""" + logger.info(f"Starting full reindex (task {self.request.id})") + + cb_result = index_codebundles_task() + doc_result = index_documentation_task(crawl=True) + + any_failed = ( + cb_result.get("status") == "failed" + or doc_result.get("status") == "failed" + ) + all_skipped = ( + cb_result.get("status") == "skipped" + and doc_result.get("status") == "skipped" + ) + + if any_failed: + status = "failed" + elif all_skipped: + status = "skipped" + else: + status = "success" + + return 
{ + "status": status, + "codebundles": cb_result, + "documentation": doc_result, + } diff --git a/cc-registry-v2/backend/app/tasks/mcp_tasks.py b/cc-registry-v2/backend/app/tasks/mcp_tasks.py index 79964f2441ae..abcd1f943a6d 100644 --- a/cc-registry-v2/backend/app/tasks/mcp_tasks.py +++ b/cc-registry-v2/backend/app/tasks/mcp_tasks.py @@ -1,179 +1,49 @@ """ -MCP Server Tasks - Manage MCP server indexing and maintenance +DEPRECATED — MCP indexing tasks have moved to indexing_tasks.py. + +The old tasks shelled out to mcp-server/indexer.py as a subprocess. +The new tasks in indexing_tasks.py run natively inside the backend worker, +generating embeddings and storing them directly in pgvector. + +These stubs remain only so that any in-flight Celery messages referencing +the old task names don't cause import errors. They dispatch the new tasks +as proper sub-tasks to preserve task context and monitoring visibility. """ import logging -import subprocess -from pathlib import Path from typing import Dict, Any from app.tasks.celery_app import celery_app -from app.core.config import settings logger = logging.getLogger(__name__) @celery_app.task(bind=True, name='app.tasks.mcp_tasks.index_documentation_task') def index_documentation_task(self) -> Dict[str, Any]: - """ - Re-index documentation sources and update embeddings in the MCP server. - - This task: - 1. Calls the MCP server's indexer to process docs.yaml/sources.yaml - 2. Crawls documentation pages for content - 3. Generates embeddings - 4. Updates the vector database - - Returns: - Dict with indexing results - """ - try: - logger.info(f"Starting documentation indexing (task {self.request.id})") - - # Find the mcp-server directory - # Assuming structure: /workspaces/codecollection-registry/mcp-server - workspace_root = Path(__file__).parent.parent.parent.parent.parent - mcp_server_path = workspace_root / "mcp-server" - indexer_script = mcp_server_path / "indexer.py" - - if not indexer_script.exists(): - error_msg = f"MCP indexer script not found at {indexer_script}" - logger.error(error_msg) - return { - 'status': 'failed', - 'error': error_msg, - 'task_id': self.request.id - } - - # Run the indexer with --docs-only flag - # This will only re-index documentation, not codebundles - logger.info(f"Running indexer: python {indexer_script} --docs-only") - - result = subprocess.run( - ['python', str(indexer_script), '--docs-only'], - cwd=str(mcp_server_path), - capture_output=True, - text=True, - timeout=600 # 10 minute timeout - ) - - if result.returncode == 0: - logger.info("Documentation indexing completed successfully") - logger.info(f"Indexer output:\n{result.stdout}") - - return { - 'status': 'success', - 'message': 'Documentation indexed successfully', - 'task_id': self.request.id, - 'stdout': result.stdout[-1000:], # Last 1000 chars - 'stderr': result.stderr[-500:] if result.stderr else None - } - else: - error_msg = f"Indexer failed with exit code {result.returncode}" - logger.error(error_msg) - logger.error(f"Stderr: {result.stderr}") - - return { - 'status': 'failed', - 'error': error_msg, - 'task_id': self.request.id, - 'stdout': result.stdout[-1000:], - 'stderr': result.stderr[-500:] - } - - except subprocess.TimeoutExpired: - error_msg = "Documentation indexing timed out after 10 minutes" - logger.error(error_msg) - return { - 'status': 'failed', - 'error': error_msg, - 'task_id': self.request.id - } - except Exception as e: - error_msg = f"Documentation indexing failed: {e}" - logger.error(error_msg, exc_info=True) - return { - 'status': 
'failed', - 'error': str(e), - 'task_id': self.request.id - } + """Deprecated — dispatches indexing_tasks.index_documentation_task.""" + logger.warning( + "mcp_tasks.index_documentation_task is deprecated; " + "update callers to use indexing_tasks.index_documentation_task" + ) + from app.tasks.indexing_tasks import index_documentation_task as new_task + result = new_task.apply_async() + return { + "status": "redirected", + "new_task_id": result.id, + "new_task_name": "app.tasks.indexing_tasks.index_documentation_task", + } @celery_app.task(bind=True, name='app.tasks.mcp_tasks.reindex_all_task') def reindex_all_task(self) -> Dict[str, Any]: - """ - Full re-index of all MCP server data (codebundles + documentation). - - This is a more comprehensive task that rebuilds the entire vector database. - Use sparingly as it can take several minutes. - - Returns: - Dict with indexing results - """ - try: - logger.info(f"Starting full MCP re-index (task {self.request.id})") - - # Find the mcp-server directory - workspace_root = Path(__file__).parent.parent.parent.parent.parent - mcp_server_path = workspace_root / "mcp-server" - indexer_script = mcp_server_path / "indexer.py" - - if not indexer_script.exists(): - error_msg = f"MCP indexer script not found at {indexer_script}" - logger.error(error_msg) - return { - 'status': 'failed', - 'error': error_msg, - 'task_id': self.request.id - } - - # Run the full indexer - logger.info(f"Running full indexer: python {indexer_script}") - - result = subprocess.run( - ['python', str(indexer_script)], - cwd=str(mcp_server_path), - capture_output=True, - text=True, - timeout=1800 # 30 minute timeout for full index - ) - - if result.returncode == 0: - logger.info("Full re-index completed successfully") - logger.info(f"Indexer output:\n{result.stdout}") - - return { - 'status': 'success', - 'message': 'Full MCP re-index completed successfully', - 'task_id': self.request.id, - 'stdout': result.stdout[-1000:], - 'stderr': result.stderr[-500:] if result.stderr else None - } - else: - error_msg = f"Full indexer failed with exit code {result.returncode}" - logger.error(error_msg) - logger.error(f"Stderr: {result.stderr}") - - return { - 'status': 'failed', - 'error': error_msg, - 'task_id': self.request.id, - 'stdout': result.stdout[-1000:], - 'stderr': result.stderr[-500:] - } - - except subprocess.TimeoutExpired: - error_msg = "Full re-index timed out after 30 minutes" - logger.error(error_msg) - return { - 'status': 'failed', - 'error': error_msg, - 'task_id': self.request.id - } - except Exception as e: - error_msg = f"Full re-index failed: {e}" - logger.error(error_msg, exc_info=True) - return { - 'status': 'failed', - 'error': str(e), - 'task_id': self.request.id - } + """Deprecated — dispatches indexing_tasks.reindex_all_task.""" + logger.warning( + "mcp_tasks.reindex_all_task is deprecated; " + "update callers to use indexing_tasks.reindex_all_task" + ) + from app.tasks.indexing_tasks import reindex_all_task as new_task + result = new_task.apply_async() + return { + "status": "redirected", + "new_task_id": result.id, + "new_task_name": "app.tasks.indexing_tasks.reindex_all_task", + } diff --git a/cc-registry-v2/backend/app/tasks/workflow_tasks.py b/cc-registry-v2/backend/app/tasks/workflow_tasks.py index 02e4123b51d8..8257530a2bce 100644 --- a/cc-registry-v2/backend/app/tasks/workflow_tasks.py +++ b/cc-registry-v2/backend/app/tasks/workflow_tasks.py @@ -1,9 +1,10 @@ """ -Workflow orchestration tasks - Chain multiple tasks in sequence +Workflow orchestration tasks 
— chain multiple tasks in sequence. + +Pipeline: Sync → Parse → AI Enhance → Generate Embeddings """ import logging from typing import Dict, Any -from celery import chain from app.tasks.celery_app import celery_app from app.core.database import SessionLocal @@ -14,124 +15,101 @@ @celery_app.task(bind=True) def sync_parse_enhance_workflow_task(self, limit: int = None): """ - Complete workflow: Sync → Parse → Enhance New - - This task orchestrates the full update cycle by calling subtasks directly (not via .get()). - 1. Sync all codecollections from their repos - 2. Parse all codebundles to find new ones - 3. AI enhance only NEW codebundles (pending/NULL status) - - Args: - limit: Optional limit for AI enhancement (None = all pending) - - Returns: - Dict with results from each step - - Note: Subtasks are called directly in this worker process to avoid the - "Never call result.get() within a task" anti-pattern. + Complete workflow: Sync → Parse → Enhance → Embed + + 1. Sync all codecollections from their git repos + 2. Parse codebundles (extract tasks, SLIs, metadata) + 3. AI enhance ONLY NEW codebundles (pending/NULL status) + 4. Generate embeddings and store in pgvector + + Subtasks are called directly in this worker process to avoid the + "never call result.get() inside a task" anti-pattern. """ try: - logger.info(f"Starting sync-parse-enhance workflow (task {self.request.id})") - - # Import subtasks + logger.info(f"Starting sync-parse-enhance-embed workflow (task {self.request.id})") + from app.tasks.registry_tasks import sync_all_collections_task, parse_all_codebundles_task from app.tasks.ai_enhancement_tasks import enhance_pending_codebundles_task - - # Step 1: Sync collections (call directly, not via apply_async) + from app.tasks.indexing_tasks import index_codebundles_task + + # Step 1: Sync collections self.update_state(state='PROGRESS', meta={ - 'step': 1, - 'total_steps': 3, + 'step': 1, 'total_steps': 4, 'current_step': 'Syncing codecollections from repos', - 'status': 'Checking for updates in repositories...' 
}) - - logger.info("Step 1/3: Syncing collections...") - + logger.info("Step 1/4: Syncing collections...") try: - # Call task function directly (eager execution in this worker) sync_result = sync_all_collections_task() logger.info(f"Sync completed: {sync_result}") except Exception as e: logger.error(f"Sync failed: {e}") sync_result = {'status': 'failed', 'error': str(e)} - + # Step 2: Parse codebundles self.update_state(state='PROGRESS', meta={ - 'step': 2, - 'total_steps': 3, + 'step': 2, 'total_steps': 4, 'current_step': 'Parsing codebundles', - 'status': 'Parsing Robot files from repositories...', - 'sync_result': sync_result }) - - logger.info("Step 2/3: Parsing codebundles...") - + logger.info("Step 2/4: Parsing codebundles...") try: - # Call task function directly (eager execution in this worker) parse_result = parse_all_codebundles_task() logger.info(f"Parse completed: {parse_result}") except Exception as e: logger.error(f"Parse failed: {e}") parse_result = {'status': 'failed', 'error': str(e)} - - # Step 3: Enhance only NEW codebundles + + # Step 3: AI enhance NEW codebundles self.update_state(state='PROGRESS', meta={ - 'step': 3, - 'total_steps': 3, + 'step': 3, 'total_steps': 4, 'current_step': 'AI enhancing new codebundles', - 'status': 'Enhancing codebundles with pending status...', - 'sync_result': sync_result, - 'parse_result': parse_result }) - - logger.info(f"Step 3/3: Enhancing NEW codebundles (limit={limit})...") - + logger.info(f"Step 3/4: Enhancing NEW codebundles (limit={limit})...") try: - # Call task function directly with limit parameter enhance_result = enhance_pending_codebundles_task(limit=limit) logger.info(f"Enhancement completed: {enhance_result}") except Exception as e: logger.error(f"Enhancement failed: {e}") enhance_result = {'status': 'failed', 'error': str(e)} - - # Final result + + # Step 4: Generate embeddings and store in pgvector + self.update_state(state='PROGRESS', meta={ + 'step': 4, 'total_steps': 4, + 'current_step': 'Generating embeddings for vector search', + }) + logger.info("Step 4/4: Generating embeddings...") + try: + embed_result = index_codebundles_task() + logger.info(f"Embedding indexing completed: {embed_result}") + except Exception as e: + logger.error(f"Embedding indexing failed: {e}") + embed_result = {'status': 'failed', 'error': str(e)} + final_result = { 'status': 'completed', 'workflow_id': self.request.id, 'steps': { '1_sync': sync_result, '2_parse': parse_result, - '3_enhance': enhance_result + '3_enhance': enhance_result, + '4_embed': embed_result, }, - 'message': 'Workflow completed: sync → parse → enhance new codebundles' + 'message': 'Workflow completed: sync → parse → enhance → embed' } - + logger.info(f"Workflow completed: {final_result}") return final_result - + except Exception as e: - logger.error(f"Workflow failed: {e}") - import traceback - logger.error(f"Traceback: {traceback.format_exc()}") - + logger.error(f"Workflow failed: {e}", exc_info=True) return { 'status': 'failed', 'error': str(e), 'error_type': type(e).__name__, - 'workflow_id': self.request.id + 'workflow_id': self.request.id, } @celery_app.task(bind=True) def quick_update_workflow_task(self, ai_limit: int = 20): - """ - Quick update workflow with AI limit - - Same as sync_parse_enhance_workflow_task but with a default limit - on AI enhancement to avoid large API costs. 
- - Args: - ai_limit: Max number of codebundles to enhance (default: 20) - """ - # Call the main workflow task directly (not via .get()) + """Quick update workflow with a limit on AI enhancement to control costs.""" return sync_parse_enhance_workflow_task(limit=ai_limit) diff --git a/cc-registry-v2/backend/requirements.txt b/cc-registry-v2/backend/requirements.txt index 0f7a4891fe4a..8949318bc275 100644 --- a/cc-registry-v2/backend/requirements.txt +++ b/cc-registry-v2/backend/requirements.txt @@ -8,6 +8,7 @@ pydantic-settings==2.1.0 sqlalchemy==2.0.23 alembic==1.13.1 psycopg2-binary==2.9.9 +pgvector>=0.2.4 # HTTP client httpx==0.25.2 @@ -20,6 +21,7 @@ python-multipart==0.0.6 # Background tasks celery==5.3.4 redis==5.0.1 +flower==2.0.1 # GitHub integration PyGithub==1.59.1 @@ -34,13 +36,13 @@ Jinja2==3.1.2 # Robot Framework robotframework==6.1.1 -# Task scheduling and background jobs -celery==5.3.4 -redis==5.0.1 -flower==2.0.1 - -# AI Integration +# AI Integration - embeddings and chat openai>=1.0.0 +numpy>=1.24.0 + +# Web crawling for documentation indexing +beautifulsoup4>=4.12.0 +lxml>=4.9.0 # Testing pytest==7.4.3 diff --git a/cc-registry-v2/backend/run_migrations.py b/cc-registry-v2/backend/run_migrations.py index ee523256238f..1d1bd1a7830f 100755 --- a/cc-registry-v2/backend/run_migrations.py +++ b/cc-registry-v2/backend/run_migrations.py @@ -25,9 +25,16 @@ def ensure_base_tables(): create_all() is safe to call on an existing DB — it only creates tables that are missing and never alters existing ones. """ + from sqlalchemy import text from app.core.database import Base, engine # Import all models so they are registered on Base.metadata from app.models import CodeCollection, Codebundle, CodeCollectionVersion, AIEnhancementLog, TaskGrowthMetric # noqa: F401 + from app.models.vector_models import VectorCodebundle, VectorCodecollection, VectorLibrary, VectorDocumentation # noqa: F401 + + # pgvector extension must exist before creating vector tables + with engine.connect() as conn: + conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector")) + conn.commit() print("Ensuring base tables exist...") Base.metadata.create_all(bind=engine) diff --git a/cc-registry-v2/backend/sources.yaml b/cc-registry-v2/backend/sources.yaml new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/cc-registry-v2/docker-compose.yml b/cc-registry-v2/docker-compose.yml index 9d5bd3545a09..19325929a070 100644 --- a/cc-registry-v2/docker-compose.yml +++ b/cc-registry-v2/docker-compose.yml @@ -76,9 +76,10 @@ services: volumes: - ./backend:/app - /app/__pycache__ - - ../mcp-server/data/repos:/mcp-repos:ro # MCP cloned repos for genrules parsing - - ../map-tag-icons.yaml:/app/map-tag-icons.yaml:ro # Tag icon mappings - - ./schedules.yaml:/app/schedules.yaml:ro # Schedule configuration + - ../mcp-server/data/repos:/mcp-repos:ro + - ./sources.yaml:/app/sources.yaml:ro + - ../map-tag-icons.yaml:/app/map-tag-icons.yaml:ro + - ./schedules.yaml:/app/schedules.yaml:ro command: sh -c "python run_migrations.py && uvicorn app.main:app --host 0.0.0.0 --port 8001 --reload" # Note: Migrations run on startup, then uvicorn starts in reload mode for development healthcheck: @@ -92,7 +93,7 @@ services: build: ./worker container_name: registry-worker env_file: - - az.secret # Azure OpenAI credentials for AI enhancement + - az.secret environment: - DATABASE_URL=postgresql://user:password@database:5432/codecollection_registry - REDIS_URL=redis://redis:6379/0 @@ -105,8 +106,9 @@ services: backend: condition: service_healthy 
volumes: - - ./backend:/app # Share backend code - - ./schedules.yaml:/app/schedules.yaml:ro # Schedule configuration + - ./backend:/app + - ./sources.yaml:/app/sources.yaml:ro + - ./schedules.yaml:/app/schedules.yaml:ro command: celery -A app.tasks.celery_app worker --loglevel=info restart: unless-stopped @@ -115,7 +117,7 @@ services: build: ./worker container_name: registry-scheduler env_file: - - az.secret # Azure OpenAI credentials for AI enhancement + - az.secret environment: - DATABASE_URL=postgresql://user:password@database:5432/codecollection_registry - REDIS_URL=redis://redis:6379/0 @@ -128,8 +130,9 @@ services: backend: condition: service_healthy volumes: - - ./backend:/app # Share backend code - - ./schedules.yaml:/app/schedules.yaml:ro # Schedule configuration + - ./backend:/app + - ./sources.yaml:/app/sources.yaml:ro + - ./schedules.yaml:/app/schedules.yaml:ro command: celery -A app.tasks.celery_app beat --loglevel=info restart: unless-stopped diff --git a/cc-registry-v2/docs/ARCHITECTURE.md b/cc-registry-v2/docs/ARCHITECTURE.md index 145e1221b591..347b26acfb46 100644 --- a/cc-registry-v2/docs/ARCHITECTURE.md +++ b/cc-registry-v2/docs/ARCHITECTURE.md @@ -9,11 +9,11 @@ The registry runs as 8 Docker services coordinated by `docker-compose.yml`. | Service | Image / Stack | Port | Role | |---|---|---|---| | **frontend** | React 19 + TypeScript + MUI v7 | 3000 | SPA for browsing and managing CodeBundles | -| **backend** | FastAPI + SQLAlchemy 2.0 | 8001 | REST API (`/api/v1/`), business logic, AI enhancement | -| **mcp-server** | FastAPI (separate repo: `../mcp-server`) | 8000 | Stateless MCP tool server, delegates to backend API | -| **worker** | Celery (shares backend image) | -- | Background task processing | +| **backend** | FastAPI + SQLAlchemy 2.0 | 8001 | REST API (`/api/v1/`), business logic, AI enhancement, embedding generation | +| **mcp-server** | FastAPI (separate repo: `../mcp-server`) | 8000 | Stateless MCP tool server, delegates all queries to backend API | +| **worker** | Celery (shares backend image) | -- | Background task processing (sync, parse, enhance, embed) | | **scheduler** | Celery Beat (shares backend image) | -- | Cron-driven task scheduling | -| **database** | PostgreSQL 15 + pgvector (`pgvector/pgvector:pg15`) | 5432 | Primary data store, vector extension enabled | +| **database** | PostgreSQL 15 + pgvector (`pgvector/pgvector:pg15`) | 5432 | Primary data store + vector embeddings | | **redis** | Redis 7 Alpine | 6379 | Celery broker and result backend | | **flower** | Flower 2.0 | 5555 | Celery monitoring dashboard | @@ -34,8 +34,7 @@ The registry runs as 8 Docker services coordinated by `docker-compose.yml`. └───────────────────┘ call └────────┬───────────┘ └────────┬────────┘ │ │ │ │ REGISTRY_API_URL │ │ - │ (delegates all │ │ - │ data queries │ │ + │ (delegates all queries │ │ │ back to backend) │ │ └──────────────────►────────────┘ │ │ │ @@ -49,59 +48,133 @@ The registry runs as 8 Docker services coordinated by `docker-compose.yml`. └──────────────┘ ``` -## Data Flow +## Data Pipeline: Sync → Parse → Enhance → Embed -### Primary data path: Registry API +The backend Celery worker runs a unified pipeline that populates both the relational tables **and** the vector tables. The `sync_parse_enhance_workflow_task` runs every 6 hours: -All persistent data lives in PostgreSQL. The backend is the only service that talks to the database directly. 
+``` +Celery Beat dispatches scheduled-sync + │ + ▼ +Worker: sync_parse_enhance_workflow_task + │ + ├── Step 1: sync_all_collections_task + │ Clone/update git repos for each CodeCollection + │ + ├── Step 2: parse_all_codebundles_task + │ Parse meta.yaml, *.robot files, README.md + │ Extract tasks, SLIs, metadata, support tags + │ INSERT/UPDATE codebundles in PostgreSQL + │ + ├── Step 3: enhance_pending_codebundles_task + │ AI-enhance NEW codebundles only (pending/NULL status) + │ Generate descriptions, classify platforms, etc. + │ UPDATE codebundles in PostgreSQL + │ + └── Step 4: index_codebundles_task + Generate embeddings (Azure OpenAI text-embedding-3-small) + Upsert into vector_codebundles and vector_codecollections + via pgvector +``` + +A separate daily task crawls external documentation: ``` -Frontend ──HTTP──► Backend ──SQLAlchemy──► PostgreSQL +Celery Beat dispatches index-documentation-daily (3 AM) + │ + ▼ +Worker: index_documentation_task + │ + ├── Load documentation sources from sources.yaml + ├── Crawl each URL (httpx + BeautifulSoup) + ├── Generate embeddings for crawled content + └── Upsert into vector_documentation via pgvector ``` -### MCP Server path: Tool-based access +## Search + +### Keyword search (existing) -The MCP server is a **thin, stateless proxy**. It exposes MCP tools (find_codebundle, find_documentation, etc.) that clients can call. Every tool delegates to the backend REST API via `RegistryClient` -- the MCP server never touches the database. +All keyword-based search hits the backend's `GET /api/v1/codebundles?search=` endpoint, which runs **weighted ILIKE** directly on PostgreSQL: ``` -Client ──POST /tools/call──► MCP Server ──HTTP──► Backend API ──► PostgreSQL +Frontend / MCP Server / Chat + │ + ▼ +Backend: GET /api/v1/codebundles?search=... + │ + ▼ +PostgreSQL: ILIKE keyword matching + - name: weight 4 + - display_name: weight 3 + - support_tags: weight 3 + - description: weight 1 + - doc: weight 1 + Results ranked by aggregate relevance score ``` -### Background tasks: Celery pipeline +### Semantic (vector) search -Long-running operations (repo sync, codebundle parsing, AI enhancement, indexing) run as Celery tasks dispatched by the scheduler or triggered manually from the admin UI. +The backend exposes embedding-based search through `/api/v1/vector/search/*`: ``` -Scheduler (Beat) ──dispatches──► Redis ──consumed by──► Worker ──► PostgreSQL +MCP Server / Chat / Frontend + │ + ▼ +Backend: GET /api/v1/vector/search?query=...&tables=codebundles,documentation + │ + ├── Generate query embedding (Azure OpenAI) + ├── Cosine similarity search via pgvector (<=> operator) + └── Return ranked results with scores ``` -### Document indexing path +Available endpoints: -The indexer (`mcp-server/indexer.py`) is a standalone CLI tool. It clones GitHub repos, parses codebundles and libraries, crawls documentation URLs, generates vector embeddings, and stores them in a local vector index file (`data/vector_index.json`). This index is used by the MCP server for semantic search when configured, or the MCP server can delegate to the backend API for text-based search instead. 
+| Endpoint | Searches | +|---|---| +| `GET /api/v1/vector/search` | All tables (unified) | +| `GET /api/v1/vector/search/codebundles` | Codebundle embeddings | +| `GET /api/v1/vector/search/documentation` | Documentation embeddings | +| `GET /api/v1/vector/search/libraries` | Library embeddings | +| `GET /api/v1/vector/stats` | Row counts per table | +| `POST /api/v1/vector/reindex` | Trigger full reindex | + +## Chat: MCP Tool Delegation + +The chat system uses the MCP server as an intermediary: ``` -Indexer ──git clone──► GitHub - │ - ├── Parse codebundles (meta.yaml, *.robot, README.md) - ├── Parse libraries (Python AST, Robot keywords) - ├── Crawl documentation (sources.yaml URLs) - │ - ▼ -Embedding Generator ──API──► Azure OpenAI (text-embedding-3-small) - │ - ▼ -LocalVectorStore ──writes──► data/vector_index.json +User question → Frontend → POST /api/v1/chat/query + │ + ▼ +Backend (mcp_chat.py): + 1. Classify question type + 2. Call MCP Server: POST http://mcp-server:8000/tools/call + │ + ▼ +MCP Server (server_http.py): + 1. Look up tool in ToolRegistry + 2. Tool calls backend API (keyword search or vector search) + 3. Format results as markdown + │ + ▼ +Backend (mcp_chat.py): + 4. LLM synthesizes natural language answer from results + 5. Return structured response to frontend ``` -## PostgreSQL + pgvector +## Frontend Browsing -The database image is `pgvector/pgvector:pg15`, which bundles the pgvector extension. On first start, `database/init/01-init.sql` enables the extension: +Direct REST API calls from the frontend to the backend. No MCP server involvement. -```sql -CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; -CREATE EXTENSION IF NOT EXISTS vector; +``` +Frontend ──HTTP──► Backend ──SQLAlchemy──► PostgreSQL ``` +## PostgreSQL + pgvector + +The database image is `pgvector/pgvector:pg15`. The pgvector extension is enabled on first start. + ### Core tables (managed by Alembic) | Table | Purpose | @@ -115,72 +188,60 @@ CREATE EXTENSION IF NOT EXISTS vector; | `analytics` | Task growth metrics | | `task_executions` | Celery task execution history | -### Vector tables (created by migration `006_add_pgvector.sql`) +### Vector tables (created by `006_add_pgvector.sql`) -| Table | Embedding dim | Purpose | +| Table | Embedding dim | Content | |---|---|---| -| `vector_codebundles` | 1536 | CodeBundle embeddings | +| `vector_codebundles` | 1536 | CodeBundle embeddings (name, description, tasks, readme) | | `vector_codecollections` | 1536 | Collection embeddings | | `vector_libraries` | 1536 | Library/keyword embeddings | -| `vector_documentation` | 1536 | Documentation page embeddings | +| `vector_documentation` | 1536 | Documentation page embeddings (crawled from sources.yaml) | -Each vector table uses HNSW indexes for cosine similarity search plus B-tree indexes on metadata columns. - -**Current state:** The pgvector tables exist in the schema but are not yet used by the MCP server in production. The MCP server's indexer writes to a `LocalVectorStore` (in-memory numpy + JSON file). Migration to pgvector-backed search is planned. +Each vector table has HNSW indexes for cosine similarity (`vector_cosine_ops`) plus B-tree indexes on metadata JSONB columns. ## MCP Server Architecture The MCP server (`../mcp-server/`) is a sibling directory, not nested inside cc-registry-v2. It is built as a separate Docker image. -### Runtime mode: Stateless API - -In production, `server_http.py` runs as a FastAPI app. 
It registers MCP tools on startup and delegates all data access to the backend API through `RegistryClient`. - -- No database connection -- No local vector store at runtime -- No embedding generation at runtime -- Pure HTTP proxy with tool-call semantics - -### Indexer mode: Batch processing - -`indexer.py` is a separate CLI tool that runs offline (or via Celery task). It **does** use the local vector store and embedding generator to build the search index. +The server is **stateless**. `server_http.py` runs as a FastAPI app, registers MCP tools on startup, and delegates all data access to the backend API through `RegistryClient`. It does not have a database connection, vector store, or embedding generator. ### Tool categories | Category | Tools | Data source | |---|---|---| -| **search** | `find_codebundle`, `search_codebundles`, `find_codecollection`, `keyword_usage_help`, `find_library_info`, `find_documentation`, `check_existing_requests` | Backend API | -| **info** | `list_codebundles`, `list_codecollections`, `get_codebundle_details`, `get_development_requirements` | Backend API / local docs.yaml | +| **search** | `find_codebundle`, `search_codebundles`, `find_codecollection`, `find_documentation`, `find_library_info`, `keyword_usage_help`, `check_existing_requests` | Backend API (vector search with keyword fallback) | +| **info** | `list_codebundles`, `list_codecollections`, `get_codebundle_details`, `get_development_requirements` | Backend API | | **action** | `request_codebundle` | GitHub API | ## Celery Task System -### Task types +### Task modules -| Module | Key tasks | -|---|---| -| `workflow_tasks` | `sync_parse_enhance_workflow_task` -- the primary pipeline | -| `registry_tasks` | `sync_all_collections_task`, `parse_all_codebundles_task` | -| `ai_enhancement_tasks` | `enhance_pending_codebundles_task` | -| `mcp_tasks` | `index_documentation_task`, `reindex_all_task` | -| `data_population_tasks` | `update_collection_statistics_task` | -| `analytics_tasks` | `compute_task_growth_analytics` | -| `task_monitoring` | `cleanup_old_tasks_task`, `health_check_tasks_task` | +| Module | Key tasks | Purpose | +|---|---|---| +| `workflow_tasks` | `sync_parse_enhance_workflow_task` | 4-step pipeline: sync → parse → enhance → embed | +| `registry_tasks` | `sync_all_collections_task`, `parse_all_codebundles_task` | Steps 1-2 of the pipeline | +| `ai_enhancement_tasks` | `enhance_pending_codebundles_task` | Step 3: AI metadata enhancement | +| `indexing_tasks` | `index_codebundles_task`, `index_documentation_task`, `reindex_all_task` | Step 4: embedding generation + pgvector storage | +| `data_population_tasks` | `update_collection_statistics_task` | Hourly stats refresh | +| `analytics_tasks` | `compute_task_growth_analytics` | Daily analytics | +| `task_monitoring` | `cleanup_old_tasks_task`, `health_check_tasks_task` | Maintenance | +| `mcp_tasks` | *(deprecated stubs)* | Redirect to `indexing_tasks` | ### Scheduling -All schedules are defined in `schedules.yaml` and loaded by Celery Beat. Key schedules: +All schedules are defined in `schedules.yaml` and loaded by Celery Beat. 
| Schedule | Frequency | Task | |---|---|---| -| `scheduled-sync` | Every 6 hours | Full sync-parse-enhance workflow | -| `index-documentation-daily` | Daily 3 AM | Re-index documentation embeddings | +| `scheduled-sync` | Every 6 hours | Full pipeline: sync → parse → enhance → embed | +| `index-documentation-daily` | Daily 3 AM | Crawl documentation URLs, generate embeddings | +| `reindex-vectors-weekly` | Sunday 2 AM | Full rebuild of all vector embeddings | | `update-statistics-hourly` | Hourly | Refresh collection statistics | +| `compute-task-growth-analytics` | Daily 2:30 AM | Git history analysis for task growth | | `health-check` | Every 5 min | System health check | | `cleanup-old-tasks` | Daily 12:30 AM | Purge old task execution records | -See [SCHEDULES.md](SCHEDULES.md) and [MCP_INDEXING_SCHEDULE.md](MCP_INDEXING_SCHEDULE.md) for details. - ## Deployment Topology ### Local development @@ -199,8 +260,8 @@ See [DEPLOYMENT_GUIDE.md](DEPLOYMENT_GUIDE.md) and [k8s/README.md](../k8s/README ## Related Documentation -- [CONFIGURATION.md](CONFIGURATION.md) -- Environment variables and secrets -- [MCP_WORKFLOW.md](MCP_WORKFLOW.md) -- Document indexing pipeline and search flow -- [MCP_INDEXING_SCHEDULE.md](MCP_INDEXING_SCHEDULE.md) -- Automated indexing setup -- [AZURE_OPENAI_SETUP.md](AZURE_OPENAI_SETUP.md) -- Azure OpenAI configuration -- [DATABASE_REDIS_CONFIG.md](DATABASE_REDIS_CONFIG.md) -- Database and Redis setup +- [CONFIGURATION.md](CONFIGURATION.md) — Environment variables and secrets +- [MCP_WORKFLOW.md](MCP_WORKFLOW.md) — Search and indexing flows in detail +- [MCP_INDEXING_SCHEDULE.md](MCP_INDEXING_SCHEDULE.md) — Automated indexing setup +- [AZURE_OPENAI_SETUP.md](AZURE_OPENAI_SETUP.md) — Azure OpenAI configuration +- [DATABASE_REDIS_CONFIG.md](DATABASE_REDIS_CONFIG.md) — Database and Redis setup diff --git a/cc-registry-v2/docs/CONFIGURATION.md b/cc-registry-v2/docs/CONFIGURATION.md index ba35ab55899c..6ed84fa64666 100644 --- a/cc-registry-v2/docs/CONFIGURATION.md +++ b/cc-registry-v2/docs/CONFIGURATION.md @@ -104,11 +104,11 @@ AZURE_OPENAI_EMBEDDING_API_VERSION=2024-02-15-preview **Fallback to main endpoint:** -If `AZURE_OPENAI_EMBEDDING_*` vars are not set, the indexer uses `AZURE_OPENAI_ENDPOINT` and `AZURE_OPENAI_API_KEY` instead. +If `AZURE_OPENAI_EMBEDDING_*` vars are not set, the backend's embedding service uses `AZURE_OPENAI_ENDPOINT` and `AZURE_OPENAI_API_KEY` instead. **No Azure credentials:** -If neither set of credentials is available, the indexer falls back to local `sentence-transformers` (`all-MiniLM-L6-v2`, 384 dimensions). +If neither set of credentials is available, the embedding step is silently skipped. Vector tables remain empty and keyword search continues to work normally. See [AZURE_OPENAI_SETUP.md](AZURE_OPENAI_SETUP.md) for detailed setup instructions. @@ -158,7 +158,7 @@ codecollections: description: "CLI-based automation codebundles" ``` -### `mcp-server/sources.yaml` +### `cc-registry-v2/sources.yaml` Defines documentation URLs to crawl for indexing. See [MCP_WORKFLOW.md](MCP_WORKFLOW.md) for format details. diff --git a/cc-registry-v2/docs/MCP_INDEXING_SCHEDULE.md b/cc-registry-v2/docs/MCP_INDEXING_SCHEDULE.md index 52fe96431c48..489afef54a13 100644 --- a/cc-registry-v2/docs/MCP_INDEXING_SCHEDULE.md +++ b/cc-registry-v2/docs/MCP_INDEXING_SCHEDULE.md @@ -1,228 +1,92 @@ -# MCP Indexing: Scheduled Tasks +# Indexing and Scheduled Tasks -Automated and manual indexing of the MCP server's vector search index. 
+Scheduled tasks that keep the registry data and vector embeddings current. All tasks are defined in `schedules.yaml` and dispatched by Celery Beat. -## Scheduled Tasks - -Two indexing schedules are defined in `schedules.yaml` under the "MCP Server Indexing" section. They run as Celery tasks dispatched by the Beat scheduler. - -### Documentation Indexing (daily) +## Sync-Parse-Enhance-Embed Workflow (every 6 hours) | Field | Value | |---|---| -| Schedule name | `index-documentation-daily` | -| Celery task | `app.tasks.mcp_tasks.index_documentation_task` | -| Frequency | Daily at 3:00 AM | +| Schedule name | `scheduled-sync` | +| Celery task | `app.tasks.workflow_tasks.sync_parse_enhance_workflow_task` | +| Frequency | Every 6 hours (midnight, 6 AM, noon, 6 PM) | | Enabled | Yes | -Re-indexes documentation sources from `mcp-server/sources.yaml`. Crawls linked URLs, generates embeddings, and writes them to the local vector store (`data/vector_index.json`). Does **not** re-index codebundles. +The primary data pipeline runs four steps in sequence: -Typical duration: 5-10 minutes depending on the number of documentation URLs and web crawling speed. +1. **Sync** — Clone or pull all CodeCollection git repos +2. **Parse** — Walk repos, parse `meta.yaml` + `*.robot` files, upsert into PostgreSQL +3. **AI Enhance** — Send new/pending codebundles to Azure OpenAI GPT for metadata enrichment +4. **Embed** — Generate embeddings via Azure OpenAI text-embedding-3-small, upsert into pgvector tables (`vector_codebundles`, `vector_codecollections`) -### Full Re-index (weekly) +## Documentation Indexing (daily) | Field | Value | |---|---| -| Schedule name | `reindex-mcp-weekly` | -| Celery task | `app.tasks.mcp_tasks.reindex_all_task` | -| Frequency | Sunday at 2:00 AM | -| Enabled | **No** (disabled by default) | - -Complete rebuild of the vector index. Clones/updates all git repos, parses all codebundles and libraries, crawls all documentation, regenerates all embeddings. - -Typical duration: 20-30 minutes for the full registry. - -Enable it in `schedules.yaml` if you want periodic full refreshes: - -```yaml -- name: reindex-mcp-weekly - enabled: true # change from false to true -``` - -Then restart the scheduler: +| Schedule name | `index-documentation-daily` | +| Celery task | `app.tasks.indexing_tasks.index_documentation_task` | +| Frequency | Daily at 3:00 AM UTC | +| Enabled | Yes | -```bash -docker-compose restart scheduler -``` +Crawls documentation URLs defined in `sources.yaml`, generates embeddings, and stores them in `vector_documentation`. -## Manual Triggers +Steps: +1. Load documentation entries from `sources.yaml` +2. Crawl each URL with httpx + BeautifulSoup +3. Build searchable document text from crawled content +4. Generate embeddings +5. Upsert into `vector_documentation` -### From the Admin UI +## Full Vector Reindex (weekly) -1. Navigate to Admin Panel, then the Schedules tab -2. Find `index-documentation-daily` or `reindex-mcp-weekly` -3. Click "Run Now" -4. Monitor progress in the Task Manager view +| Field | Value | +|---|---| +| Schedule name | `reindex-vectors-weekly` | +| Celery task | `app.tasks.indexing_tasks.reindex_all_task` | +| Frequency | Sunday at 2:00 AM UTC | +| Enabled | Yes | -### Via API +Rebuilds all vector tables from scratch (codebundles + codecollections + documentation). Useful for recovering from drift or after schema changes. 
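For orientation, the embed-and-upsert step shared by the daily and weekly tasks might look roughly like the sketch below. This is illustrative only: `SessionLocal` and `VectorDocumentation` exist in the backend as described above, but the column names (`document`, `embedding`), keying rows by URL, and the helper functions themselves are assumptions, not the shipped implementation in `indexing_tasks.py`.

```python
import os
from openai import AzureOpenAI  # openai>=1.0.0, as pinned in requirements.txt

from app.core.database import SessionLocal
from app.models.vector_models import VectorDocumentation

# Dedicated embedding credentials, falling back to the shared endpoint
# (mirrors the fallback behavior documented in the Configuration table).
client = AzureOpenAI(
    azure_endpoint=os.environ.get("AZURE_OPENAI_EMBEDDING_ENDPOINT")
    or os.environ["AZURE_OPENAI_ENDPOINT"],
    api_key=os.environ.get("AZURE_OPENAI_EMBEDDING_API_KEY")
    or os.environ["AZURE_OPENAI_API_KEY"],
    api_version="2024-02-15-preview",
)

def embed(texts: list[str]) -> list[list[float]]:
    """One batched API call; each returned vector is 1536-dim (text-embedding-3-small)."""
    resp = client.embeddings.create(model="text-embedding-3-small", input=texts)
    return [item.embedding for item in resp.data]

def upsert_documentation(pages: dict[str, str], batch_size: int = 100) -> None:
    """Upsert crawled pages keyed by URL. Column names are hypothetical."""
    urls, texts = list(pages.keys()), list(pages.values())
    with SessionLocal() as session:
        for i in range(0, len(texts), batch_size):
            vectors = embed(texts[i:i + batch_size])
            for url, text, vec in zip(urls[i:], texts[i:], vectors):
                row = session.get(VectorDocumentation, url)  # assumes URL is the PK
                if row is None:
                    row = VectorDocumentation(id=url)
                    session.add(row)
                row.document = text   # assumed text column
                row.embedding = vec   # pgvector Vector(1536) column
        session.commit()
```

The weekly reindex differs only in scope: it regenerates every row rather than just changed ones, which is why it can recover from drift.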
-```bash -# Documentation indexing -curl -X POST "http://localhost:8001/api/v1/schedules/index-documentation-daily/trigger" \ - -H "Authorization: Bearer $TOKEN" +## Other Schedules -# Full re-index -curl -X POST "http://localhost:8001/api/v1/schedules/reindex-mcp-weekly/trigger" \ - -H "Authorization: Bearer $TOKEN" -``` +| Schedule | Frequency | Task | Purpose | +|---|---|---|---| +| `validate-yaml-seed-daily` | Daily 1 AM | `sync_all_collections_task` | Ensure all YAML-defined collections exist in the database | +| `update-statistics-hourly` | Hourly | `update_collection_statistics_task` | Refresh collection statistics | +| `compute-task-growth-analytics` | Daily 2:30 AM | `compute_task_growth_analytics` | Analyze git history for task growth | +| `health-check` | Every 5 min | `health_check_task` | System health check | +| `health-check-tasks` | Every 10 min | `health_check_tasks_task` | Task queue health check | +| `cleanup-old-tasks` | Daily 12:30 AM | `cleanup_old_tasks_task` | Purge old task execution records | -### Via command line (direct) +## Manual Triggers -Run the indexer directly without Celery: +All indexing tasks can be triggered manually via the Admin UI or the API: ```bash -cd mcp-server +# Trigger full vector reindex +curl -X POST http://localhost:8001/api/v1/vector/reindex -# Documentation only (fast) -python indexer.py --docs-only - -# Full index (codebundles + libraries + documentation) -python indexer.py - -# Specific collection only -python indexer.py --collection rw-cli-codecollection - -# Use local embeddings (no Azure OpenAI API calls) -python indexer.py --local -``` +# Trigger codebundle embedding only +curl -X POST http://localhost:8001/api/v1/vector/reindex/codebundles -## How Scheduled Indexing Works +# Trigger documentation embedding only +curl -X POST http://localhost:8001/api/v1/vector/reindex/documentation -``` -1. Celery Beat (scheduler container) reads schedules.yaml -2. At the scheduled time, Beat dispatches the task to Redis -3. Celery Worker picks up the task from Redis -4. Worker executes mcp_tasks.py::index_documentation_task() -5. Task invokes the indexer as a subprocess: - cd mcp-server && python indexer.py --docs-only -6. Indexer crawls sources.yaml URLs, generates embeddings, writes vector_index.json -7. Task execution is recorded in the task_executions table +# Check vector table stats +curl http://localhost:8001/api/v1/vector/stats ``` ## Configuration -### Adjust schedule times - -Edit `schedules.yaml`. Crontab fields: - -```yaml -crontab: - hour: 3 # 0-23 - minute: 0 # 0-59 - day_of_week: 0 # 0=Sunday, 1=Monday, ... 6=Saturday (or null for every day) -``` - -### Enable/disable schedules - -Set `enabled: true` or `enabled: false` in `schedules.yaml`, then restart the scheduler: - -```bash -docker-compose restart scheduler -``` - -### Azure OpenAI credentials - -The indexer needs embedding API credentials. Set them in `az.secret` (loaded by all backend services via `env_file`): - -```bash -AZURE_OPENAI_EMBEDDING_ENDPOINT=https://your-instance.openai.azure.com/ -AZURE_OPENAI_EMBEDDING_API_KEY=your-key -AZURE_OPENAI_EMBEDDING_DEPLOYMENT=text-embedding-3-small -``` - -Or fall back to the shared Azure OpenAI credentials: - -```bash -AZURE_OPENAI_ENDPOINT=https://your-instance.openai.azure.com/ -AZURE_OPENAI_API_KEY=your-key -``` - -If no Azure credentials are available, the indexer falls back to local `sentence-transformers` (lower quality, zero API cost). 
- -## Monitoring - -### Check task execution history - -- Admin UI: Task Manager tab shows all recent task executions with status, duration, and error messages -- Flower: http://localhost:5555 for real-time Celery monitoring - -### Check worker and scheduler logs - -```bash -docker logs registry-worker --tail=100 -docker logs registry-scheduler --tail=100 -``` - -### Verify the MCP server has indexed data - -```bash -curl http://localhost:8000/health -``` - -The `data_stats` field shows counts for codebundles, collections, libraries, and documentation. - -## Sync vs Index +Embedding generation requires Azure OpenAI credentials. Set these in `az.secret` or as environment variables: -These are separate processes that serve different purposes: - -| | Sync (registry_tasks) | Index (mcp_tasks / indexer.py) | +| Variable | Purpose | Required | |---|---|---| -| **Writes to** | PostgreSQL (backend database) | `data/vector_index.json` (local file) | -| **Used by** | Web UI for browsing, REST API for querying | Semantic search (MCP server, when vector store is active) | -| **Data source** | GitHub repos (clone + parse) | GitHub repos (clone + parse) + documentation URLs | -| **Schedule** | Every 6 hours (`scheduled-sync`) | Daily at 3 AM (`index-documentation-daily`) | -| **Generates embeddings** | No | Yes (Azure OpenAI or local) | - -Both read from the same GitHub repos but maintain separate data stores. - -## Troubleshooting - -### Task not running on schedule - -1. Check that the scheduler container is running: - ```bash - docker ps | grep scheduler - ``` -2. Verify the schedule is enabled in `schedules.yaml` -3. Check scheduler logs for errors: - ```bash - docker logs registry-scheduler --tail=50 - ``` - -### Indexing task failing - -1. Check worker logs: - ```bash - docker logs registry-worker --tail=100 - ``` -2. Run the indexer manually to see detailed errors: - ```bash - cd mcp-server && python indexer.py --docs-only - ``` -3. Common causes: - - Azure OpenAI credentials missing or expired - - Network connectivity issues for web crawling - - Git clone failures (authentication, network) - -### Embeddings not generating - -Verify Azure OpenAI credentials are set: - -```bash -# Check if the env vars are reaching the worker -docker exec registry-worker env | grep AZURE_OPENAI -``` - -Or bypass the API entirely with local embeddings: - -```bash -cd mcp-server && python indexer.py --local --docs-only -``` +| `AZURE_OPENAI_EMBEDDING_ENDPOINT` | Dedicated embedding endpoint (falls back to `AZURE_OPENAI_ENDPOINT`) | If using dedicated endpoint | +| `AZURE_OPENAI_EMBEDDING_API_KEY` | Dedicated embedding API key (falls back to `AZURE_OPENAI_API_KEY`) | If using dedicated key | +| `AZURE_OPENAI_EMBEDDING_DEPLOYMENT` | Deployment name (default: `text-embedding-3-small`) | No | +| `EMBEDDING_BATCH_SIZE` | Texts per API call (default: `100`) | No | -## Related Documentation +> **Note:** Vector dimensions are fixed at **1536** to match the `text-embedding-3-small` model and the database schema. This is not configurable. -- [ARCHITECTURE.md](ARCHITECTURE.md) -- System architecture -- [MCP_WORKFLOW.md](MCP_WORKFLOW.md) -- Complete indexing pipeline details -- [CONFIGURATION.md](CONFIGURATION.md) -- Environment variables reference -- [SCHEDULES.md](SCHEDULES.md) -- General schedule management -- [AZURE_OPENAI_SETUP.md](AZURE_OPENAI_SETUP.md) -- Azure OpenAI credential setup +If embedding credentials are not configured, the embedding step is silently skipped and vector tables remain empty. 
Keyword search continues to work. diff --git a/cc-registry-v2/docs/MCP_WORKFLOW.md b/cc-registry-v2/docs/MCP_WORKFLOW.md index 02a00df5839f..4792aa485175 100644 --- a/cc-registry-v2/docs/MCP_WORKFLOW.md +++ b/cc-registry-v2/docs/MCP_WORKFLOW.md @@ -1,375 +1,191 @@ -# MCP Server Workflow: Indexing and Search +# MCP Server Workflow: Search and Indexing -How document indexing, embedding generation, and semantic search work in the CodeCollection Registry. +How data ingestion, embedding generation, and search work in the CodeCollection Registry. -## Overview +## Unified Pipeline -There are two distinct data paths: +There is **one pipeline** for all environments. The backend Celery worker: -1. **Runtime search** -- The MCP server (`server_http.py`) is a stateless API that delegates all queries to the backend Registry API over HTTP. No embeddings or vector store are involved at runtime. -2. **Offline indexing** -- The indexer (`indexer.py`) clones repos, parses codebundles, crawls docs, generates embeddings via Azure OpenAI, and writes them to a local vector index file. +1. Syncs CodeCollection repos from GitHub +2. Parses codebundles from Robot files +3. AI-enhances metadata for new codebundles +4. Generates embeddings and stores them in pgvector -The runtime search path is what production uses today. The offline indexer populates a local vector store that can supplement or replace the backend API search in future. +The MCP server is a **stateless HTTP proxy** that delegates all queries to the backend API. -## Runtime Search Flow +--- -### How a user query becomes results +## Data Ingestion + +### Sync → Parse → Enhance → Embed workflow + +The primary scheduled task `sync_parse_enhance_workflow_task` runs every 6 hours: ``` -User types question in Chat UI - │ - ▼ -Frontend: POST /api/v1/chat/query +Celery Beat dispatches → Redis → Worker picks up task │ ▼ -Backend (mcp_chat.py): - 1. Classifies question type - 2. Calls MCP Server via MCPClient +Step 1: sync_all_collections_task + - For each CodeCollection in the database: + - git clone (or git pull) the repo + - Store raw repo data │ ▼ -MCPClient: POST http://mcp-server:8000/tools/call - { - "tool_name": "find_codebundle", - "arguments": {"query": "...", "max_results": 10} - } +Step 2: parse_all_codebundles_task + - For each collection repo: + - Walk directory tree looking for meta.yaml + *.robot files + - Parse meta.yaml: extract name, description, tags + - Parse *.robot files: extract tasks, SLIs, keywords, variables + - Parse README.md for documentation content + - INSERT or UPDATE codebundle rows in PostgreSQL │ ▼ -MCP Server (server_http.py): - 1. Looks up tool in ToolRegistry - 2. Tool calls RegistryClient (HTTP) +Step 3: enhance_pending_codebundles_task + - Query codebundles WHERE enhancement_status IS NULL or 'pending' + - For each unenhanced codebundle: + - Send to Azure OpenAI GPT for analysis + - Generate: improved description, platform classification, + access level, IAM requirements, data classifications + - UPDATE the codebundle row │ ▼ -RegistryClient: GET http://backend:8001/api/v1/codebundles?search=... 
- │ - ▼ -Backend: Queries PostgreSQL with text search - │ - ▼ -Results flow back through the chain: - MCP Server formats as markdown - │ - ▼ -Backend: LLM synthesizes natural language answer +Step 4: index_codebundles_task + - Query all active codebundles and codecollections from PostgreSQL + - Build document text for each (name + description + tags + tasks + readme) + - Generate embeddings via Azure OpenAI text-embedding-3-small + - Upsert into vector_codebundles and vector_codecollections (pgvector) +``` + +### Documentation indexing + +A separate daily task crawls external documentation: + +``` +Celery Beat dispatches index-documentation-daily (3 AM) │ ▼ -Frontend: Displays answer + relevant codebundles +Worker: index_documentation_task + │ + ├── Load documentation entries from sources.yaml + │ (URLs for RunWhen docs, library references, FAQs, etc.) + │ + ├── Crawl each URL with httpx + BeautifulSoup + │ Extract title, headings, code blocks, body text + │ + ├── Build document text from crawled content + metadata + │ + ├── Generate embeddings via Azure OpenAI + │ + └── Upsert into vector_documentation (pgvector) ``` -### MCP tools used at runtime +### Full weekly reindex + +`reindex_all_task` runs weekly (Sunday 2 AM) and rebuilds all vector tables from scratch. + +--- + +## Search + +### Keyword search -All tools delegate to the backend API via `RegistryClient` (`utils/registry_client.py`). The MCP server makes HTTP requests to the backend -- it never queries a database or vector store directly. +The existing keyword search remains available at `GET /api/v1/codebundles?search=`. It runs weighted ILIKE matching directly on PostgreSQL: -| Tool | Backend endpoint | Purpose | -|---|---|---| -| `find_codebundle` | `GET /api/v1/codebundles?search=` | Natural language codebundle search | -| `search_codebundles` | `GET /api/v1/codebundles?search=&platform=&tags=` | Filtered keyword search | -| `list_codebundles` | `GET /api/v1/codebundles` | List all codebundles | -| `get_codebundle_details` | `GET /api/v1/collections/{coll}/codebundles/{cb}` | Single codebundle detail | -| `find_codecollection` | `GET /api/v1/registry/collections` | Search collections | -| `list_codecollections` | `GET /api/v1/registry/collections` | List all collections | -| `keyword_usage_help` | `GET /api/v1/codebundles?search=` | Robot Framework keyword help | -| `find_library_info` | `GET /api/v1/codebundles?search=` | Library information | -| `find_documentation` | Local `docs.yaml` | Search managed documentation | -| `get_development_requirements` | Local `docs.yaml` | Dev requirements for a feature | -| `request_codebundle` | GitHub API | Create GitHub issue | -| `check_existing_requests` | GitHub API | Search existing GitHub issues | +| Field | Weight | +|---|---| +| name | 4 | +| display_name | 3 | +| support_tags | 3 | +| description | 1 | +| doc | 1 | -## Offline Indexing Pipeline +Results are ranked by aggregate relevance score. This endpoint is used by the MCP server's `find_codebundle` and `search_codebundles` tools. -The indexer (`mcp-server/indexer.py`) is a batch CLI tool that builds a vector search index. It runs independently of the HTTP server. 
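For concreteness, the weighted ranking described above could be expressed in SQLAlchemy roughly as follows. This is a sketch of the technique, not the backend's actual query; it assumes the weighted fields are plain text columns on the `Codebundle` model (in particular, that `support_tags` is string-matchable).

```python
from sqlalchemy import case, select
from sqlalchemy.orm import Session

from app.models import Codebundle

def keyword_search(session: Session, term: str, limit: int = 10):
    """Rank codebundles by summing the weight of every field that matches."""
    pattern = f"%{term}%"
    # Each matching field contributes its documented weight to the score.
    score = (
        case((Codebundle.name.ilike(pattern), 4), else_=0)
        + case((Codebundle.display_name.ilike(pattern), 3), else_=0)
        + case((Codebundle.support_tags.ilike(pattern), 3), else_=0)
        + case((Codebundle.description.ilike(pattern), 1), else_=0)
        + case((Codebundle.doc.ilike(pattern), 1), else_=0)
    )
    stmt = (
        select(Codebundle, score.label("relevance"))
        .where(score > 0)          # drop non-matches entirely
        .order_by(score.desc())    # highest aggregate weight first
        .limit(limit)
    )
    return session.execute(stmt).all()
```

The semantic path described next replaces this additive scoring with a single pgvector distance expression over the precomputed embeddings.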
+### Semantic (vector) search -### Pipeline stages +The backend's `/api/v1/vector/search/*` endpoints provide embedding-based search: ``` -Stage 1: Data Acquisition -───────────────────────── - codecollections.yaml +Client sends query string │ ▼ - For each collection: - git clone / git pull - (repos stored in data/repos/) - -Stage 2: Parsing -──────────────── - For each repo: - codebundles/ directory: - ├── meta.yaml → name, author, tags, platform - ├── *.robot files → tasks, keywords, libraries (RobotParser) - └── README.md → description text - - libraries/ directory: - └── *.py files → functions, classes, docstrings (PythonParser AST) - -Stage 3: Document Creation -────────────────────────── - Each codebundle → single text document containing: - - Display name and slug - - Description - - Platform and support tags - - Task names and documentation (up to 20 capabilities) - - README excerpt (up to 2000 chars) - - Each library → single text document containing: - - Name and import path - - Category and description - - Function signatures + docstrings (up to 15) - - Class info + methods (up to 10) - - Robot Framework keywords (up to 20) - -Stage 4: Documentation Crawling -─────────────────────────────── - sources.yaml +Backend: embed query via Azure OpenAI │ ▼ - WebCrawler (crawl4ai headless browser, or httpx+BeautifulSoup fallback) - - Fetches each URL - - Extracts title, body text, headings, code blocks - - Converts to clean markdown - - Stores up to 12,000 chars per page in embedding document - -Stage 5: Embedding Generation -───────────────────────────── - All documents (codebundles + libraries + collections + docs) +PostgreSQL: cosine similarity search (embedding <=> query_vector) + - Uses HNSW index for fast approximate nearest-neighbor lookup + - Optional metadata filters (platform, category, collection_slug) │ ▼ - EmbeddingGenerator - - Azure OpenAI: text-embedding-3-small (1536 dimensions) - - Batches of 100 texts per API call - - Fallback: local sentence-transformers all-MiniLM-L6-v2 (384 dimensions) - -Stage 6: Vector Storage -─────────────────────── - LocalVectorStore (utils/vector_store.py) - - In-memory numpy arrays - - Persisted to data/vector_index.json - - 4 tables: vector_codebundles, vector_codecollections, - vector_libraries, vector_documentation - - Brute-force cosine similarity search - - Metadata filtering (platform, collection, category) +Return ranked results with similarity scores ``` -### No chunking - -Documents are **not** chunked. Each codebundle, library, collection, or documentation page is embedded as a single document. Text is truncated to fit within embedding model token limits: +The MCP server's `find_documentation` tool uses this path (with keyword fallback if the backend's vector tables are empty or the embedding service is unavailable). -| Document type | Max text length | -|---|---| -| CodeBundle README | 2,000 chars | -| CodeBundle capabilities | Up to 20 items | -| Library functions | Up to 15 signatures | -| Documentation page | 12,000 chars | -| Description fields | 500 chars | +--- -### Running the indexer +## MCP Server: Stateless Proxy -```bash -# Full index (codebundles + libraries + documentation) -cd mcp-server -python indexer.py +The MCP server (`../mcp-server/server_http.py`) is a FastAPI app that: -# Documentation only (faster) -python indexer.py --docs-only +1. Registers MCP tools on startup +2. Delegates all data access to the backend via `RegistryClient` +3. 
Formats results as markdown for LLM consumption -# Specific collection -python indexer.py --collection rw-cli-codecollection +### How `find_codebundle` works -# Use local embeddings instead of Azure OpenAI -python indexer.py --local ``` - -## Vector Store - -### Implementation: LocalVectorStore - -The vector store is a zero-infrastructure, in-memory implementation using numpy for cosine similarity search. It persists to a single JSON file. - -**Storage format** (`data/vector_index.json`): - -```json -{ - "vector_codebundles": { - "rw-cli-codecollection/k8s-pod-healthcheck": { - "embedding": [0.012, -0.034, ...], - "document": "Kubernetes Pod Healthcheck\nPlatform: Kubernetes\n...", - "metadata": { - "slug": "k8s-pod-healthcheck", - "collection_slug": "rw-cli-codecollection", - "platform": "Kubernetes", - "tags": "kubernetes,pods,health" - } - } - }, - "vector_codecollections": { ... }, - "vector_libraries": { ... }, - "vector_documentation": { ... } -} +1. MCP Server receives: find_codebundle(query="kubernetes pod restarts") +2. Tool extracts keywords, removes stop words +3. Calls RegistryClient.search_codebundles(search="kubernetes pod restarts") +4. RegistryClient hits: GET http://backend:8001/api/v1/codebundles?search=kubernetes+pod+restarts +5. Backend runs weighted ILIKE search on PostgreSQL +6. Results returned as JSON → MCP tool formats as markdown ``` -**Search algorithm:** -1. Normalize the query embedding vector -2. For each stored vector, compute cosine similarity: `dot(query_vec, stored_vec / norm)` -3. Convert to distance: `distance = 1.0 - cosine_similarity` -4. Apply optional metadata filters (platform, collection_slug, category) -5. Sort by distance ascending (lowest = most similar) -6. Return top N results as `SearchResult` objects - -This brute-force approach works well for the current dataset size (hundreds of vectors). For larger datasets, the pgvector tables in PostgreSQL can be used with HNSW indexing. +### How `find_documentation` works -### pgvector (future) +``` +1. MCP Server receives: find_documentation(query="how to install runwhen local") +2. Tool calls RegistryClient.vector_search_documentation(query="how to install runwhen local") +3. RegistryClient hits: GET http://backend:8001/api/v1/vector/search/documentation?query=... +4. Backend generates query embedding, runs pgvector cosine search +5. Results returned with similarity scores +6. If backend unavailable: falls back to keyword matching on local docs.yaml +``` -The PostgreSQL database has pgvector enabled and four vector tables created by migration `006_add_pgvector.sql`. These tables mirror the local vector store's structure with `vector(1536)` columns and HNSW indexes. They are ready for use but the MCP server has not been migrated to query them yet. +--- -## Embedding Generation +## Key Files -### Azure OpenAI (production) +### Backend (embedding & vector) -| Setting | Value | +| File | Purpose | |---|---| -| Model | `text-embedding-3-small` | -| Dimensions | 1536 | -| Batch size | 100 texts per API call | -| API version | `2024-02-15-preview` | - -Environment variables (checked in order): - -1. `AZURE_OPENAI_EMBEDDING_ENDPOINT` + `AZURE_OPENAI_EMBEDDING_API_KEY` (dedicated) -2. `AZURE_OPENAI_ENDPOINT` + `AZURE_OPENAI_API_KEY` (shared with GPT) - -Deployment name: `AZURE_OPENAI_EMBEDDING_DEPLOYMENT` (default: `text-embedding-3-small`) - -### Local fallback - -If no Azure credentials are available, the indexer uses `sentence-transformers/all-MiniLM-L6-v2` (384-dimensional vectors). 
No API cost, but lower quality. - -Force local mode: `python indexer.py --local` - -## Web Crawler - -Documentation sources defined in `mcp-server/sources.yaml` are crawled for content. - -### Crawl4AI (primary) +| `backend/app/services/embedding_service.py` | Azure OpenAI embedding generation | +| `backend/app/services/vector_service.py` | pgvector CRUD and similarity search | +| `backend/app/services/web_crawler.py` | Documentation URL crawling | +| `backend/app/services/documentation_source_loader.py` | sources.yaml loader | +| `backend/app/tasks/indexing_tasks.py` | Celery tasks for embedding generation | +| `backend/app/tasks/workflow_tasks.py` | Orchestrates the 4-step pipeline | +| `backend/app/routers/vector_search.py` | `/api/v1/vector/*` API endpoints | +| `backend/app/models/vector_models.py` | SQLAlchemy models for pgvector tables | -Uses headless Chromium to render JavaScript-heavy pages (Confluence, SPAs). Outputs clean markdown. Automatically strips navigation, headers, footers. - -### BeautifulSoup (fallback) - -Simple HTTP fetch + HTML parsing. Used when crawl4ai is not installed or fails for a URL. - -### Source configuration (`sources.yaml`) - -```yaml -sources: - documentation: - - name: "RunWhen Platform Docs" - url: "https://docs.runwhen.com/..." - description: "Platform documentation" - topics: ["platform", "setup"] - priority: high - - libraries: - - name: "RW.CLI Library" - url: "https://..." - description: "CLI automation library" - usage_examples: ["RW.CLI.Run Bash"] - - faq: - - question: "How do I create a codebundle?" - answer: "..." - topics: ["development"] - -index_config: - refresh_interval: 24 - crawl_linked_pages: true - max_crawl_depth: 3 - include_code_examples: true -``` - -## Key Files - -### cc-registry-v2 (backend) +### MCP Server | File | Purpose | |---|---| -| `backend/app/routers/mcp_chat.py` | Chat API endpoint, calls MCP server | -| `backend/app/services/mcp_client.py` | HTTP client for MCP server | -| `backend/app/tasks/mcp_tasks.py` | Celery tasks for triggering indexing | -| `schedules.yaml` | Celery Beat schedule configuration | +| `mcp-server/server_http.py` | FastAPI HTTP server (stateless) | +| `mcp-server/utils/registry_client.py` | HTTP client for backend API (includes vector search methods) | +| `mcp-server/tools/documentation_tools.py` | Documentation search (backend vector → local keyword fallback) | +| `cc-registry-v2/sources.yaml` | Documentation URLs for vector indexing | -### mcp-server +### Deprecated (kept for reference) -| File | Purpose | +| File | Replaced by | |---|---| -| `server_http.py` | Production HTTP server (stateless) | -| `indexer.py` | Offline indexing CLI tool | -| `utils/vector_store.py` | LocalVectorStore (numpy + JSON) | -| `utils/embeddings.py` | Azure OpenAI / local embedding generator | -| `utils/registry_client.py` | HTTP client for backend API | -| `utils/web_crawler.py` | Documentation page crawler | -| `utils/robot_parser.py` | Robot Framework file parser | -| `utils/python_parser.py` | Python AST parser for libraries | -| `tools/codebundle_tools.py` | CodeBundle search/list/detail tools | -| `tools/collection_tools.py` | CodeCollection tools | -| `tools/library_tools.py` | Library and keyword tools | -| `tools/documentation_tools.py` | Documentation search tools | -| `tools/github_issue.py` | GitHub issue creation tool | -| `sources.yaml` | Documentation source URLs | -| `docs.yaml` | Managed documentation catalog | -| `codecollections.yaml` | CodeCollection repo definitions | - -## Troubleshooting - 
-### Chat not returning results - -1. Verify MCP server is running: - ```bash - curl http://localhost:8000/health - ``` - -2. Verify backend API is reachable from MCP server: - ```bash - docker exec registry-mcp-server curl http://backend:8001/api/v1/health - ``` - -3. Check MCP server logs: - ```bash - docker logs registry-mcp-server --tail=50 - ``` - -### Indexer failures - -1. Run the indexer manually to see errors: - ```bash - cd mcp-server - python indexer.py --docs-only - ``` - -2. Check Azure OpenAI credentials: - ```bash - echo $AZURE_OPENAI_EMBEDDING_ENDPOINT - echo $AZURE_OPENAI_EMBEDDING_API_KEY - ``` - -3. Use local embeddings to bypass API issues: - ```bash - python indexer.py --local - ``` - -### Search results not relevant - -1. Re-index with fresh data: - ```bash - cd mcp-server && python indexer.py - ``` - -2. Check what was indexed: - ```bash - python -c "import json; d=json.load(open('data/vector_index.json')); print({k:len(v) for k,v in d.items()})" - ``` - -## Related Documentation - -- [ARCHITECTURE.md](ARCHITECTURE.md) -- System architecture overview -- [MCP_INDEXING_SCHEDULE.md](MCP_INDEXING_SCHEDULE.md) -- Automated indexing schedules -- [CONFIGURATION.md](CONFIGURATION.md) -- Environment variables and secrets -- [AZURE_OPENAI_SETUP.md](AZURE_OPENAI_SETUP.md) -- Azure OpenAI credential setup +| `mcp-server/indexer.py` | `backend/app/tasks/indexing_tasks.py` | +| `mcp-server/utils/vector_store.py` | `backend/app/services/vector_service.py` | +| `mcp-server/utils/embeddings.py` | `backend/app/services/embedding_service.py` | +| `mcp-server/utils/semantic_search.py` | Backend `/api/v1/vector/search/*` endpoints | +| `backend/app/tasks/mcp_tasks.py` | `backend/app/tasks/indexing_tasks.py` (stubs redirect) | diff --git a/cc-registry-v2/frontend/src/pages/AllTasks.tsx b/cc-registry-v2/frontend/src/pages/AllTasks.tsx index f4e38049a20f..56333073891b 100644 --- a/cc-registry-v2/frontend/src/pages/AllTasks.tsx +++ b/cc-registry-v2/frontend/src/pages/AllTasks.tsx @@ -232,9 +232,16 @@ const AllTasks: React.FC = () => { InputProps={{ startAdornment: ( - + - ) + ), + sx: { + backgroundColor: 'background.paper', + color: 'text.primary', + '& fieldset': { borderColor: 'divider' }, + '&:hover fieldset': { borderColor: 'primary.main' }, + '&.Mui-focused fieldset': { borderColor: 'primary.main' }, + } }} sx={{ maxWidth: 600 }} /> diff --git a/cc-registry-v2/frontend/src/pages/CodeBundles.tsx b/cc-registry-v2/frontend/src/pages/CodeBundles.tsx index d7bebfcc5c85..95b47bcd16e8 100644 --- a/cc-registry-v2/frontend/src/pages/CodeBundles.tsx +++ b/cc-registry-v2/frontend/src/pages/CodeBundles.tsx @@ -188,9 +188,16 @@ const CodeBundles: React.FC = () => { InputProps={{ startAdornment: ( - + ), + sx: { + backgroundColor: 'background.paper', + color: 'text.primary', + '& fieldset': { borderColor: 'divider' }, + '&:hover fieldset': { borderColor: 'primary.main' }, + '&.Mui-focused fieldset': { borderColor: 'primary.main' }, + } }} sx={{ flex: '1 1 300px' }} size="small" diff --git a/cc-registry-v2/frontend/src/pages/Home.tsx b/cc-registry-v2/frontend/src/pages/Home.tsx index e5c61e394374..9884386edba8 100644 --- a/cc-registry-v2/frontend/src/pages/Home.tsx +++ b/cc-registry-v2/frontend/src/pages/Home.tsx @@ -279,27 +279,28 @@ const Home: React.FC = () => { InputProps={{ startAdornment: ( - + ), endAdornment: searchQuery && ( - + ), sx: { - backgroundColor: 'white', + backgroundColor: 'background.paper', + color: 'text.primary', fontSize: '15px', borderRadius: 3, py: 1.25, fontWeight: 400, - 
'& fieldset': { borderColor: '#d0d0d0', borderWidth: 1.5 }, - '&:hover fieldset': { borderColor: '#5282f1' }, - '&.Mui-focused fieldset': { borderColor: '#5282f1', borderWidth: 1.5 }, + '& fieldset': { borderColor: 'divider', borderWidth: 1.5 }, + '&:hover fieldset': { borderColor: 'primary.main' }, + '&.Mui-focused fieldset': { borderColor: 'primary.main', borderWidth: 1.5 }, '& input::placeholder': { - color: '#666', + color: 'text.secondary', opacity: 0.8, fontSize: '15px', fontStyle: 'italic' diff --git a/cc-registry-v2/k8s/backend-deployment.yaml b/cc-registry-v2/k8s/backend-deployment.yaml index b6828fd6c777..d93117f9e73e 100644 --- a/cc-registry-v2/k8s/backend-deployment.yaml +++ b/cc-registry-v2/k8s/backend-deployment.yaml @@ -70,6 +70,10 @@ spec: mountPath: /app/schedules.yaml subPath: schedules.yaml readOnly: true + - name: documentation-sources + mountPath: /app/sources.yaml + subPath: sources.yaml + readOnly: true # - name: tag-icons # mountPath: /app/map-tag-icons.yaml # subPath: map-tag-icons.yaml @@ -109,6 +113,9 @@ spec: - name: schedules-config configMap: name: cc-registry-schedules + - name: documentation-sources + configMap: + name: documentation-sources # - name: tag-icons # configMap: # name: tag-icon-mappings diff --git a/cc-registry-v2/k8s/database-deployment.yaml b/cc-registry-v2/k8s/database-deployment.yaml index 6d7367a8a140..09125a80f1aa 100644 --- a/cc-registry-v2/k8s/database-deployment.yaml +++ b/cc-registry-v2/k8s/database-deployment.yaml @@ -140,3 +140,4 @@ data: -- Create required extensions CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; CREATE EXTENSION IF NOT EXISTS "pg_trgm"; + CREATE EXTENSION IF NOT EXISTS "vector"; diff --git a/cc-registry-v2/k8s/kustomization.yaml b/cc-registry-v2/k8s/kustomization.yaml index 3ab95563b41e..0ec17197bb09 100644 --- a/cc-registry-v2/k8s/kustomization.yaml +++ b/cc-registry-v2/k8s/kustomization.yaml @@ -67,6 +67,12 @@ configMapGenerator: literals: - ENVIRONMENT=production - LOG_LEVEL=info + # Documentation sources for vector embedding indexing. 
+ # Before running `kustomize build`, copy sources.yaml into this directory: + # cp ../sources.yaml ./sources.yaml + - name: documentation-sources + files: + - sources.yaml # Secret generator (for non-sensitive defaults) # secretGenerator: diff --git a/cc-registry-v2/k8s/scheduler-deployment.yaml b/cc-registry-v2/k8s/scheduler-deployment.yaml index d5eceb457d5d..8d5fc065b13a 100644 --- a/cc-registry-v2/k8s/scheduler-deployment.yaml +++ b/cc-registry-v2/k8s/scheduler-deployment.yaml @@ -118,26 +118,26 @@ data: enabled: true # ============================================================================= - # MCP SERVER INDEXING + # VECTOR INDEXING (pgvector embeddings) # ============================================================================= - name: index-documentation-daily - task: app.tasks.mcp_tasks.index_documentation_task - description: Re-index documentation sources and update embeddings + task: app.tasks.indexing_tasks.index_documentation_task + description: Crawl documentation sources and update vector embeddings schedule_type: crontab crontab: hour: 3 minute: 0 enabled: true - - name: reindex-mcp-weekly - task: app.tasks.mcp_tasks.reindex_all_task - description: Full re-index of all MCP data (codebundles + documentation) + - name: reindex-vectors-weekly + task: app.tasks.indexing_tasks.reindex_all_task + description: Full rebuild of all vector embeddings in pgvector schedule_type: crontab crontab: hour: 2 minute: 0 day_of_week: 0 - enabled: false + enabled: true # ============================================================================= # ANALYTICS COMPUTATION @@ -226,6 +226,10 @@ spec: mountPath: /app/schedules.yaml subPath: schedules.yaml readOnly: true + - name: documentation-sources + mountPath: /app/sources.yaml + subPath: sources.yaml + readOnly: true - name: celerybeat-schedule mountPath: /tmp resources: @@ -247,6 +251,9 @@ spec: - name: schedules-config configMap: name: cc-registry-schedules + - name: documentation-sources + configMap: + name: documentation-sources - name: celerybeat-schedule emptyDir: {} --- diff --git a/cc-registry-v2/k8s/secrets-example.yaml b/cc-registry-v2/k8s/secrets-example.yaml index b56936bd049e..79de83dfc15f 100644 --- a/cc-registry-v2/k8s/secrets-example.yaml +++ b/cc-registry-v2/k8s/secrets-example.yaml @@ -3,11 +3,12 @@ # Use sealed-secrets, external-secrets, or your cluster's secret management solution # # IMPORTANT: Azure OpenAI Credentials -# - GPT/Chat credentials: Used by cc-registry-v2 backend for AI enhancement features -# - Embedding credentials: Used by MCP server for semantic search/vector embeddings +# - GPT/Chat credentials: Used by the backend for AI enhancement (codebundle descriptions) +# - Embedding credentials: Used by the backend for vector embedding generation (pgvector) # - These can be the SAME resource (same endpoint/key) with different deployments # - OR separate resources if you have embeddings on a different Azure OpenAI instance -# - If embedding credentials are not provided, MCP server falls back to main credentials +# - If embedding credentials are not set, the backend falls back to the main credentials +# - The MCP server does NOT need Azure credentials — it delegates all search to the backend --- apiVersion: v1 @@ -68,9 +69,9 @@ stringData: AZURE_OPENAI_DEPLOYMENT_NAME: "gpt-4" AZURE_OPENAI_DEPLOYMENT: "gpt-4" # Alternative name used by some services - # Azure OpenAI for Embeddings (used by MCP server semantic search) + # Azure OpenAI for Embeddings (used by backend for pgvector embedding generation) # 
Often embeddings use a different deployment or even different Azure OpenAI resource - # If not set, MCP server will fall back to the main AZURE_OPENAI_* credentials above + # If not set, backend falls back to the main AZURE_OPENAI_* credentials above AZURE_OPENAI_EMBEDDING_API_KEY: "your-embedding-api-key-here" AZURE_OPENAI_EMBEDDING_ENDPOINT: "https://your-embedding-resource.openai.azure.com/" AZURE_OPENAI_EMBEDDING_API_VERSION: "2024-02-15-preview" diff --git a/cc-registry-v2/k8s/worker-deployment.yaml b/cc-registry-v2/k8s/worker-deployment.yaml index 125d25d5a69d..c8c3dfa4ecc7 100644 --- a/cc-registry-v2/k8s/worker-deployment.yaml +++ b/cc-registry-v2/k8s/worker-deployment.yaml @@ -43,6 +43,14 @@ spec: - name: backend-code mountPath: /app readOnly: true + - name: schedules-config + mountPath: /app/schedules.yaml + subPath: schedules.yaml + readOnly: true + - name: documentation-sources + mountPath: /app/sources.yaml + subPath: sources.yaml + readOnly: true resources: requests: memory: "512Mi" @@ -61,6 +69,12 @@ spec: volumes: - name: backend-code emptyDir: {} + - name: schedules-config + configMap: + name: cc-registry-schedules + - name: documentation-sources + configMap: + name: documentation-sources imagePullSecrets: - name: ghcr-pull-secret initContainers: diff --git a/cc-registry-v2/schedules.yaml b/cc-registry-v2/schedules.yaml index 6e4569c8251d..cfbe388c2bf5 100644 --- a/cc-registry-v2/schedules.yaml +++ b/cc-registry-v2/schedules.yaml @@ -131,29 +131,29 @@ schedules: enabled: true # ============================================================================= - # MCP SERVER INDEXING + # VECTOR INDEXING (pgvector embeddings) # ============================================================================= - # Documentation Indexing - Re-index documentation sources daily + # Documentation Indexing — crawl sources.yaml URLs, generate embeddings, store in pgvector - name: index-documentation-daily - task: app.tasks.mcp_tasks.index_documentation_task - description: Re-index documentation sources and update embeddings + task: app.tasks.indexing_tasks.index_documentation_task + description: Crawl documentation sources and update vector embeddings schedule_type: crontab crontab: hour: 3 minute: 0 enabled: true - # Full MCP Re-index - Complete rebuild of vector database (weekly) - - name: reindex-mcp-weekly - task: app.tasks.mcp_tasks.reindex_all_task - description: Full re-index of all MCP data (codebundles + documentation) + # Full vector reindex — codebundles + codecollections + documentation + - name: reindex-vectors-weekly + task: app.tasks.indexing_tasks.reindex_all_task + description: Full rebuild of all vector embeddings in pgvector schedule_type: crontab crontab: hour: 2 minute: 0 day_of_week: 0 # Sunday at 2 AM - enabled: false # Disabled by default - enable if needed + enabled: true # ============================================================================= # ANALYTICS COMPUTATION diff --git a/mcp-server/sources.yaml b/cc-registry-v2/sources.yaml similarity index 100% rename from mcp-server/sources.yaml rename to cc-registry-v2/sources.yaml diff --git a/cc-registry-v2/worker/requirements.txt b/cc-registry-v2/worker/requirements.txt index ac5149b9263f..446965a6b240 100644 --- a/cc-registry-v2/worker/requirements.txt +++ b/cc-registry-v2/worker/requirements.txt @@ -11,6 +11,10 @@ pydantic-settings==2.1.0 sqlalchemy==2.0.23 alembic==1.13.1 psycopg2-binary==2.9.9 +pgvector>=0.2.4 + +# HTTP client +httpx==0.25.2 # Background tasks celery==5.3.4 @@ -29,10 +33,15 @@ 
diff --git a/cc-registry-v2/worker/requirements.txt b/cc-registry-v2/worker/requirements.txt
index ac5149b9263f..446965a6b240 100644
--- a/cc-registry-v2/worker/requirements.txt
+++ b/cc-registry-v2/worker/requirements.txt
@@ -11,6 +11,10 @@ pydantic-settings==2.1.0
 sqlalchemy==2.0.23
 alembic==1.13.1
 psycopg2-binary==2.9.9
+pgvector>=0.2.4
+
+# HTTP client
+httpx==0.25.2
 
 # Background tasks
 celery==5.3.4
@@ -29,10 +33,15 @@
 robotframework==6.1.1
 
 # Task scheduling and background jobs
 flower==2.0.1
 
-# AI Integration
+# AI Integration - embeddings and chat
 openai>=1.0.0
+numpy>=1.24.0
 requests>=2.25.0
 
+# Web crawling for documentation indexing
+beautifulsoup4>=4.12.0
+lxml>=4.9.0
+
 # Environment
 python-dotenv==1.0.0
diff --git a/mcp-server/data/codebundles.json b/mcp-server/data/codebundles.json
deleted file mode 100644
index 07910f22f54e..000000000000
--- a/mcp-server/data/codebundles.json
+++ /dev/null
@@ -1,6106 +0,0 @@
-{ - "codebundles": [ - { - "slug": "rw-public-codecollection-aws-cloudwatch-metricquery", - "collection_slug": "rw-public-codecollection", - "name": "aws-cloudwatch-metricquery", - "display_name": "aws-cloudwatch-metricquery", - "description": "Retrieve the result of an AWS CloudWatch Metrics Insights query.", - "platform": "AWS", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Running CloudWatch Metric Query And Pushing The Result" - ], - "capabilities": [], - "readme": "# AWS CloudWatch Metric Query\n\n## SLI \nRetrieve the result of an AWS CloudWatch Metrics Insights query.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core", - "RW.AWS.CloudWatch" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/aws-cloudwatch-metricquery" - }, - { - "slug": "rw-public-codecollection-web-triage", - "collection_slug": "rw-public-codecollection", - "name": "web-triage", - "display_name": "web-triage", - "description": "Troubleshoot and triage a URL to inspect it for common issues such as an expired certification, missing DNS records, etc.", - "platform": "Unknown", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Validate Platform Egress", - "Perform Inspection On URL" - ], - "capabilities": [], - "readme": "# Web Triage\n\n## TaskSet\nTroubleshoot and triage a URL to inspect it for common issues such as an expired certification, missing DNS records, etc.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core", - "RW.WebInspector" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/web-triage" - }, - { - "slug": "rw-public-codecollection-uptimecom-component-ok", - "collection_slug": "rw-public-codecollection", - "name": "uptimecom-component-ok", - "display_name": "uptimecom-component-ok", - "description": "Check the status of an Uptime.com component for a given site.", - "platform": "Unknown", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check If Vault Endpoint Is Healthy" - ], - "capabilities": [], - "readme": "# Uptime.com Component OK\n\n## SLI\nCheck the status of an Uptime.com component for a given site.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Uptime.StatusPage", - "RW.Core" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/uptimecom-component-ok" - }, - { - "slug": "rw-public-codecollection-sysdig-monitor-metric", - "collection_slug": "rw-public-codecollection", - "name": "sysdig-monitor-metric", - "display_name": "sysdig-monitor-metric", - "description": "Queries the Sysdig data API to fetch metric data.", - "platform": "Unknown", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Query Sysdig Metric Data And Pushing Metric" - ], - "capabilities": [], -
"readme": "# Sysdig Monitor Metric\n\n## SLI\nQueries the Sysdig data API to fetch metric data.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core", - "RW.Sysdig" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/sysdig-monitor-metric" - }, - { - "slug": "rw-public-codecollection-gcp-serviceshealth", - "collection_slug": "rw-public-codecollection", - "name": "gcp-serviceshealth", - "display_name": "gcp-serviceshealth", - "description": "This codebundle sets up a monitor for a specific region and GCP Product, which is then periodically checked for ongoing incidents based on the history available at https://status.cloud.google.com/incidents.json filtered based on severity level.", - "platform": "GCP", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Get Number of GCP Incidents Effecting My Workspace" - ], - "capabilities": [], - "readme": "# GCP Service Health\n\n\n## SLI\nThis codebundle sets up a monitor for a specific region and GCP Product, which is then periodically checked for ongoing incidents based on the history available at https://status.cloud.google.com/incidents.json filtered based on severity level.\n\n## Use Cases\n### Use Case: SLI: Monitor for GCP Incidents with Google Kubernetes Engine & Google Compute Engine in 3 Regions\nThis sample configuration is used to demostrate how to monitor incidents for multiple GCP products in multiple regions within the last 15m: \n\n```\nWITHIN_TIME: 15m\nPRODUCTS: Google Kubernetes Engine,Google Compute Engine\nREGIONS: us-central1,us-west2,us-west1\nSEVERITY: low\n```\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core", - "RW.GCP.ServiceHealth" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/gcp-serviceshealth" - }, - { - "slug": "rw-public-codecollection-discord-sendmessage", - "collection_slug": "rw-public-codecollection", - "name": "discord-sendmessage", - "display_name": "discord-sendmessage", - "description": "Sends a static message to a Discord chat channel via webhook. There is optional configuration for including live runsession info and links", - "platform": "Unknown", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Send Chat Message" - ], - "capabilities": [], - "readme": "# Discord Send Message\n\n## TaskSet \nSends a static message to a Discord chat channel via webhook. 
There is optional configuration for including live runsession info and links\nfor team members to quickly access running sessions.\n\n## Use Cases\n- Send an alert when an SLO is burning too much budget which contains a link to the active runsession.\n- Let your team members know you're in a live runsession and provide them with a link to join you.\n\n## Requirements\n- A `webhook_url` secret which allows the codebundle to perform an incoming webhook post request against the service API.\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Rest", - "RW.Core", - "RW.RunWhen.Papi", - "RW.Utils" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/discord-sendmessage" - }, - { - "slug": "rw-public-codecollection-k8s-kubectl-namespace-healthcheck", - "collection_slug": "rw-public-codecollection", - "name": "k8s-kubectl-namespace-healthcheck", - "display_name": "k8s-kubectl-namespace-healthcheck", - "description": "Check the health of a Kubernetes namespace and its objects.", - "platform": "Kubernetes", - "author": "Shea Stewart", - "support_tags": [ - "rw" - ], - "tasks": [ - "Get Event Count and Score", - "Get Container Restarts and Score", - "Get NotReady Pods", - "Generate Namspace Score", - "Trace Namespace Errors", - "Fetch Unready Pods", - "Triage Namespace", - "Object Condition Check", - "Namespace Get All" - ], - "capabilities": [], - "readme": "# Kubernetes Namespace Health Check\nCheck the health of a Kubernetes namespace and its objects. \n\n# SLI \nPeriodically inspect the state of a Kubernetes namespace to determine if its score is 1 (healthy) or 0 (unhealthy). Supports values that are between 0 and 1 depending on the result of each test. The suite of checks considered are:\n- how many **events** of a specific type (e.g. Warning) and of a certain age (e.g. 5m) are counted\n- how many **container restarts** a certain age (e.g. 5m) are counted\n- are any pods not ready\n\nThresholds can be configured for the total amount of **events** or **container** restarts that are considered to still be healthy. Any pod that is NotReady is considered unhealthy. \n\nEach of these checks receives a score of 1 (healthy) or 0 (unhealthy), and they are added up and divided by the total number of checks. This means that a namespace can have a health score between 0 and 1 depending on the types of issues that are occuring. \n\nExample configuration: \n```\nexport DISTRIBUTION=Kubernetes\nexport CONTEXT=default\nexport NAMESPACE=flux-system\nexport EVENT_AGE=5m\nexport EVENT_TYPE=Warning\nexport EVENT_THRESHOLD=0\nexport CONTAINER_RESTART_AGE=5m\nexport CONTAINER_RESTART_THRESHOLD=0\n```\n\nWith the example above, a namespace would be be considerd a 0 (unhealthy) if there are any container restarts within 5m, any Warning events within 5m, and any pod is NotReady. If all pods are ready but Warning events or container restarts occur within 5m, it could receive a score of 0.33 or 0.66. If the namespace has zero Warning events, zero container restarts, and all pods are Ready, the score is 1 (healthy). \n\n> This SLI supports one, multiple, or ALL namespaces. If creating a score from multiple namespaces, consider a larger interval, such as 60s or greater, between each run. This SLI directly accesses the Kube API and querying for pods & events across multiple or ALL namespaces can generate a lot of volume. 
\n\n## TaskSet\nThis taskset runs general troubleshooting checks against all applicable objects in a namespace, checks error events, and searches pod logs for error entries. Initially a trace will be done, where error events within a time window will be fetched, and involved pod objects will be queried for error logs using the `ERROR_PATTERN`, and if error logs are found those pod names are included in the report. A secondary report is fetched for a list of unready pods ordered by restart counts, and lastly the triage namespace task performed will query a list of namespace resources based on a csv of `Kinds` and produce a report of failing checks for those resources. Current the support checks are:\n- `deployment_replicas_healthy`\n- `statefulset_replicas_healthy`\n- `daemonset_replicas_healthy`\n\ncommands are also generated for users to quickly fetch additional information about those individual resources.\n\nExample Configuration:\n```\nexport DISTRIBUTION=Kubernetes\nexport CONTEXT=default\nexport NAMESPACE=flux-system\nexport EVENT_AGE=30m\nexport ERROR_PATTERN=(Error|Exception)\nexport RESOURCE_KINDS=Deployment,DaemonSet,StatefulSet\n```\n\n### Use Case: TaskSet: Triage App workloads in a Namespace\nYou can triage your app workloads with the following configuration, adjusted to your cluster & namespace:\n\n```\nconfigProvided:\n - name: RESOURCE_KINDS\n value: 'Deployment,DaemonSet,StatefulSet'\n - name: ERROR_PATTERN\n value: '(Error|Exception)'\n - name: CONTEXT\n value: [kubeconfig_context]\n - name: NAMESPACE\n value: [kubernetes_namespace]\n - name: DISTRIBUTION\n value: Kubernetes\n - name: EVENT_AGE\n value: '30m'\n```\n\nwhich will provide you with information on events, error logs, and resources with issues related to `Deployments`, `DaemonSets`, and `StatefulSets` in the given namespace.\n\n## Requirements\n- kubeconfig with appropriate RBAC permissions to `get` `pods` and `events` on desired namespaces\n\n\n## TODO\n- [ ] Optimize the multi-namespace configuration for SLI\n- [ ] link to kubeconfig rbac doc", - "libraries": [ - "RW.K8s", - "RW.Utils", - "RW.platform", - "RW.Core", - "BuiltIn", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/k8s-kubectl-namespace-healthcheck" - }, - { - "slug": "rw-public-codecollection-sli-alert-threshold", - "collection_slug": "rw-public-codecollection", - "name": "sli-alert-threshold", - "display_name": "sli-alert-threshold", - "description": "This codebundle allows you to monitor another SLI and trigger a TaskSet when the expected rate of a SLI value falls below a specified threshold.", - "platform": "Unknown", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check If SLI Within Incident Threshold" - ], - "capabilities": [], - "readme": "# SLI Alert Threshold\nThis codebundle allows you to monitor another SLI and trigger a TaskSet when the expected rate of a SLI value falls below a specified threshold.\n\n## SLI\nDepending on your observability needs, the Multi-Window Multi-Burn algorithm + SLO error budgets approach may not apply to your use case. In those cases you can use this codebundle to create an alert threshold based on another SLI. A query will be performed on the monitored SLI's metrics for a given time window and resolution, and then the presence of a success (threshold) value will be checked. For example: fetch 1 hour of metric data at 5 minute intervals, for the monitored SLI; a `0` means failure and `1` means healthy. 
If we set the success value to `1` and a rate of `1.0` (100%) then when any failure occurs in the monitored SLI, this codebundle will immediately alert and trigger the given TaskSet.\n\n### Use Case: SLI: Trigger a slack message when my API health check fails\nFor our public API, it's uptime is critical, so we can monitor its health check and send a slack message to a team channel whenever the health check fails.\n\n```\nconfigProvided:\n - name: WORKSPACE_NAME\n value: 'tutorial-ws'\n - name: SLX_NAME\n value: public-api-health\n - name: HISTORY_WINDOW\n value: '1h'\n - name: RESOLUTION\n value: '15m'\n - name: THRESHOLD_VALUE\n value: 1\n - name: EXPECTED_THRESHOLD_RATE\n value: 1.0\n - name: INCIDENT_TASKSET\n value: tool-slackmsg\n```\n> Because the window in this example is `1h` and our threshold rate is `100%` then if 1 error is detected in the metric data, the threshold will be alerting for the next `1h` while it persists in the window. Consider this when determining your window, resolution and expected threshold rate in relation to how you want the TaskSet to behave.\n\n## Requirements\n- The name of the SLI you want to monitor\n- Verify that the SLI submits a consistent value that denotes a success (eg: 0 is always good, 1 is always good, etc) as you'll need to set this as your `threshold value`\n- The name of the workspace the SLX, SLI and TaskSet reside in\n\n## TODO\n- [ ] Add additional notes for tweaking threshold models to get the desired behaviour\n- [ ] Add docs for connecting to another workspace", - "libraries": [ - "RW.Core", - "RW.Rest", - "RW.Utils", - "RW.RunWhen.Papi" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/sli-alert-threshold" - }, - { - "slug": "rw-public-codecollection-k8s-kubectl-top", - "collection_slug": "rw-public-codecollection", - "name": "k8s-kubectl-top", - "display_name": "k8s-kubectl-top", - "description": "Retreieve aggregate data via kubectl top command.", - "platform": "Kubernetes", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Running Kubectl Top And Extracting Metric Data" - ], - "capabilities": [], - "readme": "# Kubernetes kubectl Top\n\n## SLI\nRetreieve aggregate data via kubectl top command.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.K8s", - "RW.Utils", - "RW.platform", - "RW.Core", - "BuiltIn", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/k8s-kubectl-top" - }, - { - "slug": "rw-public-codecollection-googlechat-sendmessage", - "collection_slug": "rw-public-codecollection", - "name": "googlechat-sendmessage", - "display_name": "googlechat-sendmessage", - "description": "Sends a static message to a Google chat channel via webhook. There is optional configuration for including live runsession info and links", - "platform": "Unknown", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Send Chat Message" - ], - "capabilities": [], - "readme": "# Google Chat Send Message\n\n## TaskSet \nSends a static message to a Google chat channel via webhook. 
There is optional configuration for including live runsession info and links\nfor team members to quickly access running sessions.\n\n## Use Cases\n- Send an alert when an SLO is burning too much budget which contains a link to the active runsession.\n- Let your team members know you're in a live runsession and provide them with a link to join you.\n\n## Requirements\n- A `webhook_url` secret which allows the codebundle to perform an incoming webhook post request against the service API.\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Rest", - "RW.Core", - "RW.RunWhen.Papi", - "RW.Utils" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/googlechat-sendmessage" - }, - { - "slug": "rw-public-codecollection-http-ok", - "collection_slug": "rw-public-codecollection", - "name": "http-ok", - "display_name": "http-ok", - "description": "Check if an HTTP request against a URL fails or times out of a given latency window.", - "platform": "Unknown", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Checking HTTP URL Is Available And Timely" - ], - "capabilities": [], - "readme": "# HTTP OK\n\n## SLI\nCheck if an HTTP request against a URL fails or times out of a given latency window.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.HTTP", - "RW.Core" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/http-ok" - }, - { - "slug": "rw-public-codecollection-datadog-system-load", - "collection_slug": "rw-public-codecollection", - "name": "datadog-system-load", - "display_name": "datadog-system-load", - "description": "Retrieve a DataDog instance's \"System Load\" metric", - "platform": "Unknown", - "author": "Vui Le", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check Datadog System Load" - ], - "capabilities": [], - "readme": "# Datadog System Load\n\n## SLI \nRetrieve a DataDog instance's \"System Load\" metric\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core", - "RW.Datadog" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/datadog-system-load" - }, - { - "slug": "rw-public-codecollection-k8s-postgres-triage", - "collection_slug": "rw-public-codecollection", - "name": "k8s-postgres-triage", - "display_name": "k8s-postgres-triage", - "description": "This codebundle leverages the Kubernetes API and a running database pod to triage a database cluster. It leverages standard `kubectl` commands to inspect the deployed resources (such as returning the status of each element, custome resource, and so on) along with using `kubectl exec` to run `psql` commands that can provide additional information such as the running configuration, query statistics, and so on.", - "platform": "Kubernetes", - "author": "Shea Stewart", - "support_tags": [ - "rw" - ], - "tasks": [ - "Get Standard Resources", - "Describe Custom Resources", - "Get Pod Logs & Events", - "Get Pod Resource Utilization", - "Get Running Configuration", - "Get Patroni Output", - "Run DB Queries" - ], - "capabilities": [], - "readme": "# Kubernetes Postgres Triage\nThis codebundle leverages the Kubernetes API and a running database pod to triage a database cluster. 
It leverages standard `kubectl` commands to inspect the deployed resources (such as returning the status of each element, custome resource, and so on) along with using `kubectl exec` to run `psql` commands that can provide additional information such as the running configuration, query statistics, and so on. \n\n## TaskSet\nThis codebundle provides a report of Kubernetes resource health along with Postgres health, with the following tasks: \n- Standard Kubernetes Resources: Outputs all resources associated with the desired labels, namespace, and context with abbreviated output. \n- Describe Custom Resources: Optional. Searches for custom resources based on a search string (e.g. postgres) and adds the output of `kubectl describe` of these resources to the report. \n- Get Pod Logs & Events: Fetches Kubernetes events and logs that are related to the desired labels. \n- Get Pod Resource Utilization: Fetchs the output of `kubectl top` for all pods and containers that are related to the desired labels. \n- Get Running Configuration: Uses `kubectl exec` to fetch the running config of the psql instance and adds the contents of that file to the report. \n- Get Patroni Output: Uses `kubectl exec` to fetch the output of `patronictl list`. \n- Get DB Stastics: Uses `kubectl exec` to execute psql queries that can provide insights in to long running queries and other helpful database level statistics. PSQL queries are configurable. \n\n\n## Use Cases\nThis codebundle can be used as a low-level information collection tool when RunWhen Map users want more details about all resources related a database instance.\n\n### Use Case: TaskSet: Triage CrunchyData Postgres Instance\nIn order to triage a CrunchyData postgres cluster that is deployed by the crunchydata postgres operator, the following TaskSet configuration might apply:\n\n```\nconfigProvided:\n - name: INCLUDE_CUSTOM_RESOURCES\n value: 'Yes'\n - name: CRD_FILTER\n value: postgres\n - name: LOG_LINES\n value: '100'\n - name: QUERY\n value: >-\n SELECT (total_exec_time / 1000 / 60) as total, (total_exec_time/calls) as\n avg, query FROM pg_stat_statements ORDER BY 1 DESC LIMIT 100;\n - name: CONTEXT\n value: [kubeconfig_context]\n - name: RESOURCE_LABELS\n value: postgres-operator.crunchydata.com/cluster=[cluster-name]\n - name: WORKLOAD_NAME\n value: >-\n -l postgres-operator.crunchydata.com/role=[primary-label],postgres-operator.crunchydata.com/cluster=[cluster-name]\n - name: NAMESPACE\n value: [kubernetes_namespace]\n - name: WORKLOAD_CONTAINER\n value: [database_container_name]\n - name: HOSTNAME\n value: ''\n - name: DISTRIBUTION\n value: Kubernetes\n\n```\n> Because the CrunchyData operator doesn't require the local pod to authenticate, the hostname and pg_username was left blank, and the pg_password was set to an arbitrary non_null value (like 'test'). This configuration will vary across deployments and should be validated prior to TaskSet configuration. This can easily be validated by running something like `kubectl exec [primary_pod_name] -c [database container] -n [namespace] -- /bin/bash -c \"PGPASSWORD=$pg_password psql -U $pg_username -d [database_name] -c '\\l '\"`. Since the codebundle uses `kubectl exec`, users can use their own terminals to determine the right configuration that works with their specific instance. 
\n\n#### Additional CrunchyData Postgres Configurations\nThe following CrunchyData postgres cluster additional configuration was used to deploy the instance with support for `pg_stat_statements`: \n\n```\napiVersion: postgres-operator.crunchydata.com/v1beta1\nkind: PostgresCluster\nmetadata:\n name: [cluster-name]\n namespace: [namespace]\nspec:\n image: registry.developers.crunchydata.com/crunchydata/crunchy-postgres:ubi8-13.9-0\n postgresVersion: 13\n patroni:\n dynamicConfiguration:\n postgresql:\n parameters:\n shared_preload_libraries: \"pg_stat_statements\"\n [additional instance configuration]\n ...\n```\n\nSince this specific configuration was using the `pg_stat_statements` extension, this needed to be enabled on the desired database: \n```\n# Either as one line; \nkubectl exec [primary_pod_name] -c [database container] -n [namespace] -- /bin/bash -c \"PGPASSWORD=$pg_password psql -U $pg_username -d [database_name] -c 'CREATE EXTENSION pg_stat_statements'\"`\n\n#Or: \nkubectl exec -it [primary_pod_name] -c [database container] -n [namespace] -- /bin/bash\nbash-4.4$ psql \npsql (13.9)\nType \"help\" for help.\n\npostgres=# \\c [database]\nYou are now connected to database \"[database]\" as user \"postgres\".\n[database]=# CREATE EXTENSION pg_stats_statements\n```\n#### Kubernetes RBAC Configuration\nAs this triage TaskSet performs a number of discovery tasks across standard resources, custom resources, and runtime database configuration details, the following RBAC configurations were used in testing this TaskSet:", - "libraries": [ - "RW.K8s", - "RW.Postgres", - "RW.Utils", - "RW.platform", - "RW.Core", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/k8s-postgres-triage" - }, - { - "slug": "rw-public-codecollection-aws-vm-triage", - "collection_slug": "rw-public-codecollection", - "name": "aws-vm-triage", - "display_name": "aws-vm-triage", - "description": "Triage and troubleshoot performance and usage of an AWS EC2 instance", - "platform": "AWS", - "author": "", - "support_tags": [ - "rw" - ], - "tasks": [ - "Get Max VM CPU Utilization In Last 3 Hours", - "Get Lowest VM CPU Credits In Last 3 Hours", - "Get Max VM CPU Credit Usage In Last 3 hours", - "Get Max VM Memory Utilization In Last 3 Hours", - "Get Max VM Volume Usage In Last 3 Hours" - ], - "capabilities": [], - "readme": "# AWS Instance Triage\n\n## TaskSet \nTriage and troubleshoot performance and usage of an AWS EC2 instance\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core", - "RW.AWS.CloudWatch" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/aws-vm-triage" - }, - { - "slug": "rw-public-codecollection-aws-cloudwatch-tagmetricquery", - "collection_slug": "rw-public-codecollection", - "name": "aws-cloudwatch-tagmetricquery", - "display_name": "aws-cloudwatch-tagmetricquery", - "description": "Retrieve aggregate results from multiple AWS Cloudwatch Metrics Insights queries ran against tagged resources.", - "platform": "AWS", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Run CloudWatch Metric Query Across Set Of IDs And Push Metric" - ], - "capabilities": [], - "readme": "# AWS CloudWatch Tag Metric Query\n\n## SLI \nRetrieve aggregate results from multiple AWS Cloudwatch Metrics Insights queries ran against tagged resources.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - 
"libraries": [ - "RW.Core", - "RW.AWS.CloudWatch", - "RW.AWS.EC2" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/aws-cloudwatch-tagmetricquery" - }, - { - "slug": "rw-public-codecollection-k8s-kubectl-run", - "collection_slug": "rw-public-codecollection", - "name": "k8s-kubectl-run", - "display_name": "k8s-kubectl-run", - "description": "A highly generic codebundle used for running bare kubectl commands (or equivalent binaries) and presenting the stdout as a report. This allows users to take their commonly used `kubectl` triage commands for their workloads and paste them into the codebundle config, both automating and version controlling their triage process as code, which can then be shared with their team.", - "platform": "Kubernetes", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Running Kubectl And Adding Stdout To Report" - ], - "capabilities": [], - "readme": "# Kubernetes kubectl Run\nA highly generic codebundle used for running bare kubectl commands (or equivalent binaries) and presenting the stdout as a report. This allows users to take their commonly used `kubectl` triage commands for their workloads and paste them into the codebundle config, both automating and version controlling their triage process as code, which can then be shared with their team.\n\n## TaskSet\n### Use Case: TaskSet: Fetch Pod Error Logs\nWe can generate a report containing pod logs who's entries have `Exception` or `Error` in the log line. Given the config:\n\n```\nconfigProvided:\n - name: DISTRIBUTION\n value: Kubernetes\n - name: KUBECTL_COMMAND\n value: >-\n kubectl logs deployment/my-app -n default -n my-namespace --tail=200 | grep -E -i \"(Exception|Error)\"\n```\n\nWhich will fetch us the last 200 logs lines and parse them for issues and present those in the taskset report for us to view on the platform.\n\n## Use Cases\n\n## Requirements\n- A kubeconfig with appropriate RBAC permissions to perform the desired command.\n\n## TODO\n- [ ] link to kubeconfig rbac doc\n- [ ] Add additional documentation", - "libraries": [ - "RW.K8s", - "RW.Core", - "RW.platform", - "RW.Utils" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/k8s-kubectl-run" - }, - { - "slug": "rw-public-codecollection-prometheus-queryrange-transform", - "collection_slug": "rw-public-codecollection", - "name": "prometheus-queryrange-transform", - "display_name": "prometheus-queryrange-transform", - "description": "Run a PromQL query against Prometheus range query API, perform a provided transform, and return the result.", - "platform": "Unknown", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Querying Prometheus Instance And Pushing Aggregated Data" - ], - "capabilities": [], - "readme": "# Prometheus Range Query\n\n## SLI\nRun a PromQL query against Prometheus range query API, perform a provided transform, and return the result.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core", - "RW.Prometheus" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/prometheus-queryrange-transform" - }, - { - "slug": "rw-public-codecollection-aws-cloudwatch-logquery-rowcount-zeroerror", - "collection_slug": "rw-public-codecollection", - "name": "aws-cloudwatch-logquery-rowcount-zeroerror", - "display_name": "aws-cloudwatch-logquery-rowcount-zeroerror", - "description": "Retrieve 
binary result from an AWS CloudWatch Insights query.", - "platform": "AWS", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Running CloudWatch Log Query And Pushing 1 If No Results Found" - ], - "capabilities": [], - "readme": "# AWS CloudWatch Log Query Row Count Zero Error\n\n## SLI \nRetrieve binary result from an AWS CloudWatch Insights query.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core", - "RW.AWS.CloudWatch" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/aws-cloudwatch-logquery-rowcount-zeroerror" - }, - { - "slug": "rw-public-codecollection-ping-host-availability", - "collection_slug": "rw-public-codecollection", - "name": "ping-host-availability", - "display_name": "ping-host-availability", - "description": "Ping a host and retrieve packet loss percentage.", - "platform": "Unknown", - "author": "Vui Le", - "support_tags": [ - "rw" - ], - "tasks": [ - "Ping host and collect packet lost percentage" - ], - "capabilities": [], - "readme": "# Ping Host Availability\n\n## SLI\nPing a host and retrieve packet loss percentage.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/ping-host-availability" - }, - { - "slug": "rw-public-codecollection-k8s-kubectl-eventquery", - "collection_slug": "rw-public-codecollection", - "name": "k8s-kubectl-eventquery", - "display_name": "k8s-kubectl-eventquery", - "description": "This codebundle returns the number of events in a Kubernetes namespace which have messages matching a regex pattern.", - "platform": "Kubernetes", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Get Number Of Matching Events" - ], - "capabilities": [], - "readme": "# Kubernetes Kubectl Event Query\n\n## SLI\nThis codebundle returns the number of events in a Kubernetes namespace which have messages matching a regex pattern.\nNote that this does not sum up the message occurence count, only the Kubernetes object count.\n\nPattern examples:\n- Return results which contain string: `mystring`\n- Return results for matches on 1 or 2: `(Search1|Search2)`\n\n## Use Cases\n- Measure the number of failed volume mounts occuring by setting the pattern to \"FailedMount\"\n\n## Requirements\n- A kubeconfig with get/list access on event objects in the chosen namespace.\n- A chosen `namespace` and `context` to use from the kubeconfig\n- A `event pattern` to use for selecting the event objects; refer to extended grep patterns for details on how to write these. 
run `man grep`.\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.K8s", - "RW.Utils", - "RW.platform", - "RW.Core", - "BuiltIn", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/k8s-kubectl-eventquery" - }, - { - "slug": "rw-public-codecollection-k8s-triage-deploymentreplicas", - "collection_slug": "rw-public-codecollection", - "name": "k8s-triage-deploymentreplicas", - "display_name": "k8s-triage-deploymentreplicas", - "description": "Triages issues related to a deployment's replicas.", - "platform": "Kubernetes", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Fetch Logs", - "Get Related Events", - "Check Deployment Replicas" - ], - "capabilities": [], - "readme": "# Kubernetes Triage Deployments\n\n## TaskSet\nTriages issues related to a deployment's replicas.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.K8s", - "RW.Utils", - "RW.platform", - "RW.Core", - "BuiltIn", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/k8s-triage-deploymentreplicas" - }, - { - "slug": "rw-public-codecollection-rest-explicitoauth2-basicauth", - "collection_slug": "rw-public-codecollection", - "name": "rest-explicitoauth2-basicauth", - "display_name": "rest-explicitoauth2-basicauth", - "description": "A general purpose rest codebundle for extracting data from a rest endpoint. See the [generic](https://docs.runwhen.com/public/v/codebundles/rest-generic) codebundle variant for more details.", - "platform": "Unknown", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Request Data From Rest Endpoint" - ], - "capabilities": [], - "readme": "# Rest OAuth2 With Basic Authentication\n\n## SLI\nA general purpose rest codebundle for extracting data from a rest endpoint. 
See the [generic](https://docs.runwhen.com/public/v/codebundles/rest-generic) codebundle variant for more details.\n\n## Use Cases\nRefer to the generic rest codebundle [here](https://docs.runwhen.com/public/v/codebundles/rest-generic) for a suite of setups and use cases.\n\n## Requirements\n--\n\n## TODO\n--", - "libraries": [ - "RW.Core", - "RW.Rest", - "RW.Utils" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/rest-explicitoauth2-basicauth" - }, - { - "slug": "rw-public-codecollection-aws-account-limit", - "collection_slug": "rw-public-codecollection", - "name": "aws-account-limit", - "display_name": "aws-account-limit", - "description": "Retrieve all recently created AWS accounts.", - "platform": "AWS", - "author": "Vui Le", - "support_tags": [ - "rw" - ], - "tasks": [ - "Get Count Of AWS Accounts In Organization", - "Get The Recently Created AWS Accounts" - ], - "capabilities": [], - "readme": "# AWS Account Limit\n\n## SLI\nRetrieve all recently created AWS accounts.\n\n## TaskSet \nRetrieve the count of all AWS accounts in an organization.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core", - "RW.AWS", - "RW.Slack", - "RW.Report" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/aws-account-limit" - }, - { - "slug": "rw-public-codecollection-k8s-triage-patroni", - "collection_slug": "rw-public-codecollection", - "name": "k8s-triage-patroni", - "display_name": "k8s-triage-patroni", - "description": "Taskset to triage issues related to patroni.", - "platform": "Kubernetes", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Get Patroni Status", - "Get Pods Status", - "Fetch Logs" - ], - "capabilities": [], - "readme": "# Kubernetes Triage Patroni\n\n## TaskSet\nTaskset to triage issues related to patroni.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.K8s", - "RW.Utils", - "RW.platform", - "RW.Core", - "BuiltIn", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/k8s-triage-patroni" - }, - { - "slug": "rw-public-codecollection-k8s-troubleshoot-deployment", - "collection_slug": "rw-public-codecollection", - "name": "k8s-troubleshoot-deployment", - "display_name": "k8s-troubleshoot-deployment", - "description": "A taskset for troubleshooting general issues associated with typical kubernetes deployment resources.", - "platform": "Kubernetes", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Troubleshoot Resourcing", - "Troubleshoot Events", - "Troubleshoot PVC", - "Troubleshoot Pods", - "# RW.Core.Add Pre To Report ${pdb_report}", - "# RW.Core.Add Pre To Report ${networking_report}" - ], - "capabilities": [], - "readme": "# Kubernetes Troubleshoot Deployment\n\n## TaskSet\nA taskset for troubleshooting general issues associated with typical kubernetes deployment resources.\nSupports API interactions via both the API client and Kubectl binary through RunWhen Shell Services.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.K8s", - "RW.Utils", - "RW.platform", - "RW.Core", - "BuiltIn", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/k8s-troubleshoot-deployment" - }, - { - "slug": 
"rw-public-codecollection-mongodb-health-gcp-promql", - "collection_slug": "rw-public-codecollection", - "name": "mongodb-health-gcp-promql", - "display_name": "mongodb-health-gcp-promql", - "description": "This codebundle provides an opinionated healthcheck on mongoDB instances. It requires that the Mongodb Prometheus exporter (by Percona) is configured appropriately and that metrics are being sent to Google Managed Prometheus.", - "platform": "GCP", - "author": "Shea Stewart", - "support_tags": [ - "rw" - ], - "tasks": [ - "Get Access Token", - "Get Instance Status", - "Get Connection Utilization Rate", - "Get MongoDB Member State Health", - "Get MongoDB Replication Lag", - "Get MongoDB Queue Size", - "Get Assertion Rate", - "Generate MongoDB Score" - ], - "capabilities": [ - "Get Instance Status: Get the count of mongodb_up returning 1 dividided by the number of expected instances", - "Get Connection Utilization Rate: Get the connection utilization (current/max) for all instances and score against threshold (1 = below threshold, 0 = above)", - "Get MongoDB Member State Health: Fetch the replication state of each member and ensure they are within acceptable parameters. https://www.mongodb.com/docs/manual/reference/replica-states/", - "Get MongoDB Replication Lag: Fetch the replication lag (in seconds) of all instances and determine if they are within acceptable parameters.", - "Get MongoDB Queue Size: Fetch the total size of the globalLock current queue for all instances.", - "Get Assertion Rate: Fetch the assertion rate (over the last 5m) of all instances and determine if they are within acceptable parameters." - ], - "readme": "# MongoDB Health Google Managed Prometheus (promql)\nThis codebundle provides an opinionated healthcheck on mongoDB instances. It requires that the Mongodb Prometheus exporter (by Percona) is configured appropriately and that metrics are being sent to Google Managed Prometheus. \n \n\n## Service Level Indicator\nThe SLI codebundle provides a composite health check which provides a score between 0 (unhealthy) and 1 (healthy). Any value between 0 and 1 indicates that one of the following health checks produced a score of 0 for its individual check. The score is derived by adding up the value of each test and dividing by the total number of tests. \n\nEvaluations performed in this healthcheck: \n\n- Instance Status: Are the expected amount of members running for each instance?\n- Connection Utilization Rate: Is the current connection utilization (current/max) above the desired threshold for any instance?\n- Member Health: Are any of the members reporting an unhealthy state?\n- Replication Lag: Is the largest replication for any cluster above the desired threshold?\n- Queue Size: Is size of the queue (reads or writes) above the desired threshold?\n- Assertion Rate: Is the rate of assertions over the last 5m above the desired threshold for any instance?\n\nThis SLI does support measing health across multiple instances and often reports the Max value obtained across instances. The PROMQL_FILTER can be used to add specific labels for query filtering as necessary. \n\n> For those not looking for composite scores, the [gcp-opssuite-promql](https://docs.runwhen.com/public/v/codebundles/gcp-opssuite-promql) codebundle can be used to create specific SLIs for any specific metric. 
\n\n## Use Cases\n### Use Case: SLI: MongoDB Community Edition Health for All Instances in a Kubernetes Namespace\nThe following use case provides an example configuration in which the SLI can be used to provide a composite score across multiple mongodb clusters in the same namespace. \n\n> For a full walkthough on the setup of an environment with MongoDB Community Edition, Percona MongoDB Prometheus Exporter, and Google Mangaged Prometheus, please view [the complete docs located here](https://docs.runwhen.com/public/use-cases/kubernetes-environments/measuring-mongodb-health-with-promql). \n\n- Example MongoDB Community edition object: \n```\napiVersion: mongodbcommunity.mongodb.com/v1\nkind: MongoDBCommunity\nmetadata:\n name: sandbox-mongodb\n namespace: mongodb-test\nspec:\n members: 3\n type: ReplicaSet\n version: \"4.4.0\"\n security:\n authentication:\n modes: [\"SCRAM\"]\n users:\n - name: my-user\n db: admin\n passwordSecretRef: # a reference to the secret that will be used to generate the user's password\n name: my-user-password\n roles:\n - name: clusterAdmin\n db: admin\n - name: userAdminAnyDatabase\n db: admin\n scramCredentialsSecretName: my-scram\n additionalMongodConfig:\n storage.wiredTiger.engineConfig.journalCompressor: zlib\n net.maxIncomingConnections: 1000\n```\n\n- Example Percona MongoDB Prometheus Exporter:\n```\napiVersion: helm.toolkit.fluxcd.io/v2beta1\nkind: HelmRelease\nmetadata:\n name: mongodb-exporter\n namespace: mongodb-test\nspec:\n releaseName: mongodb-test-exporter\n chart:\n spec:\n chart: prometheus-mongodb-exporter\n # https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus-mongodb-exporter/values.yaml\n version: 3.1.2\n sourceRef:\n kind: HelmRepository\n name: prometheus-community\n namespace: flux-system\n interval: 5m\n values:\n image:\n pullPolicy: IfNotPresent\n repository: percona/mongodb_exporter\n tag: \"0.37.0\"\n mongodb:\n uri: \"mongodb://my-user:SuperSecretPassword@sandbox-mongodb-0.sandbox-mongodb-svc.mongodb-test.svc.cluster.local:27017\"\n```\n\n- Example codebundle configuration: \n```\nconfigProvided:\n - name: PROMQL_FILTER\n value: namespace=\"mongodb-test\"\n - name: CONNECTION_UTILIZATION_THRESHOLD\n value: '80'\n - name: MAX_LAG\n value: '60'\n - name: MAX_ASSERTION_RATE\n value: '1'\n - name: PROJECT_ID\n value: [gcp-project-id]\n - name: MAX_QUEUE_SIZE\n value: '0'\nsecretsProvided:\n - name: ops-suite-sa\n workspaceKey: [secret-name]\nservicesProvided:\n - name: curl\n locationServiceName: curl-service.shared\n```\nWith the example above, a score of less than 1 would be produced if any of the conditions are true: \n- Any members are not running\n- Any instance member is returning an unhealthy state\n- The amount of active connections vs max is 80% or greater\n- Any instance has a replication lag of 60s or larger\n- Any instance has assertions are being generated at a rate of 1/s or greater\n- Any instance has any read or write requests waiting in the queue\n\n## Requirements\n### Version Details\nThis codebundle was tested with MongoDB Community Edition Kubernetes Operator, with MongoDB versions: \n- 4.4.0\n- 6.0.5\n\nAlong with the Percona MongoDB Prometheus Exporter chart version 3.1.2 and ", - "libraries": [ - "Collections", - "RW.Prometheus", - "RW.Utils", - "RW.GCP.OpsSuite", - "RW.Core", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/mongodb-health-gcp-promql" - }, - { - "slug": 
"rw-public-codecollection-sysdig-monitor-promqlmetric", - "collection_slug": "rw-public-codecollection", - "name": "sysdig-monitor-promqlmetric", - "display_name": "sysdig-monitor-promqlmetric", - "description": "Queries the Sysdig data API with a PromQL query to fetch metric data.", - "platform": "Unknown", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Querying PromQL Endpoint And Pushing Metric Data" - ], - "capabilities": [], - "readme": "# Sysdig Monitor PromQL Metric\n\n## SLI\nQueries the Sysdig data API with a PromQL query to fetch metric data.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core", - "RW.Sysdig" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/sysdig-monitor-promqlmetric" - }, - { - "slug": "rw-public-codecollection-gcp-opssuite-logquery-dashboard", - "collection_slug": "rw-public-codecollection", - "name": "gcp-opssuite-logquery-dashboard", - "display_name": "gcp-opssuite-logquery-dashboard", - "description": "Generate a link to the GCP Log Explorer.", - "platform": "GCP", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Get GCP Log Dashboard URL For Given Log Query" - ], - "capabilities": [], - "readme": "# GCP Operations Suite Log Query Dashboard \n\n## TaskSet\nGenerate a link to the GCP Log Explorer.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "DateTime", - "RW.Core", - "RW.GCP.OpsSuite" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/gcp-opssuite-logquery-dashboard" - }, - { - "slug": "rw-public-codecollection-kong-ingress-health-gcp-promql", - "collection_slug": "rw-public-codecollection", - "name": "kong-ingress-health-gcp-promql", - "display_name": "kong-ingress-health-gcp-promql", - "description": "This codebundle provides an opinionated healthcheck on ingress objects that are managed by the Kong ingress controller. It requires that the Prometheus plugin is configured appropriately and that metrics are being sent to Google Managed Prometheus.", - "platform": "GCP", - "author": "Shea Stewart", - "support_tags": [ - "rw" - ], - "tasks": [ - "Get Access Token", - "Get HTTP Error Rate", - "Get Upstream Health", - "Get Request Latency Rate", - "Generate Kong Ingress Score" - ], - "capabilities": [], - "readme": "# Kong Ingress Health Google Managed Prometheus (promql)\nThis codebundle provides an opinionated healthcheck on ingress objects that are managed by the Kong ingress controller. It requires that the Prometheus plugin is configured appropriately and that metrics are being sent to Google Managed Prometheus. \n\n\n## Service Level Indicator\nThis SLI queries the Google Managed Prometheus service for Kong related Prometheus metrics that are tied to a single ingress resource. Produces a Score of 1 (healthy) or 0 (unhealthy). Supports values that are between 0 and 1 depending on the result of each test. The suite of checks considered are:\n- is the **HTTP error rate** within acceptable levels?\n- are there any **upstream errors** reported?\n- are **request latencies** within acceptable levels?\n\nThresholds can be configured for the **HTTP Error rate** or **request latencies** to still be healthy. \n\nEach of these checks receives a score of 1 (healthy) or 0 (unhealthy), and they are added up and divided by the total number of checks. 
This means that an ingress object can have a health score between 0 and 1 depending on the types of issues that are occuring. \n\n\n\n## Use Cases\n### Use Case: SLI: Kong Ingress Object Health\nThe following use case provides an example configuration for an ingress object that looks like the following: \n- Example Kubernetes ingress object: \n```\napiVersion: networking.k8s.io/v1\nkind: Ingress\nmetadata:\n annotations:\n konghq.com/https-redirect-status-code: \"301\"\n konghq.com/protocols: https\n name: ob\n namespace: online-boutique\nspec:\n ingressClassName: kong\n rules:\n - host: b.demo.here.com\n http:\n paths:\n - backend:\n service:\n name: frontend-external\n port:\n number: 80\n path: /\n pathType: Prefix\n tls:\n - hosts:\n - ob.demo.here.com\n secretName: ob-demo-tls\n```\n\n- Example codebundle configuration: \n```\nconfigProvided:\n - name: HTTP_ERROR_CODES\n value: 5.*\n - name: HTTP_ERROR_RATE_WINDOW\n value: 1m\n - name: HTTP_ERROR_RATE_THRESHOLD\n value: '2'\n - name: PROJECT_ID\n value: [gcp-project-id]]\n - name: INGRESS_UPSTREAM\n value: frontend-external.online-boutique.80.svc\n - name: INGRESS_SERVICE\n value: online-boutique.ob.frontend-external.80\n - name: REQUEST_LATENCY_THRESHOLD\n value: '100'\n```\n\nWith the example above, an ingress would be be considerd a 0 (unhealthy) if: \n- Any http 500 codes are occuring at a rate > that 2/s\n- Upstream targets report dns_error or unhealthy status codes\n- The 99th percentile of request latencies are > 100ms\n\n## Requirements\n### Service Account Requirements \nThis codebundle requires a service account and accompanying json key uploaded as a secret to the workspace.\n\nThe service account should have the following roles: \n- Logs Viewer - `roles/logging.viewer`\n- Monitoring Viewer - `roles/monitoring.viewer`\n\n> Note: It's likely that only the Monitoring Viewer role is required for promql queries, but both roles are helpful when using other gcp-opssuite* codebundles. 
\n\nPlease see the [documentation for creating service accounts](https://cloud.google.com/iam/docs/creating-managing-service-accounts)\n\n## Helpful Resources\n- [https://docs.konghq.com/hub/kong-inc/prometheus/](https://docs.konghq.com/hub/kong-inc/prometheus/)", - "libraries": [ - "RW.Prometheus", - "RW.Utils", - "RW.GCP.OpsSuite", - "RW.Core", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/kong-ingress-health-gcp-promql" - }, - { - "slug": "rw-public-codecollection-gcp-opssuite-logquery", - "collection_slug": "rw-public-codecollection", - "name": "gcp-opssuite-logquery", - "display_name": "gcp-opssuite-logquery", - "description": "Retrieve the number of results of a GCP Log Explorer query.", - "platform": "GCP", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Running GCE Logging Query And Pushing Result Count Metric" - ], - "capabilities": [], - "readme": "# GCP Operations Suite Log Query\n\n## SLI \nRetrieve the number of results of a GCP Log Explorer query.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "DateTime", - "Collections", - "RW.GCP.OpsSuite", - "RW.Utils.RWUtils", - "OperatingSystem", - "RW.Core" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/gcp-opssuite-logquery" - }, - { - "slug": "rw-public-codecollection-pingdom-health", - "collection_slug": "rw-public-codecollection", - "name": "pingdom-health", - "display_name": "pingdom-health", - "description": "Check health of Pingdom platform.", - "platform": "Unknown", - "author": "Vui Le", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check Pingdom Health" - ], - "capabilities": [], - "readme": "# Pingdom Health\n\n## SLI\nCheck health of Pingdom platform.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core", - "RW.Pingdom" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/pingdom-health" - }, - { - "slug": "rw-public-codecollection-cert-manager-expirations", - "collection_slug": "rw-public-codecollection", - "name": "cert-manager-expirations", - "display_name": "cert-manager-expirations", - "description": "Retrieve number of expired TLS certificates managed by cert-manager within a given window.", - "platform": "Unknown", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Inspect Certification Expiration Dates" - ], - "capabilities": [], - "readme": "# Cert-Manager Expirations\n\n## SLI \nRetrieve number of expired TLS certificates managed by cert-manager within a given window.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.K8s", - "RW.Utils", - "RW.platform", - "RW.Core", - "BuiltIn", - "OperatingSystem", - "RW.CertManager" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/cert-manager-expirations" - }, - { - "slug": "rw-public-codecollection-prometheus-queryinstant-transform", - "collection_slug": "rw-public-codecollection", - "name": "prometheus-queryinstant-transform", - "display_name": "prometheus-queryinstant-transform", - "description": "Run a PromQL query against Prometheus instant query API, perform a provided transform, and return the result.", - "platform": "Unknown", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Querying Prometheus Instance 
And Pushing Aggregated Data" - ], - "capabilities": [], - "readme": "# Prometheus Instant Query\n\n## SLI\nRun a PromQL query against Prometheus instant query API, perform a provided transform, and return the result.\n\n## Use Cases\n\n### Use Case: SLI: Kubernetes Node Heartbeats with Kube State Metrics\nIf you want to monitor the number of heartbeats failing across nodes, provided your kube_state metrics are submitted to the prometheus instance, then you can enter this query, which will give you a count of failing heartbeats across the node fleet:\n\n```((max(sum by(condition) (kube_node_status_condition{condition!=\"Ready\", status=\"false\"}))+min(kube_node_status_condition{condition=\"Ready\", status=\"true\"}))*-1) + count( sum( kube_node_status_condition ) by (condition) )```\n\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core", - "RW.Prometheus" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/prometheus-queryinstant-transform" - }, - { - "slug": "rw-public-codecollection-aws-billing-costsacrosstags", - "collection_slug": "rw-public-codecollection", - "name": "aws-billing-costsacrosstags", - "display_name": "aws-billing-costsacrosstags", - "description": "Creates a report of AWS line item costs filtered to a list of tagged resources", - "platform": "AWS", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Get All Billing Sliced By Tags" - ], - "capabilities": [], - "readme": "# AWS Billing Costs Across Tags\n\n## TaskSet \nCreates a report of AWS line item costs filtered to a list of tagged resources\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core", - "RW.AWS.Billing" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/aws-billing-costsacrosstags" - }, - { - "slug": "rw-public-codecollection-aws-cloudwatch-metricquery-dashboard", - "collection_slug": "rw-public-codecollection", - "name": "aws-cloudwatch-metricquery-dashboard", - "display_name": "aws-cloudwatch-metricquery-dashboard", - "description": "Creates a URL to a AWS CloudWatch metrics dashboard with a running query.", - "platform": "AWS", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Get CloudWatch MetricQuery Insights URL" - ], - "capabilities": [], - "readme": "# AWS CloudWatch Metric Query Dashboard\n\n## TaskSet \nCreates a URL to a AWS CloudWatch metrics dashboard with a running query.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core", - "RW.AWS.CloudWatch" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/aws-cloudwatch-metricquery-dashboard" - }, - { - "slug": "rw-public-codecollection-aws-cloudwatch-logquery", - "collection_slug": "rw-public-codecollection", - "name": "aws-cloudwatch-logquery", - "display_name": "aws-cloudwatch-logquery", - "description": "Retrieve number of results from an AWS CloudWatch Insights query.", - "platform": "AWS", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Running CloudWatch Log Query And Pushing The Count Of Results" - ], - "capabilities": [], - "readme": "# AWS CloudWatch Log Query\n\n## SLI \nRetrieve number of results from an AWS CloudWatch Insights query.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core", - 
"RW.AWS.CloudWatch" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/aws-cloudwatch-logquery" - }, - { - "slug": "rw-public-codecollection-grafana-health", - "collection_slug": "rw-public-codecollection", - "name": "grafana-health", - "display_name": "grafana-health", - "description": "Check Grafana server health.", - "platform": "Unknown", - "author": "Vui Le", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check Grafana Server Health" - ], - "capabilities": [], - "readme": "# Grafana Health\n\n## SLI\nCheck Grafana server health.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core", - "RW.Grafana" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/grafana-health" - }, - { - "slug": "rw-public-codecollection-rest-basicauth", - "collection_slug": "rw-public-codecollection", - "name": "rest-basicauth", - "display_name": "rest-basicauth", - "description": "A general purpose rest codebundle for extracting data from a rest endpoint. See the [generic](https://docs.runwhen.com/public/v/codebundles/rest-generic) codebundle variant for more details.", - "platform": "Unknown", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Request Data From Rest Endpoint" - ], - "capabilities": [], - "readme": "# Rest Basic Authentication\n\n## SLI\nA general purpose rest codebundle for extracting data from a rest endpoint. See the [generic](https://docs.runwhen.com/public/v/codebundles/rest-generic) codebundle variant for more details.\n\n## Use Cases\nRefer to the generic rest codebundle [here](https://docs.runwhen.com/public/v/codebundles/rest-generic) for a suite of setups and use cases.\n\n## Requirements\n--\n\n## TODO\n--", - "libraries": [ - "RW.Core", - "RW.Rest" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/rest-basicauth" - }, - { - "slug": "rw-public-codecollection-k8s-kubectl-canaryvolumemount", - "collection_slug": "rw-public-codecollection", - "name": "k8s-kubectl-canaryvolumemount", - "display_name": "k8s-kubectl-canaryvolumemount", - "description": "An SLI which periodically creates a job which lists the contents of a directory on a pvc, if the list command succeeds than the SLI", - "platform": "Kubernetes", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Run Canary Job" - ], - "capabilities": [], - "readme": "# Kubernetes kubectl Canary Volume Mount\n\n## SLI\nAn SLI which periodically creates a job which lists the contents of a directory on a pvc, if the list command succeeds than the SLI\nreturns a score of 1, otherwise a 0 when it fails.\n\n## Use Cases\n- Validate that system storage is working and can be provisioned on the cluster.\n\n## Requirements\n- A kubeconfig with get/list access on deployment, pod, and PVC objects in the chosen namespace.\n- A chosen `namespace` and `context` to use from the kubeconfig.\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.K8s", - "RW.Utils", - "RW.platform", - "RW.Core", - "BuiltIn", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/k8s-kubectl-canaryvolumemount" - }, - { - "slug": "rw-public-codecollection-gcp-opssuite-metricquery", - "collection_slug": "rw-public-codecollection", - "name": "gcp-opssuite-metricquery", - "display_name": "gcp-opssuite-metricquery", - "description": "Performs 
a metric query using a Google MQL statement on the Ops Suite API", - "platform": "GCP", - "author": "", - "support_tags": [ - "rw" - ], - "tasks": [ - "Running GCP OpsSuite Metric Query" - ], - "capabilities": [], - "readme": "# GCP Operations Suite Metric Query \n\n\n## SLI\nPerforms a metric query using a Google MQL statement on the Ops Suite API\n\n## Use Cases\n\n### Use Case: SLI: GCP Exceeded Quotas\nIf quotas are being exceeded, you might be experiencing issues with provisioning new services. Use this codebundle with the following configuration to identify if any quotas are exceeded in the GCP project. \n\n- MQL Statement: \n```fetch consumer_quota | metric 'serviceruntime.googleapis.com/quota/exceeded' | group_by 10m, [value_exceeded_count_true: count_true(value.exceeded)] | every 10m | group_by [],[value_exceeded_count_true_aggregate: aggregate(value_exceeded_count_true)]```\n- No Result Overwrite: `True`\n- No Result Value: `0`\n\nWith this query, it's a *good* sign when no data is returned, meaning that no quotas have been exceeded. With that said, you must set the `no result overwrite` and `no result value` fields so that the codebundle doesn't error out when no data is returned. \n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core", - "RW.GCP.OpsSuite" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/gcp-opssuite-metricquery" - }, - { - "slug": "rw-public-codecollection-elasticsearch-health", - "collection_slug": "rw-public-codecollection", - "name": "elasticsearch-health", - "display_name": "elasticsearch-health", - "description": "Check Elasticsearch cluster health", - "platform": "Unknown", - "author": "", - "support_tags": [ - "rw" - ], - "tasks": [], - "capabilities": [], - "readme": "# Elasticsearch Health\n\n## SLI \nCheck Elasticsearch cluster health\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/elasticsearch-health" - }, - { - "slug": "rw-public-codecollection-datadog-metricquery", - "collection_slug": "rw-public-codecollection", - "name": "datadog-metricquery", - "display_name": "datadog-metricquery", - "description": "Performs a metric query against the Datadog timeseries API as per the [docs](https://docs.datadoghq.com/api/latest/metrics/#query-timeseries-points) allowing you to feed your datadog metrics into the RunWhen platform.", - "platform": "Unknown", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Query Datadog Metrics" - ], - "capabilities": [], - "readme": "# Datadog Metric Query\n\n## SLI\nPerforms a metric query against the Datadog timeseries API as per the [docs](https://docs.datadoghq.com/api/latest/metrics/#query-timeseries-points) allowing you to feed your datadog metrics into the RunWhen platform.\nAfter fetching the timeseries it extracts data from the point using a json path.
Typically this is the newest point in the timeseries.\n\n## Use Cases\n### Hook your datadog metrics up to the RunWhen platform\nFirst refer to either a pre-existing dashboard or use the explorer to find timeseries that interest you, eg: https://us3.datadoghq.com/metric/explorer \n> Note: your site may differ based on account location\n\nThen identify the query used to generate the timeseries or that is saved in the dashboard, eg: `max:system.cpu.user{*}`\n\nProvide that query in the configuration for the codebundle in the `METRIC_QUERY` field.\n\nIf your query is a traditional timeseries, you likely won't need to change the default `JSON_PATH` of `series[0].pointlist[-1][1]`\nYou can check the logs output by the codebundle to determine the desired data you'd like to extract, the response blob will be visible there. Eg: given the response:\n\n```\n{\n 'from_date': 1675977177000,\n 'group_by': [],\n 'message': '',\n 'query': 'max:system.cpu.user{*}',\n 'res_type': 'time_series',\n 'resp_version': 1,\n 'series': [{'aggr': 'max',\n 'attributes': {},\n 'display_name': 'system.cpu.user',\n 'end': 1675977224000,\n 'expression': 'max:system.cpu.user{*}',\n 'interval': 1,\n 'length': 4,\n 'metric': 'system.cpu.user',\n 'pointlist': [[1675977179000.0, 0.20256583392620087],\n [1675977194000.0, 0.20229265093803406],\n [1675977209000.0, 0.26990553736686707],\n [1675977224000.0, 0.2702702581882477]],\n 'query_index': 0,\n 'scope': '*',\n 'start': 1675977179000,\n 'tag_set': [],\n 'unit': [{'family': 'percentage',\n 'id': 17,\n 'name': 'percent',\n 'plural': 'percent',\n 'scale_factor': 1.0,\n 'short_name': '%'},\n None]}],\n 'status': 'ok',\n 'times': [],\n 'to_date': 1675977237000,\n 'values': []\n}\n```\n\nWe can access the newest data point using the default json path: `series[0].pointlist[-1][1]`\n\nThis is the data point that will be submitted as an SLI metric on the platform.\n\n## Requirements\n- A `datadog_api_key` secret in order to authenticate with the API.\n- A `datadog_app_key` secret to identify the scope and what application is interacting with the API.\n- Which datadog site to connect to, such as `us3.datadoghq.com` or `datadoghq.com` wherever the account resides.\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core", - "RW.Datadog" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/datadog-metricquery" - }, - { - "slug": "rw-public-codecollection-k8s-cortexmetrics-ingestor-health", - "collection_slug": "rw-public-codecollection", - "name": "k8s-cortexmetrics-ingestor-health", - "display_name": "k8s-cortexmetrics-ingestor-health", - "description": "Periodically checks the state of the cortex metrics ingestors and returns a score of 1 (healthy) or 0 (unhealthy). This SLI performs the query by executing a `kubectl exec` into a Kubernetes resource, leveraging existing Kubernetes API authentication. For the ingesters to be considered healthy they must:", - "platform": "Kubernetes", - "author": "Shea Stewart", - "support_tags": [ - "rw" - ], - "tasks": [ - "Determine Cortex Ingester Ring Health", - "Fetch Ingestor Ring Member List and Status" - ], - "capabilities": [], - "readme": "# Kubernetes Cortex Metrics Ingester Health\n\n## SLI\nPeriodically checks the state of the cortex metrics ingestors and returns a score of 1 (healthy) or 0 (unhealthy). This SLI performs the query by executing a `kubectl exec` into a Kubernetes resource, leveraging existing Kubernetes API authentication.
For the ingesters to be considered healthy they must:\n\n- Be considered \"ACTIVE\" in the ingester ring as published by the http api endpoint `/ring`\n- Have as many \"ACTIVE\" ingester ring members as specified in the SLI configuration variable EXPECTED_RING_MEMBERS\n\nThe defaults will target a distributor pod which can locally reach http://127.0.0.1:8080/ring to obtain the status, but this can be overridden if another pod is used to query this endpoint within the cluster. \n\n## TaskSet\nQueries the state of ingestors and returns the state of each along with the latest timestamp. This TaskSet performs the query by executing a `kubectl exec` into a Kubernetes resource, leveraging existing Kubernetes API authentication. \n\n## Use Cases\n### Use Case: SLI: Monitoring Grafana Mimir Ingester Health\nAs Grafana Mimir is based on Cortex metrics, this codebundle could be used in the same way to inspect the health of Grafana Mimir ingesters. \n\n## Requirements\n- A kubeconfig with `get, list` access on cortex objects in the chosen namespace, along with the verb `create` on resource `pods/exec`\n- A chosen `namespace` and `context` to use from the kubeconfig\n- A cortex pod resource that has access to the `ring` api endpoint to exec into within the chosen `namespace` (often the distributor pods)\n\n## TODO\n- [ ] Add additional documentation\n- [ ] Add additional taskset checks ", - "libraries": [ - "RW.K8s", - "RW.Utils", - "RW.platform", - "RW.Core", - "BuiltIn", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/k8s-cortexmetrics-ingestor-health" - }, - { - "slug": "rw-public-codecollection-grpc-grpcurl-unary", - "collection_slug": "rw-public-codecollection", - "name": "grpc-grpcurl-unary", - "display_name": "grpc-grpcurl-unary", - "description": "A generic gRPC codebundle that uses the grpcurl service to send requests to gRPC services. The user can paste in their favorite grpcurl shell commands and fetch data with them.", - "platform": "Unknown", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Run gRPCurl Command and Push Metric", - "Run gRPCurl Command and Show Output" - ], - "capabilities": [], - "readme": "# gRPC Unary\nA generic gRPC codebundle that uses the grpcurl service to send requests to gRPC services. The user can paste in their favorite grpcurl shell commands and fetch data with them.\nSupports jq for processing output and expects to output in json format. \n\n## SLI \nA grpcurl SLI for querying and extracting data from a generic grpcurl call. Uses the hosted grpcurl service, supports jq for parsing, and should produce a single metric.\n\n## TaskSet\nA grpcurl TaskSet for querying and extracting data from a generic grpcurl call.
Uses the hosted grpcurl service, supports jq for parsing, will typically output in json.\n\n## Use Cases\n### SLI: Use gRPC result as metric\nThis example uses the SLI to fetch json data from an arbitrary gRPC service and submit a value from the json payload as a metric.\n\n```\nGRPCURL_COMMAND=\"grpcurl -plaintext -d '{\"greeting\": \"1\"}' grpc.postman-echo.com:443 HelloService/SayHello | jq '(.reply | split(\" \"))[1]'\"\n```\n### TaskSet: Show gRPC service proto information\nThis example uses the TaskSet to show the proto information of a gRPC service.\n\n```\nGRPCURL_COMMAND=\"grpcurl -plaintext grpc.postman-echo.com:443 describe\"\n```\n\n## Requirements\n- The gRPCurl command to run\n- A gRPC service with server reflection enabled\n\n## TODO\n- [ ] Support proto file uploads\n- [ ] Add support for other streaming methods\n- [ ] Add additional report formatting so that it's not just json", - "libraries": [ - "String", - "RW.Core", - "RW.Utils", - "RW.gRPC.gRPCurl" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/grpc-grpcurl-unary" - }, - { - "slug": "rw-public-codecollection-k8s-daemonset-healthcheck", - "collection_slug": "rw-public-codecollection", - "name": "k8s-daemonset-healthcheck", - "display_name": "k8s-daemonset-healthcheck", - "description": "Periodically checks the state of a daemonset and returns a score of 1 (healthy) or 0 (unhealthy). For a daemonset to be considered healthy it must:", - "platform": "Kubernetes", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Health Check Daemonset" - ], - "capabilities": [], - "readme": "# Kubernetes Daemonset Healthcheck\n\n## SLI\nPeriodically checks the state of a daemonset and returns a score of 1 (healthy) or 0 (unhealthy). 
For a daemonset to be considered healthy it must:\n\n- Not be above the allowed max unavailable count\n- Have 0 misscheduled pods\n- Have at least the minimum allowed pods\n- Have all scheduled pods ready and available, indicating successful rollouts\n\n## Use Cases\n- Check your vault csi driver is healthy and properly deployed across your nodes.\n\n## Requirements\n- A kubeconfig with get/list access on daemonset objects in the chosen namespace.\n- A chosen `namespace` and `context` to use from the kubeconfig\n- A `daemonset name` to monitor within the chosen `namespace`.\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.K8s", - "RW.Utils", - "RW.platform", - "RW.Core", - "BuiltIn", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/k8s-daemonset-healthcheck" - }, - { - "slug": "rw-public-codecollection-k8s-decommission-workloads", - "collection_slug": "rw-public-codecollection", - "name": "k8s-decommission-workloads", - "display_name": "k8s-decommission-workloads", - "description": "Searches a namespace for matching objects and provides the commands to decommission them.", - "platform": "Kubernetes", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Generate Decomission Commands" - ], - "capabilities": [], - "readme": "# Kubernetes Decommission Workload\n\n## TaskSet\nSearches a namespace for matching objects and provides the commands to decommission them.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.K8s", - "RW.Utils", - "RW.platform", - "RW.Core", - "BuiltIn", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/k8s-decommission-workloads" - }, - { - "slug": "rw-public-codecollection-msteams-send-message", - "collection_slug": "rw-public-codecollection", - "name": "msteams-send-message", - "display_name": "msteams-send-message", - "description": "Send a message to an MS Teams channel.", - "platform": "Unknown", - "author": "Vui Lee", - "support_tags": [ - "rw" - ], - "tasks": [ - "Send a Message to an MS Teams Channel" - ], - "capabilities": [], - "readme": "# Microsoft Teams Send Message\n\n## TaskSet\nSend a message to an MS Teams channel.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core", - "RW.MSTeams" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/msteams-send-message" - }, - { - "slug": "rw-public-codecollection-k8s-kubectl-apiserverhealth", - "collection_slug": "rw-public-codecollection", - "name": "k8s-kubectl-apiserverhealth", - "display_name": "k8s-kubectl-apiserverhealth", - "description": "Check the health of a Kubernetes API server using kubectl.", - "platform": "Kubernetes", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Running Kubectl Check Against API Server" - ], - "capabilities": [], - "readme": "# Kubernetes kubectl API Server Health\n\n## SLI\nCheck the health of a Kubernetes API server using kubectl.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.K8s", - "RW.platform", - "RW.Core", - "BuiltIn", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/k8s-kubectl-apiserverhealth" - }, - { - "slug": "rw-public-codecollection-slo-default", -
"collection_slug": "rw-public-codecollection", - "name": "slo-default", - "display_name": "slo-default", - "description": "Default SLO query used for multi-window multi-burn.", - "platform": "Unknown", - "author": "", - "support_tags": [ - "rw" - ], - "tasks": [], - "capabilities": [], - "readme": "# SLO Default\n\n## SLO\nDefault SLO query used for multi-window multi-burn. \n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/slo-default" - }, - { - "slug": "rw-public-codecollection-aws-billing-tagcosts", - "collection_slug": "rw-public-codecollection", - "name": "aws-billing-tagcosts", - "display_name": "aws-billing-tagcosts", - "description": "Monitors AWS cost and usage data for the latest billing period.", - "platform": "AWS", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Get All Billing Sliced By Tags" - ], - "capabilities": [], - "readme": "# AWS Billing Tag Costs\n\n## SLI \nMonitors AWS cost and usage data for the latest billing period.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core", - "RW.AWS.Billing" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/aws-billing-tagcosts" - }, - { - "slug": "rw-public-codecollection-gcp-opssuite-promql", - "collection_slug": "rw-public-codecollection", - "name": "gcp-opssuite-promql", - "display_name": "gcp-opssuite-promql", - "description": "This codebundle leverages the [Google Managed Prometheus](https://cloud.google.com/stackdriver/docs/managed-prometheus) service and the Promethues Query API for Google customers to query metrics in their projects using promql.", - "platform": "GCP", - "author": "Shea Stewart", - "support_tags": [ - "rw" - ], - "tasks": [ - "Run Prometheus Instant Query Against Google Prom API Endpoint" - ], - "capabilities": [], - "readme": "# GCP Operations Suite PromQL\nThis codebundle leverages the [Google Managed Prometheus](https://cloud.google.com/stackdriver/docs/managed-prometheus) service and the Promethues Query API for Google customers to query metrics in their projects using promql. \n \n## SLI\nPerforms a metric query using PromQL statement on the Ops Suite API and pushes it to the RunWhen platform. \n\n## Use Cases\n### Use Case: SLI: Query Prometheus for Kubernetes Deployment Health in a Namespace\nThis example demonstrates how to use this codebundle to capture an SLI about the health of a specific Kubernetes deployment. See [kube-state-metrics](https://github.com/kubernetes/kube-state-metrics/blob/main/docs/deployment-metrics.md) for more detail. \n\nSince a query such as `kube_deployment_status_condition{namespace=\"[namespace]\"}` returns a value for each possible status of every deployment condition in the namespace (such as `Available=true,false,unknown` or `Progressing=true,false,unknown`), we will use a query that adds up any deployments that are NOT `Available=true` or `Progressing=true`. This query could look like `sum(kube_deployment_status_condition{namespace=\"[namespace]\", condition=~\"Available|Progressing\", status=~\"false|unknown\"})`. Since this should add up any bad state, a number greater than `0` is considered bad. 
\n\nExample SLI codebundle configuration:\n\n```\nPROJECT_ID: gcp-project-id\nPROMQL_STATEMENT: sum(kube_deployment_status_condition{namespace=\"[namespace]\", condition=~\"Available|Progressing\", status=~\"false|unknown\"})\nTRANSFORM=Raw\nDATA_COLUMN=1\nNO_RESULT_OVERWRITE=No # Not needed as each status returns a 1 or 0 \n```\n\n**An example SLO for this would state**: \n\"In any 30 day period, the SLI should be `equal to` `0` approximately `99.5`% of the time. This implies an error budget of 22 minutes.\" \n\nWith an SLO set to 0, the error budget will burn if **ANY** deployment is in an unhealthy state as reported by `kube-state-metrics`. \n\nA few additional resources on Kubernetes deployment management: \n- https://unofficial-kubernetes.readthedocs.io/en/latest/concepts/workloads/controllers/deployment/\n- https://maelvls.dev/deployment-available-condition/\n\n\n### Use Case: SLI: Monitoring Crossplane Managed Resource Health with Kube State Metrics\nSee [here](https://docs.runwhen.com/public/use-cases/kubernetes-environments/crossplane-resources-health-check) for a very detailed use case on monitoring custom resources (using [Crossplane](https://www.crossplane.io/) managed resources as the example). \n\nReview the documentation above to configure monitoring of Custom Resources with Kube State Metrics. Once completed, the following SLI codebundle configuration can be used to return a count of our GKE Cluster, Kubernetes Objects, or Helm Releases that are unhealthy: \n```\nPROJECT_ID: gcp-project-id\nPROMQL_STATEMENT: >-\n sum(kube_crd_crossplane_cluster_ready{status=\"False\"}) +\n sum(kube_crd_crossplane_kubernetes_object_ready{status=\"False\"}) +\n sum(kube_crd_crossplane_helm_release_ready{status=\"False\"})\nTRANSFORM: Raw #Choose Raw since the promql query will perform all desired transformations\nDATA_COLUMN: '1' #The promql query provides a single result, so the data column is always 1 \nNO_RESULT_OVERWRITE: 'No' #Our query should always return a result, so this is not required\nNO_RESULT_VALUE: '0' #Not used since we aren't enabling no_result_overwrite\n```\n**An example SLO for this would state**: \n\"In any 30 day period, the SLI should be `equal to` `0` approximately `99.5`% of the time. This implies an error budget of 22 minutes.\" \n\nWith an SLO set to 0, the error budget will burn if **ANY** Crossplane resource is in an unhealthy state as reported by `kube-state-metrics`. \n\n## Requirements \n### Service Account Requirements \nThis codebundle requires a service account and accompanying json key uploaded as a secret to the workspace.\n\nThe service account should have the following roles: \n- Logs Viewer - `roles/logging.viewer`\n- Monitoring Viewer - `roles/monitoring.viewer`\n\n> Note: It's likely that only the Monitoring Viewer role is required for promql queries, but both roles are helpful when using other gcp-opssuite* codebundles. 
\n\nPlease see the [documentation for creating service accounts](https://cloud.google.com/iam/docs/creating-managing-service-accounts)\n\n\n## TODO", - "libraries": [ - "RW.Core", - "RW.Utils", - "RW.Prometheus", - "RW.GCP.OpsSuite" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/gcp-opssuite-promql" - }, - { - "slug": "rw-public-codecollection-k8s-patroni-lag", - "collection_slug": "rw-public-codecollection", - "name": "k8s-patroni-lag", - "display_name": "k8s-patroni-lag", - "description": "Manage lagging patroni cluster replica members by monitoring their lag and actioning them by reinitializing them with the accompanying taskset. Using the Kubernetes API, users can set up the maximum allowed lag of replicas as an SLI and attach an SLO to it which triggers an alert when a member passes the allowed lag threshold.", - "platform": "Kubernetes", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Measure Patroni Member Lag", - "Determine Patroni Health" - ], - "capabilities": [], - "readme": "# Kubernetes Patroni Replica Lag\nManage lagging patroni cluster replica members by monitoring their lag and actioning them by reinitializing them with the accompanying taskset. Using the Kubernetes API, users can set up the maximum allowed lag of replicas as an SLI and attach an SLO to it which triggers an alert when a member passes the allowed lag threshold.\n\n## SLI\nThe SLI will use `kubectl` to access the Patroni API via `patronictl` in a workload pod with access to the patroni instance and fetch the state of the patroni cluster, eg:\n```\n[{'Cluster': 'mypatroni-1',\n 'Host': '0.0.0.0',\n 'Member': 'mypatroni-1-0',\n 'Role': 'Leader',\n 'State': 'running',\n 'TL': 12},\n {'Cluster': 'mypatroni-1',\n 'Host': '0.0.0.0',\n 'Lag in MB': 7,\n 'Member': 'mypatroni-1-1',\n 'Role': 'Replica',\n 'State': 'running',\n 'TL': 12}]\n```\n\nIn this case, the SLI will report the maximum lag value `7` as the SLI value. By configuring an SLO with, for example, a threshold of `5`, an alert will fire if the lag persists long enough to burn budget. You can automatically remediate severely lagging replicas which are unable to catch up by reinitializing them. See the taskset use case below.\n\n## TaskSet\nThe taskset grabs a list of replicas whose lag (represented in MB) is beyond a configured tolerance and reinitializes them so that they're caught up to the leader.
When a patroni replica member becomes too laggy, it may be unable to catch up in its replication process - this could be considered an unhealthy state.\n\n\n## Use Cases\n### Use Case: SLI: Measure Max Replica Lag In Kubernetes\nIn this use case, users can monitor a Patroni cluster's maximum lag (in MB) in Kubernetes by using a configuration similar to:\n```\nconfigProvided:\n - name: PATRONI_RESOURCE_NAME\n value: statefulset/mypatroni-1\n - name: NAMESPACE\n value: mydata\n - name: CONTEXT\n value: default\n - name: DISTRIBUTION\n value: Kubernetes\n```\n\n### Use Case: TaskSet: Autoheal Laggy Replica Member\nIn this use case, users can use the taskset to reinitialize a laggy replica which is unable to catch up.\nGiven the following config:\n```\n - name: LAG_TOLERANCE\n value: '5'\n - name: PATRONI_RESOURCE_NAME\n value: statefulset/mypatroni-001\n - name: NAMESPACE\n value: mydata\n - name: CONTEXT\n value: default\n - name: DISTRIBUTION\n value: Kubernetes\n - name: DOC_LINK\n value: ''\n```\n\nIn this use case, if the lag is detected to be greater than 5MB, the TaskSet will automatically reinitialize the replicas. \n\n\n## Requirements\n- A kubeconfig with `get, list` access on Patroni objects in the chosen namespace, along with the verb `create` on resource `pods/exec`\n- The resource name of the Kubernetes workload object\n- The `namespace` where the workload is located\n- A `context` to use from the `kubeconfig`\n- A selected `Distribution` that best fits your cluster, eg: GKE, OpenShift, etc.\n- Determine a `lag tolerance` in MB which classifies what replicas need to be actioned\n", - "libraries": [ - "RW.K8s", - "RW.Utils", - "RW.platform", - "RW.Core", - "BuiltIn", - "OperatingSystem", - "RW.Patroni" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/k8s-patroni-lag" - }, - { - "slug": "rw-public-codecollection-gitlab-availability", - "collection_slug": "rw-public-codecollection", - "name": "gitlab-availability", - "display_name": "gitlab-availability", - "description": "Check availability of a GitLab server.", - "platform": "Unknown", - "author": "Vui Le", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check GitLab Server Status", - "# ... ok_status=${res.ok_status}", - "Check GitLab Server Status" - ], - "capabilities": [], - "readme": "# GitLab Availability\n\n## SLI\nCheck availability of a GitLab server.\n\n## TaskSet\nTroubleshoot issues with GitLab server availability.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.HTTP", - "RW.Core", - "RW.Slack", - "RW.Report" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/gitlab-availability" - }, - { - "slug": "rw-public-codecollection-rocketchat-sendmessage", - "collection_slug": "rw-public-codecollection", - "name": "rocketchat-sendmessage", - "display_name": "rocketchat-sendmessage", - "description": "Sends a static message to a Rocketchat chat channel via webhook. There is optional configuration for including live runsession info and links", - "platform": "Unknown", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Send Chat Message" - ], - "capabilities": [], - "readme": "# Rocketchat Send Message\n\n## TaskSet \nSends a static message to a Rocketchat chat channel via webhook.
There is optional configuration for including live runsession info and links\nfor team members to quickly access running sessions.\n\n## Use Cases\n- Send an alert when an SLO is burning too much budget which contains a link to the active runsession.\n- Let your team members know you're in a live runsession and provide them with a link to join you.\n\n## Requirements\n- A `webhook_url` secret which allows the codebundle to perform an incoming webhook post request against the service API.\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Rest", - "RW.Core", - "RW.RunWhen.Papi", - "RW.Utils" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/rocketchat-sendmessage" - }, - { - "slug": "rw-public-codecollection-cert-manager-healthcheck", - "collection_slug": "rw-public-codecollection", - "name": "cert-manager-healthcheck", - "display_name": "cert-manager-healthcheck", - "description": "Check the health of pods deployed by cert-manager.", - "platform": "Unknown", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Health Check cert-manager Pods" - ], - "capabilities": [], - "readme": "# Cert-Manager Health Check\n\n## SLI \nCheck the health of pods deployed by cert-manager.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.K8s", - "RW.Utils", - "RW.platform", - "RW.Core", - "BuiltIn", - "OperatingSystem", - "RW.CertManager" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/cert-manager-healthcheck" - }, - { - "slug": "rw-public-codecollection-k8s-triage-statefulset", - "collection_slug": "rw-public-codecollection", - "name": "k8s-triage-statefulset", - "display_name": "k8s-triage-statefulset", - "description": "A taskset for troubleshooting issues for StatefulSets and their related resources.", - "platform": "Kubernetes", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check StatefulSets Replicas Ready", - "Get Events For The StatefulSet", - "Get StatefulSet Logs", - "Get StatefulSet Manifests Dump" - ], - "capabilities": [], - "readme": "# Kubernetes Triage StatefulSet\n\n## TaskSet\nA taskset for troubleshooting issues for StatefulSets and their related resources.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.K8s", - "RW.Utils", - "RW.platform", - "RW.Core", - "BuiltIn", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/k8s-triage-statefulset" - }, - { - "slug": "rw-public-codecollection-twitter-query-tweets", - "collection_slug": "rw-public-codecollection", - "name": "twitter-query-tweets", - "display_name": "twitter-query-tweets", - "description": "Queries Twitter to count the number of tweets within a specified time range for a specific user handle.", - "platform": "Unknown", - "author": "Shea Stewart", - "support_tags": [ - "rw" - ], - "tasks": [ - "Query Twitter", - "Query Twitter" - ], - "capabilities": [], - "readme": "# Twitter Query Tweets\n\n## SLI\nQueries Twitter to count the number of tweets within a specified time range for a specific user handle.\n\n## TaskSet\nQueries Twitter to fetch tweets within a specified time range for a specific user handle and add them to a report.\n\n\n## Use Cases\n### SLI & TaskSet: Count and fetch tweets within the last day\nIn our use case, the twitter handle
[gitbookstatus](https://twitter.com/gitbookstatus) uses twitter to post updates about their service. The SLI can be configured to fetch and count any tweets within the last day, and the Runbook can be configured in the same way, but delivering the tweet content.\n\nExample configuration parameters for both the SLI and TaskSet: \n```\nHandle: gitbookstatus\nMax Tweets: 5\nMax Tweet Age: 1\nMin Tweet Age: 0\n```\n\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.SocialScrape", - "RW.Core" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/twitter-query-tweets" - }, - { - "slug": "rw-public-codecollection-aws-cloudformation-triage", - "collection_slug": "rw-public-codecollection", - "name": "aws-cloudformation-triage", - "display_name": "aws-cloudformation-triage", - "description": "Triage and troubleshoot various issues with AWS CloudFormation", - "platform": "AWS", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Get All Recent Stack Events" - ], - "capabilities": [], - "readme": "# AWS CloudFormation Triage\n\n## TaskSet \nTriage and troubleshoot various issues with AWS CloudFormation\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core", - "RW.AWS.CloudFormation", - "RW.Report" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/aws-cloudformation-triage" - }, - { - "slug": "rw-public-codecollection-artifactory-ok", - "collection_slug": "rw-public-codecollection", - "name": "artifactory-ok", - "display_name": "artifactory-ok", - "description": "Checks an Artifactory instance health endpoint to determine its operational status.", - "platform": "Unknown", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check If Artifactory Endpoint Is Healthy" - ], - "capabilities": [], - "readme": "# Artifactory OK \n## SLI\nChecks an Artifactory instance health endpoint to determine its operational status.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Artifactory", - "RW.Core" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/artifactory-ok" - }, - { - "slug": "rw-public-codecollection-k8s-patroni-healthcheck", - "collection_slug": "rw-public-codecollection", - "name": "k8s-patroni-healthcheck", - "display_name": "k8s-patroni-healthcheck", - "description": "Uses kubectl (or equivalent) to query the state of a patroni cluster and determine if it's healthy.", - "platform": "Kubernetes", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Determine Patroni Health" - ], - "capabilities": [], - "readme": "# Kubernetes Patroni Health Check\n\n## SLI\nUses kubectl (or equivalent) to query the state of a patroni cluster and determine if it's healthy.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.K8s", - "RW.Utils", - "RW.platform", - "RW.Core", - "BuiltIn", - "OperatingSystem", - "RW.Patroni" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/k8s-patroni-healthcheck" - }, - { - "slug": "rw-public-codecollection-aws-ec2-securitycheck", - "collection_slug": "rw-public-codecollection", - "name": "aws-ec2-securitycheck", - "display_name": "aws-ec2-securitycheck", - "description": "Performs a suite of security checks 
against a set of AWS EC2 instances.", - "platform": "AWS", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check For Untagged instances", - "Check For Dangling Volumes", - "Check For Open Routes", - "Check For Overused Instances", - "Check For Underused Instances", - "Check For Underused Volumes", - "Check For Overused Volumes" - ], - "capabilities": [], - "readme": "# AWS EC2 Security Check\n\n## TaskSet \nPerforms a suite of security checks against a set of AWS EC2 instances.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core", - "RW.AWS.CloudWatch", - "RW.AWS.EC2" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/aws-ec2-securitycheck" - }, - { - "slug": "rw-public-codecollection-jira-search-issues-latency", - "collection_slug": "rw-public-codecollection", - "name": "jira-search-issues-latency", - "display_name": "jira-search-issues-latency", - "description": "Check Jira latency when searching issues by current user.", - "platform": "Unknown", - "author": "Vui Le", - "support_tags": [ - "rw" - ], - "tasks": [ - "Search Jira Issues By Current User", - "Create a new Jira Issue" - ], - "capabilities": [ - "Create a new Jira Issue: Create a new issue in Jira" - ], - "readme": "# Jira Search Issues Latency\n\n## SLI\nCheck Jira latency when searching issues by current user.\n\n## TaskSet\nCreate an issue in Jira.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core", - "RW.Jira" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/jira-search-issues-latency" - }, - { - "slug": "rw-public-codecollection-github-actions-workflowtiming", - "collection_slug": "rw-public-codecollection", - "name": "github-actions-workflowtiming", - "display_name": "github-actions-workflowtiming", - "description": "Monitors the average timing of a github actions workflow file within a repo and returns the average runtime in minutes.", - "platform": "Unknown", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Get Average Run Time For Workflow" - ], - "capabilities": [], - "readme": "# GitHub Actions Workflow Timing\n\n## SLI\nMonitors the average timing of a github actions workflow file within a repo and returns the average runtime in minutes.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core", - "RW.GitHub.Actions", - "RW.Utils" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/github-actions-workflowtiming" - }, - { - "slug": "rw-public-codecollection-aws-s3-stalecheck", - "collection_slug": "rw-public-codecollection", - "name": "aws-s3-stalecheck", - "display_name": "aws-s3-stalecheck", - "description": "Identify stale AWS S3 buckets, based on last modified object timestamp.", - "platform": "AWS", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Create Report For Stale Buckets" - ], - "capabilities": [], - "readme": "# AWS S3 Stale Check\n\n## TaskSet \nIdentify stale AWS S3 buckets, based on last modified object timestamp.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core", - "RW.AWS.S3" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/aws-s3-stalecheck" - }, - { - "slug": 
"rw-public-codecollection-vault-ok", - "collection_slug": "rw-public-codecollection", - "name": "vault-ok", - "display_name": "vault-ok", - "description": "Check the health of a Vault server. The response code is used to determine if the service is healthy, resulting in a metric of 1 if it is, or 0 if not.", - "platform": "Unknown", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check If Vault Endpoint Is Healthy" - ], - "capabilities": [], - "readme": "# Vault OK\n\n## SLI\nCheck the health of a Vault server. The response code is used to determine if the service is healthy, resulting in a metric of 1 if it is, or 0 if not.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core", - "RW.HashiCorp.Vault" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/vault-ok" - }, - { - "slug": "rw-public-codecollection-k8s-postgres-query", - "collection_slug": "rw-public-codecollection", - "name": "k8s-postgres-query", - "display_name": "k8s-postgres-query", - "description": "This codebundle leverages the Kubernetes API and running pod to execute psql queries (using kubectl exec). These codebundles also capture the duration of the query.", - "platform": "Kubernetes", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Run Postgres Query And Return Result As Metric", - "Run Postgres Query And Results to Report" - ], - "capabilities": [], - "readme": "# Kubernetes Postgres Query\nThis codebundle leverages the Kubernetes API and running pod to execute psql queries (using kubectl exec). These codebundles also capture the duration of the query. \n\n## SLI\nThis codebundle uses a Kubernetes workload to run a postgres SQL query and pushes the query result as an SLI metric. The workload will run the query and return the result from stdout, along with the timing of the query. \n\nExample configuration: \n```\n## Variable Configuration\nContext: default # this depends on what the context is called in the kubeconfig\nWorkload Name: statefulset/mydb-1\nWorkload Namespace: my-database-namespace\nWorkload Container: database # Often there are many containers in the database pod\nQuery: SELECT COUNT(*) FROM user_table; # The targeted database is configured as a secret\nHostname: localhost\nDistribution: Kubernetes\n```\n\n## TaskSet\nThis codebundle uses a Kubernetes workload to run a postgres SQL query and returns the results in an aligned table with headers as a report. The workload will run the query and return the result from stdout, along with the timing of the query. 
\n\nExample configuration: \n```\n## Variable Configuration\nContext: default # this depends on what the context is called in the kubeconfig\nWorkload Name: statefulset/mydb-1\nWorkload Namespace: my-database-namespace\nWorkload Container: database # Often there are many containers in the database pod\nQuery: SELECT id,firstname,lastname FROM user_table ORDER BY lastname ASC; # The targeted database is configured as a secret\nHostname: localhost\nDistribution: Kubernetes\n```\n\n\n## Use Cases\n- When you have a postgres deployment or equivalent (eg: patroni) deployment in your kubernetes cluster and would like to use a query result as a metric without publicly exposing the database.\n- If you want to periodically check and measure an attribute of your database, such as slow queries, memory usage, index efficiency, etc.\n\n## Requirements\n- A kubeconfig with adequate access permissions to the workload running the query. For Kubernetes RBAC, the service account needs `create` permission on the `pods/exec` resource. \n- For SLIs, a postgres compatible query which returns a single result row. If you're getting multiple rows consider aggregating them via COUNT, MAX, SUM, GROUP BY, etc.\n- A kubernetes workload with `psql` binary as part of its image that can access the database within the constraints of its network. You can use the same workload as the one running the database.\n- The `hostname`, `user`, `password`, `database name` credentials, where the user has adequate permissions to perform the query on the desired table.\n\n## TODO\n- [ ] Add additional documentation or examples", - "libraries": [ - "RW.K8s", - "RW.Postgres", - "RW.Utils", - "RW.platform", - "RW.Core" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/k8s-postgres-query" - }, - { - "slug": "rw-public-codecollection-github-status-maintenances", - "collection_slug": "rw-public-codecollection", - "name": "github-status-maintenances", - "display_name": "github-status-maintenances", - "description": "Retrieve number of upcoming GitHub platform maintenances over a given window.", - "platform": "Unknown", - "author": "Paul Dittaro", - "support_tags": [ - "rw" - ], - "tasks": [ - "Get Scheduled and Active GitHub Maintenance Windows" - ], - "capabilities": [], - "readme": "# GitHub Status - Maintenance\n\n## SLI\nRetrieve number of upcoming GitHub platform maintenances over a given window.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core", - "RW.GitHub.Status" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/github-status-maintenances" - }, - { - "slug": "rw-public-codecollection-github-status-components", - "collection_slug": "rw-public-codecollection", - "name": "github-status-components", - "display_name": "github-status-components", - "description": "Check status of the GitHub platform (https://www.githubstatus.com/) for a specified set of GitHub service components.", - "platform": "Unknown", - "author": "Paul Dittaro", - "support_tags": [ - "rw" - ], - "tasks": [ - "Get Availability of GitHub or Individual GitHub Components" - ], - "capabilities": [], - "readme": "# GitHub Status - Platform Components\n\n## SLI - Component Availability \nCheck status of the GitHub platform (https://www.githubstatus.com/) for a specified set of GitHub service components.\nThe metric supplied is an aggregated percentage indicating the availability of the components with 1 = 100%
available. \n\n### SLI Metric Calculation Details\n> **NOTE:** See the [RW GitHub Status Library](../../libraries/RW/GitHub/Status.py) code for additional details. \n\n This SLI calculates an availability metric for the GitHub platform, between 0 and 1.\n Optionally takes a subset of components from which to calculate this total.\n\n When no components are provided, the score is mapped from the indicator on the\n GitHub status page using the following values:\n - ``none`` : 1\n - ``minor`` : 0.66\n - ``major`` : 0.33\n - ``critical`` : 0\n\n If the components are provided, this function provides the average component\n availability score of the number of components provided in the set. These\n values are mapped from the component status attribute as follows:\n - ``operational`` : 1\n - ``degraded_performance`` : 0.66\n - ``partial_outage`` : 0.33\n - ``major_outage`` : 0\n\n Parameters:\n components (Set[str]): Set of components to optionally calculate\n availability score from. Current possible values at time of this release\n are:\n - \"Git Operations\"\n - \"API Requests\"\n - \"Webhooks\"\n - \"Issues\"\n - \"Pull Requests\"\n - \"Actions\"\n - \"Packages\"\n - \"Pages\"\n - \"Codespaces\"\n - \"Copilot\"\n\n Raises:\n ValueError: If the components provided do not match the list fetched from\n GitHub\n\n Returns:\n Value between 0 and 1 corresponding to the availability of the GitHub\n platform\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core", - "RW.GitHub.Status" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/github-status-components" - }, - { - "slug": "rw-public-codecollection-rest-explicitoauth2-tokenheader", - "collection_slug": "rw-public-codecollection", - "name": "rest-explicitoauth2-tokenheader", - "display_name": "rest-explicitoauth2-tokenheader", - "description": "A general purpose rest codebundle for extracting data from a rest endpoint. See the [generic](https://docs.runwhen.com/public/v/codebundles/rest-generic) codebundle variant for more details.", - "platform": "Unknown", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Request Data From Rest Endpoint" - ], - "capabilities": [], - "readme": "# Rest OAuth2 with Bearer-to-Access Authentication\n\n## SLI\nA general purpose rest codebundle for extracting data from a rest endpoint. See the [generic](https://docs.runwhen.com/public/v/codebundles/rest-generic) codebundle variant for more details.\n\n## Use Cases\nRefer to the generic rest codebundle [here](https://docs.runwhen.com/public/v/codebundles/rest-generic) for a suite of setups and use cases.\n\n## Requirements\n--\n\n## TODO\n--", - "libraries": [ - "RW.Core", - "RW.Rest", - "RW.Utils" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/rest-explicitoauth2-tokenheader" - }, - { - "slug": "rw-public-codecollection-slack-sendmessage", - "collection_slug": "rw-public-codecollection", - "name": "slack-sendmessage", - "display_name": "slack-sendmessage", - "description": "Sends a static message to a Slack chat channel via webhook. There is optional configuration for including live runsession info and links", - "platform": "Unknown", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Send Chat Message" - ], - "capabilities": [], - "readme": "# Slack Send Message\n\n## TaskSet \nSends a static message to a Slack chat channel via webhook. 
There is optional configuration for including live runsession info and links\nfor team members to quickly access running sessions.\n\n## Use Cases\n- Send an alert when an SLO is burning too much budget which contains a link to the active runsession.\n- Let your team members know you're in a live runsession and provide them with a link to join you.\n\n## Requirements\n- A `webhook_url` secret which allows the codebundle to perform an incoming webhook post request against the service API.\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Rest", - "RW.Core", - "RW.RunWhen.Papi", - "RW.Utils" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/slack-sendmessage" - }, - { - "slug": "rw-public-codecollection-curl-generic", - "collection_slug": "rw-public-codecollection", - "name": "curl-generic", - "display_name": "curl-generic", - "description": "A generic curl codebundle that uses the curl service. Supports jq for processing output and expects to output in json format.", - "platform": "Unknown", - "author": "Shea Stewart", - "support_tags": [ - "rw" - ], - "tasks": [ - "Run Curl Command and Push Metric", - "Run Curl Command and Add to Report" - ], - "capabilities": [], - "readme": "# CURL Generic\nA generic curl codebundle that uses the curl service. Supports jq for processing output and expects to output in json format. \n\n## SLI \nA curl SLI for querying and extracting data from a generic curl call. Uses the hosted curl service, supports jq for parsing, and should produce a single metric.\n\n## TaskSet\nA curl TaskSet for querying and extracting data from a generic curl call. Uses the hosted curl service, supports jq for parsing, will output in json.\n\n## Use Cases\n### SLI: Count the number of GitHub Repo Stargazers\nThis example uses the SLI to collect the list of stargazers on a GitHub repo, uses jq to count them up, and pushes the metric. \n\n```\nCURL_COMMAND=\"curl --silent -X GET https://api.github.com/repos/runwhen-contrib/rw-public-codecollection/stargazers | jq length\"\n```\n### TaskSet: Generate a report of GitHub Repo Stargazers by login-name\nThis example uses the TaskSet to collect the list of stargazers on a GitHub repo and uses jq to report their login names. \n\n```\nCURL_COMMAND=\"curl -X GET https://api.github.com/repos/runwhen-contrib/rw-public-codecollection/stargazers | jq '.[] | .login'\"\n```\n\n## Requirements\n\n## TODO\n- [ ] Add additional filtering capabilities, SLI math (e.g. avg, sum, count) to mimic k8s-kubectl-get\n- [ ] Add additional report formatting so that it's not just json", - "libraries": [ - "RW.Curl", - "RW.Core", - "RW.Utils" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/curl-generic" - }, - { - "slug": "rw-public-codecollection-k8s-kubectl-get", - "collection_slug": "rw-public-codecollection", - "name": "k8s-kubectl-get", - "display_name": "k8s-kubectl-get", - "description": "The `kubectl get` codebundles run an arbitrary `kubectl get` command that fetches objects with json output.
The results from the desired command are returned, filtered, and computed using json output and jmespath as directed by the configuration.", - "platform": "Kubernetes", - "author": "Shea Stewart", - "support_tags": [ - "rw" - ], - "tasks": [ - "Running Kubectl get and push the metric" - ], - "capabilities": [], - "readme": "# Kubernetes kubectl get\nThe `kubectl get` codebundles run an arbitrary `kubectl get` command that fetches objects with json output. The results from the desired command are returned, filtered, and computed using json output and jmespath as directed by the configuration. \n\n## SLI\nThe SLI provides a single metric. First, the `kubectl get [parameters]` command can be input as desired with the results returned as json. The results can be further refined as needed through the `SEARCH_FILTER` and `CALCULATION_FIELD` configuration options, though most users will likely want to just copy/paste helpful `kubectl get` commands that they are familiar with (such as `kubectl get pods -l app=[labelname]`). \n\nThe SLI supports the following calculations: \n- **Count**: Returns the number of items returned from the query.\n- **Sum**: Sums up all values in the specified `calculation field` for all returned objects. \n- **Average**: Provides the average of all values in the specified `calculation field` for all returned objects. \n\n## Use Cases\n### Use Case: SLI: Query all Certificates that are **NOT** \"Ready\" in a namespace\nIn this use case, we can query a namespace for all certificate objects. This first configuration would return a count of certificates in the namespace: \n```\nCALCULATION='Count'\nSEARCH_FILTER=''\nKUBECTL_COMMAND='kubectl get certificates --namespace [my-namespace]'\nCALCULATION_FIELD=''\n```\n\nThis SLI might not be all too helpful in determining health, but it can be expanded to search for Certificates with a ready status that is **NOT** \"True\": \n```\nCALCULATION='Count'\nSEARCH_FILTER='status.conditions[?type==`Ready` && status!=`True`]'\nKUBECTL_COMMAND='kubectl get certificates --namespace [my-namespace]'\nCALCULATION_FIELD=''\n```\n\nWith this configuration, users could now apply an SLO to fire off alerts or TaskSets if this number is greater than 0 (since Certificates that aren't ready could cause problems). \n\n### Use Case: SLI: Count unhealthy Crossplane resources\nSee [here](https://docs.runwhen.com/public/use-cases/kubernetes-environments/crossplane-resources-health-check) for a very detailed use case on monitoring custom resources (using Crossplane managed resources as the example). \n\nIn this use case, we can query a cluster for the status of Crossplane managed resources (GKE clusters, Kubernetes Objects, Helm Releases): \n```\nDISTRIBUTION: Kubernetes\nKUBECTL_COMMAND: kubectl get clusters,objects,releases\nCALCULATION: Count\nCALCULATION_FIELD: ''\nSEARCH_FILTER: >-\n status.conditions[?(type==`Ready` && status!=`True`) || (type==`Synced` &&\n status!=`True`)]\n```\n\nWith this configuration, users could now apply an SLO to fire off alerts or TaskSets if any of these objects are considered NotReady.
\n\n\n### Use Case: SLI: Sum up all container restarts in a namespace\nIn this use case, we can query a namespace for all pods and add up every container restart: \n```\nCALCULATION='Sum'\nSEARCH_FILTER=''\nKUBECTL_COMMAND='kubectl get pods --namespace [my-namespace]'\nCALCULATION_FIELD='status.containerStatuses[].restartCount'\n```\n\nWith this configuration, users could now apply an SLO to fire off alerts or TaskSets if this number is abnormal (since a high number of container restarts could indicate an issue). \n\n\n### Use Case: SLI: Count all Flux HelmReleases that are **NOT** \"Ready\"\nIn this use case, we can query a cluster for HelmReleases that are NOT in a Ready state: \n```\nCALCULATION='Count'\nSEARCH_FILTER='status.conditions[?type==`Ready` && status!=`True`]'\nKUBECTL_COMMAND='kubectl get helmreleases.helm.toolkit.fluxcd.io --all-namespaces'\nCALCULATION_FIELD=''\n```\n\nWith this configuration, users could now apply an SLO to fire off alerts or TaskSets if this number is abnormal (since failing helmreleases might be affecting other services). \n\n\n### Use Case: SLI: Count all Kubernetes API Services\nIn this use case, we can query a cluster for a count of API Services: \n```\nCALCULATION='Count'\nSEARCH_FILTER=''\nKUBECTL_COMMAND='kubectl get apiservice'\nCALCULATION_FIELD=''\n```\n\n### Use Case: SLI: Count all Kubernetes API Services that are **NOT** \"Ready\"\nIn this use case, we can query a cluster for API Services that are NOT in a Ready state: \n```\nCALCULATION='Count'\nSEARCH_FILTER='status.conditions[?type==`Ready` && status!=`True`]'\nKUBECTL_COMMAND='kubectl get apiservice'\nCALCULATION_FIELD=''\n```\n\nWith this configuration, users could now apply an SLO to fire off alerts or TaskSets if this number is abnormal (since a failing apiservice might be affecting other services).\n\n### Use Case: SLI: Count all Services without Endpoints\nIn this use case, we can query a namespace for all services that do not have an associated endpoint: \n```\nCALCULATION='Count'\nSEARCH_FILTER='!subsets'\nKUBECTL_COMMAND='kubectl get endpoints -n [namespace]'\nCALCULATION_FIELD=''\n```\n> It may be desirable to have some services that do not have endpoints, and the associated SLO could account for this, but many general application deployments will ha", - "libraries": [ - "RW.K8s.K8sUtils", - "RW.K8s", - "RW.Utils", - "RW.platform", - "RW.Core" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/k8s-kubectl-get" - }, - { - "slug": "rw-public-codecollection-gitlab-get-repos-latency", - "collection_slug": "rw-public-codecollection", - "name": "gitlab-get-repos-latency", - "display_name": "gitlab-get-repos-latency", - "description": "Check GitLab latency by getting a list of repo names.", - "platform": "Unknown", - "author": "Vui Le", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check GitLab Latency With Get Repos" - ], - "capabilities": [], - "readme": "# GitLab Get Repository Latency\n\n## SLI\nCheck GitLab latency by getting a list of repo names.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core", - "RW.GitLab" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/gitlab-get-repos-latency" - }, - { - "slug": "rw-public-codecollection-dns-latency", - "collection_slug": "rw-public-codecollection", - "name": "dns-latency", - "display_name": "dns-latency", - "description": "Check DNS latency for Google Resolver.", - 
"platform": "Unknown", - "author": "Vui Le", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check DNS latency for Google Resolver" - ], - "capabilities": [ - "Check DNS latency for Google Resolver: Get DNS latency for Google resolver" - ], - "readme": "# DNS Latency\n\n## SLI \nCheck DNS latency for Google Resolver.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core", - "RW.DNS" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/dns-latency" - }, - { - "slug": "rw-public-codecollection-rest-generic", - "collection_slug": "rw-public-codecollection", - "name": "rest-generic", - "display_name": "rest-generic", - "description": "This codebundle is the general-purpose base OAuth2 REST SLI. It should be capable of extracting metric data from the vast majority of REST API endpoints.", - "platform": "Unknown", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Request Data From Rest Endpoint" - ], - "capabilities": [], - "readme": "# Rest Standard OAuth2\n\n## SLI\nThis codebundle is the general-purpose base OAuth2 REST SLI. It should be capable of extracting metric data from the vast majority of REST API endpoints.\n\nIn many cases you can use this generic REST codebundle. If you're able to request your data with a curl call then it should directly translate to this codebundle. If you're unable to perform your flow with this codebundle, then for differing workflows see the various `When to use` sections below. For an example of the codebundle in action you can deploy it with the default fields and it will return a metric of 1.\n\nThe flow of this codebundle is:\n1. Authenticate via implicit OAuth2 with a long-lived access token\n2. Receive the response as JSON\n3. Use the configured JSON path string to traverse the payload and extract data\n4. Push this extracted data as a metric value\n\nIn practice with the default values:\n- Performs a GET on `https://postman-echo.com/get?mygetparam=1`\n- The received data is `\"args\":{\"mygetparam\":\"1\"}`\n- Use the JSON path `args.mygetparam` to extract the value `1`\n- Push `1` as a metric\n\n## Use Cases\n- Extract application-specific data from an endpoint for use as a metric\n- Integrate with various REST APIs such as Prometheus\n- Translate your curl calls to regular healthchecks by extending this codebundle\n\n### When to use [this](https://docs.runwhen.com/public/v/codebundles/rest-generic) codebundle:\n- You're able to authenticate and fetch your data in a single curl call and you'd like to translate it to a codebundle\n- your authentication is achieved with a long-lived access token in the header\n\n### When to use [Basic Authentication](https://docs.runwhen.com/public/v/codebundles/rest-basicauth):\n- If your REST endpoint is still using the username & password approach for authentication. This codebundle contains fields for setting those secrets.\n\n### When to use [Explicit OAuth2 with Basic Authentication](https://docs.runwhen.com/public/v/codebundles/rest-explicitoauth2-basicauth):\n- If your REST endpoint needs an access token in order to request data. 
This codebundle contains fields for handling those secrets and an adjusted flow.\n- and if the authorization endpoint is authenticated via basic authentication in order to request an access token\n- or your bearer token is short-lived and needs to be routinely fetched\n\n### When to use [Explicit OAuth2 with Access token acquisition](https://docs.runwhen.com/public/v/codebundles/rest-explicitoauth2-tokenheader):\n- If your REST endpoint needs an access token in order to request data. This codebundle contains fields for handling those secrets and an adjusted flow.\n- and if the authorization endpoint is authenticated using a bearer token in order to request an access token\n- or your bearer token is short-lived and needs to be routinely fetched\n\n## Requirements\n### For this codebundle:\n- `URL` the HTTP url to perform a request against\n- `JSON_PATH` which is the json path string used to extract data. Explore https://jmespath.org/ for examples.\n- If you require authentication against the HTTP endpoint, provide a JSON string in the `HEADER` field describing your headers.\neg: `{\"Content-Type\":\"application/json\", \"my-header\":\"my-value\", \"Authorization\":\"Bearer mytoken\"}`\n### For basic auth:\n- `USERNAME` the username credential used to login for access\n- `PASSWORD` the password credential used when logging in for access\n### For OAuth2 With Basic Auth:\n- `AUTH_URL` the URL of the authorization endpoint used to request the token\n- `AUTH_TOKEN_JSON_PATH` the json path used to extract the token string from the authorization response\n> Plus username and password\n### For OAuth2 With Token:\n- `BEARER_TOKEN` is the long-lived token used to request an access token from the authorization endpoint\n> Plus the authorization endpoint fields from the oauth2 basic auth section\n\n\n## TODO\n- [ ] Add more use cases\n- [ ] Implement a smart variant\n", - "libraries": [ - "RW.Core", - "RW.Rest" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/rest-generic" - }, - { - "slug": "rw-public-codecollection-opsgenie-alert", - "collection_slug": "rw-public-codecollection", - "name": "opsgenie-alert", - "display_name": "opsgenie-alert", - "description": "Create an alert in Opsgenie.", - "platform": "Unknown", - "author": "Vui Le", - "support_tags": [ - "rw" - ], - "tasks": [ - "Get Opsgenie System Info", - "Create An Alert" - ], - "capabilities": [ - "Get Opsgenie System Info: Get information about the Opsgenie system.", - "Create An Alert: Create a new alert in Opsgenie." 
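The rest-generic flow documented above (GET, parse JSON, extract a value via a jmespath path, push it as a metric) fits in a few lines of Python. The `HEADER` handling here is a sketch of the documented field, not the RW.Rest implementation:

```python
import jmespath  # pip install jmespath
import requests  # pip install requests

# Defaults from the rest-generic readme: echo endpoint, path args.mygetparam -> "1".
URL = "https://postman-echo.com/get?mygetparam=1"
JSON_PATH = "args.mygetparam"
# For authenticated endpoints the bundle's HEADER field holds a JSON object,
# e.g. {"Authorization": "Bearer mytoken"}; the echo example needs none.
HEADER = {}

response = requests.get(URL, headers=HEADER, timeout=10)
response.raise_for_status()
value = jmespath.search(JSON_PATH, response.json())
print(float(value))  # the extracted datum, pushed as the metric by the bundle
```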
- ], - "readme": "# Opsgenie Alert\n\n## TaskSet\nCreate an alert in Opsgenie.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core", - "RW.Opsgenie" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/opsgenie-alert" - }, - { - "slug": "rw-public-codecollection-github-status-incidents", - "collection_slug": "rw-public-codecollection", - "name": "github-status-incidents", - "display_name": "github-status-incidents", - "description": "Check for unresolved incidents related to GitHub services, and provides a count of ongoing incidents as a metric.", - "platform": "Unknown", - "author": "Paul Dittaro", - "support_tags": [ - "rw" - ], - "tasks": [ - "Get Number of Incidents Affecting GitHub" - ], - "capabilities": [], - "readme": "# GitHub Status - Incidents\n\n## SLI\nCheck for unresolved incidents related to GitHub services, and provides a count of ongoing incidents as a metric.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core", - "RW.GitHub.Status" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/github-status-incidents" - }, - { - "slug": "rw-public-codecollection-aws-cloudformation-stackevents-count", - "collection_slug": "rw-public-codecollection", - "name": "aws-cloudformation-stackevents-count", - "display_name": "aws-cloudformation-stackevents-count", - "description": "Retrieve the number of detected AWS CloudFormation stack events over a given history", - "platform": "AWS", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Fetch CloudFormation Stack Events" - ], - "capabilities": [], - "readme": "# AWS CloudFormation Stack Events Count\n\n## SLI \nRetrieve the number of detected AWS CloudFormation stack events over a given history\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core", - "RW.AWS.CloudFormation" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/aws-cloudformation-stackevents-count" - }, - { - "slug": "rw-public-codecollection-http-latency", - "collection_slug": "rw-public-codecollection", - "name": "http-latency", - "display_name": "http-latency", - "description": "Measure HTTP latency against a given URL.", - "platform": "Unknown", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check HTTP Latency to Well Known URL" - ], - "capabilities": [], - "readme": "# HTTP Latency\n\n## SLI\nMeasure HTTP latency against a given URL.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.HTTP", - "RW.Core" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/http-latency" - }, - { - "slug": "rw-public-codecollection-github-get-repos-latency", - "collection_slug": "rw-public-codecollection", - "name": "github-get-repos-latency", - "display_name": "github-get-repos-latency", - "description": "Check GitHub latency by getting a list of repo names.", - "platform": "Unknown", - "author": "Vui Le", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check GitHub Latency With Get Repos", - "Check Latency When Creating a New GitHub Issue" - ], - "capabilities": [], - "readme": "# GitHub Get Repository Latency\n\n## SLI\nCheck GitHub latency by getting a list of repo names.\n\n## TaskSet \nCreate a new issue in 
GitHub Issues.\n\n## Use Cases\n\n## Requirements\n\n## TODO\n- [ ] Add additional documentation", - "libraries": [ - "RW.Core", - "RW.GitHub" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/github-get-repos-latency" - }, - { - "slug": "rw-public-codecollection-gcp-gcloudcli-generic", - "collection_slug": "rw-public-codecollection", - "name": "gcp-gcloudcli-generic", - "display_name": "gcp-gcloudcli-generic", - "description": "These two codebundles can be used to run arbitrary gcloud commands to perform automated tasks, capture output for a report, or return a metric for surfacing in an SLI.", - "platform": "GCP", - "author": "Jonathan Funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Run Gcloud CLI Command and Push metric", - "Run Gcloud CLI Command and Push metric" - ], - "capabilities": [], - "readme": "# Run Generic Gcloud Commands\nThese two codebundles can be used to run arbitrary gcloud commands to perform automated tasks, capture output for a report, or return a metric for surfacing in an SLI.\n\n> Note: the `gcloud auth activate-service-account` call is done for you implicitly, so there's no need to add it into your command string.\n\n## SLI \nA gcloud SLI for querying and extracting data from a generic gcloud call. Uses the hosted gcloud service, supports jq for parsing, and should produce a single metric.\n\n## TaskSet\nRun a gcloud cli command and capture its output for use in a report, such as logs, restarting a VM, etc.\n\n## Use Cases\n### SLI: Get Number of Error Logs\nThis example uses the SLI to fetch up to 20 warning/error log entries from the last 15 minutes as json, before counting the number of entries and providing it as a metric for your SLI. \n\n```\nGCLOUD_COMMAND='gcloud logging read \"severity>=WARNING\" --freshness=15m --limit=20 --format=json | jq length'\n```\n\n### TaskSet: Fetch Last 5 Errors and Present in Report\nThis example uses the TaskSet variant of the codebundle to fetch stdout and place it into a report on the platform for display to users. In this case we're adding the last 5 warning/error log entries to a report (the entries will default to yaml)\n\n```\nGCLOUD_COMMAND='gcloud logging read \"severity>=WARNING\" --freshness=15m --limit=5'\n```\n\n## Requirements\n- The gcloud command string you'd like to run\n- A service account credentials json file to be used for authentication\n\n## TODO\n- [ ] Expand on examples\n- [ ] Determine if/what other gcloud plugins need to be installed for complex use cases", - "libraries": [ - "RW.Core", - "RW.GCP.GCloudCLI", - "RW.Utils" - ], - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/codebundles/gcp-gcloudcli-generic" - }, - { - "slug": "rw-cli-codecollection-k8s-restart-resource", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-restart-resource", - "display_name": "k8s-restart-resource", - "description": "Restarts a kubernetes resource in an attempt to get it out of a bad state. 
This would typically be used in conjunction with other", - "platform": "Kubernetes", - "author": "jon-funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Get Current Resource State with Labels `${LABELS}`", - "Get Resource Logs with Labels `${LABELS}`", - "Restart Resource with Labels `${LABELS}` in `${CONTEXT}`" - ], - "capabilities": [ - "Get Current Resource State with Labels `${LABELS}`: Gets the current state of the resource before applying the restart for report review.", - "Get Resource Logs with Labels `${LABELS}`: Collects the last approximately 200 lines of logs from the resource before restarting it.", - "Restart Resource with Labels `${LABELS}` in `${CONTEXT}`: Restarts the labeled resource in an attempt to get it out of a bad state." - ], - "readme": "# Kubernetes Restart Resource\n\nRestarts a kubernetes resource in an attempt to get it out of a bad state. This would typically be used in conjunction with other\ntasksets after collecting some information about the resource and what state it is in. This taskset supports Deployments, DaemonSets, and StatefulSets.\nIt applies a `rollout restart` to the resource to respect rollout strategies and avoid downtime provided the resource is highly available.\n\n## Tasks\n`Get Current Resource State`\n`Get Resource Logs`\n`Restart Resource`\n\n## Configuration\n\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `kubeconfig`: The kubeconfig secret containing access info for the cluster.\n- `kubectl`: The location service used to interpret shell commands. Default value is `kubectl-service.shared`.\n- `KUBERNETES_DISTRIBUTION_BINARY`: Which binary to use for Kubernetes CLI commands. Default value is `kubectl`.\n- `CONTEXT`: The Kubernetes context to operate within.\n- `NAMESPACE`: The name of the namespace to search.\n- `LABELS`: The set of kubernetes labels used in the selector for the resource. Ensure this is specific enough to get the exact resource you want to restart.\n\n## Notes\n\nPlease note that these checks require Kubernetes RBAC exec permissions for the service account used.\n\n## TODO\n- [ ] Add documentation\n- [ ] Refine raised issues", - "libraries": [ - "DateTime", - "Collections", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-restart-resource" - }, - { - "slug": "rw-cli-codecollection-azure-acr-health", - "collection_slug": "rw-cli-codecollection", - "name": "azure-acr-health", - "display_name": "azure-acr-health", - "description": "This bundle provides comprehensive health checks for Azure Container Registries (ACR), including network configuration analysis, resource health monitoring, authentication testing, storage utilization analysis, pull/push metrics, and security assessments. 
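The k8s-restart-resource sequence above (capture state, capture logs, rolling restart) maps directly onto three kubectl invocations. A sketch with hypothetical CONTEXT/NAMESPACE/LABELS values, not the bundle's Robot Framework tasks:

```python
import subprocess

# Hypothetical configuration; the bundle imports these as user variables.
CONTEXT, NAMESPACE, LABELS = "my-cluster", "my-namespace", "app=my-app"

def kubectl(*args: str) -> str:
    cmd = ["kubectl", "--context", CONTEXT, "-n", NAMESPACE, *args]
    return subprocess.run(cmd, capture_output=True, text=True, check=True).stdout

# 1. Record the current state for the report before touching anything.
state = kubectl("get", "deployments", "-l", LABELS, "-o", "wide")
# 2. Grab roughly the last 200 log lines, per the task description.
logs = kubectl("logs", "-l", LABELS, "--tail=200", "--all-containers=true")
# 3. Rolling restart: respects the rollout strategy, so a highly available
#    resource restarts without downtime.
kubectl("rollout", "restart", "deployment", "-l", LABELS)
print(state, logs, sep="\n---\n")
```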
It uses Robot Framework tasks and Bash scripts to collect, parse, and score ACR health with detailed troubleshooting guidance.", - "platform": "Azure", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check ACR Reachability for Registry `${ACR_NAME}`", - "Check ACR Usage SKU Metric for Registry `${ACR_NAME}`", - "Check ACR Pull/Push Success Ratio for Registry `${ACR_NAME}`", - "Check ACR Storage Utilization for Registry `${ACR_NAME}`", - "Check ACR Network Configuration for Registry `${ACR_NAME}`", - "Check ACR Security Configuration", - "Generate Comprehensive ACR Health Score for Registry `${ACR_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Network Configuration for ACR `${ACR_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check DNS & TLS Reachability for Registry `${ACR_NAME}`", - "Check ACR Login & Authentication for Registry `${ACR_NAME}`", - "Check ACR SKU and Usage Metrics for Registry `${ACR_NAME}`", - "Check ACR Storage Utilization for Registry `${ACR_NAME}`", - "Analyze ACR Pull/Push Success Ratio for Registry `${ACR_NAME}`", - "Check ACR Repository Event Failures for Registry `${ACR_NAME}`", - "Check ACR Security Configuration and RBAC for Registry `${ACR_NAME}`" - ], - "capabilities": [ - "Check ACR Reachability for Registry `${ACR_NAME}`: Checks if the ACR endpoint is reachable.", - "Check ACR Usage SKU Metric for Registry `${ACR_NAME}`: Checks the SKU and usage limits for the ACR.", - "Check ACR Pull/Push Success Ratio for Registry `${ACR_NAME}`: Checks the success rate of image pull and push operations.", - "Check ACR Storage Utilization for Registry `${ACR_NAME}`: Checks the storage usage of the ACR.", - "Check ACR Network Configuration for Registry `${ACR_NAME}`: Checks network access rules, private endpoints, and connectivity.", - "Check ACR Security Configuration: Analyzes ACR security configuration including RBAC, admin user settings, network access, and authentication methods.", - "Generate Comprehensive ACR Health Score for Registry `${ACR_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`: Aggregates all health check scores into a comprehensive health score.", - "Check Network Configuration for ACR `${ACR_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Analyze network access rules, private endpoints, firewall settings, and connectivity.", - "Check DNS & TLS Reachability for Registry `${ACR_NAME}`: Verifies DNS resolution and HTTPS/TLS for ACR endpoint.", - "Check ACR Login & Authentication for Registry `${ACR_NAME}`: Attempts az acr login and docker login using intended workload identity.", - "Check ACR SKU and Usage Metrics for Registry `${ACR_NAME}`: Analyzes ACR SKU configuration, usage limits, and provides recommendations.", - "Check ACR Storage Utilization for Registry `${ACR_NAME}`: Comprehensive analysis of ACR storage usage, repository sizes, and cleanup recommendations.", - "Analyze ACR Pull/Push Success Ratio for Registry `${ACR_NAME}`: Analyzes pull and push operation success rates using Azure Monitor metrics and Log Analytics.", - "Check ACR Repository Event Failures for Registry `${ACR_NAME}`: Queries Log Analytics for recent failed pushes/pulls and repo errors.", - "Check ACR Security Configuration and RBAC for Registry `${ACR_NAME}`: Comprehensive security analysis of ACR including RBAC assignments, admin user status," - ], - "readme": "# Azure Container Registry (ACR) Health Bundle\n\nThis bundle provides comprehensive health checks for Azure Container Registries (ACR), including network configuration 
analysis, resource health monitoring, authentication testing, storage utilization analysis, pull/push metrics, and security assessments. It uses Robot Framework tasks and Bash scripts to collect, parse, and score ACR health with detailed troubleshooting guidance.\n\n## Included Health Checks\n\n\n- **Network Configuration**: Analyzes network access rules, private endpoints, firewall settings, and connectivity\n- **DNS & TLS Reachability**: Verifies DNS resolution and HTTPS/TLS connectivity to ACR endpoint\n- **Authentication & Authorization**: Tests login capabilities and analyzes authentication methods\n- **SKU & Usage Analysis**: Comprehensive analysis of ACR SKU, usage limits, and recommendations\n- **Storage Utilization**: Detailed storage usage analysis with cleanup recommendations and retention policy checks\n- **Pull/Push Success Metrics**: Analyzes operation success rates using Azure Monitor and Log Analytics\n- **Repository Events**: Queries Log Analytics for failed operations and error patterns\n- **Geo-replication Health**: Checks multi-region replication status (Premium SKU)\n- **Webhook Configuration**: Validates webhook endpoints and connectivity\n\n## Main Tasks\n\n### Runbook Tasks (Issue Detection & Remediation)\n\n- `Check Network Configuration for ACR`\n- `Check DNS & TLS Reachability for Registry`\n- `Check ACR Login & Authentication`\n- `Check ACR SKU and Usage Metrics`\n- `Check ACR Storage Utilization`\n- `Analyze ACR Pull/Push Success Ratio`\n- `Check ACR Repository Event Failures`\n- `Check ACR Security Configuration and RBAC`\n\n### SLI Tasks (Health Scoring)\n- `Check ACR Reachability`\n- `Check ACR Usage SKU Metric`\n- `Check ACR Pull/Push Success Ratio`\n- `Check ACR Storage Utilization`\n- `Check ACR Network Configuration`\n\n- `Check ACR Security Configuration`\n- `Generate Comprehensive ACR Health Score`\n\n## How It Works\n\n1. **Comprehensive Bash scripts** collect detailed data from Azure Container Registry, Azure Resource Health API, Azure Monitor, and Log Analytics\n2. **Robot Framework tasks** execute scripts, parse structured JSON output, and generate actionable issues with severity classification\n3. **Advanced error handling** provides detailed troubleshooting context including network configuration, IP whitelists, and authentication methods\n4. **Portal URL generation** for easy navigation to relevant Azure portal sections\n5. 
**SLI tasks** aggregate results into a comprehensive health score for monitoring\n\n## Configuration\n\n### Required Environment Variables\n\n- `AZURE_SUBSCRIPTION_ID`: The Azure subscription ID\n- `AZ_RESOURCE_GROUP`: The resource group containing the ACR\n- `ACR_NAME`: Azure Container Registry name\n- `ACR_PASSWORD`: ACR admin password or service principal credential (secret)\n\n### Optional Configuration Variables\n\n- `AZURE_SUBSCRIPTION_NAME`: Friendly name for the subscription (default: \"subscription-01\")\n- `LOG_WORKSPACE_ID`: Log Analytics Workspace ID for detailed event analysis\n- `USAGE_THRESHOLD`: Storage usage threshold percentage (default: 80)\n- `CRITICAL_THRESHOLD`: Critical storage threshold percentage (default: 95)\n- `TIME_PERIOD_HOURS`: Time period for pull/push analysis in hours (default: 24)\n- `PULL_SUCCESS_THRESHOLD`: Pull success rate threshold percentage (default: 95)\n- `PUSH_SUCCESS_THRESHOLD`: Push success rate threshold percentage (default: 98)\n\n### Example Usage\n\n```bash\n# Set required variables\nexport AZURE_SUBSCRIPTION_ID=\"your-subscription-id\"\nexport AZ_RESOURCE_GROUP=\"your-resource-group\"\nexport ACR_NAME=\"your-acr-name\"\nexport LOG_WORKSPACE_ID=\"your-log-analytics-workspace-id\"\n\n# Run comprehensive health check\nrobot runbook.robot\n\n# Run SLI scoring only\nrobot sli.robot\n```\n\n## Directory Structure\n\n### Core Files\n- `runbook.robot` - Main runbook with comprehensive health checks and issue generation\n- `sli.robot` - Service Level Indicator scoring for monitoring integration\n\n### Health Check Scripts\n\n- `acr_network_config.sh` - Network configuration and connectivity analysis\n- `acr_reachability.sh` - DNS and TLS connectivity testing\n- `acr_authentication.sh` - Authentication and login testing\n- `acr_usage_sku.sh` - SKU analysis and usage recommendations\n- `acr_storage_utilization.sh` - Comprehensive storage analysis with cleanup guidance\n- `acr_pull_push_ratio.sh` - Pull/push success rate analysis with Azure Monitor integration\n- `acr_events.sh` - Log Analytics event analysis\n- `acr_rbac_security.sh` - Security configuration and RBAC analysis\n\n### Test Infrastructure\n- `.test/terraform/` - Comprehensive Terraform infrastructure for testing\n - Creates Premium and Basic ACR instances\n - Log Analytics workspace with diagnostic settings\n - Virtual network and private endpoint configuration\n - RBAC assignments and webhook testing\n - Sample repository data for testing\n\n## Features\n\n### Advanced Error Handling\n-", - "libraries": [ - "Azure", - "Collections", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/azure-acr-health" - }, - { - "slug": "rw-cli-codecollection-azure-appservice-functionapp-health", - "collection_slug": "rw-cli-codecollection", - "name": "azure-appservice-functionapp-health", - "display_name": "azure-appservice-functionapp-health", - "description": "Checks key Function App metrics, individual function invocations, service plan utilization, fetches logs, config and activities for the service and generates a report of present issues for any found.", - "platform": "Azure", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check for Resource Health Issues Affecting Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Function App `${FUNCTION_APP_NAME}` Health Check Metrics In Resource Group 
`${AZ_RESOURCE_GROUP}`", - "Check Function App `${FUNCTION_APP_NAME}` Configuration Health In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Deployment Health of Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Fetch Function App `${FUNCTION_APP_NAME}` Activities In Resource Group `${AZ_RESOURCE_GROUP}`", - "# RW.Core.Add Pre To Report ${log_errors.stdout}", - "# ${issue_list}= Evaluate json.loads(r'''${issues.stdout}''') json", - "# END", - "Generate Function App Health Score for `${FUNCTION_APP_NAME}` in resource group `${AZ_RESOURCE_GROUP}`", - "Check for Resource Health Issues Affecting Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Log Every Function Invocation Result for Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Analyze Function Failure Patterns for Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Function App `${FUNCTION_APP_NAME}` Health in Resource Group `${AZ_RESOURCE_GROUP}`", - "Fetch Function App `${FUNCTION_APP_NAME}` Plan Utilization Metrics In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Individual Function Invocations Health for Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Get Function App `${FUNCTION_APP_NAME}` Logs and Analyze Errors In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Configuration Health of Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Deployment Health of Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Fetch Function App `${FUNCTION_APP_NAME}` Activities In Resource Group `${AZ_RESOURCE_GROUP}`", - "Fetch Azure Recommendations and Notifications for Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Recent Activities for Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Diagnostic Logs for Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`" - ], - "capabilities": [ - "Check for Resource Health Issues Affecting Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Fetch a list of issues that might affect the Function App as reported from Azure.", - "Check Function App `${FUNCTION_APP_NAME}` Health Check Metrics In Resource Group `${AZ_RESOURCE_GROUP}`: Checks the health check metric of a appservice workload. If issues are generated with severity 1 or 2, the score is 0 / unhealthy.", - "Check Function App `${FUNCTION_APP_NAME}` Configuration Health In Resource Group `${AZ_RESOURCE_GROUP}`: Checks the configuration health of a appservice workload. 
1 = healthy, 0 = unhealthy.", - "Check Deployment Health of Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Fetch deployment health of the Function App", - "Fetch Function App `${FUNCTION_APP_NAME}` Activities In Resource Group `${AZ_RESOURCE_GROUP}`: Gets the events of the appservice and checks for errors", - "Check for Resource Health Issues Affecting Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Fetch a list of issues that might affect the Function App as reported from Azure.", - "Log Every Function Invocation Result for Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Enhanced logging of every function invocation with detailed success/failure tracking and performance metrics.", - "Analyze Function Failure Patterns for Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Enhanced failure pattern analysis with temporal correlation and structured data collection.", - "Check Function App `${FUNCTION_APP_NAME}` Health in Resource Group `${AZ_RESOURCE_GROUP}`: Checks the health status of an appservice workload.", - "Fetch Function App `${FUNCTION_APP_NAME}` Plan Utilization Metrics In Resource Group `${AZ_RESOURCE_GROUP}`: Reviews key metrics for the Function App plan and generates a report", - "Check Individual Function Invocations Health for Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Analyzes the health and metrics of individual function invocations, including execution counts, errors, throttles, and performance metrics.", - "Get Function App `${FUNCTION_APP_NAME}` Logs and Analyze Errors In Resource Group `${AZ_RESOURCE_GROUP}`: Fetch logs of the appservice workload and analyze for errors", - "Check Configuration Health of Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Fetch the configuration health of the Function App", - "Check Deployment Health of Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Fetch deployment health of the Function App", - "Fetch Function App `${FUNCTION_APP_NAME}` Activities In Resource Group `${AZ_RESOURCE_GROUP}`: Gets the events of the function app and checks for start/stop operations and errors", - "Fetch Azure Recommendations and Notifications for Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Fetch Azure Advisor recommendations, Service Health notifications, and security assessments for the Function App", - "Check Recent Activities for Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Analyze recent Azure activities for the Function App, including critical operations and user actions.", - "Check Diagnostic Logs for Function App `${FUNCTION_APP_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Check for diagnostic logs configuration and search them for relevant events if they exist." 
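The invocation-tracking capabilities above boil down to bucketing each function by volume and error rate. A toy categorizer in that spirit; the 10% threshold mirrors the bundle's documented FUNCTION_ERROR_RATE_THRESHOLD default, but the exact scoring logic here is an illustration, not the bundle's script:

```python
from dataclasses import dataclass

ERROR_RATE_THRESHOLD = 10.0  # percent; FUNCTION_ERROR_RATE_THRESHOLD default

@dataclass
class FunctionStats:
    name: str
    invocations: int
    failures: int

def categorize(stats: FunctionStats) -> str:
    # Bucket a function as Healthy / Warning / Unhealthy / Idle.
    if stats.invocations == 0:
        return "Idle"
    error_rate = 100.0 * stats.failures / stats.invocations
    if error_rate >= ERROR_RATE_THRESHOLD:
        return "Unhealthy"
    return "Warning" if stats.failures else "Healthy"

for fn in (FunctionStats("ingest", 120, 30), FunctionStats("cleanup", 0, 0)):
    print(fn.name, "->", categorize(fn))
```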
- ], - "readme": "# Azure Function App Health Triage\nChecks key Function App metrics, individual function invocations, service plan utilization, fetches logs, config and activities for the service and generates a report of present issues for any found.\n\n## Enhanced Features\n\nThis codebundle now includes advanced monitoring and analysis capabilities:\n\n### \ud83d\udd0d Enhanced Invocation Logging\n- **Comprehensive tracking**: Logs every function invocation with detailed success/failure analysis\n- **Performance metrics**: Captures duration patterns, memory usage, and execution trends\n- **Health scoring**: Automatically categorizes function health status (Healthy/Warning/Unhealthy/Idle)\n- **Time-series analysis**: Provides detailed breakdowns of function performance over time\n\n### \ud83d\udea8 Advanced Failure Analysis\n- **Error categorization**: Classifies errors into types (Timeout, Memory, Throttling, Dependency, etc.)\n- **Temporal pattern detection**: Identifies patterns like Sporadic, Single_Incident, Spike, Recurring\n- **Health scoring**: Calculates comprehensive health scores (0-100) based on error rates, duration, and memory usage\n- **Structured reporting**: Generates detailed issue reports with actionable next steps\n\n## Configuration\n\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\nexport FUNCTION_APP_NAME\nexport AZ_RESOURCE_GROUP\nexport AZURE_RESOURCE_SUBSCRIPTION_ID\n\n## Optional Configuration Variables\n\nThe following variables can be customized to adjust thresholds for issue detection:\n\n- `RW_LOOKBACK_WINDOW`: Time period to look back for activities/events (default: 10)\n- `TIME_PERIOD_DAYS`: Time period to look back for recommendations (default: 7)\n- `CPU_THRESHOLD`: CPU % threshold for issues (default: 80)\n- `REQUESTS_THRESHOLD`: Requests/s threshold for issues (default: 1000)\n- `HTTP5XX_THRESHOLD`: HTTP 5XX errors/s threshold (default: 5)\n- `HTTP4XX_THRESHOLD`: HTTP 4XX errors/s threshold (default: 200)\n- `DISK_USAGE_THRESHOLD`: Disk usage % threshold (default: 90)\n- `AVG_RSP_TIME`: Average response time threshold in ms (default: 300)\n- `FUNCTION_ERROR_RATE_THRESHOLD`: Function error rate % threshold (default: 10)\n- `FUNCTION_MEMORY_THRESHOLD`: Function memory usage threshold in MB (default: 512)\n- `FUNCTION_DURATION_THRESHOLD`: Function execution duration threshold in ms (default: 5000)\n\n### Advanced Execution Units Monitoring\n\nThe codebundle includes intelligent execution units monitoring with baseline comparison and anomaly detection:\n\n- `EXECUTION_UNITS_COST_THRESHOLD`: Static threshold for cost alerts - represents ~$500/month at default (default: 10000000)\n- `EXECUTION_UNITS_ANOMALY_MULTIPLIER`: Multiplier for anomaly detection - alerts when execution units are X times higher than baseline (default: 5)\n- `BASELINE_LOOKBACK_DAYS`: Number of days to look back for baseline calculation (default: 7)\n\n**How it works:**\n1. **Cost Alert**: Always triggers when execution units exceed the static threshold (helps with budget management)\n2. **Anomaly Alert**: Compares current usage to the same time period N days ago and alerts if usage is significantly higher than normal\n3. 
**Baseline Calculation**: Uses historical Azure Monitor data to establish normal usage patterns for your specific Function App\n\n## Features\n\n- **Resource Health Check**: Monitors Azure resource health status\n- **Function App Health**: Checks overall Function App health metrics\n- **Plan Utilization**: Analyzes App Service Plan utilization metrics\n- **Individual Function Invocations**: Detailed analysis of each function's performance, errors, and throttles\n- **Enhanced Invocation Logging**: \ud83c\udd95 Comprehensive logging of every function invocation with success/failure tracking\n- **Advanced Failure Analysis**: \ud83c\udd95 Pattern detection and error categorization with structured data output\n- **Log Analysis**: \ud83c\udd95 Consolidated log retrieval and error analysis in a single task\n- **Configuration Health**: Checks Function App configuration\n- **Deployment Health**: Monitors deployment status\n- **Activity Monitoring**: Tracks recent activities and events with focus on start/stop operations\n- **Start/Stop Operations**: Creates severity 4 issues for function app start/stop/restart operations with user details\n- **Recommendations**: Fetches Azure Advisor recommendations\n\n## Enhanced Scripts\n\n### `function_invocation_logger.sh`\nProvides detailed logging of every function invocation:\n- Tracks success/failure counts and rates\n- Analyzes duration patterns (avg/max/min)\n- Categorizes function health status\n- Generates comprehensive JSON output with per-function metrics\n- Creates Robot Framework issues for invocation problems\n\n### `function_failure_analysis.sh` \nAdvanced failure pattern analysis:\n- Detects temporal failure patterns\n- Categorizes error types automatically\n- Calculates health scores for each function\n- Generates LLM-ready structured data for further analysis\n\n## Testing\n\n### Integration Tests\nThe codebundl", - "libraries": [ - "DateTime", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/azure-appservice-functionapp-health" - }, - { - "slug": "rw-cli-codecollection-gh-actions-artifact-analysis", - "collection_slug": "rw-cli-codecollection", - "name": "gh-actions-artifact-analysis", - "display_name": "gh-actions-artifact-analysis", - "description": "This codebundle is highly configurable and integrates with GitHub Actions and workflow artifacts. It downloads a specified artifact from the last workflow run, analyzes the artifact with a user provided command (typically using linux / bash tools like jq)", - "platform": "Unknown", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Analyze artifact from GitHub Workflow `${WORKFLOW_NAME}` in repository `${GITHUB_REPO}` and push metric", - "Analyze artifact from GitHub workflow `${WORKFLOW_NAME}` in repository `${GITHUB_REPO}`" - ], - "capabilities": [ - "Analyze artifact from GitHub Workflow `${WORKFLOW_NAME}` in repository `${GITHUB_REPO}` and push metric: Check GitHub workflow status, run a user provided analysis command, and push the metric. The analysis command should result in a single metric.", - "Analyze artifact from GitHub workflow `${WORKFLOW_NAME}` in repository `${GITHUB_REPO}`: Check GitHub workflow status and analyze artifact with a user provided command." - ], - "readme": "# GitHub Actions Artifact Analysis\nThis codebundle is highly configurable and integrates with GitHub Actions and workflow artifacts. 
It downloads a specified artifact from the last workflow run, analyzes the artifact with a user provided command (typically using linux / bash tools like jq) \n\n## SLI\nThis SLI downloads the artifact from the latest run of the GitHub Actions workflow, runs the analysis command (which must result in a metric), and pushes the metric to the RunWhen Platform. \n\n## TaskSet\nThis TaskSet downloads the artifact from the latest GitHub Actions workflow run, executes the analysis command, and adds the details to the report. It can also generate Issues if: \n- a user specified string is found in the report output\n- the latest run didn't complete successfully\n- the latest run is older than the desired time period ($PERIOD_HOURS)\n", - "libraries": [ - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/gh-actions-artifact-analysis" - }, - { - "slug": "rw-cli-codecollection-k8s-podresources-health", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-podresources-health", - "display_name": "k8s-podresources-health", - "description": "", - "platform": "Kubernetes", - "author": "jon-funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Show Pods Without Resource Limit or Resource Requests Set in Namespace `${NAMESPACE}`", - "Check Pod Resource Utilization with Top in Namespace `${NAMESPACE}`", - "Identify VPA Pod Resource Recommendations in Namespace `${NAMESPACE}`", - "Identify Overutilized Pods in Namespace `${NAMESPACE}`" - ], - "capabilities": [ - "Show Pods Without Resource Limit or Resource Requests Set in Namespace `${NAMESPACE}`: Scans a list of pods in a namespace using labels as a selector and checks if their resources are set.", - "Check Pod Resource Utilization with Top in Namespace `${NAMESPACE}`: Performs a top command on a list of labeled workloads to check pod resources.", - "Identify VPA Pod Resource Recommendations in Namespace `${NAMESPACE}`: Queries the namespace for any Vertical Pod Autoscaler resource recommendations.", - "Identify Overutilized Pods in Namespace `${NAMESPACE}`: Scans the namespace for pods that are over utilizing resources or may be experiencing resource problems like oomkills or restarts." - ], - "readme": "", - "libraries": [ - "DateTime", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-podresources-health" - }, - { - "slug": "rw-cli-codecollection-azure-loadbalancer-triage", - "collection_slug": "rw-cli-codecollection", - "name": "azure-loadbalancer-triage", - "display_name": "azure-loadbalancer-triage", - "description": "Queries the activity logs of internal loadbalancers (AKS ingress) objects in Azure and optionally inspects internal AKS ingress objects if available.", - "platform": "Azure", - "author": "jon-funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check Activity Logs for Azure Load Balancer `${AZ_LB_NAME}`" - ], - "capabilities": [ - "Check Activity Logs for Azure Load Balancer `${AZ_LB_NAME}`: Queries an Azure Load Balancer's health probe to determine if it's in a healthy state." 
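For the gh-actions-artifact-analysis flow above, the GitHub REST API provides everything the bundle needs: the latest run of a workflow, its artifacts, and a zip download. A sketch with hypothetical repo/workflow/artifact names and a placeholder token; `artifact/report.json` and the jq call stand in for the user-provided analysis command:

```python
import io
import subprocess
import zipfile

import requests  # pip install requests

# Hypothetical values mirroring the bundle's configuration fields.
GITHUB_REPO, WORKFLOW_NAME, ARTIFACT_NAME = "my-org/my-repo", "ci.yaml", "results"
HEADERS = {"Authorization": "Bearer <token>"}  # needs actions:read scope
API = f"https://api.github.com/repos/{GITHUB_REPO}/actions"

# Latest run of the workflow, then its named artifact.
run = requests.get(f"{API}/workflows/{WORKFLOW_NAME}/runs?per_page=1",
                   headers=HEADERS).json()["workflow_runs"][0]
artifacts = requests.get(f"{API}/runs/{run['id']}/artifacts",
                         headers=HEADERS).json()["artifacts"]
artifact = next(a for a in artifacts if a["name"] == ARTIFACT_NAME)

# Artifacts download as zip archives; unpack, then run the analysis command.
blob = requests.get(artifact["archive_download_url"], headers=HEADERS).content
zipfile.ZipFile(io.BytesIO(blob)).extractall("artifact")
print(subprocess.run("jq length artifact/report.json", shell=True,
                     capture_output=True, text=True).stdout)
```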
- ], - "readme": "# Azure LoadBalancer Triage\n\nQueries the activity logs of internal loadbalancers (AKS ingress) objects in Azure and optionally inspects internal AKS ingress objects if available.\n\n## Tasks\n`Health Check Internal Azure Load Balancer`\n\n## Configuration\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `AZ_USERNAME`: Azure service account username secret used to authenticate.\n- `AZ_CLIENT_SECRET`: Azure service account client secret used to authenticate.\n- `AZ_TENANT`: Azure tenant ID used to authenticate to.\n- `AZ_HISTORY_RANGE`: The history range to inspect for incidents in the activity log, in hours. Defaults to 24 hours.\n\n## Requirements\n- A kubeconfig with appropriate RBAC permissions to perform the desired command.\n\n## TODO\n- [ ] Refine issues raised\n- [ ] Array support for issues\n- [ ] Look at cross az/kubectl for better triage\n- [ ] Add additional documentation.\n\n", - "libraries": [ - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/azure-loadbalancer-triage" - }, - { - "slug": "rw-cli-codecollection-gcp-cloud-function-health", - "collection_slug": "rw-cli-codecollection", - "name": "gcp-cloud-function-health", - "display_name": "gcp-cloud-function-health", - "description": "This code checks if any GCP (Google Cloud Platform) cloud functions are unhealthy. It uses the gcloud command-line tool to interact with GCP APIs and retrieve the necessary information.", - "platform": "GCP", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Count unhealthy GCP Cloud Functions in GCP Project `${GCP_PROJECT_ID}`", - "List Unhealthy Cloud Functions in GCP Project `${GCP_PROJECT_ID}`", - "Get Error Logs for Unhealthy Cloud Functions in GCP Project `${GCP_PROJECT_ID}`" - ], - "capabilities": [ - "Count unhealthy GCP Cloud Functions in GCP Project `${GCP_PROJECT_ID}`: Counts all GCP Functions that are not in a Healthy state", - "List Unhealthy Cloud Functions in GCP Project `${GCP_PROJECT_ID}`: Fetches a list of GCP Cloud Functions that are not healthy.", - "Get Error Logs for Unhealthy Cloud Functions in GCP Project `${GCP_PROJECT_ID}`: Fetches GCP logs related to unhealthy Cloud Functions within the last 14 days" - ], - "readme": "# GCP Cloud Function Health\nThis code checks if any GCP (Google Cloud Platform) cloud functions are unhealthy. It uses the gcloud command-line tool to interact with GCP APIs and retrieve the necessary information.\n\n> Note: Only cloud functions v1 is supported at this time for automatic discovery with the RunWhen Local Discovery Process. The tasks will support either generation. \n\n## SLI\nThe SLI counts the number of cloud functions that are \"FAILED\" state and pushes the metric. 
\n\n## TaskSet \nThe TaskSet provides the following tasks: \n\n- List Unhealthy Cloud Functions in GCP Project\n- Get Error Logs for Unhealthy Cloud Functions in GCP Project\n\n## Requirements\nThe following permissions are required on the GCP service account used with the gcloud utility: \n\n - `cloudfunctions.functions.get`\n - `cloudfunctions.functions.list`", - "libraries": [ - "DateTime", - "Collections", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/gcp-cloud-function-health" - }, - { - "slug": "rw-cli-codecollection-azure-appservice-plan-health", - "collection_slug": "rw-cli-codecollection", - "name": "azure-appservice-plan-health", - "display_name": "azure-appservice-plan-health", - "description": "This codebundle runs a suite of metrics checks for App Service Plan health in Azure. It identifies:", - "platform": "Azure", - "author": "saurabh3460", - "support_tags": [ - "rw" - ], - "tasks": [ - "Count App Service Plans with Health Status of `Available` in resource group `${AZURE_RESOURCE_GROUP}`", - "Count App Service Plans with High Capacity Usage in resource group `${AZURE_RESOURCE_GROUP}`", - "Generate Health Score", - "Check Azure App Service Plan Resource Health in resource group `${AZURE_RESOURCE_GROUP}`", - "Check App Service Plan Capacity and Recommendations in resource group `${AZURE_RESOURCE_GROUP}`", - "Analyze App Service Plan Cost Optimization Opportunities in resource group `${AZURE_RESOURCE_GROUP}`", - "Check App Service Plan Changes in resource group `${AZURE_RESOURCE_GROUP}`" - ], - "capabilities": [ - "Count App Service Plans with Health Status of `Available` in resource group `${AZURE_RESOURCE_GROUP}`: Count Azure App Service Plans with health status of `Available`", - "Count App Service Plans with High Capacity Usage in resource group `${AZURE_RESOURCE_GROUP}`: Count App Service Plans with high CPU, memory, or disk queue usage", - "Check Azure App Service Plan Resource Health in resource group `${AZURE_RESOURCE_GROUP}`: Check the Azure Resource Health API for any known issues affecting App Service Plans", - "Check App Service Plan Capacity and Recommendations in resource group `${AZURE_RESOURCE_GROUP}`: Check App Service Plan capacity, report high usage issues, and provide scaling recommendations", - "Analyze App Service Plan Cost Optimization Opportunities in resource group `${AZURE_RESOURCE_GROUP}`: Analyzes 30-day utilization trends using Azure Monitor to identify underutilized App Service Plans with cost savings opportunities. Provides Azure pricing-based estimates for potential monthly and annual savings with severity bands: Sev4 <$2k/month, Sev3 $2k-$10k/month, Sev2 >$10k/month.", - "Check App Service Plan Changes in resource group `${AZURE_RESOURCE_GROUP}`: Lists App Service Plan changes and operations from Azure Activity Log" - ], - "readme": "# Azure App Service Plan Health\nThis codebundle runs a suite of metrics checks for App Service Plan health in Azure. 
It identifies:\n- Check App Service Plan capacity utilization\n- Check App Service Plan activity logs\n- Check App Service Plan recommendations\n- **Cost optimization analysis** that identifies underutilized App Service Plans with potential savings opportunities using 30-day Azure Monitor utilization trends\n\n## Features\n\n### Health Monitoring\n- **Resource Health**: Checks Azure-reported health status of App Service Plan resources\n- **Capacity Analysis**: Validates App Service Plan capacity utilization and identifies high usage issues\n- **Configuration Recommendations**: Provides scaling recommendations based on current usage patterns\n- **Activity Monitoring**: Analyzes recent activities for errors and warnings\n\n### Cost Optimization\n- **30-Day Utilization Analysis**: Uses Azure Monitor to analyze CPU and memory utilization trends\n- **Underutilization Detection**: Identifies App Service Plans with consistently low resource usage\n- **Cost Savings Estimates**: Provides monthly and annual savings estimates using Azure App Service pricing\n- **Severity-Based Alerts**: \n - **Severity 4**: <$2,000/month potential savings\n - **Severity 3**: $2,000-$10,000/month potential savings \n - **Severity 2**: >$10,000/month potential savings\n- **Azure App Service Pricing Database**: Comprehensive pricing for Free, Shared, Basic, Standard, Premium, and Isolated tiers\n- **Conservative Recommendations**: Accounts for overhead and safety margins in scaling suggestions\n\n## Configuration\n\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `AZ_USERNAME`: Service principal's client ID\n- `AZ_SECRET_VALUE`: The credential secret value from the app registration\n- `AZ_TENANT`: The Azure tenancy ID\n- `AZ_SUBSCRIPTION`: The Azure subscription ID\n\n## Testing \nSee the .test directory for infrastructure test code. \n\n## Notes\n\nThis codebundle assumes the service principal authentication flow.\n\nThe cost optimization analysis requires Azure Monitor metrics to be available for the App Service Plans. 
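The severity bands quoted above are a pure threshold lookup; a minimal sketch encoding Sev4 below $2k/month, Sev3 from $2k to $10k, and Sev2 above $10k:

```python
def savings_severity(monthly_savings_usd: float) -> int:
    # Map estimated monthly savings to the documented severity bands.
    if monthly_savings_usd > 10_000:
        return 2
    if monthly_savings_usd >= 2_000:
        return 3
    return 4

for estimate in (500, 4_800, 25_000):
    print(f"${estimate:,}/month -> severity {savings_severity(estimate)}")
```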
Ensure that monitoring is enabled for accurate utilization data.", - "libraries": [ - "DateTime", - "Collections", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/azure-appservice-plan-health" - }, - { - "slug": "rw-cli-codecollection-k8s-chaos-workload", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-chaos-workload", - "display_name": "k8s-chaos-workload", - "description": "This codebundle provides chaos injection for a specific workload within a Kubernetes namespace.", - "platform": "Kubernetes", - "author": "jon-funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Test `${WORKLOAD_NAME}` High Availability in Namespace `${NAMESPACE}`", - "OOMKill `${WORKLOAD_NAME}` Pod", - "Mangle Service Selector For `${WORKLOAD_NAME}` in `${NAMESPACE}`", - "Mangle Service Port For `${WORKLOAD_NAME}` in `${NAMESPACE}`", - "Fill Tmp Directory Of Pod From `${WORKLOAD_NAME}`" - ], - "capabilities": [ - "Test `${WORKLOAD_NAME}` High Availability in Namespace `${NAMESPACE}`: Kills a pod under this workload to test high availability.", - "OOMKill `${WORKLOAD_NAME}` Pod: Kills the oldest pod running under the configured workload.", - "Mangle Service Selector For `${WORKLOAD_NAME}` in `${NAMESPACE}`: Breaks a service's label selector to cause a network disruption", - "Mangle Service Port For `${WORKLOAD_NAME}` in `${NAMESPACE}`: Changes a service's port to cause a network disruption", - "Fill Tmp Directory Of Pod From `${WORKLOAD_NAME}`: Attaches to a pod and fills the /tmp directory with random data" - ], - "readme": "# Kubernetes Workload Chaos Engineering\n\nThis codebundle provides chaos injection for a specific workload within a Kubernetes namespace. \n\n## Configuration\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `KUBECONFIG`: The kubeconfig secret containing access info for the cluster.\n- `CONTEXT`: The Kubernetes context to operate within.\n- `NAMESPACE`: The name of the namespace to search. Leave it blank to search in all namespaces.\n- `WORKLOAD_NAME`: The specific workload to inject chaos experiments into. 
Eg: deployment/my-app\n\n\n## Requirements\n- A kubeconfig with appropriate RBAC permissions to perform the desired command.\n\n## TODO\n- [ ] Add additional documentation.\n\n", - "libraries": [ - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem", - "String", - "Process" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-chaos-workload" - }, - { - "slug": "rw-cli-codecollection-k8s-istio-system-health", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-istio-system-health", - "display_name": "k8s-istio-system-health", - "description": "This codebundle provides a task aimed at finding issues related to an Istio sidecar being available for the applications.", - "platform": "Kubernetes", - "author": "Nbarola", - "support_tags": [ - "rw" - ], - "tasks": [ - "Verify Istio Sidecar Injection for Cluster `${CONTEXT}`", - "Check Istio Sidecar Resource Usage for Cluster `${CONTEXT}`", - "Validate Istio Installation in Cluster `${CONTEXT}`", - "Check Istio Controlplane Logs For Errors in Cluster `${CONTEXT}`", - "Fetch Istio Proxy Logs in Cluster `${CONTEXT}`", - "Verify Istio SSL Certificates in Cluster `${CONTEXT}`", - "Check Istio Configuration Health in Cluster `${CONTEXT}`", - "Generate Health Score for Cluster ${CONTEXT}", - "Verify Istio Sidecar Injection for Cluster `${CONTEXT}`", - "Check Istio Sidecar Resource Usage for Cluster `${CONTEXT}`", - "Validate Istio Installation in Cluster `${CONTEXT}`", - "Check Istio Controlplane Logs For Errors in Cluster `${CONTEXT}`", - "Fetch Istio Proxy Logs in Cluster `${CONTEXT}`", - "Verify Istio SSL Certificates in Cluster `${CONTEXT}`", - "Check Istio Configuration Health in Cluster `${CONTEXT}`" - ], - "capabilities": [ - "Verify Istio Sidecar Injection for Cluster `${CONTEXT}`: Checks all deployments in specified namespaces for Istio sidecar injection status", - "Check Istio Sidecar Resource Usage for Cluster `${CONTEXT}`: Checks all pods in specified namespaces for Istio sidecar resources usage", - "Validate Istio Installation in Cluster `${CONTEXT}`: Verify Istio Installation", - "Check Istio Controlplane Logs For Errors in Cluster `${CONTEXT}`: Check controlplane logs for known errors and warnings in Cluster", - "Fetch Istio Proxy Logs in Cluster `${CONTEXT}`: Check istio proxy logs for known errors and warnings in cluster", - "Verify Istio SSL Certificates in Cluster `${CONTEXT}`: Check Istio valid Root CA and mTLS Certificates in Cluster", - "Check Istio Configuration Health in Cluster `${CONTEXT}`: Check Istio configurations in Cluster", - "Verify Istio Sidecar Injection for Cluster `${CONTEXT}`: Checks all deployments in specified namespaces for Istio sidecar injection status", - "Check Istio Sidecar Resource Usage for Cluster `${CONTEXT}`: Checks all pods in specified namespaces for Istio sidecar resources usage", - "Validate Istio Installation in Cluster `${CONTEXT}`: Verify Istio Installation in cluster", - "Check Istio Controlplane Logs For Errors in Cluster `${CONTEXT}`: Check istio controlplane logs for known errors and warnings in cluster ${CONTEXT}", - "Fetch Istio Proxy Logs in Cluster `${CONTEXT}`: Check istio proxy logs for known errors and warnings in cluster", - "Verify Istio SSL Certificates in Cluster `${CONTEXT}`: Check Istio valid Root CA and mTLS Certificates in cluster", - "Check Istio Configuration Health in Cluster `${CONTEXT}`: Check Istio configurations in cluster" - ], - "readme": "# Kubernetes Istio System Health\n\nThis codebundle 
provides a task aimed at finding issues related to an Istio sidecar being available for the applications.\n\n## Tasks\n`Check Deployments for Istio Sidecar Injection`\n`Check Istio Sidecar resources usage`\n`Verify Istio Installation`\n`Check Istio Controlplane logs for errors and warnings`\n`Check Istio Certificates for the Istio Components`\n`Analyze Istio configurations`\n\n\n## Configuration\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `kubeconfig`: The kubeconfig secret containing access info for the cluster.\n- `KUBERNETES_DISTRIBUTION_BINARY`: Which binary to use for Kubernetes CLI commands. Default value is `kubectl`.\n- `CONTEXT`: The Kubernetes context to operate within.\n- `CLUSTER`: The Kubernetes cluster to operate within.\n- `EXCLUDED_NAMESPACE`: The names of the namespaces to exclude from the search. Leave it blank to search in all namespaces.\n\n## Requirements\n- A kubeconfig with appropriate RBAC permissions to perform the desired command.\n\n## Infra\n- To Create Infra use `task build-infra`\n\n### Post Infra operations\n```\naws eks --region us-west-2 update-kubeconfig --name istio-cluster\n```\n\n- Create kubeconfig with service account token\n\n```\nkubectl -n kube-system create serviceaccount kubeconfig-sa\n```\n\n```\nkubectl create clusterrolebinding add-on-cluster-admin --clusterrole=view --serviceaccount=kube-system:kubeconfig-sa\n```\n\n```\ncat <<EOF > terraform/kubeconfig-sa-token.yaml\napiVersion: v1\nkind: Secret\nmetadata:\n name: kubeconfig-sa-token\n namespace: kube-system\n annotations:\n kubernetes.io/service-account.name: kubeconfig-sa\ntype: kubernetes.io/service-account-token\nEOF\n```\n\n```\nkubectl apply -f terraform/kubeconfig-sa-token.yaml\n```\n\n```\nTOKEN=`kubectl -n kube-system get secret kubeconfig-sa-token -o jsonpath='{.data.token}' | base64 --decode`\n```\n\n```\nkubectl config set-credentials kubeconfig-sa --token=$TOKEN\n```\n\n```\nkubectl config set-context --current --user=kubeconfig-sa\n```\n\n```\nkubectl config view --minify --raw\n```\n\n\n\n", - "libraries": [ - "DateTime", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "RW.K8sHelper", - "OperatingSystem", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-istio-system-health" - }, - { - "slug": "rw-cli-codecollection-k8s-postgres-healthcheck", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-postgres-healthcheck", - "display_name": "k8s-postgres-healthcheck", - "description": "", - "platform": "Kubernetes", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check Patroni Database Lag in Namespace `${NAMESPACE}` on Host `${HOSTNAME}` using `patronictl`", - "Check Database Backup Status for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}`", - "Generate Namespace Score for Namespace `${NAMESPACE}`", - "List Resources Related to Postgres Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}`", - "Get Postgres Pod Logs & Events for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}`", - "Get Postgres Pod Resource Utilization for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}`", - "Get Running Postgres Configuration for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}`", - "Get Patroni Output and Add to Report for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}`", - "Fetch Patroni Database Lag for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}`", - "Check Database Backup Status for
Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}`", - "Run DB Queries for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}`" - ], - "capabilities": [ - "Check Patroni Database Lag in Namespace `${NAMESPACE}` on Host `${HOSTNAME}` using `patronictl`: Identifies the lag using patronictl and raises issues if necessary.", - "Check Database Backup Status for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}`: Ensure that backups are current and not stale.", - "List Resources Related to Postgres Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}`: Runs a simple fetch all for the resources in the given workspace under the configured labels.", - "Get Postgres Pod Logs & Events for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}`: Queries Postgres-related pods for their recent logs and checks for any warning-type events.", - "Get Postgres Pod Resource Utilization for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}`: Performs a top command on a list of labeled postgres-related workloads to check pod resources.", - "Get Running Postgres Configuration for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}`: Fetches the postgres instance's configuration information.", - "Get Patroni Output and Add to Report for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}`: Attempts to run the patronictl CLI within the workload if it's available to check the current state of a patroni cluster, if applicable.", - "Fetch Patroni Database Lag for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}`: Identifies the lag using patronictl and raises issues if necessary.", - "Check Database Backup Status for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}`: Checks the status of backup operations on Kubernetes Postgres clusters. Raises issues if backups have not been completed or appear unhealthy.", - "Run DB Queries for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}`: Runs a suite of configurable queries to check for index issues, slow queries, etc., and creates a report."
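The Patroni lag capabilities above boil down to running `patronictl` inside a Postgres pod and flagging members whose replication lag exceeds a threshold. A minimal Python sketch of that idea follows; the pod name, the `-f json` output format, and the `"Lag in MB"` field name are assumptions to verify against your Patroni version:

```python
"""Sketch of a Patroni lag check, assuming kubectl exec access to a pod
that has patronictl installed and that `patronictl list -f json` emits
a "Lag in MB" field per member."""
import json
import subprocess

LAG_THRESHOLD_MB = 100  # illustrative threshold

def fetch_patroni_members(namespace: str, pod: str) -> list[dict]:
    out = subprocess.run(
        ["kubectl", "-n", namespace, "exec", pod, "--",
         "patronictl", "list", "-f", "json"],
        check=True, capture_output=True, text=True,
    ).stdout
    return json.loads(out)

def lagging_members(members: list[dict]) -> list[dict]:
    issues = []
    for m in members:
        lag = m.get("Lag in MB")  # replicas report lag; the leader does not
        if isinstance(lag, (int, float)) and lag > LAG_THRESHOLD_MB:
            issues.append({"member": m.get("Member"), "lag_mb": lag})
    return issues

if __name__ == "__main__":
    members = fetch_patroni_members("db", "postgres-cluster-0")  # placeholders
    for issue in lagging_members(members):
        print(f"Replication lag on {issue['member']}: {issue['lag_mb']} MB")
```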
- ], - "readme": "", - "libraries": [ - "DateTime", - "Collections", - "RW.CLI", - "RW.Core", - "RW.platform", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-postgres-healthcheck" - }, - { - "slug": "rw-cli-codecollection-azure-subscription-cost-health", - "collection_slug": "rw-cli-codecollection", - "name": "azure-subscription-cost-health", - "display_name": "azure-subscription-cost-health", - "description": "This codebundle analyzes Azure subscription cost health by identifying stopped Function Apps on App Service Plans, proposing consolidation opportunities, analyzing AKS node pool utilization, optimizing Databricks cluster configurations, and estimating potential cost savings across one or more subscriptions with configurable discount factors.", - "platform": "Azure", - "author": "assistant", - "support_tags": [ - "rw" - ], - "tasks": [ - "Generate Azure Cost Report By Service and Resource Group", - "Analyze App Service Plan Cost Optimization", - "Analyze AKS Node Pool Resizing Opportunities Based on Utilization Metrics", - "Analyze Databricks Cluster Auto-Termination and Over-Provisioning Opportunities", - "Analyze Virtual Machine Rightsizing and Deallocation Opportunities", - "Analyze Azure Storage Cost Optimization Opportunities" - ], - "capabilities": [ - "Generate Azure Cost Report By Service and Resource Group: Generates a detailed cost breakdown report for the last 30 days showing actual spending by resource group and Azure service using the Cost Management API. Includes period-over-period comparison and raises an issue if cost increase exceeds configured threshold.", - "Analyze App Service Plan Cost Optimization: Analyzes App Service Plans across subscriptions to identify empty plans, underutilized resources, and rightsizing opportunities with cost savings estimates. Supports three optimization strategies (aggressive/balanced/conservative) and provides comprehensive options tables with risk assessments for each plan.", - "Analyze AKS Node Pool Resizing Opportunities Based on Utilization Metrics: Analyzes AKS cluster node pools across specified subscriptions, examines both average and peak CPU/memory utilization over the past 30 days, and provides capacity-planned recommendations for reducing minimum node counts or changing VM types to optimize costs. Uses a two-tier approach: minimum nodes based on average utilization (150% safety margin), maximum nodes based on peak utilization (150% safety margin). This ensures cost-effective baseline capacity while maintaining ceiling for traffic spikes. Safety margins are configurable via MIN_NODE_SAFETY_MARGIN_PERCENT and MAX_NODE_SAFETY_MARGIN_PERCENT.", - "Analyze Databricks Cluster Auto-Termination and Over-Provisioning Opportunities: Analyzes Azure Databricks workspaces and clusters across specified subscriptions to identify cost optimization opportunities. Focuses on: 1) Clusters without auto-termination configured or running idle, 2) Over-provisioned clusters with low CPU/memory utilization. Calculates both VM costs and DBU (Databricks Unit) costs to provide accurate savings estimates.", - "Analyze Virtual Machine Rightsizing and Deallocation Opportunities: Analyzes Azure Virtual Machines across specified subscriptions to identify cost optimization opportunities. Focuses on: 1) VMs that are stopped but not deallocated (still incurring compute costs), 2) Oversized VMs with low CPU utilization that can be downsized to B-series burstable instances. 
Examines CPU utilization metrics over the past 30 days to provide data-driven rightsizing recommendations.", - "Analyze Azure Storage Cost Optimization Opportunities: Analyzes Azure storage resources across specified subscriptions to identify cost optimization opportunities. Focuses on: 1) Unattached/orphaned managed disks still incurring costs, 2) Old snapshots (>90 days by default) consuming storage, 3) Storage accounts without lifecycle management policies, 4) Over-provisioned redundancy (GRS/GZRS that could use LRS/ZRS), 5) Premium disks with low IOPS utilization that could be downgraded to Standard SSD." - ], - "readme": "# Azure Subscription Cost Health\n\nThis codebundle analyzes Azure subscription cost health by identifying stopped Function Apps on App Service Plans, proposing consolidation opportunities, analyzing AKS node pool utilization, optimizing Databricks cluster configurations, and estimating potential cost savings across one or more subscriptions with configurable discount factors.\n\n\n## Features\n\n### Cost Analysis & Optimization\n- **Cost Trend Analysis**: Compares current period costs to previous period with configurable alerting threshold (default: 10% increase)\n- **Period-over-Period Comparison**: Automatically analyzes trends and generates issues when costs increase beyond acceptable limits\n- **Stopped Function Discovery**: Identifies stopped Function Apps that are still consuming App Service Plan resources\n- **Consolidation Analysis**: Analyzes opportunities to consolidate underutilized App Service Plans\n- **AKS Node Pool Optimization**: Analyzes AKS cluster node pools and provides resizing recommendations based on actual CPU/memory utilization\n- **Databricks Cluster Optimization**: Identifies clusters without auto-termination, idle clusters, and over-provisioned clusters\n- **VM Optimization**: Identifies stopped-not-deallocated VMs and oversized instances with low CPU/memory utilization\n- **Configurable Discounts**: Apply custom discount percentages off MSRP to reflect your Azure pricing agreements (EA, CSP, etc.)\n- **Multi-Subscription Support**: Can analyze multiple Azure subscriptions in a single run\n- **Resource Group Scoping**: Supports filtering analysis to specific resource groups\n- **Cost Estimation**: Provides accurate monthly and annual cost savings estimates using Azure pricing databases\n\n### App Service Plan Optimization Strategies\n\nThe tool supports three optimization strategies to balance cost savings with operational safety:\n\n#### **Aggressive** (`OPTIMIZATION_STRATEGY=aggressive`)\n- **Target Utilization**: 85-90% max CPU after optimization\n- **Risk Tolerance**: Medium to High\n- **Best For**: Non-critical workloads, dev/test/staging environments\n- **Characteristics**:\n - Maximum cost savings approach\n - Accepts recommendations that may push utilization close to capacity\n - Suitable for workloads with predictable traffic patterns\n - Recommended when quick cost reduction is a priority\n\n#### **Balanced** (default, `OPTIMIZATION_STRATEGY=balanced`)\n- **Target Utilization**: 75-80% max CPU after optimization\n- **Risk Tolerance**: Low to Medium\n- **Best For**: Standard production workloads\n- **Characteristics**:\n - Default optimization approach\n - Balances cost savings with operational headroom\n - Maintains buffer for traffic spikes and growth\n - Suitable for most production environments\n - Recommended for general use\n\n#### **Conservative** (`OPTIMIZATION_STRATEGY=conservative`)\n- **Target Utilization**: 60-70% max 
CPU after optimization\n- **Risk Tolerance**: Low only\n- **Best For**: Critical production workloads, high-growth applications\n- **Characteristics**:\n - Safest optimization approach\n - Only LOW-risk recommendations\n - Preserves significant headroom for burst capacity\n - Accounts for traffic growth and seasonal spikes\n - Recommended for mission-critical applications\n\n#### Memory Thresholds (All Strategies)\n\nAll strategies now enforce memory safety limits to prevent out-of-memory issues:\n\n| Current Memory Max | SKU Downgrade Risk | Notes |\n|--------------------|-------------------|--------|\n| **> 90%** | \ud83d\udd34 **HIGH** (blocked) | SKU downgrade would halve memory - extremely dangerous |\n| **80-90%** | \ud83d\udd34 **HIGH** (warning) | Downgrade likely to cause memory pressure |\n| **70-80%** | \ud83d\udfe1 **MEDIUM** | Proceed with caution, monitor closely |\n| **< 70%** | Evaluated normally | Safe to consider SKU downgrade |\n\n**Why This Matters**: SKU downgrades (e.g., EP3 \u2192 EP2) reduce available memory by 50%. If memory is already elevated, this can cause application crashes, restarts, and service disruptions.\n\n### Enhanced Recommendations with Full Options Table\n\nFor each App Service Plan analyzed, the tool now provides:\n\n1. **Comprehensive Options Table**: Shows ALL possible optimization configurations including:\n - Current configuration (baseline)\n - Single instance reduction options\n - 50% capacity reduction scenarios\n - SKU downgrade options\n - Combined SKU + capacity optimizations\n\n2. **Memory-Aware Risk Assessment**:\n - Evaluates **both CPU and Memory** constraints\n - Prevents dangerous SKU downgrades when memory is already high (>80% max)\n - **LOW**: Safe to implement, minimal performance impact (CPU <75%, Memory <80%)\n - **MEDIUM**: Requires monitoring, implement during low-traffic periods (CPU <85%, Memory <90%)\n - **HIGH**: Requires careful evaluation and gradual rollout (CPU >85% or Memory >90%)\n - Critical warnings displayed when memory pressure detected\n\n3. **Projected Utilization**: For each option, see:\n - Projected average CPU and memory\n - Projected maximum CPU", - "libraries": [ - "Collections", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/azure-subscription-cost-health" - }, - { - "slug": "rw-cli-codecollection-k8s-labeledpods-healthcheck", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-labeledpods-healthcheck", - "display_name": "k8s-labeledpods-healthcheck", - "description": "", - "platform": "Kubernetes", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Measure Number of Running Pods with Label in `${NAMESPACE}`" - ], - "capabilities": [ - "Measure Number of Running Pods with Label in `${NAMESPACE}`: Counts the number of running pods with the configured labels." 
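The memory threshold table and risk bands documented in the azure-subscription-cost-health readme above translate directly into a small decision function. A sketch mirroring the stated thresholds (the codebundle's real scoring logic is richer than this):

```python
"""Memory-aware risk assessment per the readme's tables: SKU downgrades
halve memory, so high memory maxima block or warn; option risk combines
projected CPU and memory maxima."""

def sku_downgrade_risk(memory_max_pct: float) -> tuple[str, bool]:
    """Risk of halving memory via a SKU downgrade (e.g. EP3 -> EP2).
    Returns (risk, blocked)."""
    if memory_max_pct > 90:
        return "HIGH", True     # blocked: downgrade would halve memory
    if memory_max_pct >= 80:
        return "HIGH", False    # warning: likely memory pressure
    if memory_max_pct >= 70:
        return "MEDIUM", False  # proceed with caution, monitor closely
    return "NORMAL", False      # evaluated normally

def option_risk(projected_cpu_max_pct: float, projected_mem_max_pct: float) -> str:
    """Overall risk band for a proposed configuration option."""
    if projected_cpu_max_pct < 75 and projected_mem_max_pct < 80:
        return "LOW"
    if projected_cpu_max_pct < 85 and projected_mem_max_pct < 90:
        return "MEDIUM"
    return "HIGH"

print(sku_downgrade_risk(85.0))  # ('HIGH', False)
print(option_risk(70.0, 75.0))   # 'LOW'
```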
- ], - "readme": "", - "libraries": [ - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-labeledpods-healthcheck" - }, - { - "slug": "rw-cli-codecollection-curl-gmp-nginx-ingress-inspection", - "collection_slug": "rw-cli-codecollection", - "name": "curl-gmp-nginx-ingress-inspection", - "display_name": "curl-gmp-nginx-ingress-inspection", - "description": "Runs a task which performs inspects the HTTP error code metrics related to your nginx ingress controller in your GKE kubernetes cluster and raises issues based on the number of ingress with errors.", - "platform": "Unknown", - "author": "jon-funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Fetch Nginx HTTP Errors From GMP for Ingress `${INGRESS_OBJECT_NAME}`", - "Find Owner and Service Health for Ingress `${INGRESS_OBJECT_NAME}`" - ], - "capabilities": [ - "Fetch Nginx HTTP Errors From GMP for Ingress `${INGRESS_OBJECT_NAME}`: Fetches metrics for the Nginx ingress host from GMP and performs an inspection on the results.", - "Find Owner and Service Health for Ingress `${INGRESS_OBJECT_NAME}`: Checks the ingress object service and endpoints. Also returns the owner of the pods that support the Ingress." - ], - "readme": "# GCP GMP Nginx Ingress Inspection\n\nRuns a task which performs inspects the HTTP error code metrics related to your nginx ingress controller in your GKE kubernetes cluster and raises issues based on the number of ingress with errors.\n\n## Tasks\n`Fetch Nginx Ingress Metrics From GMP And Perform Inspection On Results` - This task fetchs the HTTP metrics from GMP, and also uses kubectl to fetch details about the ingress object, it's health, and the service owner. \n\n## Configuration\n\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `TIME_SLICE`: What duration to calculate the rate over. Defaults to 60 minutes.\n- `ERROR_CODES`: Which HTTP codes to consider as errors. defaults to 500, 501, and 502.\n- `GCLOUD_SERVICE`: The remote gcloud service to use for requests.\n- `gcp_credentials`: The json credentials secrets file used to authenticate with the GCP project. Should be a service account.\n- `GCP_PROJECT_ID`: The unique project ID identifier string.\n- `INGRESS_OBJECT_NAME`: The Kubernetes ingress object name.\n- `INGRESS_SEVICE`: The Kubernetes service name behind the ingress object.\n- `INGRESS_HOST`: The hostname of the ingress object.\n- `kubeconfig`: The kubeconfig secret containing access info for the cluster.\n- `kubectl`: The location service used to interpret shell commands. Default value is `kubectl-service.shared`.\n- `KUBERNETES_DISTRIBUTION_BINARY`: Which binary to use for Kubernetes CLI commands. Default value is `kubectl`.\n- `CONTEXT`: The Kubernetes context to operate within.\n- `NAMESPACE`: The name of the Ingress object.\n\n## Notes\n\nThe `gcp_credentials` service account will need view and list permissions on the GCP logging API.\nThe `kubectl secret` will need to get|list the ingress object, services, pods, deployments, relicasets, statefulsets and so on in the namespace. 
\n\n## TODO\n- [ ] Add documentation\n- [ ] Add examples for non-gke ingress objects for other cloud projects\n- [ ] Add IAM settings examples", - "libraries": [ - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "RW.K8sHelper", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/curl-gmp-nginx-ingress-inspection" - }, - { - "slug": "rw-cli-codecollection-azure-appgateway-health", - "collection_slug": "rw-cli-codecollection", - "name": "azure-appgateway-health", - "display_name": "azure-appgateway-health", - "description": "Checks key metrics for Azure Application Gateways and queries the health status of backend pools used by the gateway.", - "platform": "Azure", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check for Resource Health Issues Affecting Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Configuration Health of Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Backend Pool Health for Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Fetch Metrics for Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check SSL Certificate Health for Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Logs for Errors with Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Generate Application Gateway Health Score", - "Check for Resource Health Issues Affecting Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Configuration Health of Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Backend Pool Health for Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Fetch Log Analytics for Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Fetch Metrics for Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check SSL Certificate Health for Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Logs for Errors with Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "List Related Azure Resources for Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`" - ], - "capabilities": [ - "Check for Resource Health Issues Affecting Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Fetch a list of issues that might affect the Application Gateway as reported from Azure.", - "Check Configuration Health of Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Fetch the config of the AKS cluster in azure", - "Check Backend Pool Health for Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Fetch the health of the application gateway backend pool members", - "Fetch Metrics for Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Fetch metrics for the application gateway", - "Check SSL Certificate Health for Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Fetch SSL certificates and validate expiry dates for Azure Application Gateway instances", - "Check Logs for Errors with Application Gateway `${APP_GATEWAY_NAME}` In Resource 
Group `${AZ_RESOURCE_GROUP}`: Query log analytics workspace for common errors like IP mismatches or subnet issues", - "Check for Resource Health Issues Affecting Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Fetch a list of issues that might affect the application gateway cluster", - "Check Configuration Health of Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Fetch the details and health of the application gateway configuration", - "Check Backend Pool Health for Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Fetch the health of the application gateway backend pool members", - "Fetch Log Analytics for Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Fetch log analytics for the application gateway", - "Fetch Metrics for Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Fetch metrics for the application gateway", - "Check SSL Certificate Health for Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Fetch SSL certificates and validate expiry dates for Azure Application Gateway instances", - "Check Logs for Errors with Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Query log analytics workspace for common errors like IP mismatches or subnet issues", - "List Related Azure Resources for Application Gateway `${APP_GATEWAY_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Fetch a list of resources that are related to the application gateway" - ], - "readme": "# Azure Application Gateway Health\nChecks key metrics for Azure Application Gateways and queries the health status of backend pools used by the gateway.\n\n\n## Configuration\n\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `AZ_USERNAME`: Service principal's client ID\n- `AZ_SECRET_VALUE`: The credential secret value from the app registration\n- `AZ_TENANT`: The Azure tenancy ID\n- `AZ_SUBSCRIPTION`: The Azure subscription ID\n- `AZ_RESOURCE_GROUP`: The Azure resource group that these resources reside in\n- `APPGATEWAY`: The name of the application gateway in the resource group to target with checks\n\n## Notes\n\nThis codebundle assumes the service principal authentication flow.\n\n## TODO\n- [ ] config best practices check\n- [ ] Add documentation", - "libraries": [ - "DateTime", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/azure-appgateway-health" - }, - { - "slug": "rw-cli-codecollection-aws-s3-bucket-storage-report", - "collection_slug": "rw-cli-codecollection", - "name": "aws-s3-bucket-storage-report", - "display_name": "aws-s3-bucket-storage-report", - "description": "Outputs the current usage values of all S3 buckets in a given AWS region, and the number of objects stored in them.", - "platform": "AWS", - "author": "jon-funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check AWS S3 Bucket Storage Utilization" - ], - "capabilities": [ - "Check AWS S3 Bucket Storage Utilization: This script checks and displays the storage utilization of a specified AWS S3 bucket. It uses the AWS CLI to list all objects in the bucket recursively, displaying the results in a human-readable format and providing a summary of the total storage used."
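The S3 capability above shells out to the AWS CLI to list objects recursively and summarize totals. The same measurement expressed with boto3 (an assumption; the codebundle itself uses AWS CLI commands, and the bucket name here is a placeholder):

```python
"""Sum object count and total bytes for an S3 bucket using boto3's
list_objects_v2 paginator -- the boto3 equivalent of a recursive
`aws s3 ls ... --summarize` listing."""
import boto3

def bucket_utilization(bucket: str) -> tuple[int, int]:
    """Return (object_count, total_bytes) for the bucket."""
    s3 = boto3.client("s3")
    count = total = 0
    for page in s3.get_paginator("list_objects_v2").paginate(Bucket=bucket):
        for obj in page.get("Contents", []):
            count += 1
            total += obj["Size"]
    return count, total

if __name__ == "__main__":
    n, size = bucket_utilization("my-bucket")  # placeholder bucket name
    print(f"{n} objects, {size / 1024**3:.2f} GiB")
```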
- ], - "readme": "# aws-s3-bucket-storage-report CodeBundle\n### Tags:`AWS`, `S3 Bucket`, `Storage Issue`\n## CodeBundle Objective:\nOutputs the current usage values of all S3 buckets in a given AWS region, and the number of objects stored in them.\n\n## CodeBundle Inputs:\n\nexport AWS_REGION=\"PLACEHOLDER\"\nexport AWS_ACCESS_KEY_ID=\"PLACEHOLDER\"\nexport AWS_SECRET_ACCESS_KEY=\"PLACEHOLDER\"\n\n\n## CodeBundle Tasks:\n### `Check AWS S3 Bucket Storage Utilization`\n#### Tags:`Amazon Web Services`, `AWS S3`, `Bucket Storage`\n### Task Documentation:\nThis script checks and displays the storage utilization of a specified AWS S3 bucket. It uses the AWS CLI to list all objects in the bucket recursively, displaying the results in a human-readable format and providing a summary of the total storage used.\n#### Usage Example:\n`./check_AWS_S3_bucket_storage_utilization.sh`\n", - "libraries": [ - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem", - "String", - "Process" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/aws-s3-bucket-storage-report" - }, - { - "slug": "rw-cli-codecollection-curl-gmp-kong-ingress-inspection", - "collection_slug": "rw-cli-codecollection", - "name": "curl-gmp-kong-ingress-inspection", - "display_name": "curl-gmp-kong-ingress-inspection", - "description": "This code collects Kong ingress host metrics from Google Monitoring Platform (GMP) on Google Cloud Platform (GCP) and inspects the results for ingresses with a HTTP error code rate greater than zero over a configurable duration. It raises issues based on the number of ingresses with error codes.", - "platform": "Unknown", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check If Kong Ingress HTTP Error Rate Violates HTTP Error Threshold in GCP Project `${GCP_PROJECT_ID}`", - "Check If Kong Ingress HTTP Request Latency Violates Threshold in GCP Project `${GCP_PROJECT_ID}`", - "Check If Kong Ingress Controller Reports Upstream Errors in GCP Project `${GCP_PROJECT_ID}`" - ], - "capabilities": [ - "Check If Kong Ingress HTTP Error Rate Violates HTTP Error Threshold in GCP Project `${GCP_PROJECT_ID}`: Fetches HTTP Error metrics for the Kong ingress host and service from GMP and performs an inspection on the results. If there are currently any results with more than the defined HTTP error threshold, their route and service names will be surfaced for further troubleshooting.", - "Check If Kong Ingress HTTP Request Latency Violates Threshold in GCP Project `${GCP_PROJECT_ID}`: Fetches metrics for the Kong ingress 99th percentile request latency from GMP and performs an inspection on the results. If there are currently any results with more than the defined request latency threshold, their route and service names will be surfaced for further troubleshooting.", - "Check If Kong Ingress Controller Reports Upstream Errors in GCP Project `${GCP_PROJECT_ID}`: Fetches metrics for the Kong ingress controller related to upstream healthchecks or dns errors." - ], - "readme": "# GCP GMP Kong Ingress Inspection\n\nThis code collects Kong ingress host metrics from Google Monitoring Platform (GMP) on Google Cloud Platform (GCP) and inspects the results for ingresses with a HTTP error code rate greater than zero over a configurable duration. 
It raises issues based on the number of ingresses with error codes.\n\n## Tasks\n- `Check If Kong Ingress HTTP Error Rate Violates HTTP Error Threshold` - This task fetches HTTP error metrics for the Kong ingress host and service from GMP and performs an inspection on the results. If there are currently any results with more than the defined HTTP error threshold, their route and service names will be surfaced for further troubleshooting.\n- `Check If Kong Ingress HTTP Request Latency Violates Threshold` - This task fetches metrics for the Kong ingress 99th percentile request latency from GMP and performs an inspection on the results. If there are currently any results with request latencies greater than the defined threshold, their route and service names will be surfaced for further troubleshooting.\n- `Check If Kong Ingress Controller Reports Upstream Errors` - This task fetches metrics for the Kong ingress controller related to upstream health checks or DNS errors. It checks if health checks are enabled for the specified upstream target and if there are any reported health check errors. The results are surfaced for further investigation.\n\n## Configuration\n\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `TIME_SLICE`: What duration to calculate the rate over. Defaults to 1m.\n- `HTTP_ERROR_CODES`: Which HTTP codes to consider as errors. Defaults to all 500 error codes. \n- `HTTP_ERROR_RATE_THRESHOLD`: Specify the error rate threshold that is considered unhealthy. Measured in errors/s\n- `REQUEST_LATENCY_THRESHOLD`: The threshold in ms for request latency to be considered unhealthy. \n- `GCLOUD_SERVICE`: The remote gcloud service to use for requests.\n- `gcp_credentials`: The json credentials secrets file used to authenticate with the GCP project. Should be a service account.\n- `GCP_PROJECT_ID`: The unique project ID identifier string.\n\n## Notes\n\nThe `gcp_credentials` service account will need view and list permissions on the GCP logging API.\n\n## TODO\n- [ ] Add documentation\n- [ ] Add examples for non-gke ingress objects for other cloud projects\n- [ ] Add IAM settings examples", - "libraries": [ - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/curl-gmp-kong-ingress-inspection" - }, - { - "slug": "rw-cli-codecollection-azure-vm-os-health", - "collection_slug": "rw-cli-codecollection", - "name": "azure-vm-os-health", - "display_name": "azure-vm-os-health", - "description": "This bundle provides comprehensive health checks for Azure Virtual Machines, including disk utilization, memory usage, uptime, and patch status. 
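The azure-vm-os-health bundle whose entry begins here (its Usage section appears further below) filters which VMs to check with comma-separated shell-style wildcards in `VM_INCLUDE_LIST` and `VM_OMIT_LIST`. A minimal reimplementation of that matching rule, using the example patterns from the readme:

```python
"""Shell-style wildcard include/omit filtering, as documented for
VM_INCLUDE_LIST and VM_OMIT_LIST: empty include list means "everything",
and omit patterns always win. VM names below are hypothetical."""
import fnmatch
import os

def filter_vms(names: list[str], include_list: str, omit_list: str) -> list[str]:
    includes = [p.strip() for p in include_list.split(",") if p.strip()]
    omits = [p.strip() for p in omit_list.split(",") if p.strip()]
    selected = []
    for name in names:
        # An empty include list includes every VM.
        if includes and not any(fnmatch.fnmatch(name, p) for p in includes):
            continue
        if any(fnmatch.fnmatch(name, p) for p in omits):
            continue
        selected.append(name)
    return selected

vms = ["web-01", "db-02", "web-03-test", "batch-01"]
print(filter_vms(vms, os.getenv("VM_INCLUDE_LIST", "web-*,db-*"),
                 os.getenv("VM_OMIT_LIST", "*-test")))
# -> ['web-01', 'db-02']
```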
It uses Robot Framework tasks and Bash scripts to collect, parse, and score VM health.", - "platform": "Azure", - "author": "Nbarola", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check Disk Utilization for VMs in Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Memory Utilization for VMs in Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Uptime for VMs in Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Last Patch Status for VMs in Resource Group `${AZ_RESOURCE_GROUP}`", - "Generate Comprehensive VM Health Score", - "Check Disk Utilization for VMs in Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Memory Utilization for VMs in Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Uptime for VMs in Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Last Patch Status for VMs in Resource Group `${AZ_RESOURCE_GROUP}`" - ], - "capabilities": [ - "Check Disk Utilization for VMs in Resource Group `${AZ_RESOURCE_GROUP}`: Checks disk utilization for VMs and parses each result.", - "Check Memory Utilization for VMs in Resource Group `${AZ_RESOURCE_GROUP}`: Checks memory utilization for VMs and parses each result.", - "Check Uptime for VMs in Resource Group `${AZ_RESOURCE_GROUP}`: Checks uptime for VMs and parses each result.", - "Check Last Patch Status for VMs in Resource Group `${AZ_RESOURCE_GROUP}`: Checks last patch status for VMs and parses each result.", - "Check Disk Utilization for VMs in Resource Group `${AZ_RESOURCE_GROUP}`: Checks disk utilization for VMs and parses each result.", - "Check Memory Utilization for VMs in Resource Group `${AZ_RESOURCE_GROUP}`: Checks memory utilization for VMs and parses each result.", - "Check Uptime for VMs in Resource Group `${AZ_RESOURCE_GROUP}`: Checks uptime for VMs and parses each result.", - "Check Last Patch Status for VMs in Resource Group `${AZ_RESOURCE_GROUP}`: Checks last patch status for VMs and parses each result." - ], - "readme": "# Azure VM OS Health Bundle\n\nThis bundle provides comprehensive health checks for Azure Virtual Machines, including disk utilization, memory usage, uptime, and patch status. It uses Robot Framework tasks and Bash scripts to collect, parse, and score VM health.\n\n## Included Health Checks\n\n- **Disk Utilization**: Checks if any disk is above the configured threshold.\n- **Memory Utilization**: Checks if memory usage is above the configured threshold.\n- **Uptime**: Checks if system uptime exceeds the configured threshold.\n- **Patch Status**: Checks if there are pending OS patches.\n\n## Main Tasks\n\n- `Check Disk Utilization for VMs in Resource Group`\n- `Check Memory Utilization for VMs in Resource Group`\n- `Check Uptime for VMs in Resource Group`\n- `Check Last Patch Status for VMs in Resource Group`\n- `Score Disk Utilization for VMs in Resource Group`\n- `Score Memory Utilization for VMs in Resource Group`\n- `Score Uptime for VMs in Resource Group`\n- `Score Last Patch Status for VMs in Resource Group`\n- `Generate Comprehensive VM Health Score`\n\n## How It Works\n\n1. **Bash scripts** (e.g., `vm_disk_utilization.sh`, `vm_memory_check.sh`, etc.) collect raw data from Azure VMs.\n2. **Robot Framework tasks** run these scripts, parse the output, and (for SLI) calculate a health score.\n3. **Next steps scripts** (e.g., `next_steps_disk_utilization.sh`) analyze the parsed output and generate JSON issues or recommendations.\n4. 
**SLI tasks** aggregate the results and push a health score metric.\n\n## Key Features\n\n### OS Filtering\n- **Linux-only**: Scripts automatically filter out Windows VMs and only process Linux machines\n- **OS Detection**: Uses Azure VM metadata to determine OS type before attempting commands\n\n### Robust Error Handling\n- **Graceful Failures**: Individual VM connection failures don't stop the entire script\n- **Issue Creation**: Failed connections create structured issues for tracking\n- **Detailed Logging**: Clear error messages for troubleshooting\n\n### Configurable Timeouts\n- **VM Status Timeout**: `VM_STATUS_TIMEOUT` (default: 10s) - Time to check VM power state\n- **Command Timeout**: `COMMAND_TIMEOUT` (default: 45-60s) - Time for run-command execution\n- **Overall Timeout**: `TIMEOUT_SECONDS` (default: 30s) - General script timeout\n\n## Usage\n\n- Configure your environment variables (resource group, subscription, thresholds, etc.).\n- Optionally set `VM_INCLUDE_LIST` and/or `VM_OMIT_LIST` to control which VMs are checked:\n - `VM_INCLUDE_LIST`: Comma-separated shell-style wildcards (e.g., `web-*,db-*`). Only VMs matching any pattern are included.\n - `VM_OMIT_LIST`: Comma-separated shell-style wildcards. Any VM matching a pattern is excluded.\n - If both are empty, all Linux VMs in the resource group are checked.\n- Run the desired Robot Framework task (e.g., from `runbook.robot` or `sli.robot`).\n- Review the output and health scores.\n\n### Environment Variables\n\n```bash\n# Required\nAZURE_SUBSCRIPTION_ID=\"your-subscription-id\"\nAZ_RESOURCE_GROUP=\"your-resource-group\"\n\n# Optional - VM filtering\nVM_INCLUDE_LIST=\"web-*,db-*\" # Only check VMs matching patterns\nVM_OMIT_LIST=\"*-test\" # Skip VMs matching patterns\n\n# Optional - Performance tuning\nMAX_PARALLEL_JOBS=5 # Number of concurrent VM checks\nVM_STATUS_TIMEOUT=10 # Seconds to check VM power state\nCOMMAND_TIMEOUT=45 # Seconds for run-command execution\nTIMEOUT_SECONDS=30 # General script timeout\n```\n\n### Example\n\nTo check only VMs starting with `web-` or `db-`, but omit any ending with `-test`:\n\n```bash\nexport VM_INCLUDE_LIST=\"web-*,db-*\"\nexport VM_OMIT_LIST=\"*-test\"\nexport COMMAND_TIMEOUT=60 # Longer timeout for patch checks\nrobot runbook.robot\n```\n\n## Directory Structure\n\n- `runbook.robot` - Main runbook for health checks and issue creation.\n- `sli.robot` - SLI/score-only version for health scoring.\n- `vm_disk_utilization.sh`, `vm_memory_check.sh`, `vm_uptime_check.sh`, `vm_last_patch_check.sh` - Data collection scripts.\n- `next_steps_disk_utilization.sh`, `next_steps_memory_check.sh`, `next_steps_uptime.sh`, `next_steps_patch_time.sh` - Next steps/issue analysis scripts.\n- `.test/` - Example and test cases (see below for Terraform usage).\n\n## Error Handling\n\nThe scripts handle various failure scenarios gracefully:\n\n- **Connection Failures**: When a VM can't be reached, an issue is created and the script continues\n- **Authentication Issues**: Clear error messages for Azure CLI authentication problems\n- **VM Power State**: Non-running VMs are skipped with appropriate status codes\n- **Command Timeouts**: Long-running commands are terminated with configurable timeouts\n- **Invalid Responses**: Malformed Azure responses are handled with error reporting\n\n### Issue Types\n\n- `ConnectionError`: Failed to connect to VM or get status\n- `VMNotRunning`: VM is not in running state\n- `CommandTimeout`: Run-command execution timed out\n- `InvalidResponse`: Unexpected response format 
from Azure\n\n## How to Use the Terraform Code\n\n1", - "libraries": [ - "DateTime", - "Azure", - "Collections", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/azure-vm-os-health" - }, - { - "slug": "rw-cli-codecollection-k8s-deployment-healthcheck", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-deployment-healthcheck", - "display_name": "k8s-deployment-healthcheck", - "description": "This codebundle provides a suite of tasks aimed at triaging issues related to a deployment and its replicas in Kubernetes clusters.", - "platform": "Kubernetes", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Get Container Restarts and Score for Deployment `${DEPLOYMENT_NAME}`", - "Get Critical Log Errors and Score for Deployment `${DEPLOYMENT_NAME}`", - "Get NotReady Pods Score for Deployment `${DEPLOYMENT_NAME}`", - "Get Deployment Replica Status and Score for `${DEPLOYMENT_NAME}`", - "Get Recent Warning Events Score for `${DEPLOYMENT_NAME}`", - "Generate Deployment Health Score for `${DEPLOYMENT_NAME}`", - "Analyze Application Log Patterns for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`", - "Detect Event Anomalies for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`", - "Fetch Deployment Logs for `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`", - "Check Liveness Probe Configuration for Deployment `${DEPLOYMENT_NAME}`", - "Check Readiness Probe Configuration for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`", - "Inspect Deployment Warning Events for `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`", - "Check Deployment Replica Status for `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`", - "Inspect Container Restarts for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`", - "Identify Recent Configuration Changes for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`", - "Check HPA Health for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`" - ], - "capabilities": [ - "Get Container Restarts and Score for Deployment `${DEPLOYMENT_NAME}`: Counts the total sum of container restarts within a timeframe and determines if they're beyond a threshold.", - "Get Critical Log Errors and Score for Deployment `${DEPLOYMENT_NAME}`: Fetches logs and checks for critical error patterns that indicate application failures.", - "Get NotReady Pods Score for Deployment `${DEPLOYMENT_NAME}`: Fetches a count of unready pods for the specific deployment.", - "Get Deployment Replica Status and Score for `${DEPLOYMENT_NAME}`: Checks if deployment has the expected number of ready replicas and is available.", - "Get Recent Warning Events Score for `${DEPLOYMENT_NAME}`: Checks for recent warning events related to the deployment within a short time window, with filtering to reduce noise.", - "Analyze Application Log Patterns for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`: Fetches and analyzes logs from the deployment pods for errors, connection issues, and other patterns that indicate application health problems. 
Note: Warning messages about missing log files for excluded containers (like linkerd-proxy, istio-proxy) are expected and harmless.", - "Detect Event Anomalies for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`: Analyzes Kubernetes event patterns to identify anomalies such as sudden spikes in event rates, unusual patterns, or recurring issues that might indicate underlying problems with controllers, resources, or deployments.", - "Fetch Deployment Logs for `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`: Fetches and displays deployment logs in the report for manual review. Note: Issues are not created by this task - see \"Analyze Application Log Patterns\" for automated issue detection.", - "Check Liveness Probe Configuration for Deployment `${DEPLOYMENT_NAME}`: Validates if a Liveness probe has possible misconfigurations", - "Check Readiness Probe Configuration for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`: Validates if a readiness probe has possible misconfigurations", - "Inspect Deployment Warning Events for `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`: Fetches warning events related to the deployment workload in the namespace and triages any issues found in the events.", - "Check Deployment Replica Status for `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`: Inspects the deployment replica status including desired vs available replicas and identifies any scaling issues.", - "Inspect Container Restarts for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`: Checks for container restarts and provides details on restart patterns that might indicate application issues.", - "Identify Recent Configuration Changes for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`: Identifies recent configuration changes from ReplicaSet analysis that might be related to current issues.", - "Check HPA Health for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`: Checks if a HorizontalPodAutoscaler exists for the deployment and validates its configuration and current status." 
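The restart inspection capability above distinguishes OOM kills from probe failures and other termination causes. A rough sketch of that bucketing, assuming kubectl access and a label selector that matches the deployment's pods (both placeholders):

```python
"""Bucket container restarts by last termination reason (e.g. OOMKilled
vs. Error) by reading status.containerStatuses from the pods' JSON."""
import json
import subprocess
from collections import Counter

def restart_summary(namespace: str, selector: str) -> Counter:
    out = subprocess.run(
        ["kubectl", "-n", namespace, "get", "pods", "-l", selector, "-o", "json"],
        check=True, capture_output=True, text=True,
    ).stdout
    reasons: Counter = Counter()
    for pod in json.loads(out)["items"]:
        for cs in pod.get("status", {}).get("containerStatuses", []):
            if cs.get("restartCount", 0) == 0:
                continue
            terminated = cs.get("lastState", {}).get("terminated") or {}
            reasons[terminated.get("reason", "Unknown")] += cs["restartCount"]
    return reasons

if __name__ == "__main__":
    print(restart_summary("my-namespace", "app=my-deployment"))  # placeholders
```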
- ], - "readme": "# Kubernetes Deployment Triage\n\nThis codebundle provides a suite of tasks aimed at triaging issues related to a deployment and its replicas in Kubernetes clusters.\n\n## Tasks\n`Check Deployment Log For Issues`\n`Troubleshoot Deployment Warning Events`\n`Get Deployment Workload Details For Report`\n`Troubleshoot Deployment Replicas`\n`Check For Deployment Event Anomalies`\n`Check HPA Health for Deployment`\n\n### HPA Health Check\nThe HPA (HorizontalPodAutoscaler) health check task validates both configuration and runtime status:\n\n#### Configuration Health Checks\n- **MinReplicas=1**: Warns about availability risk with single replica minimum (severity 4)\n- **Narrow Scaling Range**: Identifies when max-min < 2, limiting scaling flexibility (severity 4)\n- **Missing Resource Requests**: Critical alert when HPA uses resource metrics but deployment lacks requests (severity 2)\n- **Aggressive CPU Targets**: Warns about targets < 50% causing over-provisioning (severity 4)\n- **Conservative CPU Targets**: Warns about targets > 95% lacking headroom (severity 4)\n- **Missing Behavior Config**: Suggests adding scaling behavior for better control (severity 4)\n\n#### Runtime Status Checks\n- **No HPA**: Raises informational issue if no HPA is configured (severity 4)\n- **At Maximum Replicas**: Warns if HPA is at max capacity and cannot scale further (severity 3)\n- **At Minimum Replicas**: Suggests cost optimization if consistently at minimum (severity 4)\n- **Missing Metrics**: Alerts if HPA has no metrics configured (severity 2)\n- **Scaling Limited**: Reports if HPA scaling is constrained (severity 3)\n- **Unable to Scale**: Critical alert if HPA cannot perform scaling operations (severity 2)\n- **Healthy**: Informational status when HPA is operating normally (severity 4)\n\n## Configuration\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `kubeconfig`: The kubeconfig secret containing access info for the cluster.\n- `kubectl`: The location service used to interpret shell commands. Default value is `kubectl-service.shared`.\n- `KUBERNETES_DISTRIBUTION_BINARY`: Which binary to use for Kubernetes CLI commands. Default value is `kubectl`.\n- `CONTEXT`: The Kubernetes context to operate within.\n- `NAMESPACE`: The name of the namespace to search. 
Leave it blank to search in all namespaces.\n- `DEPLOYMENT_NAME`: The name of the deployment.\n- `EXPECTED_AVAILABILITY`: The number of replicas allowed.\n\n## Requirements\n- A kubeconfig with appropriate RBAC permissions to perform the desired command.\n\n## TODO\n- [ ] Add additional documentation.\n\n", - "libraries": [ - "DateTime", - "RW.K8sLog", - "Collections", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem", - "RW.NextSteps", - "RW.K8sHelper", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-deployment-healthcheck" - }, - { - "slug": "rw-cli-codecollection-k8s-fluxcd-kustomization-health", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-fluxcd-kustomization-health", - "display_name": "k8s-fluxcd-kustomization-health", - "description": "The `k8s-fluxcd-kustomizations-health` codebundle checks for Kustomization resources within the Kubernetes cluster to surface potential issues.", - "platform": "Kubernetes", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "List Suspended FluxCD Kustomization objects in Namespace `${NAMESPACE}` in Cluster `${CONTEXT}`", - "List Unready FluxCD Kustomizations in Namespace `${NAMESPACE}` in Cluster `${CONTEXT}`", - "Generate FluxCD Kustomization Health Score for Namespace `${NAMESPACE}` in Cluster `${CONTEXT}`", - "List All FluxCD Kustomization objects in Namespace `${NAMESPACE}` in Cluster `${CONTEXT}`", - "List Suspended FluxCD Kustomization objects in Namespace `${NAMESPACE}` in Cluster `${CONTEXT}`", - "List Unready FluxCD Kustomizations in Namespace `${NAMESPACE}` in Cluster `${CONTEXT}`" - ], - "capabilities": [ - "List Suspended FluxCD Kustomization objects in Namespace `${NAMESPACE}` in Cluster `${CONTEXT}`: List Suspended FluxCD kustomization objects.", - "List Unready FluxCD Kustomizations in Namespace `${NAMESPACE}` in Cluster `${CONTEXT}`: List all Kustomizations that are not found in a ready state in namespace.", - "List All FluxCD Kustomization objects in Namespace `${NAMESPACE}` in Cluster `${CONTEXT}`: List all FluxCD kustomization objects.", - "List Suspended FluxCD Kustomization objects in Namespace `${NAMESPACE}` in Cluster `${CONTEXT}`: List Suspended FluxCD kustomization objects.", - "List Unready FluxCD Kustomizations in Namespace `${NAMESPACE}` in Cluster `${CONTEXT}`: List all Kustomizations that are not found in a ready state in namespace." - ], - "readme": "# Kubernetes FluxCD Kustomization Health\nThe `k8s-fluxcd-kustomizations-health` codebundle checks for Kustomization resources within the Kubernetes cluster to surface potential issues. \n\n## TaskSet\nThis TaskSet looks for any FluxCD managed Kustomizations in the specified namespace within the configured context and: \n- prints a list of every Kustomization and its status\n- prints a list of all kustomizations that are not ready and associated reasons\n\nExample configuration: \n```\nDISTRIBUTION=Kubernetes\nCONTEXT=sandbox-cluster-1\nNAMESPACE=flux-system\nRESOURCE_NAME=kustomizations\n```\n\nWith the example above, the TaskSet will collect the above mentioned data from the specified namespace in the `sandbox-cluster-1` cluster for the resources with a shortname of `kustomizations`. 
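The two FluxCD checks above (suspended and unready Kustomizations) reduce to inspecting `spec.suspend` and the `Ready` condition on each object. A condensed sketch, assuming kubectl access and the Flux CRDs installed:

```python
"""Triage Flux Kustomizations in a namespace: report those suspended via
spec.suspend and those whose Ready condition is missing or not True."""
import json
import subprocess

def get_kustomizations(namespace: str) -> list[dict]:
    out = subprocess.run(
        ["kubectl", "-n", namespace, "get", "kustomizations", "-o", "json"],
        check=True, capture_output=True, text=True,
    ).stdout
    return json.loads(out)["items"]

def triage(items: list[dict]) -> tuple[list[str], list[tuple[str, str]]]:
    suspended, unready = [], []
    for k in items:
        name = k["metadata"]["name"]
        if k.get("spec", {}).get("suspend"):
            suspended.append(name)
        ready = next((c for c in k.get("status", {}).get("conditions", [])
                      if c.get("type") == "Ready"), None)
        if not ready or ready.get("status") != "True":
            unready.append((name, (ready or {}).get("message", "no Ready condition")))
    return suspended, unready

suspended, unready = triage(get_kustomizations("flux-system"))
print("Suspended:", suspended)
print("Unready:", unready)
```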
\n\n\n## Requirements\n- A kubeconfig with `get` permissions to on the objects/namespaces that are involved in the query.\n\n\n## TODO\n- Add additional tasks\n- Add additional rbac and kubectl resources and use cases\n- Add an SLI for measuing Kustomization health via kubectl (as a prometheus codebundle exists already)\n- Add additional troubleshooting tasks as use cases evolve", - "libraries": [ - "RW.K8sLog", - "RW.CLI", - "RW.Core", - "RW.platform", - "RW.NextSteps", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-fluxcd-kustomization-health" - }, - { - "slug": "rw-cli-codecollection-k8s-chaos-flux", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-chaos-flux", - "display_name": "k8s-chaos-flux", - "description": "The `k8s-chaos-flux` codebundle is built to facility chaos tests on Flux managed resources.", - "platform": "Kubernetes", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Suspend the Flux Resource Reconciliation for `${FLUX_RESOURCE_NAME}` in namespace `${FLUX_RESOURCE_NAMESPACE}`", - "Select Random FluxCD Workload for Chaos Target in Namespace `${FLUX_RESOURCE_NAMESPACE}`", - "Execute Chaos Command on `${TARGET_RESOURCE}` in Namespace `${TARGET_NAMESPACE}`", - "Execute Additional Chaos Command on ${FLUX_RESOURCE_TYPE} '${FLUX_RESOURCE_NAME}' in namespace '${FLUX_RESOURCE_NAMESPACE}'", - "Resume Flux Resource Reconciliation in `${TARGET_NAMESPACE}`" - ], - "capabilities": [ - "Suspend the Flux Resource Reconciliation for `${FLUX_RESOURCE_NAME}` in namespace `${FLUX_RESOURCE_NAMESPACE}`: Suspends a flux resource so that it can be manipulated for chaos purposes.", - "Select Random FluxCD Workload for Chaos Target in Namespace `${FLUX_RESOURCE_NAMESPACE}`: Inspects the Flux resource and randomly selects a deployment to tickle. Tehe. Only runs if RANDOMIZE = Yes.", - "Execute Chaos Command on `${TARGET_RESOURCE}` in Namespace `${TARGET_NAMESPACE}`: Run the desired chaos command within a targeted resource", - "Execute Additional Chaos Command on ${FLUX_RESOURCE_TYPE} '${FLUX_RESOURCE_NAME}' in namespace '${FLUX_RESOURCE_NAMESPACE}': Run the additional command as input, verbatim.", - "Resume Flux Resource Reconciliation in `${TARGET_NAMESPACE}`: Resumes Flux reconciliation on desired resource." - ], - "readme": "# Kubernetes Chaos Flux Codebundle\nThe `k8s-chaos-flux` codebundle is built to facility chaos tests on Flux managed resources. \n\n## TaskSet\nThe TaskSet provides the following tasks:\n\n- `Suspend the Flux Resource Reconciliation`: This task is responsible for pausing a Flux resource temporarily so that chaos tasks can be performed on it.\n- `Find Random FluxCD Workload as Chaos Target`: [Optional]This task inspects a Flux resource and randomly selects a specific part of it to be the target for chaos testing.\n- `Execute Chaos Command`: This task executes a specific chaos command within the chosen target resource, causing controlled chaos to occur. The command can be run multiple times if needed.\n- `Execute Additional Chaos Command`: This task executes an additional chaos command, if provided, within the chosen target resource. 
It allows for more flexibility in performing custom chaos operations.\n- `Resume Flux Resource Reconciliation`: This task resumes the normal operation of the Flux resource after chaos testing is completed, allowing it to function as before.\n\n\n## ELI5 Writeup \"ala chatGPT\" for Fun\nThis code is like a set of instructions for a robot that works with a special technology called Flux in a place called Kubernetes. The robot's job is to make things a bit chaotic on purpose, but only for testing. It can stop or pause a particular thing it's working on, like pressing a pause button. It can also randomly select something to play with, like picking a toy from a box. The robot can run special commands to make things go a bit crazy, but it knows how many times to do it, just like counting to 10. Sometimes it can even do some extra commands if we ask it nicely. And when it's done, the robot knows how to resume its work and make things normal again.", - "libraries": [ - "DateTime", - "Collections", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-chaos-flux" - }, - { - "slug": "rw-cli-codecollection-k8s-statefulset-healthcheck", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-statefulset-healthcheck", - "display_name": "k8s-statefulset-healthcheck", - "description": "This set of tasks inspects the state of a statefulset resource in a namespace, checking replicas, events, and status, and raising issues if they're not at expected or minimum values.", - "platform": "Kubernetes", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Analyze Application Log Patterns for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}`", - "Detect Log Anomalies for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}`", - "Check Liveness Probe Configuration for StatefulSet `${STATEFULSET_NAME}`", - "Check Readiness Probe Configuration for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}`", - "Check for Container Restarts in StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}`", - "Inspect StatefulSet Warning Events for `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}`", - "Fetch StatefulSet Workload Details For `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}`", - "Inspect StatefulSet Replicas for `${STATEFULSET_NAME}` in namespace `${NAMESPACE}`", - "Check StatefulSet PersistentVolumeClaims for `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}`", - "Identify Recent Configuration Changes for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}`" - ], - "capabilities": [ - "Analyze Application Log Patterns for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}`: Fetches and analyzes logs from the StatefulSet pods for errors, stack traces, connection issues, and other patterns that indicate application health problems.", - "Detect Log Anomalies for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}`: Analyzes logs for repeating patterns, anomalous behavior, and unusual log volume that may indicate underlying issues.", - "Check Liveness Probe Configuration for StatefulSet `${STATEFULSET_NAME}`: Validates if a Liveness probe has possible misconfigurations", - "Check Readiness Probe Configuration for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}`: Validates if a readiness probe has possible misconfigurations", - "Check for Container Restarts in StatefulSet `${STATEFULSET_NAME}` 
in Namespace `${NAMESPACE}`: Analyzes container restart patterns in the StatefulSet pods to identify the root cause of restarts, distinguishing between OOM kills, liveness probe failures, and other termination causes.", - "Inspect StatefulSet Warning Events for `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}`: Fetches warning events related to the StatefulSet workload in the namespace and triages any issues found in the events.", - "Fetch StatefulSet Workload Details For `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}`: Fetches the current state of the StatefulSet for future review in the report.", - "Inspect StatefulSet Replicas for `${STATEFULSET_NAME}` in namespace `${NAMESPACE}`: Pulls the replica information for a given StatefulSet and checks if it's highly available, if the replica counts are the expected / healthy values, and raises issues if it is not progressing and is missing pods. Includes StatefulSet-specific checks for ordered deployment.", - "Check StatefulSet PersistentVolumeClaims for `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}`: Checks the status of PersistentVolumeClaims associated with the StatefulSet and identifies storage-related issues.", - "Identify Recent Configuration Changes for StatefulSet `${STATEFULSET_NAME}` in Namespace `${NAMESPACE}`: Identifies recent configuration changes from ControllerRevision analysis that might be related to current issues." - ], - "readme": "# Kubernetes Statefulset Triage\nThis set of tasks inspects the state of a statefulset resource in a namespace, checking replicas, events, and status, and raising issues if they're not at expected or minimum values.\n\n## Tasks\n`Fetch StatefulSet Logs`\n`Get Related StatefulSet Events`\n`Fetch StatefulSet Manifest Details`\n`Check StatefulSet Replicas`\n\n## Configuration\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `kubeconfig`: The kubeconfig secret containing access info for the cluster.\n- `kubectl`: The location service used to interpret shell commands. Default value is `kubectl-service.shared`.\n- `KUBERNETES_DISTRIBUTION_BINARY`: Which binary to use for Kubernetes CLI commands. Default value is `kubectl`.\n- `CONTEXT`: The Kubernetes context to operate within.\n- `NAMESPACE`: The name of the namespace to search. 
Leave it blank to search in all namespaces.\n- `STATEFULSET_NAME`: The name of the statefulset to query for state and check for issues.\n- `LABELS`: What kubernetes labels to use for selecting resources when checking values.\n\n## Requirements\n- A kubeconfig with appropriate RBAC permissions to perform the desired command.\n\n## TODO\n- [ ] Add additional documentation.\n- [ ] Review label usage for ephemeral sets\n\n", - "libraries": [ - "DateTime", - "RW.K8sLog", - "Collections", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.NextSteps", - "RW.platform", - "RW.K8sHelper", - "OperatingSystem", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-statefulset-healthcheck" - }, - { - "slug": "rw-cli-codecollection-k8s-argocd-helm-health", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-argocd-helm-health", - "display_name": "k8s-argocd-helm-health", - "description": "This codebundle is used to help measure and troubleshoot the health of ArgoCD managed Helm deployments.", - "platform": "Kubernetes", - "author": "nmadhok", - "support_tags": [ - "rw" - ], - "tasks": [ - "Fetch all available ArgoCD Helm releases in namespace `${NAMESPACE}`", - "Fetch Installed ArgoCD Helm release versions in namespace `${NAMESPACE}`" - ], - "capabilities": [ - "Fetch all available ArgoCD Helm releases in namespace `${NAMESPACE}`: List all ArgoCD helm releases that are visible to the kubeconfig.", - "Fetch Installed ArgoCD Helm release versions in namespace `${NAMESPACE}`: Fetch Installed ArgoCD Helm release Versions." - ], - "readme": "# Kubernetes ArgoCD Helm Health\nThis codebundle is used to help measure and troubleshoot the health of ArgoCD managed Helm deployments. \n\n## TaskSet\nThis taskset collects information and runs general troubleshooting checks against argocd Helm application objects within a namespace.\n\nExample configuration for an application in which the ArgoCD Application object resides in the same namespace as the resources themselves: \n```\nexport DISTRIBUTION=Kubernetes\nexport CONTEXT=cluster-1\nexport NAMESPACE=otel-demo\nexport RESOURCE_NAME=\"applications.argoproj.io\"\n```\n\n## TODO\n- [ ] Try support for list of applications in conjunction with single application\n- [ ] Add documentation\n- [ ] Add issues\n", - "libraries": [ - "RW.CLI", - "RW.Core", - "RW.platform" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-argocd-helm-health" - }, - { - "slug": "rw-cli-codecollection-k8s-artifactory-health", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-artifactory-health", - "display_name": "k8s-artifactory-health", - "description": "This codebundle queries the health REST endpoints of an Artifactory workload in Kubernetes, checking if the service is healthy, and raising issues if it's not.", - "platform": "Kubernetes", - "author": "jon-funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check Artifactory Liveness and Readiness Endpoints in `NAMESPACE`" - ], - "capabilities": [ - "Check Artifactory Liveness and Readiness Endpoints in `NAMESPACE`: Runs a set of exec commands internally in the Artifactory workloads to curl the system health endpoints."
- ], - "readme": "# Kubernetes Artifactory Triage\n\nThis codebundle queries the health REST endpoints of an Artifactory workload in Kubernetes, checking if the service is healthy, and raising issues if it's not.\n\n## Tasks\n`Check Artifactory Liveness and Readiness Endpoints`\n\n## Configuration\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `kubeconfig`: The kubeconfig secret containing access info for the cluster.\n- `kubectl`: The location service used to interpret shell commands. Default value is `kubectl-service.shared`.\n- `KUBERNETES_DISTRIBUTION_BINARY`: Which binary to use for Kubernetes CLI commands. Default value is `kubectl`.\n- `CONTEXT`: The Kubernetes context to operate within.\n- `NAMESPACE`: The name of the namespace to search. Leave it blank to search in all namespaces.\n- `STATEFULSET_NAME`: The name of the Artifactory Statefulset\n- `EXPECTED_AVAILABILITY`: The number of replicas allowed.\n- `LABELS`: Labels used for selecting the workload(s).\n\n## Requirements\n- A kubeconfig with appropriate RBAC permissions to perform the desired command.\n\n## TODO\n- [ ] Add additional documentation.", - "libraries": [ - "DateTime", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-artifactory-health" - }, - { - "slug": "rw-cli-codecollection-gcloud-log-inspection", - "collection_slug": "rw-cli-codecollection", - "name": "gcloud-log-inspection", - "display_name": "gcloud-log-inspection", - "description": "Runs a task which performs an inspection on your logs in a GCP project, returning results regarding common issues, counts and related Kubernetes namespaces using a filter.", - "platform": "Unknown", - "author": "jon-funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Inspect GCP Logs For Common Errors in GCP Project `${GCP_PROJECT_ID}`" - ], - "capabilities": [ - "Inspect GCP Logs For Common Errors in GCP Project `${GCP_PROJECT_ID}`: Fetches logs from a Google Cloud Project and filters for a count of common error messages." - ], - "readme": "# GCP Log Inspection\n\nRuns a task which performs an inspection on your logs in a GCP project, returning results regarding common issues, counts and related Kubernetes namespaces using a filter.\n\n## Tasks\n`Inspect GCP Logs For Common Errors`\n\n## Configuration\n\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `SEVERITY`: What severity to filter on, this will be the minimum severity returned in the log results.\n- `ADD_FILTERS`: An optional filter that can be added to the log query to customize results further.\n- `GCLOUD_SERVICE`: The remote gcloud service to use for requests.\n- `gcp_credentials`: The json credentials secrets file used to authenticate with the GCP project. 
Should be a service account.\n- `GCP_PROJECT_ID`: The unique project ID identifier string.\n\n## Notes\n\nThe `gcp_credentials` service account will need view and list permissions on the GCP logging API.\n\n## TODO\n- [ ] Add documentation\n- [ ] Add IAM settings examples\n- [ ] Add flexible result breakdown behaviour for non-kubernetes projects\n- [ ] Refine raised issues", - "libraries": [ - "RW.CLI", - "RW.Core", - "OperatingSystem", - "DateTime" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/gcloud-log-inspection" - }, - { - "slug": "rw-cli-codecollection-k8s-fluxcd-reconcile", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-fluxcd-reconcile", - "display_name": "k8s-fluxcd-reconcile", - "description": "This codebundle measures the number of reconciliation errors in the fluxcd controllers and can generate a report of them.", - "platform": "Kubernetes", - "author": "jon-funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Health Check Flux Reconciliation", - "Check FluxCD Reconciliation Health in Kubernetes Namespace `${FLUX_NAMESPACE}`" - ], - "capabilities": [ - "Health Check Flux Reconciliation: Measures failing reconciliations for fluxcd", - "Check FluxCD Reconciliation Health in Kubernetes Namespace `${FLUX_NAMESPACE}`: Fetches reconciliation logs for flux and creates a report for them." - ], - "readme": "# Kubernetes FluxCD Reconciliation Errors\nThis codebundle measures the number of reconciliation errors in the fluxcd controllers and can generate a report of them.\n\n## TaskSet\nThis taskset generates a report containing a summary of logs for each controller and their error counts, ending with a total error count.\n\nExample configuration: \n```\nCONTEXT=sandbox-cluster-1\n```\n\n## SLI\nThe SLI can be used to monitor the overall health of the reconciliation loops for FluxCD and alert developers when a bad manifest has been provided.\n\n## Requirements\n- A kubeconfig with `get` permissions on the objects/namespaces that are involved in the query.\n\n## TODO\n- Add additional rbac and kubectl resources and use cases", - "libraries": [ - "RW.K8sLog", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem", - "String", - "Process" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-fluxcd-reconcile" - }, - { - "slug": "rw-cli-codecollection-k8s-app-troubleshoot", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-app-troubleshoot", - "display_name": "k8s-app-troubleshoot", - "description": "This codebundle attempts to identify issues recently introduced by application code changes.", - "platform": "Kubernetes", - "author": "jon-funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Measure Application Exceptions in `${NAMESPACE}`", - "Get `${CONTAINER_NAME}` Application Logs from Workload `${WORKLOAD_NAME}` in Namespace `${NAMESPACE}`", - "Scan `${CONTAINER_NAME}` Application For Misconfigured Environment", - "Tail `${CONTAINER_NAME}` Application Logs For Stacktraces in Workload `${WORKLOAD_NAME}`", - "# Check Database Migrations" - ], - "capabilities": [ - "Measure Application Exceptions in `${NAMESPACE}`: Examines recent logs for exceptions, providing a count of them.", - "Get `${CONTAINER_NAME}` Application Logs from Workload `${WORKLOAD_NAME}` in Namespace `${NAMESPACE}`: Collects the last approximately 300 lines of logs from the workload", - "Scan `${CONTAINER_NAME}` Application For Misconfigured Environment: Compares codebase to
configured infra environment variables and attempts to report missing environment variables in the app", - "Tail `${CONTAINER_NAME}` Application Logs For Stacktraces in Workload `${WORKLOAD_NAME}`: Performs an inspection on container logs for exceptions/stacktraces, parsing them and attempting to find relevant source code information" - ], - "readme": "# Kubernetes Application Troubleshoot\nThis codebundle attempts to identify issues recently introduced by application code changes. \n\n## Tasks\n`Get Resource Logs`\n`Scan For Misconfigured Environment`\n`Troubleshoot Application Logs`\n\n## Configuration\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `kubeconfig`: The kubeconfig secret containing access info for the cluster.\n- `kubectl`: The location service used to interpret shell commands. Default value is `kubectl-service.shared`.\n- `KUBERNETES_DISTRIBUTION_BINARY`: Which binary to use for Kubernetes CLI commands. Default value is `kubectl`.\n- `CONTEXT`: The Kubernetes context to operate within.\n- `NAMESPACE`: The name of the namespace to search. Leave it blank to search in all namespaces.\n- `LABELS`: The labels used for resource selection, particularly for fetching logs.\n- `REPO_URI`: The URI for the git repo used to fetch source code, can be a GitHub URL.\n- `NUM_OF_COMMITS`: How many commits to search through into the past to identify potential problems.\n- `CREATE_ISSUES`: A boolean flag whether or not to create github issues for the related parsed exceptions.\n- `LOGS_SINCE`: How far back to scan for logs, eg: 20m, 3h\n- `EXCLUDE_PATTERN`: an extended grep pattern used to filter out log results, such as exceptions/errors that you don't care about.\n- `CONTAINER_NAME`: the name of the container within the labeled workload to fetch logs from.\n- `MAX_LOG_LINES`: The maximum number of logs to fetch.
Setting this too high can affect performance.\n\n## Requirements\n- A kubeconfig with appropriate RBAC permissions to perform the desired command, particularly exec\n- An OAuth token for github authentication, with read permissions on repositories and write permissions on issues.\n\n## Automated Building\nAdditionally, you must have the following manifest changes in order for workspace builder to automatically set up this codebundle for you:\n\n- A deployment with the following annotations and labels:\n - annotations.gitApplication: YOUR_GIT_URL\n - annotations.gitTokenName: THE_WORKSPACE_TOKEN_NAME\n - labels.app: app name that matches the container name in the pod to pull logs from\n\n## TODO\n- [ ] New keywords for code inspection\n- [ ] SPIKE for potential genAI integration\n- [ ] Add additional documentation.\n\n", - "libraries": [ - "RW.K8sApplications", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-app-troubleshoot" - }, - { - "slug": "rw-cli-codecollection-k8s-jenkins-healthcheck", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-jenkins-healthcheck", - "display_name": "k8s-jenkins-healthcheck", - "description": "This taskset performs checks against the Jenkins REST API to determine if there are any stuck jobs, which will result in raised issues if any are detected.", - "platform": "Kubernetes", - "author": "jon-funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Query The Jenkins Kubernetes Workload HTTP Endpoint in Kubernetes StatefulSet `${STATEFULSET_NAME}`", - "Query For Stuck Jenkins Jobs in Kubernetes Statefulset Workload `${STATEFULSET_NAME}`" - ], - "capabilities": [ - "Query The Jenkins Kubernetes Workload HTTP Endpoint in Kubernetes StatefulSet `${STATEFULSET_NAME}`: Performs a curl within the jenkins statefulset kubernetes workload to determine if the pod is up and healthy, and can serve requests.", - "Query For Stuck Jenkins Jobs in Kubernetes Statefulset Workload `${STATEFULSET_NAME}`: Performs a curl within the jenkins statefulset kubernetes workload to check for stuck jobs in the jenkins pipeline queue." - ], - "readme": "# Kubernetes Jenkins Healthcheck\n\nThis taskset performs checks against the Jenkins REST API to determine if there are any stuck jobs, which will result in raised issues if any are detected.\n\n## Tasks\n`Query The Jenkins Kubernetes Workload HTTP Endpoint`\n`Query For Stuck Jenkins Jobs`\n\n## Configuration\n\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `kubeconfig`: The kubeconfig secret containing access info for the cluster.\n- `kubectl`: The location service used to interpret shell commands. Default value is `kubectl-service.shared`.\n- `KUBERNETES_DISTRIBUTION_BINARY`: Which binary to use for Kubernetes CLI commands. Default value is `kubectl`.\n- `CONTEXT`: The Kubernetes context to operate within.\n- `NAMESPACE`: The name of the namespace to search.
Leave it blank to search in all namespaces.\n- `STATEFULSET_NAME`: The name of the statefulset running jenkins\n- `JENKINS_SA_USERNAME`: The jenkins username associated with the API token\n- `JENKINS_SA_TOKEN`: The API token used to perform healthcheck API requests against the endpoint\n\n## Notes\n\nPlease note that the script requires permissions to execute commands within the Kubernetes cluster, and it may require additional permissions depending on the tasks it performs (for example, fetching storage utilization for PVC mounts requires kubectl exec permissions). Make sure to review the tasks and the required permissions before running the script.\n\n## TODO\n- [ ] Add additional complex pipeline checks for various bad pipeline states\n- [ ] Add executor checks\n- [ ] Add documentation\n- [ ] Refine raised issues\n", - "libraries": [ - "DateTime", - "Collections", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-jenkins-healthcheck" - }, - { - "slug": "rw-cli-codecollection-gcloud-node-preempt", - "collection_slug": "rw-cli-codecollection", - "name": "gcloud-node-preempt", - "display_name": "gcloud-node-preempt", - "description": "This code checks if any GCP (Google Cloud Platform) nodes have an active preempt operation. It uses the gcloud command-line tool to interact with GCP APIs and retrieve the necessary information.", - "platform": "Unknown", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Count the number of nodes in active preempt operation in project `${GCP_PROJECT_ID}`", - "List all nodes in an active preempt operation for GCP Project `${GCP_PROJECT_ID}` within the last `${AGE}` hours" - ], - "capabilities": [ - "Count the number of nodes in active preempt operation in project `${GCP_PROJECT_ID}`: Counts all nodes that have been preempted within the defined time interval.", - "List all nodes in an active preempt operation for GCP Project `${GCP_PROJECT_ID}` within the last `${AGE}` hours: Fetches all nodes that have been preempted within the defined time interval." - ], - "readme": "# gcloud Node Preempt List\nThis code checks if any GCP (Google Cloud Platform) nodes have an active preempt operation. It uses the gcloud command-line tool to interact with GCP APIs and retrieve the necessary information.\n\n\n## SLI\nThe SLI lists all preempt node operations that have a status that does not match \"DONE\", counts the total nodes in this state, and pushes the metric. \n\n## TaskSet \nThe Taskset lists all preempt node operations that have a status that does not match \"DONE\" and returns the following details in json format: \n\n- startTime\n- targetLink\n- statusMessage\n- progress\n- zone\n- selfLink \n\n\n## Requirements\nThe following permissions are required on the GCP service account used with the gcloud utility: \n\n - 'compute.globalOperations.list'", - "libraries": [ - "DateTime", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/gcloud-node-preempt" - }, - { - "slug": "rw-cli-codecollection-k8s-pvc-healthcheck", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-pvc-healthcheck", - "display_name": "k8s-pvc-healthcheck", - "description": "This taskset provides a set of commands to troubleshoot storage-related issues in a Kubernetes cluster. 
It leverages the `kubectl` command-line tool to interact with the cluster and retrieve relevant information about persistent volume claims (PVCs), persistent volumes (PVs), and associated events.", - "platform": "Kubernetes", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Fetch the Storage Utilization for PVC Mounts in Namespace `${NAMESPACE}`", - "Generate Namespace Score for Namespace `${NAMESPACE}`", - "Fetch Events for Unhealthy Kubernetes PersistentVolumeClaims in Namespace `${NAMESPACE}`", - "List PersistentVolumeClaims in Terminating State in Namespace `${NAMESPACE}`", - "List PersistentVolumes in Terminating State in Namespace `${NAMESPACE}`", - "List Pods with Attached Volumes and Related PersistentVolume Details in Namespace `${NAMESPACE}`", - "Fetch the Storage Utilization for PVC Mounts in Namespace `${NAMESPACE}`", - "Check for RWO Persistent Volume Node Attachment Issues in Namespace `${NAMESPACE}`" - ], - "capabilities": [ - "Fetch the Storage Utilization for PVC Mounts in Namespace `${NAMESPACE}`: For each pod in a namespace, fetch the utilization of any PersistentVolumeClaims mounted using the linux df command. Requires kubectl exec permissions.", - "Fetch Events for Unhealthy Kubernetes PersistentVolumeClaims in Namespace `${NAMESPACE}`: Lists events related to PersistentVolumeClaims within the namespace that are not bound to PersistentVolumes.", - "List PersistentVolumeClaims in Terminating State in Namespace `${NAMESPACE}`: Lists persistentvolumeclaims in a Terminating state.", - "List PersistentVolumes in Terminating State in Namespace `${NAMESPACE}`: Lists events related to persistent volumes in Terminating state.", - "List Pods with Attached Volumes and Related PersistentVolume Details in Namespace `${NAMESPACE}`: For each pod in a namespace, collect details on configured PersistentVolumeClaim, PersistentVolume, and node.", - "Fetch the Storage Utilization for PVC Mounts in Namespace `${NAMESPACE}`: For each pod in a namespace, fetch the utilization of any PersistentVolumeClaims mounted using the linux df command. Requires kubectl exec permissions.", - "Check for RWO Persistent Volume Node Attachment Issues in Namespace `${NAMESPACE}`: For each pod in a namespace, check if it has an RWO persistent volume claim and if so, validate that the pod and the pv are on the same node." - ], - "readme": "# Kubernetes Storage Troubleshooting TaskSet\n\nThis taskset provides a set of commands to troubleshoot storage-related issues in a Kubernetes cluster. It leverages the `kubectl` command-line tool to interact with the cluster and retrieve relevant information about persistent volume claims (PVCs), persistent volumes (PVs), and associated events.\n\n\n## Configuration\n\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `kubeconfig`: The path to the Kubernetes kubeconfig YAML file containing connection configuration used to connect to cluster(s).\n- `kubectl`: The location service used to interpret shell commands. Default value is `kubectl-service.shared`.\n- `KUBERNETES_DISTRIBUTION_BINARY`: Which binary to use for Kubernetes CLI commands. Default value is `kubectl`.\n- `CONTEXT`: The Kubernetes context to operate within.\n- `NAMESPACE`: The name of the namespace to search. 
Leave it blank to search in all namespaces.\n\n## SLI \nThe SLI generates an aggregate score, where: \n- 1 = Healthy\n- 0 = Failed\n- 0 < x < 1 = Degraded\n\nCurrently, it uses `Fetch the Storage Utilization for PVC Mounts` from the TaskSet and lowers the score if any issues arise. \n\n\n## TaskSet\n\nThe TaskSet provides the following tasks:\n\n- `Fetch Events for Unhealthy Kubernetes Persistent Volume Claims`: This task lists events related to persistent volume claims within the desired namespace that are not bound to a persistent volume. It retrieves the events and displays information like the last timestamp, name, and message associated with the PVC.\n- `List Persistent Volumes in Terminating State`: This task lists events related to persistent volumes in a Terminating state. It retrieves the events and displays information like the last timestamp, name, and message associated with the PV.\n- `List Pods with Attached Volumes and Related PV Details`: This task collects details on the configured persistent volume claim, persistent volume, and node for each pod in the specified namespace. It displays information such as the pod name, PVC name, PV name, status, node, zone, ingress class, access modes, and reclaim policy.\n- `Fetch the Storage Utilization for PVC Mounts`: This task retrieves the storage utilization for PVC mounts in each pod within the specified namespace. It executes the `df -h` command inside each pod and displays information about the pod, PVC, volume name, container name, and mount path. It also checks if the PVC utilization exceeds 95% and raises an issue if it does.\n- `Check for RWO Persistent Volume Node Attachment Issues`: This task finds pods with RWO type storage and prints a report of which node the pod is scheduled on and where the storage is attached, with an \"OK\" if they are the same and \"Error\" if they are mismatched.
\n\n## Pre-requisites\n\nBefore running the runbook, ensure you have the following (for local use):\n\n- Access to the Kubernetes cluster\n- Permissions to: \n - List/Get PersistentVolumeClaims, PersistentVolumes, Nodes, Events\n - Execute on pods\n\nExample Kubernetes Role: \n- The following kubernetes role is provided as an example only, and should be modified to suit your environment: \n```\n# Role definition (role.yaml)\napiVersion: rbac.authorization.k8s.io/v1\nkind: Role\nmetadata:\n name: storage-troubleshooting-role\n namespace: \nrules:\n - apiGroups: [\"\"]\n resources: [\"persistentvolumeclaims\", \"events\"]\n verbs: [\"list\"]\n - apiGroups: [\"\"]\n resources: [\"persistentvolumes\"]\n verbs: [\"list\"]\n - apiGroups: [\"\"]\n resources: [\"pods\"]\n verbs: [\"get\"]\n - apiGroups: [\"\"]\n resources: [\"pods/exec\"]\n verbs: [\"create\"]\n - apiGroups: [\"\"]\n resources: [\"nodes\"]\n verbs: [\"list\", \"get\"]\n\n# RoleBinding definition (rolebinding.yaml)\napiVersion: rbac.authorization.k8s.io/v1\nkind: RoleBinding\nmetadata:\n name: storage-troubleshooting-rolebinding\n namespace: \nroleRef:\n apiGroup: rbac.authorization.k8s.io\n kind: Role\n name: storage-troubleshooting-role\nsubjects:\n - kind: User\n name: # Replace with the actual username or service account name\n\n```\n\n", - "libraries": [ - "DateTime", - "RW.K8sLog", - "Collections", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-pvc-healthcheck" - }, - { - "slug": "rw-cli-codecollection-k8s-fluxcd-helm-health", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-fluxcd-helm-health", - "display_name": "k8s-fluxcd-helm-health", - "description": "The `k8s-fluxcd-helm-health` codebundle checks for helm related resources within the Kubernetes cluster to surface up potential issues.", - "platform": "Kubernetes", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "List all available FluxCD Helmreleases in Namespace `${NAMESPACE}`", - "Fetch Installed FluxCD Helmrelease Versions in Namespace `${NAMESPACE}`", - "Fetch Mismatched FluxCD HelmRelease Version in Namespace `${NAMESPACE}`", - "Fetch FluxCD HelmRelease Error Messages in Namespace `${NAMESPACE}`", - "Check for Available Helm Chart Updates in Namespace `${NAMESPACE}`" - ], - "capabilities": [ - "List all available FluxCD Helmreleases in Namespace `${NAMESPACE}`: List all FluxCD helmreleases that are visible to the kubeconfig.", - "Fetch Installed FluxCD Helmrelease Versions in Namespace `${NAMESPACE}`: List helmreleases and the last attempted software version and the current running version.", - "Fetch Mismatched FluxCD HelmRelease Version in Namespace `${NAMESPACE}`: List helmreleases and use jq to display any releases where the last attempted software revision doesn't match the current running revision. Requires jq.", - "Fetch FluxCD HelmRelease Error Messages in Namespace `${NAMESPACE}`: List helmreleases and display the status conditions message for any helmreleases that are not in a Ready state.", - "Check for Available Helm Chart Updates in Namespace `${NAMESPACE}`: List all helmreleases in namespace and check for available helmchart updates." - ], - "readme": "# Kubernetes FluxCD Helm Health\nThe `k8s-fluxcd-helm-health` codebundle checks for helm related resources within the Kubernetes cluster to surface up potential issues. 
\n\n## TaskSet\nThis TaskSet looks for any helmreleases in the specified namespace within the configured context and: \n- prints a list of every helmrelease and its status\n- prints a list of all helm release version details\n- prints a list of helm releases that have mismatched versions (e.g. last attempted version doesn't match the running version)\n- prints all helmreleases that are not healthy along with the associated error messages\n\nExample configuration: \n```\nDISTRIBUTION=Kubernetes\nCONTEXT=sandbox-cluster-1\nNAMESPACE=vault\nRESOURCE_NAME=helmreleases\n```\n\nWith the example above, the TaskSet will collect the above mentioned data from the `vault` namespace in the `sandbox-cluster-1` cluster for the resources with a shortname of `helmreleases`. \n\n\n\n## Requirements\n- A kubeconfig with `get` permissions on the objects/namespaces that are involved in the query.\n\n\n## TODO\n- Add additional rbac and kubectl resources and use cases\n- Add an SLI for measuring helmrelease health\n- Add additional troubleshooting tasks as use cases evolve", - "libraries": [ - "RW.CLI", - "RW.Core", - "RW.platform" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-fluxcd-helm-health" - }, - { - "slug": "rw-cli-codecollection-k8s-argocd-application-health", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-argocd-application-health", - "display_name": "k8s-argocd-application-health", - "description": "This codebundle is used to help measure and troubleshoot the health of an ArgoCD managed application.", - "platform": "Kubernetes", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Fetch ArgoCD Application Sync Status & Health for `${APPLICATION}`", - "Fetch ArgoCD Application Last Sync Operation Details for `${APPLICATION}`", - "Fetch Unhealthy ArgoCD Application Resources for `${APPLICATION}`", - "Scan For Errors in Pod Logs Related to ArgoCD Application `${APPLICATION}`", - "Fully Describe ArgoCD Application `${APPLICATION}`" - ], - "capabilities": [ - "Fetch ArgoCD Application Sync Status & Health for `${APPLICATION}`: Shows the sync status and health of the ArgoCD application.", - "Fetch ArgoCD Application Last Sync Operation Details for `${APPLICATION}`: Fetches the last ArgoCD Application sync operation status.", - "Fetch Unhealthy ArgoCD Application Resources for `${APPLICATION}`: Displays all resources in an ArgoCD Application that are not in a healthy state.", - "Scan For Errors in Pod Logs Related to ArgoCD Application `${APPLICATION}`: Grep for the error pattern across all pods managed by this Application's deployments.", - "Fully Describe ArgoCD Application `${APPLICATION}`: Describe all details regarding the ArgoCD Application. Useful if reviewing all content." - ], - "readme": "# Kubernetes ArgoCD Application Health\nThis codebundle is used to help measure and troubleshoot the health of an ArgoCD managed application.
\n\n## TaskSet\nThis taskset collects information and runs general troubleshooting checks against argocd application objects within a namespace.\n\nExample configuration for an application in which the ArgoCD Application object resides in the same namespace as the resources themselves: \n```\nexport DISTRIBUTION=Kubernetes\nexport CONTEXT=cluster-1\nexport APPLICATION=otel-demo\nexport APPLICATION_TARGET_NAMESPACE=otel-demo\nexport APPLICATION_APP_NAMESPACE=otel-demo\nexport ERROR_PATTERN=\"Quota|Error|Exception\"\n```\n\n## TODO\n- [ ] Try support for list of applications in conjunction with single application\n- [ ] Add documentation\n- [ ] Add issues\n", - "libraries": [ - "DateTime", - "Collections", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-argocd-application-health" - }, - { - "slug": "rw-cli-codecollection-k8s-postgres-operations", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-postgres-operations", - "display_name": "k8s-postgres-operations", - "description": "This codebundle provides **operational remediation capabilities** for PostgreSQL clusters running in Kubernetes. It focuses exclusively on **performing actions** to fix issues identified by monitoring tools, with the primary capability being **reinitializing failed cluster members**. It supports both CrunchyDB and Zalando PostgreSQL operators.", - "platform": "Kubernetes", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Reinitialize Failed PostgreSQL Cluster Members for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}`", - "Perform PostgreSQL Cluster Failover Operation for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}`", - "Restart PostgreSQL Cluster with Rolling Update for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}`", - "Verify Cluster Recovery and Generate Summary for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}`" - ], - "capabilities": [ - "Reinitialize Failed PostgreSQL Cluster Members for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}`: Identify and reinitialize any failed cluster members", - "Perform PostgreSQL Cluster Failover Operation for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}`: Execute failover operation to promote a specific replica or perform automatic failover", - "Restart PostgreSQL Cluster with Rolling Update for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}`: Perform rolling restart of all PostgreSQL cluster members", - "Verify Cluster Recovery and Generate Summary for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}`: Final verification of cluster health after operations" - ], - "readme": "# PostgreSQL Operations CodeBundle\n\n## Overview\n\nThis codebundle provides **operational remediation capabilities** for PostgreSQL clusters running in Kubernetes. It focuses exclusively on **performing actions** to fix issues identified by monitoring tools, with the primary capability being **reinitializing failed cluster members**. 
It supports both CrunchyDB and Zalando PostgreSQL operators.\n\n**Note**: This codebundle is designed to work alongside `k8s-postgres-healthcheck` for complete PostgreSQL cluster management - healthcheck identifies issues, operations fixes them.\n\n## Key Features\n\n### \ud83d\udd27 Primary Operations\n- **Failed Member Reinitialize**: Core capability to recover corrupted or failed cluster members\n- **Emergency Failover**: Promote replicas to master during outages\n- **Cluster Scaling**: Add or remove cluster members for capacity management\n- **Rolling Restart**: Safe maintenance operations across all cluster members\n- **Multi-Operator Support**: Works with both CrunchyDB and Zalando PostgreSQL operators\n\n### \ud83d\udcca Operational Focus\n- **Action-Oriented**: All tasks perform cluster modifications (read-write operations)\n- **Remediation-Focused**: Designed to fix issues, not just detect them\n- **Robot Framework Integration**: Full automation for operational workflows\n- **Comprehensive Reporting**: Detailed operation logs and success/failure tracking\n\n### \ud83d\ude80 Advanced Capabilities\n- **Smart Recovery Methods**: Uses patronictl reinit with pod recreation fallback\n- **Safety Checks**: Validates operations and monitors recovery progress\n- **Error Handling**: Comprehensive issue tracking with severity levels\n- **Post-Operation Verification**: Confirms successful completion of operations\n\n## Scripts\n\n### `reinitialize_cluster_member.sh`\n**Primary script for failed member recovery**\n\n```bash\n# Automatic detection and reinitialize of failed members\nbash reinitialize_cluster_member.sh\n```\n\n**Features:**\n- Detects failed cluster members using patronictl\n- Attempts patronictl reinit first (clean recovery)\n- Falls back to pod deletion/recreation if needed\n- Verifies recovery success and cluster health\n- Comprehensive error handling and reporting\n\n### `cluster_operations.sh`\n**Comprehensive cluster management operations**\n\n```bash\n# Get cluster overview (default)\nOPERATION=overview bash cluster_operations.sh\n\n# Perform emergency failover to specific member\nOPERATION=failover TARGET_MEMBER=cluster-member-2 bash cluster_operations.sh\n\n# Scale cluster to 5 members\nOPERATION=scale REPLICA_COUNT=5 bash cluster_operations.sh\n\n# Perform rolling restart for maintenance\nOPERATION=restart bash cluster_operations.sh\n\n# Get cluster overview (read-only, mainly for verification)\nOPERATION=overview bash cluster_operations.sh\n```\n\n## Supported Operators\n\n### CrunchyDB PostgreSQL Operator\n- **Resource Type**: `postgresclusters.postgres-operator.crunchydata.com`\n- **Container Name**: `database`\n- **Pod Labels**: `postgres-operator.crunchydata.com/cluster=`\n- **Master Label**: `postgres-operator.crunchydata.com/role=master`\n\n### Zalando PostgreSQL Operator\n- **Resource Type**: `postgresqls.acid.zalan.do`\n- **Container Name**: `postgres`\n- **Pod Labels**: `application=spilo,cluster-name=`\n- **Master Label**: `spilo-role=master`\n\n## Environment Variables\n\n| Variable | Description | Example | Required |\n|----------|-------------|---------|----------|\n| `KUBERNETES_DISTRIBUTION_BINARY` | Kubernetes CLI binary | `kubectl` | Yes |\n| `CONTEXT` | Kubernetes context | `my-cluster` | Yes |\n| `NAMESPACE` | Target namespace | `postgres-system` | Yes |\n| `OBJECT_NAME` | Cluster name | `my-postgres-cluster` | Yes |\n| `OBJECT_API_VERSION` | Cluster API version | `postgres-operator.crunchydata.com/v1beta1` | Yes |\n| `DATABASE_CONTAINER` | Database 
container name | `database` or `postgres` | Yes |\n| `OPERATION` | Operation type | `overview`, `failover`, `scale`, `restart` | No |\n| `TARGET_MEMBER` | Target for failover | `cluster-member-2` | No |\n| `REPLICA_COUNT` | Desired replica count | `3` | No |\n\n## Robot Framework Integration\n\n### Runbook Tasks\n1. **Reinitialize Failed PostgreSQL Cluster Members for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}`** `[access:read-write]`\n2. **Perform PostgreSQL Cluster Failover Operation for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}`** `[access:read-write]`\n3. **Scale PostgreSQL Cluster Replicas for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}`** `[access:read-write]`\n4. **Restart PostgreSQL Cluster with Rolling Update for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}`** `[access:read-write]`\n5. **Verify Cluster Recovery and Generate Summary for Cluster `${OBJECT_NAME}` in Namespace `${NAMESPACE}`** `[access:read-write]`\n\n**Note**: All tasks require `access:read-write` permissions as they perform cluster operations. Task names include cluster and namespace variables for clarity and consistency with healthcheck tasks.\n\n### Integration with k8s-postgres-healthcheck\nThis codebundle", - "libraries": [ - "DateTime", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-postgres-operations" - }, - { - "slug": "rw-cli-codecollection-aws-elasticache-redis-health", - "collection_slug": "rw-cli-codecollection", - "name": "aws-elasticache-redis-health", - "display_name": "aws-elasticache-redis-health", - "description": "This runbook provides a comprehensive guide to managing and troubleshooting AWS Elasticache Redis configurations. It details procedures for validating configurations, analyzing metrics, and performing a broad fleet scan.", - "platform": "AWS", - "author": "jon-funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Scan ElastiCaches in AWS Region `${AWS_REGION}`", - "Scan AWS Elasticache Redis Status in AWS Region `${AWS_REGION}`" - ], - "capabilities": [ - "Scan ElastiCaches in AWS Region `${AWS_REGION}`: Performs a broad health scan of all Elasticache instances in the region.", - "Scan AWS Elasticache Redis Status in AWS Region `${AWS_REGION}`: Checks the high level metrics and status of the elasticache redis instances in the region." - ], - "readme": "# aws-elasticache-redis-service-down CodeBundle\n### Tags:`AWS`, `Elasticache`, `Redis`\n## CodeBundle Objective:\nThis runbook provides a comprehensive guide to managing and troubleshooting AWS Elasticache Redis configurations. 
It details procedures for validating configurations, analyzing metrics, and performing a broad fleet scan.\n\n## CodeBundle Inputs:\n\nexport AWS_REGION=\"PLACEHOLDER\"\n\nexport AWS_ACCESS_KEY_ID=\"PLACEHOLDER\"\n\nexport AWS_SECRET_ACCESS_KEY=\"PLACEHOLDER\"\n\n\n## CodeBundle Tasks:\n### `Validate AWS Elasticache Redis State`\n#### Tags:`AWS Elasticache`\n### Task Documentation:\nScans the current fleet of ElastiCache instances configuration and state for issues.\n#### Usage Example:\n`./validate_aws_elasticache_redis_config.sh`\n\n### `Analyze AWS Elasticache Redis Metrics`\n#### Tags:`aws`, `bash`, `script`, `cloudwatch`, `metrics`, `elasticache`, `redis`\n### Task Documentation:\nFetches all events for a fleet of ElastiCache instances and raises issues for present events.\n#### Usage Example:\n`./analyze_aws_elasticache_redis_metrics.sh`", - "libraries": [ - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem", - "String", - "Process" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/aws-elasticache-redis-health" - }, - { - "slug": "rw-cli-codecollection-jenkins-health", - "collection_slug": "rw-cli-codecollection", - "name": "jenkins-health", - "display_name": "jenkins-health", - "description": "This CodeBundle monitors and evaluates the health of Jenkins using the Jenkins REST API", - "platform": "Unknown", - "author": "saurabh3460", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check For Failed Build Logs in Jenkins Instance `${JENKINS_INSTANCE_NAME}`", - "Check For Long Running Builds in Jenkins Instance `${JENKINS_INSTANCE_NAME}`", - "Check For Recent Failed Tests in Jenkins Instance `${JENKINS_INSTANCE_NAME}`", - "Check For Jenkins Instance `${JENKINS_INSTANCE_NAME}` Health", - "Check For Long Queued Builds in Jenkins Instance `${JENKINS_INSTANCE_NAME}`", - "Check Jenkins Executor Utilization in Jenkins Instance `${JENKINS_INSTANCE_NAME}`", - "Generate Health Score", - "List Failed Build Logs in Jenkins Instance `${JENKINS_INSTANCE_NAME}`", - "List Long Running Builds in Jenkins Instance `${JENKINS_INSTANCE_NAME}`", - "List Recent Failed Tests in Jenkins Instance `${JENKINS_INSTANCE_NAME}`", - "Check Jenkins Instance `${JENKINS_INSTANCE_NAME}` Health", - "List Long Queued Builds in Jenkins Instance `${JENKINS_INSTANCE_NAME}`", - "List Executor Utilization in Jenkins Instance `${JENKINS_INSTANCE_NAME}`", - "Fetch Jenkins Instance `${JENKINS_INSTANCE_NAME}` Logs and Add to Report" - ], - "capabilities": [ - "Check For Failed Build Logs in Jenkins Instance `${JENKINS_INSTANCE_NAME}`: Check For Failed Build Logs in Jenkins", - "Check For Long Running Builds in Jenkins Instance `${JENKINS_INSTANCE_NAME}`: Check Jenkins builds that have been running longer than a specified threshold", - "Check For Recent Failed Tests in Jenkins Instance `${JENKINS_INSTANCE_NAME}`: Check For Recent Failed Tests in Jenkins", - "Check For Jenkins Instance `${JENKINS_INSTANCE_NAME}` Health: Check if Jenkins instance is reachable and responding", - "Check For Long Queued Builds in Jenkins Instance `${JENKINS_INSTANCE_NAME}`: Check for builds stuck in queue beyond threshold and calculate SLI score", - "Check Jenkins Executor Utilization in Jenkins Instance `${JENKINS_INSTANCE_NAME}`: Check if Jenkins executor utilization is above 80%", - "List Failed Build Logs in Jenkins Instance `${JENKINS_INSTANCE_NAME}`: Fetches logs from failed Jenkins builds using the Jenkins API", - "List Long Running Builds in Jenkins Instance 
`${JENKINS_INSTANCE_NAME}`: Identifies Jenkins builds that have been running longer than a specified threshold", - "List Recent Failed Tests in Jenkins Instance `${JENKINS_INSTANCE_NAME}`: List Recent Failed Tests in Jenkins Instance", - "Check Jenkins Instance `${JENKINS_INSTANCE_NAME}` Health: Check if Jenkins instance is reachable and responding", - "List Long Queued Builds in Jenkins Instance `${JENKINS_INSTANCE_NAME}`: Check for builds stuck in queue beyond threshold", - "List Executor Utilization in Jenkins Instance `${JENKINS_INSTANCE_NAME}`: Check Jenkins executor utilization across nodes", - "Fetch Jenkins Instance `${JENKINS_INSTANCE_NAME}` Logs and Add to Report: Fetches and displays Jenkins logs from the Atom feed" - ], - "readme": "# AWS Jenkins Health\n\nThis CodeBundle monitors and evaluates the health of Jenkins using the Jenkins REST API.\n\n## SLI\nThe SLI produces a score of 0 (bad), 1 (good), or a value in between. This score is generated by capturing the following: \n- Check if Jenkins instance is reachable and responding (endpoint)\n- Check For Failed Build Logs in Jenkins\n- Check For Long Running Builds in Jenkins\n- Check For Long Queued Builds in Jenkins\n- Check Jenkins Executor Utilization\n\n## TaskSet\nSimilar to the SLI, but produces a report on the specific Jenkins APIs and raises issues for each Jenkins check that requires attention. \n\n## Required Configuration\n\n```\n export JENKINS_URL=\"\"\n\texport JENKINS_USERNAME=\"\"\n\texport JENKINS_TOKEN=\"\"\n```\n\n## Testing \nSee the `.test` directory for infrastructure test code. ", - "libraries": [ - "Jenkins", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/jenkins-health" - }, - { - "slug": "rw-cli-codecollection-azure-acr-image-sync", - "collection_slug": "rw-cli-codecollection", - "name": "azure-acr-image-sync", - "display_name": "azure-acr-image-sync", - "description": "**Purpose**:", - "platform": "Azure", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Count Outdated Images in Azure Container Registry `${ACR_REGISTRY}`", - "Sync Container Images into Azure Container Registry `${ACR_REGISTRY}`" - ], - "capabilities": [ - "Count Outdated Images in Azure Container Registry `${ACR_REGISTRY}`: Counts the number of images that need updating in ACR from the upstream source.", - "Sync Container Images into Azure Container Registry `${ACR_REGISTRY}`: Synchronizes the latest container images into an ACR repository" - ], - "readme": "# Azure ACR Image Sync\n\n## Runbook: Azure ACR Image Sync\n\n**Purpose**: \nThis CodeBundle synchronizes container images from public repositories into an Azure Container Registry (ACR). It allows for automated image synchronization, applying an optional date tag, and handling tag conflicts based on user preferences.\n\n**Example Inputs**:\n\n- **ACR_REGISTRY**: \n - *Type*: `string` \n - *Description*: The name of the Azure Container Registry to import images into. \n - *Pattern*: `\\w*` \n - *Example*: `myacr.azurecr.io` \n - *Default*: `myacr.azurecr.io` \n\n- **IMAGE_MAPPINGS**: \n - *Type*: `string` \n - *Description*: JSON list of image source and destination mappings. \n - *Example*: `[{\"source\": \"docker.io/library/nginx:latest\", \"destination\": \"test/nginx\"}, {\"source\": \"docker.io/library/alpine:3.14\", \"destination\": \"test2/alpine\"}]` \n - *Default*: See example above.
\n\n- **USE_DATE_TAG_PATTERN**: \n - *Type*: `bool` \n - *Description*: Whether to append the date to the image tag. \n - *Default*: `False` \n\n- **TAG_CONFLICT_HANDLING**: \n - *Type*: `enum (overwrite, rename)` \n - *Description*: How to handle tags that already exist. \n - *Default*: `rename` \n\n- **DOCKER_USERNAME** & **DOCKER_TOKEN**: \n - *Type*: `string` \n - *Description*: Docker credentials for authentication in case of rate limits. \n\n- **azure_credentials**: \n - *Type*: `string` \n - *Description*: Secret containing `AZURE_CLIENT_ID`, `AZURE_TENANT_ID`, `AZURE_CLIENT_SECRET`, `AZURE_SUBSCRIPTION_ID`.\n\n**Task Description**: \nThe task **\"Sync Container Images into Azure Container Registry `${ACR_REGISTRY}`\"** runs a bash script (`acr_sync_images.sh`) to sync container images into the specified ACR registry, using the provided environment variables and secrets.\n\n--\n\n## SLI: Outdated Azure Container Registry Image Count\n\n**Purpose**: \nThis CodeBundle counts the number of outdated container images in an Azure Container Registry (ACR) by comparing the images in ACR against the upstream sources. It provides an overview of which images need updating.\n\n**Example Inputs**:\n\n- **ACR_REGISTRY**: \n - *Type*: `string` \n - *Description*: The name of the Azure Container Registry to analyze. \n - *Pattern*: `\\w*` \n - *Example*: `myacr.azurecr.io` \n - *Default*: `myacr.azurecr.io` \n\n- **IMAGE_MAPPINGS**: \n - *Type*: `string` \n - *Description*: JSON list of image source and destination mappings. \n - *Example*: `[{\"source\": \"docker.io/library/nginx:latest\", \"destination\": \"test/nginx\"}, {\"source\": \"docker.io/library/alpine:3.14\", \"destination\": \"test2/alpine\"}]` \n - *Default*: See example above. \n\n- **USE_DATE_TAG_PATTERN**: \n - *Type*: `bool` \n - *Description*: Whether to append the date to the image tag. \n - *Default*: `False` \n\n- **TAG_CONFLICT_HANDLING**: \n - *Type*: `enum (overwrite, rename)` \n - *Description*: How to handle tags that already exist. \n - *Default*: `rename` \n\n- **DOCKER_USERNAME** & **DOCKER_TOKEN**: \n - *Type*: `string` \n - *Description*: Docker credentials for authentication in case of rate limits. \n\n- **azure_credentials**: \n - *Type*: `string` \n - *Description*: Secret containing `AZURE_CLIENT_ID`, `AZURE_TENANT_ID`, `AZURE_CLIENT_SECRET`, `AZURE_SUBSCRIPTION_ID`.\n\n**Task Description**: \nThe task **\"Count Outdated Images in Azure Container Registry `${ACR_REGISTRY}`\"** runs a bash script (`check_for_image_updates.sh`) to count outdated images and outputs the total count that require updates in ACR. 
The result is pushed as a metric.\n", - "libraries": [ - "RW.platform", - "RW.Core", - "BuiltIn", - "OperatingSystem", - "RW.CLI", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/azure-acr-image-sync" - }, - { - "slug": "rw-cli-codecollection-k8s-deployment-ops", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-deployment-ops", - "display_name": "k8s-deployment-ops", - "description": "This codebundle provides a suite of operational tasks related to a deployment in Kubernetes clusters.", - "platform": "Kubernetes", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Restart Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`", - "Force Delete Pods in Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`", - "Rollback Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` to Previous Version", - "Scale Down Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`", - "Scale Up Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` by ${SCALE_UP_FACTOR}x", - "Clean Up Stale ReplicaSets for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`", - "Scale Down Stale ReplicaSets for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`", - "Scale Up HPA for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` by ${HPA_SCALE_FACTOR}x", - "Scale Down HPA for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` to Min ${HPA_MIN_REPLICAS}", - "Increase CPU Resources for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`", - "Increase Memory Resources for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`", - "Decrease CPU Resources for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`", - "Decrease Memory Resources for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`" - ], - "capabilities": [ - "Restart Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`: Perform a rollout restart on the deployment", - "Force Delete Pods in Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`: Force delete all pods related to the deployment", - "Rollback Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` to Previous Version: Perform a rollback to a known functional version", - "Scale Down Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`: Stops (or nearly stops) all running pods in a deployment to immediately halt a failing or runaway service.", - "Scale Up Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` by ${SCALE_UP_FACTOR}x: Increase deployment replicas", - "Clean Up Stale ReplicaSets for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`: Deletes all stale replicasets.", - "Scale Down Stale ReplicaSets for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`: Finds any old/stale replicasets that still have active pods and scales them down.", - "Scale Up HPA for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` by ${HPA_SCALE_FACTOR}x: Increase HPA min and max replicas by a scaling factor", - "Scale Down HPA for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` to Min ${HPA_MIN_REPLICAS}: Decrease HPA min and max replicas to specified minimum values or scale down by factor", - "Increase CPU Resources for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`: Intelligently increases CPU resources for a deployment based on VPA recommendations, HPA presence, or doubles current values. 
Does not apply if GitOps-managed or HPA exists.", - "Increase Memory Resources for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`: Intelligently increases memory resources for a deployment based on VPA recommendations, HPA presence, or doubles current values. Does not apply if GitOps-managed or HPA exists.", - "Decrease CPU Resources for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`: Intelligently decreases CPU resources for a deployment by dividing current values by scale down factor. Does not apply if GitOps-managed or HPA exists.", - "Decrease Memory Resources for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`: Intelligently decreases memory resources for a deployment by dividing current values by scale down factor. Does not apply if GitOps-managed or HPA exists." - ], - "readme": "# Kubernetes Deployment Operations\n\nThis codebundle provides a suite of operational tasks related to a deployment in Kubernetes clusters.\n\n## Tasks\n- Restart Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`\n- Force Delete Pods in Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`\n- Rollback Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` to Previous Version\n- Scale Down Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`\n- Scale Up Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` by ${SCALE_UP_FACTOR}x\n- Clean Up Stale ReplicaSets for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`\n- Scale Down Stale ReplicaSets for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`\n- **Scale Up HPA for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` by ${HPA_SCALE_FACTOR}x**\n- **Scale Down HPA for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}` to Min ${HPA_MIN_REPLICAS}**\n- **Increase CPU Resources for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`**\n- **Increase Memory Resources for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`**\n- **Decrease CPU Resources for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`**\n- **Decrease Memory Resources for Deployment `${DEPLOYMENT_NAME}` in Namespace `${NAMESPACE}`**\n\n### HPA Scaling Tasks\nThe HPA scaling tasks allow you to scale HorizontalPodAutoscaler min/max replicas:\n- **Scale Up HPA**: Multiplies current min/max replicas by `${HPA_SCALE_FACTOR}` (default: 2x)\n - Caps max replicas at `${HPA_MAX_REPLICAS}` to prevent excessive scaling\n - Useful during traffic spikes or capacity planning\n- **Scale Down HPA**: Sets both min and max replicas to `${HPA_MIN_REPLICAS}` (default: 1)\n - Useful for reducing resource usage during maintenance or off-peak hours\n - Effectively constrains autoscaling to a minimal level\n\n**GitOps Integration**: Both HPA scaling tasks check for GitOps management (Flux/ArgoCD labels and annotations). 
If an HPA is managed by GitOps, the tasks will only provide suggestions and not apply changes directly, with instructions to update the HPA manifest in your Git repository.\n\n### Resource Update Tasks\n\n#### Increase Resources\nThe resource increase tasks intelligently scale up CPU and memory resources based on:\n- **VPA Recommendations**: If a VerticalPodAutoscaler exists with recommendations, uses the upper bound value\n- **Default Behavior**: If no VPA exists, doubles the current resource request/limit\n- **CPU Format Support**: Properly handles all Kubernetes CPU formats:\n - Full cores: `1` (1000m), `0.5` (500m), `2` (2000m)\n - Millicores: `100m`, `500m`, `1000m`\n- **Memory Format Support**: Properly handles all Kubernetes memory formats (converts to Mi for calculation):\n - Binary units: `512Mi`, `1Gi`, `2Gi`, `1024Ki`\n - Decimal units: `512M`, `1G` (megabytes, gigabytes)\n- **GitOps-Managed Deployments**: Only provides suggestions (does not apply changes) if the deployment has GitOps annotations (Flux, ArgoCD)\n- **HPA Considerations**: Does not apply changes if HorizontalPodAutoscaler exists (only provides suggestions) to avoid conflicts\n\n#### Decrease Resources\nThe resource decrease tasks help optimize costs by reducing over-provisioned resources:\n- **Scale Down Factor**: Divides current CPU/memory requests and limits by `${RESOURCE_SCALE_DOWN_FACTOR}` (default: 2, meaning divide by 2)\n- **Safety Minimums**: Sets minimum thresholds (10m for CPU, 16Mi for memory) to prevent too-low values\n- **CPU Format Support**: Properly handles all Kubernetes CPU formats:\n - Full cores: `1` (1000m), `0.5` (500m), `2` (2000m)\n - Millicores: `100m`, `500m`, `1000m`\n- **Memory Format Support**: Properly handles all Kubernetes memory formats (converts to Mi for calculation):\n - Binary units: `512Mi`, `1Gi`, `2Gi`, `1024Ki`\n - Decimal units: `512M`, `1G` (megabytes, gigabytes)\n- **GitOps-Managed Deployments**: Only provides suggestions (does not apply changes) if the deployment has GitOps annotations (Flux, ArgoCD)\n- **HPA Considerations**: Does not apply changes if HorizontalPodAutoscaler exists (only provides suggestions) to avoid conflicts\n- **Use Cases**: Cost optimization, over-provisioned workloads, maintenance windows\n\n\n## Configuration\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `kubeconfig`: The kubeconfig secret containing access info for the cluster.\n- `KUBERNETES_DISTRIBUTION_BINARY`: Which binary to use for Kubernetes CLI commands. Default value is `kubectl`.\n- `CONTEXT`: The Kubernetes context to operate within.\n- `NAMESPACE`: The name of the namespace to search. 
Leave it blank to search in all namespaces.\n- `DEPLOYMENT_NAME`: The name of the deployment.\n- `SCALE_UP_FACTOR`: The factor by which to increase deployment replicas (default: 2)\n- `MAX_REPLICAS`: Maximum replicas allowed for deployment scale up operations (default: 10)\n- `ALLOW", - "libraries": [ - "DateTime", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.NextSteps", - "RW.platform", - "RW.K8sHelper", - "OperatingSystem", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-deployment-ops" - }, - { - "slug": "rw-cli-codecollection-gke-cluster-health", - "collection_slug": "rw-cli-codecollection", - "name": "gke-cluster-health", - "display_name": "gke-cluster-health", - "description": "This codebundle performs comprehensive health checking for Google Kubernetes Engine (GKE) clusters, including node pool analysis, instance group evaluation, and resource optimization recommendations.", - "platform": "Unknown", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Identify GKE Service Account Issues in GCP Project `${GCP_PROJECT_ID}`", - "Fetch GKE Recommendations for GCP Project `${GCP_PROJECT_ID}`", - "Fetch GKE Cluster Health for GCP Project `${GCP_PROJECT_ID}`", - "Check for Quota Related GKE Autoscaling Issues in GCP Project `${GCP_PROJECT_ID}`", - "Quick Node Instance Group Health Check for GCP Project `${GCP_PROJECT_ID}`", - "Generate GKE Cluster Health Score", - "Identify GKE Service Account Issues in GCP Project `${GCP_PROJECT_ID}`", - "Fetch GKE Recommendations for GCP Project `${GCP_PROJECT_ID}`", - "Fetch GKE Cluster Health for GCP Project `${GCP_PROJECT_ID}`", - "Check for Quota Related GKE Autoscaling Issues in GCP Project `${GCP_PROJECT_ID}`", - "Validate GKE Node Sizes for GCP Project `${GCP_PROJECT_ID}`", - "Fetch GKE Cluster Operations for GCP Project `${GCP_PROJECT_ID}`", - "Check Node Pool Health for GCP Project `${GCP_PROJECT_ID}`" - ], - "capabilities": [ - "Identify GKE Service Account Issues in GCP Project `${GCP_PROJECT_ID}`: Checks for IAM Service Account issues that can affect Cluster functionality", - "Fetch GKE Recommendations for GCP Project `${GCP_PROJECT_ID}`: Fetch and summarize GCP Recommendations for GKE Clusters", - "Fetch GKE Cluster Health for GCP Project `${GCP_PROJECT_ID}`: Using kubectl, fetch overall basic health of the cluster by checking unhealthy pods and overutilized nodes. Useful when Stackdriver is not available. Requires IAM permissions to fetch cluster credentials with viewer rights.", - "Check for Quota Related GKE Autoscaling Issues in GCP Project `${GCP_PROJECT_ID}`: Ensure that GKE Autoscaling will not be blocked by Quota constraints", - "Quick Node Instance Group Health Check for GCP Project `${GCP_PROJECT_ID}`: Fast detection of critical node instance group health issues like quota exhaustion and provisioning failures", - "Identify GKE Service Account Issues in GCP Project `${GCP_PROJECT_ID}`: Checks for IAM Service Account issues that can affect Cluster functionality", - "Fetch GKE Recommendations for GCP Project `${GCP_PROJECT_ID}`: Fetch and summarize GCP Recommendations for GKE Clusters", - "Fetch GKE Cluster Health for GCP Project `${GCP_PROJECT_ID}`: Using kubectl, fetch overall basic health of the cluster by checking unhealthy pods, overutilized nodes, and underutilized clusters with cost savings opportunities. Analyzes resource utilization and provides MSRP-based cost optimization recommendations. Useful when Stackdriver is not available.
Requires IAM permissions to fetch cluster credentials with viewer rights.", - "Check for Quota Related GKE Autoscaling Issues in GCP Project `${GCP_PROJECT_ID}`: Ensure that GKE Autoscaling will not be blocked by Quota constraints", - "Validate GKE Node Sizes for GCP Project `${GCP_PROJECT_ID}`: Analyse live pod requests/limits, node usage, and propose suitable GKE node machine types.", - "Fetch GKE Cluster Operations for GCP Project `${GCP_PROJECT_ID}`: Fetches GKE Operations and identifies stuck or failed tasks.", - "Check Node Pool Health for GCP Project `${GCP_PROJECT_ID}`: Performs comprehensive node pool health checking including instance group logs, compute operations, and Kubernetes events to surface hard-to-find issues like region exhaustion and quota blocking." - ], - "readme": "# GKE Cluster Health\n\nThis codebundle performs comprehensive health checking for Google Kubernetes Engine (GKE) clusters, including node pool analysis, instance group evaluation, and resource optimization recommendations.\n\n\n### Enhanced Instance Group Analysis\n- **Individual Instance Evaluation**: Now evaluates each instance within instance groups, not just group-level operations\n- **Instance State Monitoring**: Checks for `RUNNING`, `TERMINATED`, `FAILED`, and transitional states \n- **Missing Instance Detection**: Cross-validates Kubernetes nodes against compute instances to catch missing or orphaned instances\n- **Comprehensive Event Tracking**: Monitors both instance group and individual instance operations for failures\n\n### Improved Node Pool Health Monitoring\n- **Instance-Level Operations**: Tracks failed operations on individual instances (preemptions, quota failures, etc.)\n- **Node Readiness Validation**: Checks node conditions and taints that affect scheduling\n- **Cross-Platform Validation**: Ensures all expected instances from node pools are properly integrated into Kubernetes\n- **Enhanced Error Detection**: Better pattern matching for quota exhaustion, regional capacity issues, and permission problems\n\n### Node Sizing Analysis Improvements\n- **Better Node Discovery**: Enhanced validation that all nodes are being properly evaluated\n- **Unscheduled Pod Detection**: Identifies pods that cannot be scheduled due to resource constraints\n- **Node Pool Validation**: Cross-checks expected vs actual node counts to detect provisioning failures\n- **Resource Pressure Monitoring**: Improved detection of nodes under memory/disk pressure\n\n## Tasks\n\n### Identify GKE Service Account Issues\nChecks for IAM Service Account issues that can affect cluster functionality.\n\n### Fetch GKE Recommendations \nRetrieves and summarizes GCP Recommendations for GKE Clusters.\n\n### Fetch GKE Cluster Health\nPerforms comprehensive health checking including:\n- Pod health and crash loop detection\n- Node utilization analysis \n- **Cost optimization analysis** - identifies underutilized clusters with estimated MSRP cost savings\n- Resource availability assessment\n\n### Check for Quota Related Autoscaling Issues\nEnsures GKE Autoscaling will not be blocked by quota constraints.\n\n### Validate GKE Node Sizes\n**Enhanced with comprehensive instance evaluation:**\n- Analyzes live pod requests/limits and node usage\n- Proposes suitable GKE node machine types\n- **NEW**: Validates all instances are accounted for and healthy\n- **NEW**: Detects missing instances that should be part of the cluster\n- **NEW**: Cross-validates node pool expectations vs reality\n\n### Fetch GKE Cluster Operations\nFetches GKE
Operations and identifies stuck or failed tasks.\n\n### Check Node Pool Health \u2b50 **SIGNIFICANTLY ENHANCED**\n**Now includes comprehensive instance-level analysis and log retrieval:**\n- **Individual Instance Health**: Evaluates each instance in every instance group\n- **Instance State Monitoring**: Tracks `RUNNING`, `TERMINATED`, `PROVISIONING` states\n- **Missing Instance Detection**: Identifies instances that failed to join the cluster\n- **Operation History Analysis**: Reviews recent operations on both groups and individual instances \n- **Quota Exhaustion Detection**: Enhanced patterns for identifying resource constraints\n- **Kubernetes Event Correlation**: Links compute instance issues with K8s node events\n- **Cross-Platform Validation**: Ensures compute instances and K8s nodes are properly synchronized\n- **\ud83c\udd95 Managed Instance Group Log Analysis**: Retrieves and analyzes MIG logs for resource exhaustion, quota issues, and disk attachment errors\n- **\ud83c\udd95 Individual Instance Serial Console Logs**: Analyzes instance boot logs for memory exhaustion, disk space issues, and kernel panics\n- **\ud83c\udd95 Cloud Logging Integration**: Retrieves GCP Cloud Logging entries for instances to catch storage, metadata, and Kubernetes service failures\n- **\ud83c\udd95 MIG Manager Activity Logs**: Analyzes autoscaling operations, quota failures, and regional capacity exhaustion\n- **\ud83c\udd95 GKE Autoscaler Logs**: Examines cluster autoscaler logs for scaling failures and node group readiness issues\n\n## Key Improvements\n\n### Comprehensive Managed Instance Log Analysis \ud83c\udd95\nThe enhanced implementation now retrieves and analyzes actual managed instance logs to surface critical issues:\n\n#### **Managed Instance Group Logs**\n- **Resource Quota Exhaustion**: `quota exceeded`, `QUOTA_EXCEEDED`, `resource exhausted`, `ZONE_RESOURCE_POOL_EXHAUSTED`\n- **Disk Attachment Failures**: `disk attach failed`, `disk mount failed`, `volume attach error`\n- **Network Resource Exhaustion**: `network unavailable`, `subnet exhausted`, `IP allocation failed`\n- **Permission Issues**: `permission denied`, `access denied`, `unauthorized`\n- **Instance Provisioning Failures**: `instance creation failed`, `instance start failed`, `provisioning failed`\n\n#### **Individual Instance Serial Console Logs**\n- **Memory Exhaustion**: `Out of memory`, `O", - "libraries": [ - "DateTime", - "Collections", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/gke-cluster-health" - }, - { - "slug": "rw-cli-codecollection-azure-servicebus-health", - "collection_slug": "rw-cli-codecollection", - "name": "azure-servicebus-health", - "display_name": "azure-servicebus-health", - "description": "This codebundle performs a health check on Azure Service Bus resources and provides insights and recommended actions for detected issues.", - "platform": "Azure", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check for Resource Health Issues Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Basic Connectivity for Service Bus `${SB_NAMESPACE_NAME}`", - "Check Critical Metrics for Service Bus `${SB_NAMESPACE_NAME}`", - "Generate Enhanced Service Bus Health Score", - "Check for Resource Health Issues Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Configuration Health for Service Bus 
`${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Metrics for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Queue Health for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Topic Health for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Log Analytics for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Capacity and Quota Headroom for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Geo-Disaster Recovery for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Security Configuration for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Discover Related Resources for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Test Connectivity to Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Azure Monitor Alerts for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`" - ], - "capabilities": [ - "Check for Resource Health Issues Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Fetch a list of issues that might affect the service bus instance", - "Check Basic Connectivity for Service Bus `${SB_NAMESPACE_NAME}`: Quick connectivity test to detect network issues", - "Check Critical Metrics for Service Bus `${SB_NAMESPACE_NAME}`: Quick check of critical metrics that indicate immediate issues", - "Check for Resource Health Issues Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Fetch a list of issues that might affect the service bus instance", - "Check Configuration Health for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Fetch the details and health of the service bus configuration", - "Check Metrics for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Analyze Service Bus metrics for potential issues", - "Check Queue Health for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Analyze Service Bus queues for health issues", - "Check Topic Health for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Analyze Service Bus topics and subscriptions for health issues", - "Check Log Analytics for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Query Log Analytics for Service Bus related logs and errors", - "Check Capacity and Quota Headroom for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Analyze Service Bus capacity utilization and quota headroom", - "Check Geo-Disaster Recovery for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Check the geo-disaster recovery configuration and health", - "Check Security Configuration for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Audit SAS keys and RBAC assignments for security best practices", - "Discover Related Resources for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Discover and map Azure resources related to the Service Bus namespace", - "Test Connectivity to Service Bus `${SB_NAMESPACE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Test network connectivity to the Service Bus namespace", - "Check Azure Monitor Alerts for Service Bus `${SB_NAMESPACE_NAME}` In Resource Group 
`${AZ_RESOURCE_GROUP}`: Check for the presence and configuration of Azure Monitor alerts" - ], - "readme": "# Azure Service Bus Health\n\nThis codebundle performs a health check on Azure Service Bus resources and provides insights and recommended actions for detected issues.\n\n## Scripts\n\nThe codebundle includes the following scripts:\n\n- **service_bus_resource_health.sh**: Checks the Azure Resource Health status of the Service Bus namespace\n- **service_bus_config_health.sh**: Analyzes the configuration of the Service Bus namespace for best practices\n- **service_bus_metrics.sh**: Retrieves and analyzes Service Bus metrics for potential issues\n- **service_bus_queue_health.sh**: Checks the health of Service Bus queues (message counts, size, status)\n- **service_bus_topic_health.sh**: Checks the health of Service Bus topics and their subscriptions\n- **service_bus_log_analytics.sh**: Queries Log Analytics for Service Bus related logs and errors\n- **service_bus_capacity.sh**: Analyzes capacity utilization and quota headroom\n- **service_bus_disaster_recovery.sh**: Checks geo-disaster recovery configuration and health\n- **service_bus_security_audit.sh**: Audits SAS keys and RBAC assignments for security best practices\n- **service_bus_related_resources.sh**: Discovers and maps Azure resources related to the Service Bus\n- **service_bus_connectivity_test.sh**: Tests network connectivity to the Service Bus namespace\n- **service_bus_alerts_check.sh**: Checks for the presence and configuration of Azure Monitor alerts\n\n## Tasks\n\nThe runbook contains tasks to:\n\n1. Check Resource Health status for Service Bus namespaces\n2. Validate Service Bus configuration against best practices\n3. Analyze Service Bus metrics for anomalies\n4. Check queue health (dead letters, message counts, size limits)\n5. Check topic and subscription health\n6. Query and analyze logs from Log Analytics\n7. Analyze capacity utilization and quota headroom\n8. Check geo-disaster recovery configuration\n9. Audit security configurations (SAS keys, RBAC)\n10. Discover and map related Azure resources\n11. Test network connectivity to the Service Bus\n12. 
Check for proper Azure Monitor alerts\n\n## Required Variables\n\n- `AZ_RESOURCE_GROUP`: The resource group containing the Service Bus namespace\n- `SB_NAMESPACE_NAME`: The name of the Service Bus namespace to check\n\n## Optional Variables\n\n- `AZURE_RESOURCE_SUBSCRIPTION_ID`: The subscription ID (defaults to current az login context)\n- `METRIC_INTERVAL`: Time interval for metrics in ISO 8601 format (default: PT1H - 1 hour)\n- `QUERY_TIMESPAN`: Time span for log queries (default: P1D - 1 day)\n- `SAS_KEY_MAX_AGE_DAYS`: Maximum age for SAS keys in days (default: 90)\n\n### Configurable Thresholds\n\n- `ACTIVE_MESSAGE_THRESHOLD`: Threshold for active message count alerts (default: 1000)\n- `DEAD_LETTER_THRESHOLD`: Threshold for dead letter message count alerts (default: 100)\n- `SIZE_PERCENTAGE_THRESHOLD`: Size percentage threshold for namespace/queue/topic alerts (default: 80)\n- `LATENCY_THRESHOLD_MS`: Latency threshold in milliseconds for connectivity alerts (default: 100)\n\n## Authentication\n\nThis codebundle requires Azure credentials with read access to the Service Bus namespace and related resources.\n\n## Local Testing\n\nAzure Auth\n```\nln -s ~/.azure/ /var/tmp/runwhen/azure-servicebus-health/runbook.robot/\nln -s ~/.azure/ /var/tmp/runwhen/azure-servicebus-health/sli.robot/\n", - "libraries": [ - "DateTime", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/azure-servicebus-health" - }, - { - "slug": "rw-cli-codecollection-k8s-daemonset-healthcheck", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-daemonset-healthcheck", - "display_name": "k8s-daemonset-healthcheck", - "description": "This codebundle provides a suite of tasks aimed at triaging issues related to a daemonset and its replicas in Kubernetes clusters.", - "platform": "Kubernetes", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Analyze Application Log Patterns for DaemonSet `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}`", - "Detect Log Anomalies for DaemonSet `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}`", - "Identify Recent Configuration Changes for DaemonSet `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}`", - "Check Liveness Probe Configuration for DaemonSet `${DAEMONSET_NAME}`", - "Check Readiness Probe Configuration for DaemonSet `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}`", - "Check for Container Restarts in DaemonSet `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}`", - "Inspect DaemonSet Warning Events for `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}`", - "Fetch DaemonSet Workload Details For `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}`", - "Inspect DaemonSet Status for `${DAEMONSET_NAME}` in namespace `${NAMESPACE}`", - "Check Node Affinity and Tolerations for DaemonSet `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}`" - ], - "capabilities": [ - "Analyze Application Log Patterns for DaemonSet `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}`: Fetches and analyzes logs from the DaemonSet pods for errors, stack traces, connection issues, and other patterns that indicate application health problems.", - "Detect Log Anomalies for DaemonSet `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}`: Analyzes logs for repeating patterns, anomalous behavior, and unusual log volume that may indicate underlying issues.", - "Identify Recent Configuration Changes for DaemonSet `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}`: Identifies recent configuration changes from 
ControllerRevision analysis that might be related to current issues.", - "Check Liveness Probe Configuration for DaemonSet `${DAEMONSET_NAME}`: Validates if a Liveness probe has possible misconfigurations", - "Check Readiness Probe Configuration for DaemonSet `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}`: Validates if a readiness probe has possible misconfigurations", - "Check for Container Restarts in DaemonSet `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}`: Analyzes container restart patterns in the DaemonSet pods to identify the root cause of restarts, distinguishing between OOM kills, liveness probe failures, and other termination causes.", - "Inspect DaemonSet Warning Events for `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}`: Fetches warning events related to the DaemonSet workload in the namespace and triages any issues found in the events.", - "Fetch DaemonSet Workload Details For `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}`: Fetches the current state of the DaemonSet for future review in the report.", - "Inspect DaemonSet Status for `${DAEMONSET_NAME}` in namespace `${NAMESPACE}`: Pulls the status information for a given DaemonSet and checks if all pods are properly scheduled and running across nodes, identifying node scheduling issues.", - "Check Node Affinity and Tolerations for DaemonSet `${DAEMONSET_NAME}` in Namespace `${NAMESPACE}`: Checks the node affinity, tolerations, and scheduling constraints of the DaemonSet to identify potential scheduling issues." - ], - "readme": "# Kubernetes DaemonSet Triage\n\nThis codebundle provides a suite of tasks aimed at triaging issues related to a daemonset and its replicas in Kubernetes clusters.\n\n## Tasks\n`Get DaemonSet Log Details For Report`\n`Get Related Daemonset Events`\n`Check Daemonset Replicas`\n\n## Configuration\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `kubeconfig`: The kubeconfig secret containing access info for the cluster.\n- `kubectl`: The location service used to interpret shell commands. Default value is `kubectl-service.shared`.\n- `KUBERNETES_DISTRIBUTION_BINARY`: Which binary to use for Kubernetes CLI commands. Default value is `kubectl`.\n- `CONTEXT`: The Kubernetes context to operate within.\n- `NAMESPACE`: The name of the namespace to search. Leave it blank to search in all namespaces.\n- `DAEMONSET_NAME`: The name of the daemonset.\n- `LABELS`: The labels used to query for resources.\n\n## Requirements\n- A kubeconfig with appropriate RBAC permissions to perform the desired command.\n\n## TODO\n- [ ] Add additional documentation.\n\n", - "libraries": [ - "DateTime", - "RW.K8sLog", - "Collections", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.NextSteps", - "RW.platform", - "RW.K8sHelper", - "OperatingSystem", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-daemonset-healthcheck" - }, - { - "slug": "rw-cli-codecollection-azure-adf-health", - "collection_slug": "rw-cli-codecollection", - "name": "azure-adf-health", - "display_name": "azure-adf-health", - "description": "This codebundle runs a suite of metrics checks for Data Factory in Azure. 
It identifies:", - "platform": "Azure", - "author": "saurabh3460", - "support_tags": [ - "rw" - ], - "tasks": [ - "Identify Health Issues Affecting Data Factories in resource group `${AZURE_RESOURCE_GROUP}`", - "Count Frequent Pipeline Errors in Data Factories in resource group `${AZURE_RESOURCE_GROUP}`", - "Count Failed Pipelines in Data Factories in resource group `${AZURE_RESOURCE_GROUP}`", - "Count Large Data Operations in Data Factories in resource group `${AZURE_RESOURCE_GROUP}`", - "Count Long Running Pipeline Runs in Data Factories in resource group `${AZURE_RESOURCE_GROUP}`", - "Generate Health Score", - "Check for Resource Health Issues Affecting Data Factories in resource group `${AZURE_RESOURCE_GROUP}`", - "List Frequent Pipeline Errors in Data Factories in resource group `${AZURE_RESOURCE_GROUP}`", - "List Failed Pipelines in Data Factories in resource group `${AZURE_RESOURCE_GROUP}`", - "Find Large Data Operations in Data Factories in resource group `${AZURE_RESOURCE_GROUP}`", - "Fetch Azure Data Factory Details in resource group `${AZURE_RESOURCE_GROUP}`", - "List Long Running Pipeline Runs in Data Factories in resource group `${AZURE_RESOURCE_GROUP}`" - ], - "capabilities": [ - "Identify Health Issues Affecting Data Factories in resource group `${AZURE_RESOURCE_GROUP}`: Fetch health status for all Data Factories in the resource group", - "Count Frequent Pipeline Errors in Data Factories in resource group `${AZURE_RESOURCE_GROUP}`: Count frequently occurring errors in Data Factory pipelines", - "Count Failed Pipelines in Data Factories in resource group `${AZURE_RESOURCE_GROUP}`: Count failed pipeline runs in Data Factory pipelines", - "Count Large Data Operations in Data Factories in resource group `${AZURE_RESOURCE_GROUP}`: Count large data operations in Data Factory pipelines", - "Count Long Running Pipeline Runs in Data Factories in resource group `${AZURE_RESOURCE_GROUP}`: Count long running pipeline runs in Data Factory pipelines", - "Generate Health Score: Calculate comprehensive health score with detailed reporting of each component", - "Check for Resource Health Issues Affecting Data Factories in resource group `${AZURE_RESOURCE_GROUP}`: Fetch health status for all Data Factories in the resource group", - "List Frequent Pipeline Errors in Data Factories in resource group `${AZURE_RESOURCE_GROUP}`: List frequently occurring errors in Data Factory pipelines", - "List Failed Pipelines in Data Factories in resource group `${AZURE_RESOURCE_GROUP}`: List failed pipeline runs in Data Factory pipelines", - "Find Large Data Operations in Data Factories in resource group `${AZURE_RESOURCE_GROUP}`: List large data operations in Data Factory pipelines", - "Fetch Azure Data Factory Details in resource group `${AZURE_RESOURCE_GROUP}`: List comprehensive details about Azure Data Factories", - "List Long Running Pipeline Runs in Data Factories in resource group `${AZURE_RESOURCE_GROUP}`: List long running pipeline runs in Data Factory pipelines" - ], - "readme": "# Azure Data Factory Health\nThis codebundle runs a suite of metrics checks for Data Factory in Azure. It identifies:\n- Check Azure Data Factory Availability\n\n## Configuration\n\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `AZURE_SUBSCRIPTION_ID`: The Azure subscription ID\n- `AZURE_RESOURCE_GROUP`: The Azure Resource Group\n\n## Testing \nSee the .test directory for infrastructure test code. 
\n\n## Notes\n\nThis codebundle assumes the service principal authentication flow.", - "libraries": [ - "DateTime", - "Jenkins", - "Collections", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/azure-adf-health" - }, - { - "slug": "rw-cli-codecollection-dns-health", - "collection_slug": "rw-cli-codecollection", - "name": "dns-health", - "display_name": "dns-health", - "description": "", - "platform": "Unknown", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "DNS Resolution Success Rate", - "DNS Query Latency", - "DNS Zone Health", - "External DNS Resolver Availability", - "Generate DNS Health Score", - "Check DNS Zone Records", - "Detect Broken Record Resolution", - "Test Forward Lookup Zones", - "External Resolution Validation", - "DNS Latency Check" - ], - "capabilities": [ - "DNS Resolution Success Rate: Measures the success rate of DNS resolution across all configured FQDNs and pushes a metric (0-100)", - "DNS Query Latency: Measures average DNS query latency in milliseconds across all configured FQDNs and pushes the metric", - "DNS Zone Health: Measures the health of configured DNS zones (1 for healthy, 0 for unhealthy)", - "External DNS Resolver Availability: Measures availability of external DNS resolvers (percentage of working resolvers)", - "Generate DNS Health Score: Calculates the overall DNS health score as the average of all individual task scores", - "Check DNS Zone Records: Verifies DNS zones and their record integrity", - "Detect Broken Record Resolution: Implements repeated DNS checks for multiple FQDNs to detect resolution failures", - "Test Forward Lookup Zones: Tests forward lookup zones and conditional forwarders for proper resolution", - "External Resolution Validation: Tests resolution of multiple public domains through multiple resolvers", - "DNS Latency Check: Tests DNS query latency for configured zones" - ], - "readme": "", - "libraries": [ - "Collections", - "RW.platform", - "RW.Core", - "BuiltIn", - "RW.CLI", - "OperatingSystem", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/dns-health" - }, - { - "slug": "rw-cli-codecollection-aws-cloudwatch-overused-ec2", - "collection_slug": "rw-cli-codecollection", - "name": "aws-cloudwatch-overused-ec2", - "display_name": "aws-cloudwatch-overused-ec2", - "description": "This taskset can be used to check a fleet of EC2 instances and return the list of instances which are classified as overutilized.", - "platform": "AWS", - "author": "jon-funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check For Overutilized Ec2 Instances" - ], - "capabilities": [ - "Check For Overutilized Ec2 Instances: Fetches CloudWatch metrics for a list of EC2 instances and raises issues if they're over-utilized based on a configurable threshold." - ], - "readme": "# AWS CloudWatch EC2 Instance Utilization Check\nThis taskset can be used to check a fleet of EC2 instances and return the list of instances which are classified as overutilized.\n\n## Tasks\n`Check For Overutilized Ec2 Instances`\n\n## Configuration\n\nThe TaskSet requires initialization to import necessary secrets, services, and user variables.
The following variables should be set:\n\n- `aws_access_key_id`: The service account's access key ID, used during the `aws sts` call.\n- `aws_secret_access_key`: The service account's secret access key, used during the `aws sts` call.\n- `aws_role_arn`: The full AWS role ARN that will be assumed.\n- `aws_assume_role_name`: The name of the role to assume as part of the `aws sts` assume role call.\n- `AWS_DEFAULT_REGION`: The AWS region to perform API requests in and for resources.\n- `AWS_SERVICE`: The remote AWS service to use for requests.\n- `UTILIZATION_THRESHOLD`: The threshold at which an EC2 instance is considered over-utilized.\n\n\n## Notes\n\nThis codebundle assumes a traditional service account authentication using the assume role functionality of `aws sts`, and therefore a role with the correct access will be required so that it can be assumed by the service account for a temporary token.\n\n## TODO\n- [ ] Add documentation\n- [ ] Expand utilization checks\n", - "libraries": [ - "RW.CLI", - "RW.Core" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/aws-cloudwatch-overused-ec2" - }, - { - "slug": "rw-cli-codecollection-k8s-ingress-gce-healthcheck", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-ingress-gce-healthcheck", - "display_name": "k8s-ingress-gce-healthcheck", - "description": "Triages the GCP HTTP Load Balancer resources that are created when an ingress object is detected and created by the ingress-gce controller.", - "platform": "Kubernetes", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Search For GCE Ingress Warnings in GKE Context `${CONTEXT}`", - "Identify Unhealthy GCE HTTP Ingress Backends in GKE Namespace `${NAMESPACE}`", - "Validate GCP HTTP Load Balancer Configurations in GCP Project `${GCP_PROJECT_ID}`", - "Fetch Network Error Logs from GCP Operations Manager for Ingress Backends in GCP Project `${GCP_PROJECT_ID}`", - "Review GCP Operations Logging Dashboard in GCP project `${GCP_PROJECT_ID}`" - ], - "capabilities": [ - "Search For GCE Ingress Warnings in GKE Context `${CONTEXT}`: Find warning events related to GCE Ingress and services objects", - "Identify Unhealthy GCE HTTP Ingress Backends in GKE Namespace `${NAMESPACE}`: Checks the backend annotations on the ingress object to determine if they are not registered as healthy", - "Validate GCP HTTP Load Balancer Configurations in GCP Project `${GCP_PROJECT_ID}`: Extract GCP HTTP Load Balancer components from ingress annotations and check health of each object", - "Fetch Network Error Logs from GCP Operations Manager for Ingress Backends in GCP Project `${GCP_PROJECT_ID}`: Fetch logs that are specific to the HTTP Load Balancer from within the last 60 minutes", - "Review GCP Operations Logging Dashboard in GCP project `${GCP_PROJECT_ID}`: Create URLs that will help users obtain logs from the GCP Dashboard" - ], - "readme": "# Kubernetes Ingress-GCE HealthCheck\n\nTriages the GCP HTTP Load Balancer resources that are created when an ingress object is detected and created by the ingress-gce controller. \n\n## Tasks\n- `Search For GCE Ingress Warnings in GKE` - Executes CLI commands to find warning events related to GCE Ingress and services objects. Parses the CLI output to identify and report issues.\n\n- `Identify Unhealthy GCE HTTP Ingress Backends` - Uses CLI commands to check the backend annotations on the Ingress object for health issues.
Parses the CLI output to identify and report unhealthy backends.\n\n- `Validate GCP HTTP Load Balancer Configurations` - Executes bash scripts to validate GCP HTTP Load Balancer components extracted from Ingress annotations. Parses the output for issues and recommendations.\n\n- `Fetch Network Error Logs from GCP Operations Manager for Ingress Backends` - Executes CLI commands to fetch network error logs for Ingress backends. Parses the CLI output to identify and report network error issues.\n\n- `Review GCP Operations Logging Dashboard` - Generates URLs to access GCP Operations Logging Dashboard for Load Balancer logs and backend logs.\n\n## Configuration\n\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `kubeconfig`: The kubeconfig secret containing access info for the cluster.\n- `KUBERNETES_DISTRIBUTION_BINARY`: Which binary to use for Kubernetes CLI commands. Default value is `kubectl`.\n- `CONTEXT`: The Kubernetes context to operate within.\n- `NAMESPACE`: The name of the namespace to search.\n- `INGRESS`: The name of the ingress object to triage. \n- `GCP_PROJECT_ID`: The ID of the GCP project to query. \n- `gcp_credentials`: The name of the secret that contains GCP service account JSON details with project `Viewer` access. \n\n\n## TODO\n- [ ] Add documentation\n- [ ] Add GitHub integration with source code vs image comparison\n- [ ] Find applicable raise issue use", - "libraries": [ - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-ingress-gce-healthcheck" - }, - { - "slug": "rw-cli-codecollection-gh-actions-health", - "collection_slug": "rw-cli-codecollection", - "name": "gh-actions-health", - "display_name": "gh-actions-health", - "description": "Comprehensive health monitoring for GitHub Actions across specified repositories and organizations.", - "platform": "Unknown", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Calculate Workflow Success Rate Across Specified Repositories", - "Calculate Organization Health Score Across Specified Organizations", - "Calculate Runner Availability Score Across Specified Organizations", - "Calculate Security Workflow Score Across Specified Repositories", - "Calculate Performance Score Across Specified Repositories", - "Calculate API Rate Limit Health Score", - "Generate Overall GitHub Actions Health Score", - "Check Recent Workflow Failures Across Specified Repositories", - "Check Long Running Workflows Across Specified Repositories", - "Check Repository Health Summary for Specified Repositories", - "Check GitHub Actions Runner Health Across Specified Organizations", - "Check Security Workflow Status Across Specified Repositories", - "Check GitHub Actions Billing and Usage Across Specified Organizations", - "Check GitHub API Rate Limits" - ], - "capabilities": [ - "Calculate Workflow Success Rate Across Specified Repositories: Calculates the success rate of workflows across the specified repositories over the specified period", - "Calculate Organization Health Score Across Specified Organizations: Calculates overall organization health score across all specified organizations", - "Calculate Runner Availability Score Across Specified Organizations: Calculates the availability score of GitHub Actions runners across the specified organizations", - "Calculate Security Workflow Score Across Specified
Repositories: Calculates security workflow health score including vulnerability scanning across the specified repositories", - "Calculate Performance Score Across Specified Repositories: Calculates workflow performance score based on execution times across the specified repositories", - "Calculate API Rate Limit Health Score: Calculates GitHub API rate limit utilization health score", - "Generate Overall GitHub Actions Health Score: Generates a composite health score from all measured indicators", - "Check Recent Workflow Failures Across Specified Repositories: Analyzes recent workflow failures across the specified repositories and identifies common failure patterns", - "Check Long Running Workflows Across Specified Repositories: Identifies workflows that have been running longer than expected thresholds across the specified repositories", - "Check Repository Health Summary for Specified Repositories: Provides a comprehensive health summary across the specified repositories", - "Check GitHub Actions Runner Health Across Specified Organizations: Monitors the health and availability of GitHub Actions runners across the specified organizations", - "Check Security Workflow Status Across Specified Repositories: Monitors security-related workflows and dependency scanning results across the specified repositories", - "Check GitHub Actions Billing and Usage Across Specified Organizations: Monitors GitHub Actions usage patterns and potential billing concerns across the specified organizations", - "Check GitHub API Rate Limits: Monitors GitHub API rate limit usage to prevent throttling during health checks" - ], - "readme": "# GitHub Actions Health Monitoring\n\nComprehensive health monitoring for GitHub Actions across specified repositories and organizations.\n\n## Overview\n\nThis codebundle provides health monitoring capabilities for GitHub Actions workflows, focusing on:\n- Multi-repository analysis across specified repositories or entire organizations\n- Multi-organization support for enterprise-wide monitoring\n- Workflow failure detection and pattern analysis\n- Performance monitoring for long-running workflows\n- Security workflow status and vulnerability tracking\n- GitHub Actions runner health and utilization across organizations\n- Billing and usage monitoring aggregated across organizations\n- GitHub API rate limit monitoring\n- Service Level Indicator (SLI) calculations for health scoring\n\n## Use Cases\n\n### Multi-Repository Monitoring\nMonitor GitHub Actions health across multiple repositories simultaneously, whether specified individually or across entire organizations.\n\n### Multi-Organization Support\n- Monitor multiple GitHub organizations simultaneously\n- Aggregate health metrics across your entire enterprise\n- Compare organization performance and resource utilization\n- Centralized monitoring for organizations with distributed teams\n\n### Organization-Wide Health Assessment\nGet comprehensive health insights across all repositories in one or more GitHub organizations with configurable limits on the number of repositories analyzed.\n\n### Cross-Organization Repository Selection\n- Specify individual repositories from different organizations\n- Mix specific repositories with organization-wide analysis\n- Flexible scoping for complex enterprise environments\n\n### Failure Pattern Detection\nIdentify recurring workflow failures across repositories and organizations to detect common patterns that might indicate infrastructure or configuration issues.\n\n### Performance 
Monitoring\nTrack workflow performance across repositories and organizations, identifying workflows that consistently run longer than expected thresholds.\n\n### Security Posture Assessment\nMonitor security-related workflows (CodeQL, Dependabot, etc.) and track vulnerability status across your entire repository portfolio, spanning multiple organizations.\n\n### Resource Utilization Tracking\nMonitor GitHub Actions usage, billing metrics, and runner capacity across multiple organizations to optimize resource allocation and costs.\n\n## Tasks\n\n### Workflow Health Tasks\n1. **Check Recent Workflow Failures Across Specified Repositories**\n - Analyzes recent workflow failures across multiple repositories and organizations\n - Identifies failure patterns and provides actionable insights\n - Configurable lookback period\n\n2. **Check Long Running Workflows Across Specified Repositories**\n - Identifies workflows exceeding duration thresholds across repositories and organizations\n - Tracks both in-progress and recently completed long-duration workflows\n - Helps optimize workflow performance\n\n3. **Check Repository Health Summary for Specified Repositories**\n - Provides comprehensive health scoring across repositories and organizations\n - Calculates overall health metrics and failure rates\n - Identifies repositories requiring attention\n\n### Infrastructure Health Tasks\n4. **Check GitHub Actions Runner Health Across Specified Organizations**\n - Monitors self-hosted runner availability and status across multiple organizations\n - Tracks runner utilization and capacity aggregated across organizations\n - Alerts on offline or overutilized runners with organization context\n\n5. **Check Security Workflow Status Across Specified Repositories**\n - Monitors security-related workflow execution across repositories and organizations\n - Tracks Dependabot alerts and vulnerability status\n - Identifies critical security issues requiring immediate attention\n\n6. **Check GitHub Actions Billing and Usage Across Specified Organizations**\n - Monitors GitHub Actions usage against included minutes across multiple organizations\n - Aggregates billing metrics and usage patterns\n - Provides early warnings for high usage with organization-level breakdown\n\n7. 
**Check GitHub API Rate Limits**\n - Monitors API rate limit consumption to prevent throttling during health checks\n - Optimizes API usage patterns across multi-organization monitoring\n\n## Service Level Indicators (SLI)\n\nThe SLI calculation provides weighted health scoring across multiple dimensions:\n\n- **Workflow Success Rate** (25%): Overall success rate of workflows across specified repositories and organizations\n- **Organization Health** (20%): Health score for organization-wide metrics\n- **Security Posture** (20%): Security workflow success and vulnerability status\n- **Performance** (15%): Workflow duration and performance metrics\n- **Runner Availability** (15%): GitHub Actions runner health and capacity across organizations\n- **Rate Limit Management** (5%): API usage efficiency\n\n## Configuration\n\n### Required Configuration\n\n#### Secrets\n", - "libraries": [ - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem", - "RW.NextSteps", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/gh-actions-health" - }, - { - "slug": "rw-cli-codecollection-k8s-ingress-healthcheck", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-ingress-healthcheck", - "display_name": "k8s-ingress-healthcheck", - "description": "The `k8s-ingress-healthcheck` codebundle checks the health of ingress objects within a Namespace.", - "platform": "Kubernetes", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Fetch Ingress Object Health in Namespace `${NAMESPACE}`", - "Check for Ingress and Service Conflicts in Namespace `${NAMESPACE}`" - ], - "capabilities": [ - "Fetch Ingress Object Health in Namespace `${NAMESPACE}`: Fetches all ingress objects in the namespace and outputs the name, health status, services, and endpoints.", - "Check for Ingress and Service Conflicts in Namespace `${NAMESPACE}`: Look for conflicting configuration between service and ingress objects." - ], - "readme": "# Kubernetes Ingress Healthcheck\nThe `k8s-ingress-healthcheck` codebundle checks the health of ingress objects within a Namespace. \n\n## Tasks\n`Fetch Ingress Object Health in Namespace` - This command will list every ingress object and determine whether it has a service and an endpoint. If so, it is considered healthy. It will print out the health result along with the error or the details regarding the service name and pod endpoint names and IPs. \n\nExample configuration: \n```\nKUBERNETES_DISTRIBUTION_BINARY=kubectl\nCONTEXT=sandbox-cluster-1\nNAMESPACE=my-namespace\n```\n\n## Requirements\n- A kubeconfig with `get` permissions on the objects/namespaces that are involved in the query.\n\n\n## TODO\n- Add additional rbac and kubectl resources and use cases\n- Add additional troubleshooting tasks as use cases evolve", - "libraries": [ - "Collections", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-ingress-healthcheck" - }, - { - "slug": "rw-cli-codecollection-aws-eks-health", - "collection_slug": "rw-cli-codecollection", - "name": "aws-eks-health", - "display_name": "aws-eks-health", - "description": "This runbook outlines the necessary steps to manage and troubleshoot an EKS Fargate Cluster using the AWS CLI. It provides instructions on how to check the health status of the EKS Fargate Cluster and examine the AWS VPC CNI plugin for potential networking issues.
Additionally, it includes guidance on how to debug the EKS Fargate Pod Execution Role. This runbook is an essential guide for maintaining the smooth operation of EKS Fargate Clusters.", - "platform": "AWS", - "author": "jon-funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check Amazon EKS Cluster Health Status in AWS Region `${AWS_REGION}`", - "Check EKS Fargate Cluster Health Status in AWS Region `${AWS_REGION}`", - "Check Amazon EKS Cluster Health Status in AWS Region `${AWS_REGION}`", - "Monitor EKS Cluster Health in AWS Region `${AWS_REGION}`" - ], - "capabilities": [ - "Check Amazon EKS Cluster Health Status in AWS Region `${AWS_REGION}`: This script checks the health status of an Amazon EKS cluster.", - "Check EKS Fargate Cluster Health Status in AWS Region `${AWS_REGION}`: This script checks the health status of an Amazon EKS Fargate cluster.", - "Check Amazon EKS Cluster Health Status in AWS Region `${AWS_REGION}`: This script checks the health status of an Amazon EKS cluster.", - "Monitor EKS Cluster Health in AWS Region `${AWS_REGION}`: This bash script is designed to monitor the health and status of an Amazon EKS cluster." - ], - "readme": "# eks-fargate-cluster-health-issue CodeBundle\n### Tags:`AWS`, `EKS Fargate`, `Cluster Health`\n## CodeBundle Objective:\nThis runbook outlines the necessary steps to manage and troubleshoot an EKS Fargate Cluster using the AWS CLI. It provides instructions on how to check the health status of the EKS Fargate Cluster and examine the AWS VPC CNI plugin for potential networking issues. Additionally, it includes guidance on how to debug the EKS Fargate Pod Execution Role. This runbook is an essential guide for maintaining the smooth operation of EKS Fargate Clusters.\n\n## CodeBundle Inputs:\n\nexport AWS_REGION=\"PLACEHOLDER\"\nexport AWS_ACCESS_KEY_ID=\"PLACEHOLDER\"\nexport AWS_SECRET_ACCESS_KEY=\"PLACEHOLDER\"\n\n## CodeBundle Tasks:\n### `Check EKS Fargate Cluster Health Status using aws CLI`\n#### Tags:`EKS`, `Fargate`, `Cluster Health`, `AWS`, `Kubernetes`, `Pods`, `Nodes`, `Shell Script`, \n### Task Documentation:\nThis script checks the health status of an Amazon EKS Fargate cluster. It describes the Fargate profile, checks the status of all nodes and pods, and provides detailed information about each pod. The script requires the user to specify the cluster name, Fargate profile name, and AWS region.\n#### Usage Example:\n`./check_eks_fargate_cluster_health_status.sh`\n\n### `Examine AWS VPC CNI plugin for EKS Fargate Networking Issues`\n#### Tags:`AWS`, `EKS`, `Fargate`, `Bash Script`, `Node Health`, `Pod Status`, `CNI Version`, `Kubernetes`, \n### Task Documentation:\nThis bash script is designed to monitor the health and status of an Amazon EKS cluster. It fetches information about the Fargate profile, checks the health status of EKS nodes, verifies the status of all pods in all namespaces, and checks the CNI version. 
The script is intended to be run in an environment where AWS CLI and kubectl are installed and configured.\n#### Usage Example:\n`./examine_aws_vpc_cni_eks_fargate_networking_issues.sh`\n", - "libraries": [ - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem", - "String", - "Process" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/aws-eks-health" - }, - { - "slug": "rw-cli-codecollection-k8s-cluster-node-health", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-cluster-node-health", - "display_name": "k8s-cluster-node-health", - "description": "The Service Level Indicator will generate a score for the health of the nodes in the cluster. This is an aggregate score from the tasks, which currently include:", - "platform": "Kubernetes", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check for Node Restarts in Cluster `${CONTEXT}`", - "Generate Namespace Score in Kubernetes Cluster `${CONTEXT}`", - "Check for Node Restarts in Cluster `${CONTEXT}` within Interval `${RW_LOOKBACK_WINDOW}`" - ], - "capabilities": [ - "Check for Node Restarts in Cluster `${CONTEXT}`: Count preempt / spot node restarts within the configured time interval.", - "Check for Node Restarts in Cluster `${CONTEXT}` within Interval `${RW_LOOKBACK_WINDOW}`: Identify nodes that are starting and stopping within the time interval." - ], - "readme": "# K8s Cluster Node Health\n\n## SLI\nThe Service Level Indicator will generate a score for the health of the nodes in the cluster. This is an aggregate score from the tasks, which currently include: \n- Check for Node Restarts in Cluster\n\n## TaskSet \n### Check for Node Restarts in Cluster \nCreate a report of all node starts/stops/preemptions/removals in the cluster. This will generate an information issue since node starts/stops may be routine, but users may want to be aware that they are happening if their pods are temporarily affected. \n\n## Requirements\n- Service account with permissions to: \n - get nodes\n - list nodes\n", - "libraries": [ - "RW.K8sLog", - "Collections", - "RW.CLI", - "RW.Core", - "RW.platform", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-cluster-node-health" - }, - { - "slug": "rw-cli-codecollection-k8s-stacktrace-health", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-stacktrace-health", - "display_name": "k8s-stacktrace-health", - "description": "This codebundle provides comprehensive stacktrace/traceback detection and analysis for Kubernetes workloads (deployments, statefulsets, and daemonsets).
It monitors application logs to identify Python, Java, and other language stacktraces that indicate runtime errors or exceptions.", - "platform": "Kubernetes", - "author": "akshayrw25", - "support_tags": [ - "rw" - ], - "tasks": [ - "Get Stacktrace Health Score for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}`", - "Generate Stacktrace Health Score for `${WORKLOAD_NAME}`", - "Analyze Workload Stacktraces for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in Namespace `${NAMESPACE}`" - ], - "capabilities": [ - "Get Stacktrace Health Score for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}`: Checks for recent stacktraces/tracebacks related to the workload within a short time window, with filtering to reduce noise.", - "Generate Stacktrace Health Score for `${WORKLOAD_NAME}`: Generates the final stacktrace health score and report details", - "Analyze Workload Stacktraces for ${WORKLOAD_TYPE} `${WORKLOAD_NAME}` in Namespace `${NAMESPACE}`: Collects and analyzes stacktraces/tracebacks from all pods in the workload for troubleshooting application issues." - ], - "readme": "# Kubernetes Workload Stacktrace Health\n\nThis codebundle provides comprehensive stacktrace/traceback detection and analysis for Kubernetes workloads (deployments, statefulsets, and daemonsets). It monitors application logs to identify Python, Java, and other language stacktraces that indicate runtime errors or exceptions.\n\n## Use Cases\n\n### Troubleshooting Tasks\n- **Stacktrace Detection**: Automatically identifies and extracts stacktraces from workload logs across all pods and containers\n- **Multi-Language Support**: Detects Python tracebacks, Java stack traces, and other common error patterns\n- **Container Filtering**: Configurable filtering to ignore sidecar containers (linkerd, istio, etc.) that aren't relevant for application analysis\n- **Comprehensive Coverage**: Analyzes logs from all pods and containers in a workload for complete visibility\n- **Multi-Workload Support**: Works with deployments, statefulsets, and daemonsets\n\n### SLI Monitoring\n- **Health Score**: Provides a binary health score (0 = stacktraces detected, 1 = no stacktraces found)\n- **Fast Detection**: Optimized for frequent monitoring with configurable log limits and time windows\n- **Early Warning**: Detects application errors through stacktrace analysis before they impact users\n- **Scaled Workload Handling**: Properly handles workloads scaled to 0 replicas\n\n## Configuration\n\n### Required Configuration\n- `KUBERNETES_DISTRIBUTION_BINARY`: kubectl or oc\n- `CONTEXT`: Kubernetes context to use\n- `NAMESPACE`: Target namespace\n- `WORKLOAD_NAME`: Name of the workload to monitor\n- `WORKLOAD_TYPE`: Type of workload (deployment, statefulset, or daemonset)\n\n### Optional Configuration\n- `LOG_LINES`: Number of log lines to fetch (default: 100)\n- `LOG_AGE`: Time window for log analysis (default: 3h for runbook, 10m for SLI)\n- `LOG_SIZE`: Maximum log size in bytes (default: 2MB for runbook, 256KB for SLI)\n- `IGNORE_CONTAINERS_MATCHING`: Comma-separated list of container name patterns to ignore (default: \"linkerd\")\n- `MAX_LOG_LINES`: Maximum log lines for SLI checks (default: 100)\n- `MAX_LOG_BYTES`: Maximum log bytes for SLI checks (default: 256000)\n\n## Tasks\n\n### Analyze Workload Stacktraces\n**Type**: Troubleshooting Task \n**Objective**: Comprehensive stacktrace detection and analysis across all workload pods\n\nThis task:\n- Fetches logs from all pods and containers in the workload\n- Extracts Python tracebacks, Java stack traces, and other error patterns\n- 
Filters out irrelevant containers based on name patterns\n- Creates detailed issues for any stacktraces found\n- Provides actionable next steps for troubleshooting\n- Supports deployments, statefulsets, and daemonsets\n\n### Get Stacktrace Health Score\n**Type**: SLI Task \n**Objective**: Fast stacktrace detection for monitoring and alerting\n\nThis task:\n- Performs rapid stacktrace detection optimized for frequent monitoring\n- Returns a binary health score (0 or 1)\n- Uses optimized log limits to prevent API overload\n- Supports 5-minute interval monitoring\n- Handles scaled-down workloads appropriately\n\n## Supported Languages\n\n### Python\n- Detects standard Python tracebacks with `Traceback (most recent call last):`\n- Handles JSON-formatted logs with embedded stacktraces\n- Extracts timestamped traceback information\n- Supports various Python logging formats\n\n### Java\n- Detects Java stack traces with `at package.Class.method()` patterns\n- Handles exception messages and stack trace lines\n- Supports various Java logging frameworks\n- Extracts complete stack trace information\n\n### Extensible Architecture\nThe stacktrace detection library is designed to be easily extended for additional languages and error patterns.\n\n## Performance Considerations\n\n### Runbook Tasks\n- Designed for comprehensive analysis during troubleshooting\n- Configurable limits to balance thoroughness with performance\n- Handles large workloads with multiple pods and containers\n- Supports all Kubernetes workload types\n\n### SLI Tasks\n- Optimized for frequent monitoring (5-minute intervals)\n- Lower resource usage with configurable limits\n- Fast exit on first stacktrace detection\n- Prevents API overload with byte and line limits\n\n## Best Practices\n\n1. **Container Filtering**: Configure `IGNORE_CONTAINERS_MATCHING` to exclude sidecar containers\n2. **Log Limits**: Adjust `LOG_LINES` and `LOG_SIZE` based on your application's logging patterns\n3. **Time Windows**: Use shorter time windows for SLI monitoring, longer for troubleshooting\n4. **Resource Management**: Monitor API usage and adjust limits if needed\n5. **Alert Tuning**: Use SLI scores for alerting on application errors\n6. 
**Workload Types**: Ensure proper `WORKLOAD_TYPE` configuration for your specific workload\n\n## Troubleshooting\n\n### Common Issues\n- **No stacktraces detected**: Check log patterns and time windows\n- **API timeouts**: Reduce log limits or time windows\n- **Missing containers**: Verify workload labels and container filtering\n- **Performance issues**: Optimize log limits for your envir", - "libraries": [ - "RW.K8sLog", - "RW.LogAnalysis.ExtractTraceback", - "Collections", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem", - "RW.NextSteps", - "RW.K8sHelper", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-stacktrace-health" - }, - { - "slug": "rw-cli-codecollection-k8s-prometheus-healthcheck", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-prometheus-healthcheck", - "display_name": "k8s-prometheus-healthcheck", - "description": "A set of tasks that troubleshoot the Kubernetes Prometheus Operator for issues.", - "platform": "Kubernetes", - "author": "jon-funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check Prometheus Service Monitors in namespace `${NAMESPACE}`", - "Check For Successful Rule Setup in Kubernetes Namespace `${NAMESPACE}`", - "Verify Prometheus RBAC Can Access ServiceMonitors in Namespace `${PROM_NAMESPACE}`", - "Inspect Prometheus Operator Logs for Scraping Errors in Namespace `${NAMESPACE}`", - "Check Prometheus API Healthy in Namespace `${PROM_NAMESPACE}`" - ], - "capabilities": [ - "Check Prometheus Service Monitors in namespace `${NAMESPACE}`: Checks that the selector mappings of service monitors are valid in the namespace", - "Check For Successful Rule Setup in Kubernetes Namespace `${NAMESPACE}`: Inspects operator instance logs for failed rules setup", - "Verify Prometheus RBAC Can Access ServiceMonitors in Namespace `${PROM_NAMESPACE}`: Fetch the operator RBAC and verify it grants access to ServiceMonitors.", - "Inspect Prometheus Operator Logs for Scraping Errors in Namespace `${NAMESPACE}`: Inspect the Prometheus Operator logs for scraping errors and raise issues if any are found", - "Check Prometheus API Healthy in Namespace `${PROM_NAMESPACE}`: Ping the Prometheus health API endpoint for a 200 response code." - ], - "readme": "# Kubernetes Prometheus Operator Triage\n\nA set of tasks that troubleshoot the Kubernetes Prometheus Operator for issues.\n\n## Tasks\n\n`Check Prometheus Service Monitors`\n`Check For Successful Rule Setup`\n`Verify Prometheus RBAC Can Access ServiceMonitors`\n`Identify Endpoint Scraping Errors`\n`Check Prometheus API Healthy`\n\n## Configuration\n\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `kubeconfig`: The kubeconfig secret containing access info for the cluster.\n- `kubectl`: The location service used to interpret shell commands. Default value is `kubectl-service.shared`.\n- `KUBERNETES_DISTRIBUTION_BINARY`: Which binary to use for Kubernetes CLI commands. 
Default value is `kubectl`.\n- `CONTEXT`: The Kubernetes context to operate within.\n- `NAMESPACE`: The name of the namespace to check ServiceMonitors in.\n- `PROM_NAMESPACE`: The name of the namespace that the Prometheus Operator typically resides in.\n\n## Notes\n\nPlease note that these checks require Kubernetes RBAC exec, get clusterrole, and get/list ServiceMonitors permissions for the service account used.\n\n## TODO\n- [ ] Add documentation\n- [ ] Refine raised issues", - "libraries": [ - "DateTime", - "Collections", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-prometheus-healthcheck" - }, - { - "slug": "rw-cli-codecollection-azure-vmss-triage", - "collection_slug": "rw-cli-codecollection", - "name": "azure-vmss-triage", - "display_name": "azure-vmss-triage", - "description": "This codebundle runs a suite of metrics checks for a VM Scale Set in Azure. It fetches activities and the current configuration, which is added to a report for review at that point in time.", - "platform": "Azure", - "author": "jon-funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check Scale Set `${VMSCALESET}` Key Metrics In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Scale Set `${VMSCALESET}` Key Metrics In Resource Group `${AZ_RESOURCE_GROUP}`", - "Fetch VM Scale Set `${VMSCALESET}` Config In Resource Group `${AZ_RESOURCE_GROUP}`", - "Fetch Activities for VM Scale Set `${VMSCALESET}` In Resource Group `${AZ_RESOURCE_GROUP}`" - ], - "capabilities": [ - "Check Scale Set `${VMSCALESET}` Key Metrics In Resource Group `${AZ_RESOURCE_GROUP}`: Checks key metrics of VM Scale Set for issues.", - "Check Scale Set `${VMSCALESET}` Key Metrics In Resource Group `${AZ_RESOURCE_GROUP}`: Checks key metrics of VM Scale Set for issues.", - "Fetch VM Scale Set `${VMSCALESET}` Config In Resource Group `${AZ_RESOURCE_GROUP}`: Fetch the config of the scale set in Azure", - "Fetch Activities for VM Scale Set `${VMSCALESET}` In Resource Group `${AZ_RESOURCE_GROUP}`: Gets the events for the scale set and checks for errors" - ], - "readme": "# Azure Virtual Machine Scale Set Triage\nThis codebundle runs a suite of metrics checks for a VM Scale Set in Azure. It fetches activities and the current configuration, which is added to a report for review at that point in time.\n\n## Configuration\n\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. 
The following variables should be set:\n\n- `AZ_USERNAME`: Service principal's client ID\n- `AZ_SECRET_VALUE`: The credential secret value from the app registration\n- `AZ_TENANT`: The Azure tenancy ID\n- `AZ_SUBSCRIPTION`: The Azure subscription ID\n- `AZ_RESOURCE_GROUP`: The Azure resource group that these resources reside in\n- `VMSCALESET`: The name of the VM Scale Set in the resource group to target with checks\n\n## Notes\n\nThis codebundle assumes the service principal authentication flow.\n\n## TODO\n- [ ] remote exec functionality\n- [ ] look for notable activities in list\n- [ ] config best practices check\n- [ ] Add documentation", - "libraries": [ - "DateTime", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/azure-vmss-triage" - }, - { - "slug": "rw-cli-codecollection-k8s-certmanager-healthcheck", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-certmanager-healthcheck", - "display_name": "k8s-certmanager-healthcheck", - "description": "This taskset looks into issues related to CertManager Certificates.", - "platform": "Kubernetes", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Count Unready and Expired Certificates in Namespace `${NAMESPACE}`", - "Get Namespace Certificate Summary for Namespace `${NAMESPACE}`", - "Find Unhealthy Certificates in Namespace `${NAMESPACE}`", - "Find Failed Certificate Requests and Identify Issues for Namespace `${NAMESPACE}`" - ], - "capabilities": [ - "Count Unready and Expired Certificates in Namespace `${NAMESPACE}`: Adds together the count of unready and expired certificates. A healthy SLI value is 0.", - "Get Namespace Certificate Summary for Namespace `${NAMESPACE}`: Gets a list of cert-manager certificates that are due for renewal and summarizes their information for review.", - "Find Unhealthy Certificates in Namespace `${NAMESPACE}`: Gets a list of cert-manager certificates that are not available.", - "Find Failed Certificate Requests and Identify Issues for Namespace `${NAMESPACE}`: Gets a list of failed cert-manager certificates and summarizes their issues." - ], - "readme": "# Kubernetes CertManager Triage\n\nThis taskset looks into issues related to CertManager Certificates.\n\n## Tasks\n- `Get Namespace Certificate Summary`: This task retrieves a list of certmanager certificates and summarizes their information for review.\n- `Find Failed Certificate Requests and Identify Issues`: This task retrieves a list of failed certmanager certificates and summarizes their issues.\n\n## Configuration\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `kubeconfig`: The kubeconfig secret containing access info for the cluster.\n- `kubectl`: The location service used to interpret shell commands. Default value is `kubectl-service.shared`.\n- `KUBERNETES_DISTRIBUTION_BINARY`: Which binary to use for Kubernetes CLI commands. Default value is `kubectl`.\n- `CONTEXT`: The Kubernetes context to operate within.\n- `NAMESPACE`: The name of the namespace to search. 
Leave it blank to search in all namespaces.\n\n## Requirements\n- A kubeconfig with appropriate RBAC permissions to perform the desired command.\n\n## TODO\n- [ ] Add additional documentation.\n- [ ] Add cert-manager operator namespace as config field and genrules", - "libraries": [ - "DateTime", - "Collections", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-certmanager-healthcheck" - }, - { - "slug": "rw-cli-codecollection-k8s-otelcollector", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-otelcollector", - "display_name": "k8s-otelcollector", - "description": "Checks the OTEL collector's logs and metrics to determine its health, such as large queues or errors.", - "platform": "Kubernetes", - "author": "jon-funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Query Collector Queued Spans in Namespace `${NAMESPACE}`", - "Check OpenTelemetry Collector Logs For Errors In Namespace `${NAMESPACE}`", - "Query OpenTelemetry Logs For Dropped Spans In Namespace `${NAMESPACE}`" - ], - "capabilities": [ - "Query Collector Queued Spans in Namespace `${NAMESPACE}`: Query the collector metrics endpoint and inspect queue size", - "Check OpenTelemetry Collector Logs For Errors In Namespace `${NAMESPACE}`: Fetch logs and check for errors", - "Query OpenTelemetry Logs For Dropped Spans In Namespace `${NAMESPACE}`: Query the collector logs for dropped spans from errors" - ], - "readme": "# Kubernetes OpenTelemetry Health Check\nChecks the OTEL collector's logs and metrics to determine its health, such as large queues or errors.\n\nNote: if you're having trouble connecting to your otel collector, change the\n deployment name to another workload in the namespace\n\n## Tasks\n`Scan OpenTelemetry Logs For Dropped Spans In Namespace `\n\n`Check OpenTelemetry Collector Logs For Errors In Namespace`\n\n`Query Collector Queued Spans in Namespace`\n\n## Configuration\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `kubeconfig`: The kubeconfig secret containing access info for the cluster.\n- `KUBERNETES_DISTRIBUTION_BINARY`: Which binary to use for Kubernetes CLI commands. Default value is `kubectl`.\n- `CONTEXT`: The Kubernetes context to operate within.\n- `NAMESPACE`: The name of the namespace to search. 
Leave it blank to search in all namespaces.\n- `WORKLOAD_SERVICE`: Service name to curl against for metrics.\n- `WORKLOAD_NAME`: Workload used for exec requests.\n- `METRICS_PORT`: The port to use to request metrics from.\n\n\n## Requirements\n- A kubeconfig with appropriate RBAC permissions to perform the desired command.\n\n## TODO\n- [ ] Consider additional tasks\n\n", - "libraries": [ - "DateTime", - "RW.K8sLog", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-otelcollector" - }, - { - "slug": "rw-cli-codecollection-k8s-loki-healthcheck", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-loki-healthcheck", - "display_name": "k8s-loki-healthcheck", - "description": "A set of tasks to query the state and health of a Loki deployment in Kubernetes.", - "platform": "Kubernetes", - "author": "jon-funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check Loki Ring API for Unhealthy Shards in Kubernetes Cluster `${NAMESPACE}`", - "Check Loki API Ready in Kubernetes Cluster `${NAMESPACE}`" - ], - "capabilities": [ - "Check Loki Ring API for Unhealthy Shards in Kubernetes Cluster `${NAMESPACE}`: Request and inspect the state of the Loki hash rings for non-active (potentially unhealthy) shards.", - "Check Loki API Ready in Kubernetes Cluster `${NAMESPACE}`: Pings the internal Loki API to check that it's ready." - ], - "readme": "# Kubernetes Loki Healthcheck\n\nA set of tasks to query the state and health of a Loki deployment in Kubernetes.\n\n## Tasks\n`Check Loki Ring API`\n`Check Loki API Ready`\n\n## Configuration\n\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `kubeconfig`: The kubeconfig secret containing access info for the cluster.\n- `kubectl`: The location service used to interpret shell commands. Default value is `kubectl-service.shared`.\n- `KUBERNETES_DISTRIBUTION_BINARY`: Which binary to use for Kubernetes CLI commands. Default value is `kubectl`.\n- `CONTEXT`: The Kubernetes context to operate within.\n- `NAMESPACE`: The name of the namespace to search.\n\n## Notes\n\nPlease note that these checks require Kubernetes RBAC exec permissions for the service account used.\n\n## TODO\n- [ ] Add documentation\n- [ ] Add more complex hash ring checks\n- [ ] Refine raised issues", - "libraries": [ - "DateTime", - "Collections", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-loki-healthcheck" - }, - { - "slug": "rw-cli-codecollection-azure-aks-triage", - "collection_slug": "rw-cli-codecollection", - "name": "azure-aks-triage", - "display_name": "azure-aks-triage", - "description": "This CodeBundle checks for AKS Cluster Health based on how Azure is reporting resource health, network configuration recommendations, activities that have occurred, and provisioning status of resources. 
It also includes **cost optimization analysis** that identifies underutilized node pools with potential savings opportunities using 30-day Azure Monitor utilization trends.", - "platform": "Azure", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check for Resource Health Issues Affecting AKS Cluster `${AKS_CLUSTER}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Fetch Activities for AKS Cluster `${AKS_CLUSTER}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Configuration Health of AKS Cluster `${AKS_CLUSTER}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Generate AKS Cluster Health Score", - "Check for Resource Health Issues Affecting AKS Cluster `${AKS_CLUSTER}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Configuration Health of AKS Cluster `${AKS_CLUSTER}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Network Configuration of AKS Cluster `${AKS_CLUSTER}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Fetch Activities for AKS Cluster `${AKS_CLUSTER}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Analyze AKS Cluster Cost Optimization Opportunities for `${AKS_CLUSTER}` In Resource Group `${AZ_RESOURCE_GROUP}`" - ], - "capabilities": [ - "Check for Resource Health Issues Affecting AKS Cluster `${AKS_CLUSTER}` In Resource Group `${AZ_RESOURCE_GROUP}`: Fetch a list of issues that might affect the AKS cluster as reported from Azure.", - "Fetch Activities for AKS Cluster `${AKS_CLUSTER}` In Resource Group `${AZ_RESOURCE_GROUP}`: Gets the activities for the AKS cluster and checks for critical or error events within the configured time period.", - "Check Configuration Health of AKS Cluster `${AKS_CLUSTER}` In Resource Group `${AZ_RESOURCE_GROUP}`: Fetch the config of the AKS cluster in Azure", - "Check for Resource Health Issues Affecting AKS Cluster `${AKS_CLUSTER}` In Resource Group `${AZ_RESOURCE_GROUP}`: Fetch a list of issues that might affect the AKS cluster", - "Check Configuration Health of AKS Cluster `${AKS_CLUSTER}` In Resource Group `${AZ_RESOURCE_GROUP}`: Fetch the config of the AKS cluster in Azure", - "Check Network Configuration of AKS Cluster `${AKS_CLUSTER}` In Resource Group `${AZ_RESOURCE_GROUP}`: Fetch the network configuration, generating resource URLs and basic recommendations", - "Fetch Activities for AKS Cluster `${AKS_CLUSTER}` In Resource Group `${AZ_RESOURCE_GROUP}`: Gets the activities for the AKS cluster and checks for errors", - "Analyze AKS Cluster Cost Optimization Opportunities for `${AKS_CLUSTER}` In Resource Group `${AZ_RESOURCE_GROUP}`: Analyzes 30-day utilization trends using Azure Monitor to identify underutilized node pools with cost savings opportunities. Provides Azure VM pricing-based estimates for potential monthly and annual savings with severity bands: Sev4 <$2k/month, Sev3 $2k-$10k/month, Sev2 >$10k/month." - ], - "readme": "# Azure AKS Cluster Triage\nThis CodeBundle checks for AKS Cluster Health based on how Azure is reporting resource health, network configuration recommendations, activities that have occurred, and provisioning status of resources. It also includes **cost optimization analysis** that identifies underutilized node pools with potential savings opportunities using 30-day Azure Monitor utilization trends. \n\n## Configuration\n\nThe SLI & TaskSet requires initialization to import necessary secrets, services, and user variables. 
The following variables should be set:\n- `AZ_RESOURCE_GROUP`: The Azure resource group that these resources reside in\n- `AKS_CLUSTER`: The name of the AKS Cluster in the resource group to target with checks\n- `RW_LOOKBACK_WINDOW`: The time window, in minutes, to look back for activities and events which may indicate issues. \n\n## Features\n\n### Health Monitoring\n- **Resource Health**: Checks Azure-reported health status of AKS cluster resources\n- **Configuration Analysis**: Validates cluster configuration and identifies potential issues\n- **Network Configuration**: Reviews network settings and provides recommendations\n- **Activity Monitoring**: Analyzes recent activities for errors and warnings\n\n### Cost Optimization\n- **30-Day Utilization Analysis**: Uses Azure Monitor to analyze CPU and memory utilization trends\n- **Underutilization Detection**: Identifies node pools with consistently low resource usage\n- **Cost Savings Estimates**: Provides monthly and annual savings estimates using Azure VM pricing\n- **Severity-Based Alerts**: \n - **Severity 4**: <$2,000/month potential savings\n - **Severity 3**: $2,000-$10,000/month potential savings \n - **Severity 2**: >$10,000/month potential savings\n- **Azure VM Pricing Database**: Comprehensive pricing for D-series, E-series, F-series, and B-series VMs\n- **Conservative Recommendations**: Accounts for overhead and safety margins in scaling suggestions\n\n## Notes\n\nThis codebundle assumes the service principal authentication flow, which is handled by the import secret keyword.\n\nThe cost optimization analysis requires Azure Monitor metrics to be available for the AKS cluster's Virtual Machine Scale Sets (VMSS). Ensure that monitoring is enabled for accurate utilization data.\n\n## TODO\n- [ ] Add documentation\n- [x] Implement cost optimization analysis with Azure Monitor integration\n- [x] Add Azure VM pricing database for cost calculations\n- [x] Implement severity-based cost savings alerts", - "libraries": [ - "DateTime", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/azure-aks-triage" - }, - { - "slug": "rw-cli-codecollection-azure-apim-health", - "collection_slug": "rw-cli-codecollection", - "name": "azure-apim-health", - "display_name": "azure-apim-health", - "description": "az login --use-device-code", - "platform": "Azure", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check for Resource Health Issues Affecting APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`", - "Fetch Key Metrics for APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Logs for Errors with APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`", - "Verify APIM Policy Configurations for `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`", - "Check APIM SSL Certificates for `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`", - "Inspect Dependencies and Related Resources for APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`", - "Generate APIM Health Score", - "Gather APIM Resource Information for APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`", - "Check for Resource Health Issues Affecting APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`", - "Fetch Key Metrics for APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Logs for Errors with APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Activity 
Logs for APIM Management Operations `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Application Insights Integration for APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Key Vault Dependencies for APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`", - "Verify APIM Policy Configurations for `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`", - "Check APIM SSL Certificates for `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`", - "Inspect Dependencies and Related Resources for APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`" - ], - "capabilities": [ - "Check for Resource Health Issues Affecting APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`: Fetch Resource Health status and evaluate any reported issues for the APIM instance.", - "Fetch Key Metrics for APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`: Gather APIM metrics from Azure Monitor. Raises issues if thresholds are violated.", - "Check Logs for Errors with APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`: Run apim_diagnostic_logs.sh, parse results, raise issues if logs exceed thresholds.", - "Verify APIM Policy Configurations for `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`: Runs a shell script to enumerate all APIM policies and check for missing tags.", - "Check APIM SSL Certificates for `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`: Verify certificate validity, expiration, thumbprint, and domain matches", - "Inspect Dependencies and Related Resources for APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`: Runs inspect_apim_dependencies.sh to discover & validate Key Vault, backends, DNS, etc.", - "Gather APIM Resource Information for APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`: Collect fundamental details about the Azure subscription, resource group,", - "Check for Resource Health Issues Affecting APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`: Fetch Resource Health status and evaluate any reported issues for the APIM instance.", - "Fetch Key Metrics for APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`: Gather APIM metrics from Azure Monitor. 
Raises issues if thresholds are violated.", - "Check Logs for Errors with APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`: Run apim_diagnostic_logs.sh, parse results, raise issues if logs exceed thresholds.", - "Check Activity Logs for APIM Management Operations `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`: Review Azure Activity Logs for administrative operations on the APIM instance", - "Check Application Insights Integration for APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`: Verify Application Insights integration and analyze telemetry if configured", - "Check Key Vault Dependencies for APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`: Verify Key Vault dependencies and access for certificates and secrets", - "Verify APIM Policy Configurations for `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`: Validates APIM policies for malformed XML, authentication issues, and backend connectivity problems.", - "Check APIM SSL Certificates for `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`: Verify certificate validity, expiration, thumbprint, and domain matches", - "Inspect Dependencies and Related Resources for APIM `${APIM_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`: Runs inspect_apim_dependencies.sh to discover & validate Key Vault, backends, DNS, etc." - ], - "readme": "\naz login --use-device-code\n## Test 1\nexport APP_SERVICE_NAME=azure-apim-health-f1\nexport AZ_RESOURCE_GROUP=azure-apim-health\nexport APIM_NAME=azure-apim-health-apim\nexport AZURE_RESOURCE_SUBSCRIPTION_ID=$ARM_SUBSCRIPTION_ID\nexport AZURE_CONFIG_DIR=/var/tmp/runwhen/azure-apim-health/runbook.robot/.azure\naz login --use-device-code", - "libraries": [ - "DateTime", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/azure-apim-health" - }, - { - "slug": "rw-cli-codecollection-k8s-cluster-resource-health", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-cluster-resource-health", - "display_name": "k8s-cluster-resource-health", - "description": "The Service Level Indicator will count the number of nodes that are over 90% active utilization according to `kubectl top nodes`", - "platform": "Kubernetes", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Identify High Utilization Nodes for Cluster `${CONTEXT}`", - "Identify Pods with Resource Limits Exceeding Node Capacity in Cluster `${CONTEXT}`", - "Generate Cluster Resource Health Score", - "Identify High Utilization Nodes for Cluster `${CONTEXT}`", - "Identify Pods Causing High Node Utilization in Cluster `${CONTEXT}`", - "Identify Pods with Resource Limits Exceeding Node Capacity in Cluster `${CONTEXT}`" - ], - "capabilities": [ - "Identify High Utilization Nodes for Cluster `${CONTEXT}`: Fetch utilization of each node and raise an issue if CPU or Memory is above 90% utilization. Requires jq. Requires get/list of nodes in \"metrics.k8s.io\"", - "Identify Pods with Resource Limits Exceeding Node Capacity in Cluster `${CONTEXT}`: Identify any Pods in the Cluster `${CONTEXT}` with resource limits (CPU or Memory) larger than the Node's allocatable capacity.", - "Identify High Utilization Nodes for Cluster `${CONTEXT}`: Identify nodes with high utilization. Requires jq.", - "Identify Pods Causing High Node Utilization in Cluster `${CONTEXT}`: Identify nodes with high utilization and match to pods that are significantly above their resource request configuration. 
Requires jq.", - "Identify Pods with Resource Limits Exceeding Node Capacity in Cluster `${CONTEXT}`: Identify any Pods in the Cluster `${CONTEXT}` with resource limits (CPU or Memory) larger than the Node's allocatable capacity." - ], - "readme": "# K8s Cluster Resource Health\n\n## SLI\nThe Service Level Indicator will count the number of nodes that are over 90% active utilization according to `kubectl top nodes`\n\n## TaskSet \n### Identify High Utilization Nodes for Cluster\nCreate a report of all nodes that are above 90% utilization. Raise issues for each node that is in this state. \n\n### Identify Pods Causing High Node Utilization in Cluster\nThis task identifies overutilized nodes and creates a report of each pod that is using more than its defined request. Since requests are what a cluster autoscaler uses to make decisions, this list should be used to increase the pod requests so that autoscalers can make better scaling decisions. \n\nRaises an issue for each namespace\n\n\n## Requirements\n- Service account with permissions to: \n - get nodes\n - list nodes\n - get/list nodes in api group \"metrics.k8s.io\"", - "libraries": [ - "DateTime", - "Collections", - "RW.CLI", - "RW.Core", - "RW.platform", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-cluster-resource-health" - }, - { - "slug": "rw-cli-codecollection-gcp-bucket-health", - "collection_slug": "rw-cli-codecollection", - "name": "gcp-bucket-health", - "display_name": "gcp-bucket-health", - "description": "This code checks if any GCP (Google Cloud Platform) buckets are unhealthy, focusing on:", - "platform": "GCP", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Fetch GCP Bucket Storage Utilization for `${PROJECT_IDS}`", - "Check GCP Bucket Security Configuration for `${PROJECT_IDS}`", - "Fetch GCP Bucket Storage Operations Rate for `${PROJECT_IDS}`", - "Generate Bucket Score in Project `${PROJECT_IDS}`", - "Fetch GCP Bucket Storage Utilization for `${PROJECT_IDS}`", - "Add GCP Bucket Storage Configuration for `${PROJECT_IDS}` to Report", - "Check GCP Bucket Security Configuration for `${PROJECT_IDS}`", - "Fetch GCP Bucket Storage Operations Rate for `${PROJECT_IDS}`" - ], - "capabilities": [ - "Fetch GCP Bucket Storage Utilization for `${PROJECT_IDS}`: Fetches all GCP buckets in each project and obtains the total size.", - "Check GCP Bucket Security Configuration for `${PROJECT_IDS}`: Fetches all GCP buckets in each project and checks for public buckets, risky IAM permissions, and encryption configuration.", - "Fetch GCP Bucket Storage Operations Rate for `${PROJECT_IDS}`: Fetches all GCP buckets in each project and obtains the read and write operations rate that incurs cost.", - "Fetch GCP Bucket Storage Utilization for `${PROJECT_IDS}`: Fetches all GCP buckets in each project and obtains the total size.", - "Add GCP Bucket Storage Configuration for `${PROJECT_IDS}` to Report: Fetches all GCP buckets in each project and obtains the total size.", - "Check GCP Bucket Security Configuration for `${PROJECT_IDS}`: Fetches all GCP buckets in each project and checks for public buckets, risky IAM permissions, and encryption configuration.", - "Fetch GCP Bucket Storage Operations Rate for `${PROJECT_IDS}`: Fetches all GCP buckets in each project and obtains the read and write operations rate that incurs cost. Generates issues if the rate is above a specified threshold." 
- ], - "readme": "# GCP Bucket Health\nThis code checks if any GCP (Google Cloud Platform) buckets are unhealthy, focusing on: \n- Utilization (with a user defined threshold for issue/alert generation)\n- Security Configuration (with a user defined threshold on when to generate issues/alerts for publicly accessible buckets)\n\n\n## SLI\nThe SLI: \n- counts the number of buckets that are above the user defined threshold\n- counts the number of publicly accessible buckets above the user defined threshold\n\n## TaskSet \nThe Taskset lists provides the following tasks: \n\n- Fetch GCP Bucket Storage Utilization for `${PROJECT_IDS}`\n- Add GCP Bucket Storage Configuration for `${PROJECT_IDS}` to Report\n- Check GCP Bucket Security Configuration for `${PROJECT_IDS}`\n\n## Requirements\nThe following roles are useful on the GCP service account used with the gcloud utility: \n\n- Viewer\n- Security Reviewer\n\n## TODO \nUpdate required GCP SA permissions. \n\n## Local testing\n- need `gcloud` SDK in the test-bed(docker container)\n- `gcloud auth login`\n- to test in env-tiger: `gcloud config set project runwhen-dev-tiger`\n- you would also need to set application-default credentials if you don't have service-account keys:\n - `gcloud auth application-default login`\n", - "libraries": [ - "DateTime", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/gcp-bucket-health" - }, - { - "slug": "rw-cli-codecollection-k8s-redis-healthcheck", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-redis-healthcheck", - "display_name": "k8s-redis-healthcheck", - "description": "A set of tasks which performs a health check and read/write verification on a Redis workload running in a Kubernetes cluster.", - "platform": "Kubernetes", - "author": "jon-funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Ping `${DEPLOYMENT_NAME}` Redis Workload", - "Verify `${DEPLOYMENT_NAME}` Redis Read Write Operation in Kubernetes" - ], - "capabilities": [ - "Ping `${DEPLOYMENT_NAME}` Redis Workload: Verifies that a PING can be peformed against the redis workload.", - "Verify `${DEPLOYMENT_NAME}` Redis Read Write Operation in Kubernetes: Attempts to perform a write and read operation on the redis workload, checking that a key can be set, incremented, and read from." - ], - "readme": "# Kubernetes Redis Healthcheck\n\nA set of tasks which performs a health check and read/write verification on a Redis workload running in a Kubernetes cluster.\n\n## Tasks\n`Ping Redis Workload`\n`Verify Redis Read Write Operation`\n\n## Configuration\n\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `kubeconfig`: The kubeconfig secret containing access info for the cluster.\n- `kubectl`: The location service used to interpret shell commands. Default value is `kubectl-service.shared`.\n- `KUBERNETES_DISTRIBUTION_BINARY`: Which binary to use for Kubernetes CLI commands. Default value is `kubectl`.\n- `CONTEXT`: The Kubernetes context to operate within.\n- `NAMESPACE`: The name of the namespace to search. 
Leave it blank to search in all namespaces.\n- `DEPLOYMENT_NAME`: The name of the deployment running Redis\n\n## Notes\n\nPlease note that the script requires permissions to execute commands within the Kubernetes cluster, and it may require additional permissions depending on the tasks it performs (for example, fetching storage utilization for PVC mounts requires kubectl exec permissions). Make sure to review the tasks and the required permissions before running the script.\n\n## TODO\n- [ ] Add documentation\n- [ ] Refine raised issues", - "libraries": [ - "DateTime", - "Collections", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-redis-healthcheck" - }, - { - "slug": "rw-cli-codecollection-k8s-tail-logs-dynamic", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-tail-logs-dynamic", - "display_name": "k8s-tail-logs-dynamic", - "description": "This codebundle measures stack traces as they appear in your application logs and can produce reports for a breakdown of stack traces.", - "platform": "Kubernetes", - "author": "jon-funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Tail `${CONTAINER_NAME}` Application Logs For Stacktraces", - "Get `${CONTAINER_NAME}` Application Logs in Namespace `${NAMESPACE}`", - "Tail `${CONTAINER_NAME}` Application Logs For Stacktraces", - "#TODO: check if a service has a selector for this deployment" - ], - "capabilities": [ - "Tail `${CONTAINER_NAME}` Application Logs For Stacktraces: Tails logs and organizes output for measuring counts.", - "Get `${CONTAINER_NAME}` Application Logs in Namespace `${NAMESPACE}`: Collects the last approximately 300 lines of logs from the workload", - "Tail `${CONTAINER_NAME}` Application Logs For Stacktraces: Performs an inspection on container logs for exceptions/stacktraces, parsing them and attempting to find relevant source code information" - ], - "readme": "# Kubernetes Tail Application Logs For Stacktraces\n\nThis codebundle measures stack traces as they appear in your application logs and can produce reports for a breakdown of stack traces.\nIn order for it to appear in your workspace, just add the following annotations to your application deployments:\n`codecollection.runwhen.com/app` and `annotations.kubectl.kubernetes.io/default-container` with the value being the name of the container in the deployment to search for stacktraces.\n\n## Configuration\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `kubeconfig`: The kubeconfig secret containing access info for the cluster.\n- `kubectl`: The location service used to interpret shell commands. Default value is `kubectl-service.shared`.\n- `KUBERNETES_DISTRIBUTION_BINARY`: Which binary to use for Kubernetes CLI commands. Default value is `kubectl`.\n- `CONTEXT`: The Kubernetes context to operate within.\n- `NAMESPACE`: The name of the namespace to search. Leave it blank to search in all namespaces.\n- `LABELS`: The labels used for resource selection, particularly for fetching logs.\n- `LOGS_SINCE`: How far back to scan for logs, e.g. 20m, 3h\n- `EXCLUDE_PATTERN`: an extended grep pattern used to filter out log results, such as exceptions/errors that you don't care about.\n- `CONTAINER_NAME`: the name of the container within the labeled workload to fetch logs from.\n- `MAX_LOG_LINES`: The maximum number of logs to fetch. 
Setting this too high can affect performance.\n- `STACKTRACE_PARSER`: What parser to use on log lines. If left as Dynamic, the first parser to return a result will be used to parse the rest of the logs.\n- `INPUT_MODE`: Determines how logs are fed into the parser. Typically the default should work.\n- `MAX_LOG_BYTES`: Maximum number of bytes to fetch for logs from containers.\n\n## Requirements\n- A kubeconfig with appropriate RBAC permissions to fetch logs.\n\n## Automated Building\nAdditionally, you must have the following manifest changes in order for workspace builder to automatically set up this codebundle for you:\n\n- A deployment with the following annotations and labels:\n - annotations.codecollection.runwhen.com/app: this annotation acts as an opt-in flag\n - annotations.kubectl.kubernetes.io/default-container: the name of the container in the pod to search for stacktraces\n - labels.app: selector used to grab logs from pods across a deployment\n\n## TODO\n- [ ] Add additional documentation.\n- [ ] Finish suggestions error msg lookup\n\n", - "libraries": [ - "RW.K8sApplications", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-tail-logs-dynamic" - }, - { - "slug": "rw-cli-codecollection-k8s-image-check", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-image-check", - "display_name": "k8s-image-check", - "description": "Simple informational report that provides information about images in a namespace.", - "platform": "Kubernetes", - "author": "jon-funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check Image Rollover Times for Namespace `${NAMESPACE}`", - "List Images and Tags for Every Container in Running Pods for Namespace `${NAMESPACE}`", - "List Images and Tags for Every Container in Failed Pods for Namespace `${NAMESPACE}`", - "List ImagePullBackOff Events and Test Path and Tags for Namespace `${NAMESPACE}`" - ], - "capabilities": [ - "Check Image Rollover Times for Namespace `${NAMESPACE}`: Fetches and checks when images last rolled over in a namespace.", - "List Images and Tags for Every Container in Running Pods for Namespace `${NAMESPACE}`: Display the status, image name, image tag, and container name for running pods in the namespace.", - "List Images and Tags for Every Container in Failed Pods for Namespace `${NAMESPACE}`: Display the status, image name, image tag, and container name for failed pods in the namespace.", - "List ImagePullBackOff Events and Test Path and Tags for Namespace `${NAMESPACE}`: Search events in the last 5 minutes for BackOff events related to image pull issues. Run Skopeo to test if the image path exists and what tags are available." - ], - "readme": "# Kubernetes Image Check\n\nSimple informational report that provides information about images in a namespace. 
\n\n## Tasks\n- `Check Image Rollover Times In Namespace` - Fetches the list of images in a namespace and shows the last time the container was started and therefore the age of the image pull\n- `List Images and Tags for Every Container in Running Pods` - Display the status, image name, image tag, and container name for running pods in the namespace.\n- `List Images and Tags for Every Container in Failed Pods` - Display the status, image name, image tag, and container name for failed pods in the namespace.\n- `List Image Pull Back-Off Events and Test Path and Tags` - Search events in the last 5 minutes for BackOff events related to image pull issues. Run Skopeo to test if the image path exists and what tags are available.\n\n## Configuration\n\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `kubeconfig`: The kubeconfig secret containing access info for the cluster.\n- `kubectl`: The location service used to interpret shell commands. Default value is `kubectl-service.shared`.\n- `KUBERNETES_DISTRIBUTION_BINARY`: Which binary to use for Kubernetes CLI commands. Default value is `kubectl`.\n- `CONTEXT`: The Kubernetes context to operate within.\n- `NAMESPACE`: The name of the namespace to search.\n\n## TODO\n- [ ] Add documentation\n- [ ] Add github integration with source code vs image comparison\n- [ ] Find applicable raise issue use", - "libraries": [ - "DateTime", - "Collections", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-image-check" - }, - { - "slug": "rw-cli-codecollection-aws-lambda-health", - "collection_slug": "rw-cli-codecollection", - "name": "aws-lambda-health", - "display_name": "aws-lambda-health", - "description": "This runbook provides a comprehensive guide to managing and troubleshooting AWS Lambda functions. It covers tasks such as verifying the configuration of Lambda functions, analyzing invocation errors, monitoring performance metrics, and managing concurrency limits using AWS CLI. Additionally, it provides steps to inspect IAM roles and resource access permissions for Lambda. This runbook is essential for maintaining optimal function and security of AWS Lambda services.", - "platform": "AWS", - "author": "jon-funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Analyze AWS Lambda Invocation Errors in Region `${AWS_REGION}`", - "List Lambda Versions and Runtimes in AWS Region `${AWS_REGION}`", - "Analyze AWS Lambda Invocation Errors in Region `${AWS_REGION}`", - "Monitor AWS Lambda Performance Metrics in AWS Region `${AWS_REGION}`" - ], - "capabilities": [ - "Analyze AWS Lambda Invocation Errors in Region `${AWS_REGION}`: This bash script is designed to analyze AWS Lambda Invocation Errors for a specified function within a specified region.", - "List Lambda Versions and Runtimes in AWS Region `${AWS_REGION}`: This script is designed to list all the versions and runtimes of a specified AWS Lambda function.", - "Analyze AWS Lambda Invocation Errors in Region `${AWS_REGION}`: This bash script is designed to analyze AWS Lambda Invocation Errors for a specified function within a specified region.", - "Monitor AWS Lambda Performance Metrics in AWS Region `${AWS_REGION}`: This script is a bash utility for AWS Lambda functions that lists their notable metrics." 
- ], - "readme": "# aws-lambda-failure-issue-investigation CodeBundle\n### Tags:`AWS`, `AWS Lambda`, `Failure`, `Developer`, `Investigation`, `Service Disruption`, `Incident`, `Platform Issue`, `Troubleshooting`, `Resolution`, \n## CodeBundle Objective:\nThis runbook provides a comprehensive guide to managing and troubleshooting AWS Lambda functions. It covers tasks such as verifying the configuration of Lambda functions, analyzing invocation errors, monitoring performance metrics, and managing concurrency limits using AWS CLI. Additionally, it provides steps to inspect IAM roles and resource access permissions for Lambda. This runbook is essential for maintaining optimal function and security of AWS Lambda services.\n\n## CodeBundle Inputs:\n\nexport AWS_REGION=\"PLACEHOLDER\"\n\nexport LAMBDA_FUNCTION_NAME=\"PLACEHOLDER\"\n\nexport FUNCTION_NAME=\"PLACEHOLDER\"\n\nexport NEW_CONCURRENCY_LIMIT=\"PLACEHOLDER\"\n\n\n## CodeBundle Tasks:\n### `Verify AWS Lambda Function Configuration`\n#### Tags:`AWS`, `Lambda`, `Function Configuration`, `Verification`, `Script`, `Bash`, `AWS CLI`, `Function Policy`, `Function Aliases`, `Function Versions`, `Command Success`, `Command Failure`, \n### Task Documentation:\nThis script is used to verify the configuration of a specified AWS Lambda function. It retrieves and prints the function's configuration, policy, aliases, and versions. If any of these operations fail, the script will print an error message and exit with a status of 1. On successful completion, it prints a success message.\n#### Usage Example:\n`./verify_aws_lambda_function_config.sh`\n\n### `Analyze AWS Lambda Invocation Errors`\n#### Tags:`AWS`, `Lambda`, `Error Analysis`, `Invocation Errors`, `CloudWatch`, `Logs`, `Shell Script`, `Bash`, `Troubleshooting`, `Monitoring`, `Automation`, `AWS Region`, `Function Name`, `Log Streams`, `Log Events`, \n### Task Documentation:\nThis bash script is designed to analyze AWS Lambda Invocation Errors for a specified function within a specified region. It fetches the last 50 invocation errors from the AWS CloudWatch logs and prints them. If no errors are found, it prints a message stating that no invocation errors were found for the function. It requires AWS CLI and jq to be installed and properly configured.\n#### Usage Example:\n`./analyze_lambda_invocation_errors.sh`\n\n### `Monitor AWS Lambda Performance Metrics`\n#### Tags:`AWS`, `Lambda`, `CloudWatch`, `Logs`, `Metrics`, `Bash`, `Scripting`, `Function Monitoring`, `Error Tracking`, `Throttling`, `Invocations`, `Duration`, `Command Line Interface`, `AWS CLI`, `us-west-2`, `myLambdaFunction`, \n### Task Documentation:\nThis script is a bash utility for AWS Lambda functions. It retrieves and displays the details of a specified Lambda function, the last 100 log events, and various function metrics (Duration, Errors, Throttles, Invocations) for the past 24 hours in the AWS region 'us-west-2'. The function name and AWS region are defined as variables at the start of the script. This script requires AWS CLI and appropriate permissions to execute the commands.\n#### Usage Example:\n`./monitor_aws_lambda_performance_metrics.sh`\n\n### `Manage Lambda Concurrency Limits using AWS CLI`\n#### Tags:`Lambda Function`, `Concurrency Limit`, `AWS CLI`, `Shell Script`, `Bash`, `AWS Lambda`, `Cloud Computing`, `Infrastructure Management`, `Automation`, `DevOps`, \n### Task Documentation:\nThis script is designed to modify and verify the concurrency limit of a specified AWS Lambda function. 
It first retrieves the current concurrency limit of the function, then sets a new limit and verifies if the new limit has been set correctly. If the new limit is set correctly, it outputs a success message; otherwise, it outputs a failure message. The function name and new concurrency limit are specified by the user through the FUNCTION_NAME and NEW_CONCURRENCY_LIMIT variables.\n#### Usage Example:\n`./manage_lambda_concurrency_limits_awscli.sh`\n\n### `Inspect IAM roles and resource access permissions for Lambda`\n#### Tags:`bash script`, `AWS`, `IAM roles`, `Lambda function`, `policy document`, `list resources`, `get-policy`, `get-function-configuration`, `list-roles`, `list-attached-role-policies`, `get-policy-version`, `scripting`, `automation`, `cloud computing`, `security`, `access management`, \n### Task Documentation:\nThis script is designed to interact with AWS IAM and Lambda services to list IAM roles, retrieve the IAM role associated with a specified Lambda function, and get the policy attached to that role. It also obtains the policy document and lists the resources that the Lambda function has access to. The AWS region and the name of the Lambda function are defined as variables at the start of the script. The script uses AWS CLI commands and requires appropriate AWS credentials to be set up.\n#### Usage Example:\n`./inspect_lambda_iam_roles_permissions.sh us-east-1 your_lambda_function_name`\n", - "libraries": [ - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem", - "String", - "Process" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/aws-lambda-health" - }, - { - "slug": "rw-cli-codecollection-k8s-chaos-namespace", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-chaos-namespace", - "display_name": "k8s-chaos-namespace", - "description": "This codebundle provides chaos injection for kubernetes namespaces", - "platform": "Kubernetes", - "author": "jon-funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Kill Random Pods In Namespace `${NAMESPACE}`", - "OOMKill Pods In Namespace `${NAMESPACE}`", - "# RW.Core.Add Pre To Report ${process.stdout}", - "Mangle Service Selector In Namespace `${NAMESPACE}`", - "Mangle Service Port In Namespace `${NAMESPACE}`", - "Fill Random Pod Tmp Directory In Namespace `${NAMESPACE}`" - ], - "capabilities": [ - "Kill Random Pods In Namespace `${NAMESPACE}`: Randomly selects up to 10 pods in a namespace to delete to test HA", - "OOMKill Pods In Namespace `${NAMESPACE}`: Randomly selects n number of pods to oomkill", - "Mangle Service Selector In Namespace `${NAMESPACE}`: Breaks a service's label selector to cause a network disruption", - "Mangle Service Port In Namespace `${NAMESPACE}`: Changes a service's port to cause a network disruption", - "Fill Random Pod Tmp Directory In Namespace `${NAMESPACE}`: Attaches to a pod and fills the /tmp directory with random data" - ], - "readme": "# Kubernetes Namespace Chaos Engineering\n\nThis codebundle provides chaos injection for kubernetes namespaces \n\n## Tasks\n\n`Test Namespace Highly Available`\n`Test Node Drain`\n`Mangle Service Selector`\n`Mangle Service Port`\n`Fill Pod Tmp`\n\n## Configuration\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. 
The following variables should be set:\n\n- `KUBECONFIG`: The kubeconfig secret containing access info for the cluster.\n- `CONTEXT`: The Kubernetes context to operate within.\n- `NAMESPACE`: The name of the namespace to search. Leave it blank to search in all namespaces.\n\n\n## Requirements\n- A kubeconfig with appropriate RBAC permissions to perform the desired command.\n\n## TODO\n- [ ] Add additional documentation.\n\n", - "libraries": [ - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem", - "String", - "Process" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-chaos-namespace" - }, - { - "slug": "rw-cli-codecollection-gcp-project-cost-health", - "collection_slug": "rw-cli-codecollection", - "name": "gcp-project-cost-health", - "display_name": "gcp-project-cost-health", - "description": "Comprehensive toolkit for analyzing GCP costs and spending across projects using BigQuery billing export.", - "platform": "GCP", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Generate GCP Cost Report By Service and Project", - "Get GCP Cost Optimization Recommendations" - ], - "capabilities": [ - "Generate GCP Cost Report By Service and Project: Generates a detailed cost breakdown report for the last 30 days showing actual spending by project and GCP service using BigQuery billing export", - "Get GCP Cost Optimization Recommendations: Fetches COST-RELATED recommendations from GCP Recommender API (committed use discounts, idle resources, rightsizing, etc.). Filters out non-cost recommendations like security/IAM suggestions." - ], - "readme": "## GCP Project Cost Health & Reporting\n\nComprehensive toolkit for analyzing GCP costs and spending across projects using BigQuery billing export.\n\n## Overview\n\nThis codebundle provides detailed cost analysis and reporting for Google Cloud Platform (GCP) projects. It queries your BigQuery billing export to generate comprehensive cost reports showing spending by project, service, and SKU.\n\n## Features\n\n### \ud83d\udcca Historical Cost Reporting (`gcp_cost_historical_report.sh`)\n- **Multi-Project Analysis**: Analyze costs across multiple GCP projects simultaneously\n- **Service-Level Breakdown**: See costs by GCP service (Compute Engine, Cloud Storage, BigQuery, etc.)\n- **SKU-Level Detail**: Drill down to individual SKUs for granular cost visibility\n- **Summary Statistics**: Quick view of total costs, high-cost contributors, and spending trends\n- **Multiple Output Formats**: Table (human-readable), CSV (spreadsheet), JSON (programmatic)\n- **Time-Based Analysis**: Default 30-day lookback period (configurable)\n\n### \ud83d\udca1 Cost Optimization Recommendations (`gcp_recommendations.sh`)\n- **Committed Use Discounts (CUDs)**: Identify opportunities for 1-year and 3-year commitments\n- **Idle Resource Detection**: Find unused compute instances, disks, and databases\n- **Right-Sizing Recommendations**: Optimize machine types based on actual usage\n- **Automated Issue Generation**: Creates actionable issues with estimated savings\n- **Multi-Project Support**: Scans all accessible projects or specified project list\n\n## Prerequisites\n\n### 1. 
GCP Billing Export to BigQuery\n\nYou must have billing export enabled in your GCP organization:\n\n```bash\n# Enable billing export (run once per organization)\n# This is typically done through the GCP Console:\n# Billing > Billing export > BigQuery export\n```\n\nThe billing export creates a BigQuery dataset with a table like:\n```\nproject-id.billing_dataset.gcp_billing_export_v1_XXXXXX_XXXXXX_XXXXXX\n```\n\n### 2. Required GCP Permissions\n\nThe service account or user running these scripts needs permissions on **multiple resources**:\n\n#### For Historical Cost Reporting\n\n**On the Billing Export Project** (where BigQuery billing export is stored):\n- **BigQuery Data Viewer** (`roles/bigquery.dataViewer`) - to read billing export tables\n- **BigQuery Job User** (`roles/bigquery.jobUser`) - to run queries on billing data\n- **BigQuery Metadata Viewer** (`roles/bigquery.metadataViewer`) - to list datasets and tables (for auto-discovery)\n\n**Note**: The billing export project is typically a dedicated project (often named like \"billing-export\" or \"shared\") that contains the BigQuery dataset with billing export tables. This is different from the projects you're analyzing costs for.\n\n#### For Cost Optimization Recommendations\n\n**On Each Project Being Analyzed**:\n- **Recommender Viewer** (`roles/recommender.viewer`) - to read cost optimization recommendations\n- **Compute Viewer** (`roles/compute.viewer`) - to list compute instances for regional CUD recommendations\n- **Project Viewer** (`roles/viewer`) - to get project names and metadata\n\n**On Billing Account** (optional, for billing-level recommendations):\n- **Billing Account Viewer** (`roles/billing.viewer`) - to access billing account information\n- **Recommender Viewer** (`roles/recommender.billingAccountViewer`) - for billing-level CUD recommendations\n\n**Important**: The Recommender API must be enabled on each project:\n```bash\ngcloud services enable recommender.googleapis.com --project=PROJECT_ID\n```\n\n#### Minimum IAM Permissions Setup:\n\n```bash\n# 1. Grant permissions on the billing export project (for cost reporting)\ngcloud projects add-iam-policy-binding BILLING_EXPORT_PROJECT_ID \\\n --member=\"serviceAccount:SERVICE_ACCOUNT@PROJECT.iam.gserviceaccount.com\" \\\n --role=\"roles/bigquery.dataViewer\"\n\ngcloud projects add-iam-policy-binding BILLING_EXPORT_PROJECT_ID \\\n --member=\"serviceAccount:SERVICE_ACCOUNT@PROJECT.iam.gserviceaccount.com\" \\\n --role=\"roles/bigquery.jobUser\"\n\ngcloud projects add-iam-policy-binding BILLING_EXPORT_PROJECT_ID \\\n --member=\"serviceAccount:SERVICE_ACCOUNT@PROJECT.iam.gserviceaccount.com\" \\\n --role=\"roles/bigquery.metadataViewer\"\n\n# 2. Grant permissions on each project being analyzed (for recommendations)\ngcloud projects add-iam-policy-binding PROJECT_ID \\\n --member=\"serviceAccount:SERVICE_ACCOUNT@PROJECT.iam.gserviceaccount.com\" \\\n --role=\"roles/recommender.viewer\"\n\ngcloud projects add-iam-policy-binding PROJECT_ID \\\n --member=\"serviceAccount:SERVICE_ACCOUNT@PROJECT.iam.gserviceaccount.com\" \\\n --role=\"roles/compute.viewer\"\n\ngcloud projects add-iam-policy-binding PROJECT_ID \\\n --member=\"serviceAccount:SERVICE_ACCOUNT@PROJECT.iam.gserviceaccount.com\" \\\n --role=\"roles/viewer\"\n\n# 3. Enable Recommender API on each project\ngcloud services enable recommender.googleapis.com --project=PROJECT_ID\n\n# 4. 
(Optional) Grant billing account access for billing-level recommendations\ngcloud beta billing accounts add-iam-policy-binding BILLING_ACCOUNT_ID \\\n --membe", - "libraries": [ - "Collections", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/gcp-project-cost-health" - }, - { - "slug": "rw-cli-codecollection-azure-appservice-webapp-ops", - "collection_slug": "rw-cli-codecollection", - "name": "azure-appservice-webapp-ops", - "display_name": "azure-appservice-webapp-ops", - "description": "- Checks whether the plan supports deployment slots (Standard or Premium tier).", - "platform": "Azure", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Restart App Service `${APP_SERVICE_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`", - "Swap Deployment Slots for App Service `${APP_SERVICE_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`", - "Scale Up App Service `${APP_SERVICE_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`", - "Scale Down App Service `${APP_SERVICE_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`", - "Scale Out Instances for App Service `${APP_SERVICE_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` by ${SCALE_OUT_FACTOR}x", - "Scale In Instances for App Service `${APP_SERVICE_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` to 1/${SCALE_IN_FACTOR}", - "Redeploy App Service `${APP_SERVICE_NAME}` from Latest Source in Resource Group `${AZ_RESOURCE_GROUP}`" - ], - "capabilities": [ - "Restart App Service `${APP_SERVICE_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`: Restarts the Azure App Service and verifies success.", - "Swap Deployment Slots for App Service `${APP_SERVICE_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`: Calls the script that checks plan tier, lists slots, auto-determines source/target if only one non-prod slot", - "Scale Up App Service `${APP_SERVICE_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`: Scales up the App Service to the next plan from current SKU", - "Scale Down App Service `${APP_SERVICE_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`: Decreases SKU based on a predefined map (e.g. S2->S1, S1->B3, etc.)", - "Scale Out Instances for App Service `${APP_SERVICE_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` by ${SCALE_OUT_FACTOR}x: Multiplies current worker count by SCALE_OUT_FACTOR", - "Scale In Instances for App Service `${APP_SERVICE_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}` to 1/${SCALE_IN_FACTOR}: Decreases the number of instances within the current App Service Plan", - "Redeploy App Service `${APP_SERVICE_NAME}` from Latest Source in Resource Group `${AZ_RESOURCE_GROUP}`: Forces a re-deployment of the Azure App Service from the configured code or container source." 
- ], - "readme": "\n## Swap Deployment Slots for App Service `${APP_SERVICE_NAME}` in Resource Group `${AZ_RESOURCE_GROUP}`\n- Checks whether the plan supports deployment slots (Standard or Premium tier).\n- Lists all available slots.\n- If SOURCE_SLOT and TARGET_SLOT are not provided, it attempts to figure them out automatically, assuming:\n - The \u201cproduction\u201d slot is the default slot with \"isSlot\": false.\n - The non-production slot(s) have \"isSlot\": true.\n - If exactly one non-production slot exists, we set source to that slot and target to \"production\".\n - If there are multiple non-production slots, we fail unless the user specifies which ones to swap.", - "libraries": [ - "DateTime", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/azure-appservice-webapp-ops" - }, - { - "slug": "rw-cli-codecollection-azure-appservice-webapp-health", - "collection_slug": "rw-cli-codecollection", - "name": "azure-appservice-webapp-health", - "display_name": "azure-appservice-webapp-health", - "description": "Checks key App Service metrics and the service plan, fetches logs, config and activities for the service and generates a report of present issues for any found.", - "platform": "Azure", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check for Resource Health Issues Affecting App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check App Service `${APP_SERVICE_NAME}` Health Check Metrics In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check App Service `${APP_SERVICE_NAME}` Configuration Health In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Deployment Health of App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Fetch App Service `${APP_SERVICE_NAME}` Activities In Resource Group `${AZ_RESOURCE_GROUP}`", - "Generate App Service Health Score for `${APP_SERVICE_NAME}` in resource group `${AZ_RESOURCE_GROUP}`", - "Check for Resource Health Issues Affecting App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check App Service `${APP_SERVICE_NAME}` Health in Resource Group `${AZ_RESOURCE_GROUP}`", - "Fetch App Service `${APP_SERVICE_NAME}` Utilization Metrics In Resource Group `${AZ_RESOURCE_GROUP}`", - "Get App Service `${APP_SERVICE_NAME}` Logs In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Configuration Health of App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Deployment Health of App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Fetch App Service `${APP_SERVICE_NAME}` Activities In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Recent Activities for App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Recommendations and Notifications for App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Diagnostic Logs for App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`", - "Check Logs for Errors in App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`" - ], - "capabilities": [ - "Check for Resource Health Issues Affecting App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Fetch a list of issues that might affect the APP Service as reported from Azure.", - "Check App Service `${APP_SERVICE_NAME}` Health Check Metrics In Resource Group `${AZ_RESOURCE_GROUP}`: Checks the 
health check metric of an appservice workload. If issues are generated with severity 1 or 2, the score is 0 / unhealthy.", - "Check App Service `${APP_SERVICE_NAME}` Configuration Health In Resource Group `${AZ_RESOURCE_GROUP}`: Checks the configuration health of an appservice workload. 1 = healthy, 0 = unhealthy.", - "Check Deployment Health of App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Fetch deployment health of the App Service", - "Fetch App Service `${APP_SERVICE_NAME}` Activities In Resource Group `${AZ_RESOURCE_GROUP}`: Gets the events of the appservice and checks for errors", - "Check for Resource Health Issues Affecting App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Fetch a list of issues that might affect the APP Service as reported from Azure.", - "Check App Service `${APP_SERVICE_NAME}` Health in Resource Group `${AZ_RESOURCE_GROUP}`: Checks the health status of an appservice workload.", - "Fetch App Service `${APP_SERVICE_NAME}` Utilization Metrics In Resource Group `${AZ_RESOURCE_GROUP}`: Reviews all key metrics (CPU, Requests, Bandwidth, HTTP status codes, Threads, Disk, Response Time) for the last 30 minutes with 5-minute intervals", - "Get App Service `${APP_SERVICE_NAME}` Logs In Resource Group `${AZ_RESOURCE_GROUP}`: Download and display recent raw log files from App Service (last 50 lines from each log file)", - "Check Configuration Health of App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Fetch the configuration health of the App Service", - "Check Deployment Health of App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Fetch deployment health of the App Service", - "Fetch App Service `${APP_SERVICE_NAME}` Activities In Resource Group `${AZ_RESOURCE_GROUP}`: Gets the events of the appservice and checks for errors", - "Check Recent Activities for App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Analyze recent Azure activities for the App Service, including critical operations and user actions.", - "Check Recommendations and Notifications for App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Fetch Azure Advisor, Service Health, and Security Center recommendations for the App Service.", - "Check Diagnostic Logs for App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Check diagnostic settings, query Log Analytics and Application Insights for errors and failed requests", - "Check Logs for Errors in App Service `${APP_SERVICE_NAME}` In Resource Group `${AZ_RESOURCE_GROUP}`: Analyze App Service logs for errors using Azure Monitor APIs and Application Insights - creates structured issues for detected problems" - ], - "readme": "# Azure App Service Triage\nChecks key App Service metrics and the service plan, fetches logs, config and activities for the service and generates a report of present issues for any found.\n\n## Configuration\n\nThe TaskSet requires initialization to import necessary secrets, services, and user variables.
The following variables should be set:\n\nexport APPSERVICE\nexport AZ_RESOURCE_GROUP\n\n## Notes\n\nThis codebundle assumes the service principal authentication flow.\n\n## TODO\n- [ ] look for notable activities in list\n- [ ] config best practices check\n- [ ] Add documentation", - "libraries": [ - "DateTime", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/azure-appservice-webapp-health" - }, - { - "slug": "rw-cli-codecollection-k8s-serviceaccount-check", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-serviceaccount-check", - "display_name": "k8s-serviceaccount-check", - "description": "Tasks that help debug or validate service accounts and their access.", - "platform": "Kubernetes", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Test Service Account Access to Kubernetes API Server in Namespace `${NAMESPACE}`" - ], - "capabilities": [ - "Test Service Account Access to Kubernetes API Server in Namespace `${NAMESPACE}`: Runs a curl pod as a specific serviceaccount and attempts to call the Kubernetes API server with the mounted token" - ], - "readme": "# Kubernetes Service Account Check\n\nTasks that help debug or validate service accounts and their access. \n\n## Tasks\n- `Test Service Account Access to Kubernetes API Server` - Runs a curl pod as a specific serviceaccount and attempts to call the Kubernetes API server with the mounted token\n\n\n## Configuration\n\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `kubeconfig`: The kubeconfig secret containing access info for the cluster.\n- `kubectl`: The location service used to interpret shell commands. Default value is `kubectl-service.shared`.\n- `KUBERNETES_DISTRIBUTION_BINARY`: Which binary to use for Kubernetes CLI commands. Default value is `kubectl`.\n- `CONTEXT`: The Kubernetes context to operate within.\n- `NAMESPACE`: The name of the namespace to search.\n- `SERVICE_ACCOUNT`: The service account to test access with. Defaults to `default`.\n\n## Requirements\nThis task creates and deletes a pod in the specified namespace; RBAC permissions must support this.
\n\n## TODO\n- [ ] Add documentation\n- [ ] Add github integration with source code vs image comparison\n- [ ] Find applicable raise issue use", - "libraries": [ - "DateTime", - "Collections", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-serviceaccount-check" - }, - { - "slug": "rw-cli-codecollection-k8s-vault-healthcheck", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-vault-healthcheck", - "display_name": "k8s-vault-healthcheck", - "description": "A taskset which checks the status of a Vault workload in Kubernetes.", - "platform": "Kubernetes", - "author": "jon-funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Fetch Vault CSI Driver Logs in Namespace `${NAMESPACE}`", - "Get Vault CSI Driver Warning Events in `${NAMESPACE}`", - "Check Vault CSI Driver Replicas", - "Fetch Vault Pod Workload Logs in Namespace `${NAMESPACE}` with Labels `${LABELS}`", - "Get Related Vault Events in Namespace `${NAMESPACE}`", - "Fetch Vault StatefulSet Manifest Details in `${NAMESPACE}`", - "Fetch Vault DaemonSet Manifest Details in Kubernetes Cluster `${NAMESPACE}`", - "Verify Vault Availability in Namespace `${NAMESPACE}` and Context `${CONTEXT}`", - "Check Vault StatefulSet Replicas in `NAMESPACE`" - ], - "capabilities": [ - "Fetch Vault CSI Driver Logs in Namespace `${NAMESPACE}`: Fetches the last 100 lines of logs for the vault CSI driver.", - "Get Vault CSI Driver Warning Events in `${NAMESPACE}`: Fetches warning-type events related to the vault CSI driver.", - "Check Vault CSI Driver Replicas: Performs an inspection on the replicas of the vault CSI driver daemonset.", - "Fetch Vault Pod Workload Logs in Namespace `${NAMESPACE}` with Labels `${LABELS}`: Fetches the last 100 lines of logs for all vault pod workloads in the vault namespace.", - "Get Related Vault Events in Namespace `${NAMESPACE}`: Fetches all warning-type events related to vault in the vault namespace.", - "Fetch Vault StatefulSet Manifest Details in `${NAMESPACE}`: Fetches the current state of the vault statefulset manifest for inspection.", - "Fetch Vault DaemonSet Manifest Details in Kubernetes Cluster `${NAMESPACE}`: Fetches the current state of the vault daemonset manifest for inspection.", - "Verify Vault Availability in Namespace `${NAMESPACE}` and Context `${CONTEXT}`: Curls the vault endpoint and checks the HTTP response code.", - "Check Vault StatefulSet Replicas in `NAMESPACE`: Pulls the replica information for the Vault statefulset and checks if it's highly available" - ], - "readme": "# Kubernetes Vault Triage\n\nA taskset which checks the status of a Vault workload in Kubernetes.\n\n## Tasks\n`Fetch Vault CSI Driver Logs`\n`Get Vault CSI Driver Warning Events`\n`Check Vault CSI Driver Replicas`\n`Fetch Vault Logs`\n`Get Related Vault Events`\n`Fetch Vault StatefulSet Manifest Details`\n`Fetch Vault DaemonSet Manifest Details`\n`Verify Vault Availability`\n`Check Vault StatefulSet Replicas`\n\n## Configuration\n\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `kubeconfig`: The kubeconfig secret containing access info for the cluster.\n- `kubectl`: The location service used to interpret shell commands. Default value is `kubectl-service.shared`.\n- `KUBERNETES_DISTRIBUTION_BINARY`: Which binary to use for Kubernetes CLI commands. 
Default value is `kubectl`.\n- `CONTEXT`: The Kubernetes context to operate within.\n- `NAMESPACE`: The name of the namespace to search. Leave it blank to search in all namespaces.\n- `LABELS`: Labels used to select vault resources.\n- `VAULT_URL`: The url of the vault instance.\n\n## Notes\n\nPlease note that the script requires permissions to execute commands within the Kubernetes cluster, and it may require additional permissions depending on the tasks it performs (for example, fetching storage utilization for PVC mounts requires kubectl exec permissions). Make sure to review the tasks and the required permissions before running the script.\n\n## TODO\n- [ ] Add documentation\n- [ ] Refine raised issues ", - "libraries": [ - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-vault-healthcheck" - }, - { - "slug": "rw-cli-codecollection-k8s-gitops-gh-remediate", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-gitops-gh-remediate", - "display_name": "k8s-gitops-gh-remediate", - "description": "This codebundle provides a suite of tasks aimed at remediating configuration issues related to Kubernetes deployments managed in github repositories.", - "platform": "Kubernetes", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Remediate Readiness and Liveness Probe GitOps Manifests in Namespace `${NAMESPACE}`", - "Increase ResourceQuota Limit for Namespace `${NAMESPACE}` in GitHub GitOps Repository", - "Adjust Pod Resources to Match VPA Recommendation in `${NAMESPACE}`", - "Expand Persistent Volume Claims in Namespace `${NAMESPACE}`" - ], - "capabilities": [ - "Remediate Readiness and Liveness Probe GitOps Manifests in Namespace `${NAMESPACE}`: Fixes misconfigured readiness or liveness probe configurations for pods in a namespace that are managed in a GitHub GitOps repository", - "Increase ResourceQuota Limit for Namespace `${NAMESPACE}` in GitHub GitOps Repository: Looks for a resourcequota object in the namespace and increases it if applicable, and if it is managed in a GitHub GitOps repository", - "Adjust Pod Resources to Match VPA Recommendation in `${NAMESPACE}`: Queries the namespace for any Vertical Pod Autoscaler resource recommendations and applies them to GitOps GitHub controlled manifests.", - "Expand Persistent Volume Claims in Namespace `${NAMESPACE}`: Checks the disk utilization for all PVCs and updates the GitOps manifest for any that are highly utilized." - ], - "readme": "# Kubernetes GitOps GitHub Remediate\n\nThis codebundle provides a suite of tasks aimed at remediating configuration issues related to Kubernetes deployments managed in github repositories.\n\n## Tasks\n`Remediate Readiness and Liveness Probe GitOps Manifests in Namespace`\n`Increase ResourceQuota for Namespace`\n`Adjust Pod Resources to Match VPA Recommendation in`\n\n## Configuration\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `kubeconfig`: The kubeconfig secret containing access info for the cluster.\n- `KUBERNETES_DISTRIBUTION_BINARY`: Which binary to use for Kubernetes CLI commands. Default value is `kubectl`.\n- `CONTEXT`: The Kubernetes context to operate within.\n- `NAMESPACE`: The name of the namespace to search. 
Leave it blank to search in all namespaces.\n\n## Requirements\n- A kubeconfig with appropriate RBAC permissions to perform the desired command.\n\n## TODO\n- [ ] Add additional documentation.\n\n", - "libraries": [ - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.NextSteps", - "RW.platform", - "OperatingSystem", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-gitops-gh-remediate" - }, - { - "slug": "rw-cli-codecollection-aws-eks-node-reboot", - "collection_slug": "rw-cli-codecollection", - "name": "aws-eks-node-reboot", - "display_name": "aws-eks-node-reboot", - "description": "This codebundle queries the state of a nodegroup within an EKS cluster to check its state and raise issues if it's unhealthy.", - "platform": "AWS", - "author": "jon-funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check EKS Nodegroup Status in `${EKS_CLUSTER_NAME}`" - ], - "capabilities": [ - "Check EKS Nodegroup Status in `${EKS_CLUSTER_NAME}`: Performs a check on a given cluster's nodegroup, raising an issue if the status of the nodegroup is not healthy." - ], - "readme": "# AWS EKS Nodegroup Healthcheck\nThis codebundle queries the state of a nodegroup within an EKS cluster to check its state and raise issues if it's unhealthy.\n\n## Tasks\n`Check EKS Nodegroup Status`\n\n## Configuration\n\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `aws_access_key_id`: The service account's access key ID, used during the `aws sts` call.\n- `aws_secret_access_key`: The service account's secret access key, used during the `aws sts` call.\n- `aws_role_arn`: The full aws role ARN that will be assumed.\n- `aws_assume_role_name`: The name of the role to assume as part of the `aws sts` assume role call.\n- `AWS_DEFAULT_REGION`: The AWS region to perform API requests in and for resources.\n- `AWS_SERVICE`: The remote aws service to use for requests.\n- `EKS_CLUSTER_NAME`: The name of the EKS cluster to query.\n- `EKS_NODEGROUP`: The nodegroup within the cluster to check the status of.\n\n## Notes\n\nThis codebundle assumes a traditional service account authentication using the assume role functionality of `aws sts`, and therefore a role with the correct access will be required so that it can be assumed by the service account for a temporary token.\n\n## TODO\n- [ ] Add documentation\n- [ ] Look at dynamic/scaled nodegroup support", - "libraries": [ - "RW.CLI", - "RW.Core" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/aws-eks-node-reboot" - }, - { - "slug": "rw-cli-codecollection-curl-http-ok", - "collection_slug": "rw-cli-codecollection", - "name": "curl-http-ok", - "display_name": "curl-http-ok", - "description": "This codebundle validates the response code of an endpoint using cURL and provides the total time of the request.
It supports Linux, macOS, Windows, and HTTP.", - "platform": "Unknown", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Validate HTTP URL Availability and Timeliness", - "Check HTTP URL Availability and Timeliness" - ], - "capabilities": [ - "Validate HTTP URL Availability and Timeliness: Use cURL to validate single or multiple http responses", - "Check HTTP URL Availability and Timeliness: Use cURL to validate single or multiple http responses" - ], - "readme": "# cURL HTTP OK Codebundle\nThis codebundle validates the response code of an endpoint using cURL and provides the total time of the request. It supports Linux, macOS, Windows, and HTTP.\n\n## SLI\nIt periodically uses curl to validate the endpoint and pushes a metric to the RunWhen Platform of 1 or 0. A **1** indicates that an acceptable HTTP response code was received within the desired target latency. A **0** indicates that either an acceptable HTTP response code or the target latency was not achieved. \n\nThis codebundle configuration requires the following user variables:\n\n- ${URLS} (string): Comma-separated list of URLs to perform requests against. It accepts a string value and has a default value of https://www.runwhen.com. Example usage: https://www.runwhen.com,https://api.runwhen.com.\n\n- ${TARGET_LATENCY} (string): Represents the maximum latency in seconds allowed for requests. It should be a float value. The default value is 1.2, and an example value is 1.2.\n\n- ${ACCEPTABLE_RESPONSE_CODES} (string): Comma-separated list of HTTP response codes that indicate success and connectivity. This allows endpoints to be considered healthy when returning various success responses (2xx), redirects (3xx), authentication challenges (401), or access denied (403). The default value is 200,201,202,204,301,302,307,401,403, and an example value is 200,201,202,204,301,302,307,401,403.\n\n- ${VERIFY_SSL} (string): Whether to verify SSL certificates. Set to 'false' to ignore SSL certificate errors for self-signed or untrusted certificates. It should be a string value. The default value is false, and an example value is true.\n\n## TaskSet\nSimilar to the SLI, this codebundle configuration requires the following user variables:\n\n- ${URLS} (string): Comma-separated list of URLs to perform requests against. It accepts a string value and has a default value of https://www.runwhen.com. Example usage: https://www.runwhen.com,https://api.runwhen.com.\n\n- ${TARGET_LATENCY} (string): Represents the maximum latency in seconds allowed for requests. It should be a float value. The default value is 1.2, and an example value is 1.2.\n\n- ${ACCEPTABLE_RESPONSE_CODES} (string): Comma-separated list of HTTP response codes that indicate success and connectivity. This allows endpoints to be considered healthy when returning various success responses (2xx), redirects (3xx), authentication challenges (401), or access denied (403). The default value is 200,201,202,204,301,302,307,401,403, and an example value is 200,201,202,204,301,302,307,401,403.\n\n- ${VERIFY_SSL} (string): Whether to verify SSL certificates. Set to 'false' to ignore SSL certificate errors for self-signed or untrusted certificates. It should be a string value.
The default value is false, and an example value is true.\n\nIf either an acceptable HTTP response code or the target latency is not achieved, an issue is raised with the RunWhen Platform so that further troubleshooting can take place.\n\n## Key Benefits\n\n- **Comprehensive Connectivity Detection**: Accepts a wide range of HTTP status codes (2xx, 3xx, 401, 403) that indicate the endpoint is reachable and functioning, even if not returning a perfect 200 OK.\n- **Solves Authentication Challenge Problem**: 401 (authentication required) and 403 (forbidden) responses are considered healthy since they indicate the server is working and responding.\n- **Handles All Redirect Types**: Supports 301 (permanent), 302 (temporary), and 307 (temporary with method preservation) redirects as healthy responses.\n- **Covers Success Variations**: Includes 201 (created), 202 (accepted), and 204 (no content) for APIs that return different success codes.\n- **Flexible Configuration**: Easily customize which HTTP status codes should be considered healthy for your specific use case. \n\n", - "libraries": [ - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/curl-http-ok" - }, - { - "slug": "rw-cli-codecollection-k8s-namespace-healthcheck", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-namespace-healthcheck", - "display_name": "k8s-namespace-healthcheck", - "description": "This codebundle is used for searching in a namespace for possible issues to triage; covering things such as scraping logs, checking for anomalies in events, looking for pod restarts, etc. These tasks can be performed with just native kubernetes objects and do not require additional logging / tracing tools to be set up by the user.
Problems identified during triage will result in raised issues with intelligent severity adjustment - for example, scheduling failures are treated as lower severity (4) wh", - "platform": "Kubernetes", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Get Error Event Count within ${RW_LOOKBACK_WINDOW} and calculate Score", - "Get Container Restarts and Score in Namespace `${NAMESPACE}`", - "Get NotReady Pods in `${NAMESPACE}`", - "Generate Namespace Score in `${NAMESPACE}`", - "Inspect Warning Events in Namespace `${NAMESPACE}`", - "Inspect Container Restarts In Namespace `${NAMESPACE}`", - "Inspect Pending Pods In Namespace `${NAMESPACE}`", - "Inspect Failed Pods In Namespace `${NAMESPACE}`", - "Inspect Workload Status Conditions In Namespace `${NAMESPACE}`", - "Get Listing Of Resources In Namespace `${NAMESPACE}`", - "Check Event Anomalies in Namespace `${NAMESPACE}`", - "Check Missing or Risky PodDisruptionBudget Policies in Namepace `${NAMESPACE}`", - "Check Resource Quota Utilization in Namespace `${NAMESPACE}`" - ], - "capabilities": [ - "Get Error Event Count within ${RW_LOOKBACK_WINDOW} and calculate Score: Captures error events and counts them within the RW_LOOKBACK_WINDOW timeframe, consistent with runbook analysis.", - "Get Container Restarts and Score in Namespace `${NAMESPACE}`: Counts the total sum of container restarts within a timeframe and determines if they're beyond a threshold.", - "Get NotReady Pods in `${NAMESPACE}`: Fetches a count of unready pods.", - "Inspect Warning Events in Namespace `${NAMESPACE}`: Queries all warning events in a given namespace within the RW_LOOKBACK_WINDOW timeframe,", - "Inspect Container Restarts In Namespace `${NAMESPACE}`: Fetches pods that have container restarts and provides a detailed analysis of restart causes including proper OOM vs liveness probe failure detection.", - "Inspect Pending Pods In Namespace `${NAMESPACE}`: Fetches pods that are pending and provides details.", - "Inspect Failed Pods In Namespace `${NAMESPACE}`: Fetches all pods which are not running (unready) in the namespace and adds them to a report for future review.", - "Inspect Workload Status Conditions In Namespace `${NAMESPACE}`: Parses all workloads in a namespace and inspects their status conditions for issues. Status conditions with a status value of False are considered an error.", - "Get Listing Of Resources In Namespace `${NAMESPACE}`: Simple fetch all to provide a snapshot of information about the workloads in the namespace for future review in a report.", - "Check Event Anomalies in Namespace `${NAMESPACE}`: Fetches non warning events in a namespace within a timeframe and checks for unusual activity, raising issues for any found.", - "Check Missing or Risky PodDisruptionBudget Policies in Namepace `${NAMESPACE}`: Searches through deployemnts and statefulsets to determine if PodDistruptionBudgets are missing and/or are configured in a risky way that might affect maintenance activities.", - "Check Resource Quota Utilization in Namespace `${NAMESPACE}`: Lists any namespace resource quotas and checks their utilization, raising issues if they are above 80%" - ], - "readme": "# Kubernetes Namespace Triage\nThis codebundle is used for searching in a namespace for possible issues to triage; covering things such as scraping logs, checking for anomalies in events, looking for pod restarts, etc. These tasks can be performed with just native kubernetes objects and do not require additional logging / tracing tools be setup by the user. 
Problems identified during triage will result in raised issues with intelligent severity adjustment - for example, scheduling failures are treated as lower severity (4) when the deployment has the expected number of replicas or is actively scaling (HPA operations), but higher severity (3) when replicas are below the desired count and not scaling.\n\n## Tasks\n\n`Trace And Troubleshoot Namespace Warning Events And Errors`\n`Troubleshoot Unready Pods In Namespace For Report`\n`Troubleshoot Workload Status Conditions In Namespace`\n`Get Listing Of Workloads In Namespace`\n`Check For Namespace Event Anomalies`\n`Check Missing or Risky PodDisruptionBudget Policies`\n\n\n## Configuration\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `kubeconfig`: The kubeconfig secret containing access info for the cluster.\n- `kubectl`: The location service used to interpret shell commands. Default value is `kubectl-service.shared`.\n- `KUBERNETES_DISTRIBUTION_BINARY`: Which binary to use for Kubernetes CLI commands. Default value is `kubectl`.\n- `CONTEXT`: The Kubernetes context to operate within.\n- `NAMESPACE`: The name of the namespace to search. Leave it blank to search in all namespaces.\n- `ERROR_PATTERN`: What error pattern to grep for in logs when tracing issues.\n- `SERVICE_ERROR_PATTERN`: The error pattern used when extracting and summarizing error logs from services.\n- `SERVICE_EXCLUDE_PATTERN`: The patterns used to exclude results when checking service logs in a namespace. Useful to reduce noise.\n- `ANOMALY_THRESHOLD`: What non-warning event count constitutes an anomaly for raising issues.\n- `RW_LOOKBACK_WINDOW`: The time window (in (h) hours or (m) minutes) to look back for time-sensitive issues like failed pods, pending pods, workload status conditions, event anomalies, container restart analysis, and warning events. Resources with issues older than this window will be ignored. This parameter is used consistently across both SLI and runbook tasks to ensure aligned analysis, with the SLI automatically normalizing the window for its execution frequency. Default value is `1h`.\n- `CONTAINER_RESTART_THRESHOLD`: The maximum total container restarts to still be considered healthy. Only containers exceeding this threshold will trigger detailed restart analysis.
Default value is `3`.\n\n## Requirements\n- A kubeconfig with appropriate RBAC permissions to perform the desired command.\n\n## TODO\n- [ ] Add additional documentation.\n\n", - "libraries": [ - "DateTime", - "RW.K8sLog", - "Collections", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem", - "RW.K8sHelper", - "RW.NextSteps", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-namespace-healthcheck" - }, - { - "slug": "rw-cli-codecollection-terraform-cloud-workspace-lock-check", - "collection_slug": "rw-cli-codecollection", - "name": "terraform-cloud-workspace-lock-check", - "display_name": "terraform-cloud-workspace-lock-check", - "description": "", - "platform": "Unknown", - "author": "nmadhok", - "support_tags": [ - "rw" - ], - "tasks": [ - "Checking whether the Terraform Cloud Workspace '${TERRAFORM_WORKSPACE_NAME}' is in a locked state" - ], - "capabilities": [ - "Checking whether the Terraform Cloud Workspace '${TERRAFORM_WORKSPACE_NAME}' is in a locked state: Use curl to check whether the Terraform Cloud Workspace is in a locked state" - ], - "readme": "", - "libraries": [ - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/terraform-cloud-workspace-lock-check" - }, - { - "slug": "rw-cli-codecollection-azure-kv-health", - "collection_slug": "rw-cli-codecollection", - "name": "azure-kv-health", - "display_name": "azure-kv-health", - "description": "This codebundle runs a suite of metrics checks for Key Vault in Azure. It identifies:", - "platform": "Azure", - "author": "saurabh3460", - "support_tags": [ - "rw" - ], - "tasks": [ - "Count Key Vault Resource Health in resource group `${AZURE_RESOURCE_GROUP}`", - "Count Key Vault Availability in resource group `${AZURE_RESOURCE_GROUP}`", - "Count Key Vault configuration in resource group `${AZURE_RESOURCE_GROUP}`", - "Count Expiring Key Vault Items in resource group `${AZURE_RESOURCE_GROUP}`", - "Count Key Vault Log Issues in resource group `${AZURE_RESOURCE_GROUP}`", - "Count Key Vault Performance Metrics in resource group `${AZURE_RESOURCE_GROUP}`", - "Generate Comprehensive Key Vault Health Score", - "Check Key Vault Resource Health in resource group `${AZURE_RESOURCE_GROUP}`", - "Check Key Vault Availability in resource group `${AZURE_RESOURCE_GROUP}`", - "Check Key Vault Configuration in resource group `${AZURE_RESOURCE_GROUP}`", - "Check Expiring Key Vault Items in resource group `${AZURE_RESOURCE_GROUP}`", - "Check Key Vault Logs for Issues in resource group `${AZURE_RESOURCE_GROUP}`", - "Check Key Vault Performance Metrics in resource group `${AZURE_RESOURCE_GROUP}`" - ], - "capabilities": [ - "Count Key Vault Resource Health in resource group `${AZURE_RESOURCE_GROUP}`: Counts the health status of Key Vaults in the specified resource group", - "Count Key Vault Availability in resource group `${AZURE_RESOURCE_GROUP}`: Counts the number of Azure Key Vaults with availability below 100%", - "Count Key Vault configuration in resource group `${AZURE_RESOURCE_GROUP}`: Counts Key Vault misconfigurations", - "Count Expiring Key Vault Items in resource group `${AZURE_RESOURCE_GROUP}`: Count expiring secrets, certificates, and keys in Key Vaults", - "Count Key Vault Log Issues in resource group `${AZURE_RESOURCE_GROUP}`: Count Key Vault log issues", - "Count Key Vault Performance Metrics in resource group `${AZURE_RESOURCE_GROUP}`:
Count Key Vault performance metrics issues", - "Check Key Vault Resource Health in resource group `${AZURE_RESOURCE_GROUP}`: Check the health status of Key Vaults in the specified resource group", - "Check Key Vault Availability in resource group `${AZURE_RESOURCE_GROUP}`: Lists the number of Azure Key Vaults with availability below 100%", - "Check Key Vault Configuration in resource group `${AZURE_RESOURCE_GROUP}`: Lists Key Vault misconfigurations", - "Check Expiring Key Vault Items in resource group `${AZURE_RESOURCE_GROUP}`: Check for expiring secrets, certificates, and keys in Key Vaults", - "Check Key Vault Logs for Issues in resource group `${AZURE_RESOURCE_GROUP}`: Check Key Vault log issues", - "Check Key Vault Performance Metrics in resource group `${AZURE_RESOURCE_GROUP}`: Check Key Vault performance metrics for excessive requests and high latency" - ], - "readme": "# Azure Key Vault Health\nThis codebundle runs a suite of metrics checks for Key Vault in Azure. It identifies:\n- Check Key Vault Availability\n- Check Key Vault Configuration\n- Check Expiring Key Vault Items (Keys, Secrets and Certificates)\n- Check Key Vault Logs for Issues\n- Check Key Vault Performance Metrics\n\n## Configuration\n\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `AZ_USERNAME`: Service principal's client ID\n- `AZ_SECRET_VALUE`: The credential secret value from the app registration\n- `AZ_TENANT`: The Azure tenancy ID\n- `AZ_SUBSCRIPTION`: The Azure subscription ID\n\n## Testing \nSee the .test directory for infrastructure test code. \n\n## Notes\n\nThis codebundle assumes the service principal authentication flow.", - "libraries": [ - "DateTime", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/azure-kv-health" - }, - { - "slug": "rw-cli-codecollection-k8s-jaeger-http-query", - "collection_slug": "rw-cli-codecollection", - "name": "k8s-jaeger-http-query", - "display_name": "k8s-jaeger-http-query", - "description": "This codebundle is used for searching in a Jaeger instance for trace data that indicates issues with services.", - "platform": "Kubernetes", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Query Traces in Jaeger for Unhealthy HTTP Response Codes in Namespace `${NAMESPACE}`" - ], - "capabilities": [ - "Query Traces in Jaeger for Unhealthy HTTP Response Codes in Namespace `${NAMESPACE}`: Query Jaeger for all services and report on any HTTP related trace errors" - ], - "readme": "# Kubernetes Jaeger HTTP Query\nThis codebundle is used for searching in a Jaeger instance for trace data that indicates issues with services.\n\n## Tasks\n\n`Query Traces in Jaeger instance for Unhealthy HTTP Response Codes in Namespace` \nLocates the Jaeger query service in the configured namespace, port-forwards the service, and queries for all traces within the LOOKBACK period (5m by default) for every available service. Then processes the results and generates issues and next steps for non-200 HTTP response codes. \n\n\n## Configuration\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `kubeconfig`: The kubeconfig secret containing access info for the cluster.\n- `KUBERNETES_DISTRIBUTION_BINARY`: Which binary to use for Kubernetes CLI commands.
Default value is `kubectl`.\n- `CONTEXT`: The Kubernetes context to operate within.\n- `NAMESPACE`: The name of the namespace to search. Leave it blank to search in all namespaces.\n- `SERVICE_EXCLUSIONS`: Optional. Services in Jaeger to ignore during trace analysis.\n- `LOOKBACK`: Optional. The age of traces to include in the query. Defaults to 5m (5 minutes).\n\n## Requirements\n- A kubeconfig with appropriate RBAC permissions to perform the desired command.\n\n## TODO\n- [ ] Consider additional tasks\n\n", - "libraries": [ - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform" - ], - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/codebundles/k8s-jaeger-http-query" - }, - { - "slug": "rw-generic-codecollection-azure-cosmosdb-query", - "collection_slug": "rw-generic-codecollection", - "name": "azure-cosmosdb-query", - "display_name": "azure-cosmosdb-query", - "description": "A generic codebundle for executing user-provided SQL queries against Azure Cosmos DB using the Python SDK. Users configure their own queries, database, and container names.", - "platform": "Azure", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "${TASK_TITLE}", - "${TASK_TITLE}" - ], - "capabilities": [ - "${TASK_TITLE}: Executes a user-provided Cosmos DB SQL query and pushes the count of results as a metric.", - "${TASK_TITLE}: Executes a user-provided Cosmos DB SQL query and adds the results to the report." - ], - "readme": "# Azure Cosmos DB Query Generic\nA generic codebundle for executing user-provided SQL queries against Azure Cosmos DB using the Python SDK. Users configure their own queries, database, and container names.\n\n## TaskSet\nExecutes a user-provided Cosmos DB SQL query and adds the results to the report.\n\nExample: Query for error documents\n```sql\nSELECT * FROM c WHERE c.status = 'error' ORDER BY c._ts DESC\n```\n\n## SLI\nExecutes a user-provided Cosmos DB SQL query and pushes the count of results as a metric.\n\nExample: Count error documents\n```sql\nSELECT COUNT(1) FROM c WHERE c.status = 'error'\n```\n\n## Requirements\n\n### Required Variables\n- **COSMOSDB_ENDPOINT** (user variable): The Cosmos DB account endpoint URL (e.g., `https://myaccount.documents.azure.com:443/`)\n- **DATABASE_NAME** (user variable): The name of the Cosmos DB database\n- **CONTAINER_NAME** (user variable): The name of the Cosmos DB container\n- **COSMOSDB_QUERY** (user variable): The SQL query to execute\n- **QUERY_PARAMETERS** (user variable, optional): JSON string of query parameters for parameterized queries\n- **TASK_TITLE** (user variable, optional): Custom name for the task\n\n### Authentication (choose one approach)\n**Option 1: Service Principal with Data Plane RBAC** (most secure)\n- **azure_credentials** (secret): Service principal with AZURE_CLIENT_ID, AZURE_TENANT_ID, AZURE_CLIENT_SECRET\n- Requires: Data plane RBAC role granted (see setup below)\n\n**Option 2: Service Principal with Control Plane Key Retrieval** (easiest for admins)\n- **azure_credentials** (secret): Service principal with AZURE_CLIENT_ID, AZURE_TENANT_ID, AZURE_CLIENT_SECRET\n- **AZURE_SUBSCRIPTION_ID** (user variable): Your Azure subscription ID\n- **AZURE_RESOURCE_GROUP** (user variable): Resource group containing Cosmos DB\n- **COSMOSDB_ACCOUNT_NAME** (user variable): Cosmos DB account name\n- Requires: \"Cosmos DB Account Reader\" or \"Contributor\" role (visible in Azure Portal)\n\n**Option 3: Direct Key** (simplest, less secure)\n- **cosmosdb_key** (secret): The Cosmos DB
account primary or secondary key\n\n## Authentication\n\nThis codebundle supports **three authentication methods** with intelligent automatic fallback:\n\n### Method 1: Service Principal with Data Plane RBAC (Most Secure) \u2b50\nDirect data access using Azure AD identity. **No keys involved.**\n\n**Setup:**\n```bash\n# Grant data plane RBAC permissions (via CLI only)\naz cosmosdb sql role assignment create \\\n --account-name <account-name> \\\n --resource-group <resource-group> \\\n --scope \"/\" \\\n --principal-id <service-principal-object-id> \\\n --role-definition-id 00000000-0000-0000-0000-000000000001\n```\n\n**Required:**\n- `azure_credentials` secret (AZURE_CLIENT_ID, AZURE_TENANT_ID, AZURE_CLIENT_SECRET)\n\n**Note:** Data plane roles are NOT visible in Azure Portal IAM - they're managed separately via CLI.\n\n---\n\n### Method 2: Service Principal with Key Retrieval (Easiest for Admins) \u2b50\u2b50\nService principal retrieves the key from Azure control plane, then connects with that key.\n\n**Setup:**\n```bash\n# Grant control plane permission (visible in Azure Portal)\naz role assignment create \\\n --assignee <service-principal-app-id> \\\n --role \"Cosmos DB Account Reader\" \\\n --scope /subscriptions/<subscription-id>/resourceGroups/<resource-group>/providers/Microsoft.DocumentDB/databaseAccounts/<account-name>\n```\n\n**Required:**\n- `azure_credentials` secret (AZURE_CLIENT_ID, AZURE_TENANT_ID, AZURE_CLIENT_SECRET)\n- `AZURE_SUBSCRIPTION_ID` user variable\n- `AZURE_RESOURCE_GROUP` user variable \n- `COSMOSDB_ACCOUNT_NAME` user variable\n\n**Advantages:**\n- Uses familiar Azure Portal RBAC (Contributor, Cosmos DB Account Reader, etc.)\n- No need to configure data plane RBAC\n- One credential for both control plane and data access\n\n---\n\n### Method 3: Direct Key (Simplest, Less Secure)\nDirectly use the Cosmos DB account key.\n\n**Setup:**\n```bash\n# Get the key\naz cosmosdb keys list --name <account-name> --resource-group <resource-group> --query primaryMasterKey -o tsv\n```\n\n**Required:**\n- `cosmosdb_key` secret\n\n---\n\n### Automatic Fallback Logic\nThe codebundle tries methods in this order:\n1. Try Method 1 (data plane RBAC)\n2. If fails and subscription/RG/account variables provided \u2192 Try Method 2 (retrieve key)\n3. If fails or variables not provided \u2192 Try Method 3 (direct key)\n4.
If all fail \u2192 Raise descriptive issue\n\n**You only need to configure ONE method!** The codebundle automatically determines which one to use.\n\n## Usage Examples\n\n### TaskSet: Find Error Documents\n```\nDATABASE_NAME=\"mydatabase\"\nCONTAINER_NAME=\"mycontainer\"\nCOSMOSDB_QUERY=\"SELECT * FROM c WHERE c.status = 'error' ORDER BY c._ts DESC OFFSET 0 LIMIT 100\"\nTASK_TITLE=\"Find error documents in Cosmos DB\"\n```\n\n### TaskSet: Query with Parameters\n```\nDATABASE_NAME=\"mydatabase\"\nCONTAINER_NAME=\"mycontainer\"\nCOSMOSDB_QUERY=\"SELECT * FROM c WHERE c.status = @status AND c.timestamp > @startTime\"\nQUERY_PARAMETERS='{\"@status\": \"error\", \"@startTime\": \"2024-01-01T00:00:00Z\"}'\nTASK_TITLE=\"Find errors since start time\"\n```\n\n### SLI: Count Erro", - "libraries": [ - "RW.platform", - "RW.Core", - "BuiltIn", - "RW.Azure.Cosmosdb" - ], - "git_url": "https://github.com/runwhen-contrib/rw-generic-codecollection/tree/main/codebundles/azure-cosmosdb-query" - }, - { - "slug": "rw-generic-codecollection-curl-cmd", - "collection_slug": "rw-generic-codecollection", - "name": "curl-cmd", - "display_name": "curl-cmd", - "description": "A generic codebundle used for running bare curl commands in a bash shell.", - "platform": "Unknown", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "${TASK_TITLE}", - "${TASK_TITLE}" - ], - "capabilities": [ - "${TASK_TITLE}: Runs a user provided curl command and pushes the result as a metric.", - "${TASK_TITLE}: Runs a user provided curl command and adds the output to the report." - ], - "readme": "# Curl cmd\nA generic codebundle used for running bare curl commands in a bash shell. \n\n## SLI\nThe command provided must provide a single metric that is pushed to the RunWhen Platform. \n\nExample: `curl -X POST https://postman-echo.com/post --fail --silent --show-error | jq -r '.json | length'`\n\n## TaskSet\nThe command has all output added to the report for review during a RunSession. \n\nExample: `curl -X POST https://postman-echo.com/post --fail --silent --show-error | jq -r '.json'`\n\n## Requirements\n- A curl command string of your choosing", - "libraries": [ - "RW.DynamicIssues", - "RW.platform", - "RW.Core", - "BuiltIn", - "RW.CLI", - "OperatingSystem", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-generic-codecollection/tree/main/codebundles/curl-cmd" - }, - { - "slug": "rw-generic-codecollection-k8s-kubectl-cmd", - "collection_slug": "rw-generic-codecollection", - "name": "k8s-kubectl-cmd", - "display_name": "k8s-kubectl-cmd", - "description": "A generic codebundle used for running bare kubectl commands in a bash shell.", - "platform": "Kubernetes", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "${TASK_TITLE}", - "${TASK_TITLE}" - ], - "capabilities": [ - "${TASK_TITLE}: Runs a user provided kubectl command and pushes the metric as an SLI", - "${TASK_TITLE}: Runs a user provided kubectl command and adds the output to the report." - ], - "readme": "# Kubernetes kubectl cmd\nA generic codebundle used for running bare kubectl commands in a bash shell. \n\n## SLI\nThe command provided must provide a single metric that is pushed to the RunWhen Platform. \n\nExample: `kubectl get pods -n online-boutique -o json | jq '[.items[]] | length'`\n\n## TaskSet\nThe command has all output added to the report for review during a RunSession.
\n\nExample: `kubectl describe pods -n online-boutique`\n\n## Requirements\n- A kubeconfig with appropriate RBAC permissions to perform the desired command.", - "libraries": [ - "RW.DynamicIssues", - "RW.platform", - "RW.Core", - "BuiltIn", - "RW.CLI", - "OperatingSystem", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-generic-codecollection/tree/main/codebundles/k8s-kubectl-cmd" - }, - { - "slug": "rw-generic-codecollection-git-script-cmd-json", - "collection_slug": "rw-generic-codecollection", - "name": "git-script-cmd-json", - "display_name": "git-script-cmd-json", - "description": "A generic codebundle designed for safely executing scripts with arbitrary environment variables, particularly useful for private git repository operations and script execution with secrets.", - "platform": "Unknown", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "${TASK_TITLE}", - "${TASK_TITLE}" - ], - "capabilities": [ - "${TASK_TITLE}: Executes a user-provided script/command with all configured secrets loaded as environment variables and pushes the stdout as a metric.", - "${TASK_TITLE}: Executes a user-provided script/command with all configured secrets loaded as environment variables." - ], - "readme": "# Git Script Command\nA generic codebundle designed for safely executing scripts with arbitrary environment variables, particularly useful for private git repository operations and script execution with secrets.\n\n## Features\n- **Flexible Secret Management**: Load any number of secrets as environment variables\n- **Private Git Support**: Securely handle git credentials for private repository access\n- **Script Execution**: Execute scripts with full environment context\n- **SSH Key Support**: Handle SSH keys for git operations\n- **Multi-Environment**: Support for complex deployment scenarios requiring multiple secrets\n\n## SLI\nThe SLI pushes a health metric to the RunWhen Platform: 1 for success (healthy), 0 for failure (unhealthy). The metric is based on the script's exit code, not its output.\n\nExample: `git clone https://github.com/private/repo.git /tmp/repo && /tmp/repo/scripts/health-check.sh`\n\n## TaskSet\nThe command has all output added to the report for review during a RunSession. 
\n\nExample: `git clone git@github.com:private/repo.git /tmp/repo && bash /tmp/repo/scripts/deploy.sh`\n\n## Requirements\n- **SCRIPT_COMMAND**: The script/command to execute\n- **Secrets**: Any number of secrets that will be loaded as environment variables\n- **Optional SSH_PRIVATE_KEY**: SSH private key for git operations\n- **Optional GIT_USERNAME/GIT_TOKEN**: HTTPS git credentials\n\n## Common Use Cases\n\n### Private Git Repository with SSH\n```bash\n# Set SSH_PRIVATE_KEY secret, then:\nSCRIPT_COMMAND=\"git clone git@github.com:private/repo.git /tmp/repo && bash /tmp/repo/scripts/deploy.sh\"\n```\n\n### Private Git Repository with HTTPS Token\n```bash\n# Set GIT_USERNAME and GIT_TOKEN secrets, then:\nSCRIPT_COMMAND=\"git clone https://\\$GIT_USERNAME:\\$GIT_TOKEN@github.com/private/repo.git /tmp/repo && bash /tmp/repo/scripts/deploy.sh\"\n```\n\n### Multiple Environment Secrets\n```bash\n# Set DATABASE_URL, API_KEY, SLACK_TOKEN secrets, then:\nSCRIPT_COMMAND=\"git clone https://github.com/private/repo.git /tmp/repo && bash /tmp/repo/scripts/check-services.sh\"\n```\n\n## Security Features\n- All secrets are handled securely through RunWhen's secret management\n- SSH keys are written to temporary files with appropriate permissions\n- Environment variables are isolated to the command execution context\n- No secrets are logged or exposed in reports ", - "libraries": [ - "Collections", - "RW.DynamicIssues", - "RW.platform", - "RW.Core", - "BuiltIn", - "RW.CLI", - "OperatingSystem", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-generic-codecollection/tree/main/codebundles/git-script-cmd-json" - }, - { - "slug": "rw-generic-codecollection-grafana-loki-query", - "collection_slug": "rw-generic-codecollection", - "name": "grafana-loki-query", - "display_name": "grafana-loki-query", - "description": "This CodeBundle queries Loki (via Grafana) using relative times like 30m, 2h, or 2d, which are automatically converted to nanosecond timestamps. If HEADERS is provided, '-K ./HEADERS' is appended for authentication. If POST_PROCESS is provided, the command output is piped to that command (e.g., jq).", - "platform": "Unknown", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "${TASK_TITLE}" - ], - "capabilities": [ - "${TASK_TITLE}: Builds and runs a Loki query through Grafana\u2019s proxy, allowing" - ], - "readme": "# Grafana Loki Query\nThis CodeBundle queries Loki (via Grafana) using relative times like 30m, 2h, or 2d, which are automatically converted to nanosecond timestamps. If HEADERS is provided, '-K ./HEADERS' is appended for authentication. 
If POST_PROCESS is provided, the command output is piped to that command (e.g., jq).\n\n\nexport GRAFANA_URL=https://mygrafana.company.net\nexport DATASOURCE_UID=logs-production\nexport LOKI_QUERY='{container=\"papi\"}'\nexport LOKI_LIMIT=100\nexport LOKI_START=2h\n", - "libraries": [ - "DateTime", - "RW.platform", - "RW.Core", - "BuiltIn", - "RW.CLI", - "OperatingSystem", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-generic-codecollection/tree/main/codebundles/grafana-loki-query" - }, - { - "slug": "rw-generic-codecollection-gcloud-stdout-issue", - "collection_slug": "rw-generic-codecollection", - "name": "gcloud-stdout-issue", - "display_name": "gcloud-stdout-issue", - "description": "A generic codebundle used for running a gcloud command, commonly with grep, that raises an issue when the command output is non-empty, implying that an error was found via grepping the output.", - "platform": "Unknown", - "author": "jon-funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "${TASK_TITLE}", - "${TASK_TITLE}" - ], - "capabilities": [ - "${TASK_TITLE}: Runs a user provided gcloud command and if the return string is non-empty it indicates an error was found, pushing a health score of 0, otherwise pushes a 1.", - "${TASK_TITLE}: Runs a user provided gcloud command and adds the output to the report." - ], - "readme": "# GCloud Stdout Issue Detection\nA generic codebundle used for running a gcloud command, commonly with grep, that raises an issue when the command output is non-empty, implying that an error was found via grepping the output.\n\n## TaskSet\nThe generalized user-provided command that can raise a configurable issue if the return is non-empty\n\nExample: `gcloud projects list`\n\n## SLI\nA generalized SLI that pushes a 1 when the output is empty, indicating no errors were found. Pushes a 0 (unhealthy) metric when output is produced.\n\nExample: `gcloud projects list`\n\n## Requirements\n- A GCP service account JSON for authentication", - "libraries": [ - "RW.platform", - "RW.Core", - "BuiltIn", - "RW.CLI", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-generic-codecollection/tree/main/codebundles/gcloud-stdout-issue" - }, - { - "slug": "rw-generic-codecollection-gcloud-cmd", - "collection_slug": "rw-generic-codecollection", - "name": "gcloud-cmd", - "display_name": "gcloud-cmd", - "description": "A generic codebundle used for running user provided gcloud commands in a bash shell.", - "platform": "Unknown", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "${TASK_TITLE}", - "${TASK_TITLE}" - ], - "capabilities": [ - "${TASK_TITLE}: Runs a user provided gcloud command and pushes the metric to the RunWhen Platform.", - "${TASK_TITLE}: Runs a user provided gcloud command and adds the output to the report." - ], - "readme": "# Generic gcloud command\nA generic codebundle used for running user provided gcloud commands in a bash shell. \n\n## SLI\nThe command provided must provide a single metric that is pushed to the RunWhen Platform. \n\nExample: `gcloud projects list --format=\"json\" | jq '. | length'`\n\n## TaskSet\nThe command has all output added to the report for review during a RunSession.
\n\nExample: `gcloud projects list`\n\n## Requirements\n- A GCP service account json with appropriate RBAC permissions to perform the desired command.", - "libraries": [ - "RW.DynamicIssues", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-generic-codecollection/tree/main/codebundles/gcloud-cmd" - }, - { - "slug": "rw-generic-codecollection-azure-cmd", - "collection_slug": "rw-generic-codecollection", - "name": "azure-cmd", - "display_name": "azure-cmd", - "description": "A generic codebundle used for running an azure cli command, commonly with grep, that raises an issue when the command output is non-empty, implying that an error was found via grepping the output.", - "platform": "Azure", - "author": "jon-funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "${TASK_TITLE}", - "${TASK_TITLE}" - ], - "capabilities": [ - "${TASK_TITLE}: Runs a user provided azure cli command and if the return string is non-empty it indicates an error was found, pushing a health score of 0, otherwise pushes a 1.", - "${TASK_TITLE}: Runs a user provided azure cli command and adds the output to the report." - ], - "readme": "# Azure CLI CMD Generic\nA generic codebundle used for running an azure cli command, commonly with grep, that raises an issue when the command output is non-empty, implying that an error was found via grepping the output.\n\n## TaskSet\nThe generalized user-provided command that can raise a configurable issue if the return is non-empty\n\nExample: `az monitor metrics list --resource myapp --resource-group myrg --resource-type Microsoft.Web/sites --metric \"HealthCheckStatus\" --interval 5m | jq -r '.value[].timeseries[].data[].average'`\n\n## SLI\nA generalized SLI that pushes a 1 when the output is empty, indicating no errors were found. Pushes a 0 (unhealthy) metric when output is produced.\n\nExample: `az monitor metrics list --resource myapp --resource-group myrg --resource-type Microsoft.Web/sites --metric \"HealthCheckStatus\" --interval 5m | jq -r '.value[].timeseries[].data[].average'`\n\n## Requirements\n- AZ_RESOURCE_GROUP\n- AZ_USERNAME\n- AZ_SECRET_VALUE\n- AZ_TENANT\n- AZ_SUBSCRIPTION", - "libraries": [ - "RW.DynamicIssues", - "RW.platform", - "RW.Core", - "BuiltIn", - "RW.CLI", - "OperatingSystem", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-generic-codecollection/tree/main/codebundles/azure-cmd" - }, - { - "slug": "rw-generic-codecollection-aws-stdout-issue", - "collection_slug": "rw-generic-codecollection", - "name": "aws-stdout-issue", - "display_name": "aws-stdout-issue", - "description": "A generic codebundle used for running an aws cli command, commonly with grep, that raises an issue when the command output is non-empty, implying that an error was found via grepping the output.", - "platform": "AWS", - "author": "jon-funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "${TASK_TITLE}", - "${TASK_TITLE}" - ], - "capabilities": [ - "${TASK_TITLE}: Runs a user provided aws cli command and if the return string is non-empty it indicates an error was found, pushing a health score of 0, otherwise pushes a 1.", - "${TASK_TITLE}: Runs a user provided aws cli command and if the return string is non-empty, it's added to a report and used to raise an issue." 
- ], - "readme": "# AWS CLI Stdout Issue Detection\nA generic codebundle used for running a aws cli command, commonly with grep, that raises an issue when the command output is non-empty, implying that an error was found via grepping the output.\n\n## TaskSet\nThe generalized user-provided command that can raise a configurable issue if the return is non-empty\n\nExample: `aws logs filter-log-events --log-group-name /aws/lambda/hello-world --filter-pattern \"ERROR\" | jq -r '.events[].message'`\n\n## SLI\nA generalized SLI that pushes a 1 when the output is empty, indicating no errors were found. Pushes a 0 (unhealthy) metric when output is produced.\n\nExample: `aws logs filter-log-events --log-group-name /aws/lambda/hello-world --filter-pattern \"ERROR\" | jq -r '.events[].message'`\n\n## Requirements\n- AWS_SECRET_ACCESS_KEY\n- AWS_ACCESS_KEY_ID\n- AWS_REGION", - "libraries": [ - "RW.platform", - "RW.Core", - "BuiltIn", - "RW.CLI", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-generic-codecollection/tree/main/codebundles/aws-stdout-issue" - }, - { - "slug": "rw-generic-codecollection-azure-stdout-issue", - "collection_slug": "rw-generic-codecollection", - "name": "azure-stdout-issue", - "display_name": "azure-stdout-issue", - "description": "A generic codebundle used for running a azure cli command, commonly with grep, that raises an issue when the command output is non-empty, implying that an error was found via grepping the output.", - "platform": "Azure", - "author": "jon-funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "${TASK_TITLE}", - "${TASK_TITLE}" - ], - "capabilities": [ - "${TASK_TITLE}: Runs a user provided azure cli command and if the return string is non-empty it indicates an error was found, pushing a health score of 0, otherwise pushes a 1.", - "${TASK_TITLE}: Runs a user provided azure cli command and if the return string is non-empty, it's added to a report and used to raise an issue." - ], - "readme": "# Azure CLI Stdout Issue Detection\nA generic codebundle used for running a azure cli command, commonly with grep, that raises an issue when the command output is non-empty, implying that an error was found via grepping the output.\n\n## TaskSet\nThe generalized user-provided command that can raise a configurable issue if the return is non-empty\n\nExample: `az monitor metrics list --resource myapp --resource-group myrg --resource-type Microsoft.Web/sites --metric \"HealthCheckStatus\" --interval 5m | -r '.value[].timeseries[].data[].average'`\n\n## SLI\nA generalized SLI that pushes a 1 when the output is empty, indicating no errors were found. 
Pushes a 0 (unhealthy) metric when output is produced.\n\nExample: `az monitor metrics list --resource myapp --resource-group myrg --resource-type Microsoft.Web/sites --metric \"HealthCheckStatus\" --interval 5m | jq -r '.value[].timeseries[].data[].average'`\n\n## Requirements\n- AZ_RESOURCE_GROUP\n- AZ_USERNAME\n- AZ_SECRET_VALUE\n- AZ_TENANT\n- AZ_SUBSCRIPTION", - "libraries": [ - "RW.platform", - "RW.Core", - "BuiltIn", - "RW.CLI", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-generic-codecollection/tree/main/codebundles/azure-stdout-issue" - }, - { - "slug": "rw-generic-codecollection-k8s-stdout-issue", - "collection_slug": "rw-generic-codecollection", - "name": "k8s-stdout-issue", - "display_name": "k8s-stdout-issue", - "description": "A generic codebundle used for running a kubectl command, commonly with grep, that raises an issue when the command output is non-empty, implying that an error was found via grepping the output.", - "platform": "Kubernetes", - "author": "jon-funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "${TASK_TITLE}", - "${TASK_TITLE}" - ], - "capabilities": [ - "${TASK_TITLE}: Runs a user provided kubectl command and if the return string is non-empty it indicates an error was found, pushing a health score of 0, otherwise pushes a 1.", - "${TASK_TITLE}: Runs a user provided kubectl command and if the return string is non-empty, it's added to a report and used to raise an issue." - ], - "readme": "# K8s Stdout Issue Detection\nA generic codebundle used for running a kubectl command, commonly with grep, that raises an issue when the command output is non-empty, implying that an error was found via grepping the output.\n\n## TaskSet\nThe generalized user-provided command that can raise a configurable issue if the return is non-empty\n\nExample: `kubectl get events | grep -i warning`\n\n## SLI\nA generalized SLI that pushes a 1 when the output is empty, indicating no errors were found. Pushes a 0 (unhealthy) metric when output is produced.\n\nExample: `kubectl get events | grep -i warning`\n\n## Requirements\n- A kubeconfig for authentication", - "libraries": [ - "RW.platform", - "RW.Core", - "BuiltIn", - "RW.CLI", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-generic-codecollection/tree/main/codebundles/k8s-stdout-issue" - }, - { - "slug": "rw-generic-codecollection-aws-cmd", - "collection_slug": "rw-generic-codecollection", - "name": "aws-cmd", - "display_name": "aws-cmd", - "description": "A generic codebundle used for running an aws cli command and adding the output to a report", - "platform": "AWS", - "author": "jon-funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "${TASK_TITLE}", - "${TASK_TITLE}" - ], - "capabilities": [ - "${TASK_TITLE}: Runs a user provided aws cli command and if the return string is non-empty it indicates an error was found, pushing a health score of 0, otherwise pushes a 1.", - "${TASK_TITLE}: Runs a user provided aws cli command and adds the output to the report." - ], - "readme": "# AWS CMD Generic\nA generic codebundle used for running an aws cli command and adding the output to a report\n\n## TaskSet\nThe generalized user-provided command that can raise a configurable issue if the return is non-empty\n\nExample: `aws logs filter-log-events --log-group-name /aws/lambda/hello-world --filter-pattern \"ERROR\" | jq -r '.events[].message'`\n\n## SLI\nA generalized SLI that pushes a 1 when the output is empty, indicating no errors were found. 
Pushes a 0 (unhealthy) metric when output is produced.\n\nExample: `aws logs filter-log-events --log-group-name /aws/lambda/hello-world --filter-pattern \"ERROR\" | jq -r '.events[].message'`\n\n## Requirements\n- AWS_SECRET_ACCESS_KEY\n- AWS_ACCESS_KEY_ID\n- AWS_REGION", - "libraries": [ - "RW.DynamicIssues", - "RW.platform", - "RW.Core", - "BuiltIn", - "RW.CLI", - "OperatingSystem", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-generic-codecollection/tree/main/codebundles/aws-cmd" - }, - { - "slug": "rw-generic-codecollection-azure-cosmosdb-query-issue", - "collection_slug": "rw-generic-codecollection", - "name": "azure-cosmosdb-query-issue", - "display_name": "azure-cosmosdb-query-issue", - "description": "A generic codebundle for executing user-provided SQL queries against Azure Cosmos DB and raising issues when results are found. Users configure queries that filter for error/problem conditions.", - "platform": "Azure", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "${TASK_TITLE}", - "${TASK_TITLE}" - ], - "capabilities": [ - "${TASK_TITLE}: Executes a user-provided Cosmos DB SQL query and pushes 0 if results are found (unhealthy), 1 if no results (healthy).", - "${TASK_TITLE}: Executes a user-provided Cosmos DB SQL query and if results are returned, raises an issue." - ], - "readme": "# Azure Cosmos DB Query with Issue Detection\nA generic codebundle for executing user-provided SQL queries against Azure Cosmos DB and raising issues when results are found. Users configure queries that filter for error/problem conditions.\n\n## TaskSet\nExecutes a user-provided Cosmos DB SQL query and raises an issue if results are returned (indicating problems were found).\n\n## SLI\nExecutes a user-provided Cosmos DB SQL query and pushes a health metric: 1 (healthy) if no results, 0 (unhealthy) if results are found.\n\n## Requirements\n- **COSMOSDB_ENDPOINT** (user variable): The Cosmos DB account endpoint URL (e.g., `https://myaccount.documents.azure.com:443/`)\n- **Authentication** (secret, one of):\n - **azure_credentials** (recommended): Service principal credentials with AZURE_CLIENT_ID, AZURE_TENANT_ID, AZURE_CLIENT_SECRET\n - **cosmosdb_key** (fallback): The Cosmos DB account primary or secondary key\n- **DATABASE_NAME** (user variable): The name of the Cosmos DB database\n- **CONTAINER_NAME** (user variable): The name of the Cosmos DB container\n- **COSMOSDB_QUERY** (user variable): The SQL query to execute\n- **QUERY_PARAMETERS** (user variable, optional): JSON string of query parameters\n- **TASK_TITLE** (user variable, optional): Custom name for the task\n- **ISSUE_TITLE** (user variable, optional): Title for the issue if raised\n- **ISSUE_SEVERITY** (user variable, optional): Severity level 1-4 (default: 3)\n- **ISSUE_NEXT_STEPS** (user variable, optional): Next steps guidance\n- **ISSUE_DETAILS** (user variable, optional): Issue details\n- **ISSUE_ON** (user variable, optional): When to raise issue - see conditions below (default: \"results_found\")\n- **ISSUE_THRESHOLD** (user variable, optional): Numeric threshold for count_above/count_below (default: 0)\n\n## Authentication\nThis codebundle supports two authentication methods with automatic fallback:\n1. **Azure AD / Service Principal** (recommended) - Uses `azure_credentials` secret\n2. 
**Key-based authentication** (fallback) - Uses `cosmosdb_key` secret\n\nThe codebundle will automatically try service principal authentication first, and if that's not available, it will fall back to key-based authentication. You only need to configure one method.\n\n### Service Principal Setup (Azure AD Authentication)\nFor service principal authentication, you need **Cosmos DB Data Plane RBAC** permissions (not Azure ARM control plane roles):\n\n**Required Role:** `Cosmos DB Built-in Data Reader` (Role ID: `00000000-0000-0000-0000-000000000001`)\n\n```bash\n# Grant data plane RBAC permissions\naz cosmosdb sql role assignment create \\\n --account-name <account-name> \\\n --resource-group <resource-group> \\\n --scope \"/\" \\\n --principal-id <service-principal-object-id> \\\n --role-definition-id 00000000-0000-0000-0000-000000000001\n```\n\n**Note:** These are **data plane** roles for accessing Cosmos DB data, not the Azure ARM control plane roles you see in the Azure Portal (like \"Cosmos DB Account Reader\" or \"Cosmos DB Operator\"). Data plane roles are managed separately via the Azure CLI.\n\n## Issue Conditions\n\nThis codebundle supports **flexible issue detection** using the `ISSUE_ON` parameter:\n\n### `results_found` (default)\nRaise issue when query returns any results.\n- **Use case:** Error detection - looking for problems\n- **Healthy:** No results (SLI = 1)\n- **Unhealthy:** Results found (SLI = 0, issue raised)\n\n### `no_results`\nRaise issue when query returns NO results.\n- **Use case:** Validation - expecting data to exist\n- **Healthy:** Results found (SLI = 1)\n- **Unhealthy:** No results (SLI = 0, issue raised)\n\n### `count_above`\nRaise issue when result count exceeds a threshold.\n- **Use case:** Volume monitoring - too many items\n- **Requires:** `ISSUE_THRESHOLD` (e.g., 100)\n- **Healthy:** Count \u2264 threshold (SLI = 1)\n- **Unhealthy:** Count > threshold (SLI = 0, issue raised)\n\n### `count_below`\nRaise issue when result count is below a threshold.\n- **Use case:** Minimum requirements - too few items\n- **Requires:** `ISSUE_THRESHOLD` (e.g., 10)\n- **Healthy:** Count \u2265 threshold (SLI = 1)\n- **Unhealthy:** Count < threshold (SLI = 0, issue raised)\n\n## Usage Examples\n\n### Example 1: Detect Error Documents (results_found - default)\n```bash\nDATABASE_NAME=\"mydatabase\"\nCONTAINER_NAME=\"mycontainer\"\nCOSMOSDB_QUERY=\"SELECT * FROM c WHERE c.status = 'error' ORDER BY c._ts DESC\"\nISSUE_ON=\"results_found\" # Default - can be omitted\nTASK_TITLE=\"Detect error documents\"\nISSUE_TITLE=\"Found error documents in Cosmos DB\"\nISSUE_SEVERITY=2\n```\n**Behavior:** Issue raised if ANY errors are found\n\n---\n\n### Example 2: Validate Expected Data Exists (no_results)\n```bash\nDATABASE_NAME=\"inventory\"\nCONTAINER_NAME=\"products\"\nCOSMOSDB_QUERY=\"SELECT * FROM c WHERE c.category = 'featured' AND c.inStock = true\"\nISSUE_ON=\"no_results\"\nTASK_TITLE=\"Validate featured products exist\"\nISSUE_TITLE=\"No featured products in stock\"\nISSUE_SEVERITY=2\nISSUE_NEXT_STEPS=\"Add featured products to inventory\"\n```\n**Behavior:** Issue raised if NO fea", - "libraries": [ - "RW.platform", - "RW.Core", - "BuiltIn", - "RW.Azure.Cosmosdb" - ], - "git_url": "https://github.com/runwhen-contrib/rw-generic-codecollection/tree/main/codebundles/azure-cosmosdb-query-issue" - }, - { - "slug": "rw-generic-codecollection-curl-stdout-issue", - "collection_slug": "rw-generic-codecollection", - "name": "curl-stdout-issue", - "display_name": "curl-stdout-issue", - "description": "A generic codebundle used for running a curl 
command, commonly with grep, that raises an issue when the command output is non-empty, implying that an error was found via grepping the output.", - "platform": "Unknown", - "author": "jon-funk", - "support_tags": [ - "rw" - ], - "tasks": [ - "${TASK_TITLE}", - "${TASK_TITLE}" - ], - "capabilities": [ - "${TASK_TITLE}: Runs a user provided curl command and if the return string is non-empty it indicates an error was found, pushing a health score of 0, otherwise pushes a 1.", - "${TASK_TITLE}: Runs a user provided curl command and if the return string is non-empty, it's added to a report and used to raise an issue." - ], - "readme": "# Curl Stdout Issue Detection\nA generic codebundle used for running a curl command, commonly with grep, that raises an issue when the command output is non-empty, implying that an error was found via grepping the output.\n\n## TaskSet\nThe generalized user-provided command that can raise a configurable issue if the return is non-empty\n\nExample: `curl -X POST https://postman-echo.com/post --fail --silent --show-error | jq -r '.json'`\n", - "libraries": [ - "RW.platform", - "RW.Core", - "BuiltIn", - "RW.CLI", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-generic-codecollection/tree/main/codebundles/curl-stdout-issue" - }, - { - "slug": "rw-generic-codecollection-curl-headers-stdout-issue", - "collection_slug": "rw-generic-codecollection", - "name": "curl-headers-stdout-issue", - "display_name": "curl-headers-stdout-issue", - "description": "A generic codebundle used for running a curl command, commonly with grep, that raises an issue when the command output is non-empty, implying that an error was found via grepping the output.", - "platform": "Unknown", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "${TASK_TITLE}", - "${TASK_TITLE}" - ], - "capabilities": [ - "${TASK_TITLE}: Runs a user-provided cURL command; whatever is returned in stdout is pushed as the metric.", - "${TASK_TITLE}: Runs a user-provided cURL command. If any headers are provided, appends `-K ./HEADERS`." - ], - "readme": "# Curl Stdout Issue Detection\nA generic codebundle used for running a curl command, commonly with grep, that raises an issue when the command output is non-empty, implying that an error was found via grepping the output.\n\n## TaskSet\nThe generalized user-provided command that can raise a configurable issue if the return is non-empty\n\nExample: `curl -X POST https://postman-echo.com/post --fail --silent --show-error | jq -r '.json'`\n", - "libraries": [ - "RW.platform", - "RW.Core", - "BuiltIn", - "RW.CLI", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-generic-codecollection/tree/main/codebundles/curl-headers-stdout-issue" - }, - { - "slug": "rw-generic-codecollection-git-script-cmd-env", - "collection_slug": "rw-generic-codecollection", - "name": "git-script-cmd-env", - "display_name": "git-script-cmd-env", - "description": "A generic codebundle designed for safely executing scripts with configurable environment variables from secrets. 
This provides the most flexible and secure approach for loading arbitrary secrets as environment variables.", - "platform": "Unknown", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "${TASK_TITLE}", - "${TASK_TITLE}" - ], - "capabilities": [ - "${TASK_TITLE}: Executes a user-provided script/command with up to 10 individually configured environment variables from secrets and pushes the stdout as a metric.", - "${TASK_TITLE}: Executes a user-provided script/command with up to 10 individually configured environment variables from secrets." - ], - "readme": "# Environment Script Command\nA generic codebundle designed for safely executing scripts with configurable environment variables from secrets. This provides the most flexible and secure approach for loading arbitrary secrets as environment variables.\n\n## Features\n- **Maximum Flexibility**: Load any number of secrets as individual environment variables\n- **Private Git Support**: Secure git operations with SSH keys or tokens\n- **Script Execution**: Execute any script with full environment context\n- **Individual Secret Control**: Each secret is configured separately for maximum security\n- **No JSON Parsing**: Avoids complexity and security issues of JSON parsing\n\n## SLI\nThe SLI pushes a health metric to the RunWhen Platform: 1 for success (healthy), 0 for failure (unhealthy). The metric is based on the script's exit code, not its output.\n\nExample: `git clone https://$GIT_TOKEN@github.com/private/repo.git ./repo && bash ./repo/scripts/health-check.sh`\n\n## TaskSet\nThe command has all output added to the report for review during a RunSession. \n\nExample: `git clone git@github.com:private/repo.git /tmp/repo && bash /tmp/repo/scripts/deploy.sh`\n\n## Requirements\n- **SCRIPT_COMMAND**: The script/command to execute\n- **Individual Secrets**: Configure each secret separately (ENV_VAR_1, ENV_VAR_2, etc.)\n- **Optional SSH_PRIVATE_KEY**: SSH private key for git operations\n- **Optional kubeconfig**: Kubernetes config file for cluster access\n\n## Configuration Approach\n\nThis codebundle uses individual secret imports rather than JSON parsing, making it:\n- **Safer**: No risk of JSON injection or parsing errors\n- **More Flexible**: Each secret can have its own description and validation\n- **Easier to Configure**: Clear separation of each environment variable\n- **More Secure**: Each secret is handled individually by RunWhen's secret management\n\n## Usage Examples\n\n### Basic Usage with Multiple Environment Variables\n```bash\n# Configure secrets individually:\n# - ENV_VAR_DATABASE_URL (secret)\n# - ENV_VAR_API_KEY (secret)\n# - ENV_VAR_SLACK_WEBHOOK (secret)\n\nSCRIPT_COMMAND=\"echo 'Database: $DATABASE_URL' && echo 'API Key configured: ${API_KEY:0:8}...' 
&& curl -X POST $SLACK_WEBHOOK\"\n```\n\n### Private Git Repository with SSH\n```bash\n# Configure SSH_PRIVATE_KEY secret, then:\nSCRIPT_COMMAND=\"git clone git@github.com:private/repo.git ./repo && bash ./repo/scripts/deploy.sh\"\n```\n\n### Private Git Repository with Token\n```bash\n# Configure ENV_VAR_GIT_TOKEN secret, then:\nSCRIPT_COMMAND=\"git clone https://$GIT_TOKEN@github.com/private/repo.git ./repo && bash ./repo/scripts/deploy.sh\"\n```\n\n### Private Git + Kubernetes Operations\n```bash\n# Configure secrets:\n# - SSH_PRIVATE_KEY (secret) - for git access\n# - kubeconfig (secret) - for K8s access\n# - ENV_VAR_1_NAME = \"NAMESPACE\" \n# - ENV_VAR_1_VALUE = \"production\" (secret)\n\nSCRIPT_COMMAND=\"git clone git@github.com:private/k8s-repo.git ./repo && kubectl apply -f ./repo/manifests/ -n $NAMESPACE\"\n```\n\n## Security Features\n- Individual secret management through RunWhen's secure secret system\n- SSH keys handled automatically by RunWhen platform, permissions fixed to 600\n- Uses `GIT_SSH_COMMAND` to specify SSH key location for git operations\n- Works in shared runner environments without requiring home directory access\n- Creates local known_hosts file for GitHub SSH verification\n- Environment variables isolated to command execution context\n- No secret values logged or exposed in reports\n- Each secret can be individually validated and managed ", - "libraries": [ - "Collections", - "RW.DynamicIssues", - "RW.platform", - "RW.Core", - "BuiltIn", - "RW.CLI", - "OperatingSystem", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-generic-codecollection/tree/main/codebundles/git-script-cmd-env" - }, - { - "slug": "rw-generic-codecollection-curl-headers-cmd", - "collection_slug": "rw-generic-codecollection", - "name": "curl-headers-cmd", - "display_name": "curl-headers-cmd", - "description": "A generic codebundle used for running bare curl commands in a bash shell.", - "platform": "Unknown", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "${TASK_TITLE}", - "${TASK_TITLE}" - ], - "capabilities": [ - "${TASK_TITLE}: Runs a user-provided cURL command. If HEADERS is set, applies -K ./HEADERS.", - "${TASK_TITLE}: Runs a user-provided cURL command, optionally includes headers (-K ./HEADERS), optionally pipes output to POST_PROCESS, and adds the outputs to the report." - ], - "readme": "# Curl cmd\nA generic codebundle used for running bare curl commands in a bash shell. \n\n## SLI\nThe command provided must provide a single metric that is pushed to the RunWhen Platform. \n\nExample: `curl -X POST https://postman-echo.com/post --fail --silent --show-error | jq -r '.json | length'`\n\n## TaskSet\nThe command has all output added to the report for review during a RunSession. 
\n\nExample: `curl -X POST https://postman-echo.com/post --fail --silent --show-error | jq -r '.json'`\n\n## Requirements\n- A curl command string of your choosing", - "libraries": [ - "RW.DynamicIssues", - "RW.platform", - "RW.Core", - "BuiltIn", - "RW.CLI", - "OperatingSystem", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/rw-generic-codecollection/tree/main/codebundles/curl-headers-cmd" - }, - { - "slug": "rw-workspace-utils-dynatrace-webbook-handler", - "collection_slug": "rw-workspace-utils", - "name": "dynatrace-webbook-handler", - "display_name": "dynatrace-webbook-handler", - "description": "", - "platform": "Unknown", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Start RunSession From Dynatrace Webhook Details" - ], - "capabilities": [ - "Start RunSession From Dynatrace Webhook Details: Parse webhook \u279c match SLXs \u279c search tasks \u279c (optionally) new RunSession" - ], - "readme": "", - "libraries": [ - "RW.Dynatrace", - "Collections", - "RW.platform", - "RW.Core", - "BuiltIn", - "RW.CLI", - "OperatingSystem", - "RW.RunSession", - "RW.Workspace" - ], - "git_url": "https://github.com/runwhen-contrib/rw-workspace-utils/tree/main/codebundles/dynatrace-webbook-handler" - }, - { - "slug": "rw-workspace-utils-azure-monitor-webhook-handler", - "collection_slug": "rw-workspace-utils", - "name": "azure-monitor-webhook-handler", - "display_name": "azure-monitor-webhook-handler", - "description": "", - "platform": "Azure", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Start RunSession From Azure Monitor Webhook Details" - ], - "capabilities": [ - "Start RunSession From Azure Monitor Webhook Details: Parse the Azure Monitor webhook and route to an SLX with matching SLX tags" - ], - "readme": "", - "libraries": [ - "RW.Azure", - "Collections", - "RW.platform", - "RW.Core", - "BuiltIn", - "RW.CLI", - "OperatingSystem", - "RW.RunSession", - "String", - "RW.Workspace" - ], - "git_url": "https://github.com/runwhen-contrib/rw-workspace-utils/tree/main/codebundles/azure-monitor-webhook-handler" - }, - { - "slug": "rw-workspace-utils-pagerduty-webhook-handler", - "collection_slug": "rw-workspace-utils", - "name": "pagerduty-webhook-handler", - "display_name": "pagerduty-webhook-handler", - "description": "", - "platform": "Unknown", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Run SLX Tasks with matching PagerDuty Webhook Service ID" - ], - "capabilities": [ - "Run SLX Tasks with matching PagerDuty Webhook Service ID: Parse the webhook details and route to the right SLX" - ], - "readme": "", - "libraries": [ - "RW.platform", - "RW.Core", - "BuiltIn", - "RW.CLI", - "OperatingSystem", - "RW.PagerDuty", - "RW.Workspace" - ], - "git_url": "https://github.com/runwhen-contrib/rw-workspace-utils/tree/main/codebundles/pagerduty-webhook-handler" - }, - { - "slug": "rw-workspace-utils-slack-post-message-from-runsession", - "collection_slug": "rw-workspace-utils", - "name": "slack-post-message-from-runsession", - "display_name": "slack-post-message-from-runsession", - "description": "This CodeBundle sends a slack message with a summary of users, open issues, and related resources from a RunSession.", - "platform": "Unknown", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Send Slack Notification to Channel `${SLACK_CHANNEL}` from RunSession" - ], - "capabilities": [ - "Send Slack Notification to Channel `${SLACK_CHANNEL}` from RunSession: Sends a Slack message 
containing the summarized details of the RunSession." - ], - "readme": "# Slack Post Message from RunSession\nThis CodeBundle sends a slack message with a summary of users, open issues, and related resources from a RunSession. \n\nNote> This CodeBundle will inspect the RunSession notes field for indications of being triggered from another runsession (called the Related RunSession). \nIf this data is found, the related runsession details are what's used. ", - "libraries": [ - "RW.platform", - "RW.Core", - "BuiltIn", - "RW.CLI", - "OperatingSystem", - "RW.RunSession", - "RW.Slack", - "RW.Workspace" - ], - "git_url": "https://github.com/runwhen-contrib/rw-workspace-utils/tree/main/codebundles/slack-post-message-from-runsession" - }, - { - "slug": "rw-workspace-utils-azure-rw-acr-sync", - "collection_slug": "rw-workspace-utils", - "name": "azure-rw-acr-sync", - "display_name": "azure-rw-acr-sync", - "description": "This codebundle synchronizes upstream RunWhen images (CodeCollection and RunWhen Local components) into a private Azure Container Registry. It is intended to be paired with azure-rw-acr-helm-update.", - "platform": "Azure", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check for CodeCollection Updates against ACR Registry `${REGISTRY_NAME}`", - "Check for RunWhen Local Image Updates against ACR Registry `${REGISTRY_NAME}`", - "Count Images Needing Update and Push Metric", - "Sync CodeCollection Images to ACR Registry `${REGISTRY_NAME}`", - "Sync RunWhen Local Image Updates to ACR Registry `${REGISTRY_NAME}`" - ], - "capabilities": [ - "Check for CodeCollection Updates against ACR Registry `${REGISTRY_NAME}`: Count the number of CodeCollection image updates that need to be synced internally to the private registry.", - "Check for RunWhen Local Image Updates against ACR Registry `${REGISTRY_NAME}`: Count the number of RunWhen Local image updates that need to be synced internally to the private registry.", - "Sync CodeCollection Images to ACR Registry `${REGISTRY_NAME}`: Sync CodeCollection image updates that need to be synced internally to the private registry.", - "Sync RunWhen Local Image Updates to ACR Registry `${REGISTRY_NAME}`: Sync RunWhen Local image updates that need to be synced internally to the private registry." - ], - "readme": "# RunWhen Platform Azure ACR Image Sync\n\nThis codebundle synchronizes upstream RunWhen images (CodeCollection and RunWhen Local components) into a private Azure Container Registry. 
It is intended to be paired with azure-rw-acr-helm-update.\n\n## Purpose\n\n- **azure-rw-acr-sync** (**This CodeBundle**) - Synchronizes upstream RunWhen images into Azure Container Registry when updates are available\n- azure-rw-acr-helm-update (Not this CodeBundle) - Compares running Helm releases to available images in ACR and applies helm upgrades\n\n## Image Categories\n\nThis codebundle handles two categories of images with different tagging strategies:\n\n### CodeCollection Images\nCodeCollection images use a REF-based tagging strategy with digest verification:\n- Source: `us-west1-docker.pkg.dev/runwhen-nonprod-beta/public-images/`\n- Tags follow pattern: `${REF}-${HASH}` (e.g., `main-abc1234`)\n- Uses digest (SHA256) comparison to find correct tags\n- Excludes architecture-prefixed tags (e.g., `amd64-*`, `arm64-*`)\n- Syncs stable commit-based tags instead of `-latest` tags\n\n**Images:**\n- `runwhen-contrib-rw-cli-codecollection`\n- `runwhen-contrib-rw-public-codecollection`\n- `runwhen-contrib-rw-generic-codecollection`\n- `runwhen-contrib-rw-workspace-utils`\n- `runwhen-contrib-azure-c7n-codecollection`\n\n### RunWhen Local Images\nRunWhen Local images use simpler tagging:\n- Standard versioning or date-based tags\n- Latest tag selection via version sort\n- Supports optional date-based tagging with `USE_DATE_TAG=true`\n\n**Images:**\n- `runwhen-local`\n- `opentelemetry-collector`\n- `runner`\n\n## Configuration\n\n### Required Variables\n\n- **REGISTRY_NAME** - The ACR registry name (e.g., `myacr.azurecr.io` or `myacr`)\n- **REGISTRY_REPOSITORY_PATH** - Root path in ACR for image storage (e.g., `runwhen`)\n- **AZURE_RESOURCE_SUBSCRIPTION_ID** - Azure Subscription ID (auto-detected if not set)\n\n### Optional Variables\n\n- **SYNC_IMAGES** - Set to `true` to sync images; `false` generates report only (default: `false` for SLI, `true` for Taskset)\n- **REF** - Git reference (branch) for codecollection image tagging (default: `main`)\n- **USE_DATE_TAG** - Set to `true` to generate unique date-based tags for `latest` images (default: `false`)\n- **USE_DOCKER_AUTH** - Set to `true` to import Docker Hub credentials to bypass rate limits (default: `false`)\n\n### Required Secrets\n\n- **azure_credentials** - Contains AZURE_CLIENT_ID, AZURE_TENANT_ID, AZURE_CLIENT_SECRET, AZURE_SUBSCRIPTION_ID\n\n### Optional Secrets (if USE_DOCKER_AUTH=true)\n\n- **DOCKER_USERNAME** - Docker Hub username\n- **DOCKER_TOKEN** - Docker Hub token/password\n\n## How It Works\n\n### CodeCollection Sync Process\n\n1. Fetches all tags from source repository\n2. Looks for `${REF}-latest` tag (e.g., `main-latest`)\n3. Extracts the image digest (SHA256) for the specified architecture\n4. Finds sibling tags matching `${REF}-[0-9a-f]{7,}` with the same digest\n5. Syncs the stable hash-based tag to ACR (not the `-latest` tag)\n6. Verifies tag doesn't already exist before importing\n\n### RunWhen Local Sync Process\n\n1. Renders the latest RunWhen Local Helm chart\n2. Extracts image references from rendered manifests\n3. Compares creation dates between upstream and ACR images\n4. Imports newer images to ACR\n5. 
Optionally replaces `latest` tags with date-based tags\n\n## Usage\n\n### SLI (Service Level Indicator)\n\nThe SLI checks for available updates and pushes a metric with the total count of images needing sync:\n- Runs on a schedule (e.g., every hour)\n- Reports number of outdated images\n- Does NOT sync by default (set `SYNC_IMAGES=true` to enable)\n\n### Taskset (On-Demand Sync)\n\nThe Taskset performs the actual sync operation:\n- Runs on-demand or on schedule\n- Syncs images to ACR (default `SYNC_IMAGES=true`)\n- Adds detailed output to report\n\n## Examples\n\n### Check for Updates (Dry Run)\n```bash\nSYNC_IMAGES=false\nREF=main\n```\n\n### Sync Images from Main Branch\n```bash\nSYNC_IMAGES=true\nREF=main\nREGISTRY_NAME=myacr.azurecr.io\nREGISTRY_REPOSITORY_PATH=runwhen\n```\n\n### Sync Images from Dev Branch\n```bash\nSYNC_IMAGES=true\nREF=dev\nREGISTRY_NAME=myacr.azurecr.io\nREGISTRY_REPOSITORY_PATH=runwhen\n```\n\n### Use Date-Based Tags\n```bash\nSYNC_IMAGES=true\nUSE_DATE_TAG=true\n```\n\n## Workspace Configuration\n\nAdd to your workspaceInfo.yaml:\n\n```yaml\ncustom: \n private_registry: azure_acr\n azure_acr_registry: [ACR registry Name]\n azure_service_principal_secret_name: azure-sp\n```\n\n## Output\n\n### SLI Output\n- Metric: Total number of images requiring update\n- JSON files:\n - `cc_images_to_update.json` - CodeCollection images needing sync\n - `images_to_update.json` - RunWhen Local images needing sync\n\n### Taskset Output\n- Detailed sync results in report\n- Lists of imported images\n- Any errors or warnings\n\n## Troubleshooting\n\n### Common Issues\n\n**\"No ${REF}-latest found\"**\n- Verify REF matches available branches in source registry\n- Check that source images exist\n\n**\"Unable to resolve digest\"**\n- Check network connectivity to source registry\n- Verify no f", - "libraries": [ - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-workspace-utils/tree/main/codebundles/azure-rw-acr-sync" - }, - { - "slug": "rw-workspace-utils-github-create-issue-from-runsession", - "collection_slug": "rw-workspace-utils", - "name": "github-create-issue-from-runsession", - "display_name": "github-create-issue-from-runsession", - "description": "This CodeBundle creates a GitHub Issue with a summary of users, open issues, and related resources from a RunSession.", - "platform": "Unknown", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Create GitHub Issue in Repository `${GITHUB_REPOSITORY}` from RunSession" - ], - "capabilities": [ - "Create GitHub Issue in Repository `${GITHUB_REPOSITORY}` from RunSession: Create a GitHub Issue with the summarized details of the RunSession. Intended to be used as a final task in a workflow." - ], - "readme": "# GitHub Create Issue\nThis CodeBundle creates a GitHub Issue with a summary of users, open issues, and related resources from a RunSession. \n\nNote> This CodeBundle will inspect the RunSession notes field for indications of being triggered from another runsession (called the Related RunSession). \nIf this data is found, the related runsession details are what's used. 
", - "libraries": [ - "RW.GitHub", - "RW.platform", - "RW.Core", - "BuiltIn", - "RW.CLI", - "OperatingSystem", - "RW.RunSession", - "RW.Workspace" - ], - "git_url": "https://github.com/runwhen-contrib/rw-workspace-utils/tree/main/codebundles/github-create-issue-from-runsession" - }, - { - "slug": "rw-workspace-utils-azure-rw-acr-helm-update", - "collection_slug": "rw-workspace-utils", - "name": "azure-rw-acr-helm-update", - "display_name": "azure-rw-acr-helm-update", - "description": "This is intended for use by customers running a private ACR registry which RunWhen Local must use for it's images. It is intended to be paired with azure-rw-acr-sync. These two CodeBundles function as follows:", - "platform": "Azure", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Check for Available RunWhen Helm Images in ACR Registry`${REGISTRY_NAME}`", - "Apply Available RunWhen Helm Images in ACR Registry`${REGISTRY_NAME}`" - ], - "capabilities": [ - "Check for Available RunWhen Helm Images in ACR Registry`${REGISTRY_NAME}`: Count the number of running RunWhen images that have updates available in ACR (via Helm CLI).", - "Apply Available RunWhen Helm Images in ACR Registry`${REGISTRY_NAME}`: Count the number of running RunWhen images that have updates available in ACR (via Helm CLI)." - ], - "readme": "# RunWhen Local Helm Update Check (ACR)\nThis is intended for use by customers running a private ACR registry which RunWhen Local must use for it's images. It is intended to be paired with azure-rw-acr-sync. These two CodeBundles function as follows: \n\n- azure-rw-acr-sync (Not this CodeBundle) - Synchronizes upstream RunWhen images into Azure Container Registry on a regular basis when updates are available. \n- azure-rw-acr-helm-update (**This CodeBundle**) - Compares the running Helm release to the available images in ACR and applys a helm upgrade (cli based) if new images are available. 
\n\n## Image Tagging Strategy\n\nThis codebundle handles two categories of images with different tagging strategies:\n\n### CodeCollection Images\nCodeCollection images use a REF-based tagging strategy with digest verification:\n- Tags follow the pattern: `${REF}-${HASH}` (e.g., `main-abc1234`)\n- The script looks for tags matching the current REF (default: `main`)\n- Compares image digests (SHA256) to ensure consistency\n- Excludes architecture-prefixed tags (e.g., `amd64-*`, `arm64-*`)\n- Falls back to `${REF}-latest` siblings with matching digests\n\n### RunWhen Local Images\nRunWhen Local images use simpler version-based tagging:\n- Uses standard semantic versioning\n- Selects the latest tag via version sort\n\n## CodeBundle Configuration\n\nRequired Variables:\n- **REGISTRY_NAME** - The ACR registry name (e.g., `myacr.azurecr.io`)\n- **REGISTRY_REPOSITORY_PATH** - The root path/directory in the ACR registry to search for images (e.g., `runwhen`)\n- **NAMESPACE** - The Kubernetes namespace (e.g., `runwhen-local`)\n- **CONTEXT** - The Kubernetes context\n- **HELM_RELEASE** - The name of the helm release to inspect and update (e.g., `runwhen-local`)\n\nOptional Variables:\n- **HELM_APPLY_UPGRADE** - Set to `true` to automatically apply the upgrade (default: `false`)\n- **REF** - The git reference (branch) for codecollection image tagging (default: `main`)\n- **AZURE_RESOURCE_SUBSCRIPTION_ID** - The Azure Subscription ID (auto-detected if not set)\n\nThis CodeBundle requires the following custom variables to be added to the workspaceInfo.yaml: \n\n```\ncustom: \n private_registry: azure_acr\n azure_acr_registry: [ACR registry Name]\n azure_service_principal_secret_name: azure-sp (not required if spSecretName is set)\n```\n\n## SLI\nThe SLI runs the helm_update.sh script on a regular basis (default: every 10m), listing the running images in the helm release, looking for newer images in ACR, and generating the `helm upgrade` command needed to apply the update. If `HELM_APPLY_UPGRADE=\"true\"`, the helm upgrade is automatically applied.\n\nPushes the metric of the total number of images that need to be updated. \n\n\n## Taskset\nPerforms the same function as the SLI, but adds the details to the report and can be run on demand. 
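\n\nFor illustration, the generated command takes roughly the following shape; the release name, chart reference, and `--set` keys below are placeholders that vary by installation, not output from a real run:\n\n```\n# illustrative sketch only - helm_update.sh derives the real release, registry, and tag values\nhelm upgrade runwhen-local runwhen-contrib/runwhen-local \\\n  --namespace runwhen-local \\\n  --kube-context my-context \\\n  --reuse-values \\\n  --set image.registry=myacr.azurecr.io \\\n  --set image.tag=main-abc1234\n```\n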
", - "libraries": [ - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem" - ], - "git_url": "https://github.com/runwhen-contrib/rw-workspace-utils/tree/main/codebundles/azure-rw-acr-helm-update" - }, - { - "slug": "rw-workspace-utils-alertmanager-webbook-handler", - "collection_slug": "rw-workspace-utils", - "name": "alertmanager-webbook-handler", - "display_name": "alertmanager-webbook-handler", - "description": "", - "platform": "Unknown", - "author": "stewartshea", - "support_tags": [ - "rw" - ], - "tasks": [ - "Add Tasks to RunSession from AlertManager Webhook Details" - ], - "capabilities": [ - "Add Tasks to RunSession from AlertManager Webhook Details: Parse the alertmanager webhook commonLabels and route and SLX where commonLabels match SLX tags" - ], - "readme": "", - "libraries": [ - "Collections", - "RW.platform", - "RW.Core", - "BuiltIn", - "RW.CLI", - "OperatingSystem", - "RW.RunSession", - "RW.Workspace" - ], - "git_url": "https://github.com/runwhen-contrib/rw-workspace-utils/tree/main/codebundles/alertmanager-webbook-handler" - }, - { - "slug": "aws-c7n-codecollection-aws-c7n-monitoring-health", - "collection_slug": "aws-c7n-codecollection", - "name": "aws-c7n-monitoring-health", - "display_name": "aws-c7n-monitoring-health", - "description": "This CodeBundle evaluates the health of CloudWatch Log Groups in a given AWS Account and Region", - "platform": "AWS", - "author": "saurabh3460", - "support_tags": [ - "aws" - ], - "tasks": [ - "Check CloudWatch Log Groups Without Retention Period in AWS Region `${AWS_REGION}` in AWS account `${AWS_ACCOUNT_ID}`", - "Check if CloudTrail exists and is configured for multi-region in AWS Region `${AWS_REGION}` in AWS Account `${AWS_ACCOUNT_ID}`", - "Check CloudTrail Without CloudWatch Logs in AWS Region `${AWS_REGION}` in AWS Account `${AWS_ACCOUNT_ID}`", - "Generate Health Score", - "List CloudWatch Log Groups Without Retention Period in AWS Region `${AWS_REGION}` in AWS Account `${AWS_ACCOUNT_ID}`", - "Check CloudTrail Configuration in AWS Region `${AWS_REGION}` in AWS Account `${AWS_ACCOUNT_ID}`" - ], - "capabilities": [ - "Check CloudWatch Log Groups Without Retention Period in AWS Region `${AWS_REGION}` in AWS account `${AWS_ACCOUNT_ID}`: Check CloudWatch Log Groups without retention period", - "Check if CloudTrail exists and is configured for multi-region in AWS Region `${AWS_REGION}` in AWS Account `${AWS_ACCOUNT_ID}`: Check if CloudTrail exists and is configured for multi-region", - "Check CloudTrail Without CloudWatch Logs in AWS Region `${AWS_REGION}` in AWS Account `${AWS_ACCOUNT_ID}`: Check if CloudTrail exists and is configured for multi-region in AWS Region `${AWS_REGION}` in AWS Account `${AWS_ACCOUNT_ID}`", - "List CloudWatch Log Groups Without Retention Period in AWS Region `${AWS_REGION}` in AWS Account `${AWS_ACCOUNT_ID}`: List CloudWatch Log Groups Without Retention Period", - "Check CloudTrail Configuration in AWS Region `${AWS_REGION}` in AWS Account `${AWS_ACCOUNT_ID}`: Check for CloudTrail integration with CloudWatch Logs" - ], - "readme": "# AWS Cloud Custodian CloudWatch Health\n\nThis CodeBundle evaluates the health of CloudWatch Log Groups in a given AWS Account and Region\n\n## SLI\nThe SLI produces a score of 0 (bad), 1(good), or a value in between. 
This score is generated by capturing the following: \n- CloudWatch Log Groups without retention period\n- CloudTrail Trails that are not multi-region\n- CloudTrail Trails without CloudWatch Logs\n\n## TaskSet\nSimilar to the SLI, but produces a report on the specific resources and raises issues for each CloudWatch Log Group that requires attention. \n\n## Required Configuration\n\n```\nexport AWS_ACCESS_KEY_ID=[]\nexport AWS_SECRET_ACCESS_KEY=[]\nexport AWS_DEFAULT_REGION=[]\nexport AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query \"Account\" --output text)\n```\n\n## Testing \nSee the `.test` directory for infrastructure test code. ", - "libraries": [ - "RW.CLI", - "RW.Core", - "CloudCustodian.Core" - ], - "git_url": "https://github.com/runwhen-contrib/aws-c7n-codecollection/tree/main/codebundles/aws-c7n-monitoring-health" - }, - { - "slug": "aws-c7n-codecollection-aws-c7n-ec2-health", - "collection_slug": "aws-c7n-codecollection", - "name": "aws-c7n-ec2-health", - "display_name": "aws-c7n-ec2-health", - "description": "This CodeBundle evaluates the health of EC2 instances in a given AWS Account and Region", - "platform": "AWS", - "author": "saurabh3460", - "support_tags": [ - "aws" - ], - "tasks": [ - "Check for stale AWS EC2 instances in AWS Region `${AWS_REGION}` in AWS account `${AWS_ACCOUNT_ID}`", - "Check for stopped AWS EC2 instances in AWS Region `${AWS_REGION}` in AWS account `${AWS_ACCOUNT_ID}`", - "Check for invalid AWS Auto Scaling Groups in AWS Region `${AWS_REGION}` in AWS account `${AWS_ACCOUNT_ID}`", - "Generate Health Score", - "Suite Initialization", - "List stale AWS EC2 instances in AWS Region `${AWS_REGION}` in AWS account `${AWS_ACCOUNT_ID}`", - "List stopped AWS EC2 instances in AWS Region `${AWS_REGION}` in AWS account `${AWS_ACCOUNT_ID}`", - "List invalid AWS Auto Scaling Groups in AWS Region `${AWS_REGION}` in AWS account `${AWS_ACCOUNT_ID}`", - "Suite Initialization" - ], - "capabilities": [ - "Check for stale AWS EC2 instances in AWS Region `${AWS_REGION}` in AWS account `${AWS_ACCOUNT_ID}`: Check for stale EC2 instances in AWS Region.", - "Check for stopped AWS EC2 instances in AWS Region `${AWS_REGION}` in AWS account `${AWS_ACCOUNT_ID}`: Check for stopped EC2 instances in AWS Region.", - "Check for invalid AWS Auto Scaling Groups in AWS Region `${AWS_REGION}` in AWS account `${AWS_ACCOUNT_ID}`: Check for invalid Auto Scaling Groups.", - "List stale AWS EC2 instances in AWS Region `${AWS_REGION}` in AWS account `${AWS_ACCOUNT_ID}`: List stale EC2 instances in AWS Region.", - "List stopped AWS EC2 instances in AWS Region `${AWS_REGION}` in AWS account `${AWS_ACCOUNT_ID}`: List stopped EC2 instances in AWS Region.", - "List invalid AWS Auto Scaling Groups in AWS Region `${AWS_REGION}` in AWS account `${AWS_ACCOUNT_ID}`: List invalid Auto Scaling Groups" - ], - "readme": "# AWS Cloud Custodian EC2 Health\n\nThis CodeBundle evaluates the health of EC2 instances in a given AWS Account and Region\n\n## SLI\nThe SLI produces a score of 0 (bad), 1 (good), or a value in between. 
This score is generated by capturing the following: \n- Stale AWS EC2 instances\n- Stopped AWS EC2 instances\n- Invalid Auto Scaling Group\n - Filter autoscale groups to find those that are structurally invalid.\n\n - Structurally invalid means that the auto scale group will not be able to launch an instance successfully as the configuration has\n - invalid subnets\n - invalid security groups\n - invalid key pair name\n - invalid launch config volume snapshots\n - invalid amis\n - invalid health check elb (slower)\n\n**Reasoning**: Stale and stopped instances may pose security risks due to missed updates or inactivity.\n\n## TaskSet\nSimilar to the SLI, but produces a report on the specific resources and raises issues for each EC2 instance that requires attention. \n\n\n## Required Configuration\n\n```\nexport AWS_ACCESS_KEY_ID=[]\nexport AWS_SECRET_ACCESS_KEY=[]\nexport AWS_DEFAULT_REGION=[]\nexport AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query \"Account\" --output text)\n```\n\n\n## Testing \nSee the .test directory for infrastructure test code. ", - "libraries": [ - "RW.CLI", - "RW.Core", - "CloudCustodian.Core" - ], - "git_url": "https://github.com/runwhen-contrib/aws-c7n-codecollection/tree/main/codebundles/aws-c7n-ec2-health" - }, - { - "slug": "aws-c7n-codecollection-aws-c7n-ebs-health", - "collection_slug": "aws-c7n-codecollection", - "name": "aws-c7n-ebs-health", - "display_name": "aws-c7n-ebs-health", - "description": "This CodeBundle evaluates the health of EBS volumes in a given AWS Account and Region", - "platform": "AWS", - "author": "saurabh3460", - "support_tags": [ - "aws" - ], - "tasks": [ - "Check Unattached EBS Volumes in `${AWS_REGION}`", - "Check Unencrypted EBS Volumes in `${AWS_REGION}`", - "Check Unused EBS Snapshots in `${AWS_REGION}`", - "Generate EBS Score", - "Suite Initialization", - "List Unattached EBS Volumes in AWS Region `${AWS_REGION}` in AWS account `${AWS_ACCOUNT_ID}`", - "List Unencrypted EBS Volumes in AWS Region `${AWS_REGION}` in AWS account `${AWS_ACCOUNT_ID}`", - "List Unused EBS Snapshots in AWS Region `${AWS_REGION}` in AWS account `${AWS_ACCOUNT_ID}`", - "Suite Initialization" - ], - "capabilities": [ - "Check Unattached EBS Volumes in `${AWS_REGION}`: Check for unattached EBS volumes in the specified region.", - "Check Unencrypted EBS Volumes in `${AWS_REGION}`: Check for unencrypted EBS volumes and report any found that do not meet encryption requirements.", - "Check Unused EBS Snapshots in `${AWS_REGION}`: Check for unused EBS snapshots.", - "List Unattached EBS Volumes in AWS Region `${AWS_REGION}` in AWS account `${AWS_ACCOUNT_ID}`: Check for unattached EBS volumes in the specified region.", - "List Unencrypted EBS Volumes in AWS Region `${AWS_REGION}` in AWS account `${AWS_ACCOUNT_ID}`: Check for Unencrypted EBS Volumes in the specified region.", - "List Unused EBS Snapshots in AWS Region `${AWS_REGION}` in AWS account `${AWS_ACCOUNT_ID}`: Check for Unused EBS Snapshots in the specified region." - ], - "readme": "# AWS Cloud Custodian EBS Health\n\nThis CodeBundle evaluates the health of EBS volumes in a given AWS Account and Region\n\n## SLI\nThe SLI produces a score of 0 (bad), 1 (good), or a value in between. This score is generated by capturing the following: \n- EBS Volumes that are unattached\n- EBS Volumes that are unencrypted \n- EBS Snapshots that are unused\n\nThe score of each check is added up and then divided by the total number of checks. 
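For example, if the unattached-volume check scores 1, the unencrypted-volume check scores 0, and the unused-snapshot check scores 1, the SLI would push (1 + 0 + 1) / 3 \u2248 0.67. 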
\n\n\n## TaskSet\nSimilar to the SLI, but produces a report on the specific resources and raises issues for each volume that requires attention. \n\n\n## Required Configuration\n\n```\nexport AWS_ACCESS_KEY_ID=[]\nexport AWS_SECRET_ACCESS_KEY=[]\nexport AWS_DEFAULT_REGION=[]\nexport AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query \"Account\" --output text)\n```\n\n\n## Testing \nSee the .test directory for infrastructure test code. ", - "libraries": [ - "RW.CLI", - "RW.Core", - "CloudCustodian.Core" - ], - "git_url": "https://github.com/runwhen-contrib/aws-c7n-codecollection/tree/main/codebundles/aws-c7n-ebs-health" - }, - { - "slug": "aws-c7n-codecollection-aws-c7n-acm-health", - "collection_slug": "aws-c7n-codecollection", - "name": "aws-c7n-acm-health", - "display_name": "aws-c7n-acm-health", - "description": "This CodeBundle evaluates the health of ACM Certificates in a given AWS Account and Region", - "platform": "AWS", - "author": "saurabh3460", - "support_tags": [ - "aws" - ], - "tasks": [ - "Check for unused ACM certificates in AWS Region `${AWS_REGION}` in AWS account `${AWS_ACCOUNT_ID}`", - "Check for Expiring ACM certificates in AWS Region `${AWS_REGION}` in AWS account `${AWS_ACCOUNT_ID}`", - "Check for expired ACM certificates in AWS Region `${AWS_REGION}` in AWS account `${AWS_ACCOUNT_ID}`", - "Check for Failed Status ACM Certificates in AWS Region `${AWS_REGION}` in AWS Account `${AWS_ACCOUNT_ID}`", - "Check for Pending Validation ACM Certificates in AWS Region `${AWS_REGION}` in AWS Account `${AWS_ACCOUNT_ID}`", - "Generate Health Score", - "List Unused ACM Certificates in AWS Region `${AWS_REGION}` in AWS Account `${AWS_ACCOUNT_ID}`", - "List Expiring ACM Certificates in AWS Region `${AWS_REGION}` in AWS Account `${AWS_ACCOUNT_ID}`", - "List Expired ACM Certificates in AWS Region `${AWS_REGION}` in AWS Account `${AWS_ACCOUNT_ID}`", - "List Failed Status ACM Certificates in AWS Region `${AWS_REGION}` in AWS Account `${AWS_ACCOUNT_ID}`", - "List Pending Validation ACM Certificates in AWS Region `${AWS_REGION}` in AWS Account `${AWS_ACCOUNT_ID}`" - ], - "capabilities": [ - "Check for unused ACM certificates in AWS Region `${AWS_REGION}` in AWS account `${AWS_ACCOUNT_ID}`: Find unused ACM certificates", - "Check for Expiring ACM certificates in AWS Region `${AWS_REGION}` in AWS account `${AWS_ACCOUNT_ID}`: Find Expiring ACM certificates", - "Check for expired ACM certificates in AWS Region `${AWS_REGION}` in AWS account `${AWS_ACCOUNT_ID}`: Find expired ACM certificates", - "Check for Failed Status ACM Certificates in AWS Region `${AWS_REGION}` in AWS Account `${AWS_ACCOUNT_ID}`: Find failed status ACM certificates", - "Check for Pending Validation ACM Certificates in AWS Region `${AWS_REGION}` in AWS Account `${AWS_ACCOUNT_ID}`: Find pending validation ACM certificates", - "List Unused ACM Certificates in AWS Region `${AWS_REGION}` in AWS Account `${AWS_ACCOUNT_ID}`: Find unused ACM certificates", - "List Expiring ACM Certificates in AWS Region `${AWS_REGION}` in AWS Account `${AWS_ACCOUNT_ID}`: Find Expiring ACM certificates", - "List Expired ACM Certificates in AWS Region `${AWS_REGION}` in AWS Account `${AWS_ACCOUNT_ID}`: Find expired ACM certificates", - "List Failed Status ACM Certificates in AWS Region `${AWS_REGION}` in AWS Account `${AWS_ACCOUNT_ID}`: Find failed status ACM certificates", - "List Pending Validation ACM Certificates in AWS Region `${AWS_REGION}` in AWS Account `${AWS_ACCOUNT_ID}`: Find pending validation ACM certificates" - ], - "readme": "# AWS 
Cloud Custodian ACM Certificate Health\n\nThis CodeBundle evaluates the health of ACM Certificates in a given AWS Account and Region\n\n## SLI\nThe SLI produces a score of 0 (bad), 1 (good), or a value in between. This score is generated by capturing the following: \n- Expired ACM certificates\n- Soon to expire ACM certificates\n- Unused ACM certificates\n\n## TaskSet\nSimilar to the SLI, but produces a report on the specific resources and raises issues for each ACM Certificate that requires attention. \n\n## Required Configuration\n\n```\nexport AWS_ACCESS_KEY_ID=[]\nexport AWS_SECRET_ACCESS_KEY=[]\nexport AWS_DEFAULT_REGION=[]\nexport AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query \"Account\" --output text)\n```\n\n## Testing \nSee the `.test` directory for infrastructure test code. ", - "libraries": [ - "RW.CLI", - "RW.Core", - "CloudCustodian.Core" - ], - "git_url": "https://github.com/runwhen-contrib/aws-c7n-codecollection/tree/main/codebundles/aws-c7n-acm-health" - }, - { - "slug": "aws-c7n-codecollection-aws-c7n-network-health", - "collection_slug": "aws-c7n-codecollection", - "name": "aws-c7n-network-health", - "display_name": "aws-c7n-network-health", - "description": "This CodeBundle evaluates the health of network resources in a given AWS Account and Region", - "platform": "AWS", - "author": "saurabh3460", - "support_tags": [ - "aws" - ], - "tasks": [ - "Check for publicly accessible security groups in AWS account `${AWS_ACCOUNT_ID}`", - "Check for unused Elastic IPs in AWS account `${AWS_ACCOUNT_ID}`", - "Check for unused ELBs in AWS account `${AWS_ACCOUNT_ID}`", - "Check for VPCs with Flow Logs disabled in AWS account `${AWS_ACCOUNT_ID}`", - "Generate Health Score", - "Suite Initialization", - "List Publicly Accessible Security Groups in AWS account `${AWS_ACCOUNT_ID}`", - "List unused Elastic IPs in AWS account `${AWS_ACCOUNT_ID}`", - "List unused ELBs in AWS account `${AWS_ACCOUNT_ID}`", - "List VPCs with Flow Logs Disabled in AWS account `${AWS_ACCOUNT_ID}`", - "Suite Initialization" - ], - "capabilities": [ - "Check for publicly accessible security groups in AWS account `${AWS_ACCOUNT_ID}`: Find publicly accessible security groups (e.g., \"0.0.0.0/0\" or \"::/0\")", - "Check for unused Elastic IPs in AWS account `${AWS_ACCOUNT_ID}`: Find unused Elastic IPs that are not associated with any instance or network interface", - "Check for unused ELBs in AWS account `${AWS_ACCOUNT_ID}`: Find unused Application Load Balancers (ALBs) and Network Load Balancers (NLBs) that do not have any associated targets", - "Check for VPCs with Flow Logs disabled in AWS account `${AWS_ACCOUNT_ID}`: Find VPCs that do not have Flow Logs enabled", - "List Publicly Accessible Security Groups in AWS account `${AWS_ACCOUNT_ID}`: Find publicly accessible security groups (e.g., \"0.0.0.0/0\" or \"::/0\")", - "List unused Elastic IPs in AWS account `${AWS_ACCOUNT_ID}`: Find unused Elastic IPs that are not associated with any instance or network interface", - "List unused ELBs in AWS account `${AWS_ACCOUNT_ID}`: Find unused Application Load Balancers (ALBs) and Network Load Balancers (NLBs) that do not have any associated targets", - "List VPCs with Flow Logs Disabled in AWS account `${AWS_ACCOUNT_ID}`: Find VPCs that do not have flow logs enabled" - ], - "readme": "# AWS Cloud Custodian Network Health\n\nThis CodeBundle evaluates the health of network resources in a given AWS Account and Region\n\n## SLI\nThe SLI produces a score of 0 (bad), 1 (good), or a value in between. 
This score is generated by capturing the following: \n- Public IP access in security group\n- Unused Elastic IP\n- Unused Elastic Loadbalancer\n- VPCs with flow logs disabled\n\nThe score of each check is added up and then divided by the total amount of checks. \n\n\n## TaskSet\nSimilar to the SLI, but produces a report on the specific resources and raises issues for each volume that requires attention. \n\n\n## Required Configuration\n\n```\nexport AWS_ACCESS_KEY_ID=[]\nexport AWS_SECRET_ACCESS_KEY=[]\nexport AWS_DEFAULT_REGION=[]\nexport AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query \"Account\" --output text)\n```\n\n\n## Testing \nSee the .test directory for infrastructure test code. ", - "libraries": [ - "RW.CLI", - "RW.Core", - "CloudCustodian.Core" - ], - "git_url": "https://github.com/runwhen-contrib/aws-c7n-codecollection/tree/main/codebundles/aws-c7n-network-health" - }, - { - "slug": "aws-c7n-codecollection-aws-c7n-rds-health", - "collection_slug": "aws-c7n-codecollection", - "name": "aws-c7n-rds-health", - "display_name": "aws-c7n-rds-health", - "description": "This CodeBundle evaluates the health of RDS instances in a given AWS Account and Region", - "platform": "AWS", - "author": "saurabh3460", - "support_tags": [ - "aws" - ], - "tasks": [ - "Check for unencrypted RDS instances in AWS Region `${AWS_REGION}` in AWS account `${AWS_ACCOUNT_ID}`", - "Check for publicly accessible RDS instances in AWS Region `${AWS_REGION}` in AWS account `${AWS_ACCOUNT_ID}`", - "Check for disabled backup RDS instances in AWS Region `${AWS_REGION}` in AWS account `${AWS_ACCOUNT_ID}`", - "Generate Health Score", - "List Unencrypted RDS Instances in AWS Region `${AWS_REGION}` in AWS Account `${AWS_ACCOUNT_ID}`", - "List Publicly Accessible RDS Instances in AWS Region `${AWS_REGION}` in AWS Account `${AWS_ACCOUNT_ID}`", - "List RDS Instances with Backups Disabled in AWS Region `${AWS_REGION}` in AWS Account `${AWS_ACCOUNT_ID}`" - ], - "capabilities": [ - "Check for unencrypted RDS instances in AWS Region `${AWS_REGION}` in AWS account `${AWS_ACCOUNT_ID}`: Find unencrypted RDS instances", - "Check for publicly accessible RDS instances in AWS Region `${AWS_REGION}` in AWS account `${AWS_ACCOUNT_ID}`: Find publicly accessible RDS instances", - "Check for disabled backup RDS instances in AWS Region `${AWS_REGION}` in AWS account `${AWS_ACCOUNT_ID}`: Find RDS instances with backups disabled", - "List Unencrypted RDS Instances in AWS Region `${AWS_REGION}` in AWS Account `${AWS_ACCOUNT_ID}`: Find unencrypted RDS instances", - "List Publicly Accessible RDS Instances in AWS Region `${AWS_REGION}` in AWS Account `${AWS_ACCOUNT_ID}`: Find publicly accessible RDS instances", - "List RDS Instances with Backups Disabled in AWS Region `${AWS_REGION}` in AWS Account `${AWS_ACCOUNT_ID}`: Identify RDS instances with backups disabled" - ], - "readme": "# AWS Cloud Custodian RDS Health\n\nThis CodeBundle evaluates the health of RDS instances in a given AWS Account and Region\n\n## SLI\nThe SLI produces a score of 0 (bad), 1(good), or a value in between. This score is generated by capturing the following: \n- Unencrypted RDS instances\n- Publicly accessible RDS instances\n- RDS instances with backups disabled\n\n## TaskSet\nSimilar to the SLI, but produces a report on the specific resources and raises issues for each RDS that requires attention. 
\n\n## Required Configuration\n\n```\nexport AWS_ACCESS_KEY_ID=[]\nexport AWS_SECRET_ACCESS_KEY=[]\nexport AWS_DEFAULT_REGION=[]\nexport AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query \"Account\" --output text)\n```\n\n## Testing \nSee the `.test` directory for infrastructure test code. ", - "libraries": [ - "RW.CLI", - "RW.Core", - "CloudCustodian.Core" - ], - "git_url": "https://github.com/runwhen-contrib/aws-c7n-codecollection/tree/main/codebundles/aws-c7n-rds-health" - }, - { - "slug": "aws-c7n-codecollection-aws-c7n-s3-health", - "collection_slug": "aws-c7n-codecollection", - "name": "aws-c7n-s3-health", - "display_name": "aws-c7n-s3-health", - "description": "This codebundle starts out as an example of integrating the custodian (c7n) cli into RunWhen.", - "platform": "AWS", - "author": "stewartshea", - "support_tags": [ - "aws" - ], - "tasks": [ - "Count S3 Buckets With Public Access in AWS Account `${AWS_ACCOUNT_NAME}`", - "Suite Initialization", - "List S3 Buckets With Public Access in AWS Account `${AWS_ACCOUNT_NAME}`", - "Suite Initialization" - ], - "capabilities": [ - "Count S3 Buckets With Public Access in AWS Account `${AWS_ACCOUNT_NAME}`: Fetch total number of S3 buckets with public access enabled.", - "List S3 Buckets With Public Access in AWS Account `${AWS_ACCOUNT_NAME}`: Fetch total number of S3 buckets with public access enabled and raises an issue if any exist." - ], - "readme": "# AWS Cloud Custodian S3 Health\n\nThis codebundle starts out as an example of integrating the custodian (c7n) cli into RunWhen. \n\n## SLI\nA simple SLI that counts S3 buckets that are public. Uses the custodian cli. \n\n## TaskSet\nSimilar to the SLI, but produces a report on the specific resources and raises issues for each public bucket. \n\n\n## Required Configuration\n\n```\nexport AWS_ACCESS_KEY_ID=[]\nexport AWS_SECRET_ACCESS_KEY=[]\nexport AWS_DEFAULT_REGION=[]\nexport AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query \"Account\" --output text)\n```\n\n\n## Testing \nSee the .test directory for infrastructure test code. ", - "libraries": [ - "RW.CLI", - "RW.Core", - "CloudCustodian.Core" - ], - "git_url": "https://github.com/runwhen-contrib/aws-c7n-codecollection/tree/main/codebundles/aws-c7n-s3-health" - }, - { - "slug": "azure-c7n-codecollection-azure-vm-health", - "collection_slug": "azure-c7n-codecollection", - "name": "azure-vm-health", - "display_name": "azure-vm-health", - "description": "This codebundle runs a suite of metrics checks for VMs in Azure. 
It identifies:", - "platform": "Azure", - "author": "saurabh3460", - "support_tags": [ - "azure" - ], - "tasks": [ - "Check Azure VM Health in resource group `${AZURE_RESOURCE_GROUP}`", - "Check for VMs With Public IP in resource group `${AZURE_RESOURCE_GROUP}`", - "Check for Stopped VMs in resource group `${AZURE_RESOURCE_GROUP}`", - "Check for VMs With High CPU Usage in resource group `${AZURE_RESOURCE_GROUP}`", - "Check for Underutilized VMs Based on CPU Usage in resource group `${AZURE_RESOURCE_GROUP}`", - "Check for VMs With High Memory Usage in resource group `${AZURE_RESOURCE_GROUP}`", - "Check for Underutilized VMs Based on Memory Usage in resource group `${AZURE_RESOURCE_GROUP}`", - "Check for Unused Network Interfaces in resource group `${AZURE_RESOURCE_GROUP}`", - "Check for Unused Public IPs in resource group `${AZURE_RESOURCE_GROUP}`", - "Check VMs Agent Status in resource group `${AZURE_RESOURCE_GROUP}`", - "Generate Health Score", - "List VMs Health in resource group `${AZURE_RESOURCE_GROUP}`", - "List VMs With Public IP in resource group `${AZURE_RESOURCE_GROUP}`", - "List Stopped VMs in resource group `${AZURE_RESOURCE_GROUP}`", - "List VMs With High CPU Usage in resource group `${AZURE_RESOURCE_GROUP}`", - "List Underutilized VMs Based on CPU Usage in resource group `${AZURE_RESOURCE_GROUP}`", - "List VMs With High Memory Usage in resource group `${AZURE_RESOURCE_GROUP}`", - "List Underutilized VMs Based on Memory Usage in resource group `${AZURE_RESOURCE_GROUP}`", - "List Unused Network Interfaces in resource group `${AZURE_RESOURCE_GROUP}`", - "List Unused Public IPs in resource group `${AZURE_RESOURCE_GROUP}`", - "List VMs Agent Status in resource group `${AZURE_RESOURCE_GROUP}`" - ], - "capabilities": [ - "Check Azure VM Health in resource group `${AZURE_RESOURCE_GROUP}`: Checks the health status of Azure VMs using the Microsoft.ResourceHealth provider", - "Check for VMs With Public IP in resource group `${AZURE_RESOURCE_GROUP}`: Lists VMs with public IP address", - "Check for Stopped VMs in resource group `${AZURE_RESOURCE_GROUP}`: Count VMs that are in a stopped state", - "Check for VMs With High CPU Usage in resource group `${AZURE_RESOURCE_GROUP}`: Checks for VMs with high CPU usage", - "Check for Underutilized VMs Based on CPU Usage in resource group `${AZURE_RESOURCE_GROUP}`: Count VMs that are underutilized based on CPU usage", - "Check for VMs With High Memory Usage in resource group `${AZURE_RESOURCE_GROUP}`: Count VMs that have high memory usage based on available memory percentage", - "Check for Underutilized VMs Based on Memory Usage in resource group `${AZURE_RESOURCE_GROUP}`: Count VMs that are underutilized based on memory usage", - "Check for Unused Network Interfaces in resource group `${AZURE_RESOURCE_GROUP}`: Count network interfaces that are not attached to any virtual machine", - "Check for Unused Public IPs in resource group `${AZURE_RESOURCE_GROUP}`: Count public IP addresses that are not attached to any resource", - "Check VMs Agent Status in resource group `${AZURE_RESOURCE_GROUP}`: Lists VMs that have VM agent status issues", - "List VMs Health in resource group `${AZURE_RESOURCE_GROUP}`: Checks the health status of Azure VMs using the Microsoft.ResourceHealth provider", - "List VMs With Public IP in resource group `${AZURE_RESOURCE_GROUP}`: Lists VMs with public IP address", - "List Stopped VMs in resource group `${AZURE_RESOURCE_GROUP}`: Lists VMs that are in a stopped state", - "List VMs With High CPU Usage in resource group 
`${AZURE_RESOURCE_GROUP}`: Checks for VMs with high CPU usage", - "List Underutilized VMs Based on CPU Usage in resource group `${AZURE_RESOURCE_GROUP}`: List Azure Virtual Machines (VMs) that have low CPU utilization based on a defined threshold and timeframe.", - "List VMs With High Memory Usage in resource group `${AZURE_RESOURCE_GROUP}`: List Azure Virtual Machines (VMs) that have high memory usage based on a defined threshold and timeframe.", - "List Underutilized VMs Based on Memory Usage in resource group `${AZURE_RESOURCE_GROUP}`: List Azure Virtual Machines (VMs) that are underutilized based on memory usage", - "List Unused Network Interfaces in resource group `${AZURE_RESOURCE_GROUP}`: Lists network interfaces that are not attached to any virtual machine", - "List Unused Public IPs in resource group `${AZURE_RESOURCE_GROUP}`: Lists public IP addresses that are not attached to any resource", - "List VMs Agent Status in resource group `${AZURE_RESOURCE_GROUP}`: Lists VMs that have VM agent status issues" - ], - "readme": "# Azure Virtual Machine Health\nThis codebundle runs a suite of metrics checks for VMs in Azure. It identifies:\n- Check for VMs With Public IP\n- Check for Stopped VMs\n- Check for VMs With High CPU Usage\n- Check for Underutilized VMs Based on CPU Usage\n- Check for VMs With High Memory Usage\n- Check for Underutilized VMs Based on Memory\n- Check for Unused Network Interfaces\n- Check for Unused Public IPs\n\n## Configuration\n\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `AZ_USERNAME`: Service principal's client ID\n- `AZ_SECRET_VALUE`: The credential secret value from the app registration\n- `AZ_TENANT`: The Azure tenancy ID\n- `AZ_SUBSCRIPTION`: The Azure subscription ID\n\n## Testing \nSee the .test directory for infrastructure test code. \n\n## Notes\n\nThis codebundle assumes the service principal authentication flow", - "libraries": [ - "Collections", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "CloudCustodian.Core", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/azure-c7n-codecollection/tree/main/codebundles/azure-vm-health" - }, - { - "slug": "azure-c7n-codecollection-azure-storage-health", - "collection_slug": "azure-c7n-codecollection", - "name": "azure-storage-health", - "display_name": "azure-storage-health", - "description": "This codebundle runs a suite of metrics checks for Storage health in Azure. 
It identifies:", - "platform": "Azure", - "author": "saurabh3460", - "support_tags": [ - "azure" - ], - "tasks": [ - "Count Azure Storage Accounts with Health Status of `Available` in resource group `${AZURE_RESOURCE_GROUP}`", - "Count Unused Disks in resource group `${AZURE_RESOURCE_GROUP}`", - "Count Unused Snapshots in resource group `${AZURE_RESOURCE_GROUP}`", - "Count Unused Storage Accounts in resource group `${AZURE_RESOURCE_GROUP}`", - "Count Storage Containers with Public Access in resource group `${AZURE_RESOURCE_GROUP}`", - "Count Storage Account Misconfigurations in resource group `${AZURE_RESOURCE_GROUP}`", - "Count Storage Account Changes with Critical/High Security Risk in resource group `${AZURE_RESOURCE_GROUP}`", - "Generate Health Score", - "Check Azure Storage Resource Health in resource group `${AZURE_RESOURCE_GROUP}`", - "List Unused Azure Disks in resource group `${AZURE_RESOURCE_GROUP}`", - "List Unused Azure Snapshots in resource group `${AZURE_RESOURCE_GROUP}`", - "List Unused Azure Storage Accounts in resource group `${AZURE_RESOURCE_GROUP}`", - "List Storage Containers with Public Access in resource group `${AZURE_RESOURCE_GROUP}`", - "List Storage Account Misconfigurations in resource group `${AZURE_RESOURCE_GROUP}`", - "List Storage Account Changes in resource group `${AZURE_RESOURCE_GROUP}`" - ], - "capabilities": [ - "Count Azure Storage Accounts with Health Status of `Available` in resource group `${AZURE_RESOURCE_GROUP}`: Count Azure storage accounts with health status of `Available`", - "Count Unused Disks in resource group `${AZURE_RESOURCE_GROUP}`: Count disks that are not attached to any VM", - "Count Unused Snapshots in resource group `${AZURE_RESOURCE_GROUP}`: Count snapshots that are not attached to any disk", - "Count Unused Storage Accounts in resource group `${AZURE_RESOURCE_GROUP}`: Count storage accounts with no transactions", - "Count Storage Containers with Public Access in resource group `${AZURE_RESOURCE_GROUP}`: Count storage containers with public access enabled", - "Count Storage Account Misconfigurations in resource group `${AZURE_RESOURCE_GROUP}`: Count storage accounts with misconfigurations", - "Count Storage Account Changes with Critical/High Security Risk in resource group `${AZURE_RESOURCE_GROUP}`: Count storage account operations with critical or high security risk from Azure Activity Log", - "Check Azure Storage Resource Health in resource group `${AZURE_RESOURCE_GROUP}`: Check the Azure Resource Health API for any known issues affecting storage resources", - "List Unused Azure Disks in resource group `${AZURE_RESOURCE_GROUP}`: List Azure disks that are not attached to any VM", - "List Unused Azure Snapshots in resource group `${AZURE_RESOURCE_GROUP}`: List Azure snapshots that are not attached", - "List Unused Azure Storage Accounts in resource group `${AZURE_RESOURCE_GROUP}`: List Azure storage accounts with no transactions", - "List Storage Containers with Public Access in resource group `${AZURE_RESOURCE_GROUP}`: List Azure storage containers with public access enabled", - "List Storage Account Misconfigurations in resource group `${AZURE_RESOURCE_GROUP}`: Identify Azure storage accounts with security or configuration misconfigurations", - "List Storage Account Changes in resource group `${AZURE_RESOURCE_GROUP}`: Lists storage account changes and operations from Azure Activity Log" - ], - "readme": "# Azure Storage Health\nThis codebundle runs a suite of metrics checks for Storage health in Azure. 
It identifies:\n- Check for Unused Disks\n- Check for Unused Snapshots\n- Check for Unused Storage Accounts\n- Check for Public Accessible Storage Accounts\n\n## Configuration\n\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `AZ_USERNAME`: Service principal's client ID\n- `AZ_SECRET_VALUE`: The credential secret value from the app registration\n- `AZ_TENANT`: The Azure tenancy ID\n- `AZ_SUBSCRIPTION`: The Azure subscription ID\n\n## Testing \nSee the .test directory for infrastructure test code. \n\n## Notes\n\nThis codebundle assumes the service principal authentication flow", - "libraries": [ - "DateTime", - "Collections", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "CloudCustodian.Core", - "OperatingSystem", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/azure-c7n-codecollection/tree/main/codebundles/azure-storage-health" - }, - { - "slug": "azure-c7n-codecollection-azure-db-health", - "collection_slug": "azure-c7n-codecollection", - "name": "azure-db-health", - "display_name": "azure-db-health", - "description": "This codebundle runs a suite of metrics checks for Database in Azure. It identifies:", - "platform": "Azure", - "author": "saurabh3460", - "support_tags": [ - "azure" - ], - "tasks": [ - "Score Database Availability in resource group `${AZURE_RESOURCE_GROUP}`", - "Count Publicly Accessible Databases in resource group `${AZURE_RESOURCE_GROUP}`", - "Count Databases Without Replication in resource group `${AZURE_RESOURCE_GROUP}`", - "Count Databases Without High Availability in resource group `${AZURE_RESOURCE_GROUP}`", - "Count Databases With High CPU Usage in resource group `${AZURE_RESOURCE_GROUP}`", - "Count Databases With High Memory Usage in resource group `${AZURE_RESOURCE_GROUP}`", - "Count Redis Caches With High Cache Miss Rate in resource group `${AZURE_RESOURCE_GROUP}`", - "Count Databases With Health Issues in resource group `${AZURE_RESOURCE_GROUP}`", - "Count Risky Database Configuration Changes in resource group `${AZURE_RESOURCE_GROUP}`", - "Generate Health Score", - "List Database Availability in resource group `${AZURE_RESOURCE_GROUP}`", - "List Publicly Accessible Databases in resource group `${AZURE_RESOURCE_GROUP}`", - "List Databases Without Replication in resource group `${AZURE_RESOURCE_GROUP}`", - "List Databases Without High Availability in resource group `${AZURE_RESOURCE_GROUP}`", - "List Databases With High CPU Usage in resource group `${AZURE_RESOURCE_GROUP}`", - "List All Databases With High Memory Usage in resource group `${AZURE_RESOURCE_GROUP}`", - "List Redis Caches With High Cache Miss Rate in resource group `${AZURE_RESOURCE_GROUP}`", - "List Database Resource Health in resource group `${AZURE_RESOURCE_GROUP}`", - "List Database Changes in resource group `${AZURE_RESOURCE_GROUP}`" - ], - "capabilities": [ - "Score Database Availability in resource group `${AZURE_RESOURCE_GROUP}`: Count databases that have availability below 100%", - "Count Publicly Accessible Databases in resource group `${AZURE_RESOURCE_GROUP}`: Count databases that have public network access enabled", - "Count Databases Without Replication in resource group `${AZURE_RESOURCE_GROUP}`: Count databases that have no replication configured", - "Count Databases Without High Availability in resource group `${AZURE_RESOURCE_GROUP}`: Count databases that have high availability disabled", - "Count Databases With High CPU Usage in resource group 
`${AZURE_RESOURCE_GROUP}`: Count databases that have high CPU usage", - "Count Databases With High Memory Usage in resource group `${AZURE_RESOURCE_GROUP}`: Count databases that have high memory usage", - "Count Redis Caches With High Cache Miss Rate in resource group `${AZURE_RESOURCE_GROUP}`: Count Redis caches that have high cache miss rate", - "Count Databases With Health Issues in resource group `${AZURE_RESOURCE_GROUP}`: Count databases that have health issues using Azure ResourceHealth API", - "Count Risky Database Configuration Changes in resource group `${AZURE_RESOURCE_GROUP}`: Count risky database configuration changes using audit functionality", - "List Database Availability in resource group `${AZURE_RESOURCE_GROUP}`: Lists databases that have availability below 100%", - "List Publicly Accessible Databases in resource group `${AZURE_RESOURCE_GROUP}`: Lists databases that have public network access enabled", - "List Databases Without Replication in resource group `${AZURE_RESOURCE_GROUP}`: Lists databases that have no replication configured", - "List Databases Without High Availability in resource group `${AZURE_RESOURCE_GROUP}`: Lists databases that have high availability disabled", - "List Databases With High CPU Usage in resource group `${AZURE_RESOURCE_GROUP}`: Lists databases that have high CPU usage", - "List All Databases With High Memory Usage in resource group `${AZURE_RESOURCE_GROUP}`: Lists all database types that have high memory usage", - "List Redis Caches With High Cache Miss Rate in resource group `${AZURE_RESOURCE_GROUP}`: Lists Redis caches with high cache miss rate", - "List Database Resource Health in resource group `${AZURE_RESOURCE_GROUP}`: Lists unhealthy databases using Azure ResourceHealth API", - "List Database Changes in resource group `${AZURE_RESOURCE_GROUP}`: Lists database changes in the specified resource group" - ], - "readme": "# Azure Database Health\nThis codebundle runs a suite of metrics checks for Database in Azure. It identifies:\n- Databases that are publicly accessible\n- Databases without replication configured \n- Databases without high availability configuration\n- Databases with high CPU usage\n- Databases with high memory usage\n- Redis caches with high cache miss rate\n\n## Configuration\n\nThe TaskSet requires initialization to import necessary secrets, services, and user variables. The following variables should be set:\n\n- `AZ_USERNAME`: Service principal's client ID\n- `AZ_SECRET_VALUE`: The credential secret value from the app registration\n- `AZ_TENANT`: The Azure tenancy ID\n- `AZ_SUBSCRIPTION`: The Azure subscription ID\n\n## Testing \nSee the .test directory for infrastructure test code. 
\n\n## Notes\n\nThis codebundle assumes the service principal authentication flow", - "libraries": [ - "DateTime", - "Collections", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "CloudCustodian.Core", - "String" - ], - "git_url": "https://github.com/runwhen-contrib/azure-c7n-codecollection/tree/main/codebundles/azure-db-health" - }, - { - "slug": "ternary-codecollection-fetch-report-from-query", - "collection_slug": "ternary-codecollection", - "name": "fetch-report-from-query", - "display_name": "fetch-report-from-query", - "description": "To use a query from an existing runsession,", - "platform": "Unknown", - "author": "stewartshea", - "support_tags": [ - "ternary" - ], - "tasks": [ - "Fetch Ternary Report from Query" - ], - "capabilities": [ - "Fetch Ternary Report from Query: Connects to Ternary and searches for reports that best match the user query. Returns a list of reports and urls." - ], - "readme": "# Local Testing \nTo use a query from an existing runsession,\n```\nexport RW_USER_TOKEN=[]\nexport RW_WORKSPACE=[workspace]\nexport RW_WORKSPACE_API_URL=[papi_url]\nexport RW_SESSION_ID=[runsession]\n```\n", - "libraries": [ - "Ternary.Utils", - "RW.CLI", - "RW.Core", - "BuiltIn", - "RW.platform", - "OperatingSystem", - "RW.RunSession", - "RW.Workspace" - ], - "git_url": "https://github.com/runwhen-contrib/ternary-codecollection/tree/main/codebundles/fetch-report-from-query" - } - ] -} \ No newline at end of file diff --git a/mcp-server/data/codecollections.json deleted file mode 100644 index 0df49eec5d4c..000000000000 --- a/mcp-server/data/codecollections.json +++ /dev/null @@ -1,68 +0,0 @@ -{ - "codecollections": [ - { - "name": "RunWhen Public CodeCollection", - "slug": "rw-public-codecollection", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection", - "git_ref": "main", - "owner": "RunWhen", - "owner_icon": "https://assets-global.website-files.com/64f9646ad0f39e9ee5c116c4/659f80c7391d64a0ec2a840e_icon_rw-platform.svg", - "owner_email": "shea.stewart@runwhen.com", - "description": "Python based CodeCollections that do not leverage a command line binary or bash script" - }, - { - "name": "RunWhen CLI CodeCollection", - "slug": "rw-cli-codecollection", - "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection", - "owner": "RunWhen", - "owner_icon": "https://assets-global.website-files.com/64f9646ad0f39e9ee5c116c4/659f80c7391d64a0ec2a840e_icon_rw-platform.svg", - "owner_email": "shea.stewart@runwhen.com", - "description": "CodeCollections based on command line binaries and bash scripts" - }, - { - "name": "RunWhen Generic CodeCollection", - "slug": "rw-generic-codecollection", - "git_url": "https://github.com/runwhen-contrib/rw-generic-codecollection", - "owner": "RunWhen", - "owner_icon": "https://assets-global.website-files.com/64f9646ad0f39e9ee5c116c4/659f80c7391d64a0ec2a840e_icon_rw-platform.svg", - "owner_email": "shea.stewart@runwhen.com", - "description": "Run Generic CLI Commands with User Input" - }, - { - "name": "RunWhen Workspace Utilities CodeCollection", - "slug": "rw-workspace-utils", - "git_url": "https://github.com/runwhen-contrib/rw-workspace-utils", - "owner": "RunWhen", - "owner_icon": "https://assets-global.website-files.com/64f9646ad0f39e9ee5c116c4/659f80c7391d64a0ec2a840e_icon_rw-platform.svg", - "owner_email": "shea.stewart@runwhen.com", - "description": "Workspace Utilities such as Webhook integrations" - }, - { - "name": "AWS CloudCustodian CodeCollection", - "slug":
"aws-c7n-codecollection", - "git_url": "https://github.com/runwhen-contrib/aws-c7n-codecollection", - "owner": "RunWhen", - "owner_icon": "https://assets-global.website-files.com/64f9646ad0f39e9ee5c116c4/659f80c7391d64a0ec2a840e_icon_rw-platform.svg", - "owner_email": "shea.stewart@runwhen.com", - "description": "AWS CloudCustodian CodeBundles" - }, - { - "name": "Azure CloudCustodian CodeCollection", - "slug": "azure-c7n-codecollection", - "git_url": "https://github.com/runwhen-contrib/azure-c7n-codecollection", - "owner": "RunWhen", - "owner_icon": "https://assets-global.website-files.com/64f9646ad0f39e9ee5c116c4/659f80c7391d64a0ec2a840e_icon_rw-platform.svg", - "owner_email": "shea.stewart@runwhen.com", - "description": "Azure CloudCustodian CodeBundles" - }, - { - "name": "Ternary CodeCollection", - "slug": "ternary-codecollection", - "git_url": "https://github.com/runwhen-contrib/ternary-codecollection", - "owner": "RunWhen", - "owner_icon": "https://assets-global.website-files.com/64f9646ad0f39e9ee5c116c4/659f80c7391d64a0ec2a840e_icon_rw-platform.svg", - "owner_email": "shea.stewart@runwhen.com", - "description": "Ternary CodeBundles" - } - ] -} \ No newline at end of file diff --git a/mcp-server/data/libraries.json b/mcp-server/data/libraries.json deleted file mode 100644 index 2c36d08aeeda..000000000000 --- a/mcp-server/data/libraries.json +++ /dev/null @@ -1,2661 +0,0 @@ -{ - "libraries": [ - { - "name": "Jira", - "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/Jira.py", - "collection_slug": "rw-public-codecollection", - "description": "Jira keyword library\n\nScope: Global", - "functions": [], - "classes": [ - { - "name": "Jira", - "docstring": "Jira is a keyword library for integrating with the Jira system.\nYou need to provide a Jira server URL, a Jira User, and a Jira User Token\nto use this library.\nThe first step is to authenticate using `Connect To Jira`.", - "methods": "connect_to_jira, create_issue, get_issue, assign_issue, search_issues" - } - ], - "keywords": [], - "category": "general", - "import_path": "Jira", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/Jira.py" - }, - { - "name": "Slack", - "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/Slack.py", - "collection_slug": "rw-public-codecollection", - "description": "Slack keyword library\n\nScope: Global", - "functions": [], - "classes": [ - { - "name": "Slack", - "docstring": "Slack keyword library can be used to send messages to Slack.", - "methods": "post_message" - } - ], - "keywords": [], - "category": "general", - "import_path": "Slack", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/Slack.py" - }, - { - "name": "Grafana", - "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/Grafana.py", - "collection_slug": "rw-public-codecollection", - "description": "Grafana keyword library\n\nScope: Global", - "functions": [], - "classes": [ - { - "name": "Grafana", - "docstring": "Grafana is a keyword library for integrating with the Grafana Dashboard.\nYou need to provide a Grafana URL and a Grafana API Key to use\nthis library.\nThe first step is to authenticate using `Grafana Create Session`.", - "methods": "grafana_create_session, grafana_close_session, get_health_status" - } - ], - "keywords": [], - "category": "general", - "import_path": "Grafana", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/Grafana.py" - }, - { 
- "name": "Opsgenie", - "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/Opsgenie.py", - "collection_slug": "rw-public-codecollection", - "description": "Opsgenie keyword library\n\nScope: Global", - "functions": [], - "classes": [ - { - "name": "Opsgenie", - "docstring": "Opsgenie is a keyword library for integrating with the Opsgenie services.\nIt can be used to send alerts to Opsgenie.", - "methods": "connect_to_opsgenie, get_info, create_alert" - } - ], - "keywords": [], - "category": "general", - "import_path": "Opsgenie", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/Opsgenie.py" - }, - { - "name": "WebInspector", - "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/WebInspector.py", - "collection_slug": "rw-public-codecollection", - "description": "WebInspector keyword library\n\nScope: Global", - "functions": [], - "classes": [ - { - "name": "WebInspector", - "docstring": "The WebInspector keyword library is a set of functions that can be used to diagnose site issues.", - "methods": "verify_egress, get_certificate, get_latency_measurements, get_dns_info, get_cert_valid_from, get_cert_valid_until, inspect_url" - } - ], - "keywords": [], - "category": "general", - "import_path": "WebInspector", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/WebInspector.py" - }, - { - "name": "MSTeams", - "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/MSTeams.py", - "collection_slug": "rw-public-codecollection", - "description": "MS Teams keyword library\n\nScope: Global", - "functions": [], - "classes": [ - { - "name": "MSTeams", - "docstring": "MS Teams keyword library can be used to send alerts/notifications\nto a channel in Teams.\n\n* You need to define a team in Microsoft 365, then this team will show up\n in MS Teams.\n* In MS Teams, select the team and create a channel for it.\n* In the channel, set up a Connector and choose Incoming Webhook.\n* After configuring the Incoming Webhook, you'll get a Webhook URL\n which can be used by pymsteams to send a message to the channel.\n\nSee https://github.com/rveachkc/pymsteams for more informati", - "methods": "send_message" - } - ], - "keywords": [], - "category": "general", - "import_path": "MSTeams", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/MSTeams.py" - }, - { - "name": "GitLab", - "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/GitLab.py", - "collection_slug": "rw-public-codecollection", - "description": "GitLab keyword library\n\nScope: Global", - "functions": [], - "classes": [ - { - "name": "GitLab", - "docstring": "GitLab is a keyword library for integrating with the GitLab system.\nYou need to provide a GitLab URL and a GitLab API Token to use\nthis library.\nThe first step is to authenticate using `Create Session`.", - "methods": "create_session, get_projects" - } - ], - "keywords": [], - "category": "general", - "import_path": "GitLab", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/GitLab.py" - }, - { - "name": "PagerDuty", - "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/PagerDuty.py", - "collection_slug": "rw-public-codecollection", - "description": "PagerDuty keyword library\n\nScope: Global", - "functions": [], - "classes": [ - { - "name": "PagerDuty", - "docstring": "PagerDuty keyword library can be used to create new incident in 
PagerDuty.", - "methods": "set_api_token, get_user_id, get_service_id, create_incident, create_incident_and_assign_user" - } - ], - "keywords": [], - "category": "general", - "import_path": "PagerDuty", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/PagerDuty.py" - }, - { - "name": "HTTP", - "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/HTTP.py", - "collection_slug": "rw-public-codecollection", - "description": "HTTP keyword library\n\nScope: Global", - "functions": [], - "classes": [ - { - "name": "HTTP", - "docstring": "HTTP keyword library defines HTTP/REST-related keywords.", - "methods": "create_session, create_authenticated_session, close_session, update_session_headers, get_session_headers, get, post, put, patch, delete" - } - ], - "keywords": [], - "category": "general", - "import_path": "HTTP", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/HTTP.py" - }, - { - "name": "Kubectl", - "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/Kubectl.py", - "collection_slug": "rw-public-codecollection", - "description": "Kubectl keyword library\n\nScope: Global", - "functions": [], - "classes": [ - { - "name": "Kubectl", - "docstring": "Kubectl keyword library can be used to interact with Kubernetes clusters via kubectl location service.", - "methods": "set_kubeconfig, kubectl, stdout_to_lists, get_kubectl_list_column, remove_units" - } - ], - "keywords": [], - "category": "kubernetes", - "import_path": "Kubectl", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/Kubectl.py" - }, - { - "name": "Remote", - "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/Remote.py", - "collection_slug": "rw-public-codecollection", - "description": "", - "functions": [], - "classes": [ - { - "name": "RemoteException", - "docstring": "", - "methods": "" - }, - { - "name": "Remote", - "docstring": "", - "methods": "hello_world_message, use_remoter, remote_run, remote_check, remote_logs, kub_get_current_namespace" - } - ], - "keywords": [], - "category": "general", - "import_path": "Remote", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/Remote.py" - }, - { - "name": "Rocketchat", - "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/Rocketchat.py", - "collection_slug": "rw-public-codecollection", - "description": "Rocketchat keyword library\nScope: Global", - "functions": [], - "classes": [ - { - "name": "Rocketchat", - "docstring": "Rocketchat keyword library can be used to send messages to Rocketchat channels.", - "methods": "send_message, incoming_webhook" - } - ], - "keywords": [], - "category": "general", - "import_path": "Rocketchat", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/Rocketchat.py" - }, - { - "name": "Pingdom", - "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/Pingdom.py", - "collection_slug": "rw-public-codecollection", - "description": "Pingdom keyword library\n\nScope: Global", - "functions": [], - "classes": [ - { - "name": "Pingdom", - "docstring": "Pingdom keyword library", - "methods": "get_health_status" - } - ], - "keywords": [], - "category": "general", - "import_path": "Pingdom", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/Pingdom.py" - }, - { - "name": "MyTest", - "module_path": 
"/app/data/repos/rw-public-codecollection/libraries/RW/MyTest.py", - "collection_slug": "rw-public-codecollection", - "description": "MyTest keyword library\n\nScope: Global", - "functions": [], - "classes": [ - { - "name": "MyTest", - "docstring": "MyTest keyword library is used for internal testing.", - "methods": "my_test_kw" - } - ], - "keywords": [], - "category": "general", - "import_path": "MyTest", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/MyTest.py" - }, - { - "name": "DNS", - "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/DNS.py", - "collection_slug": "rw-public-codecollection", - "description": "DNS keyword library\n\nScope: Global", - "functions": [], - "classes": [ - { - "name": "DNS", - "docstring": "DNS keyword library", - "methods": "lookup, lookup_latency_in_seconds, lookup_latency_in_milliseconds" - } - ], - "keywords": [], - "category": "general", - "import_path": "DNS", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/DNS.py" - }, - { - "name": "restclient", - "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/restclient.py", - "collection_slug": "rw-public-codecollection", - "description": "Simple REST client", - "functions": [ - { - "name": "create_session", - "signature": "def create_session(headers: Union[str, object, None]) -> object", - "docstring": "" - }, - { - "name": "close_session", - "signature": "def close_session(session) -> None", - "docstring": "" - }, - { - "name": "update_session_headers", - "signature": "def update_session_headers(session: object, headers: Union[str, object]) -> object", - "docstring": "" - }, - { - "name": "get_session_headers", - "signature": "def get_session_headers(session: object) -> object", - "docstring": "" - } - ], - "classes": [ - { - "name": "RestClient", - "docstring": "REST client based on requests library.", - "methods": "base_url, get, post, put, patch, delete" - } - ], - "keywords": [], - "category": "general", - "import_path": "restclient", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/restclient.py" - }, - { - "name": "Elasticsearch", - "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/Elasticsearch.py", - "collection_slug": "rw-public-codecollection", - "description": "Elasticsearch keyword library\n\nScope: Global", - "functions": [], - "classes": [ - { - "name": "Elasticsearch", - "docstring": "Elasticsearch is a keyword library for integrating with the Elasticsearch\nsearch engine.\nAt this time, basic authentication is done by passing the username/password\nin the URL.", - "methods": "get_health_status, get_shard_health_status" - } - ], - "keywords": [], - "category": "general", - "import_path": "Elasticsearch", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/Elasticsearch.py" - }, - { - "name": "papi", - "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/RunWhen/papi.py", - "collection_slug": "rw-public-codecollection", - "description": "RunWhen PAPI keyword library\n\nScope: Global", - "functions": [], - "classes": [ - { - "name": "Papi", - "docstring": "Papi is a keyword library that integrates with the RunWhen Public API.", - "methods": "get_workspaces, get_slxs, get_slx_metrics, get_slis, get_sli, get_sli_recent, get_all_recents_in_all_workspaces, get_all_recents_in_workspace, validate_recent_results, validate_all_workspace_recent_results, 
get_runsessions, get_runsession, get_runrequest_report, get_runsession_report, get_runsession_url, get_runsession_info, request_taskset" - } - ], - "keywords": [], - "category": "general", - "import_path": "papi", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/papi.py" - }, - { - "name": "argocd", - "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/ArgoCD/argocd.py", - "collection_slug": "rw-public-codecollection", - "description": "Argocd keyword library\n\nScope: Global", - "functions": [], - "classes": [ - { - "name": "ArgoCD", - "docstring": "ArgoCD keyword library", - "methods": "health_check" - } - ], - "keywords": [], - "category": "general", - "import_path": "argocd", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/argocd.py" - }, - { - "name": "Discord", - "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/Discord/Discord.py", - "collection_slug": "rw-public-codecollection", - "description": "", - "functions": [], - "classes": [ - { - "name": "Discord", - "docstring": "Discord integration to send messages via webhook to channels.", - "methods": "send_message" - } - ], - "keywords": [], - "category": "general", - "import_path": "Discord", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/Discord.py" - }, - { - "name": "Sysdig", - "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/Sysdig/Sysdig.py", - "collection_slug": "rw-public-codecollection", - "description": "Sysdig keyword library\n\nScope: Global", - "functions": [], - "classes": [ - { - "name": "Sysdig", - "docstring": "Sysdig is a keyword library for integrating with the Sysdig Secure and Monitor products.\n\nNote: Only Sysdig Monitor product is supported at this time.\n\nYou need to provide a Sysdig region URL and a Sysdig Monitor API Token to use\nthis library.", - "methods": "get_metrics_dict, get_metrics_list, get_metric_data, promql_query, transform_data" - } - ], - "keywords": [], - "category": "general", - "import_path": "Sysdig", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/Sysdig.py" - }, - { - "name": "datadog", - "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/Datadog/datadog.py", - "collection_slug": "rw-public-codecollection", - "description": "Datadog keyword library\n\nScope: Global", - "functions": [], - "classes": [ - { - "name": "Datadog", - "docstring": "Datadog is a keyword library for integrating with Datadog product.\n\nYou need to provide a Datadog API Key and a Datadog App Key to use\nthis library.", - "methods": "handle_timeseries_data, metric_query" - } - ], - "keywords": [], - "category": "general", - "import_path": "datadog", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/datadog.py" - }, - { - "name": "pdb_tasks_mixin", - "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/K8s/pdb_tasks_mixin.py", - "collection_slug": "rw-public-codecollection", - "description": "", - "functions": [], - "classes": [ - { - "name": "PdbTasksMixin", - "docstring": "", - "methods": "check_pdb, format_pdb_report" - } - ], - "keywords": [], - "category": "kubernetes", - "import_path": "pdb_tasks_mixin", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/pdb_tasks_mixin.py" - }, - { - "name": "deployment_tasks_mixin", - "module_path": 
"/app/data/repos/rw-public-codecollection/libraries/RW/K8s/deployment_tasks_mixin.py", - "collection_slug": "rw-public-codecollection", - "description": "", - "functions": [], - "classes": [ - { - "name": "DeploymentTasksMixin", - "docstring": "", - "methods": "get_available_replicas, get_desired_replicas, has_hpa, troubleshoot_deployment, check_resources, format_resources_report" - } - ], - "keywords": [], - "category": "kubernetes", - "import_path": "deployment_tasks_mixin", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/deployment_tasks_mixin.py" - }, - { - "name": "k8s_connection", - "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/K8s/k8s_connection.py", - "collection_slug": "rw-public-codecollection", - "description": "", - "functions": [], - "classes": [ - { - "name": "K8sConnection", - "docstring": "Static class that is used to provide other classes in the K8s keyword library\nwith a standardized mode of communicating with Kubernetes Clusters.", - "methods": "clear_shell_history, pop_shell_history, get_shell_history, get_last_shell_command, get_binary_name, shell, template_workload, template_shell, loop_template_shell" - } - ], - "keywords": [], - "category": "kubernetes", - "import_path": "k8s_connection", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/k8s_connection.py" - }, - { - "name": "k8sutils", - "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/K8s/k8sutils.py", - "collection_slug": "rw-public-codecollection", - "description": "K8s util library for k8s specific formatting and other tools.\n\nScope: Global", - "functions": [], - "classes": [ - { - "name": "K8sUtils", - "docstring": "K8s helper functions.", - "methods": "convert_to_metric, convert_age_to_search_time, jmespath_namespace_search_string" - } - ], - "keywords": [], - "category": "kubernetes", - "import_path": "k8sutils", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/k8sutils.py" - }, - { - "name": "network_tasks_mixin", - "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/K8s/network_tasks_mixin.py", - "collection_slug": "rw-public-codecollection", - "description": "", - "functions": [], - "classes": [ - { - "name": "NetworkTasksMixin", - "docstring": "", - "methods": "check_networking, format_networking_report" - } - ], - "keywords": [], - "category": "kubernetes", - "import_path": "network_tasks_mixin", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/network_tasks_mixin.py" - }, - { - "name": "pod_tasks_mixin", - "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/K8s/pod_tasks_mixin.py", - "collection_slug": "rw-public-codecollection", - "description": "", - "functions": [], - "classes": [ - { - "name": "PodTasksMixin", - "docstring": "", - "methods": "get_pod_names_with_logs, check_pods, format_pods_report" - } - ], - "keywords": [], - "category": "kubernetes", - "import_path": "pod_tasks_mixin", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/pod_tasks_mixin.py" - }, - { - "name": "job_tasks_mixin", - "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/K8s/job_tasks_mixin.py", - "collection_slug": "rw-public-codecollection", - "description": "", - "functions": [], - "classes": [ - { - "name": "JobTasksMixin", - "docstring": "", - "methods": "job_successful, wait_until_job_successful" 
- } - ], - "keywords": [], - "category": "kubernetes", - "import_path": "job_tasks_mixin", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/job_tasks_mixin.py" - }, - { - "name": "daemonset_tasks_mixin", - "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/K8s/daemonset_tasks_mixin.py", - "collection_slug": "rw-public-codecollection", - "description": "", - "functions": [], - "classes": [ - { - "name": "DaemonsetTasksMixin", - "docstring": "", - "methods": "healthcheck_daemonset" - } - ], - "keywords": [], - "category": "kubernetes", - "import_path": "daemonset_tasks_mixin", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/daemonset_tasks_mixin.py" - }, - { - "name": "event_tasks_mixin", - "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/K8s/event_tasks_mixin.py", - "collection_slug": "rw-public-codecollection", - "description": "", - "functions": [], - "classes": [ - { - "name": "EventTasksMixin", - "docstring": "", - "methods": "get_involved_object_name_list, check_events, format_events_report" - } - ], - "keywords": [], - "category": "kubernetes", - "import_path": "event_tasks_mixin", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/event_tasks_mixin.py" - }, - { - "name": "k8s", - "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/K8s/k8s.py", - "collection_slug": "rw-public-codecollection", - "description": "K8s keyword library, version 2, based on shellservice base.\n\nScope: Global", - "functions": [], - "classes": [ - { - "name": "K8s", - "docstring": "K8s keyword library can be used to interact with Kubernetes clusters.", - "methods": "compose_kubectl_cmd" - } - ], - "keywords": [], - "category": "kubernetes", - "import_path": "k8s", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/k8s.py" - }, - { - "name": "pvc_tasks_mixin", - "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/K8s/pvc_tasks_mixin.py", - "collection_slug": "rw-public-codecollection", - "description": "", - "functions": [], - "classes": [ - { - "name": "PvcTasksMixin", - "docstring": "", - "methods": "check_pvc, format_pvc_report" - } - ], - "keywords": [], - "category": "kubernetes", - "import_path": "pvc_tasks_mixin", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/pvc_tasks_mixin.py" - }, - { - "name": "statefulset_tasks_mixin", - "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/K8s/statefulset_tasks_mixin.py", - "collection_slug": "rw-public-codecollection", - "description": "", - "functions": [], - "classes": [ - { - "name": "StatefuletTasksMixin", - "docstring": "", - "methods": "stateful_sets_ready" - } - ], - "keywords": [], - "category": "kubernetes", - "import_path": "statefulset_tasks_mixin", - "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/statefulset_tasks_mixin.py" - }, - { - "name": "namespace_tasks_mixin", - "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/K8s/namespace_tasks_mixin.py", - "collection_slug": "rw-public-codecollection", - "description": "", - "functions": [], - "classes": [ - { - "name": "NamespaceTasksMixin", - "docstring": "", - "methods": "get_object_names, get_objects_by_name, search_namespace_objects_for_string, check_namespace_objects, check_namespace_errors, get_event_count, 
count_events_by_age_and_type, count_container_restarts_by_age, count_notready_pods, get_custom_resources, describe_custom_resources, fetch_pod_logs_and_events_by_label, fetch_pod_names_by_label, fetch_pod_resource_utilization_by_label, triage_namespace, trace_namespace_errors, object_condition_check"
-      }
-    ],
-    "keywords": [],
-    "category": "kubernetes",
-    "import_path": "namespace_tasks_mixin",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/namespace_tasks_mixin.py"
-  },
-  {
-    "name": "__init__",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/Chat/__init__.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "RunWhen Chat keyword library\n\nScope: Global",
-    "functions": [],
-    "classes": [
-      {
-        "name": "Chat",
-        "docstring": "RunWhen Chat keyword library for integrating with various chat systems like Slack and Discord.",
-        "methods": "send_message"
-      }
-    ],
-    "keywords": [],
-    "category": "general",
-    "import_path": "__init__",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/__init__.py"
-  },
-  {
-    "name": "ChatProviderStrategy",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/Chat/strategies/ChatProviderStrategy.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "",
-    "functions": [],
-    "classes": [
-      {
-        "name": "ChatProviderStrategy",
-        "docstring": "",
-        "methods": "send_message"
-      }
-    ],
-    "keywords": [],
-    "category": "general",
-    "import_path": "ChatProviderStrategy",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/ChatProviderStrategy.py"
-  },
-  {
-    "name": "GoogleChatProviderStrategy",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/Chat/strategies/GoogleChatProviderStrategy.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "",
-    "functions": [],
-    "classes": [
-      {
-        "name": "GoogleChatProviderStrategy",
-        "docstring": "",
-        "methods": "send_message"
-      }
-    ],
-    "keywords": [],
-    "category": "general",
-    "import_path": "GoogleChatProviderStrategy",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/GoogleChatProviderStrategy.py"
-  },
-  {
-    "name": "RocketChatProviderStrategy",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/Chat/strategies/RocketChatProviderStrategy.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "",
-    "functions": [],
-    "classes": [
-      {
-        "name": "RocketChatProviderStrategy",
-        "docstring": "",
-        "methods": "send_message"
-      }
-    ],
-    "keywords": [],
-    "category": "general",
-    "import_path": "RocketChatProviderStrategy",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/RocketChatProviderStrategy.py"
-  },
-  {
-    "name": "DiscordChatProviderStrategy",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/Chat/strategies/DiscordChatProviderStrategy.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "",
-    "functions": [],
-    "classes": [
-      {
-        "name": "DiscordChatProviderStrategy",
-        "docstring": "",
-        "methods": "send_message"
-      }
-    ],
-    "keywords": [],
-    "category": "general",
-    "import_path": "DiscordChatProviderStrategy",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/DiscordChatProviderStrategy.py"
-  },
-  {
-    "name": "SlackChatProviderStrategy",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/Chat/strategies/SlackChatProviderStrategy.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "",
-    "functions": [],
-    "classes": [
-      {
-        "name": "SlackChatProviderStrategy",
-        "docstring": "",
-        "methods": "send_message"
-      }
-    ],
-    "keywords": [],
-    "category": "general",
-    "import_path": "SlackChatProviderStrategy",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/SlackChatProviderStrategy.py"
-  },
-  {
-    "name": "Curl",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/Curl/Curl.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "",
-    "functions": [],
-    "classes": [
-      {
-        "name": "Curl",
-        "docstring": "A keyword library for housing general-purpose Curl keywords.",
-        "methods": "run_curl"
-      }
-    ],
-    "keywords": [],
-    "category": "general",
-    "import_path": "Curl",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/Curl.py"
-  },
-  {
-    "name": "SocialScrape",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/SocialScrape/SocialScrape.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "SocialScrape keyword library\nBased on snscrape https://github.com/JustAnotherArchivist/snscrape\n \nScope: Global",
-    "functions": [],
-    "classes": [
-      {
-        "name": "SocialScrape",
-        "docstring": "Twitter Scraper keyword library\nUses https://github.com/JustAnotherArchivist/snscrape",
-        "methods": "twitter_scrape_handle"
-      }
-    ],
-    "keywords": [],
-    "category": "general",
-    "import_path": "SocialScrape",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/SocialScrape.py"
-  },
-  {
-    "name": "Check",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/Utils/Check.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "",
-    "functions": [],
-    "classes": [
-      {
-        "name": "Check",
-        "docstring": "",
-        "methods": ""
-      }
-    ],
-    "keywords": [],
-    "category": "general",
-    "import_path": "Check",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/Check.py"
-  },
-  {
-    "name": "RWUtils",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/Utils/RWUtils.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "RWUtils keyword library.\n\nThis exposes some of the rw.utils functions (python interfaces) to\nrobot authors as robot interfaces.\n\nScope: Global",
-    "functions": [],
-    "classes": [
-      {
-        "name": "RWUtils",
-        "docstring": "Utility keyword library for useful bits and bobs.",
-        "methods": "prettify, is_string, is_integer, is_boolean, to_json, string_to_json, search_json, json_to_metric, from_json, to_boolean, to_integer, parse_url, get_hostname_from_url, get_port_from_url, get_protocol_from_url, get_path_from_url, get_params_from_url, get_query_string_from_url, generate_random_integer, encode_url"
-      }
-    ],
-    "keywords": [],
-    "category": "general",
-    "import_path": "RWUtils",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/RWUtils.py"
-  },
-  {
-    "name": "utils",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/Utils/utils.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "rw.utils defines some common functions available to Library/Keyword\nauthors as python interfaces. Some of these are also exposed as\nRobot Keywords via RW.Utils.",
-    "functions": [
-      {
-        "name": "is_bytes",
-        "signature": "def is_bytes(val) -> bool",
-        "docstring": ""
-      },
-      {
-        "name": "is_str",
-        "signature": "def is_str(val) -> bool",
-        "docstring": ""
-      },
-      {
-        "name": "is_str_or_bytes",
-        "signature": "def is_str_or_bytes(val) -> bool",
-        "docstring": ""
-      },
-      {
-        "name": "is_int",
-        "signature": "def is_int(val) -> bool",
-        "docstring": ""
-      },
-      {
-        "name": "is_float",
-        "signature": "def is_float(val) -> bool",
-        "docstring": ""
-      },
-      {
-        "name": "is_bool",
-        "signature": "def is_bool(val) -> bool",
-        "docstring": ""
-      },
-      {
-        "name": "is_scalar",
-        "signature": "def is_scalar(val) -> bool",
-        "docstring": ""
-      },
-      {
-        "name": "is_list",
-        "signature": "def is_list(val) -> bool",
-        "docstring": ""
-      },
-      {
-        "name": "is_dict",
-        "signature": "def is_dict(val) -> bool",
-        "docstring": ""
-      },
-      {
-        "name": "is_xml",
-        "signature": "def is_xml(val) -> bool",
-        "docstring": ""
-      },
-      {
-        "name": "is_yaml",
-        "signature": "def is_yaml(val) -> bool",
-        "docstring": ""
-      },
-      {
-        "name": "is_json",
-        "signature": "def is_json(val, strict: bool) -> bool",
-        "docstring": ""
-      },
-      {
-        "name": "from_json",
-        "signature": "def from_json(json_str, strict: bool) -> object",
-        "docstring": ""
-      },
-      {
-        "name": "to_json",
-        "signature": "def to_json(data: object) -> str",
-        "docstring": ""
-      },
-      {
-        "name": "string_to_json",
-        "signature": "def string_to_json(data: str) -> str",
-        "docstring": ""
-      },
-      {
-        "name": "search_json",
-        "signature": "def search_json(data: dict, pattern: str) -> dict",
-        "docstring": ""
-      },
-      {
-        "name": "json_to_metric",
-        "signature": "def json_to_metric(data: str, search_filter: str, calculation_field: str, calculation: str) -> float",
-        "docstring": "Takes in a json data result from kubectl and calculation parameters to return a single float metric.\nAssumes that the return is a \"list\" type and automatically searches through the \"items\" list, along with\nother search filters provided buy the user (using jmespath search).\n\nArgs:\n :data str: JSON data to search through.\n :search_filter str: A jmespah filter used to help filter search results. See https://jmespath.org/? to test search strings.\n :calculation_field str: The field from the "
-      },
-      {
-        "name": "from_yaml",
-        "signature": "def from_yaml(yaml_str) -> object",
-        "docstring": ""
-      },
-      {
-        "name": "to_yaml",
-        "signature": "def to_yaml(data: object) -> str",
-        "docstring": ""
-      },
-      {
-        "name": "to_str",
-        "signature": "def to_str(v) -> str",
-        "docstring": ""
-      },
-      {
-        "name": "to_bool",
-        "signature": "def to_bool(v) -> bool",
-        "docstring": "Convert the input parameter into a boolean value."
-      },
-      {
-        "name": "to_int",
-        "signature": "def to_int(v) -> Union[int, list[int]]",
-        "docstring": "Convert the input parameter, which may be a scalar or a list, into\ninteger value(s)."
-      },
-      {
-        "name": "to_float",
-        "signature": "def to_float(v) -> Union[float, list[float]]",
-        "docstring": "Convert the input parameter, which may be a scalar or a list, into\nfloat value(s)."
-      },
-      {
-        "name": "prettify",
-        "signature": "def prettify(data) -> str",
-        "docstring": ""
-      },
-      {
-        "name": "latency",
-        "signature": "def latency(func)",
-        "docstring": ""
-      },
-      {
-        "name": "parse_url",
-        "signature": "def parse_url(url: str, verbose: bool) -> Union[str, int]",
-        "docstring": ""
-      },
-      {
-        "name": "encode_url",
-        "signature": "def encode_url(hostname: str, params: dict, verbose: bool) -> str",
-        "docstring": ""
-      },
-      {
-        "name": "parse_numerical",
-        "signature": "def parse_numerical(numeric_str: str)",
-        "docstring": ""
-      },
-      {
-        "name": "parse_timedelta",
-        "signature": "def parse_timedelta(timestring: str) -> datetime.timedelta",
-        "docstring": ""
-      },
-      {
-        "name": "stdout_to_list",
-        "signature": "def stdout_to_list(stdout: str, delimiter: str)",
-        "docstring": ""
-      },
-      {
-        "name": "stdout_to_grid",
-        "signature": "def stdout_to_grid(stdout)",
-        "docstring": ""
-      },
-      {
-        "name": "get_stdout_grid_column",
-        "signature": "def get_stdout_grid_column(stdout_grid, index: int)",
-        "docstring": "Helper function to return a column as a list from the stdout lists of a kubectl command"
-      },
-      {
-        "name": "remove_units",
-        "signature": "def remove_units(data_points)",
-        "docstring": "Iterates over list and removes units"
-      },
-      {
-        "name": "aggregate",
-        "signature": "def aggregate(method: str, column: list)",
-        "docstring": ""
-      },
-      {
-        "name": "yaml_to_dict",
-        "signature": "def yaml_to_dict(yaml_str: str)",
-        "docstring": ""
-      },
-      {
-        "name": "dict_to_yaml",
-        "signature": "def dict_to_yaml(data: Union[dict, benedict])",
-        "docstring": ""
-      },
-      {
-        "name": "list_to_string",
-        "signature": "def list_to_string(data_list: list, join_with: str) -> str",
-        "docstring": ""
-      },
-      {
-        "name": "string_if_else",
-        "signature": "def string_if_else(check_boolean: bool, if_str: str, else_str) -> str",
-        "docstring": ""
-      },
-      {
-        "name": "remove_spaces",
-        "signature": "def remove_spaces(initial_str: str, remove: list[str]) -> str",
-        "docstring": ""
-      },
-      {
-        "name": "csv_to_list",
-        "signature": "def csv_to_list(csv_str: str, strip_entries: bool) -> list",
-        "docstring": ""
-      },
-      {
-        "name": "lists_to_dict",
-        "signature": "def lists_to_dict(keys: list, values: list) -> dict",
-        "docstring": ""
-      },
-      {
-        "name": "templated_string_list",
-        "signature": "def templated_string_list(template_string: str, values: list, key_name) -> list",
-        "docstring": ""
-      },
-      {
-        "name": "create_secrets_list",
-        "signature": "def create_secrets_list() -> ...",
-        "docstring": ""
-      },
-      {
-        "name": "get_source_dir",
-        "signature": "def get_source_dir() -> str",
-        "docstring": ""
-      },
-      {
-        "name": "create_secret",
-        "signature": "def create_secret(key: str, val: Any) -> platform.Secret",
-        "docstring": ""
-      },
-      {
-        "name": "merge_json_secrets",
-        "signature": "def merge_json_secrets() -> platform.Secret",
-        "docstring": ""
-      },
-      {
-        "name": "secret_to_curl_headers",
-        "signature": "def secret_to_curl_headers(optional_headers: platform.Secret, default_headers: str) -> platform.Secret",
-        "docstring": ""
-      },
-      {
-        "name": "create_curl",
-        "signature": "def create_curl(cmd, optional_headers: platform.Secret) -> str",
-        "docstring": "Helper method to generate a curl string equivalent to a Requests object (roughly)\nNote that headers are inserted as a $variable to be substituted in the location service by an environment variable.\nThis is identified by the secret.key"
-      },
-      {
-        "name": "quote_curl",
-        "signature": "def quote_curl(curl: str) -> str",
-        "docstring": "Simple helper method to escape specific characters in complex curl commands\n\nArgs:\n query (str): the curl string to execute\n\nReturns:\n str: a curl string with inner \" characters escaped to prevent shell eval issues"
-      },
-      {
-        "name": "rate_of_occurence",
-        "signature": "def rate_of_occurence(data: list, count_value: any, default_value: float, operand: str) -> float",
-        "docstring": ""
-      }
-    ],
-    "classes": [
-      {
-        "name": "Status",
-        "docstring": "",
-        "methods": ""
-      }
-    ],
-    "keywords": [
-      "Json To Metric",
-      "To Bool",
-      "To Int",
-      "To Float",
-      "Get Stdout Grid Column",
-      "Remove Units",
-      "Create Curl",
-      "Quote Curl"
-    ],
-    "category": "general",
-    "import_path": "utils",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/utils.py"
-  },
-  {
-    "name": "rest",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/Rest/rest.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "",
-    "functions": [],
-    "classes": [
-      {
-        "name": "Rest",
-        "docstring": "A keyword library for housing general-purpose REST keywords.\nNote: session was avoided on purpose to reduce state\nTODO: support explicit oauth2 flow",
-        "methods": "request, request_as_secret, handle_response, create_basic_auth, create_basic_auth_secret, create_bearer_token_header"
-      }
-    ],
-    "keywords": [],
-    "category": "general",
-    "import_path": "rest",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/rest.py"
-  },
-  {
-    "name": "Billing",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/AWS/Billing.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "AWS Billing keyword library\n\nScope: Global",
-    "functions": [],
-    "classes": [
-      {
-        "name": "Billing",
-        "docstring": "",
-        "methods": "get_cost_and_usage, get_costs_per_tag, get_cost_metric_from_results, run_report_on_tagged_costs"
-      }
-    ],
-    "keywords": [],
-    "category": "aws",
-    "import_path": "Billing",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/Billing.py"
-  },
-  {
-    "name": "CloudWatch",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/AWS/CloudWatch.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "AWS CloudWatch keyword library\n\nScope: Global",
-    "functions": [],
-    "classes": [
-      {
-        "name": "CloudWatch",
-        "docstring": "CloudWatch is a keyword library for integrating with AWS CloudWatch.",
-        "methods": "metric_query, templated_metric_query, multi_metric_query, get_volume_usages, filter_metric_dict, transform_metric_dict, most_recent_metric_from_results, largest_metric_from_results, smallest_metric_from_results, log_query, get_cloudwatch_metric_insights_url, get_cloudwatch_logs_insights_url, aws_quote_list, aws_quote_dict, encode_aws_params, aws_encode_key, aws_encode_var, aws_quote_logquery_str, aws_quote_metricquery_str, aws_glue_encoded_list"
-      }
-    ],
-    "keywords": [],
-    "category": "aws",
-    "import_path": "CloudWatch",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/CloudWatch.py"
-  },
-  {
-    "name": "EC2",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/AWS/EC2.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "AWS EC2 keyword library\n\nScope: Global",
-    "functions": [],
-    "classes": [
-      {
-        "name": "EC2",
-        "docstring": "",
-        "methods": "get_ec2_instances, get_ec2_instance_ids, get_vpcs, get_subnets, get_route_tables, get_volumes, get_volumes_with_no_attachments, get_volume_ids, get_block_devices_from_instances, get_untagged_instances, get_vpcs_ids_from_instances, get_subnet_ids_from_instances, find_open_routes, filter_dicts_with_list, get_intersections, check_keypath_intersection, get_list_of_values_from_dicts, run_untagged_ec2_checks, run_open_routes_check, run_dangling_volumes_check"
-      }
-    ],
-    "keywords": [],
-    "category": "aws",
-    "import_path": "EC2",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/EC2.py"
-  },
-  {
-    "name": "CloudFormation",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/AWS/CloudFormation.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "AWS CloudFormation keyword library\n\nScope: Global",
-    "functions": [],
-    "classes": [
-      {
-        "name": "CloudFormation",
-        "docstring": "AWS CloudFormation keyword library for integrating with AWS CloudFormation.",
-        "methods": "get_stack_events, get_all_stack_events, get_stack_summaries, filter_stack_events, filter_stack_events_by_status, filter_stack_events_by_time, json_stringify"
-      }
-    ],
-    "keywords": [],
-    "category": "aws",
-    "import_path": "CloudFormation",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/CloudFormation.py"
-  },
-  {
-    "name": "S3",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/AWS/S3.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "AWS S3 keyword library\n\nScope: Global",
-    "functions": [],
-    "classes": [
-      {
-        "name": "S3",
-        "docstring": "",
-        "methods": "get_buckets, get_bucket_objects, get_bucket_last_access_time, get_last_access_time_of_buckets, get_stale_buckets, run_s3_checks"
-      }
-    ],
-    "keywords": [],
-    "category": "aws",
-    "import_path": "S3",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/S3.py"
-  },
-  {
-    "name": "AWSAuthenticationMixin",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/AWS/mixins/AWSAuthenticationMixin.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "",
-    "functions": [],
-    "classes": [
-      {
-        "name": "AWSAuthenticationMixin",
-        "docstring": "Mixin abstract base class for abstracting authentication workflow from AWS keywords. Acts as the context in a strategy pattern.",
-        "methods": "set_aws_keys, set_get_client_strategy, authenticate, get_client"
-      }
-    ],
-    "keywords": [],
-    "category": "aws",
-    "import_path": "AWSAuthenticationMixin",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/AWSAuthenticationMixin.py"
-  },
-  {
-    "name": "test_queries",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/AWS/robot_tests/test_queries.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "",
-    "functions": [],
-    "classes": [],
-    "keywords": [],
-    "category": "aws",
-    "import_path": "test_queries",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/test_queries.py"
-  },
-  {
-    "name": "GetClientStrategy",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/AWS/strategies/GetClientStrategy.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "",
-    "functions": [],
-    "classes": [
-      {
-        "name": "GetClientStrategy",
-        "docstring": "",
-        "methods": "get_client"
-      }
-    ],
-    "keywords": [],
-    "category": "aws",
-    "import_path": "GetClientStrategy",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/GetClientStrategy.py"
-  },
-  {
-    "name": "UserGetClientStrategy",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/AWS/strategies/UserGetClientStrategy.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "",
-    "functions": [],
-    "classes": [
-      {
-        "name": "UserGetClientStrategy",
-        "docstring": "",
-        "methods": "get_client"
-      }
-    ],
-    "keywords": [],
-    "category": "aws",
-    "import_path": "UserGetClientStrategy",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/UserGetClientStrategy.py"
-  },
-  {
-    "name": "RoleGetClientStrategy",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/AWS/strategies/RoleGetClientStrategy.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "",
-    "functions": [],
-    "classes": [
-      {
-        "name": "RoleGetClientStrategy",
-        "docstring": "",
-        "methods": "get_client"
-      }
-    ],
-    "keywords": [],
-    "category": "aws",
-    "import_path": "RoleGetClientStrategy",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/RoleGetClientStrategy.py"
-  },
-  {
-    "name": "cert_manager",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/CertManager/cert_manager.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "cert-manager keyword library, based on shellservice base.\n\nScope: Global",
-    "functions": [],
-    "classes": [
-      {
-        "name": "CertManager",
-        "docstring": "cert-manager keyword library can be used monitor and health check cert-manager resources.",
-        "methods": "get_expiring_certs, get_now, health_check"
-      }
-    ],
-    "keywords": [],
-    "category": "general",
-    "import_path": "cert_manager",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/cert_manager.py"
-  },
-  {
-    "name": "StatusPage",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/Uptime/StatusPage.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "",
-    "functions": [],
-    "classes": [
-      {
-        "name": "StatusPage",
-        "docstring": "Used to fetch and validate data/metrics from a Uptime.com status page and its components.\n\nReturns:\n _type_: None",
-        "methods": "get_component_status, validate_component_status"
-      }
-    ],
-    "keywords": [],
-    "category": "general",
-    "import_path": "StatusPage",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/StatusPage.py"
-  },
-  {
-    "name": "__init__",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/GitHub/__init__.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "GitHub Core keyword library\n\nScope: Global",
-    "functions": [],
-    "classes": [
-      {
-        "name": "GitHub",
-        "docstring": "GitHub keyword library defines keywords for interacting with GitHub\nservices.",
-        "methods": "set_token, get_token, create_issue, get_user, get_repo, get_repos"
-      }
-    ],
-    "keywords": [],
-    "category": "general",
-    "import_path": "__init__",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/__init__.py"
-  },
-  {
-    "name": "Actions",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/GitHub/Actions.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "",
-    "functions": [],
-    "classes": [
-      {
-        "name": "Actions",
-        "docstring": "",
-        "methods": "get_workflow_usage, get_workflow_runs, get_workflow_run_usage, get_workflow_times"
-      }
-    ],
-    "keywords": [],
-    "category": "general",
-    "import_path": "Actions",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/Actions.py"
-  },
-  {
-    "name": "Status",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/GitHub/Status.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "GitHub Status service keyword library\n\nScope: Global",
-    "functions": [],
-    "classes": [
-      {
-        "name": "Status",
-        "docstring": "GitHub Status keyword library",
-        "methods": "get_github_availability, get_unresolved_incidents, get_scheduled_maintenances"
-      }
-    ],
-    "keywords": [],
-    "category": "general",
-    "import_path": "Status",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/Status.py"
-  },
-  {
-    "name": "Artifactory",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/Artifactory/Artifactory.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "",
-    "functions": [],
-    "classes": [
-      {
-        "name": "Artifactory",
-        "docstring": "_summary_\n\nReturns:\n _type_: _description_",
-        "methods": "get_health, validate_health"
-      }
-    ],
-    "keywords": [],
-    "category": "general",
-    "import_path": "Artifactory",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/Artifactory.py"
-  },
-  {
-    "name": "postgres",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/Postgres/postgres.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "Postgres keyword library\n\nScope: Global",
-    "functions": [],
-    "classes": [
-      {
-        "name": "Postgres",
-        "docstring": "Postgres keyword library",
-        "methods": "template_command, template_command_with_file, parse_metric_and_time, quote_query"
-      }
-    ],
-    "keywords": [],
-    "category": "database",
-    "import_path": "postgres",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/postgres.py"
-  },
-  {
-    "name": "OpsSuite",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/GCP/OpsSuite.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "Operations Suite keyword library\n\nScope: Global",
-    "functions": [],
-    "classes": [
-      {
-        "name": "OpsSuite",
-        "docstring": "Operations Suite keyword library",
-        "methods": "authenticate, get_credentials, get_token, get_access_token_header, run_mql, metric_query, get_last_point_in_series_set, average_numeric_across_instances, highest_numeric_across_instances, sum_numeric_across_instances, remove_units, get_gce_logs, get_logs_dashboard_url, add_time_range"
-      }
-    ],
-    "keywords": [],
-    "category": "gcp",
-    "import_path": "OpsSuite",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/OpsSuite.py"
-  },
-  {
-    "name": "ServiceHealth",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/GCP/ServiceHealth.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "Google Cloud Service Health keyword library\n\nScope: Global",
-    "functions": [],
-    "classes": [
-      {
-        "name": "ServiceHealth",
-        "docstring": "Google Cloud Service Health keyword library",
-        "methods": "get_status_json, filter_status_results, filter_history_by_time"
-      }
-    ],
-    "keywords": [],
-    "category": "gcp",
-    "import_path": "ServiceHealth",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/ServiceHealth.py"
-  },
-  {
-    "name": "GCloudCLI",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/GCP/GCloudCLI.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "",
-    "functions": [
-      {
-        "name": "shell",
-        "signature": "def shell(cmd: str, target_service: platform.Service, gcp_credentials_json: platform.Secret, project_id: str) -> any",
-        "docstring": ""
-      }
-    ],
-    "classes": [],
-    "keywords": [],
-    "category": "cli",
-    "import_path": "GCloudCLI",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/GCloudCLI.py"
-  },
-  {
-    "name": "Chat",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/GCP/Chat.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "Google Chat keyword library\n\nScope: Global",
-    "functions": [],
-    "classes": [
-      {
-        "name": "Chat",
-        "docstring": "Google Chat integration to send messages via webhook to channels.\n\nTo allow a channel to receive a webhook follow: https://developers.google.com/chat/how-tos/webhooks",
-        "methods": "send_message"
-      }
-    ],
-    "keywords": [],
-    "category": "gcp",
-    "import_path": "Chat",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/Chat.py"
-  },
-  {
-    "name": "grpcurl",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/gRPC/grpcurl.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "",
-    "functions": [],
-    "classes": [
-      {
-        "name": "gRPCurl",
-        "docstring": "A keyword set for running dynamic gRPC calls against gRPC services using the gRPCurl",
-        "methods": "grpcurl_unary, run_grpcurl"
-      }
-    ],
-    "keywords": [],
-    "category": "general",
-    "import_path": "grpcurl",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/grpcurl.py"
-  },
-  {
-    "name": "patroni",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/Patroni/patroni.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "Patroni keyword library\n\nScope: Global",
-    "functions": [],
-    "classes": [
-      {
-        "name": "Patroni",
-        "docstring": "Patroni keyword library",
-        "methods": "k8s_patroni_state_healthy, k8s_patroni_get_max_lag, k8s_patroni_get_max_lag_member, k8s_patroni_get_laggy_members, k8s_patroni_get_cluster_name, k8s_patroni_template_deletemember"
-      }
-    ],
-    "keywords": [],
-    "category": "general",
-    "import_path": "patroni",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/patroni.py"
-  },
-  {
-    "name": "Vault",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/HashiCorp/Vault.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "HashiCorp Vault keyword library\n\nScope: Global",
-    "functions": [],
-    "classes": [
-      {
-        "name": "Vault",
-        "docstring": "HashiCorp Vault keyword library",
-        "methods": "get_health, check_health"
-      }
-    ],
-    "keywords": [],
-    "category": "general",
-    "import_path": "Vault",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/Vault.py"
-  },
-  {
-    "name": "Prometheus",
-    "module_path": "/app/data/repos/rw-public-codecollection/libraries/RW/Prometheus/Prometheus.py",
-    "collection_slug": "rw-public-codecollection",
-    "description": "",
-    "functions": [],
-    "classes": [
-      {
-        "name": "Prometheus",
-        "docstring": "Keyword Integration for the Prometheus HTTP API which can be used to fetch data from a Prometheus instance.\nImplemented according to https://prometheus.io/docs/prometheus/latest/querying/api/",
-        "methods": "query_instant, query_range, list_labels, query_label, transform_data"
-      }
-    ],
-    "keywords": [],
-    "category": "prometheus",
-    "import_path": "Prometheus",
-    "git_url": "https://github.com/runwhen-contrib/rw-public-codecollection/tree/main/libraries/Prometheus.py"
-  },
-  {
-    "name": "stdout_parser",
-    "module_path": "/app/data/repos/rw-cli-codecollection/libraries/RW/CLI/stdout_parser.py",
-    "collection_slug": "rw-cli-codecollection",
-    "description": "CLI Generic keyword library for running and parsing CLI stdout\n\nScope: Global",
-    "functions": [
-      {
-        "name": "parse_cli_output_by_line",
-        "signature": "def parse_cli_output_by_line(rsp: platform.ShellServiceResponse, lines_like_regexp: str, issue_if_no_capture_groups: bool, set_severity_level: int, set_issue_expected: str, set_issue_actual: str, set_issue_reproduce_hint: str, set_issue_title: str, set_issue_details: str, set_issue_next_steps: str, expected_rsp_statuscodes: list[int], expected_rsp_returncodes: list[int], contains_stderr_ok: bool, raise_issue_if_no_groups_found: bool, raise_issue_from_rsp_code: bool) -> platform.ShellServiceResponse",
-        "docstring": "A parser that executes platform API requests as it traverses the provided stdout by line.\nThis allows authors to 'raise an issue' for a given line in stdout, providing valuable information for troubleshooting.\n\nFor each line traversed, the parser will check the contents using a variety of functions based on the kwargs provided\nwith the following structure:\n\n __raise_issue_=\n\nthe following capture groups are always set:\n- _stdout: the entire stdout conten"
-      }
-    ],
-    "classes": [],
-    "keywords": [
-      "Parse Cli Output By Line"
-    ],
-    "category": "cli",
-    "import_path": "stdout_parser",
-    "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/libraries/stdout_parser.py"
-  },
-  {
-    "name": "cli_utils",
-    "module_path": "/app/data/repos/rw-cli-codecollection/libraries/RW/CLI/cli_utils.py",
-    "collection_slug": "rw-cli-codecollection",
-    "description": "Utility method to verify the ShellServieResponse is in the desired state\nand raise exceptions if not.\n\nArgs:\n rsp (platform.ShellServiceResponse): the rsp to verify\n expected_rsp_statuscodes (list[int], optional): the http response code returned by the process/shell service API, not the same as the bash return code. Defaults to [200].\n expected_rsp_returncodes (list[int], optional): the shell return code. Defaults to [0].\n contains_stderr_ok (bool, optional): if the presence of stder",
-    "functions": [
-      {
-        "name": "verify_rsp",
-        "signature": "def verify_rsp(rsp: platform.ShellServiceResponse, expected_rsp_statuscodes: list[int], expected_rsp_returncodes: list[int], contains_stderr_ok: bool) -> None",
-        "docstring": "Utility method to verify the ShellServieResponse is in the desired state\nand raise exceptions if not.\n\nArgs:\n rsp (platform.ShellServiceResponse): the rsp to verify\n expected_rsp_statuscodes (list[int], optional): the http response code returned by the process/shell service API, not the same as the bash return code. Defaults to [200].\n expected_rsp_returncodes (list[int], optional): the shell return code. Defaults to [0].\n contains_stderr_ok (bool, optional): if the presence of stder"
-      },
-      {
-        "name": "from_json",
-        "signature": "def from_json(json_str: str)",
-        "docstring": "Wrapper keyword for json loads\n\nArgs:\n json_str (str): json string blob\n\nReturns:\n any: the loaded json object"
-      },
-      {
-        "name": "to_json",
-        "signature": "def to_json(json_data: any)",
-        "docstring": "Wrapper keyword for json dumps\n\nArgs:\n json_data (any): json data\n\nReturns:\n str: the str representation of the json blob"
-      },
-      {
-        "name": "filter_by_time",
-        "signature": "def filter_by_time(list_data: list, field_name: str, operand: str, duration_str: str)",
-        "docstring": "Utility keyword to iterate through a list of dictionaries and remove list entries where\nthe specified key datetime is older than the given duration string.\n\nArgs:\n list_data (list): list of dictionaries to filter\n field_name (str): what key to use for comparisons\n operand (str, optional): Defaults to \"filter_older_than\".\n duration_str (str, optional): Duration string in the form of 3d2h1s. Defaults to \"30m\".\n\nReturns:\n _type_: _description_"
-      },
-      {
-        "name": "escape_str_for_exec",
-        "signature": "def escape_str_for_exec(string: str, escapes: int) -> str",
-        "docstring": "Simple helper method to escape specific characters that cause issues in the pod exec passthrough\nArgs:\n string (str): original string for exec passthrough\nReturns:\n str: string with triple escaped quotes for passthrough"
-      }
-    ],
-    "classes": [
-      {
-        "name": "IssueCheckResults",
-        "docstring": "Used to keep function signatures from getting too busy when passing issue data around.",
-        "methods": ""
-      }
-    ],
-    "keywords": [
-      "Verify Rsp",
-      "From Json",
-      "To Json",
-      "Filter By Time",
-      "Escape Str For Exec"
-    ],
-    "category": "cli",
-    "import_path": "cli_utils",
-    "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/libraries/cli_utils.py"
-  },
-  {
-    "name": "postgres_helper",
-    "module_path": "/app/data/repos/rw-cli-codecollection/libraries/RW/CLI/postgres_helper.py",
-    "collection_slug": "rw-cli-codecollection",
-    "description": "",
-    "functions": [
-      {
-        "name": "get_password",
-        "signature": "def get_password(context: str, namespace: str, kubeconfig: platform.Secret, env: dict, labels: str, workload_name: str, container_name: str) -> platform.Secret",
-        "docstring": ""
-      },
-      {
-        "name": "get_user",
-        "signature": "def get_user(context: str, namespace: str, kubeconfig: platform.Secret, env: dict, labels: str, workload_name: str, container_name: str) -> platform.Secret",
-        "docstring": ""
-      },
-      {
-        "name": "get_database",
-        "signature": "def get_database(context: str, namespace: str, kubeconfig: platform.Secret, env: dict, labels: str, workload_name: str, container_name: str) -> str",
-        "docstring": ""
-      },
-      {
-        "name": "k8s_postgres_query",
-        "signature": "def k8s_postgres_query(query: str, context: str, namespace: str, kubeconfig: platform.Secret, binary_name: str, env: dict, labels: str, workload_name: str, container_name: str, database_name: str, opt_flags: str) -> platform.ShellServiceResponse",
-        "docstring": ""
-      }
-    ],
-    "classes": [],
-    "keywords": [],
-    "category": "cli",
-    "import_path": "postgres_helper",
-    "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/libraries/postgres_helper.py"
-  },
-  {
-    "name": "CLI",
-    "module_path": "/app/data/repos/rw-cli-codecollection/libraries/RW/CLI/CLI.py",
-    "collection_slug": "rw-cli-codecollection",
-    "description": "CLI Generic keyword library for running and parsing CLI stdout\n\nScope: Global",
-    "functions": [
-      {
-        "name": "escape_string",
-        "signature": "def escape_string(string)",
-        "docstring": ""
-      },
-      {
-        "name": "escape_bash_command",
-        "signature": "def escape_bash_command(command)",
-        "docstring": "Escapes a command for safe execution in bash."
-      },
-      {
-        "name": "pop_shell_history",
-        "signature": "def pop_shell_history() -> str",
-        "docstring": "Deletes the shell history up to this point and returns it as a string for display.\n\nReturns:\n str: the string of shell command history"
-      },
-      {
-        "name": "execute_command",
-        "signature": "def execute_command(cmd: str, service: platform.Service, request_secrets: list[platform.ShellServiceRequestSecret], env: dict, files: dict, timeout_seconds: int, cwd: str) -> platform.ShellServiceResponse",
-        "docstring": "If 'service' is None, run the command locally via 'execute_local_command'.\nOtherwise, run it via 'platform.execute_shell_command'."
-      },
-      {
-        "name": "find_file",
-        "signature": "def find_file()",
-        "docstring": "Helper function to check if a file exists in the given paths. "
-      },
-      {
-        "name": "resolve_path_to_robot",
-        "signature": "def resolve_path_to_robot()",
-        "docstring": ""
-      },
-      {
-        "name": "run_bash_file",
-        "signature": "def run_bash_file(bash_file: str, target_service: platform.Service, env: dict, include_in_history: bool, cmd_override: str, timeout_seconds: int) -> platform.ShellServiceResponse",
-        "docstring": "Runs a bash file from the local file system or remotely on a shellservice,\nautomatically staging it in CODEBUNDLE_TEMP_DIR if available.\n\n1) Find the local path to `bash_file` (or fallback via resolve_path_to_robot).\n2) Copy that script and all sibling files into CODEBUNDLE_TEMP_DIR (if set),\n or else an ephemeral tmp directory.\n3) Call `execute_command()` to actually run the script from that directory.\n4) If 'service' is provided, run on a remote shell; if not, run locally.\n\nSecrets and envir"
-      },
-      {
-        "name": "run_cli",
-        "signature": "def run_cli(cmd: str, target_service: platform.Service, env: dict, loop_with_items: list, run_in_workload_with_name: str, run_in_workload_with_labels: str, optional_namespace: str, optional_context: str, include_in_history: bool, timeout_seconds: int, debug: bool) -> platform.ShellServiceResponse",
-        "docstring": "Executes a string of shell commands either locally or remotely (if target_service is given).\n- If CODEBUNDLE_TEMP_DIR is set, commands are run from that directory.\n- Preserves the existing logic for:\n * loop_with_items\n * run_in_workload_with_name / run_in_workload_with_labels\n * secrets\n * environment\n * debug/logging"
-      },
-      {
-        "name": "string_to_datetime",
-        "signature": "def string_to_datetime(duration_str: str) -> datetime",
-        "docstring": "Helper to convert readable duration strings (eg: 1d2m36s) to a datetime."
-      }
-    ],
-    "classes": [],
-    "keywords": [
-      "Escape Bash Command",
-      "Pop Shell History",
-      "Execute Command",
-      "Find File",
-      "Run Bash File",
-      "Run Cli",
-      "String To Datetime"
-    ],
-    "category": "cli",
-    "import_path": "CLI",
-    "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/libraries/CLI.py"
-  },
-  {
-    "name": "local_process",
-    "module_path": "/app/data/repos/rw-cli-codecollection/libraries/RW/CLI/local_process.py",
-    "collection_slug": "rw-cli-codecollection",
-    "description": "Runs a local bash command via subprocess, with optional secrets, environment, and file copying.\nInstead of ephemeral mkdtemp usage, we store everything in CODEBUNDLE_TEMP_DIR if set,\notherwise in the current directory, so the files persist after execution.\n\nArgs:\n cmd (str): The command to run (e.g., \"ls -l\").\n request_secrets (list): Secrets to inject either as environment variables or as files.\n env (dict): Additional environment variables for this process.\n files (dict): A dict of",
-    "functions": [
-      {
-        "name": "execute_local_command",
-        "signature": "def execute_local_command(cmd: str, request_secrets: list[platform.ShellServiceRequestSecret], env: dict, files: dict, timeout_seconds: int, cwd: str) -> platform.ShellServiceResponse",
-        "docstring": "Runs a local bash command via subprocess, with optional secrets, environment, and file copying.\nInstead of ephemeral mkdtemp usage, we store everything in CODEBUNDLE_TEMP_DIR if set,\notherwise in the current directory, so the files persist after execution.\n\nArgs:\n cmd (str): The command to run (e.g., \"ls -l\").\n request_secrets (list): Secrets to inject either as environment variables or as files.\n env (dict): Additional environment variables for this process.\n files (dict): A dict of"
-      }
-    ],
-    "classes": [],
-    "keywords": [
-      "Execute Local Command"
-    ],
-    "category": "cli",
-    "import_path": "local_process",
-    "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/libraries/local_process.py"
-  },
-  {
-    "name": "json_parser",
-    "module_path": "/app/data/repos/rw-cli-codecollection/libraries/RW/CLI/json_parser.py",
-    "collection_slug": "rw-cli-codecollection",
-    "description": "Parser for json blob data that can raise issues to the RunWhen platform based on data found.\nQueries can be performed on the data using various kwarg structures with the following syntax:\n\nkwarg syntax:\n- extract_path_to_var__{variable_name}\n- from_var_with_path__{variable1}__to__{variable2}\n- assign_stdout_from_var\n- {variable_name}__raise_issue_if_gt|lt|contains|ncontains|eq|neq\n\nUsing the `__` delimiters to separate values and prefixes.\n\n\nArgs:\n rsp (platform.ShellServiceResponse): _descri",
-    "functions": [
-      {
-        "name": "parse_cli_json_output",
-        "signature": "def parse_cli_json_output(rsp: platform.ShellServiceResponse, set_severity_level: int, set_issue_expected: str, set_issue_actual: str, set_issue_reproduce_hint: str, set_issue_title: str, set_issue_details: str, set_issue_next_steps: str, expected_rsp_statuscodes: list[int], expected_rsp_returncodes: list[int], raise_issue_from_rsp_code: bool, contains_stderr_ok: bool) -> platform.ShellServiceResponse",
-        "docstring": "Parser for json blob data that can raise issues to the RunWhen platform based on data found.\nQueries can be performed on the data using various kwarg structures with the following syntax:\n\nkwarg syntax:\n- extract_path_to_var__{variable_name}\n- from_var_with_path__{variable1}__to__{variable2}\n- assign_stdout_from_var\n- {variable_name}__raise_issue_if_gt|lt|contains|ncontains|eq|neq\n\nUsing the `__` delimiters to separate values and prefixes.\n\n\nArgs:\n rsp (platform.ShellServiceResponse): _descri"
-      }
-    ],
-    "classes": [],
-    "keywords": [
-      "Parse Cli Json Output"
-    ],
-    "category": "cli",
-    "import_path": "json_parser",
-    "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/libraries/json_parser.py"
-  },
-  {
-    "name": "k8s_helper",
-    "module_path": "/app/data/repos/rw-cli-codecollection/libraries/RW/K8sHelper/k8s_helper.py",
-    "collection_slug": "rw-cli-codecollection",
-    "description": "Parse a Kubernetes object JSON for specific annotations or labels and return recommendations.\n\nArgs:\nobj_json (dict): The Kubernetes object JSON.\n\nReturns:\nstr: Recommendations based on the object's annotations or labels.",
-    "functions": [
-      {
-        "name": "get_related_resource_recommendations",
-        "signature": "def get_related_resource_recommendations(k8s_object)",
-        "docstring": "Parse a Kubernetes object JSON for specific annotations or labels and return recommendations.\n\nArgs:\nobj_json (dict): The Kubernetes object JSON.\n\nReturns:\nstr: Recommendations based on the object's annotations or labels."
-      },
-      {
-        "name": "sanitize_messages",
-        "signature": "def sanitize_messages(input_string)",
-        "docstring": "Sanitize the message string by replacing ncharacters that can't be processed into json issue details.\n\nArgs:\n- input_string: The string to be sanitized.\n\nReturns:\n- The sanitized string."
-      }
-    ],
-    "classes": [],
-    "keywords": [
-      "Get Related Resource Recommendations",
-      "Sanitize Messages"
-    ],
-    "category": "general",
-    "import_path": "k8s_helper",
-    "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/libraries/k8s_helper.py"
-  },
-  {
-    "name": "k8s_log",
-    "module_path": "/app/data/repos/rw-cli-codecollection/libraries/RW/K8sLog/k8s_log.py",
-    "collection_slug": "rw-cli-codecollection",
-    "description": "Signal handler for timeout operations.",
-    "functions": [
-      {
-        "name": "timeout_handler",
-        "signature": "def timeout_handler(signum, frame)",
-        "docstring": "Signal handler for timeout operations."
-      }
-    ],
-    "classes": [
-      {
-        "name": "TimeoutError",
-        "docstring": "Custom timeout exception for log scanning operations.",
-        "methods": ""
-      },
-      {
-        "name": "K8sLog",
-        "docstring": "K8s Log Analysis Library\n\nThis library provides keywords for fetching and analyzing Kubernetes workload logs\nfor various error patterns, anomalies, and issues. It supports deployments, \nstatefulsets, and daemonsets.\n\nThe library consolidates multiple log scanning patterns into efficient, reusable\nkeywords that can be used across different codebundles.",
-        "methods": "fetch_workload_logs, scan_logs_for_issues, analyze_log_anomalies, summarize_log_issues, format_scan_results_for_display, calculate_log_health_score, extract_last_termination_timestamp, get_first_false_condition_timestamp, cleanup_temp_files, extract_timestamp_from_line"
-      }
-    ],
-    "keywords": [
-      "Timeout Handler"
-    ],
-    "category": "general",
-    "import_path": "k8s_log",
-    "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/libraries/k8s_log.py"
-  },
-  {
-    "name": "k8s_applications",
-    "module_path": "/app/data/repos/rw-cli-codecollection/libraries/RW/K8sApplications/k8s_applications.py",
-    "collection_slug": "rw-cli-codecollection",
-    "description": "",
-    "functions": [
-      {
-        "name": "format_process_list",
-        "signature": "def format_process_list(proc_list: str) -> list",
-        "docstring": ""
-      },
-      {
-        "name": "serialize_env",
-        "signature": "def serialize_env(printenv: str) -> dict",
-        "docstring": ""
-      },
-      {
-        "name": "test_search",
-        "signature": "def test_search(repo: Repository, exceptions: list[StackTraceData]) -> list[RepositorySearchResult]",
-        "docstring": ""
-      },
-      {
-        "name": "get_test_data",
-        "signature": "def get_test_data()",
-        "docstring": ""
-      },
-      {
-        "name": "stacktrace_report_data",
-        "signature": "def stacktrace_report_data(stacktraces: list[StackTraceData], max_report_stacktraces: int) -> dict",
-        "docstring": ""
-      },
-      {
-        "name": "stacktrace_report",
-        "signature": "def stacktrace_report(stacktraces: list[StackTraceData]) -> str",
-        "docstring": ""
-      },
-      {
-        "name": "parse_django_stacktraces",
-        "signature": "def parse_django_stacktraces(logs: str) -> list[StackTraceData]",
-        "docstring": ""
-      },
-      {
-        "name": "parse_django_json_stacktraces",
-        "signature": "def parse_django_json_stacktraces(logs: str, show_debug: bool) -> list[StackTraceData]",
-        "docstring": ""
-      },
-      {
-        "name": "parse_golang_json_stacktraces",
-        "signature": "def parse_golang_json_stacktraces(logs: str, show_debug: bool) -> list[StackTraceData]",
-        "docstring": ""
-      },
-      {
-        "name": "dynamic_parse_stacktraces",
-        "signature": "def dynamic_parse_stacktraces(logs: str, parser_name: str, parse_mode: str, show_debug: bool) -> list[StackTraceData]",
-        "docstring": "Allows for dynamic parsing of stacktraces based on the first log line\nif no parser name is provided, the first log line will be used to determine the parser to use\nbased on a map lookup of parser types to their respective parsers\n\nArgs:\n logs (str): the log data to parse\n parser_name (str, optional): the name of the parser to lookup for use. Defaults to \"\".\n parse_mode (ParseMode, optional): how to modify the ingested logs, typically we want to split them on newlines. Defaults to ParseM"
-      },
-      {
-        "name": "determine_parser",
-        "signature": "def determine_parser(first_line: str) -> BaseStackTraceParse",
-        "docstring": ""
-      },
-      {
-        "name": "parse_stacktraces",
-        "signature": "def parse_stacktraces(logs: str, parse_mode: ParseMode, parser_override: BaseStackTraceParse, show_debug: bool) -> list[StackTraceData]",
-        "docstring": ""
-      },
-      {
-        "name": "clone_repo",
-        "signature": "def clone_repo(git_uri, git_token, number_of_commits_history: int) -> Repository",
-        "docstring": ""
-      },
-      {
-        "name": "troubleshoot_application",
-        "signature": "def troubleshoot_application(repos: list[Repository], exceptions: list[StackTraceData], env: dict, process_list: list[str], app_name: str) -> dict",
-        "docstring": ""
-      },
-      {
-        "name": "get_file_contents_peek",
-        "signature": "def get_file_contents_peek(filename: str, st: StackTraceData) -> str",
-        "docstring": ""
-      },
-      {
-        "name": "create_github_issue",
-        "signature": "def create_github_issue(repo: Repository, content: str, app_name: str) -> str",
-        "docstring": ""
-      },
-      {
-        "name": "scale_up_hpa",
-        "signature": "def scale_up_hpa(infra_repo: Repository, manifest_file_path: str, increase_value: int, set_value: int, max_allowed_replicas: int) -> dict",
-        "docstring": ""
-      }
-    ],
-    "classes": [
-      {
-        "name": "ParseMode",
-        "docstring": "",
-        "methods": ""
-      }
-    ],
-    "keywords": [
-      "Dynamic Parse Stacktraces"
-    ],
-    "category": "general",
-    "import_path": "k8s_applications",
-    "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/libraries/k8s_applications.py"
-  },
-  {
-    "name": "migrations_inspector",
-    "module_path": "/app/data/repos/rw-cli-codecollection/libraries/RW/K8sApplications/migrations_inspector.py",
-    "collection_slug": "rw-cli-codecollection",
-    "description": "",
-    "functions": [],
-    "classes": [],
-    "keywords": [],
-    "category": "general",
-    "import_path": "migrations_inspector",
-    "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/libraries/migrations_inspector.py"
-  },
-  {
-    "name": "repository",
-    "module_path": "/app/data/repos/rw-cli-codecollection/libraries/RW/K8sApplications/repository.py",
-    "collection_slug": "rw-cli-codecollection",
-    "description": "",
-    "functions": [],
-    "classes": [
-      {
-        "name": "GitCommit",
-        "docstring": "",
-        "methods": "changed_files, diff, diff_additions, diff_deletions"
-      },
-      {
-        "name": "RepositorySearchResult",
-        "docstring": "",
-        "methods": ""
-      },
-      {
-        "name": "RepositoryFile",
-        "docstring": "",
-        "methods": "search, content_peek, git_add, write_content"
-      },
-      {
-        "name": "RepositoryFiles",
-        "docstring": "",
-        "methods": "add_source_file, file_paths, all_files, all_basenames"
-      },
-      {
-        "name": "Repository",
-        "docstring": "",
-        "methods": "find_file, clone_repo, git_commit, git_push_branch, git_pr, get_repo_base_url, is_text_file, create_file_list, search, get_commits_that_changed_file, serialize_git_commits, get_git_log, list_issues, create_issue"
-      }
-    ],
-    "keywords": [],
-    "category": "general",
-    "import_path": "repository",
-    "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/libraries/repository.py"
-  },
-  {
-    "name": "parsers",
-    "module_path": "/app/data/repos/rw-cli-codecollection/libraries/RW/K8sApplications/parsers.py",
-    "collection_slug": "rw-cli-codecollection",
-    "description": "",
-    "functions": [],
-    "classes": [
-      {
-        "name": "StackTraceData",
-        "docstring": "",
-        "methods": "has_results, errors_summary, first_file, first_line_nums, get_first_file_line_nums_as_str"
-      },
-      {
-        "name": "BaseStackTraceParse",
-        "docstring": "Base class for stacktrace parsing functions.\nShould be stateless so it can be used as a utility class.\n\nNote that the default behavior assumes python stack traces, and inheritors can override for other languages.",
-        "methods": "is_json, parse_log, extract_line_nums, extract_files, extract_urls, extract_endpoints, extract_sentences, extract_timestamp"
-      },
-      {
-        "name": "CSharpStackTraceParse",
-        "docstring": "",
-        "methods": "parse_log"
-      },
-      {
-        "name": "PythonStackTraceParse",
-        "docstring": "",
-        "methods": "extract_sentences, extract_line_nums, extract_files, parse_log"
-      },
-      {
-        "name": "DRFStackTraceParse",
-        "docstring": "",
-        "methods": "parse_log"
-      },
-      {
-        "name": "GoogleDRFStackTraceParse",
-        "docstring": "",
-        "methods": "parse_log"
-      },
-      {
-        "name": "GoLangStackTraceParse",
-        "docstring": "",
-        "methods": "parse_log, extract_files, extract_endpoints, extract_sentences, extract_line_nums"
-      },
-      {
-        "name": "GoLangJsonStackTraceParse",
-        "docstring": "",
-        "methods": "parse_log"
-      }
-    ],
-    "keywords": [],
-    "category": "general",
-    "import_path": "parsers",
-    "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/libraries/parsers.py"
-  },
-  {
-    "name": "Suggest",
-    "module_path": "/app/data/repos/rw-cli-codecollection/libraries/RW/NextSteps/Suggest.py",
-    "collection_slug": "rw-cli-codecollection",
-    "description": "Utility library for suggesting next steps based on a static troubleshooting yaml database\n\nSee https://github.com/seatgeek/thefuzz\n\nScope: Global",
-    "functions": [
-      {
-        "name": "format",
-        "signature": "def format(suggestions: str, expand_arrays: bool) -> str",
-        "docstring": ""
-      },
-      {
-        "name": "suggest",
-        "signature": "def suggest(search, platform: str, pretty_answer: bool, include_object_hints: bool, k_nearest: int, minimum_match_score: int)",
-        "docstring": ""
-      }
-    ],
-    "classes": [],
-    "keywords": [],
-    "category": "general",
-    "import_path": "Suggest",
-    "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/libraries/Suggest.py"
-  },
-  {
-    "name": "traceback",
-    "module_path": "/app/data/repos/rw-cli-codecollection/libraries/RW/LogAnalysis/traceback.py",
-    "collection_slug": "rw-cli-codecollection",
-    "description": "Main Traceback Extraction library class\n\nAuthor: akshayrw25",
-    "functions": [],
-    "classes": [
-      {
-        "name": "ExtractTraceback",
-        "docstring": "Traceback Extraction Library\n\nThis library provides a keyword for extracting tracebacks from any set of given logs.",
-        "methods": "extract_logs_from_logs_dir, extract_tracebacks"
-      }
-    ],
-    "keywords": [],
-    "category": "general",
-    "import_path": "traceback",
-    "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/libraries/traceback.py"
-  },
-  {
-    "name": "fetch_tracebacks",
-    "module_path": "/app/data/repos/rw-cli-codecollection/libraries/RW/LogAnalysis/python/fetch_tracebacks.py",
-    "collection_slug": "rw-cli-codecollection",
-    "description": "Extract Python-Tracebacks from a given log-string.\nLog-string represents a dictionary(or a json), and hence starts and ends with curly-braces",
-    "functions": [],
-    "classes": [
-      {
-        "name": "PythonTracebackExtractor",
-        "docstring": "",
-        "methods": "extract_traceback_from_dict_log, extract_traceback_from_string_log, extract_tracebacks_from_logs, extract_tracebacks_from_log_files"
-      }
-    ],
-    "keywords": [],
-    "category": "general",
-    "import_path": "fetch_tracebacks",
-    "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/libraries/fetch_tracebacks.py"
-  },
-  {
-    "name": "tb_utils",
-    "module_path": "/app/data/repos/rw-cli-codecollection/libraries/RW/LogAnalysis/python/tb_utils.py",
-    "collection_slug": "rw-cli-codecollection",
-    "description": "This module provides functionality to find and extract the longest balanced\nsequence of curly braces from a given string. This is commonly useful for\nparsing JSON-like structures or other brace-delimited content where you need\nto identify complete, properly nested brace pairs.\n\nThe module uses a stack-based approach to track opening and closing braces,\nensuring that the extracted sequence is properly balanced (every opening\nbrace has a corresponding closing brace in the correct order).",
-    "functions": [
-      {
-        "name": "longest_balanced_curlies_sequence",
-        "signature": "def longest_balanced_curlies_sequence(s: str) -> Tuple[int, int]",
-        "docstring": "Find the longest balanced sequence of curly braces in a string.\n\nThis function scans through a string and identifies the first complete\nbalanced sequence of curly braces. A balanced sequence means that every\nopening brace '{' has a corresponding closing brace '}' and they are\nproperly nested.\n\n:param s: The input string to search for balanced curly braces\n:type s: str\n:returns: A tuple containing the start index (inclusive) and end index \n (exclusive) of the balanced sequence. Returns ("
-      }
-    ],
-    "classes": [],
-    "keywords": [
-      "Longest Balanced Curlies Sequence"
-    ],
-    "category": "general",
-    "import_path": "tb_utils",
-    "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/libraries/tb_utils.py"
-  },
-  {
-    "name": "traceback_extractor",
-    "module_path": "/app/data/repos/rw-cli-codecollection/libraries/RW/LogAnalysis/python/traceback_extractor.py",
-    "collection_slug": "rw-cli-codecollection",
-    "description": "Traceback Extractor - Extract Python tracebacks from timestamped log files\n\nThis script extracts complete Python tracebacks from log files, handling:\n- Standard tracebacks\n- Exception chaining (caused by, during handling)\n- Exception groups (Python 3.11+)\n- PEP 657 enhanced tracebacks with precise error locations\n- KeyboardInterrupt and other exceptions\n- Multi-line exception messages\n- Various timestamp formats",
-    "functions": [],
-    "classes": [
-      {
-        "name": "TimestampedTracebackExtractor",
-        "docstring": "Extracts Python tracebacks from timestamped log files.",
-        "methods": "extract_timestamp, is_traceback_start, is_chain_indicator, is_file_reference, is_exception_line, is_pep657_marker, is_group_marker, is_continuation_line, looks_like_new_log_entry, extract_tracebacks_from_lines, extract_from_string"
-      }
-    ],
-    "keywords": [],
-    "category": "general",
-    "import_path": "traceback_extractor",
-    "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/libraries/traceback_extractor.py"
-  },
-  {
-    "name": "fetch_tracebacks",
-    "module_path": "/app/data/repos/rw-cli-codecollection/libraries/RW/LogAnalysis/java/fetch_tracebacks.py",
-    "collection_slug": "rw-cli-codecollection",
-    "description": "Java Stacktrace Extraction Module\n\nThis module provides comprehensive functionality to extract and analyze Java stacktraces \nfrom log files. It processes multiple log files, intelligently reconstructs fragmented \nlog entries, identifies stacktrace patterns, and performs deduplication and aggregation.\n\nProcessing Flow:\n1. File Processing: Reads multiple log files with robust error handling\n2. Log Reconstruction: Reassembles multi-line log entries split across lines using timestamp detection\n3. Stacktrace Identification: Filters logs to find Java stacktrace patterns and exception information\n4. Intelligent Aggregation: Groups related stacktrace entries based on timestamp proximity and content\n5. Deduplication: Removes duplicate stacktraces while preserving chronological order\n6. Output: Returns a clean list of unique, meaningful stacktraces\n\nKey Features:\n- Batch processing of multiple log files (primary entry point: extract_tracebacks_from_log_files)\n- Intelligent timestamp handling via",
-    "functions": [],
-    "classes": [
-      {
-        "name": "JavaTracebackExtractor",
-        "docstring": "Java Stacktrace Extractor for log analysis.\n\nThis class provides functionality to extract Java stacktraces from log files.\nIt handles various log formats and can identify stacktrace patterns even when\nlogs are split across multiple lines.\n\nKey Features:\n- Extracts Java stacktraces from log entries\n- Reconstructs multi-line log entries that may have been split\n- Filters logs to find those containing stacktrace information\n- Supports both complete stacktraces (with exceptions) and standalone stack",
-        "methods": "extract_tracebacks_from_logs, extract_tracebacks_from_log_files"
-      }
-    ],
-    "keywords": [],
-    "category": "general",
-    "import_path": "fetch_tracebacks",
-    "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/libraries/fetch_tracebacks.py"
-  },
-  {
-    "name": "timestamp_handler",
-    "module_path": "/app/data/repos/rw-cli-codecollection/libraries/RW/LogAnalysis/java/timestamp_handler.py",
-    "collection_slug": "rw-cli-codecollection",
-    "description": "Timestamp Handling Module for Log Analysis\n\nThis module provides comprehensive functionality for handling timestamps in log files.\nIt supports multiple timestamp formats commonly found in Java application logs and\nprovides utilities for parsing, extracting, and manipulating timestamp data.\n\nKey Features:\n- Detection of multiple timestamp formats (DD-MM-YYYY, ISO 8601, etc.)\n- Extraction of timestamps from log lines with position information\n- Parsing timestamp strings to datetime objects\n- Finding earliest/latest timestamps in multi-line log entries\n- Removing timestamps for content-based comparison\n- Robust handling of various timestamp edge cases\n\nSupported Timestamp Formats:\n- DD-MM-YYYY HH:MM:SS.mmm (e.g., \"15-01-2024 10:30:45.123\")\n- ISO 8601 (e.g., \"2024-01-15T10:30:45.123Z\", \"2024-01-15T10:30:45.123456789Z\")\n- YYYY-MM-DD HH:MM:SS.nnn (e.g., \"2024-01-15 10:30:45.123\")\n\nUsage:\n handler = TimestampHandler()\n timestamp = handler.extract_timestamp_from_line(log_line)\n dt = h",
-    "functions": [],
-    "classes": [
-      {
-        "name": "TimestampHandler",
-        "docstring": "A comprehensive handler for timestamp operations in log analysis.\n\nThis class provides methods for detecting, extracting, parsing, and manipulating\ntimestamps in various formats commonly found in Java application logs. It supports\nmultiple timestamp patterns and provides robust error handling for edge cases.\n\nThe class is designed to be stateless and thread-safe, making it suitable for\nuse in concurrent log processing environments.\n\nAttributes:\n TIMESTAMP_PATTERNS (List[str]): Regex patterns ",
-        "methods": "matches_any_timestamp_pattern, has_timestamp_at_alphanumeric_start, extract_timestamp_from_line, parse_timestamp_to_datetime, get_timestamp_from_stacktrace, get_min_timestamp_from_stacktrace, get_max_timestamp_from_stacktrace, remove_timestamps_from_stacktrace"
-      }
-    ],
-    "keywords": [],
-    "category": "general",
-    "import_path": "timestamp_handler",
-    "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/libraries/timestamp_handler.py"
-  },
-  {
-    "name": "jenkins",
-    "module_path": "/app/data/repos/rw-cli-codecollection/libraries/Jenkins/jenkins.py",
-    "collection_slug": "rw-cli-codecollection",
-    "description": "Normalize logs to improve pattern matching.",
-    "functions": [
-      {
-        "name": "normalize_log",
-        "signature": "def normalize_log(log: str) -> str",
-        "docstring": "Normalize logs to improve pattern matching."
-      }
-    ],
-    "classes": [
-      {
-        "name": "Jenkins",
-        "docstring": "This Robot Framework library exposes its keywords so that each one\naccepts jenkins_url, jenkins_username, and jenkins_token directly.\n\nThe `jenkins_username` and `jenkins_token` parameters are expected\nto be `platform.Secret` objects, so we do `jenkins_username.value`\nand `jenkins_token.value` to retrieve the actual strings.\n\nExample usage in Robot:\n\n*** Settings ***\nLibrary Jenkins\n\n*** Variables ***\n${JENKINS_URL} https://my-jenkins.example\n${JENKINS_USERNAME} MyJenkinsUsernameSecret\n$",
-        "methods": "get_failed_tests, get_queued_builds, get_executor_utilization, build_logs_analytics, parse_atom_feed, analyze_logs"
-      }
-    ],
-    "keywords": [
-      "Normalize Log"
-    ],
-    "category": "general",
-    "import_path": "jenkins",
-    "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/libraries/jenkins.py"
-  },
-  {
-    "name": "az_invoke_parser",
-    "module_path": "/app/data/repos/rw-cli-codecollection/libraries/Azure/az_invoke_parser.py",
-    "collection_slug": "rw-cli-codecollection",
-    "description": "CLI parser to extract stdout and stderr from Azure invoke raw output.\n\nScope: Global",
-    "functions": [
-      {
-        "name": "run_invoke_cmd_parser",
-        "signature": "def run_invoke_cmd_parser(input_file: str, timeout_seconds: int)",
-        "docstring": "Parses the output of an Azure invoke command stored in a file,\nextracting stdout and stderr sections from any message containing both."
-      }
-    ],
-    "classes": [],
-    "keywords": [
-      "Run Invoke Cmd Parser"
-    ],
-    "category": "azure",
-    "import_path": "az_invoke_parser",
-    "git_url": "https://github.com/runwhen-contrib/rw-cli-codecollection/tree/main/libraries/az_invoke_parser.py"
-  },
-  {
-    "name": "DynamicIssues",
-    "module_path": "/app/data/repos/rw-generic-codecollection/libraries/RW/DynamicIssues.py",
-    "collection_slug": "rw-generic-codecollection",
-    "description": "RW.DynamicIssues - Library for dynamically generating issues from files and JSON output\n\nThis library provides two methods for issue generation:\n1. File-based: Check for issues.json and report.txt files\n2. JSON query-based: Search for configurable patterns in JSON output\n\nAuthor: RunWhen",
-    "functions": [],
-    "classes": [
-      {
-        "name": "DynamicIssues",
-        "docstring": "Library for dynamically generating issues from multiple sources",
-        "methods": "process_file_based_issues, process_json_query_issues"
-      }
-    ],
-    "keywords": [],
-    "category": "general",
-    "import_path": "DynamicIssues",
-    "git_url": "https://github.com/runwhen-contrib/rw-generic-codecollection/tree/main/libraries/DynamicIssues.py"
-  },
-  {
-    "name": "Cosmosdb",
-    "module_path": "/app/data/repos/rw-generic-codecollection/libraries/RW/Azure/Cosmosdb.py",
-    "collection_slug": "rw-generic-codecollection",
-    "description": "Azure Cosmos DB Python library for Robot Framework.\n\nThis library provides keywords for executing SQL queries against Azure Cosmos DB using the Python SDK.",
-    "functions": [],
-    "classes": [
-      {
-        "name": "Cosmosdb",
-        "docstring": "Library for querying Azure Cosmos DB.\n\nProvides keywords for executing user-provided SQL queries against Cosmos DB containers.\nSupports both key-based authentication and Azure AD authentication (service principals).",
-        "methods": "connect_to_cosmosdb, connect_to_cosmosdb_with_azure_credentials, connect_to_cosmosdb_with_azure_credentials_and_retrieve_key, query_container, count_query_results"
-      }
-    ],
-    "keywords": [],
-    "category": "azure",
-    "import_path": "Cosmosdb",
-    "git_url": "https://github.com/runwhen-contrib/rw-generic-codecollection/tree/main/libraries/Cosmosdb.py"
-  },
-  {
-    "name": "workspace_utils",
-    "module_path": "/app/data/repos/rw-workspace-utils/libraries/RW/Workspace/workspace_utils.py",
-    "collection_slug": "rw-workspace-utils",
-    "description": "Workspace keyword library for interacting with RunWhen Workspace resources.\n\nScope: GLOBAL",
-    "functions": [
-      {
-        "name": "warning_log",
-        "signature": "def warning_log(msg: str) -> None",
-        "docstring": ""
-      },
-      {
-        "name": "get_slxs_with_tag",
-        "signature": "def get_slxs_with_tag(tag_list: List[Any]) -> List[Dict]",
-        "docstring": "Return all SLXs whose *spec.tags* contain at least one tag in *tag_list*.\n\n`tag_list` may contain either\n \u2022 {\"name\": \"...\", \"value\": \"...\"} dictionaries\n \u2022 \"name:value\" strings\n\nMatching is case-insensitive on both name and value."
-      },
-      {
-        "name": "get_slxs_with_entity_reference",
-        "signature": "def get_slxs_with_entity_reference(entity_refs: List[str]) -> List[Dict]",
-        "docstring": "Return SLXs that reference entities in *entity_refs* using a tiered matching strategy:\n1. First try to match specific tag types (resource_name, child_resource)\n2. Then try broader matches but limit scope to prevent API overload\n\nThis function prioritizes precision over recall to keep search scopes manageable."
-      },
-      {
-        "name": "get_slxs_with_targeted_entity_reference",
-        "signature": "def get_slxs_with_targeted_entity_reference(entity_refs: List[str], tag_types: List[str]) -> List[Dict]",
-        "docstring": "Return SLXs that reference entities in *entity_refs* using specific tag types.\n\nArgs:\n entity_refs: List of entity names/identifiers to search for\n tag_types: List of specific tag names to match against (e.g., [\"resource_name\", \"child_resource\"])\n If None, defaults to [\"resource_name\", \"child_resource\", \"entity_name\"]\n\nReturns:\n List of SLXs that have matching tags of the specified types"
-      },
-      {
-        "name": "run_tasks_for_slx",
-        "signature": "def run_tasks_for_slx(slx: str) -> Optional[Dict]",
-        "docstring": "Create a runRequest containing all tasks in the SLX runbook."
-      },
-      {
-        "name": "import_platform_variable",
-        "signature": "def import_platform_variable(varname: str) -> str",
-        "docstring": "Return the value of a RunWhen platform-provided var or raise ImportError."
-      },
-      {
-        "name": "import_runsession_details",
-        "signature": "def import_runsession_details(runsession_id: Optional[str]) -> Optional[str]",
-        "docstring": "Fetch full RunSession details as JSON string. Uses RW_USER_TOKEN if set."
-      },
-      {
-        "name": "import_memo_variable",
-        "signature": "def import_memo_variable(key: str) -> Optional[str]",
-        "docstring": "Retrieve a memo value by key from the current runsession's runRequests.\nReturns JSON string or None."
-      },
-      {
-        "name": "import_related_runsession_details",
-        "signature": "def import_related_runsession_details(json_string: str, api_token: Optional[platform.Secret], poll_interval: float, max_wait_seconds: float) -> Optional[str]",
-        "docstring": "Parse 'runsessionId' from notes and poll until runRequests stable.\nReturns JSON string of final runsession or None."
-      },
-      {
-        "name": "get_workspace_config",
-        "signature": "def get_workspace_config() -> list | dict",
-        "docstring": "Return workspace.yaml (already rendered to JSON by the Workspace-API).\n\nThe function behaves correctly both:\n \u2022 inside the RunWhen runtime \u2013 where `platform.get_authenticated_session()`\n already carries the service-mesh auth headers, and\n \u2022 during local / unit testing \u2013 where you may export RW_USER_TOKEN to\n override the auth header.\n\nFalls back to an empty dictionary on any failure."
-      },
-      {
-        "name": "get_nearby_slxs",
-        "signature": "def get_nearby_slxs(workspace_config: dict, slx_name: str) -> list",
-        "docstring": "Given a RunWhen workspace config (in dictionary form) and the short name\nof a specific SLX (e.g. \"rc-ob-grnsucsc1c-redis-health-a7c33f4e\"),\nreturn all SLXs in the same slxGroup.\n\n:param workspace_config: Dict representing workspace.yaml as JSON.\n:param slx_name: The SLX short name to look for.\n:return: A list of SLX short names in the same slxGroup as `slx_name`.\n If no group is found containing `slx_name`, returns an empty list."
-      },
-      {
-        "name": "get_workspace_slxs",
-        "signature": "def get_workspace_slxs(rw_api_url: str, api_token: platform.Secret, rw_workspace: str) -> str",
-        "docstring": "Get all SLXs in a workspace (paginated) and return combined JSON string."
-      },
-      {
-        "name": "perform_task_search_with_persona",
-        "signature": "def perform_task_search_with_persona(query: str, persona: str, slx_scope: Optional[List[str]], timeout: float) -> Dict",
-        "docstring": "Perform a task search as the given persona."
- }, - { - "name": "perform_task_search", - "signature": "def perform_task_search(query: str, slx_scope: Optional[List[str]], timeout: float) -> Dict", - "docstring": "Perform a task search with no persona." - }, - { - "name": "build_task_report_md", - "signature": "def build_task_report_md(search_response: Dict, score_threshold: float, heading: str) -> Tuple[str, int]", - "docstring": "Build a Markdown table of tasks whose score \u2265 threshold and\nALSO return the *total* number of tasks in the search_response.\n\nReturns:\n (markdown_table: str, total_tasks: int)" - }, - { - "name": "perform_improved_task_search", - "signature": "def perform_improved_task_search(entity_data: List[str], persona: str, confidence_threshold: float, slx_scope: Optional[List[str]]) -> Tuple[Dict, str, List[str], str]", - "docstring": "Perform an improved multi-tier search strategy for webhook handlers:\n\n1. Search with just the specific entity data (highest specificity)\n2. Search with entity data + resource_type from matching SLXs \n3. Search with SLX spec.tag \"resource_name\" \n4. Search with \"child_resource\" tag names\n\nArgs:\n entity_data: List of entity names/identifiers extracted from webhook\n persona: Persona to use for search\n confidence_threshold: Minimum confidence score for high-quality results\n slx_scope: Opt" - } - ], - "classes": [], - "keywords": [ - "Get Slxs With Tag", - "Get Slxs With Entity Reference", - "Get Slxs With Targeted Entity Reference", - "Run Tasks For Slx", - "Import Platform Variable", - "Import Runsession Details", - "Import Memo Variable", - "Import Related Runsession Details", - "Get Workspace Config", - "Get Nearby Slxs", - "Get Workspace Slxs", - "Perform Task Search With Persona", - "Perform Task Search", - "Build Task Report Md", - "Perform Improved Task Search" - ], - "category": "general", - "import_path": "workspace_utils", - "git_url": "https://github.com/runwhen-contrib/rw-workspace-utils/tree/main/libraries/workspace_utils.py" - }, - { - "name": "pagerduty", - "module_path": "/app/data/repos/rw-workspace-utils/libraries/RW/PagerDuty/pagerduty.py", - "collection_slug": "rw-workspace-utils", - "description": "PagerDuty keyword library for performing tasks for interacting with PagerDuty incidents.\n\nScope: Global", - "functions": [ - { - "name": "get_user_email", - "signature": "def get_user_email(userid: str, secret_token: platform.Secret)", - "docstring": "Gets user email from incident Json, which is needed to add\nnotes to the incident. \n\nArgs:\n userid (str): the PagerDuty user ID to look up\n secret_token (platform.Secret): the token needed for PD auth\n\nReturns:\n email: email address of the userID " - }, - { - "name": "add_runsession_note_to_incident", - "signature": "def add_runsession_note_to_incident(data: dict, secret_token: platform.Secret)", - "docstring": "Gets user email from incident Json, which is needed to add\nnotes to the incident. 
\n\nArgs:\n incident (dict): the PagerDuty user ID to look up\n secret_token (platform.Secret): the token needed for PD auth\n\nReturns:\n email: email address of the userID " - } - ], - "classes": [], - "keywords": [ - "Get User Email", - "Add Runsession Note To Incident" - ], - "category": "general", - "import_path": "pagerduty", - "git_url": "https://github.com/runwhen-contrib/rw-workspace-utils/tree/main/libraries/pagerduty.py" - }, - { - "name": "dynatrace_parser", - "module_path": "/app/data/repos/rw-workspace-utils/libraries/RW/Dynatrace/dynatrace_parser.py", - "collection_slug": "rw-workspace-utils", - "description": "Return a unique list of cleaned entity names (best-guess order).", - "functions": [ - { - "name": "parse_dynatrace_entities", - "signature": "def parse_dynatrace_entities(payload: str | Dict[str, Any]) -> List[str]", - "docstring": "Return a unique list of cleaned entity names (best-guess order)." - } - ], - "classes": [], - "keywords": [ - "Parse Dynatrace Entities" - ], - "category": "general", - "import_path": "dynatrace_parser", - "git_url": "https://github.com/runwhen-contrib/rw-workspace-utils/tree/main/libraries/dynatrace_parser.py" - }, - { - "name": "github_issues", - "module_path": "/app/data/repos/rw-workspace-utils/libraries/RW/GitHub/github_issues.py", - "collection_slug": "rw-workspace-utils", - "description": "Creates a GitHub issue with the given title and body.", - "functions": [ - { - "name": "create_github_issue", - "signature": "def create_github_issue(title, body, github_token: platform.Secret, repo, github_server)", - "docstring": "Creates a GitHub issue with the given title and body." - } - ], - "classes": [], - "keywords": [ - "Create Github Issue" - ], - "category": "general", - "import_path": "github_issues", - "git_url": "https://github.com/runwhen-contrib/rw-workspace-utils/tree/main/libraries/github_issues.py" - }, - { - "name": "runsession_utils", - "module_path": "/app/data/repos/rw-workspace-utils/libraries/RW/RunSession/runsession_utils.py", - "collection_slug": "rw-workspace-utils", - "description": "Return a direct link to the RunSession.", - "functions": [ - { - "name": "get_runsession_url", - "signature": "def get_runsession_url(rw_runsession)", - "docstring": "Return a direct link to the RunSession." - }, - { - "name": "get_runsession_source", - "signature": "def get_runsession_source(payload: dict) -> str", - "docstring": "Given a RunWhen payload dictionary, return the \"source\" string based on:\n 1) If top-level \"source\" key exists, return that\n 2) Otherwise, look at the first (earliest) runRequest by 'created' time, and\n check in order:\n fromSearchQuery, fromIssue, fromSliAlert, fromAlert\n Return the name of whichever key is non-null. \n 3) If nothing is found, return \"Unknown\"." - }, - { - "name": "count_open_issues", - "signature": "def count_open_issues(data: str)", - "docstring": "Return a count of issues that have not been closed." - }, - { - "name": "get_open_issues", - "signature": "def get_open_issues(data: str)", - "docstring": "Return a count of issues that have not been closed." - }, - { - "name": "generate_open_issue_markdown_table", - "signature": "def generate_open_issue_markdown_table(data_list)", - "docstring": "Generates a markdown report sorted by severity." - }, - { - "name": "get_open_issues", - "signature": "def get_open_issues(data: str)", - "docstring": "Return a count of issues that have not been closed." 
- }, - { - "name": "summarize_runsession_users", - "signature": "def summarize_runsession_users(data: str, output_format: str) -> str", - "docstring": "Parse a JSON string representing a RunWhen 'runsession' object\n(with 'runRequests' entries), gather the unique participants and\nthe engineering assistants involved, and return a summary in either\nplain text or Markdown format.\n\n:param data: JSON string with top-level 'runRequests' list, each item\n possibly containing 'requester' and 'persona->spec->fullName'.\n:param output_format: \"text\" or \"markdown\" (default: \"text\").\n:return: A string summarizing the participants and engineering a" - }, - { - "name": "extract_issue_keywords", - "signature": "def extract_issue_keywords(data: str)", - "docstring": "" - }, - { - "name": "get_most_referenced_resource", - "signature": "def get_most_referenced_resource(data: str)", - "docstring": "" - }, - { - "name": "create_runsession_from_task_search", - "signature": "def create_runsession_from_task_search() -> dict | str", - "docstring": "Create a RunSession from a task-search response." - }, - { - "name": "get_persona_details", - "signature": "def get_persona_details(persona: str) -> dict", - "docstring": "Get persona configuration details\n\n:param persona: The personaShortName\n\n:return: Parsed JSON response of the persona configuration." - }, - { - "name": "add_tasks_to_runsession_from_search", - "signature": "def add_tasks_to_runsession_from_search(search_response: dict, runsession_id: str | None, api_token: platform.Secret, rw_api_url: str, rw_workspace: str, score_threshold: float, source_query: str | None, dry_run: bool)", - "docstring": "Append tasks (score \u2265 threshold) from *search_response* to an existing\nRunSession .\n\n\u2022 Builds a JSON-Merge-Patch body:\n {\n \"runRequests\": [\n { \"slxName\": \"...\", \"fromSearchQuery\": \"...\", \"taskTitles\": [...] },\n \u2026\n ]\n }\n\u2022 Sends PATCH /workspaces//runsessions/\n Content-Type: application/merge-patch+json\n\nReturns:\n \u2022 The server's JSON response (on success) \u2013or\u2013\n \u2022 The patch body (when dry_run=True)." 
- } - ], - "classes": [], - "keywords": [ - "Get Runsession Url", - "Get Runsession Source", - "Count Open Issues", - "Get Open Issues", - "Generate Open Issue Markdown Table", - "Get Open Issues", - "Summarize Runsession Users", - "Create Runsession From Task Search", - "Get Persona Details", - "Add Tasks To Runsession From Search" - ], - "category": "general", - "import_path": "runsession_utils", - "git_url": "https://github.com/runwhen-contrib/rw-workspace-utils/tree/main/libraries/runsession_utils.py" - }, - { - "name": "azure_alert_parser", - "module_path": "/app/data/repos/rw-workspace-utils/libraries/RW/Azure/azure_alert_parser.py", - "collection_slug": "rw-workspace-utils", - "description": "azure_alert_parser.py\n---------------------\n\nParse Azure Monitor common-schema webhook payloads from *any* alert type\nand expose a Robot-Framework keyword library.\n\nSupported alert types\n---------------------\n- activity_log (Activity Log Alerts)\n- availability (App Insights availability tests)\n- budget (Cost Management budget actuals)\n- cost_budget (Cost budget threshold reached)\n- forecast_budget (Forecasted-cost budget threshold)\n- log_v1 (Log Analytics classic alerts)\n- log_v2 (Log Analytics scheduled alerts \u2013 LA v2)\n- metric (Metric alert \u2013 static or dynamic)\n- resource_health (Azure Resource Health alerts)\n- service_health (Service Health advisories/incidents)\n- smart (App Insights Smart Detection)", - "functions": [ - { - "name": "parse_azure_monitor_alert", - "signature": "def parse_azure_monitor_alert(payload: Union[str, Dict[str, Any]]) -> Dict[str, Any]", - "docstring": "" - } - ], - "classes": [ - { - "name": "Azure", - "docstring": "Robot library exposing keyword **Parse Alert**.", - "methods": "parse_alert, extract_kql_entities, extract_kql_entities_with_query" - } - ], - "keywords": [], - "category": "azure", - "import_path": "azure_alert_parser", - "git_url": "https://github.com/runwhen-contrib/rw-workspace-utils/tree/main/libraries/azure_alert_parser.py" - }, - { - "name": "slack", - "module_path": "/app/data/repos/rw-workspace-utils/libraries/RW/Slack/slack.py", - "collection_slug": "rw-workspace-utils", - "description": "", - "functions": [], - "classes": [ - { - "name": "Slack", - "docstring": "", - "methods": "send_slack_message, create_runsession_summary_payload" - } - ], - "keywords": [], - "category": "general", - "import_path": "slack", - "git_url": "https://github.com/runwhen-contrib/rw-workspace-utils/tree/main/libraries/slack.py" - }, - { - "name": "Core", - "module_path": "/app/data/repos/aws-c7n-codecollection/libraries/CloudCustodian/Core/Core.py", - "collection_slug": "aws-c7n-codecollection", - "description": "Parse resource type from ARN.\n:param arn: AWS ARN string.\n:return: Resource type or 'Unknown Type' if ARN cannot be parsed.", - "functions": [ - { - "name": "parse_resource_type_from_arn", - "signature": "def parse_resource_type_from_arn(arn)", - "docstring": "Parse resource type from ARN.\n:param arn: AWS ARN string.\n:return: Resource type or 'Unknown Type' if ARN cannot be parsed." 
-        },
-        {
-          "name": "process_ec2_resource",
-          "signature": "def process_ec2_resource(resource, subdir_name, resource_summary)",
-          "docstring": ""
-        },
-        {
-          "name": "process_asg_resource",
-          "signature": "def process_asg_resource(resource, subdir_name, resource_summary)",
-          "docstring": ""
-        },
-        {
-          "name": "parse_custodian_results",
-          "signature": "def parse_custodian_results(input_dir: str)",
-          "docstring": "Parses Cloud Custodian results and summarizes resources, metadata, and run health.\n\n:param input_dir: Path to the root directory containing Cloud Custodian output files.\n:return: A string summary of the results in tabular format or a message if no results found."
-        },
-        {
-          "name": "parse_ebs_results",
-          "signature": "def parse_ebs_results(input_dir: str)",
-          "docstring": ""
-        },
-        {
-          "name": "find_value_recursive",
-          "signature": "def find_value_recursive(obj, target_key)",
-          "docstring": "Recursively search for a key in a nested dictionary or list.\nReturns all matching values found."
-        },
-        {
-          "name": "generate_policy",
-          "signature": "def generate_policy(template_path)",
-          "docstring": ""
-        }
-      ],
-      "classes": [],
-      "keywords": [
-        "Parse Resource Type From Arn",
-        "Parse Custodian Results",
-        "Find Value Recursive"
-      ],
-      "category": "general",
-      "import_path": "Core",
-      "git_url": "https://github.com/runwhen-contrib/aws-c7n-codecollection/tree/main/libraries/Core.py"
-    },
-    {
-      "name": "Core",
-      "module_path": "/app/data/repos/azure-c7n-codecollection/libraries/CloudCustodian/Core/Core.py",
-      "collection_slug": "azure-c7n-codecollection",
-      "description": "",
-      "functions": [
-        {
-          "name": "generate_policy",
-          "signature": "def generate_policy(template_path)",
-          "docstring": ""
-        }
-      ],
-      "classes": [],
-      "keywords": [],
-      "category": "general",
-      "import_path": "Core",
-      "git_url": "https://github.com/runwhen-contrib/azure-c7n-codecollection/tree/main/libraries/Core.py"
-    },
-    {
-      "name": "utils",
-      "module_path": "/app/data/repos/ternary-codecollection/libraries/Ternary/Utils/utils.py",
-      "collection_slug": "ternary-codecollection",
-      "description": "Return up to `limit` best matches for `query` against the JSON list of\nobjects (each having id and name). Each result dict has keys: score, id, name.",
-      "functions": [
-        {
-          "name": "get_top_matches",
-          "signature": "def get_top_matches(data_json: str, query: str, limit: int)",
-          "docstring": "Return up to `limit` best matches for `query` against the JSON list of\nobjects (each having id and name). Each result dict has keys: score, id, name."
-        }
-      ],
-      "classes": [],
-      "keywords": [
-        "Get Top Matches"
-      ],
-      "category": "general",
-      "import_path": "utils",
-      "git_url": "https://github.com/runwhen-contrib/ternary-codecollection/tree/main/libraries/utils.py"
-    }
-  ]
-}
\ No newline at end of file
diff --git a/mcp-server/indexer.py b/mcp-server/indexer.py
index b9b759f81f48..d234d11779a2 100644
--- a/mcp-server/indexer.py
+++ b/mcp-server/indexer.py
@@ -1,11 +1,16 @@
 #!/usr/bin/env python3
 """
-CodeCollection Indexer
+DEPRECATED — CodeCollection Indexer (standalone CLI)
 
-Clones/updates all codecollection repositories, parses codebundles,
-generates embeddings, and stores them in the vector database.
+Embedding generation and vector storage have moved into the backend
+(cc-registry-v2/backend/app/tasks/indexing_tasks.py). The backend
+stores embeddings directly in pgvector and exposes them via
+/api/v1/vector/search/* endpoints.
 
-Usage:
+This file is kept for reference only. Use the backend's Celery tasks
+or the Admin UI to trigger reindexing.
+
+Original usage (no longer recommended):
     python indexer.py                    # Full index
     python indexer.py --local            # Use local embeddings (no API)
     python indexer.py --collection rw-cli-codecollection  # Index specific collection
diff --git a/mcp-server/tools/documentation_tools.py b/mcp-server/tools/documentation_tools.py
index 2ba279618021..89db8650205f 100644
--- a/mcp-server/tools/documentation_tools.py
+++ b/mcp-server/tools/documentation_tools.py
@@ -2,7 +2,8 @@
 Documentation Tools
 
 Tools for finding documentation, guides, and development resources.
-Uses docs.yaml for curated URLs and the Registry API for search.
+Primary path: semantic search via the backend's vector search API.
+Fallback: local docs.yaml keyword matching (for offline/disconnected use).
 """
 import os
 import yaml
@@ -17,7 +18,7 @@
 # =============================================================================
-# Documentation Data Management (local docs.yaml — ships with the MCP server)
+# Local docs.yaml fallback (used only when backend vector search is unavailable)
 # =============================================================================
 
 @dataclass
@@ -32,69 +33,59 @@ class DocEntry:
 
 
 class DocumentationManager:
-    """
-    Manages documentation from docs.yaml.
-    Provides accurate, managed URLs for documentation resources.
-    """
-
+    """Keyword-match search over local docs.yaml — fallback only."""
+
     def __init__(self, docs_file: str = None):
         if docs_file is None:
             docs_file = Path(__file__).parent.parent / "docs.yaml"
         self.docs_file = Path(docs_file)
         self._docs: Optional[Dict] = None
         self._entries: List[DocEntry] = []
-
+
     def _load(self) -> None:
-        """Load and parse docs.yaml"""
         if self._docs is not None:
             return
-
         if not self.docs_file.exists():
             logger.warning(f"docs.yaml not found at {self.docs_file}")
             self._docs = {}
             return
-
         try:
             with open(self.docs_file, 'r') as f:
                 self._docs = yaml.safe_load(f) or {}
-
             docs = self._docs.get("documentation", {})
             for category, items in docs.items():
-                if isinstance(items, list):
-                    for item in items:
-                        if "name" in item:
-                            self._entries.append(DocEntry(
-                                name=item.get("name", ""),
-                                url=item.get("url", ""),
-                                description=item.get("description", ""),
-                                keywords=item.get("keywords", []),
-                                category=category,
-                                examples=item.get("examples")
-                            ))
-                        elif "question" in item:
-                            self._entries.append(DocEntry(
-                                name=item.get("question", ""),
-                                url="",
-                                description=item.get("answer", ""),
-                                keywords=item.get("keywords", []),
-                                category="faq"
-                            ))
+                if not isinstance(items, list):
+                    continue
+                for item in items:
+                    if "name" in item:
+                        self._entries.append(DocEntry(
+                            name=item.get("name", ""),
+                            url=item.get("url", ""),
+                            description=item.get("description", ""),
+                            keywords=item.get("keywords", []),
+                            category=category,
+                            examples=item.get("examples"),
+                        ))
+                    elif "question" in item:
+                        self._entries.append(DocEntry(
+                            name=item.get("question", ""),
+                            url="",
+                            description=item.get("answer", ""),
+                            keywords=item.get("keywords", []),
+                            category="faq",
+                        ))
         except Exception as e:
             logger.error(f"Failed to load docs.yaml: {e}")
             self._docs = {}
-
+
     def search(self, query: str, category: str = None, limit: int = 5) -> List[DocEntry]:
-        """Search documentation entries"""
         self._load()
-
         query_lower = query.lower()
         query_words = set(query_lower.split())
-
         scored = []
         for entry in self._entries:
             if category and entry.category != category:
                 continue
-
             score = 0
             if query_lower in entry.name.lower():
                 score += 5
@@ -107,29 +98,27 @@ def search(self, query: str, category: str = None, limit: int = 5) -> List[DocEn
             for word in query_words:
                 if word in all_text:
                     score += 0.5
-
             if score > 0:
                 scored.append((score, entry))
-
         scored.sort(key=lambda x: x[0], reverse=True)
         return [entry for _, entry in scored[:limit]]
-
+
     def get_by_category(self, category: str) -> List[DocEntry]:
         self._load()
         return [e for e in self._entries if e.category == category]
-
+
     def list_categories(self) -> List[str]:
         self._load()
         return list(set(e.category for e in self._entries))
-
+
    def get_all_urls(self) -> Dict[str, str]:
         self._load()
         return {e.name: e.url for e in self._entries if e.url}
 
 
-# Global instance
 _doc_manager: Optional[DocumentationManager] = None
 
+
 def get_doc_manager() -> DocumentationManager:
     global _doc_manager
     if _doc_manager is None:
@@ -144,12 +133,14 @@
 class FindDocumentationTool(BaseTool):
     """
     Find documentation, guides, and FAQs.
-    Uses docs.yaml for curated URLs and keyword matching.
+
+    Primary: semantic vector search via backend API (/api/v1/vector/search/documentation).
+    Fallback: keyword matching on local docs.yaml.
     """
-
+
     def __init__(self, registry_client=None):
         self._client = registry_client
-
+
     @property
     def definition(self) -> ToolDefinition:
         return ToolDefinition(
@@ -157,55 +148,43 @@ def definition(self) -> ToolDefinition:
             description="Find documentation, guides, examples, and FAQs for CodeBundle development. Ask how-to questions or search for specific topics.",
             category="search",
             parameters=[
-                ToolParameter(
-                    name="query",
-                    type="string",
-                    description="What documentation to find",
-                    required=True
-                ),
-                ToolParameter(
-                    name="category",
-                    type="string",
-                    description="Documentation category filter",
-                    required=False,
-                    default="all",
-                    enum=["codebundle_development", "libraries", "platform", "faq", "all"]
-                ),
-                ToolParameter(
-                    name="max_results",
-                    type="integer",
-                    description="Maximum results",
-                    required=False,
-                    default=10
-                )
-            ]
+                ToolParameter(name="query", type="string", description="What documentation to find", required=True),
+                ToolParameter(name="category", type="string", description="Documentation category filter", required=False, default="all",
+                              enum=["codebundle_development", "libraries", "platform", "faq", "all"]),
+                ToolParameter(name="max_results", type="integer", description="Maximum results", required=False, default=10),
+            ],
         )
-
-    async def execute(
-        self,
-        query: str,
-        category: str = "all",
-        max_results: int = 10
-    ) -> str:
-        """Find documentation using docs.yaml keyword matching."""
+
+    async def execute(self, query: str, category: str = "all", max_results: int = 10) -> str:
+        # Try backend vector search first
+        if self._client:
+            try:
+                cat = category if category != "all" else None
+                results = await self._client.vector_search_documentation(
+                    query=query, max_results=max_results, category=cat,
+                )
+                if results:
+                    return self._format_vector_results(query, results)
+                logger.info("Backend vector search returned no results, falling back to keyword search")
+            except Exception as e:
+                logger.warning(f"Backend vector search unavailable ({e}), falling back to keyword search")
+
+        return self._keyword_search(query, category, max_results)
+
+    def _keyword_search(self, query: str, category: str, max_results: int) -> str:
         doc_manager = get_doc_manager()
-
         results = doc_manager.search(
             query=query,
             category=category if category != "all" else None,
-            limit=max_results
+            limit=max_results,
         )
-
         if not results:
             return f"No documentation found matching: {query}\n\nCheck the official RunWhen documentation at https://docs.runwhen.com"
-
-        output = f"# Documentation: {query}\n\n"
-        output += f"Found {len(results)} resource(s):\n\n"
-
+
+        output = f"# Documentation: {query}\n\nFound {len(results)} resource(s):\n\n"
         for entry in results:
             output += f"## **{entry.name}**\n\n"
             output += f"**Category:** {entry.category}\n\n"
-
             if entry.description:
                 output += f"**Description:** {entry.description}\n\n"
             if entry.url:
@@ -215,16 +194,34 @@ async def execute(
             for ex in entry.examples[:2]:
                 output += f"```robot\n{ex}\n```\n\n"
             output += "---\n\n"
-
+        return output
+
+    @staticmethod
+    def _format_vector_results(query: str, results: List[Dict[str, Any]]) -> str:
+        output = f"# Documentation: {query}\n\nFound {len(results)} result(s) (semantic search):\n\n"
+        for r in results:
+            meta = r.get("metadata", {})
+            name = meta.get("name", r.get("id", ""))
+            output += f"## **{name}**\n\n"
+            if meta.get("category"):
+                output += f"**Category:** {meta['category']}\n\n"
+            desc = meta.get("description", "")
+            if desc:
+                output += f"**Description:** {desc}\n\n"
+            url = meta.get("url", "")
+            if url:
+                output += f"**Link:** [{url}]({url})\n\n"
+            score = r.get("score", 0)
+            output += f"**Relevance:** {score:.2%}\n\n---\n\n"
         return output
 
 
 class GetDevelopmentRequirementsTool(BaseTool):
     """Get development requirements and best practices."""
-
+
     def __init__(self, registry_client=None):
         self._client = registry_client
-
+
     @property
     def definition(self) -> ToolDefinition:
         return ToolDefinition(
@@ -232,22 +229,25 @@ def definition(self) -> ToolDefinition:
             description="Get development requirements, best practices, and documentation for specific features",
             category="info",
             parameters=[
-                ToolParameter(
-                    name="feature",
-                    type="string",
-                    description="Feature to get requirements for (e.g., 'secrets', 'tasks', 'slis')",
-                    required=True
-                )
-            ]
+                ToolParameter(name="feature", type="string", description="Feature to get requirements for (e.g., 'secrets', 'tasks', 'slis')", required=True),
+            ],
         )
-
+
     async def execute(self, feature: str) -> str:
-        """Get development requirements from docs.yaml."""
+        # Try backend vector search first
+        if self._client:
+            try:
+                results = await self._client.vector_search_documentation(
+                    query=feature, max_results=3,
+                )
+                if results:
+                    return FindDocumentationTool._format_vector_results(feature, results)
+            except Exception:
+                pass
+
         doc_manager = get_doc_manager()
         results = doc_manager.search(feature, limit=3)
-
         output = f"# Development Requirements: {feature}\n\n"
-
         if results:
             for entry in results:
                 output += f"## **{entry.name}**\n\n"
@@ -263,5 +263,4 @@ async def execute(self, feature: str) -> str:
         else:
             output += f"No specific documentation found for '{feature}'.\n\n"
             output += "Check the official RunWhen documentation at https://docs.runwhen.com/public/runwhen-authors\n"
-
         return output
diff --git a/mcp-server/utils/embeddings.py b/mcp-server/utils/embeddings.py
index 3efc368d45bb..2ea77c9728f9 100644
--- a/mcp-server/utils/embeddings.py
+++ b/mcp-server/utils/embeddings.py
@@ -1,5 +1,10 @@
 """
-Embedding generation utilities using Azure OpenAI or local models.
+DEPRECATED — Embedding generation utilities.
+
+Embedding generation has moved to the backend:
+  cc-registry-v2/backend/app/services/embedding_service.py
+
+This file is kept for reference only.
 """
 import os
 import logging
diff --git a/mcp-server/utils/registry_client.py b/mcp-server/utils/registry_client.py
index 5f640f55736f..326274caabeb 100644
--- a/mcp-server/utils/registry_client.py
+++ b/mcp-server/utils/registry_client.py
@@ -138,6 +138,78 @@ async def get_stats(self) -> Dict[str, Any]:
         resp.raise_for_status()
         return resp.json()
 
+    # -----------------------------------------------------------------
+    # Vector / semantic search (backed by pgvector in the backend)
+    # -----------------------------------------------------------------
+
+    async def vector_search(
+        self,
+        query: str,
+        tables: str = None,
+        max_results: int = 10,
+    ) -> Dict[str, Any]:
+        """Unified semantic search across vector tables."""
+        params: Dict[str, Any] = {"query": query, "max_results": max_results}
+        if tables:
+            params["tables"] = tables
+        resp = await self._client.get("/api/v1/vector/search", params=params)
+        resp.raise_for_status()
+        return resp.json()
+
+    async def vector_search_codebundles(
+        self,
+        query: str,
+        max_results: int = 10,
+        platform: str = None,
+        collection_slug: str = None,
+    ) -> List[Dict[str, Any]]:
+        """Semantic search over codebundle embeddings."""
+        params: Dict[str, Any] = {"query": query, "max_results": max_results}
+        if platform:
+            params["platform"] = platform
+        if collection_slug:
+            params["collection_slug"] = collection_slug
+        resp = await self._client.get("/api/v1/vector/search/codebundles", params=params)
+        resp.raise_for_status()
+        data = resp.json()
+        return data.get("results", [])
+
+    async def vector_search_documentation(
+        self,
+        query: str,
+        max_results: int = 10,
+        category: str = None,
+    ) -> List[Dict[str, Any]]:
+        """Semantic search over documentation embeddings."""
+        params: Dict[str, Any] = {"query": query, "max_results": max_results}
+        if category:
+            params["category"] = category
+        resp = await self._client.get("/api/v1/vector/search/documentation", params=params)
+        resp.raise_for_status()
+        data = resp.json()
+        return data.get("results", [])
+
+    async def vector_search_libraries(
+        self,
+        query: str,
+        max_results: int = 10,
+        category: str = None,
+    ) -> List[Dict[str, Any]]:
+        """Semantic search over library embeddings."""
+        params: Dict[str, Any] = {"query": query, "max_results": max_results}
+        if category:
+            params["category"] = category
+        resp = await self._client.get("/api/v1/vector/search/libraries", params=params)
+        resp.raise_for_status()
+        data = resp.json()
+        return data.get("results", [])
+
+    async def vector_stats(self) -> Dict[str, int]:
+        """Return row counts for each vector table."""
+        resp = await self._client.get("/api/v1/vector/stats")
+        resp.raise_for_status()
+        return resp.json()
+
     # -----------------------------------------------------------------
     # Lifecycle
     # -----------------------------------------------------------------
diff --git a/mcp-server/utils/semantic_search.py b/mcp-server/utils/semantic_search.py
index f13dac276b01..df7d5ae93f65 100644
--- a/mcp-server/utils/semantic_search.py
+++ b/mcp-server/utils/semantic_search.py
@@ -1,5 +1,10 @@
 """
-Semantic search utilities combining embeddings and vector store.
+DEPRECATED — Semantic search utilities (local in-memory).
+
+Semantic search is now handled by the backend via pgvector.
+The MCP server delegates to /api/v1/vector/search/* endpoints.
+
+This file is kept for reference only.
 """
 import os
 import logging
diff --git a/mcp-server/utils/vector_store.py b/mcp-server/utils/vector_store.py
index 75c6b3d31265..e1fd4b01330d 100644
--- a/mcp-server/utils/vector_store.py
+++ b/mcp-server/utils/vector_store.py
@@ -1,12 +1,12 @@
 """
-Vector store for semantic search.
+DEPRECATED — Local vector store (in-memory / JSON file).
 
-In-memory vector store with JSON file persistence. Embeddings are stored
-in a local JSON file and loaded into memory on startup. Brute-force
-cosine similarity with numpy. Fast enough for thousands of vectors
-(our dataset is ~200-500 items).
+Vector storage and search have moved to the backend, which uses pgvector
+in PostgreSQL. See:
+  - cc-registry-v2/backend/app/services/vector_service.py
+  - cc-registry-v2/backend/app/tasks/indexing_tasks.py
 
-Zero external infrastructure required beyond the embedding API itself.
+This file is kept for reference only.
 """
 import json
 import logging