Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
3308701
feat: Add mcpchecker evals for obs-mcp tools
slashpai Feb 25, 2026
2e558dd
feat(evals): enable parallel execution and update for mcpchecker nightly
slashpai Mar 11, 2026
bcd8391
chore(evals): organize evals into separate subdirectories
slashpai Mar 11, 2026
5b59e75
feat(evals): switch default mcpchecker agent to openai:gpt-4o-mini
slashpai Mar 11, 2026
652e6a7
docs: add MCPChecker evals to TESTING.md and fix README accuracy
slashpai Mar 11, 2026
1e6415e
chore: add .env to gitignore
slashpai Mar 11, 2026
2c416bd
docs(evals): streamline mcpchecker README agent configuration section
slashpai Mar 11, 2026
ffd253f
docs(evals): clarify agent vs judge LLM roles in mcpchecker README
slashpai Mar 11, 2026
bd2d421
chore: add Makefile targets to deploy additional kube-prometheus scra…
slashpai Mar 11, 2026
2a884b9
evals: strengthen mcpchecker assertions and add hard-difficulty tasks
slashpai Mar 13, 2026
d2db200
evals: relax query assertions to accept either instant or range queries
slashpai Mar 13, 2026
6105385
docs: add PROMPTS.md with example prompts for testing obs-mcp tools
slashpai Mar 13, 2026
9609d1f
docs: move PROMPTS.md and METRICS_REFERENCE.md to docs/dev/
slashpai Mar 13, 2026
09aa69f
evals: update mcpchecker config for v0.0.14
slashpai Apr 2, 2026
c83e3c8
evals: switch agent and judge model to gpt-5-nano
slashpai Apr 2, 2026
c79a357
chore: add makefile target to install mcpchecker
slashpai Apr 14, 2026
4eaf77f
docs: consolidate mcpchecker eval docs and update for v0.0.15
slashpai Apr 14, 2026
206362d
docs: add CATEGORY filter for mcpchecker evals
slashpai Apr 14, 2026
b46ecca
chore: Add 4 new mcpchecker tasks
slashpai Apr 14, 2026
1abac33
evals: add smoke test, time range task, and fix weak contains check
slashpai Apr 14, 2026
c1852c5
evals: reduce default runs to 1, improve assertions, and add visualiz…
slashpai Apr 14, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
**/.testcache

# Development
.env
.devcontainer/dev.env
integration-tests/videos
integration-tests/screenshots
Expand All @@ -20,3 +21,7 @@ tmp/*

cpu.prof
mem.prof

# mcpchecker eval output files
evals/mcpchecker/*-out*.json
evals/mcpchecker/results/
64 changes: 64 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ CONTAINER_CLI ?= docker
# Container image reference without the tag. `?=` keeps any value already
# supplied by the environment or the command line.
IMAGE ?= ghcr.io/rhobs/obs-mcp
# Image tag; defaults to the abbreviated hash of the current git HEAD.
TAG ?= $(shell git rev-parse --short HEAD)
# NOTE(review): presumably the location of tooling sources/helpers — confirm
# against the rules that use it (not visible in this hunk).
TOOLS_DIR := hack/tools
# mcpchecker release to install, without the leading "v" (the install rule
# prepends it when building the download URL).
MCPCHECKER_VERSION ?= 0.0.15

# Working directory at parse time, as reported by `pwd`.
ROOT_DIR := $(shell pwd)
# Absolute directory that locally installed tool binaries (e.g. mcpchecker)
# are placed in.
TOOLS_BIN_DIR := $(ROOT_DIR)/tmp/bin
Expand Down Expand Up @@ -169,6 +170,69 @@ test-e2e-teardown: ## Teardown E2E test cluster
chmod +x hack/e2e/teardown-cluster.sh
CLUSTER_NAME=$(KIND_CLUSTER_NAME) ./hack/e2e/teardown-cluster.sh

# Host platform, normalised to the naming used in the mcpchecker release
# asset file names: lower-case kernel name, and amd64/arm64 instead of the
# x86_64/aarch64 spellings that `uname -m` reports on some systems.
MCPCHECKER_OS := $(shell uname -s | tr '[:upper:]' '[:lower:]')
MCPCHECKER_ARCH := $(subst aarch64,arm64,$(subst x86_64,amd64,$(shell uname -m)))

# Version stamp for the installed mcpchecker binary.
#
# The binary itself carries no version in its file name, so without this
# stamp an already-installed binary would be considered up to date forever,
# even after MCPCHECKER_VERSION is bumped. The stamp file name embeds the
# version: changing the version yields a stamp that does not exist yet, it
# gets created (newer than the binary), and that forces a re-install.
MCPCHECKER_STAMP := $(TOOLS_BIN_DIR)/.mcpchecker-v$(MCPCHECKER_VERSION).stamp

$(MCPCHECKER_STAMP): | $(TOOLS_BIN_DIR)
	@rm -f $(TOOLS_BIN_DIR)/.mcpchecker-v*.stamp
	@touch $@

# Download the release archive for the host OS/arch and unpack the binary
# into $(TOOLS_BIN_DIR).
#  - curl, unzip and the cleanup run in one shell so the EXIT trap removes the
#    temporary archive on failure as well as on success.
#  - `touch $@` is required: unzip restores the timestamp stored in the
#    archive, which would otherwise leave the binary older than the stamp and
#    trigger a download on every invocation.
$(TOOLS_BIN_DIR)/mcpchecker: $(MCPCHECKER_STAMP) | $(TOOLS_BIN_DIR)
	@echo "==> Installing mcpchecker v$(MCPCHECKER_VERSION) ($(MCPCHECKER_OS)/$(MCPCHECKER_ARCH))..."
	@trap 'rm -f "$@.zip"' EXIT; \
	curl -fsSL -o "$@.zip" \
		"https://github.com/mcpchecker/mcpchecker/releases/download/v$(MCPCHECKER_VERSION)/mcpchecker-$(MCPCHECKER_OS)-$(MCPCHECKER_ARCH).zip" && \
	unzip -o -q "$@.zip" -d "$(@D)"
	@chmod +x "$@"
	@touch "$@"
	@echo "✓ mcpchecker v$(MCPCHECKER_VERSION) installed to $@"

.PHONY: install-mcpchecker
install-mcpchecker: $(TOOLS_BIN_DIR)/mcpchecker ## Install mcpchecker CLI for running evals

MCPCHECKER_EVAL_DIR := evals/mcpchecker
RUNS ?= 1

# Flags passed to `mcpchecker check`, selected at parse time:
#   TASK set      -> run only that task, verbosely (no --parallel)
#   CATEGORY set  -> run the tasks labelled category=<CATEGORY>, 4 in parallel
#   neither       -> run everything, 4 in parallel
# TASK takes precedence when both are given.
ifdef TASK
MCPCHECKER_EVAL_FLAGS = --run "$(TASK)" --runs $(RUNS) --verbose
else ifdef CATEGORY
MCPCHECKER_EVAL_FLAGS = --label-selector "category=$(CATEGORY)" --runs $(RUNS) --parallel 4
else
MCPCHECKER_EVAL_FLAGS = --runs $(RUNS) --parallel 4
endif

# The eval is started from inside $(MCPCHECKER_EVAL_DIR), with eval.yaml
# given as a relative path; the binary is addressed through $(TOOLS_BIN_DIR).
.PHONY: run-mcpchecker-eval
run-mcpchecker-eval: $(TOOLS_BIN_DIR)/mcpchecker ## Run mcpchecker eval (TASK=name, CATEGORY=queries, RUNS=3 for consistency testing)
	cd $(MCPCHECKER_EVAL_DIR) && $(TOOLS_BIN_DIR)/mcpchecker check eval.yaml $(MCPCHECKER_EVAL_FLAGS)

# Apply every kubeStateMetrics-*.yaml manifest from the kube-prometheus
# checkout under tmp/, then wait (up to 3 minutes) for the Deployment in the
# `monitoring` namespace to finish rolling out.
#
# The loop aborts on the first failing `kubectl apply`: a shell `for` loop
# only reports the exit status of its last iteration, so without the explicit
# `|| exit 1` a failure on any earlier manifest would be silently ignored.
.PHONY: deploy-kube-state-metrics
deploy-kube-state-metrics: ## Deploy kube-state-metrics from kube-prometheus (for mcpchecker evals)
	@if [ ! -d "tmp/kube-prometheus" ]; then \
		echo "Error: tmp/kube-prometheus not found. Run 'make test-e2e-setup' first."; exit 1; \
	fi
	@echo "==> Installing kube-state-metrics..."
	@for f in tmp/kube-prometheus/manifests/kubeStateMetrics-*.yaml; do \
		kubectl apply -f "$$f" || exit 1; \
	done
	kubectl -n monitoring rollout status deployment/kube-state-metrics --timeout=3m

# Apply every nodeExporter-*.yaml manifest from the kube-prometheus checkout
# under tmp/, then wait (up to 3 minutes) for the DaemonSet in the
# `monitoring` namespace to finish rolling out.
#
# The loop aborts on the first failing `kubectl apply`: a shell `for` loop
# only reports the exit status of its last iteration, so without the explicit
# `|| exit 1` a failure on any earlier manifest would be silently ignored.
.PHONY: deploy-node-exporter
deploy-node-exporter: ## Deploy node-exporter from kube-prometheus (for mcpchecker evals)
	@if [ ! -d "tmp/kube-prometheus" ]; then \
		echo "Error: tmp/kube-prometheus not found. Run 'make test-e2e-setup' first."; exit 1; \
	fi
	@echo "==> Installing node-exporter..."
	@for f in tmp/kube-prometheus/manifests/nodeExporter-*.yaml; do \
		kubectl apply -f "$$f" || exit 1; \
	done
	kubectl -n monitoring rollout status daemonset/node-exporter --timeout=3m

# Apply every kubernetesControlPlane-*.yaml manifest from the kube-prometheus
# checkout under tmp/. Nothing is waited on afterwards (unlike the other
# deploy-* targets, there is no rollout status check here).
#
# The loop aborts on the first failing `kubectl apply`: a shell `for` loop
# only reports the exit status of its last iteration, so without the explicit
# `|| exit 1` a failure on any earlier manifest would be silently ignored.
.PHONY: deploy-kubelet-servicemonitors
deploy-kubelet-servicemonitors: ## Deploy kubelet/cAdvisor scrape configs from kube-prometheus (for container_* metrics)
	@if [ ! -d "tmp/kube-prometheus" ]; then \
		echo "Error: tmp/kube-prometheus not found. Run 'make test-e2e-setup' first."; exit 1; \
	fi
	@echo "==> Installing kubelet/cAdvisor ServiceMonitors..."
	@for f in tmp/kube-prometheus/manifests/kubernetesControlPlane-*.yaml; do \
		kubectl apply -f "$$f" || exit 1; \
	done

# Convenience aggregate with no recipe of its own: it only pulls in the three
# deploy-* targets listed as prerequisites.
.PHONY: deploy-more-kube-prom-targets
deploy-more-kube-prom-targets: deploy-kube-state-metrics deploy-node-exporter deploy-kubelet-servicemonitors ## Deploy additional kube-prometheus scrape targets (kube-state-metrics, node-exporter, kubelet)

# Aggregate with no recipe of its own; the E2E phases are expressed purely as
# prerequisites.
# NOTE(review): the phases presumably have to run in the listed order, which
# make only guarantees for serial (non -j) runs; and by default a failing
# phase stops make before the remaining ones (including teardown) run —
# confirm both are acceptable.
.PHONY: test-e2e-full
test-e2e-full: test-e2e-setup test-e2e-deploy test-e2e test-e2e-teardown ## Run full E2E test cycle (setup, test, teardown)

Expand Down
17 changes: 16 additions & 1 deletion TESTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,11 @@ make run-openshift-pf-prometheus # port-forwards prometheus-k8s-0:9090 + ale
**kube-prometheus or any other backend** — set URLs explicitly:

```bash
PROMETHEUS_URL=http://localhost:9090 ALERTMANAGER_URL=http://localhost:9093 make run
PROMETHEUS_URL=http://localhost:9090 ALERTMANAGER_URL=http://localhost:9093 AUTH_MODE=header make run
```

> **Note:** `AUTH_MODE=header` is required for Kind clusters because their kubeconfig uses client certificates instead of bearer tokens. The default `kubeconfig` auth mode will fail with a "kubeconfig doesn't contain a bearer token" error.

Override other defaults as needed:

```bash
Expand Down Expand Up @@ -99,3 +101,16 @@ OBS_MCP_URL=http://localhost:9100 make test-e2e # full MCP tool smok
```

> Note: `make test-e2e` without `OBS_MCP_URL` will attempt a port-forward to a Kind/k8s cluster. It will fail if no `obs-mcp` pod is running in the `obs-mcp` namespace.

## MCPChecker Evals

Validates that AI agents can discover and correctly use obs-mcp tools. See [`evals/mcpchecker/README.md`](evals/mcpchecker/README.md) for installation, environment setup, and detailed usage.

Quick start:

```bash
make run-mcpchecker-eval # run all tasks in parallel (1 run each)
make run-mcpchecker-eval CATEGORY=queries # run by category (metrics, labels, queries, alerts)
make run-mcpchecker-eval TASK=cpu-usage # single task, verbose
make run-mcpchecker-eval RUNS=3 # multiple runs for consistency testing
```
46 changes: 46 additions & 0 deletions docs/dev/METRICS_REFERENCE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Metrics Reference
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doc is for development reference


A quick reference mapping common questions to Prometheus metrics. Use this when `list_metrics` returns no relevant results—try the suggested regex patterns. Metrics vary by deployment (kube-prometheus, OpenShift, etc.); not all may exist in your cluster.

## list_metrics Regex Tips

Prometheus uses **full-string** regex matching. `kube_pod` does not match `kube_pod_container_status_terminated`. Use:

- **Prefix search:** `kube_pod_container_status.*` (matches any metric starting with that prefix)
- **Substring search:** `.*terminated.*` (matches any metric containing "terminated")

## Common Questions → Metrics

| Question | Suggested Metric(s) | list_metrics regex | Notes |
|----------|---------------------|--------------------|-------|
| OOMKilled containers | `kube_pod_container_status_last_terminated_reason` | `.*terminated_reason.*` | Check `reason="OOMKilled"` label. May not exist in all kube-state-metrics setups. |
| Pending pods | `kube_pod_status_phase` | `kube_pod_status_phase` | Filter `phase="Pending"` |
| Running pods | `kube_pod_status_phase` | `kube_pod_status_phase` | Filter `phase="Running"` |
| Crashlooping pods | `kube_pod_container_status_restarts_total` | `.*restarts.*` | Use range query with `increase()` |
| Pods created | `kube_pod_created` | `kube_pod_created` | Timestamp of pod creation |
| CPU usage (pods) | `container_cpu_usage_seconds_total` or `node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate` | `.*cpu.*` | Raw metric or pre-aggregated recording rule |
| Memory usage (pods) | `container_memory_working_set_bytes` or `node_namespace_pod_container:container_memory_working_set_bytes` | `.*memory.*` | Raw metric or pre-aggregated recording rule |
| Network traffic | `node_network_receive_bytes_total`, `node_network_transmit_bytes_total` | `node_network.*` | |
| Prometheus head series | `prometheus_tsdb_head_series` | `prometheus_tsdb.*` | |
| Prometheus WAL size | `prometheus_tsdb_wal_storage_size_bytes` | `prometheus_tsdb.*` | |
| Prometheus request rate | `prometheus_http_requests_total` | `prometheus_http.*` | Use `rate()` |

## Query Efficiency

Agents should prefer aggregated PromQL over querying individual series. For example:

| Goal | Inefficient (N queries) | Efficient (1 query) |
|------|------------------------|---------------------|
| Top CPU pods | One `execute_range_query` per pod | `topk(5, sum by (pod) (rate(container_cpu_usage_seconds_total[5m])))` |
| Namespace resource usage | One query per namespace | `sum by (namespace) (container_memory_working_set_bytes)` |
| Pod restart rate | One query per pod | `topk(10, increase(kube_pod_container_status_restarts_total[1h]))` |

Use `topk()`, `bottomk()`, `sum by()`, `avg by()`, and `rate()` to answer questions in 1-3 queries instead of one per entity.

## When a Metric Doesn't Exist

If `list_metrics` with the suggested regex returns nothing:

1. The metric may not be scraped in your setup (e.g. `kube_pod_container_status_last_terminated_reason` requires specific kube-state-metrics config).
2. Try broader patterns: `kube.*`, `node.*`, `container.*`.
3. Inform the user that the metric is not available in their cluster.
51 changes: 51 additions & 0 deletions docs/dev/PROMPTS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Prompts You Can Try
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doc is for development reference


This document lists example prompts you can use to test obs-mcp when connected to Cursor or another MCP client. These prompts align with the [MCPChecker evals](../../evals/mcpchecker/) and exercise different obs-mcp tools.

For metric discovery tips (e.g. regex behavior, common question → metric mapping), see [METRICS_REFERENCE.md](./METRICS_REFERENCE.md).

## Metric Discovery

- List all available Prometheus metrics that contain 'kube' in the name.
- What node-related metrics are available in Prometheus?

## Label Exploration

- What labels are available for the kube_pod_info metric?
- What are the unique namespace values for the kube_pod_info metric?
- How many time series exist for the kube_pod_info metric? Show the cardinality.

## Queries

- Which pods are using the most CPU?
- Which pods are stuck in pending state?
- Which pods are receiving the most network traffic?
- How many head series does Prometheus have?
- What is the current storage size of the Prometheus WAL?
- How many requests per second are being made to Prometheus?
- How many pods were created in the last 5 minutes?
- Which pods were crashlooping in the last 5 minutes?

## Alerts

- Are there any currently firing alerts in the cluster?
- Are there any active silences in Alertmanager?
- Check if there are any firing alerts. If there are, investigate the related metrics for the most critical alert and summarize what's happening.

## Multi-Step Investigation

These prompts are part of the eval suite (hard difficulty) and test complex reasoning:

- Which namespace is consuming the most CPU and memory? Show me the top namespace for each.
- Is the cluster healthy? Give me an overview of any issues.

## Bonus: Additional Prompts

These prompts go beyond the eval suite and test more complex workflows:

- What's the memory usage of pods in the monitoring namespace?
- Show me the container restart count for all pods over the last hour.
- Which nodes have the highest CPU utilization?
- What's the disk usage on the cluster nodes?
- Are any containers in OOMKilled state?
- How many pods are running in the cluster?
File renamed without changes.
6 changes: 3 additions & 3 deletions evals/README.md → evals/lightspeed/README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# obs-mcp evals
# Lightspeed Evals

The evaluation test set for obs-mcp, based on [lightspeed-evaluation](https://github.com/lightspeed-core/lightspeed-evaluation).

Expand All @@ -15,7 +15,7 @@ The evaluations testset for the obs-mcp based on [lightspeed-evaluation](https:/
- OpenShift cluster with:
- Thanos Querier or Prometheus accessible
- Valid kubeconfig or service account credentials
- obs-mcp server running and connected to Prometheus/Thanos Querier, [check readme for the instructions](../README.md)
- obs-mcp server running and connected to Prometheus/Thanos Querier, [check readme for the instructions](../../README.md)
- OpenAI API key

## Quickstart
Expand All @@ -35,7 +35,7 @@ git clone https://github.com/lightspeed-core/lightspeed-stack.git
cd lightspeed-stack
```

Copy the lightspeed configs from this [repo](../hack/lightspeed-stack) to above directory
Copy the lightspeed configs from this [repo](../../hack/lightspeed-stack) to above directory

**Note:** Adjust the path where obs-mcp is located accordingly in the below command

Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
2 changes: 2 additions & 0 deletions evals/mcpchecker/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
mcpchecker-*-out.json
*-error.txt
Loading