From b54f2bc64299bc325b5f78d61194723da93deef8 Mon Sep 17 00:00:00 2001 From: Harshal Patil <12152047+harche@users.noreply.github.com> Date: Thu, 19 Mar 2026 09:18:46 -0400 Subject: [PATCH] feat(openshift): add nodes_debug_exec tool for node-level debugging Add a privileged debug pod tool that mimics `oc debug node/` for running commands on OpenShift nodes. The tool creates a temporary pod with the UBI9 toolbox image, executes the command, collects output, and cleans up automatically. Key design decisions based on PR review feedback: - All new code lives in pkg/ocp/ and pkg/toolsets/openshift/ only, no upstream packages (pkg/kubernetes/, pkg/mcp/) are modified - Tool is in the "openshift" toolset (not "core") to avoid conflicts - NodeDebugClient interface decouples from concrete k8s client for testability - Error messages include command output/logs for better diagnostics - Default timeout is 1 minute (within MCP client timeout limits) - Gated behind ReadOnly config (read_only=false required to use) Co-Authored-By: Claude Opus 4.6 (1M context) --- README.md | 33 +++ docs/configuration.md | 1 + internal/tools/update-readme/main.go | 1 + pkg/config/config_default_overrides.go | 2 +- pkg/ocp/nodes_debug.go | 343 ++++++++++++++++++++++++ pkg/ocp/nodes_debug_test.go | 327 ++++++++++++++++++++++ pkg/ocp/nodes_debug_testhelpers_test.go | 153 +++++++++++ pkg/toolsets/openshift/nodes.go | 127 +++++++++ pkg/toolsets/openshift/nodes_test.go | 74 +++++ pkg/toolsets/openshift/toolset.go | 4 +- 10 files changed, 1063 insertions(+), 2 deletions(-) create mode 100644 pkg/ocp/nodes_debug.go create mode 100644 pkg/ocp/nodes_debug_test.go create mode 100644 pkg/ocp/nodes_debug_testhelpers_test.go create mode 100644 pkg/toolsets/openshift/nodes.go create mode 100644 pkg/toolsets/openshift/nodes_test.go diff --git a/README.md b/README.md index 6476348c2..4a65d2c4c 100644 --- a/README.md +++ b/README.md @@ -268,6 +268,7 @@ The following sets of tools are available (toolsets marked with ✓ 
in the Defau | kcp | Manage kcp workspaces and multi-tenancy features | | | kubevirt | KubeVirt virtual machine management tools, check the [KubeVirt documentation](https://github.com/containers/kubernetes-mcp-server/blob/main/docs/kubevirt.md) for more details. | | | observability | Cluster observability tools for querying Prometheus metrics and Alertmanager alerts | | +| openshift | OpenShift-specific tools for cluster management and troubleshooting | ✓ | | ossm | Most common tools for managing OSSM, check the [OSSM documentation](https://github.com/openshift/openshift-mcp-server/blob/main/docs/OSSM.md) for more details. | | @@ -500,6 +501,19 @@ Common use cases:
+openshift + +- **nodes_debug_exec** - Run commands on an OpenShift node using a privileged debug pod with comprehensive troubleshooting utilities. The debug pod uses the UBI9 toolbox image which includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap), process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), and debugging tools (gdb). The host filesystem is mounted at /host, allowing commands to chroot /host if needed to access node-level resources. Output is truncated to the most recent 100 lines, so prefer filters like grep when expecting large logs. + - `command` (`array`) **(required)** - Command to execute on the node. All standard debugging utilities from the UBI9 toolbox are available. The host filesystem is mounted at /host - use 'chroot /host <command>' to access node-level resources, or run commands directly in the toolbox environment. Provide each argument as a separate array item (e.g. ['chroot', '/host', 'systemctl', 'status', 'kubelet'] or ['journalctl', '-u', 'kubelet', '--since', '1 hour ago']). + - `image` (`string`) - Container image to use for the debug pod (optional). Defaults to registry.access.redhat.com/ubi9/toolbox:latest which provides comprehensive debugging and troubleshooting utilities. + - `namespace` (`string`) - Namespace to create the temporary debug pod in (optional, defaults to the current namespace or 'default'). + - `node` (`string`) **(required)** - Name of the node to debug (e.g. worker-0). + - `timeout_seconds` (`integer`) - Maximum time to wait for the command to complete before timing out (optional, defaults to 60 seconds). + +
+ +
+ ossm - **ossm_mesh_graph** - Returns the topology of a specific namespaces, health, status of the mesh and namespaces. Includes a mesh health summary overview with aggregated counts of healthy, degraded, and failing apps, workloads, and services. Use this for high-level overviews @@ -590,6 +604,25 @@ Common use cases:
+
+ +openshift + +- **plan_mustgather** - Plan for collecting a must-gather archive from an OpenShift cluster. Must-gather is a tool for collecting cluster data related to debugging and troubleshooting like logs, kubernetes resources, etc. + - `node_name` (`string`) - Specific node name to run must-gather pod on + - `node_selector` (`string`) - Node selector in key=value,key2=value2 format to filter nodes for the pod + - `source_dir` (`string`) - Custom gather directory inside pod (default: /must-gather) + - `namespace` (`string`) - Privileged namespace to use for must-gather (auto-generated if not specified) + - `gather_command` (`string`) - Custom gather command eg. /usr/bin/gather_audit_logs (default: /usr/bin/gather) + - `timeout` (`string`) - Timeout duration for gather command (eg. 30m, 1h) + - `since` (`string`) - Only gather data newer than this duration (eg. 5s, 2m5s, or 3h6m10s) defaults to all data. + - `host_network` (`string`) - Use host network for must-gather pod (true/false) + - `keep_resources` (`string`) - Keep pod resources after collection (true/false, default: false) + - `all_component_images` (`string`) - Include must-gather images from all installed operators (true/false) + - `images` (`string`) - Comma-separated list of custom must-gather container images + +
+ diff --git a/docs/configuration.md b/docs/configuration.md index edf910454..10eda64b7 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -267,6 +267,7 @@ Toolsets group related tools together. Enable only the toolsets you need to redu | kcp | Manage kcp workspaces and multi-tenancy features | | | kubevirt | KubeVirt virtual machine management tools, check the [KubeVirt documentation](https://github.com/containers/kubernetes-mcp-server/blob/main/docs/kubevirt.md) for more details. | | | observability | Cluster observability tools for querying Prometheus metrics and Alertmanager alerts | | +| openshift | OpenShift-specific tools for cluster management and troubleshooting | ✓ | | ossm | Most common tools for managing OSSM, check the [OSSM documentation](https://github.com/openshift/openshift-mcp-server/blob/main/docs/OSSM.md) for more details. | | diff --git a/internal/tools/update-readme/main.go b/internal/tools/update-readme/main.go index 4163318a3..4807dfb4a 100644 --- a/internal/tools/update-readme/main.go +++ b/internal/tools/update-readme/main.go @@ -20,6 +20,7 @@ import ( _ "github.com/containers/kubernetes-mcp-server/pkg/toolsets/kiali" _ "github.com/containers/kubernetes-mcp-server/pkg/toolsets/kubevirt" _ "github.com/containers/kubernetes-mcp-server/pkg/toolsets/observability" + _ "github.com/containers/kubernetes-mcp-server/pkg/toolsets/openshift" ) type OpenShift struct{} diff --git a/pkg/config/config_default_overrides.go b/pkg/config/config_default_overrides.go index 9bcf4ab76..9e901b393 100644 --- a/pkg/config/config_default_overrides.go +++ b/pkg/config/config_default_overrides.go @@ -5,6 +5,6 @@ func defaultOverrides() StaticConfig { // IMPORTANT: this file is used to override default config values in downstream builds. 
// For current release we want to just expose the settings below: ReadOnly: true, - Toolsets: []string{"core", "config"}, + Toolsets: []string{"core", "config", "openshift"}, } } diff --git a/pkg/ocp/nodes_debug.go b/pkg/ocp/nodes_debug.go new file mode 100644 index 000000000..f4214a5e0 --- /dev/null +++ b/pkg/ocp/nodes_debug.go @@ -0,0 +1,343 @@ +package ocp + +import ( + "context" + "errors" + "fmt" + "strings" + "time" + + "github.com/containers/kubernetes-mcp-server/pkg/api" + "github.com/containers/kubernetes-mcp-server/pkg/kubernetes" + "github.com/containers/kubernetes-mcp-server/pkg/version" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/rand" + corev1client "k8s.io/client-go/kubernetes/typed/core/v1" + "k8s.io/utils/ptr" +) + +const ( + // DefaultNodeDebugImage is the UBI9 toolbox image that provides comprehensive debugging and troubleshooting utilities. + // This image includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap), + // process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), debugging tools (gdb), + // and many other utilities commonly needed for node-level debugging and diagnostics. + DefaultNodeDebugImage = "registry.access.redhat.com/ubi9/toolbox:latest" + // NodeDebugContainerName is the name used for the debug container, matching 'oc debug node' defaults. + NodeDebugContainerName = "debug" + // DefaultNodeDebugTimeout is the maximum time to wait for the debug pod to finish executing. + DefaultNodeDebugTimeout = 1 * time.Minute +) + +// NodeDebugClient defines the minimal interface for node debug operations. +// This allows for easier testing and decoupling from the concrete kubernetes client. 
+type NodeDebugClient interface { + NamespaceOrDefault(namespace string) string + Pods(namespace string) corev1client.PodInterface + PodsLog(ctx context.Context, namespace, name, container string, previous bool, tail int64) (string, error) +} + +// nodeDebugAdapter adapts api.KubernetesClient to implement NodeDebugClient. +type nodeDebugAdapter struct { + k api.KubernetesClient +} + +// NewNodeDebugClient creates a NodeDebugClient from an api.KubernetesClient. +func NewNodeDebugClient(k api.KubernetesClient) NodeDebugClient { + return &nodeDebugAdapter{k: k} +} + +func (a *nodeDebugAdapter) NamespaceOrDefault(namespace string) string { + return a.k.NamespaceOrDefault(namespace) +} + +func (a *nodeDebugAdapter) Pods(namespace string) corev1client.PodInterface { + return a.k.CoreV1().Pods(namespace) +} + +func (a *nodeDebugAdapter) PodsLog(ctx context.Context, namespace, name, container string, previous bool, tail int64) (string, error) { + return kubernetes.NewCore(a.k).PodsLog(ctx, namespace, name, container, previous, tail) +} + +// NodesDebugExec mimics `oc debug node/ -- ` by creating a privileged pod on the target +// node, running the provided command, collecting its output, and removing the pod afterwards. +// The host filesystem is mounted at /host, allowing commands to chroot /host if needed to access node resources. +// +// When namespace is empty, the configured namespace (or "default" if none) is used. When image is empty the +// default debug image is used. Timeout controls how long we wait for the pod to complete. 
+func NodesDebugExec( + ctx context.Context, + k NodeDebugClient, + namespace string, + nodeName string, + image string, + command []string, + timeout time.Duration, +) (string, error) { + if nodeName == "" { + return "", errors.New("node name is required") + } + if len(command) == 0 { + return "", errors.New("command is required") + } + + ns := k.NamespaceOrDefault(namespace) + if ns == "" { + ns = "default" + } + debugImage := image + if debugImage == "" { + debugImage = DefaultNodeDebugImage + } + if timeout <= 0 { + timeout = DefaultNodeDebugTimeout + } + + // Create the debug pod + created, err := createDebugPod(ctx, k, nodeName, ns, debugImage, command) + if err != nil { + return "", err + } + + // Ensure the pod is deleted regardless of completion state. + defer func() { + deleteCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + _ = k.Pods(ns).Delete(deleteCtx, created.Name, metav1.DeleteOptions{}) + }() + + // Poll for debug pod completion + terminated, lastPod, waitMsg, pollErr := pollForCompletion(ctx, k, ns, created.Name, timeout) + + // Retrieve logs even on poll errors (e.g. timeout) — the pod may have produced partial output. + logs, _ := retrieveLogs(context.Background(), k, ns, created.Name) + + if pollErr != nil { + if logs != "" { + return "", fmt.Errorf("%w\nOutput:\n%s", pollErr, logs) + } + return "", pollErr + } + + // Process the results + return processResults(terminated, lastPod, waitMsg, logs) +} + +// createDebugPod creates a privileged pod on the target node to run debug commands. 
+func createDebugPod( + ctx context.Context, + k NodeDebugClient, + nodeName string, + namespace string, + image string, + command []string, +) (*corev1.Pod, error) { + sanitizedNode := sanitizeForName(nodeName) + hostPathType := corev1.HostPathDirectory + + suffix := rand.String(5) + maxNodeLen := 63 - len("node-debug-") - 1 - len(suffix) + if maxNodeLen < 1 { + maxNodeLen = 1 + } + if len(sanitizedNode) > maxNodeLen { + sanitizedNode = strings.TrimRight(sanitizedNode[:maxNodeLen], "-") + } + podName := fmt.Sprintf("node-debug-%s-%s", sanitizedNode, suffix) + + debugPod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: podName, + Namespace: namespace, + Labels: map[string]string{ + kubernetes.AppKubernetesManagedBy: version.BinaryName, + kubernetes.AppKubernetesComponent: "node-debug", + kubernetes.AppKubernetesName: fmt.Sprintf("node-debug-%s", sanitizedNode), + }, + }, + Spec: corev1.PodSpec{ + AutomountServiceAccountToken: ptr.To(false), + HostNetwork: true, + HostPID: true, + HostIPC: true, + NodeName: nodeName, + RestartPolicy: corev1.RestartPolicyNever, + SecurityContext: &corev1.PodSecurityContext{ + RunAsUser: ptr.To[int64](0), + }, + Tolerations: []corev1.Toleration{ + {Operator: corev1.TolerationOpExists}, + {Operator: corev1.TolerationOpExists, Effect: corev1.TaintEffectNoSchedule}, + {Operator: corev1.TolerationOpExists, Effect: corev1.TaintEffectNoExecute}, + }, + Volumes: []corev1.Volume{ + { + Name: "host-root", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/", + Type: &hostPathType, + }, + }, + }, + }, + Containers: []corev1.Container{ + { + Name: NodeDebugContainerName, + Image: image, + ImagePullPolicy: corev1.PullIfNotPresent, + Command: command, + SecurityContext: &corev1.SecurityContext{ + Privileged: ptr.To(true), + RunAsUser: ptr.To[int64](0), + }, + VolumeMounts: []corev1.VolumeMount{ + {Name: "host-root", MountPath: "/host"}, + }, + }, + }, + }, + } + + created, err := 
k.Pods(namespace).Create(ctx, debugPod, metav1.CreateOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to create debug pod: %w", err) + } + + return created, nil +} + +// pollForCompletion polls the debug pod until it completes or times out. +func pollForCompletion( + ctx context.Context, + k NodeDebugClient, + namespace string, + podName string, + timeout time.Duration, +) (*corev1.ContainerStateTerminated, *corev1.Pod, string, error) { + pollCtx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + ticker := time.NewTicker(2 * time.Second) + defer ticker.Stop() + + var ( + lastPod *corev1.Pod + terminated *corev1.ContainerStateTerminated + waitMsg string + ) + + for { + current, err := k.Pods(namespace).Get(pollCtx, podName, metav1.GetOptions{}) + if err != nil { + return nil, nil, "", fmt.Errorf("failed to get debug pod status: %w", err) + } + lastPod = current + + if status := containerStatusByName(current.Status.ContainerStatuses, NodeDebugContainerName); status != nil { + if status.State.Waiting != nil { + waitMsg = fmt.Sprintf("container waiting: %s", status.State.Waiting.Reason) + // Image pull issues should fail fast. + if status.State.Waiting.Reason == "ErrImagePull" || status.State.Waiting.Reason == "ImagePullBackOff" { + return nil, nil, "", fmt.Errorf("debug container failed to start (%s): %s", status.State.Waiting.Reason, status.State.Waiting.Message) + } + } + if status.State.Terminated != nil { + terminated = status.State.Terminated + break + } + } + + if current.Status.Phase == corev1.PodFailed { + break + } + + // Wait for the next tick interval before checking pod status again, or timeout if context is done. + select { + case <-pollCtx.Done(): + return nil, lastPod, waitMsg, fmt.Errorf("timed out waiting for debug pod %s to complete: %w", podName, pollCtx.Err()) + case <-ticker.C: + } + } + + return terminated, lastPod, waitMsg, nil +} + +// retrieveLogs retrieves the logs from the debug pod. 
+func retrieveLogs(ctx context.Context, k NodeDebugClient, namespace, podName string) (string, error) { + logCtx, logCancel := context.WithTimeout(ctx, 30*time.Second) + defer logCancel() + logs, logErr := k.PodsLog(logCtx, namespace, podName, NodeDebugContainerName, false, 0) + if logErr != nil { + return "", fmt.Errorf("failed to retrieve debug pod logs: %w", logErr) + } + return strings.TrimSpace(logs), nil +} + +// processResults processes the debug pod completion status and returns the appropriate result. +func processResults(terminated *corev1.ContainerStateTerminated, lastPod *corev1.Pod, waitMsg, logs string) (string, error) { + if terminated != nil { + if terminated.ExitCode != 0 { + errMsg := fmt.Sprintf("command exited with code %d", terminated.ExitCode) + if terminated.Reason != "" { + errMsg = fmt.Sprintf("%s (%s)", errMsg, terminated.Reason) + } + if terminated.Message != "" { + errMsg = fmt.Sprintf("%s: %s", errMsg, terminated.Message) + } + if logs != "" { + errMsg = fmt.Sprintf("%s\nOutput:\n%s", errMsg, logs) + } + return "", errors.New(errMsg) + } + return logs, nil + } + + if lastPod != nil && lastPod.Status.Reason != "" { + if logs != "" { + return "", fmt.Errorf("debug pod failed: %s\nOutput:\n%s", lastPod.Status.Reason, logs) + } + return "", fmt.Errorf("debug pod failed: %s", lastPod.Status.Reason) + } + if waitMsg != "" { + if logs != "" { + return "", fmt.Errorf("debug container did not complete: %s\nOutput:\n%s", waitMsg, logs) + } + return "", fmt.Errorf("debug container did not complete: %s", waitMsg) + } + if logs != "" { + return "", fmt.Errorf("debug container did not reach a terminal state\nOutput:\n%s", logs) + } + return "", errors.New("debug container did not reach a terminal state") +} + +func sanitizeForName(name string) string { + lower := strings.ToLower(name) + var b strings.Builder + b.Grow(len(lower)) + for _, r := range lower { + if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '-' { + b.WriteRune(r) + continue 
+ } + b.WriteRune('-') + } + sanitized := strings.Trim(b.String(), "-") + if sanitized == "" { + sanitized = "node" + } + if len(sanitized) > 40 { + sanitized = sanitized[:40] + } + return sanitized +} + +func containerStatusByName(statuses []corev1.ContainerStatus, name string) *corev1.ContainerStatus { + for idx := range statuses { + if statuses[idx].Name == name { + return &statuses[idx] + } + } + return nil +} diff --git a/pkg/ocp/nodes_debug_test.go b/pkg/ocp/nodes_debug_test.go new file mode 100644 index 000000000..6a3f300eb --- /dev/null +++ b/pkg/ocp/nodes_debug_test.go @@ -0,0 +1,327 @@ +package ocp + +import ( + "context" + "fmt" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/suite" + corev1 "k8s.io/api/core/v1" +) + +type NodesDebugSuite struct { + suite.Suite +} + +func (s *NodesDebugSuite) TestNodesDebugExecCreatesPrivilegedPod() { + env := NewNodeDebugTestEnv(s.T()) + env.Pods.Logs = "kernel 6.8" + + out, err := NodesDebugExec(context.Background(), env.Client, "", "worker-0", "", []string{"uname", "-a"}, 2*time.Minute) + s.Run("returns logs on success", func() { + s.Require().NoError(err) + s.Equal("kernel 6.8", out) + }) + + created := env.Pods.Created + s.Require().NotNil(created, "expected debug pod to be created") + + s.Run("uses default namespace fallback", func() { + s.Equal("default", created.Namespace) + }) + s.Run("targets correct node", func() { + s.Equal("worker-0", created.Spec.NodeName) + }) + s.Run("deletes pod after execution", func() { + s.True(env.Pods.Deleted) + }) + s.Run("creates single container with correct defaults", func() { + s.Require().Len(created.Spec.Containers, 1) + container := created.Spec.Containers[0] + s.Equal(DefaultNodeDebugImage, container.Image) + s.Equal([]string{"uname", "-a"}, container.Command) + s.Require().NotNil(container.SecurityContext) + s.Require().NotNil(container.SecurityContext.Privileged) + s.True(*container.SecurityContext.Privileged) + s.Require().Len(container.VolumeMounts, 1) 
+ s.Equal("/host", container.VolumeMounts[0].MountPath) + }) + s.Run("runs as root", func() { + s.Require().NotNil(created.Spec.SecurityContext) + s.Require().NotNil(created.Spec.SecurityContext.RunAsUser) + s.Equal(int64(0), *created.Spec.SecurityContext.RunAsUser) + }) + s.Run("mounts host root volume", func() { + s.Require().Len(created.Spec.Volumes, 1) + s.Require().NotNil(created.Spec.Volumes[0].HostPath) + }) +} + +func (s *NodesDebugSuite) TestNodesDebugExecReturnsErrorForNonZeroExit() { + env := NewNodeDebugTestEnv(s.T()) + env.Pods.ExitCode = 5 + env.Pods.TerminatedReason = "Error" + env.Pods.TerminatedMessage = "some failure" + env.Pods.Logs = "bad things happened" + + out, err := NodesDebugExec(context.Background(), env.Client, "debug-ns", "infra-node", "registry.example/custom:latest", []string{"journalctl", "-xe"}, time.Minute) + + s.Run("returns error with logs included", func() { + s.Require().Error(err) + s.Contains(err.Error(), "bad things happened") + s.Contains(err.Error(), "command exited with code 5") + }) + s.Run("returns empty output on error", func() { + s.Empty(out) + }) + s.Run("uses provided namespace and image", func() { + s.Require().NotNil(env.Pods.Created) + s.Equal("debug-ns", env.Pods.Created.Namespace) + s.Equal("registry.example/custom:latest", env.Pods.Created.Spec.Containers[0].Image) + }) +} + +func (s *NodesDebugSuite) TestCreateDebugPod() { + env := NewNodeDebugTestEnv(s.T()) + + created, err := createDebugPod(context.Background(), env.Client, "worker-1", "test-ns", "custom:v1", []string{"ls", "-la"}) + s.Require().NoError(err) + s.Require().NotNil(created) + + s.Run("sets correct namespace", func() { + s.Equal("test-ns", created.Namespace) + }) + s.Run("targets correct node", func() { + s.Equal("worker-1", created.Spec.NodeName) + }) + s.Run("generates valid pod name", func() { + s.True(strings.HasPrefix(created.Name, "node-debug-worker-1-")) + s.LessOrEqual(len(created.Name), 63, "pod name exceeds DNS label length") + }) + 
s.Run("uses specified image and command", func() { + s.Require().Len(created.Spec.Containers, 1) + s.Equal("custom:v1", created.Spec.Containers[0].Image) + s.Equal([]string{"ls", "-la"}, created.Spec.Containers[0].Command) + s.Require().NotNil(created.Spec.Containers[0].SecurityContext) + s.True(*created.Spec.Containers[0].SecurityContext.Privileged) + }) +} + +func (s *NodesDebugSuite) TestPollForCompletion() { + tests := []struct { + name string + exitCode int32 + terminatedReason string + waitingReason string + waitingMessage string + expectError bool + expectTerminated bool + errorContains []string + expectedExitCode int32 + expectedReason string + }{ + { + name: "successful completion", + exitCode: 0, + expectTerminated: true, + expectedExitCode: 0, + }, + { + name: "non-zero exit code", + exitCode: 42, + terminatedReason: "Error", + expectTerminated: true, + expectedExitCode: 42, + expectedReason: "Error", + }, + { + name: "image pull error", + waitingReason: "ErrImagePull", + waitingMessage: "image not found", + expectError: true, + errorContains: []string{"ErrImagePull", "image not found"}, + }, + { + name: "image pull backoff", + waitingReason: "ImagePullBackOff", + waitingMessage: "back-off pulling image", + expectError: true, + errorContains: []string{"ImagePullBackOff", "back-off pulling image"}, + }, + } + + for _, tt := range tests { + s.Run(tt.name, func() { + env := NewNodeDebugTestEnv(s.T()) + env.Pods.ExitCode = tt.exitCode + env.Pods.TerminatedReason = tt.terminatedReason + env.Pods.WaitingReason = tt.waitingReason + env.Pods.WaitingMessage = tt.waitingMessage + + created, _ := createDebugPod(context.Background(), env.Client, "node-1", "default", DefaultNodeDebugImage, []string{"echo", "test"}) + + terminated, lastPod, waitMsg, err := pollForCompletion(context.Background(), env.Client, "default", created.Name, time.Minute) + + if tt.expectError { + s.Require().Error(err) + for _, substr := range tt.errorContains { + s.Contains(err.Error(), 
substr) + } + return + } + + s.Require().NoError(err) + + if tt.expectTerminated { + s.Require().NotNil(terminated) + s.Equal(tt.expectedExitCode, terminated.ExitCode) + if tt.expectedReason != "" { + s.Equal(tt.expectedReason, terminated.Reason) + } + s.NotNil(lastPod) + } + + if tt.waitingReason == "" { + s.Empty(waitMsg) + } + }) + } +} + +func (s *NodesDebugSuite) TestRetrieveLogs() { + env := NewNodeDebugTestEnv(s.T()) + env.Pods.Logs = " some output with whitespace \n" + + created, _ := createDebugPod(context.Background(), env.Client, "node-1", "default", DefaultNodeDebugImage, []string{"echo", "test"}) + + logs, err := retrieveLogs(context.Background(), env.Client, "default", created.Name) + s.Require().NoError(err) + s.Equal("some output with whitespace", logs) +} + +func (s *NodesDebugSuite) TestProcessResults() { + tests := []struct { + name string + terminated *corev1.ContainerStateTerminated + pod *corev1.Pod + waitMsg string + logs string + expectError bool + errorContains []string + expectedResult string + }{ + { + name: "successful completion", + terminated: &corev1.ContainerStateTerminated{ExitCode: 0}, + logs: "success output", + expectedResult: "success output", + }, + { + name: "non-zero exit code with logs", + terminated: &corev1.ContainerStateTerminated{ + ExitCode: 127, + Reason: "CommandNotFound", + Message: "command not found", + }, + logs: "error logs", + expectError: true, + errorContains: []string{"127", "CommandNotFound", "command not found", "error logs", "Output:"}, + }, + { + name: "non-zero exit code without reason but with logs", + terminated: &corev1.ContainerStateTerminated{ExitCode: 1}, + logs: "failed output", + expectError: true, + errorContains: []string{"command exited with code 1", "failed output", "Output:"}, + }, + { + name: "non-zero exit code without logs", + terminated: &corev1.ContainerStateTerminated{ExitCode: 1}, + expectError: true, + errorContains: []string{"command exited with code 1"}, + }, + { + name: "pod 
failed with logs", + pod: &corev1.Pod{Status: corev1.PodStatus{Reason: "Evicted"}}, + logs: "pod evicted logs", + expectError: true, + errorContains: []string{"Evicted", "pod evicted logs", "Output:"}, + }, + { + name: "pod failed without logs", + pod: &corev1.Pod{Status: corev1.PodStatus{Reason: "Evicted"}}, + expectError: true, + errorContains: []string{"Evicted"}, + }, + { + name: "container waiting with logs", + waitMsg: "container waiting: ImagePullBackOff", + logs: "waiting logs", + expectError: true, + errorContains: []string{"did not complete", "waiting logs", "Output:"}, + }, + { + name: "container waiting without logs", + waitMsg: "container waiting: ImagePullBackOff", + expectError: true, + errorContains: []string{"did not complete"}, + }, + { + name: "no terminal state with logs", + logs: "incomplete logs", + expectError: true, + errorContains: []string{"did not reach a terminal state", "incomplete logs", "Output:"}, + }, + { + name: "no terminal state without logs", + expectError: true, + errorContains: []string{"did not reach a terminal state"}, + }, + } + + for _, tt := range tests { + s.Run(tt.name, func() { + result, err := processResults(tt.terminated, tt.pod, tt.waitMsg, tt.logs) + + if tt.expectError { + s.Require().Error(err) + for _, substr := range tt.errorContains { + s.Contains(err.Error(), substr) + } + s.Empty(result) + } else { + s.Require().NoError(err) + s.Equal(tt.expectedResult, result) + } + }) + } +} + +func (s *NodesDebugSuite) TestSanitizeForName() { + tests := []struct { + input string + expected string + }{ + {"worker-0", "worker-0"}, + {"WORKER-0", "worker-0"}, + {"worker.0", "worker-0"}, + {"worker_0", "worker-0"}, + {"ip-10-0-1-42.ec2.internal", "ip-10-0-1-42-ec2-internal"}, + {"", "node"}, + {"---", "node"}, + {strings.Repeat("a", 50), strings.Repeat("a", 40)}, + {"Worker-Node_123.domain", "worker-node-123-domain"}, + } + + for _, tt := range tests { + s.Run(fmt.Sprintf("sanitize(%q)", tt.input), func() { + 
s.Equal(tt.expected, sanitizeForName(tt.input)) + }) + } +} + +func TestNodesDebug(t *testing.T) { + suite.Run(t, new(NodesDebugSuite)) +} diff --git a/pkg/ocp/nodes_debug_testhelpers_test.go b/pkg/ocp/nodes_debug_testhelpers_test.go new file mode 100644 index 000000000..99d97417f --- /dev/null +++ b/pkg/ocp/nodes_debug_testhelpers_test.go @@ -0,0 +1,153 @@ +package ocp + +import ( + "context" + "fmt" + "io" + "net/http" + "net/url" + "strings" + "testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + schemek8s "k8s.io/client-go/kubernetes/scheme" + corev1client "k8s.io/client-go/kubernetes/typed/core/v1" + restclient "k8s.io/client-go/rest" +) + +// NodeDebugTestEnv bundles a test client with a controllable pods client for tests. +type NodeDebugTestEnv struct { + Client *FakeNodeDebugClient + Pods *FakePodInterface +} + +// NewNodeDebugTestEnv constructs a testing harness for exercising NodesDebugExec. +func NewNodeDebugTestEnv(t *testing.T) *NodeDebugTestEnv { + t.Helper() + + podsClient := &FakePodInterface{} + fakeClient := &FakeNodeDebugClient{ + pods: podsClient, + namespace: "default", + } + + return &NodeDebugTestEnv{ + Client: fakeClient, + Pods: podsClient, + } +} + +// FakeNodeDebugClient implements the NodeDebugClient interface for testing. 
+type FakeNodeDebugClient struct {
+	pods      *FakePodInterface
+	namespace string
+}
+
+func (f *FakeNodeDebugClient) NamespaceOrDefault(namespace string) string {
+	if namespace == "" {
+		return f.namespace
+	}
+	return namespace
+}
+
+func (f *FakeNodeDebugClient) Pods(_ string) corev1client.PodInterface {
+	return f.pods
+}
+
+func (f *FakeNodeDebugClient) PodsLog(ctx context.Context, namespace, name, container string, previous bool, tail int64) (string, error) {
+	req := f.pods.GetLogs(name, &corev1.PodLogOptions{Container: container, Previous: previous})
+	res := req.Do(ctx)
+	if res.Error() != nil {
+		return "", res.Error()
+	}
+	rawData, err := res.Raw()
+	if err != nil {
+		return "", err
+	}
+	return string(rawData), nil
+}
+
+// FakePodInterface implements corev1client.PodInterface with deterministic behaviour for tests.
+type FakePodInterface struct {
+	corev1client.PodInterface
+	Created           *corev1.Pod
+	Deleted           bool
+	ExitCode          int32
+	TerminatedReason  string
+	TerminatedMessage string
+	WaitingReason     string
+	WaitingMessage    string
+	Logs              string
+}
+
+func (f *FakePodInterface) Create(_ context.Context, pod *corev1.Pod, _ metav1.CreateOptions) (*corev1.Pod, error) {
+	created := pod.DeepCopy()
+	if created.Name == "" && created.GenerateName != "" {
+		created.Name = created.GenerateName + "test"
+	}
+	f.Created = created
+	return created.DeepCopy(), nil
+}
+
+func (f *FakePodInterface) Get(_ context.Context, _ string, _ metav1.GetOptions) (*corev1.Pod, error) {
+	if f.Created == nil {
+		return nil, fmt.Errorf("pod not created yet")
+	}
+	pod := f.Created.DeepCopy()
+
+	// If waiting state is set, return that instead of terminated
+	if f.WaitingReason != "" {
+		waiting := &corev1.ContainerStateWaiting{Reason: f.WaitingReason}
+		if f.WaitingMessage != "" {
+			waiting.Message = f.WaitingMessage
+		}
+		pod.Status.ContainerStatuses = []corev1.ContainerStatus{{
+			Name:  NodeDebugContainerName,
+			State: corev1.ContainerState{Waiting: waiting},
+		}}
+		pod.Status.Phase = corev1.PodPending
+		return pod, nil
+	}
+
+	// Otherwise return terminated state
+	terminated := &corev1.ContainerStateTerminated{ExitCode: f.ExitCode}
+	if f.TerminatedReason != "" {
+		terminated.Reason = f.TerminatedReason
+	}
+	if f.TerminatedMessage != "" {
+		terminated.Message = f.TerminatedMessage
+	}
+	pod.Status.ContainerStatuses = []corev1.ContainerStatus{{
+		Name:  NodeDebugContainerName,
+		State: corev1.ContainerState{Terminated: terminated},
+	}}
+	pod.Status.Phase = corev1.PodSucceeded
+	return pod, nil
+}
+
+func (f *FakePodInterface) Delete(_ context.Context, _ string, _ metav1.DeleteOptions) error {
+	f.Deleted = true
+	return nil
+}
+
+func (f *FakePodInterface) GetLogs(name string, opts *corev1.PodLogOptions) *restclient.Request {
+	body := io.NopCloser(strings.NewReader(f.Logs))
+	client := &http.Client{Transport: roundTripperFunc(func(*http.Request) (*http.Response, error) {
+		return &http.Response{StatusCode: http.StatusOK, Body: body}, nil
+	})}
+	content := restclient.ClientContentConfig{
+		ContentType:  runtime.ContentTypeJSON,
+		GroupVersion: schema.GroupVersion{Version: "v1"},
+		Negotiator:   runtime.NewClientNegotiator(schemek8s.Codecs.WithoutConversion(), schema.GroupVersion{Version: "v1"}),
+	}
+	return restclient.NewRequestWithClient(&url.URL{Scheme: "https", Host: "localhost"}, "", content, client).Verb("GET")
+}
+
+type roundTripperFunc func(*http.Request) (*http.Response, error)
+
+func (f roundTripperFunc) RoundTrip(req *http.Request) (*http.Response, error) {
+	return f(req)
+}
diff --git a/pkg/toolsets/openshift/nodes.go b/pkg/toolsets/openshift/nodes.go
new file mode 100644
index 000000000..5af3f707c
--- /dev/null
+++ b/pkg/toolsets/openshift/nodes.go
@@ -0,0 +1,166 @@
+package openshift
+
+import (
+	"errors"
+	"fmt"
+	"time"
+
+	"github.com/google/jsonschema-go/jsonschema"
+	"k8s.io/utils/ptr"
+
+	"github.com/containers/kubernetes-mcp-server/pkg/api"
+	"github.com/containers/kubernetes-mcp-server/pkg/ocp"
+)
+
+// maxTimeoutSeconds is the largest whole number of seconds that fits in a
+// time.Duration; larger timeout_seconds values would overflow the conversion.
+const maxTimeoutSeconds = int64((1<<63 - 1) / time.Second)
+
+// initNodes returns the node tools of the openshift toolset: currently only
+// nodes_debug_exec, which is served by nodesDebugExec.
+func initNodes() []api.ServerTool {
+	return []api.ServerTool{
+		{
+			Tool: api.Tool{
+				Name:        "nodes_debug_exec",
+				Description: "Run commands on an OpenShift node using a privileged debug pod with comprehensive troubleshooting utilities. The debug pod uses the UBI9 toolbox image which includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap), process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), and debugging tools (gdb). The host filesystem is mounted at /host, allowing commands to chroot /host if needed to access node-level resources. Output is truncated to the most recent 100 lines, so prefer filters like grep when expecting large logs.",
+				InputSchema: &jsonschema.Schema{
+					Type: "object",
+					Properties: map[string]*jsonschema.Schema{
+						"node": {
+							Type:        "string",
+							Description: "Name of the node to debug (e.g. worker-0).",
+						},
+						"command": {
+							Type:        "array",
+							Description: "Command to execute on the node. All standard debugging utilities from the UBI9 toolbox are available. The host filesystem is mounted at /host - use 'chroot /host <command>' to access node-level resources, or run commands directly in the toolbox environment. Provide each argument as a separate array item (e.g. ['chroot', '/host', 'systemctl', 'status', 'kubelet'] or ['journalctl', '-u', 'kubelet', '--since', '1 hour ago']).",
+							Items:       &jsonschema.Schema{Type: "string"},
+						},
+						"namespace": {
+							Type:        "string",
+							Description: "Namespace to create the temporary debug pod in (optional, defaults to the current namespace or 'default').",
+						},
+						"image": {
+							Type:        "string",
+							Description: "Container image to use for the debug pod (optional). Defaults to registry.access.redhat.com/ubi9/toolbox:latest which provides comprehensive debugging and troubleshooting utilities.",
+						},
+						"timeout_seconds": {
+							Type:        "integer",
+							Description: "Maximum time to wait for the command to complete before timing out (optional, defaults to 60 seconds).",
+							Minimum:     ptr.To(float64(1)),
+						},
+					},
+					Required: []string{"node", "command"},
+				},
+				Annotations: api.ToolAnnotations{
+					Title:           "Nodes: Debug Exec",
+					ReadOnlyHint:    ptr.To(false),
+					DestructiveHint: ptr.To(true),
+					IdempotentHint:  ptr.To(false),
+					OpenWorldHint:   ptr.To(true),
+				},
+			},
+			Handler: nodesDebugExec,
+		},
+	}
+}
+
+// nodesDebugExec is the handler for the nodes_debug_exec tool. It validates the
+// tool arguments and then delegates to ocp.NodesDebugExec. Validation and
+// execution failures are reported inside the ToolCallResult; the returned error
+// is always nil.
+func nodesDebugExec(params api.ToolHandlerParams) (*api.ToolCallResult, error) {
+	args := params.GetArguments()
+
+	// A missing key, a non-string value and an empty string all count as missing.
+	nodeName, ok := args["node"].(string)
+	if !ok || nodeName == "" {
+		return api.NewToolCallResult("", errors.New("missing required argument: node")), nil
+	}
+
+	command, err := toStringSlice(args["command"])
+	if err != nil {
+		return api.NewToolCallResult("", fmt.Errorf("invalid command argument: %w", err)), nil
+	}
+
+	// Optional arguments: an absent or non-string value leaves the empty string,
+	// which is handed to ocp.NodesDebugExec unchanged.
+	namespace, _ := args["namespace"].(string)
+	image, _ := args["image"].(string)
+
+	// The zero duration is passed through when timeout_seconds is absent or null.
+	var timeout time.Duration
+	if timeoutRaw, exists := args["timeout_seconds"]; exists && timeoutRaw != nil {
+		timeout, err = parseTimeoutSeconds(timeoutRaw)
+		if err != nil {
+			return api.NewToolCallResult("", err), nil
+		}
+	}
+
+	client := ocp.NewNodeDebugClient(params.KubernetesClient)
+	output, execErr := ocp.NodesDebugExec(params.Context, client, namespace, nodeName, image, command, timeout)
+	if output == "" && execErr == nil {
+		// Return an explicit confirmation instead of an empty result.
+		output = fmt.Sprintf("Command executed successfully on node %s but produced no output.", nodeName)
+	}
+	return api.NewToolCallResult(output, execErr), nil
+}
+
+// parseTimeoutSeconds converts the decoded timeout_seconds argument into a
+// time.Duration. It accepts float64 (the type JSON numbers decode to), int and
+// int64. Fractional values are truncated to whole seconds. Values below the
+// schema minimum of 1, or too large for a time.Duration, are rejected.
+func parseTimeoutSeconds(raw any) (time.Duration, error) {
+	errRange := fmt.Errorf("timeout_seconds must be between 1 and %d", maxTimeoutSeconds)
+
+	var seconds int64
+	switch v := raw.(type) {
+	case float64:
+		// Range-check before converting: int64(v) is not well defined for NaN or
+		// out-of-range floats. The negated form makes NaN fail the check too.
+		if !(v >= 1 && v <= float64(maxTimeoutSeconds)) {
+			return 0, errRange
+		}
+		seconds = int64(v)
+	case int:
+		seconds = int64(v)
+	case int64:
+		seconds = v
+	default:
+		return 0, errors.New("timeout_seconds must be a numeric value")
+	}
+	if seconds < 1 || seconds > maxTimeoutSeconds {
+		return 0, errRange
+	}
+	return time.Duration(seconds) * time.Second, nil
+}
+
+// toStringSlice converts the decoded "command" argument into a non-empty
+// []string. It accepts []any whose items are all strings, and also []string so
+// that programmatic callers are not rejected. The result never aliases the input.
+func toStringSlice(arg any) ([]string, error) {
+	if arg == nil {
+		return nil, errors.New("command is required")
+	}
+
+	var command []string
+	switch raw := arg.(type) {
+	case []string:
+		command = append(command, raw...)
+	case []any:
+		command = make([]string, 0, len(raw))
+		for _, item := range raw {
+			str, ok := item.(string)
+			if !ok {
+				return nil, errors.New("command items must be strings")
+			}
+			command = append(command, str)
+		}
+	default:
+		return nil, errors.New("command must be an array of strings")
+	}
+	if len(command) == 0 {
+		return nil, errors.New("command array cannot be empty")
+	}
+	return command, nil
+}
diff --git a/pkg/toolsets/openshift/nodes_test.go b/pkg/toolsets/openshift/nodes_test.go
new file mode 100644
index 000000000..395c63572
--- /dev/null
+++ b/pkg/toolsets/openshift/nodes_test.go
@@ -0,0 +1,106 @@
+package openshift
+
+import (
+	"testing"
+
+	"github.com/containers/kubernetes-mcp-server/pkg/api"
+	"github.com/stretchr/testify/suite"
+)
+
+// NodesHandlerSuite covers the argument validation performed by nodesDebugExec.
+type NodesHandlerSuite struct {
+	suite.Suite
+}
+
+// staticRequest is a minimal request stub that serves a fixed argument map.
+type staticRequest struct {
+	args map[string]any
+}
+
+func (s staticRequest) GetArguments() map[string]any {
+	return s.args
+}
+
+// TestValidatesInput checks that malformed arguments are rejected by the handler.
+func (s *NodesHandlerSuite) TestValidatesInput() {
+	s.Run("missing node", func() {
+		params := api.ToolHandlerParams{
+			ToolCallRequest: staticRequest{args: map[string]any{}},
+		}
+		result, err := nodesDebugExec(params)
+		s.Require().NoError(err)
+		s.Require().NotNil(result.Error)
+		s.Equal("missing required argument: node", result.Error.Error())
+	})
+
+	s.Run("invalid command type", func() {
+		params := api.ToolHandlerParams{
+			ToolCallRequest: staticRequest{args: map[string]any{
+				"node":    "worker-0",
+				"command": "ls -la",
+			}},
+		}
+		result, err := nodesDebugExec(params)
+		s.Require().NoError(err)
+		s.Require().NotNil(result.Error)
+		s.Equal("invalid command argument: command must be an array of strings", result.Error.Error())
+	})
+
+	s.Run("missing command", func() {
+		params := api.ToolHandlerParams{
+			ToolCallRequest: staticRequest{args: map[string]any{
+				"node": "worker-0",
+			}},
+		}
+		result, err := nodesDebugExec(params)
+		s.Require().NoError(err)
+		s.Require().NotNil(result.Error)
+		s.Contains(result.Error.Error(), "command is required")
+	})
+
+	s.Run("empty command array", func() {
+		params := api.ToolHandlerParams{
+			ToolCallRequest: staticRequest{args: map[string]any{
+				"node":    "worker-0",
+				"command": []interface{}{},
+			}},
+		}
+		result, err := nodesDebugExec(params)
+		s.Require().NoError(err)
+		s.Require().NotNil(result.Error)
+		s.Contains(result.Error.Error(), "command array cannot be empty")
+	})
+
+	s.Run("non-numeric timeout", func() {
+		params := api.ToolHandlerParams{
+			ToolCallRequest: staticRequest{args: map[string]any{
+				"node":            "worker-0",
+				"command":         []any{"uptime"},
+				"timeout_seconds": "soon",
+			}},
+		}
+		result, err := nodesDebugExec(params)
+		s.Require().NoError(err)
+		s.Require().NotNil(result.Error)
+		s.Equal("timeout_seconds must be a numeric value", result.Error.Error())
+	})
+
+	s.Run("timeout below schema minimum", func() {
+		params := api.ToolHandlerParams{
+			ToolCallRequest: staticRequest{args: map[string]any{
+				"node":            "worker-0",
+				"command":         []any{"uptime"},
+				"timeout_seconds": float64(0),
+			}},
+		}
+		result, err := nodesDebugExec(params)
+		s.Require().NoError(err)
+		s.Require().NotNil(result.Error)
+		s.Contains(result.Error.Error(), "timeout_seconds must be between 1 and")
+	})
+}
+
+// TestNodesHandler runs NodesHandlerSuite under go test.
+func TestNodesHandler(t *testing.T) {
+	suite.Run(t, new(NodesHandlerSuite))
+}
diff --git a/pkg/toolsets/openshift/toolset.go b/pkg/toolsets/openshift/toolset.go
index a3eb145b5..5a5ecc874 100644
--- a/pkg/toolsets/openshift/toolset.go
+++ b/pkg/toolsets/openshift/toolset.go
@@ -21,7 +21,9 @@ func (t *Toolset) GetDescription() string {
 }
 
 func (t *Toolset) GetTools(o api.Openshift) []api.ServerTool {
-	return nil
+	return slices.Concat(
+		initNodes(),
+	)
 }
 
 func (t *Toolset) GetPrompts() []api.ServerPrompt {