diff --git a/content/openapi/oneks_v1.json b/content/openapi/oneks_v1.json index f8e59b40..a98ebf13 100644 --- a/content/openapi/oneks_v1.json +++ b/content/openapi/oneks_v1.json @@ -268,6 +268,109 @@ } } }, + "/clusters/check": { + "get": { + "tags": [ + "clusters" + ], + "summary": "Get readiness check service status", + "description": "Returns whether the OneKS readiness check service is enabled in the OneKS server configuration.", + "responses": { + "200": { + "description": "Readiness check service status", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean", + "description": "Whether the OneKS readiness check service is enabled." + } + } + }, + "example": { + "enabled": true + } + } + } + }, + "401": { + "$ref": "#/components/responses/401" + }, + "403": { + "$ref": "#/components/responses/403" + }, + "500": { + "$ref": "#/components/responses/500" + } + } + }, + "post": { + "tags": [ + "clusters" + ], + "summary": "Run a readiness check", + "description": "Runs the OneKS readiness check service against a public and private Virtual Network pair before provisioning a Kubernetes cluster. The response is a Server-Sent Events stream. Each progress event uses the `check_cluster` event name and sends a JSON object inline in the event `data` field with the following structure: `name` is the check step, `state` is one of `started`, `success`, or `failure`, and `context` is an optional error or detail message.", + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "required": [ + "public_network", + "private_network" + ], + "properties": { + "public_network": { + "type": "integer", + "description": "Public Virtual Network ID used by the readiness probe VM." + }, + "private_network": { + "type": "integer", + "description": "Private Virtual Network ID validated by the readiness probe VM." 
+ } + } + }, + "example": { + "public_network": 105, + "private_network": 106 + } + } + } + }, + "responses": { + "200": { + "description": "Readiness check progress stream", + "content": { + "text/event-stream": { + "schema": { + "type": "string", + "description": "Server-Sent Events stream. Each event is named `check_cluster` and contains JSON data with `name`, `state`, and optional `context` fields." + }, + "example": "event: check_cluster\ndata: {\"name\":\"Starting OneKS readiness checks\",\"state\":\"success\"}\n\nevent: check_cluster\ndata: {\"name\":\"Creating probe VM\",\"state\":\"started\"}\n\nevent: check_cluster\ndata: {\"name\":\"Creating probe VM\",\"state\":\"success\"}\n\n" + } + } + }, + "400": { + "$ref": "#/components/responses/400" + }, + "401": { + "$ref": "#/components/responses/401" + }, + "403": { + "$ref": "#/components/responses/403" + }, + "405": { + "$ref": "#/components/responses/405" + }, + "500": { + "$ref": "#/components/responses/500" + } + } + } + }, "/clusters/{id}": { "get": { "tags": [ diff --git a/content/platform_services/oneks/management/configuration.md b/content/platform_services/oneks/management/configuration.md index f844673a..564d5fa7 100644 --- a/content/platform_services/oneks/management/configuration.md +++ b/content/platform_services/oneks/management/configuration.md @@ -59,6 +59,22 @@ These options are used when OneKS runs Kubernetes commands from the Front-end Ho | `:kubeconfig_path` | `/etc/rancher/rke2/rke2.yaml` | Kubeconfig file used by `kubectl` operations executed by the service. The file must be readable by the service user. | | `:k8s_timeout` | `15` | Timeout, in seconds, while waiting for Kubernetes command execution results. Increase it for slow API servers or busy management clusters. | +## Readiness Check Configuration + +The optional `:readiness` section enables the OneKS readiness check service. 
When it is enabled, users can validate the public and private Virtual Networks that will be used by a K8s Cluster before starting a deployment. + +On startup, OneKS verifies that the readiness appliance is available. If the appliance is not already imported, OneKS imports the configured marketplace appliance into the configured datastore, creating the OneKS readiness service VM template and its backing image as part of the import. + +Comment out the full `:readiness` section to disable the readiness check service. + +| Attribute | Default | Description | +|-----------------------------------|-----------------------------------------|-------------| +| `:readiness` / `:appliance_name` | `OneKS Readiness Service` | Marketplace appliance name used when importing the readiness probe template if it is missing. | +| `:readiness` / `:appliance_id` | `97383e01-6150-4a1f-8830-fc5d745056e0` | Marketplace appliance UUID used to find or import the readiness probe VM template. By default, this value points to an Alpine 3.20 appliance from the OpenNebula Marketplace. | +| `:readiness` / `:appliance_ds` | `1` | Datastore ID where the readiness appliance image is imported. | +| `:readiness` / `:external_url` | `https://get.rke2.io` | Public URL used by the probe VM to validate DNS resolution and outbound internet access. | +| `:readiness` / `:timeout` | `60` | Maximum time, in seconds, to wait for the probe VM and each readiness check step. | + ## Operational Defaults These values control retry behavior, concurrency, cooldowns, and generated resource names. 
@@ -182,6 +198,17 @@ OneKS relies on several OpenNebula services and network endpoints: :kubeconfig_path: '/etc/rancher/rke2/rke2.yaml' :k8s_timeout: 15 +################################################################################ +# Cluster Readiness Check +################################################################################ + +:readiness: + :appliance_name: 'OneKS Readiness Service' + :appliance_id: 97383e01-6150-4a1f-8830-fc5d745056e0 + :appliance_ds: 1 + :external_url: 'https://get.rke2.io' + :timeout: 60 + ################################################################################ # Defaults ################################################################################ diff --git a/content/platform_services/oneks/management/monitoring_and_troubleshooting.md b/content/platform_services/oneks/management/monitoring_and_troubleshooting.md index 58cc9871..5c330fae 100644 --- a/content/platform_services/oneks/management/monitoring_and_troubleshooting.md +++ b/content/platform_services/oneks/management/monitoring_and_troubleshooting.md @@ -148,6 +148,51 @@ curl -u "$(cat /var/lib/one/.one/one_auth)" http://:10780/api/v1/c K8s Cluster provisioning can fail for different infrastructure or network-related reasons. In most cases, these failures surface as a timeout during provisioning and the K8s Cluster eventually moves to `PROVISIONING_FAILURE`. The following checks can help identify the most common causes. +### OneKS Readiness Check + +If the OneKS readiness check service is enabled, use it to validate the public and private Virtual Networks used by a K8s Cluster. The check deploys a temporary lightweight probe VM and verifies the main requirements needed by OneKS provisioning, including OneGate access, internet connectivity, and private network paths. For configuration details, see the [Readiness Check Configuration]({{% relref "platform_services/oneks/management/configuration#readiness-check-configuration" %}}). 
For command syntax, see the [`oneks check` CLI reference]({{% relref "platform_services/oneks/references/oneks_cli#readiness-checks" %}}). + +When a deployment fails, run the readiness check against the failed K8s Cluster. OneKS resolves the networks from the Cluster document: + +```shell +oneks check cluster <cluster_id> +``` + +When all checks pass, the command reports each validated step: + +```default +[OK] Starting OneKS readiness checks +[OK] Creating probe VM +[OK] Waiting for probe VM RUNNING state +[OK] Waiting for probe VM context +[OK] Checking OneGate access +[OK] Checking Internet connectivity +[OK] Checking private network paths +[OK] Cleanup probe VM +[OK] All OneKS readiness checks passed +``` + +You can also run the check before creating a K8s Cluster by passing the networks explicitly: + +```shell +oneks check --public-network <public_network_id> --private-network <private_network_id> +``` + +If the readiness check fails, the tool reports the failing step and the environment problem that must be fixed. For example, if the probe VM cannot access OneGate: + +```default +[OK] Starting OneKS readiness checks +[OK] Creating probe VM +[OK] Waiting for probe VM RUNNING state +[OK] Waiting for probe VM context +[FAIL] Checking OneGate access + OneKS cluster cannot reach OneGate endpoint: ONEGATE_ENDPOINT not found in the cluster context +[OK] Cleanup probe VM +[FAIL] One or more OneKS readiness checks failed +``` + +Before recovering or recreating the K8s Cluster, review and resolve any reported configuration issues to ensure the environment is properly prepared for deployment. The following subsections describe the recommended resolution steps for each reported error. + ### OneGate is Not Properly Configured OneKS relies on the seed VM to report progress and update OpenNebula resources during provisioning. If OneGate is not properly configured or the seed VM cannot reach the OneGate service, the seed VM cannot publish the expected updates back to OpenNebula. 
As a result, OneKS waits until the provisioning timeout is reached and the K8s Cluster enters `PROVISIONING_FAILURE`. @@ -172,7 +217,7 @@ For more information, refer to the [OpenNebula OneGate Documentation]({{% relref ### VMs Cannot Access the Internet -During provisioning, the seed VM needs Internet access to download the required artifacts and images used to bootstrap and connect the Kubernetes nodes. If the seed VM or the target nodes cannot reach the Internet, provisioning may stall until the timeout is reached and the K8s Cluster moves to `PROVISIONING_FAILURE`. +During provisioning, the seed VM needs internet access to download the required artifacts and images used to bootstrap and connect the Kubernetes nodes. If the seed VM or the target nodes cannot reach the internet, provisioning may stall until the timeout is reached and the K8s Cluster moves to `PROVISIONING_FAILURE`. From the affected VM, check basic network connectivity: diff --git a/content/platform_services/oneks/references/oneks_cli.md b/content/platform_services/oneks/references/oneks_cli.md index 16131e8e..e3acb87a 100644 --- a/content/platform_services/oneks/references/oneks_cli.md +++ b/content/platform_services/oneks/references/oneks_cli.md @@ -43,6 +43,7 @@ Some builds may expose node groups through the lower-level `group` resource in C * `oneks create nodegroup --cluster-id <cluster_id>`: Create a node group. * `oneks recover cluster <cluster_id>`: Recover a K8s Cluster from selected failure states. * `oneks recover nodegroup <nodegroup_id>`: Recover a node group from selected failure states. +* `oneks check cluster <cluster_id>`: Run the OneKS readiness check using the networks from an existing K8s Cluster. * `oneks delete cluster <cluster_id>`: Delete a K8s Cluster. * `oneks delete nodegroup <nodegroup_id>`: Delete a node group. * `oneks logs cluster <cluster_id>`: Show K8s Cluster logs. 
@@ -93,6 +94,13 @@ oneks recover cluster 42 oneks recover nodegroup 7 ``` +Run OneKS readiness checks: + +```shell +oneks check cluster 42 +oneks check --public-network 105 --private-network 106 +``` + Inspect logs: ```shell