From 4373790f5d2a55767c7cdb43ebd91c3ae2c028a1 Mon Sep 17 00:00:00 2001 From: DanNiESh Date: Tue, 26 May 2026 11:00:32 -0400 Subject: [PATCH] Add automatic reconnect on Ironic authentication failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When gophercloud's built-in token refresh (ReauthFunc) fails — e.g. due to a network interruption or Keystone being temporarily unavailable — the Ironic client was left permanently stuck with a stale token until the pod restarted. This adds a fallback that detects 401/reauth errors, recreates the service client from scratch, and retries the operation once. Co-Authored-By: Claude Opus 4.6 --- cmd/main.go | 6 +- internal/ironic/client.go | 81 ++++++++++- internal/ironic/client_internal_test.go | 182 ++++++++++++++++++++++++ internal/ironic/client_test.go | 9 +- 4 files changed, 267 insertions(+), 11 deletions(-) create mode 100644 internal/ironic/client_internal_test.go diff --git a/cmd/main.go b/cmd/main.go index ffd54ac..179dda9 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -17,12 +17,14 @@ limitations under the License. package main import ( + "context" "crypto/tls" "flag" "fmt" "os" "path/filepath" "strconv" + "time" // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) // to ensure that exec-entrypoint and run can make use of them. @@ -233,7 +235,9 @@ func main() { // Ironic client for bare metal management var ironicClient *ironic.Client - if ironicClient, err = ironic.NewClient(); err != nil { + ironicCtx, ironicCancel := context.WithTimeout(context.Background(), 30*time.Second) + defer ironicCancel() + if ironicClient, err = ironic.NewClient(ironicCtx); err != nil { setupLog.Error(err, "failed to create Ironic client") os.Exit(1) } diff --git a/internal/ironic/client.go b/internal/ironic/client.go index dca64c4..d41af3a 100644 --- a/internal/ironic/client.go +++ b/internal/ironic/client.go @@ -19,11 +19,14 @@ package ironic import ( "context" + "errors" "fmt" + "net/http" "github.com/gophercloud/gophercloud/v2" "github.com/gophercloud/gophercloud/v2/openstack/baremetal/v1/nodes" "github.com/gophercloud/utils/v2/openstack/clientconfig" + ctrllog "sigs.k8s.io/controller-runtime/pkg/log" ) // NodeClient is the interface for interacting with Ironic nodes. @@ -36,7 +39,8 @@ type NodeClient interface { // Client talks to Ironic over REST via gophercloud. type ( Client struct { - serviceClient *gophercloud.ServiceClient + serviceClient *gophercloud.ServiceClient + newServiceClient func(ctx context.Context) (*gophercloud.ServiceClient, error) } TargetPowerState struct { @@ -56,24 +60,77 @@ var ( SoftRebooting = TargetPowerState{nodes.SoftRebooting} ) -// NewClient creates an Ironic client with no-auth (typical for in-cluster Ironic with no Keystone). -func NewClient() (*Client, error) { - client, err := clientconfig.NewServiceClient(context.TODO(), "baremetal", nil) +// NewClient creates an Ironic client configured via clouds.yaml or OS_* environment variables. +func NewClient(ctx context.Context) (*Client, error) { + factory := func(ctx context.Context) (*gophercloud.ServiceClient, error) { + return clientconfig.NewServiceClient(ctx, "baremetal", nil) + } + sc, err := factory(ctx) if err != nil { return nil, fmt.Errorf("failed to create baremetal client: %w", err) } - if client.Endpoint == "" { + if sc == nil { + return nil, fmt.Errorf("failed to create baremetal client: nil service client") + } + if sc.Endpoint == "" { return nil, fmt.Errorf("baremetal client has no endpoint configured") } return &Client{ - serviceClient: client, + serviceClient: sc, + newServiceClient: factory, }, nil } +func isAuthError(err error) bool { + if err == nil { + return false + } + if gophercloud.ResponseCodeIs(err, http.StatusUnauthorized) { + return true + } + var errReauth *gophercloud.ErrUnableToReauthenticate + if errors.As(err, &errReauth) { + return true + } + var errAfterReauth *gophercloud.ErrErrorAfterReauthentication + return errors.As(err, &errAfterReauth) +} + +func (c *Client) reconnect(ctx context.Context) error { + log := ctrllog.FromContext(ctx) + log.Info("recreating ironic service client after authentication failure") + sc, err := c.newServiceClient(ctx) + if err != nil { + log.Error(err, "failed to recreate ironic service client") + return fmt.Errorf("failed to recreate baremetal client: %w", err) + } + if sc == nil { + return fmt.Errorf("failed to recreate baremetal client: nil service client") + } + if sc.Endpoint == "" { + return fmt.Errorf("recreated baremetal client has no endpoint configured") + } + c.serviceClient = sc + log.Info("ironic service client reconnected successfully", "endpoint", sc.Endpoint) + return nil +} + // GetNode fetches a node by UUID or name from Ironic. func (c *Client) GetNode(ctx context.Context, nodeID string) (*nodes.Node, error) { + log := ctrllog.FromContext(ctx) node, err := nodes.Get(ctx, c.serviceClient, nodeID).Extract() if err != nil { + if isAuthError(err) { + log.Info("auth error on GetNode, attempting reconnect", "nodeID", nodeID, "error", err) + if reconnErr := c.reconnect(ctx); reconnErr != nil { + return nil, fmt.Errorf("get node %s: reconnect failed: %w", nodeID, reconnErr) + } + node, err = nodes.Get(ctx, c.serviceClient, nodeID).Extract() + if err != nil { + return nil, fmt.Errorf("get node %s after reconnect: %w", nodeID, err) + } + return node, nil + } return nil, fmt.Errorf("get node %s: %w", nodeID, err) } return node, nil @@ -91,8 +148,20 @@ func (c *Client) IsNodePowerTransitioning(node *nodes.Node) bool { // SetPowerState requests power on or off for the node via Ironic. func (c *Client) SetPowerState(ctx context.Context, nodeID string, target TargetPowerState) error { + log := ctrllog.FromContext(ctx) res := nodes.ChangePowerState(ctx, c.serviceClient, nodeID, nodes.PowerStateOpts{Target: target.powerstate}) if err := res.ExtractErr(); err != nil { + if isAuthError(err) { + log.Info("auth error on SetPowerState, attempting reconnect", "nodeID", nodeID, "target", target.String(), "error", err) + if reconnErr := c.reconnect(ctx); reconnErr != nil { + return fmt.Errorf("failed to set power state on node %s: reconnect failed: %w", nodeID, reconnErr) + } + res = nodes.ChangePowerState(ctx, c.serviceClient, nodeID, nodes.PowerStateOpts{Target: target.powerstate}) + if err := res.ExtractErr(); err != nil { + return fmt.Errorf("failed to set power state on node %s after reconnect: %w", nodeID, err) + } + return nil + } return fmt.Errorf("failed to set power state on node %s: %w", nodeID, err) } return nil diff --git a/internal/ironic/client_internal_test.go b/internal/ironic/client_internal_test.go new file mode 100644 index 0000000..cd2b024 --- /dev/null +++ b/internal/ironic/client_internal_test.go @@ -0,0 +1,182 @@ +/* +Copyright 2026. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package ironic + +import ( + "context" + "fmt" + "net/http" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/gophercloud/gophercloud/v2" +) + +const ( + oldEndpoint = "http://old:6385/v1/" + newEndpoint = "http://new:6385/v1/" +) + +var _ = Describe("isAuthError", func() { + It("should return false for nil error", func() { + Expect(isAuthError(nil)).To(BeFalse()) + }) + + It("should return false for generic errors", func() { + Expect(isAuthError(fmt.Errorf("some random error"))).To(BeFalse()) + }) + + It("should return true for 401 ErrUnexpectedResponseCode", func() { + err := gophercloud.ErrUnexpectedResponseCode{ + Actual: http.StatusUnauthorized, + Expected: []int{http.StatusOK}, + } + Expect(isAuthError(err)).To(BeTrue()) + }) + + It("should return false for 404 ErrUnexpectedResponseCode", func() { + err := gophercloud.ErrUnexpectedResponseCode{ + Actual: http.StatusNotFound, + Expected: []int{http.StatusOK}, + } + Expect(isAuthError(err)).To(BeFalse()) + }) + + It("should return false for 500 ErrUnexpectedResponseCode", func() { + err := gophercloud.ErrUnexpectedResponseCode{ + Actual: http.StatusInternalServerError, + Expected: []int{http.StatusOK}, + } + Expect(isAuthError(err)).To(BeFalse()) + }) + + It("should return true for *ErrUnableToReauthenticate (pointer, as gophercloud returns)", func() { + err := &gophercloud.ErrUnableToReauthenticate{ + ErrOriginal: fmt.Errorf("original"), + ErrReauth: fmt.Errorf("reauth failed"), + } + Expect(isAuthError(err)).To(BeTrue()) + }) + + It("should return true for *ErrErrorAfterReauthentication (pointer, as gophercloud returns)", func() { + err := &gophercloud.ErrErrorAfterReauthentication{ + ErrOriginal: fmt.Errorf("still failing"), + } + Expect(isAuthError(err)).To(BeTrue()) + }) + + It("should return true for wrapped *ErrUnableToReauthenticate", func() { + inner := &gophercloud.ErrUnableToReauthenticate{ + ErrOriginal: fmt.Errorf("original"), + ErrReauth: fmt.Errorf("reauth failed"), + } + wrapped := fmt.Errorf("operation failed: %w", inner) + Expect(isAuthError(wrapped)).To(BeTrue()) + }) + + It("should return true for wrapped *ErrErrorAfterReauthentication", func() { + inner := &gophercloud.ErrErrorAfterReauthentication{ + ErrOriginal: fmt.Errorf("still failing"), + } + wrapped := fmt.Errorf("operation failed: %w", inner) + Expect(isAuthError(wrapped)).To(BeTrue()) + }) + + It("should return true for wrapped 401 error", func() { + inner := gophercloud.ErrUnexpectedResponseCode{ + Actual: http.StatusUnauthorized, + Expected: []int{http.StatusOK}, + } + wrapped := fmt.Errorf("operation failed: %w", inner) + Expect(isAuthError(wrapped)).To(BeTrue()) + }) + + It("should return false for wrapped non-auth error", func() { + inner := gophercloud.ErrUnexpectedResponseCode{ + Actual: http.StatusNotFound, + Expected: []int{http.StatusOK}, + } + wrapped := fmt.Errorf("operation failed: %w", inner) + Expect(isAuthError(wrapped)).To(BeFalse()) + }) +}) + +var _ = Describe("reconnect", func() { + It("should swap the service client on success", func() { + oldSC := &gophercloud.ServiceClient{Endpoint: oldEndpoint} + newSC := &gophercloud.ServiceClient{Endpoint: newEndpoint} + + c := &Client{ + serviceClient: oldSC, + newServiceClient: func(context.Context) (*gophercloud.ServiceClient, error) { + return newSC, nil + }, + } + + Expect(c.reconnect(context.Background())).To(Succeed()) + Expect(c.serviceClient).To(Equal(newSC)) + Expect(c.GetEndpoint()).To(Equal(newEndpoint)) + }) + + It("should return error when factory fails", func() { + oldSC := &gophercloud.ServiceClient{Endpoint: oldEndpoint} + + c := &Client{ + serviceClient: oldSC, + newServiceClient: func(context.Context) (*gophercloud.ServiceClient, error) { + return nil, fmt.Errorf("keystone is down") + }, + } + + err := c.reconnect(context.Background()) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("keystone is down")) + Expect(c.serviceClient).To(Equal(oldSC), "should keep old client on failure") + }) + + It("should return error when factory returns nil client without error", func() { + oldSC := &gophercloud.ServiceClient{Endpoint: oldEndpoint} + + c := &Client{ + serviceClient: oldSC, + newServiceClient: func(context.Context) (*gophercloud.ServiceClient, error) { + return nil, nil + }, + } + + err := c.reconnect(context.Background()) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("nil service client")) + Expect(c.serviceClient).To(Equal(oldSC), "should keep old client on nil return") + }) + + It("should return error when new client has empty endpoint", func() { + oldSC := &gophercloud.ServiceClient{Endpoint: oldEndpoint} + + c := &Client{ + serviceClient: oldSC, + newServiceClient: func(context.Context) (*gophercloud.ServiceClient, error) { + return &gophercloud.ServiceClient{Endpoint: ""}, nil + }, + } + + err := c.reconnect(context.Background()) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("no endpoint configured")) + }) +}) diff --git a/internal/ironic/client_test.go b/internal/ironic/client_test.go index 2d1125a..3a63e0b 100644 --- a/internal/ironic/client_test.go +++ b/internal/ironic/client_test.go @@ -17,6 +17,7 @@ limitations under the License. package ironic_test import ( + "context" "os" "testing" @@ -106,7 +107,7 @@ var _ = Describe("Client", func() { }) It("should return an error when no credentials are available", func() { - client, err := ironic.NewClient() + client, err := ironic.NewClient(context.Background()) Expect(err).To(HaveOccurred()) Expect(err.Error()).To(ContainSubstring("failed to create baremetal client")) Expect(client).To(BeNil()) @@ -121,14 +122,14 @@ var _ = Describe("Client", func() { }) It("should create a client successfully", func() { - client, err := ironic.NewClient() + client, err := ironic.NewClient(context.Background()) Expect(err).NotTo(HaveOccurred()) Expect(client).NotTo(BeNil()) Expect(client.GetEndpoint()).NotTo(BeEmpty()) }) It("should have a non-empty endpoint", func() { - client, err := ironic.NewClient() + client, err := ironic.NewClient(context.Background()) Expect(err).NotTo(HaveOccurred()) endpoint := client.GetEndpoint() @@ -145,7 +146,7 @@ var _ = Describe("Client", func() { }) It("should return a valid HTTP(S) endpoint", func() { - client, err := ironic.NewClient() + client, err := ironic.NewClient(context.Background()) Expect(err).NotTo(HaveOccurred()) endpoint := client.GetEndpoint()