From 19e98b0a1d534691905ed96dfa0542bd592edf48 Mon Sep 17 00:00:00 2001 From: DanNiESh Date: Fri, 5 Jun 2026 10:20:01 -0400 Subject: [PATCH] Add automatic reconnect on Ironic authentication failure Port the reconnect logic from the old internal/ironic package (PR #7) into the new internal/management layer. When gophercloud's built-in token refresh fails, the client detects auth errors (401, reauth failures), recreates the service client from scratch, and retries the operation once. Co-Authored-By: Claude Opus 4.6 --- go.mod | 2 +- go.sum | 2 + internal/management/openstack.go | 113 ++++++++++++-- .../management/openstack_internal_test.go | 142 ++++++++++++++++++ 4 files changed, 242 insertions(+), 17 deletions(-) create mode 100644 internal/management/openstack_internal_test.go diff --git a/go.mod b/go.mod index 840d3b3..d0669d7 100644 --- a/go.mod +++ b/go.mod @@ -58,7 +58,7 @@ require ( github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect - github.com/osac-project/bare-metal-fulfillment-operator v0.0.0-20260518182443-35f2a9590dcc + github.com/osac-project/bare-metal-fulfillment-operator v0.0.0-20260604222633-b2bc28802356 github.com/prometheus/client_golang v1.23.2 // indirect github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.66.1 // indirect diff --git a/go.sum b/go.sum index c162036..64e095f 100644 --- a/go.sum +++ b/go.sum @@ -113,6 +113,8 @@ github.com/onsi/gomega v1.39.1 h1:1IJLAad4zjPn2PsnhH70V4DKRFlrCzGBNrNaru+Vf28= github.com/onsi/gomega v1.39.1/go.mod h1:hL6yVALoTOxeWudERyfppUcZXjMwIMLnuSfruD2lcfg= github.com/osac-project/bare-metal-fulfillment-operator v0.0.0-20260518182443-35f2a9590dcc h1:Ayw/y3WiCxQh8I8nKuyacNxrBeq1GCw60bNLfFD16rQ= github.com/osac-project/bare-metal-fulfillment-operator v0.0.0-20260518182443-35f2a9590dcc/go.mod h1:N+t/4UPZajrHeDSjKAy+PwyydVaiONx4Uv0DRhk6o0I= +github.com/osac-project/bare-metal-fulfillment-operator v0.0.0-20260604222633-b2bc28802356 h1:Dpe/UiWv24zzUjYiNaUyk5pROxSj9m71B4RL+TzAvqs= +github.com/osac-project/bare-metal-fulfillment-operator v0.0.0-20260604222633-b2bc28802356/go.mod h1:N+t/4UPZajrHeDSjKAy+PwyydVaiONx4Uv0DRhk6o0I= github.com/osac-project/osac-operator v0.1.1-0.20260511193951-8bf9632098a0 h1:w2POmUbIU+ecHGUky0SifYFWV5W05VijjixjnpWN2xw= github.com/osac-project/osac-operator v0.1.1-0.20260511193951-8bf9632098a0/go.mod h1:c207O7XvQtIY6hFIqUAv3yB2JQt1ZRrsWA+wEla87aQ= github.com/osac-project/osac-operator/api v0.0.2-0.20260511193951-8bf9632098a0 h1:ddxVNqBMGOk/xx19OgAuLZpKpId7A14pElhQ6eslAl8= diff --git a/internal/management/openstack.go b/internal/management/openstack.go index 7c27e9f..7602962 100644 --- a/internal/management/openstack.go +++ b/internal/management/openstack.go @@ -3,6 +3,7 @@ package management import ( "context" "encoding/json" + "errors" "fmt" "net/http" @@ -10,6 +11,7 @@ import ( "github.com/gophercloud/gophercloud/v2/openstack" "github.com/gophercloud/gophercloud/v2/openstack/baremetal/v1/nodes" "github.com/gophercloud/utils/v2/openstack/clientconfig" + ctrllog "sigs.k8s.io/controller-runtime/pkg/log" ) var ( @@ -22,7 +24,8 @@ func init() { } type OpenStackClient struct { - client *gophercloud.ServiceClient + client *gophercloud.ServiceClient + newServiceClient func(ctx context.Context) (*gophercloud.ServiceClient, error) } func NewOpenStackClient(ctx context.Context, cfg *Config) (Client, error) { @@ -39,36 +42,97 @@ func NewOpenStackClient(ctx context.Context, cfg *Config) (Client, error) { } } - clientOpts := clientconfig.ClientOpts{ - Cloud: cloud.Cloud, - AuthType: cloud.AuthType, - AuthInfo: cloud.AuthInfo, - RegionName: cloud.RegionName, - EndpointType: cloud.EndpointType, + factory := func(ctx context.Context) (*gophercloud.ServiceClient, error) { + clientOpts := clientconfig.ClientOpts{ + Cloud: cloud.Cloud, + AuthType: cloud.AuthType, + AuthInfo: cloud.AuthInfo, + RegionName: cloud.RegionName, + EndpointType: cloud.EndpointType, + } + + providerClient, err := clientconfig.AuthenticatedClient(ctx, &clientOpts) + if err != nil { + return nil, fmt.Errorf("failed to create authenticated client (check cloud credentials and endpoint configuration)") + } + + ironicClient, err := openstack.NewBareMetalV1(providerClient, gophercloud.EndpointOpts{ + Region: cloud.RegionName, + Availability: gophercloud.Availability(cloud.EndpointType), + }) + if err != nil { + return nil, fmt.Errorf("failed to create baremetal client (check endpoint configuration and region)") + } + + return ironicClient, nil } - providerClient, err := clientconfig.AuthenticatedClient(ctx, &clientOpts) + sc, err := factory(ctx) if err != nil { - return nil, fmt.Errorf("failed to create authenticated client (check cloud credentials and endpoint configuration)") + return nil, err } - ironicClient, err := openstack.NewBareMetalV1(providerClient, gophercloud.EndpointOpts{ - Region: cloud.RegionName, - Availability: gophercloud.Availability(cloud.EndpointType), - }) - if err != nil { - return nil, fmt.Errorf("failed to create baremetal client (check endpoint configuration and region)") + return &OpenStackClient{ + client: sc, + newServiceClient: factory, + }, nil +} + +func isAuthError(err error) bool { + if err == nil { + return false } + if gophercloud.ResponseCodeIs(err, http.StatusUnauthorized) { + return true + } + var errReauth *gophercloud.ErrUnableToReauthenticate + if errors.As(err, &errReauth) { + return true + } + var errAfterReauth *gophercloud.ErrErrorAfterReauthentication + return errors.As(err, &errAfterReauth) +} - return &OpenStackClient{client: ironicClient}, nil +func (c *OpenStackClient) reconnect(ctx context.Context) error { + log := ctrllog.FromContext(ctx) + log.Info("recreating ironic service client after authentication failure") + sc, err := c.newServiceClient(ctx) + if err != nil { + return fmt.Errorf("failed to recreate baremetal client: %w", err) + } + if sc == nil { + return fmt.Errorf("failed to recreate baremetal client: nil service client") + } + if sc.Endpoint == "" { + return fmt.Errorf("recreated baremetal client has no endpoint configured") + } + c.client = sc + log.Info("ironic service client reconnected successfully") + return nil } func (c *OpenStackClient) GetPowerState(ctx context.Context, hostID string) (*PowerStatus, error) { + log := ctrllog.FromContext(ctx) node, err := nodes.Get(ctx, c.client, hostID).Extract() if err != nil { + if isAuthError(err) { + log.Info("auth error on GetPowerState, attempting reconnect", "nodeID", hostID) + if reconnErr := c.reconnect(ctx); reconnErr != nil { + return nil, fmt.Errorf("get node %s: reconnect failed: %w", hostID, reconnErr) + } + node, err = nodes.Get(ctx, c.client, hostID).Extract() + if err != nil { + return nil, fmt.Errorf("get node %s after reconnect: %w", hostID, err) + } + return nodePowerStatus(node, hostID) + } return nil, fmt.Errorf("get node %s: %w", hostID, err) } + return nodePowerStatus(node, hostID) +} + +func nodePowerStatus(node *nodes.Node, hostID string) (*PowerStatus, error) { state := PowerState(node.PowerState) switch state { case PowerOn, PowerOff: @@ -83,6 +147,7 @@ func (c *OpenStackClient) GetPowerState(ctx context.Context, hostID string) (*Po } func (c *OpenStackClient) SetPowerState(ctx context.Context, hostID string, target PowerState) error { + log := ctrllog.FromContext(ctx) switch target { case PowerOn, PowerOff: default: @@ -93,6 +158,22 @@ func (c *OpenStackClient) SetPowerState(ctx context.Context, hostID string, targ Target: nodes.TargetPowerState(target), }) if err := res.ExtractErr(); err != nil { + if isAuthError(err) { + log.Info("auth error on SetPowerState, attempting reconnect", "nodeID", hostID, "target", target) + if reconnErr := c.reconnect(ctx); reconnErr != nil { + return fmt.Errorf("failed to set power state on node %s: reconnect failed: %w", hostID, reconnErr) + } + res = nodes.ChangePowerState(ctx, c.client, hostID, nodes.PowerStateOpts{ + Target: nodes.TargetPowerState(target), + }) + if err := res.ExtractErr(); err != nil { + if gophercloud.ResponseCodeIs(err, http.StatusConflict) { + return fmt.Errorf("node %s: %w", hostID, ErrTransitioning) + } + return fmt.Errorf("failed to set power state on node %s after reconnect: %w", hostID, err) + } + return nil + } if gophercloud.ResponseCodeIs(err, http.StatusConflict) { return fmt.Errorf("node %s: %w", hostID, ErrTransitioning) } diff --git a/internal/management/openstack_internal_test.go b/internal/management/openstack_internal_test.go new file mode 100644 index 0000000..369da3d --- /dev/null +++ b/internal/management/openstack_internal_test.go @@ -0,0 +1,142 @@ +package management + +import ( + "context" + "fmt" + "net/http" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/gophercloud/gophercloud/v2" +) + +const ( + oldEndpoint = "http://old:6385/v1/" + newEndpoint = "http://new:6385/v1/" +) + +var _ = Describe("isAuthError", func() { + It("should return false for nil error", func() { + Expect(isAuthError(nil)).To(BeFalse()) + }) + + It("should return false for generic errors", func() { + Expect(isAuthError(fmt.Errorf("some random error"))).To(BeFalse()) + }) + + It("should return true for 401 ErrUnexpectedResponseCode", func() { + err := gophercloud.ErrUnexpectedResponseCode{ + Actual: http.StatusUnauthorized, + Expected: []int{http.StatusOK}, + } + Expect(isAuthError(err)).To(BeTrue()) + }) + + It("should return false for 404 ErrUnexpectedResponseCode", func() { + err := gophercloud.ErrUnexpectedResponseCode{ + Actual: http.StatusNotFound, + Expected: []int{http.StatusOK}, + } + Expect(isAuthError(err)).To(BeFalse()) + }) + + It("should return true for *ErrUnableToReauthenticate", func() { + err := &gophercloud.ErrUnableToReauthenticate{ + ErrOriginal: fmt.Errorf("original"), + ErrReauth: fmt.Errorf("reauth failed"), + } + Expect(isAuthError(err)).To(BeTrue()) + }) + + It("should return true for *ErrErrorAfterReauthentication", func() { + err := &gophercloud.ErrErrorAfterReauthentication{ + ErrOriginal: fmt.Errorf("still failing"), + } + Expect(isAuthError(err)).To(BeTrue()) + }) + + It("should return true for wrapped 401 error", func() { + inner := gophercloud.ErrUnexpectedResponseCode{ + Actual: http.StatusUnauthorized, + Expected: []int{http.StatusOK}, + } + wrapped := fmt.Errorf("operation failed: %w", inner) + Expect(isAuthError(wrapped)).To(BeTrue()) + }) + + It("should return false for wrapped non-auth error", func() { + inner := gophercloud.ErrUnexpectedResponseCode{ + Actual: http.StatusNotFound, + Expected: []int{http.StatusOK}, + } + wrapped := fmt.Errorf("operation failed: %w", inner) + Expect(isAuthError(wrapped)).To(BeFalse()) + }) +}) + +var _ = Describe("reconnect", func() { + It("should swap the service client on success", func() { + oldSC := &gophercloud.ServiceClient{Endpoint: oldEndpoint} + newSC := &gophercloud.ServiceClient{Endpoint: newEndpoint} + + c := &OpenStackClient{ + client: oldSC, + newServiceClient: func(context.Context) (*gophercloud.ServiceClient, error) { + return newSC, nil + }, + } + + Expect(c.reconnect(context.Background())).To(Succeed()) + Expect(c.client).To(Equal(newSC)) + Expect(c.client.Endpoint).To(Equal(newEndpoint)) + }) + + It("should return error when factory fails", func() { + oldSC := &gophercloud.ServiceClient{Endpoint: oldEndpoint} + + c := &OpenStackClient{ + client: oldSC, + newServiceClient: func(context.Context) (*gophercloud.ServiceClient, error) { + return nil, fmt.Errorf("keystone is down") + }, + } + + err := c.reconnect(context.Background()) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("keystone is down")) + Expect(c.client).To(Equal(oldSC), "should keep old client on failure") + }) + + It("should return error when factory returns nil client without error", func() { + oldSC := &gophercloud.ServiceClient{Endpoint: oldEndpoint} + + c := &OpenStackClient{ + client: oldSC, + newServiceClient: func(context.Context) (*gophercloud.ServiceClient, error) { + return nil, nil + }, + } + + err := c.reconnect(context.Background()) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("nil service client")) + Expect(c.client).To(Equal(oldSC), "should keep old client on nil return") + }) + + It("should return error when new client has empty endpoint", func() { + oldSC := &gophercloud.ServiceClient{Endpoint: oldEndpoint} + + c := &OpenStackClient{ + client: oldSC, + newServiceClient: func(context.Context) (*gophercloud.ServiceClient, error) { + return &gophercloud.ServiceClient{Endpoint: ""}, nil + }, + } + + err := c.reconnect(context.Background()) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("no endpoint configured")) + Expect(c.client).To(Equal(oldSC), "should keep old client on empty endpoint") + }) +})