Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 22 additions & 3 deletions cmd/api/api/instances.go
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,13 @@ func (s *ApiService) CreateInstance(ctx context.Context, request oapi.CreateInst
Message: err.Error(),
}, nil
}
restartPolicy, err := toDomainRestartPolicy(request.Body.RestartPolicy)
if err != nil {
return oapi.CreateInstance400JSONResponse{
Code: "invalid_restart_policy",
Message: err.Error(),
}, nil
}

domainReq := instances.CreateInstanceRequest{
Name: request.Body.Name,
Expand All @@ -319,6 +326,7 @@ func (s *ApiService) CreateInstance(ctx context.Context, request oapi.CreateInst
SkipGuestAgent: request.Body.SkipGuestAgent != nil && *request.Body.SkipGuestAgent,
AutoStandby: autoStandby,
HealthCheck: healthCheck,
RestartPolicy: restartPolicy,
}
if request.Body.SnapshotPolicy != nil {
snapshotPolicy, err := toInstanceSnapshotPolicy(*request.Body.SnapshotPolicy)
Expand Down Expand Up @@ -1044,11 +1052,20 @@ func (s *ApiService) UpdateInstance(ctx context.Context, request oapi.UpdateInst
Message: err.Error(),
}, nil
}
restartPolicy, err := toDomainRestartPolicy(request.Body.RestartPolicy)
if err != nil {
return oapi.UpdateInstance400JSONResponse{
Code: "invalid_restart_policy",
Message: err.Error(),
}, nil
}

result, err := s.InstanceManager.UpdateInstance(ctx, inst.Id, instances.UpdateInstanceRequest{
Env: env,
AutoStandby: autoStandby,
HealthCheck: healthCheck,
Env: env,
AutoStandby: autoStandby,
HealthCheck: healthCheck,
RestartPolicy: restartPolicy,
RestartPolicySet: request.Body.RestartPolicy != nil,
})
if err != nil {
switch {
Expand Down Expand Up @@ -1182,6 +1199,8 @@ func instanceToOAPI(inst instances.Instance) oapi.Instance {
oapiInst.AutoStandby = toOAPIAutoStandbyPolicy(inst.AutoStandby)
oapiInst.HealthCheck = toOAPIHealthCheck(inst.HealthCheck)
oapiInst.HealthStatus = toOAPIHealthStatus(healthcheck.Snapshot(inst.HealthCheck, string(inst.State), inst.HealthCheckRuntime))
oapiInst.RestartPolicy = toOAPIRestartPolicy(inst.RestartPolicy)
oapiInst.RestartStatus = toOAPIRestartStatus(inst.RestartStatus)

// Convert volume attachments
if len(inst.Volumes) > 0 {
Expand Down
147 changes: 147 additions & 0 deletions cmd/api/api/instances_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
mw "github.com/kernel/hypeman/lib/middleware"
"github.com/kernel/hypeman/lib/oapi"
"github.com/kernel/hypeman/lib/paths"
restartpolicy "github.com/kernel/hypeman/lib/restart-policy"
"github.com/kernel/hypeman/lib/system"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
Expand Down Expand Up @@ -281,6 +282,7 @@ func (m *captureUpdateManager) UpdateInstance(ctx context.Context, id string, re
Env: req.Env,
AutoStandby: req.AutoStandby,
HealthCheck: req.HealthCheck,
RestartPolicy: req.RestartPolicy,
CreatedAt: now,
HypervisorType: hypervisor.TypeCloudHypervisor,
},
Expand All @@ -304,6 +306,7 @@ func (m *captureCreateManager) CreateInstance(ctx context.Context, req instances
Vcpus: req.Vcpus,
AutoStandby: req.AutoStandby,
HealthCheck: req.HealthCheck,
RestartPolicy: req.RestartPolicy,
CreatedAt: now,
HypervisorType: hypervisor.TypeCloudHypervisor,
},
Expand Down Expand Up @@ -705,6 +708,48 @@ func TestCreateInstance_MapsHealthCheckPolicy(t *testing.T) {
assert.Equal(t, oapi.InstanceHealthStatusStatusStarting, instance.HealthStatus.Status)
}

// TestCreateInstance_MapsRestartPolicy verifies that a restart policy supplied
// on the create request is translated into the domain type handed to the
// instance manager, and that the created instance echoes the policy back.
func TestCreateInstance_MapsRestartPolicy(t *testing.T) {
	t.Parallel()

	svc := newTestService(t)
	capture := &captureCreateManager{Manager: svc.InstanceManager}
	svc.InstanceManager = capture

	var (
		mode        = oapi.OnFailure
		backoff     = "7s"
		stableAfter = "2m"
		maxAttempts = 4
	)

	body := &oapi.CreateInstanceRequest{
		Name:  "test-restart-policy",
		Image: "docker.io/library/alpine:latest",
		RestartPolicy: &oapi.RestartPolicy{
			Policy:      &mode,
			Backoff:     &backoff,
			StableAfter: &stableAfter,
			MaxAttempts: &maxAttempts,
		},
	}

	resp, err := svc.CreateInstance(ctx(), oapi.CreateInstanceRequestObject{Body: body})
	require.NoError(t, err)

	created, ok := resp.(oapi.CreateInstance201JSONResponse)
	require.True(t, ok, "expected 201 response")

	// The manager must have received the mapped domain policy.
	require.NotNil(t, capture.lastReq)
	got := capture.lastReq.RestartPolicy
	require.NotNil(t, got)
	assert.Equal(t, restartpolicy.PolicyOnFailure, got.Policy)
	assert.Equal(t, "7s", got.Backoff)
	assert.Equal(t, "2m", got.StableAfter)
	assert.Equal(t, 4, got.MaxAttempts)

	// The response body must render the policy back to the client.
	instance := oapi.Instance(created)
	require.NotNil(t, instance.RestartPolicy)
	require.NotNil(t, instance.RestartPolicy.Policy)
	assert.Equal(t, oapi.OnFailure, *instance.RestartPolicy.Policy)
}

func TestUpdateInstance_MapsEnvPatch(t *testing.T) {
t.Parallel()
svc := newTestService(t)
Expand Down Expand Up @@ -883,6 +928,108 @@ func TestUpdateInstance_MapsHealthCheckPatch(t *testing.T) {
assert.Equal(t, oapi.InstanceHealthStatusStatusUnknown, instance.HealthStatus.Status)
}

// TestUpdateInstance_MapsRestartPolicyPatch verifies that a restart-policy
// patch is mapped to the domain type, that RestartPolicySet is raised when the
// patch field is present, and that the stored policy plus restart status are
// rendered on the response.
func TestUpdateInstance_MapsRestartPolicyPatch(t *testing.T) {
	t.Parallel()
	svc := newTestService(t)

	now := time.Now()
	stored := instances.StoredMetadata{
		Id:             "inst-update-restart-policy",
		Name:           "inst-update-restart-policy",
		Image:          "docker.io/library/alpine:latest",
		CreatedAt:      now,
		HypervisorType: hypervisor.TypeCloudHypervisor,
		RestartPolicy: &restartpolicy.Policy{
			Policy:      restartpolicy.PolicyAlways,
			Backoff:     "5s",
			StableAfter: "10m0s",
		},
		RestartStatus: restartpolicy.Status{
			BlockedReason: restartpolicy.BlockedReasonManualStop,
		},
	}
	capture := &captureUpdateManager{
		Manager: svc.InstanceManager,
		result: &instances.Instance{
			StoredMetadata: stored,
			State:          instances.StateStopped,
		},
	}
	svc.InstanceManager = capture

	resolved := &instances.Instance{
		StoredMetadata: instances.StoredMetadata{
			Id:             "inst-update-restart-policy",
			Name:           "inst-update-restart-policy",
			Image:          "docker.io/library/alpine:latest",
			CreatedAt:      now,
			HypervisorType: hypervisor.TypeCloudHypervisor,
		},
		State: instances.StateStopped,
	}

	mode := oapi.Always
	resp, err := svc.UpdateInstance(mw.WithResolvedInstance(ctx(), resolved.Id, resolved), oapi.UpdateInstanceRequestObject{
		Id: resolved.Id,
		Body: &oapi.UpdateInstanceRequest{
			RestartPolicy: &oapi.RestartPolicy{Policy: &mode},
		},
	})
	require.NoError(t, err)

	updated, ok := resp.(oapi.UpdateInstance200JSONResponse)
	require.True(t, ok, "expected 200 response")

	// The manager must see both the mapped policy and the "set" marker.
	require.NotNil(t, capture.lastReq)
	assert.True(t, capture.lastReq.RestartPolicySet)
	require.NotNil(t, capture.lastReq.RestartPolicy)
	assert.Equal(t, restartpolicy.PolicyAlways, capture.lastReq.RestartPolicy.Policy)

	// The response must carry the stored policy and the restart status.
	instance := oapi.Instance(updated)
	require.NotNil(t, instance.RestartPolicy)
	require.NotNil(t, instance.RestartStatus)
	require.NotNil(t, instance.RestartStatus.BlockedReason)
	assert.Equal(t, oapi.ManualStop, *instance.RestartStatus.BlockedReason)
}

// TestUpdateInstance_RejectsInvalidRestartPolicy verifies that an invalid
// restart policy is rejected with a 400 invalid_restart_policy response
// before the instance manager is ever invoked.
func TestUpdateInstance_RejectsInvalidRestartPolicy(t *testing.T) {
	t.Parallel()
	svc := newTestService(t)

	capture := &captureUpdateManager{Manager: svc.InstanceManager}
	svc.InstanceManager = capture

	resolved := &instances.Instance{
		StoredMetadata: instances.StoredMetadata{
			Id:             "inst-update-restart-policy",
			Name:           "inst-update-restart-policy",
			Image:          "docker.io/library/alpine:latest",
			CreatedAt:      time.Now(),
			HypervisorType: hypervisor.TypeCloudHypervisor,
		},
		State: instances.StateStopped,
	}

	mode := oapi.OnFailure
	// NOTE(review): "0s" is treated as an invalid backoff by the policy
	// validator — confirm against restartpolicy.NormalizePolicy.
	badBackoff := "0s"

	resp, err := svc.UpdateInstance(mw.WithResolvedInstance(ctx(), resolved.Id, resolved), oapi.UpdateInstanceRequestObject{
		Id: resolved.Id,
		Body: &oapi.UpdateInstanceRequest{
			RestartPolicy: &oapi.RestartPolicy{
				Policy:  &mode,
				Backoff: &badBackoff,
			},
		},
	})
	require.NoError(t, err)

	badReq, ok := resp.(oapi.UpdateInstance400JSONResponse)
	require.True(t, ok, "expected 400 response")
	assert.Equal(t, "invalid_restart_policy", badReq.Code)
	// Validation must short-circuit before the manager is reached.
	assert.Nil(t, capture.lastReq)
}

func TestUpdateInstance_RejectsZeroAutoStandbyIgnoreDestinationPort(t *testing.T) {
t.Parallel()
svc := newTestService(t)
Expand Down
79 changes: 79 additions & 0 deletions cmd/api/api/restart_policy.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
package api

import (
"github.com/kernel/hypeman/lib/oapi"
restartpolicy "github.com/kernel/hypeman/lib/restart-policy"
"github.com/samber/lo"
)

// toDomainRestartPolicy converts an optional OAPI restart policy into its
// domain representation, validating it with NormalizePolicy. A nil input maps
// to a nil policy with no error; unset optional fields stay zero-valued.
func toDomainRestartPolicy(policy *oapi.RestartPolicy) (*restartpolicy.Policy, error) {
	if policy == nil {
		return nil, nil
	}

	domain := &restartpolicy.Policy{}
	if mode := policy.Policy; mode != nil {
		domain.Policy = restartpolicy.PolicyMode(*mode)
	}
	if backoff := policy.Backoff; backoff != nil {
		domain.Backoff = *backoff
	}
	if attempts := policy.MaxAttempts; attempts != nil {
		domain.MaxAttempts = *attempts
	}
	if stable := policy.StableAfter; stable != nil {
		domain.StableAfter = *stable
	}

	// NOTE(review): NormalizePolicy is used for validation only; the
	// normalized result is discarded and the raw values are returned —
	// confirm downstream consumers normalize again before use.
	if _, err := restartpolicy.NormalizePolicy(domain); err != nil {
		return nil, err
	}
	return domain, nil
}

// toOAPIRestartPolicy renders a domain restart policy in its OAPI shape,
// omitting zero-valued optional fields. A nil policy renders as nil.
func toOAPIRestartPolicy(policy *restartpolicy.Policy) *oapi.RestartPolicy {
	if policy == nil {
		return nil
	}

	mode := oapi.RestartPolicyPolicy(policy.Policy)
	result := oapi.RestartPolicy{Policy: &mode}
	if backoff := policy.Backoff; backoff != "" {
		result.Backoff = lo.ToPtr(backoff)
	}
	if attempts := policy.MaxAttempts; attempts > 0 {
		result.MaxAttempts = lo.ToPtr(attempts)
	}
	if stable := policy.StableAfter; stable != "" {
		result.StableAfter = lo.ToPtr(stable)
	}
	return &result
}

// toOAPIRestartStatus renders the runtime restart status in its OAPI shape.
// A zero-valued status renders as nil so the field is omitted entirely;
// empty reasons and nil timestamps are likewise left unset.
func toOAPIRestartStatus(status restartpolicy.Status) *oapi.RestartStatus {
	if status.IsZero() {
		return nil
	}

	result := oapi.RestartStatus{
		Attempts: lo.ToPtr(status.Attempts),
	}
	if reason := status.BlockedReason; reason != "" {
		blocked := oapi.RestartStatusBlockedReason(reason)
		result.BlockedReason = &blocked
	}
	if at := status.LastAttemptAt; at != nil {
		// Timestamps are reported in UTC regardless of how they were stored.
		last := at.UTC()
		result.LastAttemptAt = &last
	}
	if at := status.NextAttemptAt; at != nil {
		next := at.UTC()
		result.NextAttemptAt = &next
	}
	if reason := status.LastReason; reason != "" {
		last := oapi.RestartStatusLastReason(reason)
		result.LastReason = &last
	}
	return &result
}
8 changes: 8 additions & 0 deletions cmd/api/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -571,6 +571,14 @@ func run() error {
return app.HealthCheckController.Run(gctx)
})
}
if restartController, ok := app.InstanceManager.(interface {
StartRestartPolicyController(context.Context) error
}); ok {
grp.Go(func() error {
logger.Info("starting restart policy controller")
return restartController.StartRestartPolicyController(gctx)
})
}

// Run the server
grp.Go(func() error {
Expand Down
8 changes: 6 additions & 2 deletions lib/healthcheck/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,10 @@ Stopping, deleting, standing by, or restoring an instance stops active checks. S

## Restart Policy

Health checks only report health. They do not restart instances.
Health checks do not restart instances by themselves.

If Hypeman later adds restart-on-unhealthy behavior, it should consume `health_status=unhealthy` explicitly rather than making health checks mutate lifecycle state.
When an instance also has `restart_policy.policy=on_failure` or `restart_policy.policy=always`, an `unhealthy` health status becomes a restart-policy failure signal. The restart policy applies its normal backoff, max attempts, manual-stop suppression, and stable-window reset before Hypeman restarts the whole instance.

With `restart_policy.policy=never` or no restart policy, health checks only report status.

Health checks still do not mutate lifecycle state directly. The instance remains `Running` while unhealthy until restart policy chooses to stop and start it.
7 changes: 7 additions & 0 deletions lib/healthcheck/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,13 @@ func (c *Controller) runCheck(ctx context.Context, id string) {
if err := c.store.SetRuntime(ctx, id, runtime); err != nil {
c.log.Warn("failed to persist health check status", "instance_id", id, "error", err)
}
if runtime.Status == StatusUnhealthy {
if handler, ok := c.store.(UnhealthyHandler); ok {
if err := handler.HandleUnhealthy(ctx, inst, runtime); err != nil {
c.log.Warn("failed to handle unhealthy instance", "instance_id", id, "error", err)
}
}
}

interval, _, _, err := DurationConfig(policy)
if err != nil {
Expand Down
Loading
Loading