Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
223 changes: 123 additions & 100 deletions api/deployment/v1/message.pb.go

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions common/worker_versioning/worker_versioning.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,11 @@ const (
WorkerDeploymentVersionWorkflowIDInitialSize = len(WorkerDeploymentVersionWorkflowIDPrefix) + len(WorkerDeploymentVersionDelimiter) // 39
WorkerDeploymentNameFieldName = "WorkerDeploymentName"
WorkerDeploymentBuildIDFieldName = "BuildID"

// SignalSyncValidationStatus is sent by the WCI workflow to the version workflow
// when ValidationStatus changes, so the deployment workflow can maintain an
// up-to-date connectivity summary in its memo.
SignalSyncValidationStatus = "sync-validation-status"
)

// FormatPinnedVersionNotInTaskQueueError formats the error message when a pinned version
Expand Down
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,8 @@ require (
go.opentelemetry.io/otel/sdk v1.43.0
go.opentelemetry.io/otel/sdk/metric v1.43.0
go.opentelemetry.io/otel/trace v1.43.0
go.temporal.io/api v1.62.15-0.20260615235047-378792ab2240
go.temporal.io/auto-scaled-workers v0.0.0-20260407181057-edd947d743d2
go.temporal.io/api v1.62.15-0.20260622232249-060670b1b866
go.temporal.io/auto-scaled-workers v0.0.0-20260622220320-9b1e3849116d
go.temporal.io/sdk v1.41.1
go.uber.org/fx v1.24.0
go.uber.org/goleak v1.3.0
Expand Down
8 changes: 4 additions & 4 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -471,10 +471,10 @@ go.opentelemetry.io/proto/slim/otlp/collector/profiles/v1development v0.3.0 h1:R
go.opentelemetry.io/proto/slim/otlp/collector/profiles/v1development v0.3.0/go.mod h1:I89cynRj8y+383o7tEQVg2SVA6SRgDVIouWPUVXjx0U=
go.opentelemetry.io/proto/slim/otlp/profiles/v1development v0.3.0 h1:CQvJSldHRUN6Z8jsUeYv8J0lXRvygALXIzsmAeCcZE0=
go.opentelemetry.io/proto/slim/otlp/profiles/v1development v0.3.0/go.mod h1:xSQ+mEfJe/GjK1LXEyVOoSI1N9JV9ZI923X5kup43W4=
go.temporal.io/api v1.62.15-0.20260615235047-378792ab2240 h1:Up/CNfkScGxN1TdrGZ3ez+0k6MIIhuhlbBgdZnrPhm0=
go.temporal.io/api v1.62.15-0.20260615235047-378792ab2240/go.mod h1:0k75tRljEuELWGeXjEZZO7zYqBln4+1FrG6+IMOMy7Q=
go.temporal.io/auto-scaled-workers v0.0.0-20260407181057-edd947d743d2 h1:1hKeH3GyR6YD6LKMHGCZ76t6h1Sgha0hXVQBxWi3dlQ=
go.temporal.io/auto-scaled-workers v0.0.0-20260407181057-edd947d743d2/go.mod h1:T8dnzVPeO+gaUTj9eDgm/lT2lZH4+JXNvrGaQGyVi50=
go.temporal.io/api v1.62.15-0.20260622232249-060670b1b866 h1:VRS5gok7O0T64Irinc37h/hD/Dtzpt2bB7OolUUUTTc=
go.temporal.io/api v1.62.15-0.20260622232249-060670b1b866/go.mod h1:0k75tRljEuELWGeXjEZZO7zYqBln4+1FrG6+IMOMy7Q=
go.temporal.io/auto-scaled-workers v0.0.0-20260622220320-9b1e3849116d h1:f7+FCJHSrYWz9zvJp2OxKo8Fu/dsBUdnZZA+m5CEOS0=
go.temporal.io/auto-scaled-workers v0.0.0-20260622220320-9b1e3849116d/go.mod h1:HOnbQTZCW18EPcutFHTkZrDcGO4tUjQ8N2pjDyrGhrY=
go.temporal.io/sdk v1.41.1 h1:yOpvsHyDD1lNuwlGBv/SUodCPhjv9nDeC9lLHW/fJUA=
go.temporal.io/sdk v1.41.1/go.mod h1:/InXQT5guZ6AizYzpmzr5avQ/GMgq1ZObcKlKE2AhTc=
go.uber.org/atomic v1.5.0/go.mod h1:sABNBOSYdrvTF6hTgEIbc7YasKWGhgEQZyfxyTvoXHQ=
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,9 @@ message VersionLocalState {

// Cached compute config summary, kept in sync with the WCI on each compute config update.
temporal.api.compute.v1.ComputeConfigSummary compute_config = 18;

// Cached compute status, updated when WCI signals this version workflow.
temporal.api.deployment.v1.ComputeStatus compute_status = 19;
}

// Data specific to a task queue, from the perspective of a worker deployment version.
Expand Down Expand Up @@ -233,6 +236,9 @@ message WorkerDeploymentVersionSummary {
// Compute config summary for this version. Synced from the version workflow on each compute config update.
// Also set by the deployment workflow at version creation time if a compute config was provided.
temporal.api.compute.v1.ComputeConfigSummary compute_config = 13;

// Compute status for this version. Synced from the version workflow when WCI signals a status change.
temporal.api.deployment.v1.ComputeStatus compute_status = 14;
}

// used as Worker Deployment Version workflow update input:
Expand Down
2 changes: 2 additions & 0 deletions service/worker/workerdeployment/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -630,6 +630,7 @@ func (d *ClientImpl) DescribeWorkerDeployment(
if err != nil {
return nil, nil, err
}

return dInfo, queryResponse.GetState().GetConflictToken(), nil
}

Expand Down Expand Up @@ -1798,6 +1799,7 @@ func (d *ClientImpl) deploymentStateToDeploymentInfo(deploymentName string, stat
LastDeactivationTime: v.GetLastDeactivationTime(),
Status: v.GetStatus(),
ComputeConfig: v.GetComputeConfig(),
ComputeStatus: v.GetComputeStatus(),
})
Comment thread
smuneebahmad marked this conversation as resolved.
}

Expand Down
17 changes: 17 additions & 0 deletions service/worker/workerdeployment/compute_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@ package workerdeployment

import (
computepb "go.temporal.io/api/compute/v1"
deploymentpb "go.temporal.io/api/deployment/v1"
wciiface "go.temporal.io/auto-scaled-workers/wci/workflow/iface"
"go.temporal.io/sdk/workflow"
"google.golang.org/protobuf/types/known/timestamppb"
)

func computeConfigScalingGroupsToWCISpec(scalingGroups map[string]*computepb.ComputeConfigScalingGroup) *wciiface.WorkerControllerInstanceSpec {
Expand Down Expand Up @@ -103,3 +105,18 @@ func scalingGroupsToUpsertUpdates(scalingGroups map[string]*computepb.ComputeCon
}
return updates
}

// wciValidationStatusToComputeStatus converts a WCI ValidationStatus to the public ComputeStatus proto.
// A successful validation results in an empty error_message; a failed validation sets the error_message.
func wciValidationStatusToComputeStatus(vs *wciiface.ValidationStatus) *deploymentpb.ComputeStatus {
if vs == nil {
return nil
Comment thread
smuneebahmad marked this conversation as resolved.
}
pv := &deploymentpb.ComputeStatus_ProviderValidationStatus{
LastCheckTime: timestamppb.New(vs.LastValidationTime),
}
if vs.Status == wciiface.ValidationResultFailed {
pv.ErrorMessage = vs.ErrMessage
}
return &deploymentpb.ComputeStatus{ProviderValidation: pv}
}
27 changes: 27 additions & 0 deletions service/worker/workerdeployment/compute_util_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@ package workerdeployment

import (
"testing"
"time"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
computepb "go.temporal.io/api/compute/v1"
enumspb "go.temporal.io/api/enums/v1"
wciiface "go.temporal.io/auto-scaled-workers/wci/workflow/iface"
Expand Down Expand Up @@ -49,3 +51,28 @@ func TestComputeConfigScalingGroupsToWCISpec_WithComputeAndScaling(t *testing.T)
assert.Equal(t, wciiface.ComputeProviderType("aws-ecs"), g2.Compute.ProviderType)
assert.Nil(t, g2.Scaling, "no scaler means nil scaling spec")
}

func TestWciValidationStatusToComputeStatus_Nil(t *testing.T) {
t.Parallel()
require.Nil(t, wciValidationStatusToComputeStatus(nil))
}

// TestWciValidationStatusToComputeStatus_Success checks that a successful validation
// converts to a ComputeStatus with an empty error message and the same check time.
func TestWciValidationStatusToComputeStatus_Success(t *testing.T) {
	t.Parallel()

	checkedAt := time.Date(2025, 1, 15, 12, 0, 0, 0, time.UTC)
	status := wciValidationStatusToComputeStatus(wciiface.NewValidationStatusSuccess(checkedAt))

	require.NotNil(t, status)
	pv := status.ProviderValidation
	require.NotNil(t, pv)
	require.Empty(t, pv.ErrorMessage)
	require.Equal(t, checkedAt, pv.LastCheckTime.AsTime())
}

// TestWciValidationStatusToComputeStatus_Failed checks that a failed validation
// converts to a ComputeStatus carrying the failure message and the same check time.
func TestWciValidationStatusToComputeStatus_Failed(t *testing.T) {
	t.Parallel()

	checkedAt := time.Date(2025, 1, 15, 12, 0, 0, 0, time.UTC)
	status := wciValidationStatusToComputeStatus(wciiface.NewValidationStatusFailed(checkedAt, "lambda unreachable"))

	require.NotNil(t, status)
	pv := status.ProviderValidation
	require.NotNil(t, pv)
	require.Equal(t, "lambda unreachable", pv.ErrorMessage)
	require.Equal(t, checkedAt, pv.LastCheckTime.AsTime())
}
16 changes: 16 additions & 0 deletions service/worker/workerdeployment/version_workflow.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
deploymentpb "go.temporal.io/api/deployment/v1"
enumspb "go.temporal.io/api/enums/v1"
"go.temporal.io/api/serviceerror"
wciiface "go.temporal.io/auto-scaled-workers/wci/workflow/iface"
sdkclient "go.temporal.io/sdk/client"
sdklog "go.temporal.io/sdk/log"
"go.temporal.io/sdk/temporal"
Expand Down Expand Up @@ -227,6 +228,20 @@ func (d *VersionWorkflowRunner) listenToSignals(ctx workflow.Context) {
})
}

// Version gate for sync-validation-status signal to prevent NDEs during rollback
if workflow.GetVersion(ctx, "sync-validation-status-signal", workflow.DefaultVersion, 0) >= 0 {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems we've been doing this kind of patching for other signal handlers, but I don't think patching (GetVersion) really helps with NDEs related to the new signal, because the handler registration itself does not create history events and is safe to hit during replay of a workflow that ran on the previous version without the signal handler.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point. Will clean this up in a follow-up PR

syncValidationStatusChannel := workflow.GetSignalChannel(ctx, worker_versioning.SignalSyncValidationStatus)
d.signalHandler.signalSelector.AddReceive(syncValidationStatusChannel, func(c workflow.ReceiveChannel, more bool) {
d.signalHandler.processingSignals++
defer func() { d.signalHandler.processingSignals-- }()

var vs wciiface.ValidationStatus
c.Receive(ctx, &vs)
d.VersionState.ComputeStatus = wciValidationStatusToComputeStatus(&vs)
d.syncSummary(ctx) // propagate updated ComputeStatus to deployment workflow
})
Comment thread
smuneebahmad marked this conversation as resolved.
}
Comment thread
smuneebahmad marked this conversation as resolved.

// Keep waiting for signals, when it's time to CaN the main goroutine will exit.
for {
d.signalHandler.signalSelector.Select(ctx)
Expand Down Expand Up @@ -1056,6 +1071,7 @@ func versionStateToSummary(s *deploymentspb.VersionLocalState) *deploymentspb.Wo
LastDeactivationTime: s.LastDeactivationTime,
Status: s.Status,
ComputeConfig: s.ComputeConfig,
ComputeStatus: s.ComputeStatus,
}
}

Expand Down
95 changes: 95 additions & 0 deletions service/worker/workerdeployment/version_workflow_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
computepb "go.temporal.io/api/compute/v1"
deploymentpb "go.temporal.io/api/deployment/v1"
enumspb "go.temporal.io/api/enums/v1"
wciiface "go.temporal.io/auto-scaled-workers/wci/workflow/iface"
"go.temporal.io/sdk/temporal"
"go.temporal.io/sdk/testsuite"
"go.temporal.io/sdk/workflow"
Expand Down Expand Up @@ -2604,3 +2605,97 @@ func (s *VersionWorkflowSuite) Test_ReactivateVersion_IgnoredWhenNotDrainedOrIna

s.True(s.env.IsWorkflowCompleted())
}

// Test_SyncValidationStatus_SuccessValidation sends a successful WCI validation signal to the
// version workflow and checks, via QueryDescribeVersion, that the version state's ComputeStatus
// has an empty error message and the validation time carried by the signal.
func (s *VersionWorkflowSuite) Test_SyncValidationStatus_SuccessValidation() {
	tv := testvars.New(s.T())
	now := timestamppb.New(time.Now())
	validationTime := time.Date(2025, 1, 15, 12, 0, 0, 0, time.UTC)

	var a *VersionActivities
	s.env.RegisterActivity(a.StartWorkerDeploymentWorkflow)
	s.env.OnActivity(a.StartWorkerDeploymentWorkflow, mock.Anything, mock.Anything).Return(nil).Maybe()
	s.env.OnSignalExternalWorkflow(mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(nil).Maybe()

	// Scheduled from inside the signalling callback, so it runs after the signal was sent.
	assertComputeStatus := func() {
		queryResp := &deploymentspb.QueryDescribeVersionResponse{}
		val, err := s.env.QueryWorkflow(QueryDescribeVersion)
		s.Require().NoError(err)
		s.Require().NoError(val.Get(queryResp))

		cs := queryResp.VersionState.ComputeStatus
		s.Require().NotNil(cs)
		s.Require().NotNil(cs.ProviderValidation)
		s.Empty(cs.ProviderValidation.ErrorMessage)
		s.Equal(validationTime, cs.ProviderValidation.LastCheckTime.AsTime())
	}

	s.env.RegisterDelayedCallback(func() {
		s.env.SignalWorkflow(worker_versioning.SignalSyncValidationStatus, wciiface.NewValidationStatusSuccess(validationTime))
		s.env.RegisterDelayedCallback(assertComputeStatus, 10*time.Millisecond)
	}, 10*time.Millisecond)

	versionState := &deploymentspb.VersionLocalState{
		Version: &deploymentspb.WorkerDeploymentVersion{
			DeploymentName: tv.DeploymentSeries(),
			BuildId:        tv.BuildID(),
		},
		Status:                    enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT,
		CurrentSinceTime:          now,
		SyncBatchSize:             int32(s.workerDeploymentClient.getSyncBatchSize()),
		StartedDeploymentWorkflow: true,
	}
	s.env.ExecuteWorkflow(WorkerDeploymentVersionWorkflowType, &deploymentspb.WorkerDeploymentVersionWorkflowArgs{
		NamespaceName: tv.NamespaceName().String(),
		NamespaceId:   tv.NamespaceID().String(),
		VersionState:  versionState,
	})

	s.True(s.env.IsWorkflowCompleted())
}

// Test_SyncValidationStatus_FailedValidation sends a failed WCI validation signal to the
// version workflow and checks, via QueryDescribeVersion, that the version state's ComputeStatus
// carries the failure message and the validation time from the signal.
func (s *VersionWorkflowSuite) Test_SyncValidationStatus_FailedValidation() {
	tv := testvars.New(s.T())
	now := timestamppb.New(time.Now())
	validationTime := time.Date(2025, 1, 15, 12, 0, 0, 0, time.UTC)

	var a *VersionActivities
	s.env.RegisterActivity(a.StartWorkerDeploymentWorkflow)
	s.env.OnActivity(a.StartWorkerDeploymentWorkflow, mock.Anything, mock.Anything).Return(nil).Maybe()
	s.env.OnSignalExternalWorkflow(mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(nil).Maybe()

	// Scheduled from inside the signalling callback, so it runs after the signal was sent.
	assertComputeStatus := func() {
		queryResp := &deploymentspb.QueryDescribeVersionResponse{}
		val, err := s.env.QueryWorkflow(QueryDescribeVersion)
		s.Require().NoError(err)
		s.Require().NoError(val.Get(queryResp))

		cs := queryResp.VersionState.ComputeStatus
		s.Require().NotNil(cs)
		s.Require().NotNil(cs.ProviderValidation)
		s.Equal("lambda unreachable", cs.ProviderValidation.ErrorMessage)
		s.Equal(validationTime, cs.ProviderValidation.LastCheckTime.AsTime())
	}

	s.env.RegisterDelayedCallback(func() {
		s.env.SignalWorkflow(worker_versioning.SignalSyncValidationStatus, wciiface.NewValidationStatusFailed(validationTime, "lambda unreachable"))
		s.env.RegisterDelayedCallback(assertComputeStatus, 10*time.Millisecond)
	}, 10*time.Millisecond)

	versionState := &deploymentspb.VersionLocalState{
		Version: &deploymentspb.WorkerDeploymentVersion{
			DeploymentName: tv.DeploymentSeries(),
			BuildId:        tv.BuildID(),
		},
		Status:                    enumspb.WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT,
		CurrentSinceTime:          now,
		SyncBatchSize:             int32(s.workerDeploymentClient.getSyncBatchSize()),
		StartedDeploymentWorkflow: true,
	}
	s.env.ExecuteWorkflow(WorkerDeploymentVersionWorkflowType, &deploymentspb.WorkerDeploymentVersionWorkflowArgs{
		NamespaceName: tv.NamespaceName().String(),
		NamespaceId:   tv.NamespaceID().String(),
		VersionState:  versionState,
	})

	s.True(s.env.IsWorkflowCompleted())
}
1 change: 1 addition & 0 deletions service/worker/workerdeployment/workflow.go
Original file line number Diff line number Diff line change
Expand Up @@ -1822,5 +1822,6 @@ func (d *WorkflowRunner) getWorkerDeploymentInfoVersionSummary(versionSummary *d
LastCurrentTime: versionSummary.GetLastCurrentTime(),
LastDeactivationTime: versionSummary.GetLastDeactivationTime(),
ComputeConfig: versionSummary.GetComputeConfig(),
ComputeStatus: versionSummary.GetComputeStatus(),
}
}
Loading