Skip to content

Commit 055e8db

Browse files
committed
Fix multiple starts causing errors
1 parent cbfbd7a commit 055e8db

2 files changed

Lines changed: 12 additions & 12 deletions

File tree

pkg/fleet/daemon.go

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -185,9 +185,13 @@ func (d *Daemon) startDatadogAgentExperiment(ctx context.Context, req remoteAPIR
185185
}
186186

187187
if err := canStart(getExperimentPhase(dda)); err != nil {
188-
// If the experiment is already running with the same ID, treat as idempotent success.
189-
if dda.Status.Experiment != nil && dda.Status.Experiment.ID == req.ID {
190-
logger.Info("Experiment already running with this ID, acknowledging", "phase", dda.Status.Experiment.Phase)
188+
// If an experiment is already running, treat repeated start signals as idempotent.
189+
// The backend retries with new task UUIDs until it sees an ack, and in-memory
190+
// state (ExperimentConfigVersion) doesn't survive operator restarts, so we
191+
// cannot reliably match on config version. The backend is responsible for not
192+
// sending a start for a different experiment while one is already running.
193+
if getExperimentPhase(dda) == v2alpha1.ExperimentPhaseRunning {
194+
logger.Info("Experiment already running, acknowledging start signal as idempotent", "experimentID", dda.Status.Experiment.ID)
191195
return nil
192196
}
193197
return fmt.Errorf("start DatadogAgent experiment: %w", err)

pkg/fleet/daemon_test.go

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -127,24 +127,20 @@ func TestStartDatadogAgentExperiment_DDANotFound(t *testing.T) {
127127
assert.Error(t, d.startDatadogAgentExperiment(context.Background(), testStartRequest()))
128128
}
129129

130-
func TestStartDatadogAgentExperiment_Running(t *testing.T) {
131-
d, _ := testDaemon(testDDAObject(v2alpha1.ExperimentPhaseRunning), testInstallerConfigWithDDA())
132-
assert.Error(t, d.startDatadogAgentExperiment(context.Background(), testStartRequest()))
133-
}
134-
135-
func TestStartDatadogAgentExperiment_Running_SameID_Idempotent(t *testing.T) {
130+
func TestStartDatadogAgentExperiment_Running_Idempotent(t *testing.T) {
136131
dda := testDDAObject(v2alpha1.ExperimentPhaseRunning)
137-
req := testStartRequest()
138-
dda.Status.Experiment.ID = req.ID // same ID as the incoming request
139132
d, c := testDaemon(dda, testInstallerConfigWithDDA())
133+
// The backend retries with a new task UUID; the repeated start should be treated as idempotent.
134+
req := testStartRequest()
135+
req.ID = "retry-task-uuid"
140136
require.NoError(t, d.startDatadogAgentExperiment(context.Background(), req))
141137

142138
// DDA should be unchanged — no re-patch, no status update.
143139
got := &v2alpha1.DatadogAgent{}
144140
require.NoError(t, c.Get(context.Background(), testDDANSN, got))
145141
require.NotNil(t, got.Status.Experiment)
146142
assert.Equal(t, v2alpha1.ExperimentPhaseRunning, got.Status.Experiment.Phase)
147-
assert.Equal(t, req.ID, got.Status.Experiment.ID)
143+
assert.Equal(t, "old-exp", got.Status.Experiment.ID)
148144
}
149145

150146
func TestStartDatadogAgentExperiment_Stopped(t *testing.T) {

0 commit comments

Comments
 (0)