diff --git a/integration/sandbox_clean_remove_test.go b/integration/sandbox_clean_remove_test.go index 5b369d8ee540d..1e35c9bd587c3 100644 --- a/integration/sandbox_clean_remove_test.go +++ b/integration/sandbox_clean_remove_test.go @@ -22,6 +22,8 @@ import ( "os" "path/filepath" "strings" + "sync" + "syscall" "testing" "time" @@ -123,3 +125,67 @@ func TestSandboxRemoveWithoutIPLeakage(t *testing.T) { t.Logf("Should not be able to find the pod ip in host-local checkpoint") assert.False(t, checkIP(ip)) } + +// TestSandboxStopWithNilCNIResult verifies that StopPodSandbox succeeds when +// CNIResult is nil (network setup never completed) even if the network teardown +// itself fails. This exercises the condition in sandbox_stop.go where a nil +// CNIResult causes the teardown error to be logged as a warning instead of +// returned as a hard error. +func TestSandboxStopWithNilCNIResult(t *testing.T) { + t.Log("Init PodSandboxConfig with specific label") + sbName := t.Name() + labels := map[string]string{ + sbName: "true", + } + sbConfig := PodSandboxConfig(sbName, "failpoint", WithPodLabels(labels)) + + t.Log("Inject CNI failpoint: delay Add (so CNIResult is never set) and fail Del") + conf := &failpointConf{ + // Delay CNI Add for 1 day so network setup never completes and CNIResult stays nil + Add: "1*delay(86400000)", + // Make CNI Del fail so teardownPodNetwork returns an error during stop + Del: "1*error(network-teardown-injected-error)", + } + injectCNIFailpoint(t, sbConfig, conf) + + var wg sync.WaitGroup + wg.Add(1) + + go func() { + defer wg.Done() + t.Log("Create a sandbox (will hang on CNI Add)") + _, err := runtimeService.RunPodSandbox(sbConfig, failpointRuntimeHandler) + assert.Error(t, err) + }() + + t.Log("Wait for CNI Add to start running") + assert.NoError(t, ensureCNIAddRunning(t, sbName), "CNI Add should be running") + + // Kill containerd while CNI Add is in progress so CNIResult is never set + // and no deferred cleanup runs. + t.Log("Kill containerd with SIGKILL to leave sandbox with nil CNIResult") + RestartContainerd(t, syscall.SIGKILL) + + wg.Wait() + + t.Log("ListPodSandbox with the specific label") + l, err := runtimeService.ListPodSandbox(&runtime.PodSandboxFilter{LabelSelector: labels}) + require.NoError(t, err) + require.Len(t, l, 1) + + sb := l[0] + require.Equal(t, runtime.PodSandboxState_SANDBOX_NOTREADY, sb.State) + + t.Log("Get sandbox info and verify CNIResult is nil") + _, info, err := SandboxInfo(sb.Id) + require.NoError(t, err) + require.Nil(t, info.CNIResult, "CNIResult should be nil because CNI setup never completed") + + t.Log("StopPodSandbox should succeed even though CNI Del fails, because CNIResult is nil") + err = runtimeService.StopPodSandbox(sb.Id) + assert.NoError(t, err, "StopPodSandbox should not return error when CNIResult is nil") + + t.Log("RemovePodSandbox should succeed") + err = runtimeService.RemovePodSandbox(sb.Id) + assert.NoError(t, err) +} diff --git a/internal/cri/server/sandbox_stop.go b/internal/cri/server/sandbox_stop.go index b44e74e7c56e2..dacb7149a2392 100644 --- a/internal/cri/server/sandbox_stop.go +++ b/internal/cri/server/sandbox_stop.go @@ -114,9 +114,11 @@ func (c *criService) stopPodSandbox(ctx context.Context, sandbox sandboxstore.Sa } else if closed { sandbox.NetNSPath = "" } - if sandbox.CNIResult != nil { - if err := c.teardownPodNetwork(ctx, sandbox); err != nil { + if err := c.teardownPodNetwork(ctx, sandbox); err != nil { + if sandbox.CNIResult != nil { return fmt.Errorf("failed to destroy network for sandbox %q: %w", id, err) + } else { + log.G(ctx).WithError(err).Warnf("failed to destroy network for sandbox %q; and ignoring because the sandbox network setup result is nil indicating the network setup never completed", id) } } if err := sandbox.NetNS.Remove(); err != nil {