Skip to content

Commit e3b292d

Browse files
authored
[shim] Don't check image downloaded size (#2903)
Sometimes dockerd emits less "Download complete" messages than expected, but the image is pulled successfully. The reason is unclear, but, anyway, this check is redundant since we also rely on the status message, which is emitted only in the case of succesfull pull. In addition, this patch adds/changes the following: * Write dockerd pull stream (JSON Lines) to {runnerDir}/pull.log — useful for debugging (in conjunction with DSTACK_SERVER_KEEP_SHIM_TASKS=1) * Use `errorDetail.message` instead of deprecated `error` * Move `ctx.Err()` check upper, otherwise it's shadowed by pull errors Fixes: #2503
1 parent 1602c80 commit e3b292d

5 files changed

Lines changed: 81 additions & 51 deletions

File tree

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ repos:
1010
rev: v1.62.0 # Should match .github/workflows/build.yml
1111
hooks:
1212
- id: golangci-lint-full
13-
language_version: 1.23.0 # Should match runner/go.mod
13+
language_version: 1.23.8 # Should match runner/go.mod
1414
entry: bash -c 'cd runner && golangci-lint run'
1515
stages: [manual]
1616
- repo: https://github.com/pre-commit/pre-commit-hooks

runner/go.mod

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,10 +79,11 @@ require (
7979
golang.org/x/mod v0.17.0 // indirect
8080
golang.org/x/net v0.24.0 // indirect
8181
golang.org/x/sync v0.7.0 // indirect
82+
golang.org/x/time v0.5.0 // indirect
8283
golang.org/x/tools v0.20.0 // indirect
8384
google.golang.org/genproto/googleapis/api v0.0.0-20240401170217-c3f982113cda // indirect
8485
google.golang.org/genproto/googleapis/rpc v0.0.0-20240401170217-c3f982113cda // indirect
8586
gopkg.in/warnings.v0 v0.1.2 // indirect
8687
gopkg.in/yaml.v3 v3.0.1 // indirect
87-
gotest.tools/v3 v3.5.0 // indirect
88+
gotest.tools/v3 v3.5.1 // indirect
8889
)

runner/go.sum

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -285,8 +285,9 @@ golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
285285
golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
286286
golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ=
287287
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
288-
golang.org/x/time v0.0.0-20170424234030-8be79e1e0910 h1:bCMaBn7ph495H+x72gEvgcv+mDRd9dElbzo/mVCMxX4=
289288
golang.org/x/time v0.0.0-20170424234030-8be79e1e0910/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
289+
golang.org/x/time v0.5.0 h1:o7cqy6amK/52YcAKIPlM3a+Fpj35zvRj2TP+e1xFSfk=
290+
golang.org/x/time v0.5.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
290291
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
291292
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
292293
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
@@ -322,5 +323,5 @@ gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
322323
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
323324
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
324325
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
325-
gotest.tools/v3 v3.5.0 h1:Ljk6PdHdOhAb5aDMWXjDLMMhph+BpztA4v1QdqEW2eY=
326-
gotest.tools/v3 v3.5.0/go.mod h1:isy3WKz7GK6uNw/sbHzfKBLvlvXwUyV06n6brMxxopU=
326+
gotest.tools/v3 v3.5.1 h1:EENdUnS3pdur5nybKYIh2Vfgc8IUNBjxDPSjtiJcOzU=
327+
gotest.tools/v3 v3.5.1/go.mod h1:isy3WKz7GK6uNw/sbHzfKBLvlvXwUyV06n6brMxxopU=

runner/internal/shim/docker.go

Lines changed: 64 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,13 @@ func (d *DockerRunner) Run(ctx context.Context, taskID string) error {
274274
cfg := task.config
275275
var err error
276276

277+
runnerDir, err := d.dockerParams.MakeRunnerDir(task.containerName)
278+
if err != nil {
279+
return tracerr.Wrap(err)
280+
}
281+
task.runnerDir = runnerDir
282+
log.Debug(ctx, "runner dir", "task", task.ID, "path", runnerDir)
283+
277284
if cfg.GPU != 0 {
278285
gpuIDs, err := d.gpuLock.Acquire(ctx, cfg.GPU)
279286
if err != nil {
@@ -335,7 +342,10 @@ func (d *DockerRunner) Run(ctx context.Context, taskID string) error {
335342
if err := d.tasks.Update(task); err != nil {
336343
return tracerr.Errorf("%w: failed to update task %s: %w", ErrInternal, task.ID, err)
337344
}
338-
if err = pullImage(pullCtx, d.client, cfg); err != nil {
345+
// Although it's called "runner dir", we also use it for shim task-related data.
346+
// Maybe we should rename it to "task dir" (including the `/root/.dstack/runners` dir on the host).
347+
pullLogPath := filepath.Join(runnerDir, "pull.log")
348+
if err = pullImage(pullCtx, d.client, cfg, pullLogPath); err != nil {
339349
errMessage := fmt.Sprintf("pullImage error: %s", err.Error())
340350
log.Error(ctx, errMessage)
341351
task.SetStatusTerminated(string(types.TerminationReasonCreatingContainerError), errMessage)
@@ -655,7 +665,7 @@ func mountDisk(ctx context.Context, deviceName, mountPoint string, fsRootPerms o
655665
return nil
656666
}
657667

658-
func pullImage(ctx context.Context, client docker.APIClient, taskConfig TaskConfig) error {
668+
func pullImage(ctx context.Context, client docker.APIClient, taskConfig TaskConfig, logPath string) error {
659669
if !strings.Contains(taskConfig.ImageName, ":") {
660670
taskConfig.ImageName += ":latest"
661671
}
@@ -685,51 +695,70 @@ func pullImage(ctx context.Context, client docker.APIClient, taskConfig TaskConf
685695
if err != nil {
686696
return tracerr.Wrap(err)
687697
}
688-
defer func() { _ = reader.Close() }()
698+
defer reader.Close()
699+
700+
logFile, err := os.OpenFile(logPath, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0o644)
701+
if err != nil {
702+
return tracerr.Wrap(err)
703+
}
704+
defer logFile.Close()
705+
706+
teeReader := io.TeeReader(reader, logFile)
689707

690708
current := make(map[string]uint)
691709
total := make(map[string]uint)
692710

693-
type ProgressDetail struct {
694-
Current uint `json:"current"`
695-
Total uint `json:"total"`
696-
}
697-
type Progress struct {
698-
Id string `json:"id"`
699-
Status string `json:"status"`
700-
ProgressDetail ProgressDetail `json:"progressDetail"` //nolint:tagliatelle
701-
Error string `json:"error"`
711+
// dockerd reports pulling progress as a stream of JSON Lines. The format of records is not documented in the API documentation,
712+
// although it's occasionally mentioned, e.g., https://docs.docker.com/reference/api/engine/version-history/#v148-api-changes
713+
714+
// https://github.com/moby/moby/blob/e77ff99ede5ee5952b3a9227863552ae6e5b6fb1/pkg/jsonmessage/jsonmessage.go#L144
715+
// All fields are optional
716+
type PullMessage struct {
717+
Id string `json:"id"` // layer id
718+
Status string `json:"status"`
719+
ProgressDetail struct {
720+
Current uint `json:"current"` // bytes
721+
Total uint `json:"total"` // bytes
722+
} `json:"progressDetail"`
723+
ErrorDetail struct {
724+
Message string `json:"message"`
725+
} `json:"errorDetail"`
702726
}
703727

704-
var status bool
728+
var pullCompleted bool
705729
pullErrors := make([]string, 0)
706730

707-
scanner := bufio.NewScanner(reader)
731+
scanner := bufio.NewScanner(teeReader)
708732
for scanner.Scan() {
709733
line := scanner.Bytes()
710-
var progressRow Progress
711-
if err := json.Unmarshal(line, &progressRow); err != nil {
734+
var pullMessage PullMessage
735+
if err := json.Unmarshal(line, &pullMessage); err != nil {
712736
continue
713737
}
714-
if progressRow.Status == "Downloading" {
715-
current[progressRow.Id] = progressRow.ProgressDetail.Current
716-
total[progressRow.Id] = progressRow.ProgressDetail.Total
738+
if pullMessage.Status == "Downloading" {
739+
current[pullMessage.Id] = pullMessage.ProgressDetail.Current
740+
total[pullMessage.Id] = pullMessage.ProgressDetail.Total
717741
}
718-
if progressRow.Status == "Download complete" {
719-
current[progressRow.Id] = total[progressRow.Id]
742+
if pullMessage.Status == "Download complete" {
743+
current[pullMessage.Id] = total[pullMessage.Id]
720744
}
721-
if progressRow.Error != "" {
722-
log.Error(ctx, "error pulling image", "name", taskConfig.ImageName, "err", progressRow.Error)
723-
pullErrors = append(pullErrors, progressRow.Error)
745+
if pullMessage.ErrorDetail.Message != "" {
746+
log.Error(ctx, "error pulling image", "name", taskConfig.ImageName, "err", pullMessage.ErrorDetail.Message)
747+
pullErrors = append(pullErrors, pullMessage.ErrorDetail.Message)
724748
}
725-
if strings.HasPrefix(progressRow.Status, "Status:") {
726-
status = true
727-
log.Debug(ctx, progressRow.Status)
749+
// If the pull is successful, the last two entries must be:
750+
// "Digest: sha256:<hash>"
751+
// "Status: <message>"
752+
// where <message> is either "Downloaded newer image for <tag>" or "Image is up to date for <tag>".
753+
// See: https://github.com/moby/moby/blob/e77ff99ede5ee5952b3a9227863552ae6e5b6fb1/daemon/containerd/image_pull.go#L134-L152
754+
// See: https://github.com/moby/moby/blob/e77ff99ede5ee5952b3a9227863552ae6e5b6fb1/daemon/containerd/image_pull.go#L257-L263
755+
if strings.HasPrefix(pullMessage.Status, "Status:") {
756+
pullCompleted = true
757+
log.Debug(ctx, pullMessage.Status)
728758
}
729759
}
730760

731761
duration := time.Since(startTime)
732-
733762
var currentBytes uint
734763
var totalBytes uint
735764
for _, v := range current {
@@ -738,9 +767,13 @@ func pullImage(ctx context.Context, client docker.APIClient, taskConfig TaskConf
738767
for _, v := range total {
739768
totalBytes += v
740769
}
741-
742770
speed := bytesize.New(float64(currentBytes) / duration.Seconds())
743-
if status && currentBytes == totalBytes {
771+
772+
if err := ctx.Err(); err != nil {
773+
return tracerr.Errorf("image pull interrupted: downloaded %d bytes out of %d (%s/s): %w", currentBytes, totalBytes, speed, err)
774+
}
775+
776+
if pullCompleted {
744777
log.Debug(ctx, "image successfully pulled", "bytes", currentBytes, "bps", speed)
745778
} else {
746779
return tracerr.Errorf(
@@ -749,21 +782,11 @@ func pullImage(ctx context.Context, client docker.APIClient, taskConfig TaskConf
749782
)
750783
}
751784

752-
err = ctx.Err()
753-
if err != nil {
754-
return tracerr.Errorf("imagepull interrupted: downloaded %d bytes out of %d (%s/s): %w", currentBytes, totalBytes, speed, err)
755-
}
756785
return nil
757786
}
758787

759788
func (d *DockerRunner) createContainer(ctx context.Context, task *Task) error {
760-
runnerDir, err := d.dockerParams.MakeRunnerDir(task.containerName)
761-
if err != nil {
762-
return tracerr.Wrap(err)
763-
}
764-
task.runnerDir = runnerDir
765-
766-
mounts, err := d.dockerParams.DockerMounts(runnerDir)
789+
mounts, err := d.dockerParams.DockerMounts(task.runnerDir)
767790
if err != nil {
768791
return tracerr.Wrap(err)
769792
}

runner/internal/shim/docker_test.go

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,9 @@ func TestDocker_SSHServer(t *testing.T) {
2626
t.Parallel()
2727

2828
params := &dockerParametersMock{
29-
commands: []string{"echo 1"},
30-
sshPort: nextPort(),
29+
commands: []string{"echo 1"},
30+
sshPort: nextPort(),
31+
runnerDir: t.TempDir(),
3132
}
3233

3334
timeout := 180 // seconds
@@ -58,6 +59,7 @@ func TestDocker_SSHServerConnect(t *testing.T) {
5859
commands: []string{"sleep 5"},
5960
sshPort: nextPort(),
6061
publicSSHKey: string(publicBytes),
62+
runnerDir: t.TempDir(),
6163
}
6264

6365
timeout := 180 // seconds
@@ -103,7 +105,8 @@ func TestDocker_ShmNoexecByDefault(t *testing.T) {
103105
t.Parallel()
104106

105107
params := &dockerParametersMock{
106-
commands: []string{"mount | grep '/dev/shm .*size=65536k' | grep noexec"},
108+
commands: []string{"mount | grep '/dev/shm .*size=65536k' | grep noexec"},
109+
runnerDir: t.TempDir(),
107110
}
108111

109112
timeout := 180 // seconds
@@ -125,7 +128,8 @@ func TestDocker_ShmExecIfSizeSpecified(t *testing.T) {
125128
t.Parallel()
126129

127130
params := &dockerParametersMock{
128-
commands: []string{"mount | grep '/dev/shm .*size=1024k' | grep -v noexec"},
131+
commands: []string{"mount | grep '/dev/shm .*size=1024k' | grep -v noexec"},
132+
runnerDir: t.TempDir(),
129133
}
130134

131135
timeout := 180 // seconds
@@ -148,6 +152,7 @@ type dockerParametersMock struct {
148152
commands []string
149153
sshPort int
150154
publicSSHKey string
155+
runnerDir string
151156
}
152157

153158
func (c *dockerParametersMock) DockerPrivileged() bool {
@@ -184,7 +189,7 @@ func (c *dockerParametersMock) DockerMounts(string) ([]mount.Mount, error) {
184189
}
185190

186191
func (c *dockerParametersMock) MakeRunnerDir(string) (string, error) {
187-
return "", nil
192+
return c.runnerDir, nil
188193
}
189194

190195
/* Utilities */

0 commit comments

Comments
 (0)