Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 6 additions & 7 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, macos-latest]
os: [ubuntu-latest]
steps:
- uses: actions/checkout@v4
- name: Set up Go
Expand Down Expand Up @@ -167,9 +167,9 @@ jobs:
strategy:
matrix:
include:
- { goos: "linux", goarch: "amd64" }
- { goos: "linux", goarch: "arm64" }
runs-on: ubuntu-latest
- { runs-on: "ubuntu-24.04", goos: "linux", goarch: "amd64" }
- { runs-on: "ubuntu-24.04-arm", goos: "linux", goarch: "arm64" }
runs-on: ${{ matrix.runs-on }}
steps:
- uses: actions/checkout@v4
- name: Set up Go
Expand All @@ -181,11 +181,10 @@ jobs:
env:
GOOS: ${{ matrix.goos }}
GOARCH: ${{ matrix.goarch }}
CGO_ENABLED: 0
run: |
VERSION=$((${{ github.run_number }} + ${{ env.BUILD_INCREMENT }}))
go build -ldflags "-X 'main.Version=$VERSION' -extldflags '-static'" -o dstack-runner-$GOOS-$GOARCH $REPO_NAME/runner/cmd/runner
go build -ldflags "-X 'main.Version=$VERSION' -extldflags '-static'" -o dstack-shim-$GOOS-$GOARCH $REPO_NAME/runner/cmd/shim
CGO_ENABLED=0 go build -ldflags "-X 'main.Version=$VERSION' -extldflags '-static'" -o dstack-runner-$GOOS-$GOARCH $REPO_NAME/runner/cmd/runner
CGO_ENABLED=1 go build -ldflags "-X 'main.Version=$VERSION'" -o dstack-shim-$GOOS-$GOARCH $REPO_NAME/runner/cmd/shim
echo $VERSION
- uses: actions/upload-artifact@v4
with:
Expand Down
4 changes: 4 additions & 0 deletions docs/docs/reference/environment-variables.md
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,10 @@ For more details on the options below, refer to the [server deployment](../guide
- `DSTACK_SERVER_BACKGROUND_PROCESSING_DISABLED`{ #DSTACK_SERVER_BACKGROUND_PROCESSING_DISABLED } - Disables background processing if set to any value. Useful to run only web frontend and API server.
- `DSTACK_SERVER_MAX_PROBES_PER_JOB`{ #DSTACK_SERVER_MAX_PROBES_PER_JOB } - Maximum number of probes allowed in a run configuration. Validated at apply time.
- `DSTACK_SERVER_MAX_PROBE_TIMEOUT`{ #DSTACK_SERVER_MAX_PROBE_TIMEOUT } - Maximum allowed timeout for a probe. Validated at apply time.
- `DSTACK_SERVER_METRICS_RUNNING_TTL_SECONDS`{ #DSTACK_SERVER_METRICS_RUNNING_TTL_SECONDS } - Maximum age of metrics samples for running jobs.
- `DSTACK_SERVER_METRICS_FINISHED_TTL_SECONDS`{ #DSTACK_SERVER_METRICS_FINISHED_TTL_SECONDS } - Maximum age of metrics samples for finished jobs.
- `DSTACK_SERVER_INSTANCE_HEALTH_TTL_SECONDS`{ #DSTACK_SERVER_INSTANCE_HEALTH_TTL_SECONDS } - Maximum age of instance health checks.
- `DSTACK_SERVER_INSTANCE_HEALTH_MIN_COLLECT_INTERVAL_SECONDS`{ #DSTACK_SERVER_INSTANCE_HEALTH_MIN_COLLECT_INTERVAL_SECONDS } - Minimum time interval between consecutive health checks of the same instance.

??? info "Internal environment variables"
The following environment variables are intended for development purposes:
Expand Down
31 changes: 28 additions & 3 deletions runner/cmd/shim/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,14 @@ func main() {
Destination: &args.DCGMExporter.Interval,
EnvVars: []string{"DSTACK_DCGM_EXPORTER_INTERVAL"},
},
/* DCGM Parameters */
&cli.StringFlag{
Name: "dcgm-address",
Usage: "nv-hostengine `hostname`, e.g., `localhost`",
DefaultText: "start libdcgm in embedded mode",
Destination: &args.DCGM.Address,
EnvVars: []string{"DSTACK_DCGM_ADDRESS"},
},
/* Docker Parameters */
&cli.BoolFlag{
Name: "privileged",
Expand Down Expand Up @@ -196,6 +204,7 @@ func start(ctx context.Context, args shim.CLIArgs, serviceMode bool) (err error)
}

var dcgmExporter *dcgm.DCGMExporter
var dcgmWrapper *dcgm.DCGMWrapper

if common.GetGpuVendor() == common.GpuVendorNvidia {
dcgmExporterPath, err := dcgm.GetDCGMExporterExecPath(ctx)
Expand All @@ -207,16 +216,32 @@ func start(ctx context.Context, args shim.CLIArgs, serviceMode bool) (err error)
if err == nil {
log.Info(ctx, "using DCGM Exporter")
defer func() {
_ = dcgmExporter.Stop(ctx)
if err := dcgmExporter.Stop(ctx); err != nil {
log.Error(ctx, "failed to stop DCGM Exporter", "err", err)
}
}()
} else {
log.Warning(ctx, "not using DCGM Exporter", "err", err)
dcgmExporter = nil
}

dcgmWrapper, err = dcgm.NewDCGMWrapper(args.DCGM.Address)
if err == nil {
log.Info(ctx, "using libdcgm")
defer func() {
if err := dcgmWrapper.Shutdown(); err != nil {
log.Error(ctx, "failed to shut down libdcgm", "err", err)
}
}()
if err := dcgmWrapper.EnableHealthChecks(); err != nil {
log.Error(ctx, "failed to enable libdcgm health checks", "err", err)
}
} else {
log.Warning(ctx, "not using libdcgm", "err", err)
}
}

address := fmt.Sprintf(":%d", args.Shim.HTTPPort)
shimServer := api.NewShimServer(ctx, address, dockerRunner, dcgmExporter, Version)
shimServer := api.NewShimServer(ctx, address, Version, dockerRunner, dcgmExporter, dcgmWrapper)

defer func() {
shutdownCtx, cancelShutdown := context.WithTimeout(ctx, 5*time.Second)
Expand Down
97 changes: 95 additions & 2 deletions runner/docs/shim.openapi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ openapi: 3.1.1

info:
title: dstack-shim API
version: v2/0.18.34
version: v2/0.19.22
x-logo:
url: https://avatars.githubusercontent.com/u/54146142?s=260
description: >
Expand Down Expand Up @@ -50,10 +50,25 @@ paths:
schema:
$ref: "#/components/schemas/HealthcheckResponse"

/instance/health:
get:
summary: Get instance health

description: (since [0.19.22](https://github.com/dstackai/dstack/releases/tag/0.19.22)) Returns the results of optional passive system health checks
tags: [Instance]
responses:
"200":
description: ""
content:
application/json:
schema:
$ref: "#/components/schemas/InstanceHealthResponse"

/tasks:
get:
summary: Get task list
description: Returns a list of all tasks known to shim, including terminated ones
tags: [Tasks]
responses:
"200":
description: ""
Expand All @@ -63,6 +78,7 @@ paths:
$ref: "#/components/schemas/TaskListResponse"
post:
summary: Submit and run new task
tags: [Tasks]
requestBody:
required: true
content:
Expand All @@ -86,6 +102,7 @@ paths:
/tasks/{id}:
get:
summary: Get task info
tags: [Tasks]
parameters:
- $ref: "#/parameters/taskId"
responses:
Expand All @@ -102,6 +119,7 @@ paths:
Stops the task, that is, cancels image pulling if in progress,
stops the container if running, and sets the status to `terminated`.
No-op if the task is already terminated
tags: [Tasks]
parameters:
- in: path
name: id
Expand Down Expand Up @@ -131,6 +149,7 @@ paths:
description: >
Removes the task from in-memory storage and destroys its associated
resources: a container, logs, etc.
tags: [Tasks]
parameters:
- $ref: "#/parameters/taskId"
responses:
Expand Down Expand Up @@ -270,7 +289,7 @@ components:
type: string
default: ""
description: Mount point inside container

GPUDevice:
title: shim.GPUDevice
type: object
Expand All @@ -284,6 +303,72 @@ components:
default: ""
description: Path inside container

DCGMHealth:
title: shim.dcgm.Health
type: object
properties:
overall_health:
type: integer
description: >
[dcgmHealthWatchResult_enum](https://docs.nvidia.com/datacenter/dcgm/latest/dcgm-api/dcgm-api-data-structs.html#_CPPv426dcgmHealthWatchResult_enum)
examples:
- 10
incidents:
type: array
items:
$ref: "#/components/schemas/DCGMHealthIncident"
required:
- overall_health
- incidents
additionalProperties: false

DCGMHealthIncident:
title: shim.dcgm.HealthIncident
type: object
properties:
system:
type: integer
description: >
[dcgmHealthSystems_enum](https://docs.nvidia.com/datacenter/dcgm/latest/dcgm-api/dcgm-api-data-structs.html#_CPPv422dcgmHealthSystems_enum)
examples:
- 1
health:
type: integer
description: >
[dcgmHealthWatchResult_enum](https://docs.nvidia.com/datacenter/dcgm/latest/dcgm-api/dcgm-api-data-structs.html#_CPPv426dcgmHealthWatchResult_enum)
examples:
- 10
error_message:
type: string
examples:
- >
Detected more than 16 PCIe replays per minute for GPU 0 : 99 Reconnect PCIe card.
Run system side PCIE diagnostic utilities to verify hops off the GPU board. If issue is on the board, run the field diagnostic.
error_code:
type: integer
description: >
[dcgmError_enum](https://github.com/NVIDIA/DCGM/blob/master/dcgmlib/dcgm_errors.h)
examples:
- 3
entity_group_id:
type: integer
description: >
[dcgm_field_entity_group_t](https://docs.nvidia.com/datacenter/dcgm/latest/dcgm-api/dcgm-api-field-entity.html#_CPPv425dcgm_field_entity_group_t)
examples:
- 1
entity_id:
type: integer
examples:
- 0
required:
- system
- health
- error_message
- error_code
- entity_group_id
- entity_id
additionalProperties: false

HealthcheckResponse:
title: shim.api.HealthcheckResponse
type: object
Expand All @@ -299,6 +384,14 @@ components:
- version
additionalProperties: false

InstanceHealthResponse:
title: shim.api.InstanceHealthResponse
type: object
properties:
dcgm:
$ref: "#/components/schemas/DCGMHealth"
additionalProperties: false

TaskListResponse:
title: shim.api.TaskListResponse
type: object
Expand Down
2 changes: 2 additions & 0 deletions runner/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ module github.com/dstackai/dstack/runner
go 1.23.8

require (
github.com/NVIDIA/go-dcgm v0.0.0-20250707210631-823394f2bd9b
github.com/alexellis/go-execute/v2 v2.2.1
github.com/bluekeyes/go-gitdiff v0.7.2
github.com/codeclysm/extract/v4 v4.0.0
Expand All @@ -29,6 +30,7 @@ require (
dario.cat/mergo v1.0.0 // indirect
github.com/Microsoft/go-winio v0.6.1 // indirect
github.com/ProtonMail/go-crypto v1.0.0 // indirect
github.com/bits-and-blooms/bitset v1.22.0 // indirect
github.com/cloudflare/circl v1.3.7 // indirect
github.com/containerd/log v0.1.0 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.4 // indirect
Expand Down
4 changes: 4 additions & 0 deletions runner/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03
github.com/Microsoft/go-winio v0.5.2/go.mod h1:WpS1mjBmmwHBEWmogvA2mj8546UReBk4v8QkMxJ6pZY=
github.com/Microsoft/go-winio v0.6.1 h1:9/kr64B9VUZrLm5YYwbGtUJnMgqWVOdUAXu6Migciow=
github.com/Microsoft/go-winio v0.6.1/go.mod h1:LRdKpFKfdobln8UmuiYcKPot9D2v6svN5+sAH+4kjUM=
github.com/NVIDIA/go-dcgm v0.0.0-20250707210631-823394f2bd9b h1:FL0NJYUNMX1ezl2Dv0azgedHPBXDuqHnqGDtqj6aqZM=
github.com/NVIDIA/go-dcgm v0.0.0-20250707210631-823394f2bd9b/go.mod h1:cA0Bv7+JtAd8sqCCZizhAQjj4+Z47x/d8KD60iYBT+g=
github.com/ProtonMail/go-crypto v1.0.0 h1:LRuvITjQWX+WIfr930YHG2HNfjR1uOfyf5vE0kC2U78=
github.com/ProtonMail/go-crypto v1.0.0/go.mod h1:EjAoLdwvbIOoOQr3ihjnSoLZRtE8azugULFRteWMNc0=
github.com/alexellis/go-execute/v2 v2.2.1 h1:4Ye3jiCKQarstODOEmqDSRCqxMHLkC92Bhse743RdOI=
Expand All @@ -17,6 +19,8 @@ github.com/arduino/go-paths-helper v1.12.1 h1:WkxiVUxBjKWlLMiMuYy8DcmVrkxdP7aKxQ
github.com/arduino/go-paths-helper v1.12.1/go.mod h1:jcpW4wr0u69GlXhTYydsdsqAjLaYK5n7oWHfKqOG6LM=
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio=
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs=
github.com/bits-and-blooms/bitset v1.22.0 h1:Tquv9S8+SGaS3EhyA+up3FXzmkhxPGjQQCkcs2uw7w4=
github.com/bits-and-blooms/bitset v1.22.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
github.com/bluekeyes/go-gitdiff v0.7.2 h1:42jrcVZdjjxXtVsFNYTo/I6T1ZvIiQL+iDDLiH904hw=
github.com/bluekeyes/go-gitdiff v0.7.2/go.mod h1:QpfYYO1E0fTVHVZAZKiRjtSGY9823iCdvGXBcEzHGbM=
github.com/bradfitz/gomemcache v0.0.0-20170208213004-1952afaa557d/go.mod h1:PmM6Mmwb0LSuEubjR8N7PtNe1KxZLtOUHtbeikc5h60=
Expand Down
14 changes: 14 additions & 0 deletions runner/internal/shim/api/handlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,20 @@ func (s *ShimServer) HealthcheckHandler(w http.ResponseWriter, r *http.Request)
}, nil
}

// InstanceHealthHandler builds the instance health report.
//
// The report starts out empty. When a DCGM wrapper is configured on the
// server, its health snapshot is attached to the report. A failure to obtain
// the snapshot is logged and otherwise ignored, so the handler itself never
// returns an error.
func (s *ShimServer) InstanceHealthHandler(w http.ResponseWriter, r *http.Request) (interface{}, error) {
	reqCtx := r.Context()
	report := &InstanceHealthResponse{}

	// No DCGM wrapper configured: nothing to add to the report.
	if s.dcgmWrapper == nil {
		return report, nil
	}

	health, err := s.dcgmWrapper.GetHealth()
	if err != nil {
		// Best effort: report what we have instead of failing the request.
		log.Error(reqCtx, "failed to get health from DCGM", "err", err)
		return report, nil
	}

	report.DCGM = &health
	return report, nil
}

// TaskListHandler returns the IDs of the tasks reported by the server's
// task runner, wrapped in a TaskListResponse. It never returns an error.
func (s *ShimServer) TaskListHandler(w http.ResponseWriter, r *http.Request) (interface{}, error) {
	taskIDs := s.runner.TaskIDs()
	listing := TaskListResponse{IDs: taskIDs}
	return &listing, nil
}
Expand Down
4 changes: 2 additions & 2 deletions runner/internal/shim/api/handlers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ func TestHealthcheck(t *testing.T) {
request := httptest.NewRequest("GET", "/api/healthcheck", nil)
responseRecorder := httptest.NewRecorder()

server := NewShimServer(context.Background(), ":12345", NewDummyRunner(), nil, "0.0.1.dev2")
server := NewShimServer(context.Background(), ":12345", "0.0.1.dev2", NewDummyRunner(), nil, nil)

f := common.JSONResponseHandler(server.HealthcheckHandler)
f(responseRecorder, request)
Expand All @@ -30,7 +30,7 @@ func TestHealthcheck(t *testing.T) {
}

func TestTaskSubmit(t *testing.T) {
server := NewShimServer(context.Background(), ":12340", NewDummyRunner(), nil, "0.0.1.dev2")
server := NewShimServer(context.Background(), ":12340", "0.0.1.dev2", NewDummyRunner(), nil, nil)
requestBody := `{
"id": "dummy-id",
"name": "dummy-name",
Expand Down
9 changes: 8 additions & 1 deletion runner/internal/shim/api/schemas.go
Original file line number Diff line number Diff line change
@@ -1,12 +1,19 @@
package api

import "github.com/dstackai/dstack/runner/internal/shim"
import (
"github.com/dstackai/dstack/runner/internal/shim"
"github.com/dstackai/dstack/runner/internal/shim/dcgm"
)

// HealthcheckResponse is the JSON payload describing the running service.
// It is serialized as an object with the keys "service" and "version".
type HealthcheckResponse struct {
Service string `json:"service"`
Version string `json:"version"`
}

// InstanceHealthResponse is the JSON payload of the instance health endpoint.
//
// Every check is optional. DCGM is nil when no DCGM health snapshot is
// available; in that case the "dcgm" key is omitted from the output instead
// of being emitted as null, because the OpenAPI schema declares "dcgm" as an
// optional, non-nullable object.
type InstanceHealthResponse struct {
	DCGM *dcgm.Health `json:"dcgm,omitempty"`
}

// TaskListResponse is the JSON payload listing task IDs.
// It is serialized as an object with a single "ids" key.
type TaskListResponse struct {
IDs []string `json:"ids"`
}
Expand Down
8 changes: 7 additions & 1 deletion runner/internal/shim/api/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,15 @@ type ShimServer struct {
runner TaskRunner

dcgmExporter *dcgm.DCGMExporter
dcgmWrapper *dcgm.DCGMWrapper

version string
}

func NewShimServer(ctx context.Context, address string, runner TaskRunner, dcgmExporter *dcgm.DCGMExporter, version string) *ShimServer {
func NewShimServer(
ctx context.Context, address string, version string,
runner TaskRunner, dcgmExporter *dcgm.DCGMExporter, dcgmWrapper *dcgm.DCGMWrapper,
) *ShimServer {
r := api.NewRouter()
s := &ShimServer{
HttpServer: &http.Server{
Expand All @@ -45,12 +49,14 @@ func NewShimServer(ctx context.Context, address string, runner TaskRunner, dcgmE
runner: runner,

dcgmExporter: dcgmExporter,
dcgmWrapper: dcgmWrapper,

version: version,
}

// The healthcheck endpoint should stay backward compatible, as it is used for negotiation
r.AddHandler("GET", "/api/healthcheck", s.HealthcheckHandler)
r.AddHandler("GET", "/api/instance/health", s.InstanceHealthHandler)
r.AddHandler("GET", "/api/tasks", s.TaskListHandler)
r.AddHandler("GET", "/api/tasks/{id}", s.TaskInfoHandler)
r.AddHandler("POST", "/api/tasks", s.TaskSubmitHandler)
Expand Down
Loading
Loading