Skip to content

Commit 8e9b4a5

Browse files
[Bug]: error gathering device information while adding custom device "/dev/tenstorrent/4": no such file or directory #2787
1 parent bdb9aab commit 8e9b4a5

File tree

10 files changed

+1233
-37
lines changed

10 files changed

+1233
-37
lines changed

runner/.justfile

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ build-runner-binary:
4545
#!/usr/bin/env bash
4646
set -e
4747
echo "Building runner for linux/amd64"
48-
cd {{source_directory()}}/cmd/runner && GOOS=linux GOARCH=amd64 go build
48+
cd {{source_directory()}}/cmd/runner && GOOS=linux GOARCH=amd64 go build -ldflags "-X 'main.Version=$version' -extldflags '-static'"
4949
echo "Runner build complete!"
5050

5151
# Build shim
@@ -56,12 +56,12 @@ build-shim-binary:
5656
cd {{source_directory()}}/cmd/shim
5757
if [ -n "$shim_os" ] && [ -n "$shim_arch" ]; then
5858
echo "Building shim for $shim_os/$shim_arch"
59-
GOOS=$shim_os GOARCH=$shim_arch go build
59+
GOOS=$shim_os GOARCH=$shim_arch go build -ldflags "-X 'main.Version=$version' -extldflags '-static'"
6060
else
6161
echo "Building shim for current platform"
62-
go build
62+
go build -ldflags "-X 'main.Version=$version' -extldflags '-static'"
6363
fi
64-
echo "Shim build complete!"
64+
echo "Shim build (version: $version) complete!"
6565

6666
# Build both runner and shim
6767
build-runner: build-runner-binary build-shim-binary

runner/internal/shim/host/gpu.go

Lines changed: 64 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,67 @@ type ttDeviceInfo struct {
188188

189189
type ttBoardInfo struct {
190190
BoardType string `json:"board_type"`
191-
BusID string `json:"bus_id"`
191+
BoardID string `json:"board_id"`
192+
}
193+
194+
func unmarshalTtSmiSnapshot(data []byte) (*ttSmiSnapshot, error) {
195+
var snapshot ttSmiSnapshot
196+
if err := json.Unmarshal(data, &snapshot); err != nil {
197+
return nil, err
198+
}
199+
return &snapshot, nil
200+
}
201+
202+
func getGpusFromTtSmiSnapshot(snapshot *ttSmiSnapshot) []GpuInfo {
203+
// Group devices by board_id to aggregate memory for the same physical GPU
204+
boardMap := make(map[string]*GpuInfo)
205+
indexCounter := 0
206+
207+
for _, device := range snapshot.DeviceInfo {
208+
boardID := device.BoardInfo.BoardID
209+
210+
// Extract board type without R/L suffix
211+
boardType := strings.TrimSpace(device.BoardInfo.BoardType)
212+
name := boardType
213+
214+
// Remove " R" or " L" suffix if present
215+
if strings.HasSuffix(boardType, " R") {
216+
name = boardType[:len(boardType)-2]
217+
} else if strings.HasSuffix(boardType, " L") {
218+
name = boardType[:len(boardType)-2]
219+
}
220+
221+
// Determine base VRAM based on board type
222+
baseVram := 0
223+
if strings.HasPrefix(name, "n150") {
224+
baseVram = 12 * 1024 // 12GB in MiB
225+
} else if strings.HasPrefix(name, "n300") {
226+
baseVram = 12 * 1024 // 12GB in MiB
227+
}
228+
229+
if existingGpu, exists := boardMap[boardID]; exists {
230+
// Aggregate VRAM for the same board_id
231+
existingGpu.Vram += baseVram
232+
} else {
233+
// Create new GPU entry
234+
boardMap[boardID] = &GpuInfo{
235+
Vendor: common.GpuVendorTenstorrent,
236+
Name: name,
237+
Vram: baseVram,
238+
ID: boardID,
239+
Index: strconv.Itoa(indexCounter),
240+
}
241+
indexCounter++
242+
}
243+
}
244+
245+
// Convert map to slice
246+
var gpus []GpuInfo
247+
for _, gpu := range boardMap {
248+
gpus = append(gpus, *gpu)
249+
}
250+
251+
return gpus
192252
}
193253

194254
func getTenstorrentGpuInfo(ctx context.Context) []GpuInfo {
@@ -218,43 +278,14 @@ func getTenstorrentGpuInfo(ctx context.Context) []GpuInfo {
218278
return gpus
219279
}
220280

221-
var ttSmiSnapshot ttSmiSnapshot
222-
if err := json.Unmarshal([]byte(res.Stdout), &ttSmiSnapshot); err != nil {
281+
ttSmiSnapshot, err := unmarshalTtSmiSnapshot([]byte(res.Stdout))
282+
if err != nil {
223283
log.Error(ctx, "cannot read tt-smi json", "err", err)
224284
log.Debug(ctx, "tt-smi output", "stdout", res.Stdout)
225285
return gpus
226286
}
227287

228-
for i, device := range ttSmiSnapshot.DeviceInfo {
229-
// Extract board type without R/L suffix
230-
boardType := strings.TrimSpace(device.BoardInfo.BoardType)
231-
name := boardType
232-
233-
// Remove " R" or " L" suffix if present
234-
if strings.HasSuffix(boardType, " R") {
235-
name = boardType[:len(boardType)-2]
236-
} else if strings.HasSuffix(boardType, " L") {
237-
name = boardType[:len(boardType)-2]
238-
}
239-
240-
// Determine VRAM based on board type
241-
vram := 0
242-
if strings.HasPrefix(name, "n150") {
243-
vram = 12 * 1024 // 12GB in MiB
244-
} else if strings.HasPrefix(name, "n300") {
245-
vram = 24 * 1024 // 24GB in MiB
246-
}
247-
248-
gpus = append(gpus, GpuInfo{
249-
Vendor: common.GpuVendorTenstorrent,
250-
Name: name,
251-
Vram: vram,
252-
ID: device.BoardInfo.BusID,
253-
Index: strconv.Itoa(i),
254-
})
255-
}
256-
257-
return gpus
288+
return getGpusFromTtSmiSnapshot(ttSmiSnapshot)
258289
}
259290

260291
func getAmdRenderNodePath(bdf string) (string, error) {
Lines changed: 239 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,239 @@
1+
package host
2+
3+
import (
4+
"os"
5+
"path/filepath"
6+
"reflect"
7+
"testing"
8+
9+
"github.com/dstackai/dstack/runner/internal/common"
10+
)
11+
12+
// loadTestData reads the fixture named filename from the "testdata"
// directory, resolved relative to the current working directory, and returns
// its raw bytes or the error from os.ReadFile.
func loadTestData(filename string) ([]byte, error) {
	return os.ReadFile(filepath.Join("testdata", filename))
}
16+
17+
func TestUnmarshalTtSmiSnapshot(t *testing.T) {
18+
tests := []struct {
19+
name string
20+
filename string
21+
want *ttSmiSnapshot
22+
wantErr bool
23+
}{
24+
{
25+
name: "valid single device",
26+
filename: "valid_single_device.json",
27+
want: &ttSmiSnapshot{
28+
DeviceInfo: []ttDeviceInfo{
29+
{
30+
BoardInfo: ttBoardInfo{
31+
BoardType: "n150 L",
32+
BoardID: "100018611902010",
33+
},
34+
},
35+
},
36+
},
37+
wantErr: false,
38+
},
39+
{
40+
name: "valid multiple devices",
41+
filename: "valid_multiple_devices.json",
42+
want: &ttSmiSnapshot{
43+
DeviceInfo: []ttDeviceInfo{
44+
{
45+
BoardInfo: ttBoardInfo{
46+
BoardType: "n300 L",
47+
BoardID: "10001451172208f",
48+
},
49+
},
50+
{
51+
BoardInfo: ttBoardInfo{
52+
BoardType: "n300 L",
53+
BoardID: "100014511722053",
54+
},
55+
},
56+
{
57+
BoardInfo: ttBoardInfo{
58+
BoardType: "n300 L",
59+
BoardID: "10001451172209c",
60+
},
61+
},
62+
{
63+
BoardInfo: ttBoardInfo{
64+
BoardType: "n300 L",
65+
BoardID: "100014511722058",
66+
},
67+
},
68+
{
69+
BoardInfo: ttBoardInfo{
70+
BoardType: "n300 R",
71+
BoardID: "10001451172208f",
72+
},
73+
},
74+
{
75+
BoardInfo: ttBoardInfo{
76+
BoardType: "n300 R",
77+
BoardID: "100014511722053",
78+
},
79+
},
80+
{
81+
BoardInfo: ttBoardInfo{
82+
BoardType: "n300 R",
83+
BoardID: "10001451172209c",
84+
},
85+
},
86+
{
87+
BoardInfo: ttBoardInfo{
88+
BoardType: "n300 R",
89+
BoardID: "100014511722058",
90+
},
91+
},
92+
},
93+
},
94+
wantErr: false,
95+
},
96+
{
97+
name: "empty device info",
98+
filename: "empty_device_info.json",
99+
want: &ttSmiSnapshot{
100+
DeviceInfo: []ttDeviceInfo{},
101+
},
102+
wantErr: false,
103+
},
104+
{
105+
name: "invalid JSON",
106+
filename: "invalid_json.json",
107+
want: nil,
108+
wantErr: true,
109+
},
110+
{
111+
name: "missing device_info field",
112+
filename: "missing_device_info.json",
113+
want: &ttSmiSnapshot{DeviceInfo: nil},
114+
wantErr: false,
115+
},
116+
{
117+
name: "empty JSON",
118+
filename: "empty_json.json",
119+
want: &ttSmiSnapshot{DeviceInfo: nil},
120+
wantErr: false,
121+
},
122+
}
123+
124+
for _, tt := range tests {
125+
t.Run(tt.name, func(t *testing.T) {
126+
data, err := loadTestData(tt.filename)
127+
if err != nil {
128+
t.Fatalf("Failed to load test data from %s: %v", tt.filename, err)
129+
}
130+
131+
got, err := unmarshalTtSmiSnapshot(data)
132+
if (err != nil) != tt.wantErr {
133+
t.Errorf("unmarshalTtSmiSnapshot() error = %v, wantErr %v", err, tt.wantErr)
134+
return
135+
}
136+
if !tt.wantErr {
137+
if got == nil {
138+
t.Errorf("unmarshalTtSmiSnapshot() returned nil, expected non-nil result")
139+
return
140+
}
141+
if len(got.DeviceInfo) != len(tt.want.DeviceInfo) {
142+
t.Errorf("unmarshalTtSmiSnapshot() device count = %v, want %v", len(got.DeviceInfo), len(tt.want.DeviceInfo))
143+
return
144+
}
145+
for i, device := range got.DeviceInfo {
146+
if i >= len(tt.want.DeviceInfo) {
147+
break
148+
}
149+
expected := tt.want.DeviceInfo[i]
150+
if device.BoardInfo.BoardType != expected.BoardInfo.BoardType {
151+
t.Errorf("unmarshalTtSmiSnapshot() device[%d].BoardInfo.BoardType = %v, want %v", i, device.BoardInfo.BoardType, expected.BoardInfo.BoardType)
152+
}
153+
if device.BoardInfo.BoardID != expected.BoardInfo.BoardID {
154+
t.Errorf("unmarshalTtSmiSnapshot() device[%d].BoardInfo.BoardID = %v, want %v", i, device.BoardInfo.BoardID, expected.BoardInfo.BoardID)
155+
}
156+
}
157+
}
158+
})
159+
}
160+
}
161+
162+
func TestGetGpusFromTtSmiSnapshot(t *testing.T) {
163+
data, err := loadTestData("single_n150_gpu.json")
164+
if err != nil {
165+
t.Fatalf("Failed to load test data: %v", err)
166+
}
167+
snapshot, err := unmarshalTtSmiSnapshot(data)
168+
if err != nil {
169+
t.Fatalf("Failed to unmarshal snapshot: %v", err)
170+
}
171+
172+
expectedGpus := []GpuInfo{
173+
{
174+
Vendor: common.GpuVendorTenstorrent,
175+
Name: "n150",
176+
Vram: 12 * 1024,
177+
ID: "100018611902010",
178+
Index: "0",
179+
},
180+
}
181+
182+
gpus := getGpusFromTtSmiSnapshot(snapshot)
183+
184+
if !reflect.DeepEqual(gpus, expectedGpus) {
185+
t.Errorf("getGpusFromTtSmiSnapshot() = %v, want %v", gpus, expectedGpus)
186+
}
187+
}
188+
189+
func TestGetGpusFromTtSmiSnapshotMultipleDevices(t *testing.T) {
190+
data, err := loadTestData("valid_multiple_devices.json")
191+
if err != nil {
192+
t.Fatalf("Failed to load test data: %v", err)
193+
}
194+
snapshot, err := unmarshalTtSmiSnapshot(data)
195+
if err != nil {
196+
t.Fatalf("Failed to unmarshal snapshot: %v", err)
197+
}
198+
199+
gpus := getGpusFromTtSmiSnapshot(snapshot)
200+
201+
// Verify we have 4 unique GPUs (grouped by board_id)
202+
if len(gpus) != 4 {
203+
t.Errorf("getGpusFromTtSmiSnapshot() returned %d GPUs, want 4", len(gpus))
204+
}
205+
206+
// Create a map to check the results by board_id
207+
gpusByID := make(map[string]GpuInfo)
208+
for _, gpu := range gpus {
209+
gpusByID[gpu.ID] = gpu
210+
}
211+
212+
// Verify specific GPUs and their aggregated VRAM
213+
expectedGpus := map[string]struct {
214+
name string
215+
vram int
216+
}{
217+
"10001451172208f": {"n300", 24 * 1024}, // 12GB (n300 L) + 12GB (n300 R) = 24GB
218+
"100014511722053": {"n300", 24 * 1024}, // 12GB (n300 L) + 12GB (n300 R) = 24GB
219+
"10001451172209c": {"n300", 24 * 1024}, // 12GB (n300 L) + 12GB (n300 R) = 24GB
220+
"100014511722058": {"n300", 24 * 1024}, // 12GB (n300 L) + 12GB (n300 R) = 24GB
221+
}
222+
223+
for boardID, expected := range expectedGpus {
224+
gpu, exists := gpusByID[boardID]
225+
if !exists {
226+
t.Errorf("Expected GPU with board_id %s not found", boardID)
227+
continue
228+
}
229+
if gpu.Name != expected.name {
230+
t.Errorf("GPU %s: name = %s, want %s", boardID, gpu.Name, expected.name)
231+
}
232+
if gpu.Vram != expected.vram {
233+
t.Errorf("GPU %s: VRAM = %d, want %d", boardID, gpu.Vram, expected.vram)
234+
}
235+
if gpu.Vendor != common.GpuVendorTenstorrent {
236+
t.Errorf("GPU %s: vendor = %v, want %v", boardID, gpu.Vendor, common.GpuVendorTenstorrent)
237+
}
238+
}
239+
}
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
{
2+
"time": "2025-06-20T12:10:28.926938",
3+
"host_info": {
4+
"OS": "Linux",
5+
"Distro": "Ubuntu 20.04.6 LTS",
6+
"Kernel": "5.15.0-138-generic",
7+
"Hostname": "empty-system",
8+
"Platform": "x86_64",
9+
"Python": "3.8.10",
10+
"Memory": "16.00 GB",
11+
"Driver": "TT-KMD 1.33"
12+
},
13+
"host_sw_vers": {
14+
"tt_smi": "3.0.15",
15+
"pyluwen": "0.7.2"
16+
},
17+
"device_info": []
18+
}

0 commit comments

Comments
 (0)