diff --git a/runner/.justfile b/runner/.justfile index ac3df4ba9d..cde1ddb717 100644 --- a/runner/.justfile +++ b/runner/.justfile @@ -45,7 +45,7 @@ build-runner-binary: #!/usr/bin/env bash set -e echo "Building runner for linux/amd64" - cd {{source_directory()}}/cmd/runner && GOOS=linux GOARCH=amd64 go build + cd {{source_directory()}}/cmd/runner && GOOS=linux GOARCH=amd64 go build -ldflags "-X 'main.Version=$version' -extldflags '-static'" echo "Runner build complete!" # Build shim @@ -56,12 +56,12 @@ build-shim-binary: cd {{source_directory()}}/cmd/shim if [ -n "$shim_os" ] && [ -n "$shim_arch" ]; then echo "Building shim for $shim_os/$shim_arch" - GOOS=$shim_os GOARCH=$shim_arch go build + GOOS=$shim_os GOARCH=$shim_arch go build -ldflags "-X 'main.Version=$version' -extldflags '-static'" else echo "Building shim for current platform" - go build + go build -ldflags "-X 'main.Version=$version' -extldflags '-static'" fi - echo "Shim build complete!" + echo "Shim build (version: $version) complete!" # Build both runner and shim build-runner: build-runner-binary build-shim-binary diff --git a/runner/internal/shim/host/gpu.go b/runner/internal/shim/host/gpu.go index 1dd4e07005..eb2f53122b 100644 --- a/runner/internal/shim/host/gpu.go +++ b/runner/internal/shim/host/gpu.go @@ -188,7 +188,67 @@ type ttDeviceInfo struct { type ttBoardInfo struct { BoardType string `json:"board_type"` - BusID string `json:"bus_id"` + BoardID string `json:"board_id"` +} + +func unmarshalTtSmiSnapshot(data []byte) (*ttSmiSnapshot, error) { + var snapshot ttSmiSnapshot + if err := json.Unmarshal(data, &snapshot); err != nil { + return nil, err + } + return &snapshot, nil +} + +func getGpusFromTtSmiSnapshot(snapshot *ttSmiSnapshot) []GpuInfo { + // Group devices by board_id to aggregate memory for the same physical GPU + boardMap := make(map[string]*GpuInfo) + indexCounter := 0 + + for _, device := range snapshot.DeviceInfo { + boardID := device.BoardInfo.BoardID + + // Extract board type without R/L suffix + boardType := strings.TrimSpace(device.BoardInfo.BoardType) + name := boardType + + // Remove " R" or " L" suffix if present + if strings.HasSuffix(boardType, " R") { + name = boardType[:len(boardType)-2] + } else if strings.HasSuffix(boardType, " L") { + name = boardType[:len(boardType)-2] + } + + // Determine base VRAM based on board type + baseVram := 0 + if strings.HasPrefix(name, "n150") { + baseVram = 12 * 1024 // 12GB in MiB + } else if strings.HasPrefix(name, "n300") { + baseVram = 12 * 1024 // 12GB in MiB + } + + if existingGpu, exists := boardMap[boardID]; exists { + // Aggregate VRAM for the same board_id + existingGpu.Vram += baseVram + } else { + // Create new GPU entry + boardMap[boardID] = &GpuInfo{ + Vendor: common.GpuVendorTenstorrent, + Name: name, + Vram: baseVram, + ID: boardID, + Index: strconv.Itoa(indexCounter), + } + indexCounter++ + } + } + + // Convert map to slice + var gpus []GpuInfo + for _, gpu := range boardMap { + gpus = append(gpus, *gpu) + } + + return gpus } func getTenstorrentGpuInfo(ctx context.Context) []GpuInfo { @@ -218,43 +278,14 @@ func getTenstorrentGpuInfo(ctx context.Context) []GpuInfo { return gpus } - var ttSmiSnapshot ttSmiSnapshot - if err := json.Unmarshal([]byte(res.Stdout), &ttSmiSnapshot); err != nil { + ttSmiSnapshot, err := unmarshalTtSmiSnapshot([]byte(res.Stdout)) + if err != nil { log.Error(ctx, "cannot read tt-smi json", "err", err) log.Debug(ctx, "tt-smi output", "stdout", res.Stdout) return gpus } - for i, device := range ttSmiSnapshot.DeviceInfo { - // Extract board type without R/L suffix - boardType := strings.TrimSpace(device.BoardInfo.BoardType) - name := boardType - - // Remove " R" or " L" suffix if present - if strings.HasSuffix(boardType, " R") { - name = boardType[:len(boardType)-2] - } else if strings.HasSuffix(boardType, " L") { - name = boardType[:len(boardType)-2] - } - - // Determine VRAM based on board type - vram := 0 - if strings.HasPrefix(name, "n150") { - vram = 12 * 1024 // 12GB in MiB - } else if strings.HasPrefix(name, "n300") { - vram = 24 * 1024 // 24GB in MiB - } - - gpus = append(gpus, GpuInfo{ - Vendor: common.GpuVendorTenstorrent, - Name: name, - Vram: vram, - ID: device.BoardInfo.BusID, - Index: strconv.Itoa(i), - }) - } - - return gpus + return getGpusFromTtSmiSnapshot(ttSmiSnapshot) } func getAmdRenderNodePath(bdf string) (string, error) { diff --git a/runner/internal/shim/host/gpu_test.go b/runner/internal/shim/host/gpu_test.go new file mode 100644 index 0000000000..4ba60f9d53 --- /dev/null +++ b/runner/internal/shim/host/gpu_test.go @@ -0,0 +1,239 @@ +package host + +import ( + "os" + "path/filepath" + "reflect" + "testing" + + "github.com/dstackai/dstack/runner/internal/common" +) + +func loadTestData(filename string) ([]byte, error) { + path := filepath.Join("testdata", filename) + return os.ReadFile(path) +} + +func TestUnmarshalTtSmiSnapshot(t *testing.T) { + tests := []struct { + name string + filename string + want *ttSmiSnapshot + wantErr bool + }{ + { + name: "valid single device", + filename: "tenstorrent/valid_single_device.json", + want: &ttSmiSnapshot{ + DeviceInfo: []ttDeviceInfo{ + { + BoardInfo: ttBoardInfo{ + BoardType: "n150 L", + BoardID: "100018611902010", + }, + }, + }, + }, + wantErr: false, + }, + { + name: "valid multiple devices", + filename: "tenstorrent/valid_multiple_devices.json", + want: &ttSmiSnapshot{ + DeviceInfo: []ttDeviceInfo{ + { + BoardInfo: ttBoardInfo{ + BoardType: "n300 L", + BoardID: "10001451172208f", + }, + }, + { + BoardInfo: ttBoardInfo{ + BoardType: "n300 L", + BoardID: "100014511722053", + }, + }, + { + BoardInfo: ttBoardInfo{ + BoardType: "n300 L", + BoardID: "10001451172209c", + }, + }, + { + BoardInfo: ttBoardInfo{ + BoardType: "n300 L", + BoardID: "100014511722058", + }, + }, + { + BoardInfo: ttBoardInfo{ + BoardType: "n300 R", + BoardID: "10001451172208f", + }, + }, + { + BoardInfo: ttBoardInfo{ + BoardType: "n300 R", + BoardID: "100014511722053", + }, + }, + { + BoardInfo: ttBoardInfo{ + BoardType: "n300 R", + BoardID: "10001451172209c", + }, + }, + { + BoardInfo: ttBoardInfo{ + BoardType: "n300 R", + BoardID: "100014511722058", + }, + }, + }, + }, + wantErr: false, + }, + { + name: "empty device info", + filename: "tenstorrent/empty_device_info.json", + want: &ttSmiSnapshot{ + DeviceInfo: []ttDeviceInfo{}, + }, + wantErr: false, + }, + { + name: "invalid JSON", + filename: "tenstorrent/invalid_json.json", + want: nil, + wantErr: true, + }, + { + name: "missing device_info field", + filename: "tenstorrent/missing_device_info.json", + want: &ttSmiSnapshot{DeviceInfo: nil}, + wantErr: false, + }, + { + name: "empty JSON", + filename: "tenstorrent/empty_json.json", + want: &ttSmiSnapshot{DeviceInfo: nil}, + wantErr: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + data, err := loadTestData(tt.filename) + if err != nil { + t.Fatalf("Failed to load test data from %s: %v", tt.filename, err) + } + + got, err := unmarshalTtSmiSnapshot(data) + if (err != nil) != tt.wantErr { + t.Errorf("unmarshalTtSmiSnapshot() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !tt.wantErr { + if got == nil { + t.Errorf("unmarshalTtSmiSnapshot() returned nil, expected non-nil result") + return + } + if len(got.DeviceInfo) != len(tt.want.DeviceInfo) { + t.Errorf("unmarshalTtSmiSnapshot() device count = %v, want %v", len(got.DeviceInfo), len(tt.want.DeviceInfo)) + return + } + for i, device := range got.DeviceInfo { + if i >= len(tt.want.DeviceInfo) { + break + } + expected := tt.want.DeviceInfo[i] + if device.BoardInfo.BoardType != expected.BoardInfo.BoardType { + t.Errorf("unmarshalTtSmiSnapshot() device[%d].BoardInfo.BoardType = %v, want %v", i, device.BoardInfo.BoardType, expected.BoardInfo.BoardType) + } + if device.BoardInfo.BoardID != expected.BoardInfo.BoardID { + t.Errorf("unmarshalTtSmiSnapshot() device[%d].BoardInfo.BoardID = %v, want %v", i, device.BoardInfo.BoardID, expected.BoardInfo.BoardID) + } + } + } + }) + } +} + +func TestGetGpusFromTtSmiSnapshot(t *testing.T) { + data, err := loadTestData("tenstorrent/single_n150_gpu.json") + if err != nil { + t.Fatalf("Failed to load test data: %v", err) + } + snapshot, err := unmarshalTtSmiSnapshot(data) + if err != nil { + t.Fatalf("Failed to unmarshal snapshot: %v", err) + } + + expectedGpus := []GpuInfo{ + { + Vendor: common.GpuVendorTenstorrent, + Name: "n150", + Vram: 12 * 1024, + ID: "100018611902010", + Index: "0", + }, + } + + gpus := getGpusFromTtSmiSnapshot(snapshot) + + if !reflect.DeepEqual(gpus, expectedGpus) { + t.Errorf("getGpusFromTtSmiSnapshot() = %v, want %v", gpus, expectedGpus) + } +} + +func TestGetGpusFromTtSmiSnapshotMultipleDevices(t *testing.T) { + data, err := loadTestData("tenstorrent/valid_multiple_devices.json") + if err != nil { + t.Fatalf("Failed to load test data: %v", err) + } + snapshot, err := unmarshalTtSmiSnapshot(data) + if err != nil { + t.Fatalf("Failed to unmarshal snapshot: %v", err) + } + + gpus := getGpusFromTtSmiSnapshot(snapshot) + + // Verify we have 4 unique GPUs (grouped by board_id) + if len(gpus) != 4 { + t.Errorf("getGpusFromTtSmiSnapshot() returned %d GPUs, want 4", len(gpus)) + } + + // Create a map to check the results by board_id + gpusByID := make(map[string]GpuInfo) + for _, gpu := range gpus { + gpusByID[gpu.ID] = gpu + } + + // Verify specific GPUs and their aggregated VRAM + expectedGpus := map[string]struct { + name string + vram int + }{ + "10001451172208f": {"n300", 24 * 1024}, // 12GB (n300 L) + 12GB (n300 R) = 24GB + "100014511722053": {"n300", 24 * 1024}, // 12GB (n300 L) + 12GB (n300 R) = 24GB + "10001451172209c": {"n300", 24 * 1024}, // 12GB (n300 L) + 12GB (n300 R) = 24GB + "100014511722058": {"n300", 24 * 1024}, // 12GB (n300 L) + 12GB (n300 R) = 24GB + } + + for boardID, expected := range expectedGpus { + gpu, exists := gpusByID[boardID] + if !exists { + t.Errorf("Expected GPU with board_id %s not found", boardID) + continue + } + if gpu.Name != expected.name { + t.Errorf("GPU %s: name = %s, want %s", boardID, gpu.Name, expected.name) + } + if gpu.Vram != expected.vram { + t.Errorf("GPU %s: VRAM = %d, want %d", boardID, gpu.Vram, expected.vram) + } + if gpu.Vendor != common.GpuVendorTenstorrent { + t.Errorf("GPU %s: vendor = %v, want %v", boardID, gpu.Vendor, common.GpuVendorTenstorrent) + } + } +} diff --git a/runner/internal/shim/host/testdata/tenstorrent/empty_device_info.json b/runner/internal/shim/host/testdata/tenstorrent/empty_device_info.json new file mode 100644 index 0000000000..6aab9062ab --- /dev/null +++ b/runner/internal/shim/host/testdata/tenstorrent/empty_device_info.json @@ -0,0 +1,18 @@ +{ + "time": "2025-06-20T12:10:28.926938", + "host_info": { + "OS": "Linux", + "Distro": "Ubuntu 20.04.6 LTS", + "Kernel": "5.15.0-138-generic", + "Hostname": "empty-system", + "Platform": "x86_64", + "Python": "3.8.10", + "Memory": "16.00 GB", + "Driver": "TT-KMD 1.33" + }, + "host_sw_vers": { + "tt_smi": "3.0.15", + "pyluwen": "0.7.2" + }, + "device_info": [] +} diff --git a/runner/internal/shim/host/testdata/tenstorrent/empty_json.json b/runner/internal/shim/host/testdata/tenstorrent/empty_json.json new file mode 100644 index 0000000000..451cbbb316 --- /dev/null +++ b/runner/internal/shim/host/testdata/tenstorrent/empty_json.json @@ -0,0 +1 @@ +{} diff --git a/runner/internal/shim/host/testdata/tenstorrent/invalid_json.json b/runner/internal/shim/host/testdata/tenstorrent/invalid_json.json new file mode 100644 index 0000000000..3cd2b46b13 --- /dev/null +++ b/runner/internal/shim/host/testdata/tenstorrent/invalid_json.json @@ -0,0 +1 @@ +{"device_info": [{"board_info": {"board_type": "n150"}} diff --git a/runner/internal/shim/host/testdata/tenstorrent/missing_device_info.json b/runner/internal/shim/host/testdata/tenstorrent/missing_device_info.json new file mode 100644 index 0000000000..97563f9ccb --- /dev/null +++ b/runner/internal/shim/host/testdata/tenstorrent/missing_device_info.json @@ -0,0 +1,18 @@ +{ + "time": "2025-06-20T12:10:28.926938", + "host_info": { + "OS": "Linux", + "Distro": "Ubuntu 20.04.6 LTS", + "Kernel": "5.15.0-138-generic", + "Hostname": "incomplete-system", + "Platform": "x86_64", + "Python": "3.8.10", + "Memory": "16.00 GB", + "Driver": "TT-KMD 1.33" + }, + "host_sw_vers": { + "tt_smi": "3.0.15", + "pyluwen": "0.7.2" + }, + "other_field": "value" +} diff --git a/runner/internal/shim/host/testdata/tenstorrent/single_n150_gpu.json b/runner/internal/shim/host/testdata/tenstorrent/single_n150_gpu.json new file mode 100644 index 0000000000..5b3ca7371d --- /dev/null +++ b/runner/internal/shim/host/testdata/tenstorrent/single_n150_gpu.json @@ -0,0 +1,110 @@ +{ + "time": "2025-06-20T12:10:28.926938", + "host_info": { + "OS": "Linux", + "Distro": "Ubuntu 20.04.6 LTS", + "Kernel": "5.15.0-138-generic", + "Hostname": "7330093c7194", + "Platform": "x86_64", + "Python": "3.8.10", + "Memory": "30.46 GB", + "Driver": "TT-KMD 1.33" + }, + "host_sw_vers": { + "tt_smi": "3.0.15", + "pyluwen": "0.7.2" + }, + "device_info": [ + { + "smbus_telem": { + "BOARD_ID": "0x100018611902010", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2df07", + "ASIC_IDD": "0xbf1", + "BOARD_ID_HIGH": "0x1000186", + "BOARD_ID_LOW": "0x11902010", + "ARC0_FW_VERSION": "0x2200000", + "ARC1_FW_VERSION": "0x2200000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2200000", + "SPIBOOTROM_FW_VERSION": "0x30c0000", + "ETH_FW_VERSION": "0x6e000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x50c0000", + "DDR_SPEED": null, + "DDR_STATUS": "0x2222222", + "ETH_STATUS0": "0x11111111", + "ETH_STATUS1": "0x11111111", + "PCIE_STATUS": "0x10440000", + "FAULTS": null, + "ARC0_HEALTH": "0x6b1943", + "ARC1_HEALTH": "0x275466", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x42e0", + "FAN_SPEED": "0x4cbffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31b", + "ASIC_TEMPERATURE": "0x3690365", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x323433", + "TDP": "0x64000c", + "TDC": "0xf00010", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x54010d33", + "ASIC_TMON0": "0x37324235", + "ASIC_TMON1": "0x382f", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0x56140b2a", + "RT_SECONDS": "0x6ca", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": "0xccddddcc", + "ETH_DEBUG_STATUS1": "0xccdddddd", + "TT_FLASH_VERSION": "0x30200", + "FW_BUNDLE_VERSION": "0x50110000" + }, + "board_info": { + "bus_id": "0000:01:00.0", + "board_type": "n150 L", + "board_id": "100018611902010", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "12G", + "pcie_speed": 4, + "pcie_width": "4" + }, + "telemetry": { + "voltage": "0.80", + "current": " 16.0", + "power": " 12.0", + "aiclk": " 500", + "asic_temperature": "54.3" + }, + "firmwares": { + "fw_bundle_version": "80.17.0.0", + "tt_flash_version": "0.3.2.0", + "cm_fw": "2.32.0.0", + "cm_fw_date": "2025-04-01", + "eth_fw": "6.14.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.12.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "100", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + } + ] +} diff --git a/runner/internal/shim/host/testdata/tenstorrent/valid_multiple_devices.json b/runner/internal/shim/host/testdata/tenstorrent/valid_multiple_devices.json new file mode 100644 index 0000000000..4fd62a0b38 --- /dev/null +++ b/runner/internal/shim/host/testdata/tenstorrent/valid_multiple_devices.json @@ -0,0 +1,732 @@ +{ + "time": "2025-06-11T03:37:23.927792", + "host_info": { + "OS": "Linux", + "Distro": "Ubuntu 22.04.5 LTS", + "Kernel": "5.15.0-141-generic", + "Hostname": "TT-QuietBox", + "Platform": "x86_64", + "Python": "3.10.12", + "Memory": "503.45 GB", + "Driver": "TT-KMD 1.34" + }, + "host_sw_vers": { + "tt_smi": "3.0.20", + "pyluwen": "0.7.2" + }, + "device_info": [ + { + "smbus_telem": { + "BOARD_ID": "0x10001451172208f", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2df07", + "ASIC_IDD": "0xbf1", + "BOARD_ID_HIGH": "0x1000145", + "BOARD_ID_LOW": "0x1172208f", + "ARC0_FW_VERSION": "0x2200000", + "ARC1_FW_VERSION": "0x2200000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2200000", + "SPIBOOTROM_FW_VERSION": "0x30c0000", + "ETH_FW_VERSION": "0x6e000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x50c0000", + "DDR_SPEED": null, + "DDR_STATUS": "0x2222222", + "ETH_STATUS0": "0x11111111", + "ETH_STATUS1": "0x11111111", + "PCIE_STATUS": "0x10440000", + "FAULTS": null, + "ARC0_HEALTH": "0x6b1943", + "ARC1_HEALTH": "0x275466", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x42e0", + "FAN_SPEED": "0x4cbffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31b", + "ASIC_TEMPERATURE": "0x3690365", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x323433", + "TDP": "0x64000c", + "TDC": "0xf00010", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x54010d33", + "ASIC_TMON0": "0x37324235", + "ASIC_TMON1": "0x382f", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0x56140b2a", + "RT_SECONDS": "0x6ca", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": "0xccddddcc", + "ETH_DEBUG_STATUS1": "0xccdddddd", + "TT_FLASH_VERSION": "0x30200", + "FW_BUNDLE_VERSION": "0x50110000" + }, + "board_info": { + "bus_id": "0000:c1:00.0", + "board_type": "n300 L", + "board_id": "10001451172208f", + "coords": "(1, 0, 0, 0)", + "dram_status": true, + "dram_speed": "12G", + "pcie_speed": 4, + "pcie_width": "16" + }, + "telemetry": { + "voltage": "0.80", + "current": " 16.0", + "power": " 12.0", + "aiclk": " 500", + "asic_temperature": "54.3" + }, + "firmwares": { + "fw_bundle_version": "80.17.0.0", + "tt_flash_version": "0.3.2.0", + "cm_fw": "2.32.0.0", + "cm_fw_date": "2025-04-01", + "eth_fw": "6.14.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.12.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "100", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100014511722053", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2df07", + "ASIC_IDD": "0xbf1", + "BOARD_ID_HIGH": "0x1000145", + "BOARD_ID_LOW": "0x11722053", + "ARC0_FW_VERSION": "0x2200000", + "ARC1_FW_VERSION": "0x2200000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2200000", + "SPIBOOTROM_FW_VERSION": "0x30c0000", + "ETH_FW_VERSION": "0x6e000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x50c0000", + "DDR_SPEED": null, + "DDR_STATUS": "0x2222222", + "ETH_STATUS0": "0x11111111", + "ETH_STATUS1": "0x11111111", + "PCIE_STATUS": "0x10440000", + "FAULTS": null, + "ARC0_HEALTH": "0x6b1943", + "ARC1_HEALTH": "0x275466", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x42e0", + "FAN_SPEED": "0x4cbffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31b", + "ASIC_TEMPERATURE": "0x3690365", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x323433", + "TDP": "0x64000c", + "TDC": "0xf00010", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x54010d33", + "ASIC_TMON0": "0x37324235", + "ASIC_TMON1": "0x382f", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0x56140b2a", + "RT_SECONDS": "0x6ca", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": "0xccddddcc", + "ETH_DEBUG_STATUS1": "0xccdddddd", + "TT_FLASH_VERSION": "0x30200", + "FW_BUNDLE_VERSION": "0x50110000" + }, + "board_info": { + "bus_id": "0000:01:00.0", + "board_type": "n300 L", + "board_id": "100014511722053", + "coords": "(1, 1, 0, 0)", + "dram_status": true, + "dram_speed": "12G", + "pcie_speed": 4, + "pcie_width": "16" + }, + "telemetry": { + "temperature": 48.7, + "power_consumption": 15.2 + }, + "firmwares": { + "version": "1.2.5" + }, + "limits": { + "max_temp": 85.0, + "max_power": 25.0 + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x10001451172209c", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2df07", + "ASIC_IDD": "0xbf1", + "BOARD_ID_HIGH": "0x1000145", + "BOARD_ID_LOW": "0x1172209c", + "ARC0_FW_VERSION": "0x2200000", + "ARC1_FW_VERSION": "0x2200000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2200000", + "SPIBOOTROM_FW_VERSION": "0x30c0000", + "ETH_FW_VERSION": "0x6e000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x50c0000", + "DDR_SPEED": null, + "DDR_STATUS": "0x2222222", + "ETH_STATUS0": "0x11111111", + "ETH_STATUS1": "0x11111111", + "PCIE_STATUS": "0x10440000", + "FAULTS": null, + "ARC0_HEALTH": "0x6b1943", + "ARC1_HEALTH": "0x275466", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x42e0", + "FAN_SPEED": "0x4cbffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31b", + "ASIC_TEMPERATURE": "0x3690365", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x323433", + "TDP": "0x64000c", + "TDC": "0xf00010", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x54010d33", + "ASIC_TMON0": "0x37324235", + "ASIC_TMON1": "0x382f", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0x56140b2a", + "RT_SECONDS": "0x6ca", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": "0xccddddcc", + "ETH_DEBUG_STATUS1": "0xccdddddd", + "TT_FLASH_VERSION": "0x30200", + "FW_BUNDLE_VERSION": "0x50110000" + }, + "board_info": { + "bus_id": "0000:02:00.0", + "board_type": "n300 L", + "board_id": "10001451172209c", + "coords": "(2, 1, 0, 0)", + "dram_status": true, + "dram_speed": "12G", + "pcie_speed": 4, + "pcie_width": "16" + }, + "telemetry": { + "voltage": "0.80", + "current": " 16.0", + "power": " 12.0", + "aiclk": " 500", + "asic_temperature": "54.3" + }, + "firmwares": { + "fw_bundle_version": "80.17.0.0", + "tt_flash_version": "0.3.2.0", + "cm_fw": "2.32.0.0", + "cm_fw_date": "2025-04-01", + "eth_fw": "6.14.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.12.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "100", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100014511722058", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2df07", + "ASIC_IDD": "0xbf1", + "BOARD_ID_HIGH": "0x1000145", + "BOARD_ID_LOW": "0x11722058", + "ARC0_FW_VERSION": "0x2200000", + "ARC1_FW_VERSION": "0x2200000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2200000", + "SPIBOOTROM_FW_VERSION": "0x30c0000", + "ETH_FW_VERSION": "0x6e000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x50c0000", + "DDR_SPEED": null, + "DDR_STATUS": "0x2222222", + "ETH_STATUS0": "0x11111111", + "ETH_STATUS1": "0x11111111", + "PCIE_STATUS": "0x10440000", + "FAULTS": null, + "ARC0_HEALTH": "0x6b1943", + "ARC1_HEALTH": "0x275466", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x42e0", + "FAN_SPEED": "0x4cbffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31b", + "ASIC_TEMPERATURE": "0x3690365", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x323433", + "TDP": "0x64000c", + "TDC": "0xf00010", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x54010d33", + "ASIC_TMON0": "0x37324235", + "ASIC_TMON1": "0x382f", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0x56140b2a", + "RT_SECONDS": "0x6ca", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": "0xccddddcc", + "ETH_DEBUG_STATUS1": "0xccdddddd", + "TT_FLASH_VERSION": "0x30200", + "FW_BUNDLE_VERSION": "0x50110000" + }, + "board_info": { + "bus_id": "0000:41:00.0", + "board_type": "n300 L", + "board_id": "100014511722058", + "coords": "(2, 0, 0, 0)", + "dram_status": true, + "dram_speed": "12G", + "pcie_speed": 4, + "pcie_width": "16" + }, + "telemetry": { + "voltage": "0.80", + "current": " 16.0", + "power": " 12.0", + "aiclk": " 500", + "asic_temperature": "54.3" + }, + "firmwares": { + "fw_bundle_version": "80.17.0.0", + "tt_flash_version": "0.3.2.0", + "cm_fw": "2.32.0.0", + "cm_fw_date": "2025-04-01", + "eth_fw": "6.14.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.12.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "100", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x10001451172208f", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2df07", + "ASIC_IDD": "0xbf1", + "BOARD_ID_HIGH": "0x1000145", + "BOARD_ID_LOW": "0x1172208f", + "ARC0_FW_VERSION": "0x2200000", + "ARC1_FW_VERSION": "0x2200000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2200000", + "SPIBOOTROM_FW_VERSION": "0x30c0000", + "ETH_FW_VERSION": "0x6e000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x50c0000", + "DDR_SPEED": null, + "DDR_STATUS": "0x2222222", + "ETH_STATUS0": "0x11111111", + "ETH_STATUS1": "0x11111111", + "PCIE_STATUS": "0x10440000", + "FAULTS": null, + "ARC0_HEALTH": "0x6b1943", + "ARC1_HEALTH": "0x275466", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x42e0", + "FAN_SPEED": "0x4cbffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31b", + "ASIC_TEMPERATURE": "0x3690365", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x323433", + "TDP": "0x64000c", + "TDC": "0xf00010", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x54010d33", + "ASIC_TMON0": "0x37324235", + "ASIC_TMON1": "0x382f", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0x56140b2a", + "RT_SECONDS": "0x6ca", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": "0xccddddcc", + "ETH_DEBUG_STATUS1": "0xccdddddd", + "TT_FLASH_VERSION": "0x30200", + "FW_BUNDLE_VERSION": "0x50110000" + }, + "board_info": { + "bus_id": "N/A", + "board_type": "n300 R", + "board_id": "10001451172208f", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "12G", + "pcie_speed": "N/A", + "pcie_width": "N/A" + }, + "telemetry": { + "voltage": "0.80", + "current": " 16.0", + "power": " 12.0", + "aiclk": " 500", + "asic_temperature": "54.3" + }, + "firmwares": { + "fw_bundle_version": "80.17.0.0", + "tt_flash_version": "0.3.2.0", + "cm_fw": "2.32.0.0", + "cm_fw_date": "2025-04-01", + "eth_fw": "6.14.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.12.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "100", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100014511722053", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2df07", + "ASIC_IDD": "0xbf1", + "BOARD_ID_HIGH": "0x1000145", + "BOARD_ID_LOW": "0x11722053", + "ARC0_FW_VERSION": "0x2200000", + "ARC1_FW_VERSION": "0x2200000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2200000", + "SPIBOOTROM_FW_VERSION": "0x30c0000", + "ETH_FW_VERSION": "0x6e000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x50c0000", + "DDR_SPEED": null, + "DDR_STATUS": "0x2222222", + "ETH_STATUS0": "0x11111111", + "ETH_STATUS1": "0x11111111", + "PCIE_STATUS": "0x10440000", + "FAULTS": null, + "ARC0_HEALTH": "0x6b1943", + "ARC1_HEALTH": "0x275466", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x42e0", + "FAN_SPEED": "0x4cbffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31b", + "ASIC_TEMPERATURE": "0x3690365", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x323433", + "TDP": "0x64000c", + "TDC": "0xf00010", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x54010d33", + "ASIC_TMON0": "0x37324235", + "ASIC_TMON1": "0x382f", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0x56140b2a", + "RT_SECONDS": "0x6ca", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": "0xccddddcc", + "ETH_DEBUG_STATUS1": "0xccdddddd", + "TT_FLASH_VERSION": "0x30200", + "FW_BUNDLE_VERSION": "0x50110000" + }, + "board_info": { + "bus_id": "N/A", + "board_type": "n300 R", + "board_id": "100014511722053", + "coords": "(0, 1, 0, 0)", + "dram_status": true, + "dram_speed": "12G", + "pcie_speed": "N/A", + "pcie_width": "N/A" + }, + "telemetry": { + "voltage": "0.80", + "current": " 16.0", + "power": " 12.0", + "aiclk": " 500", + "asic_temperature": "54.3" + }, + "firmwares": { + "fw_bundle_version": "80.17.0.0", + "tt_flash_version": "0.3.2.0", + "cm_fw": "2.32.0.0", + "cm_fw_date": "2025-04-01", + "eth_fw": "6.14.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.12.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "100", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x10001451172209c", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2df07", + "ASIC_IDD": "0xbf1", + "BOARD_ID_HIGH": "0x1000145", + "BOARD_ID_LOW": "0x1172209c", + "ARC0_FW_VERSION": "0x2200000", + "ARC1_FW_VERSION": "0x2200000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2200000", + "SPIBOOTROM_FW_VERSION": "0x30c0000", + "ETH_FW_VERSION": "0x6e000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x50c0000", + "DDR_SPEED": null, + "DDR_STATUS": "0x2222222", + "ETH_STATUS0": "0x11111111", + "ETH_STATUS1": "0x11111111", + "PCIE_STATUS": "0x10440000", + "FAULTS": null, + "ARC0_HEALTH": "0x6b1943", + "ARC1_HEALTH": "0x275466", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x42e0", + "FAN_SPEED": "0x4cbffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31b", + "ASIC_TEMPERATURE": "0x3690365", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x323433", + "TDP": "0x64000c", + "TDC": "0xf00010", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x54010d33", + "ASIC_TMON0": "0x37324235", + "ASIC_TMON1": "0x382f", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0x56140b2a", + "RT_SECONDS": "0x6ca", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": "0xccddddcc", + "ETH_DEBUG_STATUS1": "0xccdddddd", + "TT_FLASH_VERSION": "0x30200", + "FW_BUNDLE_VERSION": "0x50110000" + }, + "board_info": { + "bus_id": "N/A", + "board_type": "n300 R", + "board_id": "10001451172209c", + "coords": "(3, 1, 0, 0)", + "dram_status": true, + "dram_speed": "12G", + "pcie_speed": "N/A", + "pcie_width": "N/A" + }, + "telemetry": { + "voltage": "0.80", + "current": " 16.0", + "power": " 12.0", + "aiclk": " 500", + "asic_temperature": "54.3" + }, + "firmwares": { + "fw_bundle_version": "80.17.0.0", + "tt_flash_version": "0.3.2.0", + "cm_fw": "2.32.0.0", + "cm_fw_date": "2025-04-01", + "eth_fw": "6.14.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.12.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "100", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + }, + { + "smbus_telem": { + "BOARD_ID": "0x100014511722058", + "ENUM_VERSION": "0xba5e0001", + "DEVICE_ID": "0x401e1e52", + "ASIC_RO": "0x2df07", + "ASIC_IDD": "0xbf1", + "BOARD_ID_HIGH": "0x1000145", + "BOARD_ID_LOW": "0x11722058", + "ARC0_FW_VERSION": "0x2200000", + "ARC1_FW_VERSION": "0x2200000", + "ARC2_FW_VERSION": null, + "ARC3_FW_VERSION": "0x2200000", + "SPIBOOTROM_FW_VERSION": "0x30c0000", + "ETH_FW_VERSION": "0x6e000", + "M3_BL_FW_VERSION": "0x81020000", + "M3_APP_FW_VERSION": "0x50c0000", + "DDR_SPEED": null, + "DDR_STATUS": "0x2222222", + "ETH_STATUS0": "0x11111111", + "ETH_STATUS1": "0x11111111", + "PCIE_STATUS": "0x10440000", + "FAULTS": null, + "ARC0_HEALTH": "0x6b1943", + "ARC1_HEALTH": "0x275466", + "ARC2_HEALTH": null, + "ARC3_HEALTH": "0x42e0", + "FAN_SPEED": "0x4cbffff", + "AICLK": "0x3e801f4", + "AXICLK": "0x384", + "ARCCLK": "0x21c", + "THROTTLER": null, + "VCORE": "0x31b", + "ASIC_TEMPERATURE": "0x3690365", + "VREG_TEMPERATURE": null, + "BOARD_TEMPERATURE": "0x323433", + "TDP": "0x64000c", + "TDC": "0xf00010", + "VDD_LIMITS": "0x3e80320", + "THM_LIMITS": "0x53004b", + "WH_FW_DATE": "0x54010d33", + "ASIC_TMON0": "0x37324235", + "ASIC_TMON1": "0x382f", + "MVDDQ_POWER": "0x190000", + "GDDR_TRAIN_TEMP0": null, + "GDDR_TRAIN_TEMP1": null, + "BOOT_DATE": "0x56140b2a", + "RT_SECONDS": "0x6ca", + "AUX_STATUS": null, + "ETH_DEBUG_STATUS0": "0xccddddcc", + "ETH_DEBUG_STATUS1": "0xccdddddd", + "TT_FLASH_VERSION": "0x30200", + "FW_BUNDLE_VERSION": "0x50110000" + }, + "board_info": { + "bus_id": "N/A", + "board_type": "n300 R", + "board_id": "100014511722058", + "coords": "(3, 0, 0, 0)", + "dram_status": true, + "dram_speed": "12G", + "pcie_speed": "N/A", + "pcie_width": "N/A" + }, + "telemetry": { + "voltage": "0.80", + "current": " 16.0", + "power": " 12.0", + "aiclk": " 500", + "asic_temperature": "54.3" + }, + "firmwares": { + "fw_bundle_version": "80.17.0.0", + "tt_flash_version": "0.3.2.0", + "cm_fw": "2.32.0.0", + "cm_fw_date": "2025-04-01", + "eth_fw": "6.14.0", + "bm_bl_fw": "129.2.0.0", + "bm_app_fw": "5.12.0.0" + }, + "limits": { + "vdd_min": "0.80", + "vdd_max": "1.00", + "tdp_limit": "100", + "tdc_limit": "240", + "asic_fmax": "1000", + "therm_trip_l1_limit": "83", + "thm_limit": "75", + "bus_peak_limit": null + } + } + ] +} diff --git a/runner/internal/shim/host/testdata/tenstorrent/valid_single_device.json b/runner/internal/shim/host/testdata/tenstorrent/valid_single_device.json new file mode 100644 index 0000000000..0f48fffe9f --- /dev/null +++ b/runner/internal/shim/host/testdata/tenstorrent/valid_single_device.json @@ -0,0 +1,46 @@ +{ + "time": "2025-06-20T12:10:28.926938", + "host_info": { + "OS": "Linux", + "Distro": "Ubuntu 20.04.6 LTS", + "Kernel": "5.15.0-138-generic", + "Hostname": "7330093c7194", + "Platform": "x86_64", + "Python": "3.8.10", + "Memory": "30.46 GB", + "Driver": "TT-KMD 1.33" + }, + "host_sw_vers": { + "tt_smi": "3.0.15", + "pyluwen": "0.7.2" + }, + "device_info": [ + { + "smbus_telem": { + "temp": 45.2, + "power": 12.5 + }, + "board_info": { + "bus_id": "0000:01:00.0", + "board_type": "n150 L", + "board_id": "100018611902010", + "coords": "(0, 0, 0, 0)", + "dram_status": true, + "dram_speed": "12G", + "pcie_speed": 4, + "pcie_width": "4" + }, + "telemetry": { + "temperature": 45.2, + "power_consumption": 12.5 + }, + "firmwares": { + "version": "1.2.3" + }, + "limits": { + "max_temp": 85.0, + "max_power": 25.0 + } + } + ] +}