Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
d74a306
fix: use std package `encoding/json`
May 8, 2026
b6a8df4
fix: replace `%v` with `%w` to wrap error
May 8, 2026
5e286d1
style: run `go fmt` to format
May 8, 2026
2a72cd1
fix: allocate pod with multiple containers
May 13, 2026
5e35f7c
feat: set bind-phase when allocation success or failed
May 13, 2026
8e2cbdc
device node config support filterDevices
band-p May 14, 2026
461d419
fix(allocate): resolve pod variable shadowing and optimize allocation…
May 14, 2026
27b929f
fix(server): remove redundant non-zero IDs length condition
May 14, 2026
54eb8f0
fix(manager): add RWMutex for `AscendManager.dev`
May 14, 2026
444d4f5
fix(server): exit the for loop in goroutine in `serve` method when `s…
May 14, 2026
b216b6b
fix(manager): use %w to preserve error chain in CleanupIdleVNPUs
May 14, 2026
4e1bcea
fix(manager): add error logging in GetIDs and GetUnHealthIDs
May 14, 2026
15ad42b
fix(manager): wrap bare errors with context in NewAscendManager and L…
May 14, 2026
f54b1ba
fix(server): use errors.Is instead of string matching for ETXTBSY
May 14, 2026
644b6a4
fix(server): wrap bare errors with context in serve and registerKubelet
May 14, 2026
22dfc0b
build(deps): add the missing npu-exporter dependency
May 15, 2026
ca3e583
Merge branch 'main' into fix/dep
peachest May 15, 2026
82f2da2
Merge pull request #79 from peachest/fix/dep
hami-robot[bot] May 15, 2026
d08966d
Merge pull request #77 from peachest/fix/error-handling
hami-robot[bot] May 15, 2026
6a0172a
device node config support filterDevices
band-p May 14, 2026
3b45bf6
Merge branch 'main' of github.com:band-p/ascend-device-plugin
band-p May 15, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@ kubectl apply -f https://raw.githubusercontent.com/Project-HAMi/ascend-device-pl

The `hami-device-node-config` is used to enable or override hami-vnpu-core for specific nodes within the cluster. Node-level settings take higher priority than the global `vnpus.hamiVnpuCore` switch.

It also supports `filterDevices`, an allow-list of card IDs: only the listed cards are exposed by the device plugin on that node, for example: `filterDevices: [0, 1, 2, 3]`. If `filterDevices` is omitted or empty, all cards on the node are exposed.

```bash
kubectl apply -f https://raw.githubusercontent.com/Project-HAMi/ascend-device-plugin/main/ascend-device-node-configmap.yaml
```
Expand Down
2 changes: 2 additions & 0 deletions README_cn.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ kubectl apply -f https://raw.githubusercontent.com/Project-HAMi/ascend-device-pl

`hami-device-node-config` 用于对集群中特定节点的 hami-vnpu-core 进行启用或覆盖。节点级配置的优先级高于全局 `vnpus.hamiVnpuCore` 开关。

同时支持 `filterDevices`,它是一个卡号白名单:设备插件在该节点上只对外暴露列表中的卡号,例如:`filterDevices: [0, 1, 2, 3]`。未配置或列表为空时,将暴露该节点上的所有卡。

```bash
kubectl apply -f https://raw.githubusercontent.com/Project-HAMi/ascend-device-plugin/main/ascend-device-node-configmap.yaml
```
Expand Down
1 change: 1 addition & 0 deletions ascend-device-node-configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ data:
- name: "cnst-dev-w2"
hami-vnpu-core: true
vDeviceCount: 8
filterDevices: [0, 1, 2, 3, 4, 5, 6, 7]
2 changes: 1 addition & 1 deletion cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ import (
// Command-line flags of the device plugin binary.
var (
	// hwLoglevel selects the Huawei log level (see the help text for the value mapping).
	hwLoglevel = flag.Int("hw_loglevel", 0, "huawei log level, -1-debug, 0-info, 1-warning, 2-error 3-critical default value: 0")
	// configFile is the path of the config file.
	configFile = flag.String("config_file", "", "config file path")
	// nodeConfigFile is the path of the node specific config file.
	nodeConfigFile = flag.String("node_config_file", "", "node specific config file path")
	// nodeName defaults to the NODE_NAME environment variable.
	nodeName = flag.String("node_name", os.Getenv("NODE_NAME"), "node name")
	// checkIdleVNPUInterval is the period, in seconds, between idle vNPU checks.
	checkIdleVNPUInterval = flag.Int("check_idle_vnpu_interval", 60, "the interval (in seconds) to check idle vNPU and release them")
)
Expand Down
10 changes: 2 additions & 8 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ require (
github.com/Project-HAMi/HAMi v0.0.0
github.com/fsnotify/fsnotify v1.9.0
google.golang.org/grpc v1.75.0
huawei.com/npu-exporter v0.0.0-00010101000000-000000000000
k8s.io/api v0.33.0
k8s.io/apimachinery v0.33.0
k8s.io/klog/v2 v2.130.1
Expand All @@ -23,20 +24,16 @@ require (
github.com/go-openapi/jsonreference v0.20.4 // indirect
github.com/go-openapi/swag v0.23.0 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang/protobuf v1.5.4 // indirect
github.com/google/gnostic-models v0.6.9 // indirect
github.com/google/go-cmp v0.7.0 // indirect
github.com/google/gofuzz v1.2.0 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/imdario/mergo v0.3.16 // indirect
github.com/influxdata/telegraf v1.26.3 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/onsi/ginkgo/v2 v2.23.4 // indirect
github.com/onsi/gomega v1.38.0 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/smartystreets/goconvey v1.7.2 // indirect
github.com/spf13/pflag v1.0.7 // indirect
Expand All @@ -47,14 +44,11 @@ require (
golang.org/x/term v0.34.0 // indirect
golang.org/x/text v0.28.0 // indirect
golang.org/x/time v0.9.0 // indirect
google.golang.org/appengine v1.6.8 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20250707201910-8d1bb00bc6a7 // indirect
google.golang.org/protobuf v1.36.8 // indirect
gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
huawei.com/npu-exporter v0.0.0-00010101000000-000000000000 // indirect
k8s.io/client-go v0.33.0 // indirect
k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect
k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 // indirect
Expand Down
139 changes: 36 additions & 103 deletions go.sum

Large diffs are not rendered by default.

90 changes: 74 additions & 16 deletions internal/manager/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package manager
import (
"fmt"
"sort"
"sync"

"ascend-common/devmanager"
"ascend-common/devmanager/dcmi"
Expand All @@ -39,17 +40,18 @@ type Device struct {
}

// AscendManager wraps the Ascend device manager together with the loaded
// plugin configuration and the most recently discovered device list.
type AscendManager struct {
	// mu guards devs: UpdateDevice swaps the slice under the write lock and
	// the device getters read it under the read lock.
	mu sync.RWMutex
	// mgr is the handle returned by devmanager.AutoInit.
	mgr          *devmanager.DeviceManager
	config       internal.VNPUConfig
	globalConfig internal.Config
	// devs is the device list built by the last successful UpdateDevice.
	devs []*Device
	// nodeConfig is the node-specific configuration; nil means no
	// node-level override is in effect.
	nodeConfig *internal.NodeConfig
}

func NewAscendManager() (*AscendManager, error) {
mgr, err := devmanager.AutoInit("", 30)
if err != nil {
return nil, err
return nil, fmt.Errorf("failed to auto-init device manager: %w", err)
}
return &AscendManager{
mgr: mgr,
Expand All @@ -58,7 +60,7 @@ func NewAscendManager() (*AscendManager, error) {
}

func (am *AscendManager) LoadNodeConfig(nodePath string, nodeName string) error {
nodeConfigList, err := internal.LoadNodeConfig(nodePath)
nodeConfigList, err := internal.LoadNodeConfig(nodePath)
if err != nil {
klog.Warningf("Failed to load node config from %s: %v", nodePath, err)
return err
Expand All @@ -71,19 +73,40 @@ func (am *AscendManager) LoadNodeConfig(nodePath string, nodeName string) error
return nil
}
}

klog.Infof("No specific config found for node %s, will use default settings", nodeName)
return nil
}

// filteredCardSet returns the card IDs listed in the node config's
// FilterDevices as a set. It returns nil when there is no node config or the
// list is empty.
func (am *AscendManager) filteredCardSet() map[int32]struct{} {
	cfg := am.nodeConfig
	if cfg == nil {
		return nil
	}
	count := len(cfg.FilterDevices)
	if count == 0 {
		return nil
	}

	allowed := make(map[int32]struct{}, count)
	for i := range cfg.FilterDevices {
		allowed[cfg.FilterDevices[i]] = struct{}{}
	}
	return allowed
}

// shouldIncludeCard reports whether the card with the given ID passes the
// node's FilterDevices setting. Every card passes when there is no node config
// or the filter list is empty; otherwise only listed card IDs pass.
//
// The list is scanned directly rather than converted to a set first: this
// method is called once per device inside loops, and building a map on every
// call just to perform a single lookup allocates needlessly.
func (am *AscendManager) shouldIncludeCard(cardID int32) bool {
	if am.nodeConfig == nil || len(am.nodeConfig.FilterDevices) == 0 {
		return true
	}
	for _, allowed := range am.nodeConfig.FilterDevices {
		if allowed == cardID {
			return true
		}
	}
	return false
}

func (am *AscendManager) LoadConfig(path string) error {
config, err := internal.LoadConfig(path)
if err != nil {
return err
return fmt.Errorf("failed to load config from %s: %w", path, err)
}
chipInfo, err := am.mgr.GetValidChipInfo()
if err != nil {
return err
return fmt.Errorf("failed to get valid chip info: %w", err)
}
if chipInfo.Type != "Ascend" {
return fmt.Errorf("chip type is not Ascend")
Expand Down Expand Up @@ -129,7 +152,7 @@ func (am *AscendManager) UpdateDevice() error {
return err
}

am.devs = make([]*Device, 0, len(IDs))
newDevs := make([]*Device, 0, len(IDs))
for _, ID := range IDs {
phyID, err := am.mgr.GetPhysicIDFromLogicID(ID)
if err != nil {
Expand All @@ -141,6 +164,10 @@ func (am *AscendManager) UpdateDevice() error {
klog.Errorf("failed to get card id from device id: %v", err)
return err
}
if !am.shouldIncludeCard(cardID) {
klog.V(4).Infof("skip filtered cardID=%d logicID=%d phyID=%d deviceID=%d", cardID, ID, phyID, deviceID)
continue
}
uuid, err := am.mgr.GetDieID(ID, dcmi.VDIE)
if err != nil {
klog.Errorf("failed to get uuid from device id: %v", err)
Expand All @@ -151,7 +178,7 @@ func (am *AscendManager) UpdateDevice() error {
klog.Errorf("failed to get device health: %v", err)
return err
}
am.devs = append(am.devs, &Device{
newDevs = append(newDevs, &Device{
UUID: uuid,
LogicID: ID,
PhyID: phyID,
Expand All @@ -162,14 +189,21 @@ func (am *AscendManager) UpdateDevice() error {
Health: health == 0,
})
}
am.mu.Lock()
am.devs = newDevs
am.mu.Unlock()
return nil
}

// GetDevices returns the current device slice, read under the read lock.
// The returned slice is the manager's own slice, not a copy.
func (am *AscendManager) GetDevices() []*Device {
	am.mu.RLock()
	current := am.devs
	am.mu.RUnlock()
	return current
}

func (am *AscendManager) GetDeviceByUUID(UUID string) *Device {
am.mu.RLock()
defer am.mu.RUnlock()
for _, dev := range am.devs {
if dev.UUID == UUID {
return dev
Expand All @@ -181,9 +215,21 @@ func (am *AscendManager) GetDeviceByUUID(UUID string) *Device {
// GetIDs returns the logic IDs reported by the device manager whose card
// passes the node's FilterDevices setting (see shouldIncludeCard).
//
// It returns nil when the device list cannot be read. A device whose card ID
// cannot be resolved is logged and left out of the result.
func (am *AscendManager) GetIDs() []int32 {
	_, IDs, err := am.mgr.GetDeviceList()
	if err != nil {
		klog.Errorf("failed to get device list: %v", err)
		return nil
	}
	// No early return of the raw list here: every ID must go through the
	// card filter below.
	filteredIDs := make([]int32, 0, len(IDs))
	for _, id := range IDs {
		cardID, _, err := am.mgr.GetCardIDDeviceID(id)
		if err != nil {
			klog.Warningf("failed to get card/device ID for logic ID %d: %v", id, err)
			continue
		}
		if am.shouldIncludeCard(cardID) {
			filteredIDs = append(filteredIDs, id)
		}
	}
	return filteredIDs
}

func (am *AscendManager) GetUnHealthIDs() []int32 {
Expand All @@ -193,8 +239,17 @@ func (am *AscendManager) GetUnHealthIDs() []int32 {
}
var unhealthy []int32
for _, d := range IDs {
cardID, _, err := am.mgr.GetCardIDDeviceID(d)
if err != nil {
klog.Warningf("failed to get card/device ID for logic ID %d: %v", d, err)
continue
}
if !am.shouldIncludeCard(cardID) {
continue
}
healthCode, err := am.mgr.GetDeviceHealth(d)
if err != nil {
klog.Warningf("failed to get device health for %d: %v", d, err)
continue
}
if healthCode != 0 {
Expand All @@ -209,7 +264,7 @@ func (am *AscendManager) CleanupIdleVNPUs() error {

_, IDs, err := am.mgr.GetDeviceList()
if err != nil {
return fmt.Errorf("failed to get device list: %v", err)
return fmt.Errorf("failed to get device list: %w", err)
}
klog.Infof("Found %d devices to check for idle vNPUs,%+v", len(IDs), IDs)

Expand All @@ -220,6 +275,10 @@ func (am *AscendManager) CleanupIdleVNPUs() error {
klog.Warningf("failed to get card/device ID for logic ID %d: %v", logicID, err)
continue
}
if !am.shouldIncludeCard(cardID) {
klog.V(4).Infof("skip cleanup on filtered cardID=%d logicID=%d deviceID=%d", cardID, logicID, deviceID)
continue
}
// Obtain all vNPU information on this device
vDevInfos, err := am.mgr.GetVirtualDeviceInfo(logicID)
if err != nil {
Expand Down Expand Up @@ -254,14 +313,13 @@ func (am *AscendManager) CleanupIdleVNPUs() error {
return nil
}


// GetNodeConfig returns the node-specific configuration held by the manager.
// The result may be nil; IsHamiVnpuCore treats a nil node config as "no
// node-level override".
func (am *AscendManager) GetNodeConfig() *internal.NodeConfig {
	return am.nodeConfig
}

// IsHamiVnpuCore reports whether hami-vnpu-core is enabled. The node config's
// HamiVnpuCore value is used when a node config is present; otherwise the
// global config's VNPUs.HamiVnpuCore value is used.
func (am *AscendManager) IsHamiVnpuCore() bool {
	if am.nodeConfig == nil {
		return am.globalConfig.VNPUs.HamiVnpuCore
	}
	return am.nodeConfig.HamiVnpuCore
}
}
Loading
Loading