From d74a306f3fa2668fee2715a8b4448ff6093bd053 Mon Sep 17 00:00:00 2001 From: houyuxi Date: Fri, 8 May 2026 17:06:05 +0800 Subject: [PATCH 01/17] fix: use std package `encoding/json` Signed-off-by: houyuxi --- internal/server/server.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/server/server.go b/internal/server/server.go index f610876..f3e2868 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -18,6 +18,7 @@ package server import ( "context" + "encoding/json" "flag" "fmt" "net" @@ -35,7 +36,6 @@ import ( "google.golang.org/grpc" "google.golang.org/grpc/credentials/insecure" v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/util/json" "k8s.io/klog/v2" "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" "io" From b6a8df4d32cb3ae839ffb65cd228d8ae67f388d4 Mon Sep 17 00:00:00 2001 From: houyuxi Date: Fri, 8 May 2026 17:15:06 +0800 Subject: [PATCH 02/17] fix: replace `%v` with `%w` to wrap error Signed-off-by: houyuxi --- internal/server/server.go | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/internal/server/server.go b/internal/server/server.go index f3e2868..e6edcc8 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -117,18 +117,18 @@ func prepareHostResources() error { sharedRegionPath := "/usr/local/hami-shared-region" if err := os.MkdirAll(sharedRegionPath, 0777); err != nil { if !os.IsExist(err) { - return fmt.Errorf("failed to create %s: %v", sharedRegionPath, err) + return fmt.Errorf("failed to create %s: %w", sharedRegionPath, err) } } if err := os.Chmod(sharedRegionPath, 0777); err != nil { - return fmt.Errorf("failed to chmod %s: %v", sharedRegionPath, err) + return fmt.Errorf("failed to chmod %s: %w", sharedRegionPath, err) } klog.Infof("Successfully prepared directory: %s", sharedRegionPath) // 2. 
Prepare /usr/local/hami-vnpu-core/ directory targetDir := "/usr/local/hami-vnpu-core" if err := os.MkdirAll(targetDir, 0775); err != nil { - return fmt.Errorf("failed to create %s: %v", targetDir, err) + return fmt.Errorf("failed to create %s: %w", targetDir, err) } // Specify the in-container assets directory (can be overridden via environment variable, default follows standard DevicePlugin convention) @@ -163,7 +163,7 @@ func prepareHostResources() error { klog.Warningf("⚠ %s is in use by running process, keeping existing version (safe)", destPath) continue } - return fmt.Errorf("failed to copy %s: %v", destPath, err) + return fmt.Errorf("failed to copy %s: %w", destPath, err) } klog.Infof("✓ Copied %s -> %s", srcPath, destPath) } @@ -377,7 +377,7 @@ func (ps *PluginServer) registerHAMi() error { if strings.HasPrefix(device.Type, Ascend910Prefix) { NetworkID, err := ps.getDeviceNetworkID(i, device.Type) if err != nil { - return fmt.Errorf("get networkID error: %v", err) + return fmt.Errorf("get networkID error: %w", err) } device.CustomInfo = map[string]any{ "NetworkID": NetworkID, @@ -398,11 +398,11 @@ func (ps *PluginServer) registerHAMi() error { node, err := util.GetNode(ps.nodeName) if err != nil { - return fmt.Errorf("get node %s error: %v", ps.nodeName, err) + return fmt.Errorf("get node %s error: %w", ps.nodeName, err) } err = util.PatchNodeAnnotations(node, annos) if err != nil { - return fmt.Errorf("patch node %s annotations error: %v", ps.nodeName, err) + return fmt.Errorf("patch node %s annotations error: %w", ps.nodeName, err) } klog.V(5).Infof("patch node %s annotations: %v", ps.nodeName, annos) return nil @@ -446,7 +446,7 @@ func (ps *PluginServer) parsePodAnnotation(pod *v1.Pod) ([]int32, []string, []*i var rtInfo []RuntimeInfo err := json.Unmarshal([]byte(anno), &rtInfo) if err != nil { - return nil, nil,nil, nil, fmt.Errorf("annotation %s value %s invalid", ps.allocAnno, anno) + return nil, nil,nil, nil, fmt.Errorf("annotation %s value %s 
invalid: %w", ps.allocAnno, anno, err) } var IDs []int32 var temps []string @@ -530,12 +530,12 @@ func (ps *PluginServer) Allocate(ctx context.Context, reqs *v1beta1.AllocateRequ pod, err := util.GetPendingPod(ctx, ps.nodeName) if err != nil { klog.Errorf("get pending pod error: %v", err) - return nil, fmt.Errorf("get pending pod error: %v", err) + return nil, fmt.Errorf("get pending pod error: %w", err) } resp := v1beta1.ContainerAllocateResponse{} IDs, temps, memories, cores, err := ps.parsePodAnnotation(pod) if err != nil { - return nil, fmt.Errorf("parse pod annotation error: %v", err) + return nil, fmt.Errorf("parse pod annotation error: %w", err) } vnpuMode := pod.Annotations[VNPUModeAnnotation] From 5e286d1deb04ba5031867c5e44aadfd77b45d18e Mon Sep 17 00:00:00 2001 From: houyuxi Date: Fri, 8 May 2026 19:38:30 +0800 Subject: [PATCH 03/17] style: run `go fmt` to format Signed-off-by: houyuxi --- cmd/main.go | 2 +- internal/manager/manager.go | 17 ++- internal/server/server.go | 259 ++++++++++++++++++------------------ internal/vnpu.go | 3 +- 4 files changed, 139 insertions(+), 142 deletions(-) diff --git a/cmd/main.go b/cmd/main.go index 849e40c..10d8aca 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -36,7 +36,7 @@ import ( var ( hwLoglevel = flag.Int("hw_loglevel", 0, "huawei log level, -1-debug, 0-info, 1-warning, 2-error 3-critical default value: 0") configFile = flag.String("config_file", "", "config file path") - nodeConfigFile = flag.String("node_config_file", "", "node specific config file path") + nodeConfigFile = flag.String("node_config_file", "", "node specific config file path") nodeName = flag.String("node_name", os.Getenv("NODE_NAME"), "node name") checkIdleVNPUInterval = flag.Int("check_idle_vnpu_interval", 60, "the interval (in seconds) to check idle vNPU and release them") ) diff --git a/internal/manager/manager.go b/internal/manager/manager.go index 8ce1df2..3fd82b5 100644 --- a/internal/manager/manager.go +++ b/internal/manager/manager.go @@ 
-39,11 +39,11 @@ type Device struct { } type AscendManager struct { - mgr *devmanager.DeviceManager - config internal.VNPUConfig + mgr *devmanager.DeviceManager + config internal.VNPUConfig globalConfig internal.Config - devs []*Device - nodeConfig *internal.NodeConfig + devs []*Device + nodeConfig *internal.NodeConfig } func NewAscendManager() (*AscendManager, error) { @@ -58,7 +58,7 @@ func NewAscendManager() (*AscendManager, error) { } func (am *AscendManager) LoadNodeConfig(nodePath string, nodeName string) error { - nodeConfigList, err := internal.LoadNodeConfig(nodePath) + nodeConfigList, err := internal.LoadNodeConfig(nodePath) if err != nil { klog.Warningf("Failed to load node config from %s: %v", nodePath, err) return err @@ -71,7 +71,7 @@ func (am *AscendManager) LoadNodeConfig(nodePath string, nodeName string) error return nil } } - + klog.Infof("No specific config found for node %s, will use default settings", nodeName) return nil } @@ -254,9 +254,8 @@ func (am *AscendManager) CleanupIdleVNPUs() error { return nil } - func (am *AscendManager) GetNodeConfig() *internal.NodeConfig { - return am.nodeConfig + return am.nodeConfig } func (am *AscendManager) IsHamiVnpuCore() bool { @@ -264,4 +263,4 @@ func (am *AscendManager) IsHamiVnpuCore() bool { return am.nodeConfig.HamiVnpuCore } return am.globalConfig.VNPUs.HamiVnpuCore -} \ No newline at end of file +} diff --git a/internal/server/server.go b/internal/server/server.go index e6edcc8..fd285a8 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -21,15 +21,20 @@ import ( "encoding/json" "flag" "fmt" + "io" "net" "os" "path" + "strconv" "strings" "time" - "strconv" "github.com/Project-HAMi/HAMi/pkg/device" // "github.com/Project-HAMi/HAMi/pkg/device/ascend" + "crypto/sha256" + "encoding/hex" + "path/filepath" + "github.com/Project-HAMi/HAMi/pkg/util" "github.com/Project-HAMi/HAMi/pkg/util/nodelock" "github.com/Project-HAMi/ascend-device-plugin/internal/manager" @@ -38,20 +43,16 @@ 
import ( v1 "k8s.io/api/core/v1" "k8s.io/klog/v2" "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" - "io" - "path/filepath" - "crypto/sha256" - "encoding/hex" ) const ( // RegisterAnnos = "hami.io/node-register-ascend" // PodAllocAnno = "huawei.com/AscendDevices" - NodeLockAscend = "hami.io/mutex.lock" - Ascend910Prefix = "Ascend910" - Ascend910CType = "Ascend910C" - VNPUModeAnnotation = "huawei.com/vnpu-mode" - VNPUModeHamiCore = "hami-core" + NodeLockAscend = "hami.io/mutex.lock" + Ascend910Prefix = "Ascend910" + Ascend910CType = "Ascend910C" + VNPUModeAnnotation = "huawei.com/vnpu-mode" + VNPUModeHamiCore = "hami-core" VNPUNodeSelectorAnnotation = "hami-vnpu-core" ) @@ -73,10 +74,10 @@ type PluginServer struct { } type RuntimeInfo struct { - UUID string `json:"UUID,omitempty"` - Temp string `json:"temp,omitempty"` - Memory *int64 `json:"memory,omitempty"` - Core *int32 `json:"core,omitempty"` + UUID string `json:"UUID,omitempty"` + Temp string `json:"temp,omitempty"` + Memory *int64 `json:"memory,omitempty"` + Core *int32 `json:"core,omitempty"` } func NewPluginServer(mgr *manager.AscendManager, nodeName string, checkIdleVNPUInterval int) (*PluginServer, error) { @@ -96,116 +97,115 @@ func NewPluginServer(mgr *manager.AscendManager, nodeName string, checkIdleVNPUI // fileSHA256 calculates the SHA256 checksum of the specified file func fileSHA256(path string) (string, error) { - f, err := os.Open(path) - if err != nil { - return "", err - } - defer f.Close() - - h := sha256.New() - if _, err := io.Copy(h, f); err != nil { - return "", err - } - return hex.EncodeToString(h.Sum(nil)), nil + f, err := os.Open(path) + if err != nil { + return "", err + } + defer f.Close() + + h := sha256.New() + if _, err := io.Copy(h, f); err != nil { + return "", err + } + return hex.EncodeToString(h.Sum(nil)), nil } // Automatically creates directories, sets permissions, and copies core files on the host func prepareHostResources() error { - klog.Info("Starting host resource 
preparation for HAMi vNPU core...") - - // 1. Create shared memory directory - sharedRegionPath := "/usr/local/hami-shared-region" - if err := os.MkdirAll(sharedRegionPath, 0777); err != nil { - if !os.IsExist(err) { - return fmt.Errorf("failed to create %s: %w", sharedRegionPath, err) - } - } - if err := os.Chmod(sharedRegionPath, 0777); err != nil { - return fmt.Errorf("failed to chmod %s: %w", sharedRegionPath, err) - } - klog.Infof("Successfully prepared directory: %s", sharedRegionPath) - - // 2. Prepare /usr/local/hami-vnpu-core/ directory - targetDir := "/usr/local/hami-vnpu-core" - if err := os.MkdirAll(targetDir, 0775); err != nil { - return fmt.Errorf("failed to create %s: %w", targetDir, err) - } - - // Specify the in-container assets directory (can be overridden via environment variable, default follows standard DevicePlugin convention) - assetsDir := os.Getenv("HAMI_VNPU_ASSETS_PATH") - if assetsDir == "" { - assetsDir = "/usr/local/hami-vnpu-core-assets" - } - - // Define files to copy: source path in container -> target path on host - filesToCopy := map[string]string{ - "limiter": filepath.Join(targetDir, "limiter"), - "libvnpu.so": filepath.Join(targetDir, "libvnpu.so"), - "ld.so.preload": filepath.Join(targetDir, "ld.so.preload"), - } - - for srcName, destPath := range filesToCopy { - srcPath := filepath.Join(assetsDir, srcName) + klog.Info("Starting host resource preparation for HAMi vNPU core...") + + // 1. Create shared memory directory + sharedRegionPath := "/usr/local/hami-shared-region" + if err := os.MkdirAll(sharedRegionPath, 0777); err != nil { + if !os.IsExist(err) { + return fmt.Errorf("failed to create %s: %w", sharedRegionPath, err) + } + } + if err := os.Chmod(sharedRegionPath, 0777); err != nil { + return fmt.Errorf("failed to chmod %s: %w", sharedRegionPath, err) + } + klog.Infof("Successfully prepared directory: %s", sharedRegionPath) + + // 2. 
Prepare /usr/local/hami-vnpu-core/ directory + targetDir := "/usr/local/hami-vnpu-core" + if err := os.MkdirAll(targetDir, 0775); err != nil { + return fmt.Errorf("failed to create %s: %w", targetDir, err) + } + + // Specify the in-container assets directory (can be overridden via environment variable, default follows standard DevicePlugin convention) + assetsDir := os.Getenv("HAMI_VNPU_ASSETS_PATH") + if assetsDir == "" { + assetsDir = "/usr/local/hami-vnpu-core-assets" + } + + // Define files to copy: source path in container -> target path on host + filesToCopy := map[string]string{ + "limiter": filepath.Join(targetDir, "limiter"), + "libvnpu.so": filepath.Join(targetDir, "libvnpu.so"), + "ld.so.preload": filepath.Join(targetDir, "ld.so.preload"), + } + + for srcName, destPath := range filesToCopy { + srcPath := filepath.Join(assetsDir, srcName) // File already exists, skip if content is consistent if _, err := os.Stat(destPath); err == nil { - srcSum, err1 := fileSHA256(srcPath) - dstSum, err2 := fileSHA256(destPath) - - if err1 == nil && err2 == nil && srcSum == dstSum { - klog.Infof("✓ %s already up-to-date, skipping", destPath) - continue - } - } - - if err := copyFile(srcPath, destPath); err != nil { - if strings.Contains(err.Error(), "text file busy") { - klog.Warningf("⚠ %s is in use by running process, keeping existing version (safe)", destPath) - continue - } - return fmt.Errorf("failed to copy %s: %w", destPath, err) - } - klog.Infof("✓ Copied %s -> %s", srcPath, destPath) - } - - klog.Info("Host resource preparation completed successfully.") - return nil + srcSum, err1 := fileSHA256(srcPath) + dstSum, err2 := fileSHA256(destPath) + + if err1 == nil && err2 == nil && srcSum == dstSum { + klog.Infof("✓ %s already up-to-date, skipping", destPath) + continue + } + } + + if err := copyFile(srcPath, destPath); err != nil { + if strings.Contains(err.Error(), "text file busy") { + klog.Warningf("⚠ %s is in use by running process, keeping existing version 
(safe)", destPath) + continue + } + return fmt.Errorf("failed to copy %s: %w", destPath, err) + } + klog.Infof("✓ Copied %s -> %s", srcPath, destPath) + } + + klog.Info("Host resource preparation completed successfully.") + return nil } // A standard file copy implementation that preserves the original file permissions func copyFile(src, dst string) error { - srcFile, err := os.Open(src) - if err != nil { - return err - } - defer srcFile.Close() - - dstFile, err := os.Create(dst) - if err != nil { - return err - } - defer dstFile.Close() - - if _, err = io.Copy(dstFile, srcFile); err != nil { - return err - } - - // Sync source file permissions (ensure the limiter binary retains executable permission) - srcInfo, err := srcFile.Stat() - if err != nil { - return err - } - return os.Chmod(dst, srcInfo.Mode()) -} + srcFile, err := os.Open(src) + if err != nil { + return err + } + defer srcFile.Close() + + dstFile, err := os.Create(dst) + if err != nil { + return err + } + defer dstFile.Close() + if _, err = io.Copy(dstFile, srcFile); err != nil { + return err + } + + // Sync source file permissions (ensure the limiter binary retains executable permission) + srcInfo, err := srcFile.Stat() + if err != nil { + return err + } + return os.Chmod(dst, srcInfo.Mode()) +} func (ps *PluginServer) Start() error { // Automatically prepare host environment when the plugin starts - if err := prepareHostResources(); err != nil { - klog.Errorf("Failed to prepare host resources: %v. vNPU core functionality will be impaired.", err) - return err - } - + if err := prepareHostResources(); err != nil { + klog.Errorf("Failed to prepare host resources: %v. 
vNPU core functionality will be impaired.", err) + return err + } + ps.stopCh = make(chan interface{}) err := ps.mgr.UpdateDevice() if err != nil { @@ -388,14 +388,14 @@ func (ps *PluginServer) registerHAMi() error { annos := make(map[string]string) annos[ps.registerAnno] = device.MarshalNodeDevices(apiDevices) annos[ps.handshakeAnno] = "Reported_" + time.Now().Add(time.Duration(*reportTimeOffset)*time.Second).Format("2006.01.02 15:04:05") - + if ps.mgr.IsHamiVnpuCore() { annos[VNPUNodeSelectorAnnotation] = "true" klog.V(4).Infof("Node %s has HamiVnpuCore enabled, patching annotation %s: true", ps.nodeName, VNPUNodeSelectorAnnotation) } else { annos[VNPUNodeSelectorAnnotation] = "false" } - + node, err := util.GetNode(ps.nodeName) if err != nil { return fmt.Errorf("get node %s error: %w", ps.nodeName, err) @@ -437,21 +437,20 @@ func (ps *PluginServer) watchAndRegister() { } } - func (ps *PluginServer) parsePodAnnotation(pod *v1.Pod) ([]int32, []string, []*int64, []*int32, error) { anno, ok := pod.Annotations[ps.allocAnno] if !ok { - return nil, nil,nil, nil, fmt.Errorf("annotation %s not set", "huawei.com/Ascend") + return nil, nil, nil, nil, fmt.Errorf("annotation %s not set", "huawei.com/Ascend") } var rtInfo []RuntimeInfo err := json.Unmarshal([]byte(anno), &rtInfo) if err != nil { - return nil, nil,nil, nil, fmt.Errorf("annotation %s value %s invalid: %w", ps.allocAnno, anno, err) + return nil, nil, nil, nil, fmt.Errorf("annotation %s value %s invalid: %w", ps.allocAnno, anno, err) } var IDs []int32 var temps []string - var memories []*int64 - var cores []*int32 + var memories []*int64 + var cores []*int32 for _, info := range rtInfo { if info.UUID == "" { @@ -465,10 +464,10 @@ func (ps *PluginServer) parsePodAnnotation(pod *v1.Pod) ([]int32, []string, []*i temps = append(temps, info.Temp) if info.Memory != nil { memories = append(memories, info.Memory) - } + } if info.Core != nil { cores = append(cores, info.Core) - } + } } if len(IDs) == 0 { return nil, nil, 
nil, nil, fmt.Errorf("annotation %s value %s invalid", ps.allocAnno, anno) @@ -537,10 +536,10 @@ func (ps *PluginServer) Allocate(ctx context.Context, reqs *v1beta1.AllocateRequ if err != nil { return nil, fmt.Errorf("parse pod annotation error: %w", err) } - + vnpuMode := pod.Annotations[VNPUModeAnnotation] klog.V(4).Infof("Pod %s vnpu mode: %s", pod.Name, vnpuMode) - + if len(IDs) == 0 { return nil, fmt.Errorf("empty id from pod annotation") } @@ -553,22 +552,22 @@ func (ps *PluginServer) Allocate(ctx context.Context, reqs *v1beta1.AllocateRequ resp.Envs["ASCEND_VISIBLE_DEVICES"] = ascendVisibleDevices if vnpuMode == VNPUModeHamiCore { - // 1. Handle volume mount injection + // 1. Handle volume mount injection var mounts []*v1beta1.Mount // A.Huawei driver and SMI toolchain (Read-Only) driverPaths := []string{ - "/usr/local/bin/npu-smi", - "/etc/ascend_install.info", - "/usr/local/Ascend/driver/lib64/driver", - "/usr/local/Ascend/driver/version.info", - } + "/usr/local/bin/npu-smi", + "/etc/ascend_install.info", + "/usr/local/Ascend/driver/lib64/driver", + "/usr/local/Ascend/driver/version.info", + } for _, p := range driverPaths { mounts = append(mounts, &v1beta1.Mount{HostPath: p, ContainerPath: p, ReadOnly: true}) } mounts = append(mounts, &v1beta1.Mount{ - HostPath: "/usr/local/hami-vnpu-core", - ContainerPath: "/hami-vnpu-core", + HostPath: "/usr/local/hami-vnpu-core", + ContainerPath: "/hami-vnpu-core", ReadOnly: true, }) // B. Inject HAMi library path by mounting /etc/ld.so.preload. @@ -580,12 +579,12 @@ func (ps *PluginServer) Allocate(ctx context.Context, reqs *v1beta1.AllocateRequ // C. 
Shared directory for HAMi compute resource partitioning (Read/Write) mounts = append(mounts, &v1beta1.Mount{ - HostPath: "/usr/local/hami-shared-region", + HostPath: "/usr/local/hami-shared-region", ContainerPath: "/hami-shared-region", ReadOnly: false, }) resp.Mounts = mounts - + // Set NPU_MEM_QUOTA if len(memories) > 0 && memories[0] != nil { resp.Envs["NPU_MEM_QUOTA"] = strconv.FormatInt(*memories[0], 10) diff --git a/internal/vnpu.go b/internal/vnpu.go index 9872d72..2f890e6 100644 --- a/internal/vnpu.go +++ b/internal/vnpu.go @@ -63,7 +63,6 @@ func LoadConfig(path string) (*Config, error) { return &yamlData, nil } - type NodeConfig struct { Name string `json:"name"` HamiVnpuCore bool `json:"hami-vnpu-core"` @@ -85,4 +84,4 @@ func LoadNodeConfig(path string) (*NodeListConfig, error) { return nil, err } return &yamlData, nil -} \ No newline at end of file +} From 2a72cd1816f9093a159d90209288f72ca649217d Mon Sep 17 00:00:00 2001 From: houyuxi Date: Wed, 13 May 2026 16:49:57 +0800 Subject: [PATCH 04/17] fix: allocate pod with multiple containers Signed-off-by: houyuxi --- internal/server/server.go | 323 ++++++++++++++++++++++++++++---------- 1 file changed, 242 insertions(+), 81 deletions(-) diff --git a/internal/server/server.go b/internal/server/server.go index fd285a8..5d91d0a 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -30,6 +30,7 @@ import ( "time" "github.com/Project-HAMi/HAMi/pkg/device" + // "github.com/Project-HAMi/HAMi/pkg/device/ascend" "crypto/sha256" "encoding/hex" @@ -65,6 +66,7 @@ type PluginServer struct { registerAnno string handshakeAnno string allocAnno string + toAllocDeviceAnno string grpcServer *grpc.Server mgr *manager.AscendManager socket string @@ -81,14 +83,16 @@ type RuntimeInfo struct { } func NewPluginServer(mgr *manager.AscendManager, nodeName string, checkIdleVNPUInterval int) (*PluginServer, error) { + commonWord := mgr.CommonWord() return &PluginServer{ nodeName: nodeName, - registerAnno: 
fmt.Sprintf("hami.io/node-register-%s", mgr.CommonWord()), - handshakeAnno: fmt.Sprintf("hami.io/node-handshake-%s", mgr.CommonWord()), - allocAnno: fmt.Sprintf("huawei.com/%s", mgr.CommonWord()), + registerAnno: fmt.Sprintf("hami.io/node-register-%s", commonWord), + handshakeAnno: fmt.Sprintf("hami.io/node-handshake-%s", commonWord), + allocAnno: fmt.Sprintf("huawei.com/%s", commonWord), + toAllocDeviceAnno: fmt.Sprintf("hami.io/%s-devices-to-allocate", commonWord), grpcServer: grpc.NewServer(), mgr: mgr, - socket: path.Join(v1beta1.DevicePluginPath, fmt.Sprintf("%s.sock", mgr.CommonWord())), + socket: path.Join(v1beta1.DevicePluginPath, fmt.Sprintf("%s.sock", commonWord)), stopCh: make(chan interface{}), healthCh: make(chan int32), checkIdleVNPUInterval: checkIdleVNPUInterval, @@ -437,6 +441,220 @@ func (ps *PluginServer) watchAndRegister() { } } +// buildContainerAllocateResponse constructs the ContainerAllocateResponse for +// the given container devices. It resolves UUIDs to PhyIDs and looks up vNPU +// template names from the allocAnno annotation. 
+func (ps *PluginServer) buildContainerAllocateResponse(pod *v1.Pod, containerDevs device.ContainerDevices) (*v1beta1.ContainerAllocateResponse, error) { + rtInfoLookup, err := ps.buildRuntimeInfoLookup(pod) + if err != nil { + return nil, fmt.Errorf("build runtimeInfo lookup: %w", err) + } + + resp := &v1beta1.ContainerAllocateResponse{} + + var ( + IDs []int32 + memories []*int64 + cores []*int32 + ascendVNPUSpec string + ) + + for _, dev := range containerDevs { + d := ps.mgr.GetDeviceByUUID(dev.UUID) + if d == nil { + return nil, fmt.Errorf("unknown uuid: %s", dev.UUID) + } + IDs = append(IDs, d.PhyID) + + if info, ok := rtInfoLookup[dev.UUID]; ok { + if ascendVNPUSpec == "" && info.Temp != "" { + ascendVNPUSpec = info.Temp + } + if info.Memory != nil { + memories = append(memories, info.Memory) + } + if info.Core != nil { + cores = append(cores, info.Core) + } + } + } + + if len(IDs) == 0 { + return nil, fmt.Errorf("annotation %s value invalid", ps.allocAnno) + } + ascendVisibleDevices := fmt.Sprintf("%d", IDs[0]) + for i := 1; i < len(IDs); i++ { + ascendVisibleDevices = fmt.Sprintf("%s,%d", ascendVisibleDevices, IDs[i]) + } + resp.Envs = make(map[string]string) + resp.Envs["ASCEND_VISIBLE_DEVICES"] = ascendVisibleDevices + + vnpuMode := pod.Annotations[VNPUModeAnnotation] + klog.V(4).Infof("Pod %s vnpu mode: %s", pod.Name, vnpuMode) + if vnpuMode == VNPUModeHamiCore { + // 1. Handle volume mount injection + var mounts []*v1beta1.Mount + // A.Huawei driver and SMI toolchain (Read-Only) + driverPaths := []string{ + "/usr/local/bin/npu-smi", + "/etc/ascend_install.info", + "/usr/local/Ascend/driver/lib64/driver", + "/usr/local/Ascend/driver/version.info", + } + for _, p := range driverPaths { + mounts = append(mounts, &v1beta1.Mount{HostPath: p, ContainerPath: p, ReadOnly: true}) + } + + mounts = append(mounts, &v1beta1.Mount{ + HostPath: "/usr/local/hami-vnpu-core", + ContainerPath: "/hami-vnpu-core", + ReadOnly: true, + }) + // B. 
Inject HAMi library path by mounting /etc/ld.so.preload. + mounts = append(mounts, &v1beta1.Mount{ + HostPath: "/usr/local/hami-vnpu-core/ld.so.preload", // Template file on host + ContainerPath: "/etc/ld.so.preload", // Overwrites the target file in container + ReadOnly: true, + }) + + // C. Shared directory for HAMi compute resource partitioning (Read/Write) + mounts = append(mounts, &v1beta1.Mount{ + HostPath: "/usr/local/hami-shared-region", + ContainerPath: "/hami-shared-region", + ReadOnly: false, + }) + resp.Mounts = mounts + + // Set NPU_MEM_QUOTA + if len(memories) > 0 && memories[0] != nil { + resp.Envs["NPU_MEM_QUOTA"] = strconv.FormatInt(*memories[0], 10) + klog.V(4).InfoS("Memory quota set", "value", *memories[0]) + } + + // Set NPU_PRIORITY + if len(cores) > 0 && cores[0] != nil { + resp.Envs["NPU_PRIORITY"] = strconv.FormatInt(int64(*cores[0]), 10) + klog.V(4).InfoS("Core priority set", "value", *cores[0]) + } + + // Set GLOBAL_SHM_PATH separated by device ID. + if len(IDs) > 0 { + resp.Envs["NPU_GLOBAL_SHM_PATH"] = fmt.Sprintf("/hami-shared-region/%d_global_registry", IDs[0]) + klog.V(5).Infof("Create %d_global_registry", IDs[0]) + } else { + klog.Warningf("No device IDs allocated") + } + } else { + if ascendVNPUSpec != "" { + resp.Envs["ASCEND_VNPU_SPECS"] = ascendVNPUSpec + } + } + return resp, nil +} + +// getNextContainerDevices reads the toAllocDeviceAnno annotation and returns +// the devices of the first container that still has pending allocations. 
+func (ps *PluginServer) getNextContainerDevices(pod *v1.Pod) (device.ContainerDevices, error) { + anno, ok := pod.Annotations[ps.toAllocDeviceAnno] + if !ok { + return nil, fmt.Errorf("annotation %s not found", ps.toAllocDeviceAnno) + } + podSingleDev, err := decodePodSingleDevice(anno) + if err != nil { + return nil, fmt.Errorf("decode annotation %s: %w", ps.toAllocDeviceAnno, err) + } + for _, ctrDevice := range podSingleDev { + if len(ctrDevice) > 0 { + return ctrDevice, nil + } + } + return nil, fmt.Errorf("no pending device allocation in annotation %s", ps.toAllocDeviceAnno) +} + +// buildRuntimeInfoLookup reads the allocAnno annotation and returns a map from +// device UUID to its RuntimeInfo entry (template, memory and core). +func (ps *PluginServer) buildRuntimeInfoLookup(pod *v1.Pod) (map[string]RuntimeInfo, error) { + anno, ok := pod.Annotations[ps.allocAnno] + if !ok { + // The annotation is required here; a missing annotation is reported as an error. + return nil, fmt.Errorf("annotation %s not set", ps.allocAnno) + } + var rtInfo []RuntimeInfo + if err := json.Unmarshal([]byte(anno), &rtInfo); err != nil { + return nil, fmt.Errorf("annotation %s value %s invalid: %w", ps.allocAnno, anno, err) + } + lookup := make(map[string]RuntimeInfo, len(rtInfo)) + for _, info := range rtInfo { + if info.UUID != "" { + lookup[info.UUID] = info + } + } + return lookup, nil +} + +// eraseCurrentContainerAnnotation erases the current container's devices from +// the toAllocDeviceAnno annotation, so the next Allocate call will advance +// to the next container.
+func (ps *PluginServer) eraseCurrentContainerAnnotation(pod *v1.Pod) error { + anno, ok := pod.Annotations[ps.toAllocDeviceAnno] + if !ok { + return fmt.Errorf("annotation %s not found", ps.toAllocDeviceAnno) + } + podSingleDev, err := decodePodSingleDevice(anno) + if err != nil { + return fmt.Errorf("decode annotation %s: %w", ps.toAllocDeviceAnno, err) + } + res := make(device.PodSingleDevice, 0, len(podSingleDev)) + found := false + for _, val := range podSingleDev { + if found { + res = append(res, val) + } else { + if len(val) > 0 { + found = true + res = append(res, device.ContainerDevices{}) + } else { + res = append(res, val) + } + } + } + klog.V(5).Infof("After erase annotation, remaining devices: %v", res) + newAnnoValue := device.EncodePodSingleDevice(res) + newAnnos := map[string]string{ + ps.toAllocDeviceAnno: newAnnoValue, + } + if err := util.PatchPodAnnotations(pod, newAnnos); err != nil { + return err + } + // Update in-memory pod annotations so subsequent getNextContainerDevices + // calls within the same Allocate see the erased state. + pod.Annotations[ps.toAllocDeviceAnno] = newAnnoValue + return nil +} + +// decodePodSingleDevice decodes a single annotation value string into a +// PodSingleDevice. It reuses HAMi's DecodeContainerDevices for per-device +// parsing. The format is: +// +// ,:,;,;... 
+func decodePodSingleDevice(str string) (device.PodSingleDevice, error) { + if len(str) == 0 { + return device.PodSingleDevice{}, nil + } + pd := make(device.PodSingleDevice, 0) + for _, s := range strings.Split(str, device.OnePodMultiContainerSplitSymbol) { + cd, err := device.DecodeContainerDevices(s) + if err != nil { + return nil, err + } + if len(cd) == 0 { + continue + } + pd = append(pd, cd) + } + return pd, nil +} + func (ps *PluginServer) parsePodAnnotation(pod *v1.Pod) ([]int32, []string, []*int64, []*int32, error) { anno, ok := pod.Annotations[ps.allocAnno] if !ok { @@ -531,94 +749,37 @@ func (ps *PluginServer) Allocate(ctx context.Context, reqs *v1beta1.AllocateRequ klog.Errorf("get pending pod error: %v", err) return nil, fmt.Errorf("get pending pod error: %w", err) } - resp := v1beta1.ContainerAllocateResponse{} - IDs, temps, memories, cores, err := ps.parsePodAnnotation(pod) - if err != nil { - return nil, fmt.Errorf("parse pod annotation error: %w", err) - } + klog.Infof("allocating for pod %s/%s", pod.Namespace, pod.Name) - vnpuMode := pod.Annotations[VNPUModeAnnotation] - klog.V(4).Infof("Pod %s vnpu mode: %s", pod.Name, vnpuMode) + responses := v1beta1.AllocateResponse{} - if len(IDs) == 0 { - return nil, fmt.Errorf("empty id from pod annotation") - } - ascendVisibleDevices := fmt.Sprintf("%d", IDs[0]) - for i := 1; i < len(IDs); i++ { - ascendVisibleDevices = fmt.Sprintf("%s,%d", ascendVisibleDevices, IDs[i]) - } - - resp.Envs = make(map[string]string) - resp.Envs["ASCEND_VISIBLE_DEVICES"] = ascendVisibleDevices - - if vnpuMode == VNPUModeHamiCore { - // 1. 
Handle volume mount injection - var mounts []*v1beta1.Mount - // A.Huawei driver and SMI toolchain (Read-Only) - driverPaths := []string{ - "/usr/local/bin/npu-smi", - "/etc/ascend_install.info", - "/usr/local/Ascend/driver/lib64/driver", - "/usr/local/Ascend/driver/version.info", - } - for _, p := range driverPaths { - mounts = append(mounts, &v1beta1.Mount{HostPath: p, ContainerPath: p, ReadOnly: true}) + // resp := v1beta1.ContainerAllocateResponse{} + for _, req := range reqs.ContainerRequests { + containerDevs, err := ps.getNextContainerDevices(pod) + if err != nil { + return nil, fmt.Errorf("get next container devices: %w", err) } + klog.Infof("containerDevs: %+v", containerDevs) - mounts = append(mounts, &v1beta1.Mount{ - HostPath: "/usr/local/hami-vnpu-core", - ContainerPath: "/hami-vnpu-core", - ReadOnly: true, - }) - // B. Inject HAMi library path by mounting /etc/ld.so.preload. - mounts = append(mounts, &v1beta1.Mount{ - HostPath: "/usr/local/hami-vnpu-core/ld.so.preload", // Template file on host - ContainerPath: "/etc/ld.so.preload", // Overwrites the target file in container - ReadOnly: true, - }) - - // C. 
Shared directory for HAMi compute resource partitioning (Read/Write) - mounts = append(mounts, &v1beta1.Mount{ - HostPath: "/usr/local/hami-shared-region", - ContainerPath: "/hami-shared-region", - ReadOnly: false, - }) - resp.Mounts = mounts - - // Set NPU_MEM_QUOTA - if len(memories) > 0 && memories[0] != nil { - resp.Envs["NPU_MEM_QUOTA"] = strconv.FormatInt(*memories[0], 10) - klog.V(4).InfoS("Memory quota set", "value", *memories[0]) + if len(containerDevs) != len(req.DevicesIDs) { + return nil, fmt.Errorf("device number not matched: annotation has %d, request has %d", len(containerDevs), len(req.DevicesIDs)) } - // Set NPU_PRIORITY - if len(cores) > 0 && cores[0] != nil { - resp.Envs["NPU_PRIORITY"] = strconv.FormatInt(int64(*cores[0]), 10) - klog.V(4).InfoS("Core priority set", "value", *cores[0]) + resp, err := ps.buildContainerAllocateResponse(pod, containerDevs) + if err != nil { + return nil, fmt.Errorf("build container allocate response: %w", err) } - // Set GLOBAL_SHM_PATH separated by device ID. 
- if len(IDs) > 0 { - resp.Envs["NPU_GLOBAL_SHM_PATH"] = fmt.Sprintf("/hami-shared-region/%d_global_registry", IDs[0]) - klog.V(5).Infof("Create %d_global_registry", IDs[0]) - } else { - klog.Warningf("No device IDs allocated") - } - } else { - ascendVNPUSpec := "" - for i := 0; i < len(temps); i++ { - if temps[i] != "" { - ascendVNPUSpec = temps[i] - break - } - } - if ascendVNPUSpec != "" { - resp.Envs["ASCEND_VNPU_SPECS"] = ascendVNPUSpec + if err := ps.eraseCurrentContainerAnnotation(pod); err != nil { + klog.Errorf("erase current container annotation error: %v", err) + return nil, fmt.Errorf("erase current container annotation: %w", err) } + + responses.ContainerResponses = append(responses.ContainerResponses, resp) } - klog.V(5).Infof("allocate response: %v", resp) + klog.V(5).Infof("allocate response: %+v", responses.ContainerResponses) success = true - return &v1beta1.AllocateResponse{ContainerResponses: []*v1beta1.ContainerAllocateResponse{&resp}}, nil + return &responses, nil } func (ps *PluginServer) PreStartContainer(context.Context, *v1beta1.PreStartContainerRequest) (*v1beta1.PreStartContainerResponse, error) { From 5e35f7c4867da71d0040b61a85042373a0bfd822 Mon Sep 17 00:00:00 2001 From: houyuxi Date: Wed, 13 May 2026 16:58:30 +0800 Subject: [PATCH 05/17] feat: set bind-phase when allocation success or failed Signed-off-by: houyuxi --- internal/server/server.go | 66 +++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 23 deletions(-) diff --git a/internal/server/server.go b/internal/server/server.go index 5d91d0a..7b5c0c5 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -18,6 +18,8 @@ package server import ( "context" + "crypto/sha256" + "encoding/hex" "encoding/json" "flag" "fmt" @@ -25,25 +27,22 @@ import ( "net" "os" "path" + "path/filepath" "strconv" "strings" "time" - "github.com/Project-HAMi/HAMi/pkg/device" - - // "github.com/Project-HAMi/HAMi/pkg/device/ascend" - "crypto/sha256" - "encoding/hex" - 
"path/filepath" - - "github.com/Project-HAMi/HAMi/pkg/util" - "github.com/Project-HAMi/HAMi/pkg/util/nodelock" - "github.com/Project-HAMi/ascend-device-plugin/internal/manager" "google.golang.org/grpc" "google.golang.org/grpc/credentials/insecure" v1 "k8s.io/api/core/v1" "k8s.io/klog/v2" "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" + + // "github.com/Project-HAMi/HAMi/pkg/device/ascend" + "github.com/Project-HAMi/HAMi/pkg/device" + "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/plugin" + "github.com/Project-HAMi/HAMi/pkg/util" + "github.com/Project-HAMi/ascend-device-plugin/internal/manager" ) const ( @@ -62,6 +61,7 @@ var ( ) type PluginServer struct { + commonWord string nodeName string registerAnno string handshakeAnno string @@ -84,7 +84,8 @@ type RuntimeInfo struct { func NewPluginServer(mgr *manager.AscendManager, nodeName string, checkIdleVNPUInterval int) (*PluginServer, error) { commonWord := mgr.CommonWord() - return &PluginServer{ + server := &PluginServer{ + commonWord: commonWord, nodeName: nodeName, registerAnno: fmt.Sprintf("hami.io/node-register-%s", commonWord), handshakeAnno: fmt.Sprintf("hami.io/node-handshake-%s", commonWord), @@ -96,7 +97,10 @@ func NewPluginServer(mgr *manager.AscendManager, nodeName string, checkIdleVNPUI stopCh: make(chan interface{}), healthCh: make(chan int32), checkIdleVNPUInterval: checkIdleVNPUInterval, - }, nil + } + // enable calling hami methods + device.InRequestDevices[commonWord] = server.toAllocDeviceAnno + return server, nil } // fileSHA256 calculates the SHA256 checksum of the specified file @@ -739,22 +743,25 @@ func (ps *PluginServer) Allocate(ctx context.Context, reqs *v1beta1.AllocateRequ success := false var pod *v1.Pod defer func() { - lockerr := nodelock.ReleaseNodeLock(ps.nodeName, NodeLockAscend, pod, success) - if lockerr != nil { - klog.Errorf("failed to release lock:%s", lockerr.Error()) + if pod == nil { + return + } + if success { + ps.podAllocationTrySuccess(pod) + } 
else { + ps.podAllocationFailed(pod) } }() - pod, err := util.GetPendingPod(ctx, ps.nodeName) - if err != nil { - klog.Errorf("get pending pod error: %v", err) - return nil, fmt.Errorf("get pending pod error: %w", err) - } - klog.Infof("allocating for pod %s/%s", pod.Namespace, pod.Name) responses := v1beta1.AllocateResponse{} - - // resp := v1beta1.ContainerAllocateResponse{} for _, req := range reqs.ContainerRequests { + pod, err := util.GetPendingPod(ctx, ps.nodeName) + if err != nil { + klog.Errorf("get pending pod error: %v", err) + return nil, fmt.Errorf("get pending pod error: %w", err) + } + klog.Infof("allocating for pod %s/%s", pod.Namespace, pod.Name) + containerDevs, err := ps.getNextContainerDevices(pod) if err != nil { return nil, fmt.Errorf("get next container devices: %w", err) @@ -785,3 +792,16 @@ func (ps *PluginServer) Allocate(ctx context.Context, reqs *v1beta1.AllocateRequ func (ps *PluginServer) PreStartContainer(context.Context, *v1beta1.PreStartContainerRequest) (*v1beta1.PreStartContainerResponse, error) { return &v1beta1.PreStartContainerResponse{}, nil } + +// podAllocationTrySuccess checks if all containers of this pod have been +// allocated. If so, it sets bind-phase to "success" and releases the node +// lock; otherwise it returns without setting bind-phase or releasing the lock, +// waiting for the next Allocate call. +func (ps *PluginServer) podAllocationTrySuccess(pod *v1.Pod) { + plugin.PodAllocationTrySuccess(ps.nodeName, ps.commonWord, NodeLockAscend, pod) +} + +// podAllocationFailed sets bind-phase to "failed" and releases the node lock. 
+func (ps *PluginServer) podAllocationFailed(pod *v1.Pod) { + plugin.PodAllocationFailed(ps.nodeName, pod, NodeLockAscend) +} From 8e2cbdc68c8bb6689dd1e5396ee16781e4571459 Mon Sep 17 00:00:00 2001 From: band-p Date: Thu, 14 May 2026 11:40:21 +0800 Subject: [PATCH 06/17] device node config support filterDevices Signed-off-by: band-p --- README.md | 2 + README_cn.md | 2 + ascend-device-node-configmap.yaml | 1 + internal/manager/manager.go | 67 ++++++++++++++++++++++++++----- internal/vnpu.go | 12 +++--- 5 files changed, 68 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 94adff0..80c2d31 100644 --- a/README.md +++ b/README.md @@ -80,6 +80,8 @@ kubectl apply -f https://raw.githubusercontent.com/Project-HAMi/ascend-device-pl The `hami-device-node-config` is used to enable or override hami-vnpu-core for specific nodes within the cluster. Node-level settings take higher priority than the global `vnpus.hamiVnpuCore` switch. +It also supports `filterDevices` to limit which card IDs are exposed by the device plugin on a specific node, for example: `filterDevices: [0, 1, 2, 3]`. 
+ ```bash kubectl apply -f https://raw.githubusercontent.com/Project-HAMi/ascend-device-plugin/main/ascend-device-node-configmap.yaml ``` diff --git a/README_cn.md b/README_cn.md index cfbb548..7c7585f 100644 --- a/README_cn.md +++ b/README_cn.md @@ -82,6 +82,8 @@ kubectl apply -f https://raw.githubusercontent.com/Project-HAMi/ascend-device-pl `hami-device-node-config` 用于对集群中特定节点的 hami-vnpu-core 进行启用或覆盖。节点级配置的优先级高于全局 `vnpus.hamiVnpuCore` 开关。 +同时支持 `filterDevices`,用于限制某个节点对外暴露的卡号,例如:`filterDevices: [0, 1, 2, 3]`。 + ```bash kubectl apply -f https://raw.githubusercontent.com/Project-HAMi/ascend-device-plugin/main/ascend-device-node-configmap.yaml ``` diff --git a/ascend-device-node-configmap.yaml b/ascend-device-node-configmap.yaml index ca6ee57..e83963e 100644 --- a/ascend-device-node-configmap.yaml +++ b/ascend-device-node-configmap.yaml @@ -13,3 +13,4 @@ data: - name: "cnst-dev-w2" hami-vnpu-core: true vDeviceCount: 8 + filterDevices: [0, 1, 2, 3, 4, 5, 6, 7] diff --git a/internal/manager/manager.go b/internal/manager/manager.go index 8ce1df2..a058e0d 100644 --- a/internal/manager/manager.go +++ b/internal/manager/manager.go @@ -39,11 +39,11 @@ type Device struct { } type AscendManager struct { - mgr *devmanager.DeviceManager - config internal.VNPUConfig + mgr *devmanager.DeviceManager + config internal.VNPUConfig globalConfig internal.Config - devs []*Device - nodeConfig *internal.NodeConfig + devs []*Device + nodeConfig *internal.NodeConfig } func NewAscendManager() (*AscendManager, error) { @@ -58,7 +58,7 @@ func NewAscendManager() (*AscendManager, error) { } func (am *AscendManager) LoadNodeConfig(nodePath string, nodeName string) error { - nodeConfigList, err := internal.LoadNodeConfig(nodePath) + nodeConfigList, err := internal.LoadNodeConfig(nodePath) if err != nil { klog.Warningf("Failed to load node config from %s: %v", nodePath, err) return err @@ -71,11 +71,32 @@ func (am *AscendManager) LoadNodeConfig(nodePath string, nodeName string) error return nil } 
} - + klog.Infof("No specific config found for node %s, will use default settings", nodeName) return nil } +func (am *AscendManager) filteredCardSet() map[int32]struct{} { + if am.nodeConfig == nil || len(am.nodeConfig.FilterDevices) == 0 { + return nil + } + + filtered := make(map[int32]struct{}, len(am.nodeConfig.FilterDevices)) + for _, cardID := range am.nodeConfig.FilterDevices { + filtered[cardID] = struct{}{} + } + return filtered +} + +func (am *AscendManager) shouldIncludeCard(cardID int32) bool { + filtered := am.filteredCardSet() + if len(filtered) == 0 { + return true + } + _, ok := filtered[cardID] + return ok +} + func (am *AscendManager) LoadConfig(path string) error { config, err := internal.LoadConfig(path) if err != nil { @@ -141,6 +162,10 @@ func (am *AscendManager) UpdateDevice() error { klog.Errorf("failed to get card id from device id: %v", err) return err } + if !am.shouldIncludeCard(cardID) { + klog.V(4).Infof("skip filtered cardID=%d logicID=%d phyID=%d deviceID=%d", cardID, ID, phyID, deviceID) + continue + } uuid, err := am.mgr.GetDieID(ID, dcmi.VDIE) if err != nil { klog.Errorf("failed to get uuid from device id: %v", err) @@ -183,7 +208,18 @@ func (am *AscendManager) GetIDs() []int32 { if err != nil { return nil } - return IDs + filteredIDs := make([]int32, 0, len(IDs)) + for _, id := range IDs { + cardID, _, err := am.mgr.GetCardIDDeviceID(id) + if err != nil { + klog.Warningf("failed to get card/device ID for logic ID %d: %v", id, err) + continue + } + if am.shouldIncludeCard(cardID) { + filteredIDs = append(filteredIDs, id) + } + } + return filteredIDs } func (am *AscendManager) GetUnHealthIDs() []int32 { @@ -193,6 +229,14 @@ func (am *AscendManager) GetUnHealthIDs() []int32 { } var unhealthy []int32 for _, d := range IDs { + cardID, _, err := am.mgr.GetCardIDDeviceID(d) + if err != nil { + klog.Warningf("failed to get card/device ID for logic ID %d: %v", d, err) + continue + } + if !am.shouldIncludeCard(cardID) { + continue + } 
healthCode, err := am.mgr.GetDeviceHealth(d) if err != nil { continue @@ -220,6 +264,10 @@ func (am *AscendManager) CleanupIdleVNPUs() error { klog.Warningf("failed to get card/device ID for logic ID %d: %v", logicID, err) continue } + if !am.shouldIncludeCard(cardID) { + klog.V(4).Infof("skip cleanup on filtered cardID=%d logicID=%d deviceID=%d", cardID, logicID, deviceID) + continue + } // Obtain all vNPU information on this device vDevInfos, err := am.mgr.GetVirtualDeviceInfo(logicID) if err != nil { @@ -254,9 +302,8 @@ func (am *AscendManager) CleanupIdleVNPUs() error { return nil } - func (am *AscendManager) GetNodeConfig() *internal.NodeConfig { - return am.nodeConfig + return am.nodeConfig } func (am *AscendManager) IsHamiVnpuCore() bool { @@ -264,4 +311,4 @@ func (am *AscendManager) IsHamiVnpuCore() bool { return am.nodeConfig.HamiVnpuCore } return am.globalConfig.VNPUs.HamiVnpuCore -} \ No newline at end of file +} diff --git a/internal/vnpu.go b/internal/vnpu.go index 9872d72..e43aef2 100644 --- a/internal/vnpu.go +++ b/internal/vnpu.go @@ -63,15 +63,15 @@ func LoadConfig(path string) (*Config, error) { return &yamlData, nil } - type NodeConfig struct { - Name string `json:"name"` - HamiVnpuCore bool `json:"hami-vnpu-core"` - VDeviceCount int `json:"vDeviceCount"` + Name string `json:"name" yaml:"name"` + HamiVnpuCore bool `json:"hami-vnpu-core" yaml:"hami-vnpu-core"` + VDeviceCount int `json:"vDeviceCount" yaml:"vDeviceCount"` + FilterDevices []int32 `json:"filterDevices,omitempty" yaml:"filterDevices,omitempty"` } type NodeListConfig struct { - Nodes []NodeConfig `json:"nodes"` + Nodes []NodeConfig `json:"nodes" yaml:"nodes"` } func LoadNodeConfig(path string) (*NodeListConfig, error) { @@ -85,4 +85,4 @@ func LoadNodeConfig(path string) (*NodeListConfig, error) { return nil, err } return &yamlData, nil -} \ No newline at end of file +} From 461d419135c40474b7ec6b50f5f1360a7d08d9b7 Mon Sep 17 00:00:00 2001 From: houyuxi Date: Thu, 14 May 2026 19:54:28 
+0800 Subject: [PATCH 07/17] fix(allocate): resolve pod variable shadowing and optimize allocation loop The `pod, err :=` declaration inside the Allocate loop shadowed the outer `pod` variable, causing the defer closure to always see nil and preventing podAllocationTrySuccess/podAllocationFailed from executing. Additionally: - Move `GetPendingPod`, `buildRuntimeInfoLookup`, and annotation decode outside the loop to eliminate redundant per-container API calls - Replace custom `decodePodSingleDevice` with `HAMi's DecodePodDevices` - Use `popNextContainerDevices` for find and erase ContainerDevices in memory then call `patchErasedAnnotation` only one times instead of per-container erase - Simplify method docs Signed-off-by: houyuxi --- internal/server/server.go | 190 ++++++++++++-------------------------- 1 file changed, 59 insertions(+), 131 deletions(-) diff --git a/internal/server/server.go b/internal/server/server.go index 7b5c0c5..c30b2e3 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -445,15 +445,8 @@ func (ps *PluginServer) watchAndRegister() { } } -// buildContainerAllocateResponse constructs the ContainerAllocateResponse for -// the given container devices. It resolves UUIDs to PhyIDs and looks up vNPU -// template names from the allocAnno annotation. -func (ps *PluginServer) buildContainerAllocateResponse(pod *v1.Pod, containerDevs device.ContainerDevices) (*v1beta1.ContainerAllocateResponse, error) { - rtInfoLookup, err := ps.buildRuntimeInfoLookup(pod) - if err != nil { - return nil, fmt.Errorf("build runtimeInfo lookup: %w", err) - } - +// buildContainerAllocateResponse builds the allocate response for a single container. 
+func (ps *PluginServer) buildContainerAllocateResponse(pod *v1.Pod, containerDevs device.ContainerDevices, rtInfoLookup map[string]RuntimeInfo) (*v1beta1.ContainerAllocateResponse, error) { resp := &v1beta1.ContainerAllocateResponse{} var ( @@ -556,31 +549,37 @@ func (ps *PluginServer) buildContainerAllocateResponse(pod *v1.Pod, containerDev return resp, nil } -// getNextContainerDevices reads the toAllocDeviceAnno annotation and returns -// the devices of the first container that still has pending allocations. -func (ps *PluginServer) getNextContainerDevices(pod *v1.Pod) (device.ContainerDevices, error) { - anno, ok := pod.Annotations[ps.toAllocDeviceAnno] - if !ok { - return nil, fmt.Errorf("annotation %s not found", ps.toAllocDeviceAnno) +// popNextContainerDevices finds and erases the first non-empty containerDevices +// from podSingleDev. It mutates podSingleDev in place. +func (ps *PluginServer) popNextContainerDevices(podSingleDev device.PodSingleDevice) (device.ContainerDevices, error) { + for i, ctrDevs := range podSingleDev { + if len(ctrDevs) > 0 { + podSingleDev[i] = device.ContainerDevices{} + return ctrDevs, nil + } } - podSingleDev, err := decodePodSingleDevice(anno) + return nil, fmt.Errorf("no pending device allocation found") +} + +// decodeDeviceAnnotations decodes the pod's device allocation annotation +// (registered as hami.io/-devices-to-allocate in InRequestDevices) +// into a PodSingleDevice. 
+func (ps *PluginServer) decodeDeviceAnnotations(pod *v1.Pod) (device.PodSingleDevice, error) { + pdevices, err := device.DecodePodDevices(device.InRequestDevices, pod.Annotations) if err != nil { - return nil, fmt.Errorf("decode annotation %s: %w", ps.toAllocDeviceAnno, err) + return nil, err } - for _, ctrDevice := range podSingleDev { - if len(ctrDevice) > 0 { - return ctrDevice, nil - } + pd, ok := pdevices[ps.commonWord] + if !ok { + return nil, fmt.Errorf("device %s not found in pod annotations", ps.commonWord) } - return nil, fmt.Errorf("no pending device allocation in annotation %s", ps.toAllocDeviceAnno) + return pd, nil } -// buildTempLookup reads the allocAnno annotation and returns a map from -// device UUID to its vNPU template name. +// buildRuntimeInfoLookup builds a UUID-to-RuntimeInfo lookup from the pod's allocAnno annotation. func (ps *PluginServer) buildRuntimeInfoLookup(pod *v1.Pod) (map[string]RuntimeInfo, error) { anno, ok := pod.Annotations[ps.allocAnno] if !ok { - // The annotation may not exist for non-vNPU scenarios; return empty lookup. return nil, fmt.Errorf("annotation %s not set", ps.allocAnno) } var rtInfo []RuntimeInfo @@ -596,107 +595,21 @@ func (ps *PluginServer) buildRuntimeInfoLookup(pod *v1.Pod) (map[string]RuntimeI return lookup, nil } -// eraseCurrentContainerAnnotation erases the current container's devices from -// the toAllocDeviceAnno annotation, so the next Allocate call will advance -// to the next container. 
-func (ps *PluginServer) eraseCurrentContainerAnnotation(pod *v1.Pod) error { - anno, ok := pod.Annotations[ps.toAllocDeviceAnno] - if !ok { - return fmt.Errorf("annotation %s not found", ps.toAllocDeviceAnno) - } - podSingleDev, err := decodePodSingleDevice(anno) - if err != nil { - return fmt.Errorf("decode annotation %s: %w", ps.toAllocDeviceAnno, err) - } - res := make(device.PodSingleDevice, 0, len(podSingleDev)) - found := false - for _, val := range podSingleDev { - if found { - res = append(res, val) - } else { - if len(val) > 0 { - found = true - res = append(res, device.ContainerDevices{}) - } else { - res = append(res, val) - } - } - } - klog.V(5).Infof("After erase annotation, remaining devices: %v", res) - newAnnoValue := device.EncodePodSingleDevice(res) +// patchErasedAnnotation patches the pod's device annotation with the given +// podSingleDev. It also updates pod.Annotations in place. +func (ps *PluginServer) patchErasedAnnotation(pod *v1.Pod, podSingleDev device.PodSingleDevice) error { + klog.V(5).Infof("After erase annotation, remaining devices: %v", podSingleDev) + newAnnoValue := device.EncodePodSingleDevice(podSingleDev) newAnnos := map[string]string{ ps.toAllocDeviceAnno: newAnnoValue, } if err := util.PatchPodAnnotations(pod, newAnnos); err != nil { return err } - // Update in-memory pod annotations so subsequent getNextContainerDevices - // calls within the same Allocate see the erased state. pod.Annotations[ps.toAllocDeviceAnno] = newAnnoValue return nil } -// decodePodSingleDevice decodes a single annotation value string into a -// PodSingleDevice. It reuses HAMi's DecodeContainerDevices for per-device -// parsing. The format is: -// -// ,:,;,;... 
-func decodePodSingleDevice(str string) (device.PodSingleDevice, error) { - if len(str) == 0 { - return device.PodSingleDevice{}, nil - } - pd := make(device.PodSingleDevice, 0) - for _, s := range strings.Split(str, device.OnePodMultiContainerSplitSymbol) { - cd, err := device.DecodeContainerDevices(s) - if err != nil { - return nil, err - } - if len(cd) == 0 { - continue - } - pd = append(pd, cd) - } - return pd, nil -} - -func (ps *PluginServer) parsePodAnnotation(pod *v1.Pod) ([]int32, []string, []*int64, []*int32, error) { - anno, ok := pod.Annotations[ps.allocAnno] - if !ok { - return nil, nil, nil, nil, fmt.Errorf("annotation %s not set", "huawei.com/Ascend") - } - var rtInfo []RuntimeInfo - err := json.Unmarshal([]byte(anno), &rtInfo) - if err != nil { - return nil, nil, nil, nil, fmt.Errorf("annotation %s value %s invalid: %w", ps.allocAnno, anno, err) - } - var IDs []int32 - var temps []string - var memories []*int64 - var cores []*int32 - - for _, info := range rtInfo { - if info.UUID == "" { - continue - } - d := ps.mgr.GetDeviceByUUID(info.UUID) - if d == nil { - return nil, nil, nil, nil, fmt.Errorf("unknown uuid: %s", info.UUID) - } - IDs = append(IDs, d.PhyID) - temps = append(temps, info.Temp) - if info.Memory != nil { - memories = append(memories, info.Memory) - } - if info.Core != nil { - cores = append(cores, info.Core) - } - } - if len(IDs) == 0 { - return nil, nil, nil, nil, fmt.Errorf("annotation %s value %s invalid", ps.allocAnno, anno) - } - return IDs, temps, memories, cores, nil -} - func (ps *PluginServer) apiDevices() []*v1beta1.Device { devs := ps.mgr.GetDevices() devices := make([]*v1beta1.Device, 0, len(devs)) @@ -753,16 +666,30 @@ func (ps *PluginServer) Allocate(ctx context.Context, reqs *v1beta1.AllocateRequ } }() + var err error + pod, err = util.GetPendingPod(ctx, ps.nodeName) + if err != nil { + klog.Errorf("get pending pod error: %v", err) + return nil, fmt.Errorf("get pending pod error: %w", err) + } + klog.Infof("allocating 
for pod %s/%s", pod.Namespace, pod.Name) + + rtInfoLookup, err := ps.buildRuntimeInfoLookup(pod) + if err != nil { + return nil, fmt.Errorf("build runtimeInfo lookup: %w", err) + } + + podSingleDev, err := ps.decodeDeviceAnnotations(pod) + if err != nil { + return nil, fmt.Errorf("decode device annotations: %w", err) + } + + // kubelet may call Allocate multiple times for the same pod, each time with + // a subset of containers. Use pop semantics to match each request with its + // corresponding containerDevices. responses := v1beta1.AllocateResponse{} for _, req := range reqs.ContainerRequests { - pod, err := util.GetPendingPod(ctx, ps.nodeName) - if err != nil { - klog.Errorf("get pending pod error: %v", err) - return nil, fmt.Errorf("get pending pod error: %w", err) - } - klog.Infof("allocating for pod %s/%s", pod.Namespace, pod.Name) - - containerDevs, err := ps.getNextContainerDevices(pod) + containerDevs, err := ps.popNextContainerDevices(podSingleDev) if err != nil { return nil, fmt.Errorf("get next container devices: %w", err) } @@ -772,18 +699,19 @@ func (ps *PluginServer) Allocate(ctx context.Context, reqs *v1beta1.AllocateRequ return nil, fmt.Errorf("device number not matched: annotation has %d, request has %d", len(containerDevs), len(req.DevicesIDs)) } - resp, err := ps.buildContainerAllocateResponse(pod, containerDevs) + resp, err := ps.buildContainerAllocateResponse(pod, containerDevs, rtInfoLookup) if err != nil { return nil, fmt.Errorf("build container allocate response: %w", err) } - - if err := ps.eraseCurrentContainerAnnotation(pod); err != nil { - klog.Errorf("erase current container annotation error: %v", err) - return nil, fmt.Errorf("erase current container annotation: %w", err) - } - responses.ContainerResponses = append(responses.ContainerResponses, resp) } + + // Patch the annotation with the in-memory erased podSingleDev. 
+ if err := ps.patchErasedAnnotation(pod, podSingleDev); err != nil { + klog.Errorf("erase allocated containers annotation error: %v", err) + return nil, fmt.Errorf("erase allocated containers annotation: %w", err) + } + klog.V(5).Infof("allocate response: %+v", responses.ContainerResponses) success = true return &responses, nil From 27b929f9f77db002733805747abedc0575cddf69 Mon Sep 17 00:00:00 2001 From: houyuxi Date: Thu, 14 May 2026 19:57:50 +0800 Subject: [PATCH 08/17] fix(server): remove redundant non-zero IDs length condition Signed-off-by: houyuxi --- internal/server/server.go | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/internal/server/server.go b/internal/server/server.go index c30b2e3..58297c5 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -534,13 +534,9 @@ func (ps *PluginServer) buildContainerAllocateResponse(pod *v1.Pod, containerDev klog.V(4).InfoS("Core priority set", "value", *cores[0]) } - // Set GLOBAL_SHM_PATH separated by device ID. - if len(IDs) > 0 { - resp.Envs["NPU_GLOBAL_SHM_PATH"] = fmt.Sprintf("/hami-shared-region/%d_global_registry", IDs[0]) - klog.V(5).Infof("Create %d_global_registry", IDs[0]) - } else { - klog.Warningf("No device IDs allocated") - } + // Set GLOBAL_SHM_PATH based on the first device ID. 
+ resp.Envs["NPU_GLOBAL_SHM_PATH"] = fmt.Sprintf("/hami-shared-region/%d_global_registry", IDs[0]) + klog.V(5).Infof("Create %d_global_registry", IDs[0]) } else { if ascendVNPUSpec != "" { resp.Envs["ASCEND_VNPU_SPECS"] = ascendVNPUSpec From 54eb8f064daad63c7e4520e9ed9acdec73567145 Mon Sep 17 00:00:00 2001 From: houyuxi Date: Thu, 14 May 2026 20:30:49 +0800 Subject: [PATCH 09/17] fix(manager): add RWMutex for `AscendManager.dev` Signed-off-by: houyuxi --- internal/manager/manager.go | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/internal/manager/manager.go b/internal/manager/manager.go index 3fd82b5..bea438e 100644 --- a/internal/manager/manager.go +++ b/internal/manager/manager.go @@ -19,6 +19,7 @@ package manager import ( "fmt" "sort" + "sync" "ascend-common/devmanager" "ascend-common/devmanager/dcmi" @@ -39,6 +40,7 @@ type Device struct { } type AscendManager struct { + mu sync.RWMutex mgr *devmanager.DeviceManager config internal.VNPUConfig globalConfig internal.Config @@ -129,7 +131,7 @@ func (am *AscendManager) UpdateDevice() error { return err } - am.devs = make([]*Device, 0, len(IDs)) + newDevs := make([]*Device, 0, len(IDs)) for _, ID := range IDs { phyID, err := am.mgr.GetPhysicIDFromLogicID(ID) if err != nil { @@ -151,7 +153,7 @@ func (am *AscendManager) UpdateDevice() error { klog.Errorf("failed to get device health: %v", err) return err } - am.devs = append(am.devs, &Device{ + newDevs = append(newDevs, &Device{ UUID: uuid, LogicID: ID, PhyID: phyID, @@ -162,14 +164,21 @@ func (am *AscendManager) UpdateDevice() error { Health: health == 0, }) } + am.mu.Lock() + am.devs = newDevs + am.mu.Unlock() return nil } func (am *AscendManager) GetDevices() []*Device { + am.mu.RLock() + defer am.mu.RUnlock() return am.devs } func (am *AscendManager) GetDeviceByUUID(UUID string) *Device { + am.mu.RLock() + defer am.mu.RUnlock() for _, dev := range am.devs { if dev.UUID == UUID { return dev From 
444d4f5e8268a960ed79cacb654e88fd29f12c66 Mon Sep 17 00:00:00 2001 From: houyuxi Date: Thu, 14 May 2026 20:34:24 +0800 Subject: [PATCH 10/17] fix(server): exit the for loop in goroutine in `serve` method when `stopCh` closed Signed-off-by: houyuxi --- internal/server/server.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/internal/server/server.go b/internal/server/server.go index 58297c5..881cb5b 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -293,6 +293,11 @@ func (ps *PluginServer) serve() error { lastCrashTime := time.Now() restartCount := 0 for { + select { + case <-ps.stopCh: + return + default: + } klog.Infof("Starting GRPC server for '%s'", resourceName) err := ps.grpcServer.Serve(sock) if err == nil { From b216b6b3a46869340dd9b674db8261aa7a5b967f Mon Sep 17 00:00:00 2001 From: houyuxi Date: Thu, 14 May 2026 20:52:09 +0800 Subject: [PATCH 11/17] fix(manager): use %w to preserve error chain in CleanupIdleVNPUs Signed-off-by: houyuxi --- internal/manager/manager.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/manager/manager.go b/internal/manager/manager.go index bea438e..6221671 100644 --- a/internal/manager/manager.go +++ b/internal/manager/manager.go @@ -218,7 +218,7 @@ func (am *AscendManager) CleanupIdleVNPUs() error { _, IDs, err := am.mgr.GetDeviceList() if err != nil { - return fmt.Errorf("failed to get device list: %v", err) + return fmt.Errorf("failed to get device list: %w", err) } klog.Infof("Found %d devices to check for idle vNPUs,%+v", len(IDs), IDs) From 4e1bcea3b7ce7196433152f7627dd0b5192bcf05 Mon Sep 17 00:00:00 2001 From: houyuxi Date: Thu, 14 May 2026 20:52:34 +0800 Subject: [PATCH 12/17] fix(manager): add error logging in GetIDs and GetUnHealthIDs Signed-off-by: houyuxi --- internal/manager/manager.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/internal/manager/manager.go b/internal/manager/manager.go index 6221671..3d79991 100644 --- a/internal/manager/manager.go 
+++ b/internal/manager/manager.go @@ -190,6 +190,7 @@ func (am *AscendManager) GetDeviceByUUID(UUID string) *Device { func (am *AscendManager) GetIDs() []int32 { _, IDs, err := am.mgr.GetDeviceList() if err != nil { + klog.Errorf("failed to get device list: %v", err) return nil } return IDs @@ -204,6 +205,7 @@ func (am *AscendManager) GetUnHealthIDs() []int32 { for _, d := range IDs { healthCode, err := am.mgr.GetDeviceHealth(d) if err != nil { + klog.Warningf("failed to get device health for %d: %v", d, err) continue } if healthCode != 0 { From 15ad42b11ded0bf7a310c07f8dccbb70b03e2609 Mon Sep 17 00:00:00 2001 From: houyuxi Date: Thu, 14 May 2026 20:53:19 +0800 Subject: [PATCH 13/17] fix(manager): wrap bare errors with context in NewAscendManager and LoadConfig Signed-off-by: houyuxi --- internal/manager/manager.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/internal/manager/manager.go b/internal/manager/manager.go index 3d79991..b3af1ec 100644 --- a/internal/manager/manager.go +++ b/internal/manager/manager.go @@ -51,7 +51,7 @@ type AscendManager struct { func NewAscendManager() (*AscendManager, error) { mgr, err := devmanager.AutoInit("", 30) if err != nil { - return nil, err + return nil, fmt.Errorf("failed to auto-init device manager: %w", err) } return &AscendManager{ mgr: mgr, @@ -81,11 +81,11 @@ func (am *AscendManager) LoadNodeConfig(nodePath string, nodeName string) error func (am *AscendManager) LoadConfig(path string) error { config, err := internal.LoadConfig(path) if err != nil { - return err + return fmt.Errorf("failed to load config from %s: %w", path, err) } chipInfo, err := am.mgr.GetValidChipInfo() if err != nil { - return err + return fmt.Errorf("failed to get valid chip info: %w", err) } if chipInfo.Type != "Ascend" { return fmt.Errorf("chip type is not Ascend") From f54b1babf8ead821d3134953f4f6faf65151c204 Mon Sep 17 00:00:00 2001 From: houyuxi Date: Thu, 14 May 2026 20:53:51 +0800 Subject: [PATCH 14/17] fix(server): 
use errors.Is instead of string matching for ETXTBSY Signed-off-by: houyuxi --- internal/server/server.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/internal/server/server.go b/internal/server/server.go index 881cb5b..45f240f 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -21,6 +21,7 @@ import ( "crypto/sha256" "encoding/hex" "encoding/json" + "errors" "flag" "fmt" "io" @@ -30,6 +31,7 @@ import ( "path/filepath" "strconv" "strings" + "syscall" "time" "google.golang.org/grpc" @@ -168,7 +170,7 @@ func prepareHostResources() error { } if err := copyFile(srcPath, destPath); err != nil { - if strings.Contains(err.Error(), "text file busy") { + if errors.Is(err, syscall.ETXTBSY) { klog.Warningf("⚠ %s is in use by running process, keeping existing version (safe)", destPath) continue } From 644b6a44dee540d74f231c0e822c4ea627ceb249 Mon Sep 17 00:00:00 2001 From: houyuxi Date: Thu, 14 May 2026 20:54:14 +0800 Subject: [PATCH 15/17] fix(server): wrap bare errors with context in serve and registerKubelet Signed-off-by: houyuxi --- internal/server/server.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/internal/server/server.go b/internal/server/server.go index 45f240f..4fff85a 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -329,7 +329,7 @@ func (ps *PluginServer) serve() error { // Wait for server to start by launching a blocking connexion conn, err := ps.dial(ps.socket, 5*time.Second) if err != nil { - return err + return fmt.Errorf("failed to dial device plugin socket: %w", err) } _ = conn.Close() @@ -339,7 +339,7 @@ func (ps *PluginServer) serve() error { func (ps *PluginServer) registerKubelet() error { conn, err := ps.dial(v1beta1.KubeletSocket, 5*time.Second) if err != nil { - return err + return fmt.Errorf("failed to dial kubelet socket: %w", err) } defer func(conn *grpc.ClientConn) { _ = conn.Close() @@ -356,7 +356,7 @@ func (ps *PluginServer) registerKubelet() 
error { _, err = client.Register(context.Background(), reqt) if err != nil { - return err + return fmt.Errorf("failed to register device plugin with kubelet: %w", err) } return nil } From 22dfc0b4d015f509fc91fdf8513b3a6745d093a9 Mon Sep 17 00:00:00 2001 From: houyuxi Date: Fri, 15 May 2026 14:40:53 +0800 Subject: [PATCH 16/17] build(deps): add the missing npu-exporter dependency Signed-off-by: houyuxi --- go.mod | 68 ++++++++++--------- go.sum | 208 +++++++++++++++++++++++++++++---------------------------- 2 files changed, 140 insertions(+), 136 deletions(-) diff --git a/go.mod b/go.mod index d04534c..767178e 100644 --- a/go.mod +++ b/go.mod @@ -1,58 +1,60 @@ module github.com/Project-HAMi/ascend-device-plugin -go 1.22.2 +go 1.24.6 require ( ascend-common v0.0.0 github.com/Project-HAMi/HAMi v0.0.0 - github.com/fsnotify/fsnotify v1.7.0 - google.golang.org/grpc v1.63.2 - k8s.io/api v0.29.3 - k8s.io/apimachinery v0.29.3 - k8s.io/klog/v2 v2.120.1 - k8s.io/kubelet v0.29.3 + github.com/fsnotify/fsnotify v1.9.0 + google.golang.org/grpc v1.75.0 + huawei.com/npu-exporter v0.0.0-00010101000000-000000000000 + k8s.io/api v0.33.0 + k8s.io/apimachinery v0.33.0 + k8s.io/klog/v2 v2.130.1 + k8s.io/kubelet v0.31.3 ) require ( - github.com/davecgh/go-spew v1.1.1 // indirect + github.com/ccoveille/go-safecast v1.6.1 // indirect + github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/emicklei/go-restful/v3 v3.11.3 // indirect - github.com/go-logr/logr v1.4.1 // indirect - github.com/go-openapi/jsonpointer v0.20.2 // indirect + github.com/fxamacker/cbor/v2 v2.7.0 // indirect + github.com/go-logr/logr v1.4.3 // indirect + github.com/go-openapi/jsonpointer v0.21.0 // indirect github.com/go-openapi/jsonreference v0.20.4 // indirect - github.com/go-openapi/swag v0.22.9 // indirect + github.com/go-openapi/swag v0.23.0 // indirect github.com/gogo/protobuf v1.3.2 // indirect - github.com/golang/protobuf v1.5.4 // indirect - github.com/google/gnostic-models 
v0.6.8 // indirect - github.com/google/gofuzz v1.2.0 // indirect + github.com/google/gnostic-models v0.6.9 // indirect + github.com/google/go-cmp v0.7.0 // indirect github.com/google/uuid v1.6.0 // indirect - github.com/imdario/mergo v0.3.16 // indirect + github.com/influxdata/telegraf v1.26.3 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/mailru/easyjson v0.7.7 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect - github.com/onsi/ginkgo/v2 v2.17.1 // indirect - github.com/onsi/gomega v1.32.0 // indirect + github.com/pkg/errors v0.9.1 // indirect github.com/smartystreets/goconvey v1.7.2 // indirect - github.com/spf13/pflag v1.0.5 // indirect - golang.org/x/net v0.26.0 // indirect - golang.org/x/oauth2 v0.17.0 // indirect - golang.org/x/sys v0.21.0 // indirect - golang.org/x/term v0.21.0 // indirect - golang.org/x/text v0.16.0 // indirect - golang.org/x/time v0.5.0 // indirect - google.golang.org/appengine v1.6.8 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20240227224415-6ceb2ff114de // indirect - google.golang.org/protobuf v1.33.0 // indirect + github.com/spf13/pflag v1.0.7 // indirect + github.com/x448/float16 v0.8.4 // indirect + golang.org/x/net v0.43.0 // indirect + golang.org/x/oauth2 v0.30.0 // indirect + golang.org/x/sys v0.35.0 // indirect + golang.org/x/term v0.34.0 // indirect + golang.org/x/text v0.28.0 // indirect + golang.org/x/time v0.9.0 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20250707201910-8d1bb00bc6a7 // indirect + google.golang.org/protobuf v1.36.8 // indirect + gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect - gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/client-go v0.29.3 // indirect - 
k8s.io/kube-openapi v0.0.0-20240227032403-f107216b40e2 // indirect - k8s.io/utils v0.0.0-20240102154912-e7106e64919e // indirect - sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect - sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect + k8s.io/client-go v0.33.0 // indirect + k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect + k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 // indirect + sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect + sigs.k8s.io/randfill v1.0.0 // indirect + sigs.k8s.io/structured-merge-diff/v4 v4.6.0 // indirect sigs.k8s.io/yaml v1.4.0 // indirect ) diff --git a/go.sum b/go.sum index af6584a..d53050b 100644 --- a/go.sum +++ b/go.sum @@ -1,51 +1,49 @@ -gitee.com/ascend/ascend-npu-exporter/v6 v6.0.0-RC3 h1:gmcdFAckl3OCubjk8Mz7jgYWBHm+7pzkmQ19/afghhY= -gitee.com/ascend/ascend-npu-exporter/v6 v6.0.0-RC3/go.mod h1:tQw2ukw5YzlXWJa5cDfY8TNcTiBieor69lsdHFEiMZ8= -github.com/Project-HAMi/HAMi v0.0.0-20250107033239-d04fc8baaad6 h1:5SbvXn7H5spMTgCM4+sF6zm113WVCceUuOuwItkqELY= -github.com/Project-HAMi/HAMi v0.0.0-20250107033239-d04fc8baaad6/go.mod h1:lY4bmpcPiKWg0bVPCJFRH6xDW8p5PouIk/nIIU1I2d8= +github.com/Project-HAMi/HAMi v0.0.0-20250901013025-61c6cbe7d480 h1:2rV+Gpy2+1fDOpQBPPXE3YG6nwfaO8DZjyCH+ARAmMY= +github.com/Project-HAMi/HAMi v0.0.0-20250901013025-61c6cbe7d480/go.mod h1:KgE6IKrLJBAp6YrToFRFLDXHXctsZ6wXvNHMWY6ZbBU= github.com/agiledragon/gomonkey/v2 v2.8.0 h1:u2K2nNGyk0ippzklz1CWalllEB9ptD+DtSXeCX5O000= github.com/agiledragon/gomonkey/v2 v2.8.0/go.mod h1:ap1AmDzcVOAz1YpeJ3TCzIgstoaWLA6jbbgxfB4w2iY= +github.com/ccoveille/go-safecast v1.6.1 h1:Nb9WMDR8PqhnKCVs2sCB+OqhohwO5qaXtCviZkIff5Q= +github.com/ccoveille/go-safecast v1.6.1/go.mod h1:QqwNjxQ7DAqY0C721OIO9InMk9zCwcsO7tnRuHytad8= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod 
h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/emicklei/go-restful/v3 v3.11.3 h1:yagOQz/38xJmcNeZJtrUcKjkHRltIaIFXKWeG1SkWGE= github.com/emicklei/go-restful/v3 v3.11.3/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= -github.com/evanphx/json-patch v5.9.0+incompatible h1:fBXyNpNMuTTDdquAq/uisOr2lShz4oaXpDTX2bLe7ls= -github.com/evanphx/json-patch v5.9.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= -github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= -github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= -github.com/go-logr/logr v1.4.1 h1:pKouT5E8xu9zeFC39JXRDukb6JFQPXM5p5I91188VAQ= -github.com/go-logr/logr v1.4.1/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= -github.com/go-openapi/jsonpointer v0.20.2 h1:mQc3nmndL8ZBzStEo3JYF8wzmeWffDH4VbXz58sAx6Q= -github.com/go-openapi/jsonpointer v0.20.2/go.mod h1:bHen+N0u1KEO3YlmqOjTT9Adn1RfD91Ar825/PuiRVs= +github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= +github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= +github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E= +github.com/fxamacker/cbor/v2 v2.7.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ= +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= 
+github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= github.com/go-openapi/jsonreference v0.20.4 h1:bKlDxQxQJgwpUSgOENiMPzCTBVuc7vTdXSSgNeAhojU= github.com/go-openapi/jsonreference v0.20.4/go.mod h1:5pZJyJP2MnYCpoeoMAql78cCHauHj0V9Lhc506VOpw4= -github.com/go-openapi/swag v0.22.9 h1:XX2DssF+mQKM2DHsbgZK74y/zj4mo9I99+89xUmuZCE= -github.com/go-openapi/swag v0.22.9/go.mod h1:3/OXnFfnMAwBD099SwYRk7GD3xOrr1iL7d/XNLXVVwE= -github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI= -github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572/go.mod h1:9Pwr4B2jHnOSGXyyzV8ROjYa2ojvAY6HCGYYfMoC3Ls= +github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= +github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= +github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= +github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= -github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= -github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= -github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I= -github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U= -github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/gnostic-models v0.6.9 h1:MU/8wDLif2qCXZmzncUQ/BOfxWfthHi63KqpoNbWqVw= +github.com/google/gnostic-models v0.6.9/go.mod 
h1:CiWsm0s6BSQd1hRn8/QmxqB6BesYcbSZxsz9b0KuDBw= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= -github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= -github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= -github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 h1:K6RDEckDVWvDI9JAJYCmNdQXq6neHJOYx3V6jnqNEec= -github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= +github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J0b1vyeLSOYI8bm5wbJM/8yDe8= +github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 h1:EGx4pi6eqNxGaHF6qqu48+N2wcFQ5qg5FXgOdqsJ5d8= github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= -github.com/imdario/mergo v0.3.16 h1:wwQJbIsHYGMUyLSPrEq1CT16AhnhNJQ51+4fdHUnCl4= -github.com/imdario/mergo v0.3.16/go.mod h1:WBLT9ZmE3lPoWsEzCh9LPo3TiwVN+ZKEjmz+hD27ysY= +github.com/influxdata/telegraf v1.26.3 h1:wawD3VTdnPDbHnJ1RBGgCf0YB7vlxREZ70rvEepHdGs= +github.com/influxdata/telegraf v1.26.3/go.mod h1:w+VUZ4NRDzfhRmhEdBbbNZBNT7E8qRkLiL73j/pD0ug= github.com/josharian/intern v1.0.0 
h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= @@ -67,123 +65,127 @@ github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9G github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= -github.com/onsi/ginkgo/v2 v2.17.1 h1:V++EzdbhI4ZV4ev0UTIj0PzhzOcReJFyJaLjtSF55M8= -github.com/onsi/ginkgo/v2 v2.17.1/go.mod h1:llBI3WDLL9Z6taip6f33H76YcWtJv+7R3HigUjbIBOs= -github.com/onsi/gomega v1.32.0 h1:JRYU78fJ1LPxlckP6Txi/EYqJvjtMrDC04/MM5XRHPk= -github.com/onsi/gomega v1.32.0/go.mod h1:a4x4gW6Pz2yK1MAmvluYme5lvYTn61afQ2ETw/8n4Lg= +github.com/onsi/ginkgo/v2 v2.23.4 h1:ktYTpKJAVZnDT4VjxSbiBenUjmlL/5QkBEocaWXiQus= +github.com/onsi/ginkgo/v2 v2.23.4/go.mod h1:Bt66ApGPBFzHyR+JO10Zbt0Gsp4uWxu5mIOTusL46e8= +github.com/onsi/gomega v1.38.0 h1:c/WX+w8SLAinvuKKQFh77WEucCnPk4j2OTUr7lt7BeY= +github.com/onsi/gomega v1.38.0/go.mod h1:OcXcwId0b9QsE7Y49u+BTrL4IdKOBOKnD6VQNTJEB6o= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= -github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/pmezard/go-difflib 
v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= +github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= github.com/smartystreets/assertions v1.2.0 h1:42S6lae5dvLc7BrLu/0ugRtcFVjoJNMC/N3yZFZkDFs= github.com/smartystreets/assertions v1.2.0/go.mod h1:tcbTF8ujkAEcZ8TElKY+i30BzYlVhC/LOxJk7iOWnoo= github.com/smartystreets/goconvey v1.7.2 h1:9RBaZCeXEQ3UselpuwUQHltGVXvdwm6cv1hgR6gDIPg= github.com/smartystreets/goconvey v1.7.2/go.mod h1:Vw0tHAZW6lzCRk3xgdin6fKYcG+G3Pg9vgXWeJpQFMM= -github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= -github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/pflag v1.0.7 h1:vN6T9TfwStFPFM5XzjsvmzZkLuaLX+HS+0SeFLRgU6M= +github.com/spf13/pflag v1.0.7/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= +github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= -github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= -github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= +github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= +github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod 
h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= +go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= +go.opentelemetry.io/otel v1.37.0 h1:9zhNfelUvx0KBfu/gb+ZgeAfAgtWrfHJZcAqFC228wQ= +go.opentelemetry.io/otel v1.37.0/go.mod h1:ehE/umFRLnuLa/vSccNq9oS1ErUlkkK71gMcN34UG8I= +go.opentelemetry.io/otel/metric v1.37.0 h1:mvwbQS5m0tbmqML4NqK+e3aDiO02vsf/WgbsdpcPoZE= +go.opentelemetry.io/otel/metric v1.37.0/go.mod h1:04wGrZurHYKOc+RKeye86GwKiTb9FKm1WHtO+4EVr2E= +go.opentelemetry.io/otel/sdk v1.37.0 h1:ItB0QUqnjesGRvNcmAcU0LyvkVyGJ2xftD29bWdDvKI= +go.opentelemetry.io/otel/sdk v1.37.0/go.mod h1:VredYzxUvuo2q3WRcDnKDjbdvmO0sCzOvVAiY+yUkAg= +go.opentelemetry.io/otel/sdk/metric v1.37.0 h1:90lI228XrB9jCMuSdA0673aubgRobVZFhbjxHHspCPc= +go.opentelemetry.io/otel/sdk/metric v1.37.0/go.mod h1:cNen4ZWfiD37l5NhS+Keb5RXVWZWpRE+9WyVCpbo5ps= +go.opentelemetry.io/otel/trace v1.37.0 h1:HLdcFNbRQBE2imdSEgm/kwqmQj1Or1l/7bW6mxVK7z4= +go.opentelemetry.io/otel/trace v1.37.0/go.mod h1:TlgrlQ+PtQO5XFerSPUYG0JSgGyryXewPGyayAWSBS0= +go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs= +go.uber.org/automaxprocs v1.6.0/go.mod h1:ifeIMSnPZuznNm6jmdzmU3/bfk01Fe2fotchwEFJ8r8= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= 
-golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= -golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= -golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ= -golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE= -golang.org/x/oauth2 v0.17.0 h1:6m3ZPmLEFdVxKKWnKq4VqZ60gutO35zm+zrAHVmHyDQ= -golang.org/x/oauth2 v0.17.0/go.mod h1:OzPDGQiuQMguemayvdylqddI7qcD9lnSDb+1FiwQ5HA= +golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= +golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= +golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= +golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod 
h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws= -golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= -golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/term v0.21.0 h1:WVXCp+/EBEHOj53Rvu+7KiT/iElMrO8ACK16SMZ3jaA= -golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0= +golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= +golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/term v0.34.0 h1:O/2T7POpk0ZZ7MAzMeWFSg6S5IpWd/RXDlM9hgM3DR4= +golang.org/x/term v0.34.0/go.mod h1:5jC53AEywhIVebHgPVeg0mj8OD3VO9OzclacVrqpaAw= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= -golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= -golang.org/x/text v0.16.0/go.mod 
h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= -golang.org/x/time v0.5.0 h1:o7cqy6amK/52YcAKIPlM3a+Fpj35zvRj2TP+e1xFSfk= -golang.org/x/time v0.5.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= +golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng= +golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU= +golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY= +golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= -golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d h1:vU5i/LfpvrRCpgM/VPfJLg5KjxD3E+hfT1SH+d9zLwg= -golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= +golang.org/x/tools v0.36.0 h1:kWS0uv/zsvHEle1LbV5LE8QujrxB3wfQyxHfhOk0Qkg= +golang.org/x/tools v0.36.0/go.mod h1:WBDiHKJK8YgLHlcQPYQzNCkUxUypCaa5ZegCVutKm+s= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -google.golang.org/appengine 
v1.6.8 h1:IhEN5q69dyKagZPYMSdIjS2HqprW324FRQZJcGqPAsM= -google.golang.org/appengine v1.6.8/go.mod h1:1jJ3jBArFh5pcgW8gCtRJnepW8FzD1V44FJffLiz/Ds= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240227224415-6ceb2ff114de h1:cZGRis4/ot9uVm639a+rHCUaG0JJHEsdyzSQTMX+suY= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240227224415-6ceb2ff114de/go.mod h1:H4O17MA/PE9BsGx3w+a+W2VOLLD1Qf7oJneAoU6WktY= -google.golang.org/grpc v1.63.2 h1:MUeiw1B2maTVZthpU5xvASfTh3LDbxHd6IJ6QQVU+xM= -google.golang.org/grpc v1.63.2/go.mod h1:WAX/8DgncnokcFUldAxq7GeB5DXHDbMF+lLvDomNkRA= -google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= -google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= -google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI= -google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= +gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= +gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250707201910-8d1bb00bc6a7 h1:pFyd6EwwL2TqFf8emdthzeX+gZE1ElRq3iM8pui4KBY= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250707201910-8d1bb00bc6a7/go.mod h1:qQ0YXyHHx3XkvlzUtpXDkS29lDSafHMZBAZDc03LQ3A= +google.golang.org/grpc v1.75.0 h1:+TW+dqTd2Biwe6KKfhE5JpiYIBWq865PhKGSXiivqt4= +google.golang.org/grpc v1.75.0/go.mod h1:JtPAzKiq4v1xcAB2hydNlWI2RnF85XXcV0mhKXr2ecQ= +google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= +google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod 
h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/evanphx/json-patch.v4 v4.12.0 h1:n6jtcsulIzXPJaxegRbvFNNrZDjbij7ny3gmSPG+6V4= +gopkg.in/evanphx/json-patch.v4 v4.12.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= -gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= -gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -gotest.tools/v3 v3.5.1 h1:EENdUnS3pdur5nybKYIh2Vfgc8IUNBjxDPSjtiJcOzU= -gotest.tools/v3 v3.5.1/go.mod h1:isy3WKz7GK6uNw/sbHzfKBLvlvXwUyV06n6brMxxopU= -k8s.io/api v0.29.3 h1:2ORfZ7+bGC3YJqGpV0KSDDEVf8hdGQ6A03/50vj8pmw= -k8s.io/api v0.29.3/go.mod h1:y2yg2NTyHUUkIoTC+phinTnEa3KFM6RZ3szxt014a80= -k8s.io/apimachinery v0.29.3 h1:2tbx+5L7RNvqJjn7RIuIKu9XTsIZ9Z5wX2G22XAa5EU= -k8s.io/apimachinery v0.29.3/go.mod h1:hx/S4V2PNW4OMg3WizRrHutyB5la0iCUbZym+W0EQIU= -k8s.io/client-go v0.29.3 h1:R/zaZbEAxqComZ9FHeQwOh3Y1ZUs7FaHKZdQtIc2WZg= -k8s.io/client-go v0.29.3/go.mod h1:tkDisCvgPfiRpxGnOORfkljmS+UrW+WtXAy2fTvXJB0= -k8s.io/klog/v2 v2.120.1 h1:QXU6cPEOIslTGvZaXvFWiP9VKyeet3sawzTOvdXb4Vw= -k8s.io/klog/v2 v2.120.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= -k8s.io/kube-openapi v0.0.0-20240227032403-f107216b40e2 h1:02WBxjyRwX4rJdl3XlWVjFbXT/kAKCsipoM8hQY3Dwo= -k8s.io/kube-openapi v0.0.0-20240227032403-f107216b40e2/go.mod h1:B7Huvd1LKZtTYmY+nC6rnmN8lyGYT9lifBcPD5epL6k= -k8s.io/kubelet v0.29.3 h1:X9h0ZHzc+eUeNTaksbN0ItHyvGhQ7Z0HPjnQD2oHdwU= -k8s.io/kubelet v0.29.3/go.mod h1:jDiGuTkFOUynyBKzOoC1xRSWlgAZ9UPcTYeFyjr6vas= -k8s.io/utils v0.0.0-20240102154912-e7106e64919e h1:eQ/4ljkx21sObifjzXwlPKpdGLrCfRziVtos3ofG/sQ= 
-k8s.io/utils v0.0.0-20240102154912-e7106e64919e/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= -sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= -sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= -sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4= -sigs.k8s.io/structured-merge-diff/v4 v4.4.1/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08= +gotest.tools/v3 v3.5.2 h1:7koQfIKdy+I8UTetycgUqXWSDwpgv193Ka+qRsmBY8Q= +gotest.tools/v3 v3.5.2/go.mod h1:LtdLGcnqToBH83WByAAi/wiwSFCArdFIUV/xxN4pcjA= +k8s.io/api v0.33.0 h1:yTgZVn1XEe6opVpP1FylmNrIFWuDqe2H0V8CT5gxfIU= +k8s.io/api v0.33.0/go.mod h1:CTO61ECK/KU7haa3qq8sarQ0biLq2ju405IZAd9zsiM= +k8s.io/apimachinery v0.33.0 h1:1a6kHrJxb2hs4t8EE5wuR/WxKDwGN1FKH3JvDtA0CIQ= +k8s.io/apimachinery v0.33.0/go.mod h1:BHW0YOu7n22fFv/JkYOEfkUYNRN0fj0BlvMFWA7b+SM= +k8s.io/client-go v0.33.0 h1:UASR0sAYVUzs2kYuKn/ZakZlcs2bEHaizrrHUZg0G98= +k8s.io/client-go v0.33.0/go.mod h1:kGkd+l/gNGg8GYWAPr0xF1rRKvVWvzh9vmZAMXtaKOg= +k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= +k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= +k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff h1:/usPimJzUKKu+m+TE36gUyGcf03XZEP0ZIKgKj35LS4= +k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff/go.mod h1:5jIi+8yX4RIb8wk3XwBo5Pq2ccx4FP10ohkbSKCZoK8= +k8s.io/kubelet v0.31.3 h1:DIXRAmvVGp42mV2vpA1GCLU6oO8who0/vp3Oq6kSpbI= +k8s.io/kubelet v0.31.3/go.mod h1:KSdbEfNy5VzqUlAHlytA/fH12s+sE1u8fb/8JY9sL/8= +k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 h1:M3sRQVHv7vB20Xc2ybTt7ODCeFj6JSWYFzOFnYeS6Ro= +k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1Ray9eB4DBr+K+/sCJ8= +sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod 
h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo= +sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= +sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= +sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= +sigs.k8s.io/structured-merge-diff/v4 v4.6.0 h1:IUA9nvMmnKWcj5jl84xn+T5MnlZKThmUW1TdblaLVAc= +sigs.k8s.io/structured-merge-diff/v4 v4.6.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps= sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= From 6a0172a904a8c09b16825feb978976ba63422abc Mon Sep 17 00:00:00 2001 From: band-p Date: Thu, 14 May 2026 11:40:21 +0800 Subject: [PATCH 17/17] device node config support filterDevices Signed-off-by: band-p --- README.md | 2 ++ README_cn.md | 2 ++ ascend-device-node-configmap.yaml | 1 + internal/manager/manager.go | 50 ++++++++++++++++++++++++++++++- internal/vnpu.go | 9 +++--- 5 files changed, 59 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 94adff0..80c2d31 100644 --- a/README.md +++ b/README.md @@ -80,6 +80,8 @@ kubectl apply -f https://raw.githubusercontent.com/Project-HAMi/ascend-device-pl The `hami-device-node-config` is used to enable or override hami-vnpu-core for specific nodes within the cluster. Node-level settings take higher priority than the global `vnpus.hamiVnpuCore` switch. +It also supports `filterDevices` to limit which card IDs are exposed by the device plugin on a specific node, for example: `filterDevices: [0, 1, 2, 3]`. 
+ ```bash kubectl apply -f https://raw.githubusercontent.com/Project-HAMi/ascend-device-plugin/main/ascend-device-node-configmap.yaml ``` diff --git a/README_cn.md b/README_cn.md index cfbb548..7c7585f 100644 --- a/README_cn.md +++ b/README_cn.md @@ -82,6 +82,8 @@ kubectl apply -f https://raw.githubusercontent.com/Project-HAMi/ascend-device-pl `hami-device-node-config` 用于对集群中特定节点的 hami-vnpu-core 进行启用或覆盖。节点级配置的优先级高于全局 `vnpus.hamiVnpuCore` 开关。 +同时支持 `filterDevices`,用于限制某个节点对外暴露的卡号,例如:`filterDevices: [0, 1, 2, 3]`。 + ```bash kubectl apply -f https://raw.githubusercontent.com/Project-HAMi/ascend-device-plugin/main/ascend-device-node-configmap.yaml ``` diff --git a/ascend-device-node-configmap.yaml b/ascend-device-node-configmap.yaml index ca6ee57..e83963e 100644 --- a/ascend-device-node-configmap.yaml +++ b/ascend-device-node-configmap.yaml @@ -13,3 +13,4 @@ data: - name: "cnst-dev-w2" hami-vnpu-core: true vDeviceCount: 8 + filterDevices: [0, 1, 2, 3, 4, 5, 6, 7] diff --git a/internal/manager/manager.go b/internal/manager/manager.go index b3af1ec..d83dba8 100644 --- a/internal/manager/manager.go +++ b/internal/manager/manager.go @@ -78,6 +78,27 @@ func (am *AscendManager) LoadNodeConfig(nodePath string, nodeName string) error return nil } +func (am *AscendManager) filteredCardSet() map[int32]struct{} { + if am.nodeConfig == nil || len(am.nodeConfig.FilterDevices) == 0 { + return nil + } + + filtered := make(map[int32]struct{}, len(am.nodeConfig.FilterDevices)) + for _, cardID := range am.nodeConfig.FilterDevices { + filtered[cardID] = struct{}{} + } + return filtered +} + +func (am *AscendManager) shouldIncludeCard(cardID int32) bool { + filtered := am.filteredCardSet() + if len(filtered) == 0 { + return true + } + _, ok := filtered[cardID] + return ok +} + func (am *AscendManager) LoadConfig(path string) error { config, err := internal.LoadConfig(path) if err != nil { @@ -143,6 +164,10 @@ func (am *AscendManager) UpdateDevice() error { klog.Errorf("failed to get card 
id from device id: %v", err) return err } + if !am.shouldIncludeCard(cardID) { + klog.V(4).Infof("skip filtered cardID=%d logicID=%d phyID=%d deviceID=%d", cardID, ID, phyID, deviceID) + continue + } uuid, err := am.mgr.GetDieID(ID, dcmi.VDIE) if err != nil { klog.Errorf("failed to get uuid from device id: %v", err) @@ -193,7 +218,18 @@ func (am *AscendManager) GetIDs() []int32 { klog.Errorf("failed to get device list: %v", err) return nil } - return IDs + filteredIDs := make([]int32, 0, len(IDs)) + for _, id := range IDs { + cardID, _, err := am.mgr.GetCardIDDeviceID(id) + if err != nil { + klog.Warningf("failed to get card/device ID for logic ID %d: %v", id, err) + continue + } + if am.shouldIncludeCard(cardID) { + filteredIDs = append(filteredIDs, id) + } + } + return filteredIDs } func (am *AscendManager) GetUnHealthIDs() []int32 { @@ -203,6 +239,14 @@ func (am *AscendManager) GetUnHealthIDs() []int32 { } var unhealthy []int32 for _, d := range IDs { + cardID, _, err := am.mgr.GetCardIDDeviceID(d) + if err != nil { + klog.Warningf("failed to get card/device ID for logic ID %d: %v", d, err) + continue + } + if !am.shouldIncludeCard(cardID) { + continue + } healthCode, err := am.mgr.GetDeviceHealth(d) if err != nil { klog.Warningf("failed to get device health for %d: %v", d, err) @@ -231,6 +275,10 @@ func (am *AscendManager) CleanupIdleVNPUs() error { klog.Warningf("failed to get card/device ID for logic ID %d: %v", logicID, err) continue } + if !am.shouldIncludeCard(cardID) { + klog.V(4).Infof("skip cleanup on filtered cardID=%d logicID=%d deviceID=%d", cardID, logicID, deviceID) + continue + } // Obtain all vNPU information on this device vDevInfos, err := am.mgr.GetVirtualDeviceInfo(logicID) if err != nil { diff --git a/internal/vnpu.go b/internal/vnpu.go index 2f890e6..e43aef2 100644 --- a/internal/vnpu.go +++ b/internal/vnpu.go @@ -64,13 +64,14 @@ func LoadConfig(path string) (*Config, error) { } type NodeConfig struct { - Name string `json:"name"` - 
HamiVnpuCore bool `json:"hami-vnpu-core"` - VDeviceCount int `json:"vDeviceCount"` + Name string `json:"name" yaml:"name"` + HamiVnpuCore bool `json:"hami-vnpu-core" yaml:"hami-vnpu-core"` + VDeviceCount int `json:"vDeviceCount" yaml:"vDeviceCount"` + FilterDevices []int32 `json:"filterDevices,omitempty" yaml:"filterDevices,omitempty"` } type NodeListConfig struct { - Nodes []NodeConfig `json:"nodes"` + Nodes []NodeConfig `json:"nodes" yaml:"nodes"` } func LoadNodeConfig(path string) (*NodeListConfig, error) {