diff --git a/README.md b/README.md index 94adff0..80c2d31 100644 --- a/README.md +++ b/README.md @@ -80,6 +80,8 @@ kubectl apply -f https://raw.githubusercontent.com/Project-HAMi/ascend-device-pl The `hami-device-node-config` is used to enable or override hami-vnpu-core for specific nodes within the cluster. Node-level settings take higher priority than the global `vnpus.hamiVnpuCore` switch. +It also supports `filterDevices` to limit which card IDs are exposed by the device plugin on a specific node, for example: `filterDevices: [0, 1, 2, 3]`. + ```bash kubectl apply -f https://raw.githubusercontent.com/Project-HAMi/ascend-device-plugin/main/ascend-device-node-configmap.yaml ``` diff --git a/README_cn.md b/README_cn.md index cfbb548..7c7585f 100644 --- a/README_cn.md +++ b/README_cn.md @@ -82,6 +82,8 @@ kubectl apply -f https://raw.githubusercontent.com/Project-HAMi/ascend-device-pl `hami-device-node-config` 用于对集群中特定节点的 hami-vnpu-core 进行启用或覆盖。节点级配置的优先级高于全局 `vnpus.hamiVnpuCore` 开关。 +同时支持 `filterDevices`,用于限制某个节点对外暴露的卡号,例如:`filterDevices: [0, 1, 2, 3]`。 + ```bash kubectl apply -f https://raw.githubusercontent.com/Project-HAMi/ascend-device-plugin/main/ascend-device-node-configmap.yaml ``` diff --git a/ascend-device-node-configmap.yaml b/ascend-device-node-configmap.yaml index ca6ee57..e83963e 100644 --- a/ascend-device-node-configmap.yaml +++ b/ascend-device-node-configmap.yaml @@ -13,3 +13,4 @@ data: - name: "cnst-dev-w2" hami-vnpu-core: true vDeviceCount: 8 + filterDevices: [0, 1, 2, 3, 4, 5, 6, 7] diff --git a/cmd/main.go b/cmd/main.go index 849e40c..10d8aca 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -36,7 +36,7 @@ import ( var ( hwLoglevel = flag.Int("hw_loglevel", 0, "huawei log level, -1-debug, 0-info, 1-warning, 2-error 3-critical default value: 0") configFile = flag.String("config_file", "", "config file path") - nodeConfigFile = flag.String("node_config_file", "", "node specific config file path") + nodeConfigFile = flag.String("node_config_file", "", "node specific config file path") nodeName = flag.String("node_name", os.Getenv("NODE_NAME"), "node name") checkIdleVNPUInterval = flag.Int("check_idle_vnpu_interval", 60, "the interval (in seconds) to check idle vNPU and release them") ) diff --git a/go.mod b/go.mod index 5201059..767178e 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ require ( github.com/Project-HAMi/HAMi v0.0.0 github.com/fsnotify/fsnotify v1.9.0 google.golang.org/grpc v1.75.0 + huawei.com/npu-exporter v0.0.0-00010101000000-000000000000 k8s.io/api v0.33.0 k8s.io/apimachinery v0.33.0 k8s.io/klog/v2 v2.130.1 @@ -23,20 +24,16 @@ require ( github.com/go-openapi/jsonreference v0.20.4 // indirect github.com/go-openapi/swag v0.23.0 // indirect github.com/gogo/protobuf v1.3.2 // indirect - github.com/golang/protobuf v1.5.4 // indirect github.com/google/gnostic-models v0.6.9 // indirect github.com/google/go-cmp v0.7.0 // indirect - github.com/google/gofuzz v1.2.0 // indirect github.com/google/uuid v1.6.0 // indirect - github.com/imdario/mergo v0.3.16 // indirect + github.com/influxdata/telegraf v1.26.3 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/mailru/easyjson v0.7.7 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect - github.com/onsi/ginkgo/v2 v2.23.4 // indirect - github.com/onsi/gomega v1.38.0 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/smartystreets/goconvey v1.7.2 // indirect github.com/spf13/pflag v1.0.7 // indirect @@ -47,14 +44,11 @@ require ( golang.org/x/term v0.34.0 // indirect golang.org/x/text v0.28.0 // indirect golang.org/x/time v0.9.0 // indirect - google.golang.org/appengine v1.6.8 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20250707201910-8d1bb00bc6a7 // indirect google.golang.org/protobuf v1.36.8 // indirect gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect - gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - huawei.com/npu-exporter v0.0.0-00010101000000-000000000000 // indirect k8s.io/client-go v0.33.0 // indirect k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 // indirect diff --git a/go.sum b/go.sum index 0a3dcdf..d53050b 100644 --- a/go.sum +++ b/go.sum @@ -1,7 +1,3 @@ -gitee.com/ascend/ascend-npu-exporter/v6 v6.0.0-RC3 h1:gmcdFAckl3OCubjk8Mz7jgYWBHm+7pzkmQ19/afghhY= -gitee.com/ascend/ascend-npu-exporter/v6 v6.0.0-RC3/go.mod h1:tQw2ukw5YzlXWJa5cDfY8TNcTiBieor69lsdHFEiMZ8= -github.com/Project-HAMi/HAMi v0.0.0-20250107033239-d04fc8baaad6 h1:5SbvXn7H5spMTgCM4+sF6zm113WVCceUuOuwItkqELY= -github.com/Project-HAMi/HAMi v0.0.0-20250107033239-d04fc8baaad6/go.mod h1:lY4bmpcPiKWg0bVPCJFRH6xDW8p5PouIk/nIIU1I2d8= github.com/Project-HAMi/HAMi v0.0.0-20250901013025-61c6cbe7d480 h1:2rV+Gpy2+1fDOpQBPPXE3YG6nwfaO8DZjyCH+ARAmMY= github.com/Project-HAMi/HAMi v0.0.0-20250901013025-61c6cbe7d480/go.mod h1:KgE6IKrLJBAp6YrToFRFLDXHXctsZ6wXvNHMWY6ZbBU= github.com/agiledragon/gomonkey/v2 v2.8.0 h1:u2K2nNGyk0ippzklz1CWalllEB9ptD+DtSXeCX5O000= @@ -9,63 +5,43 @@ github.com/agiledragon/gomonkey/v2 v2.8.0/go.mod h1:ap1AmDzcVOAz1YpeJ3TCzIgstoaW github.com/ccoveille/go-safecast v1.6.1 h1:Nb9WMDR8PqhnKCVs2sCB+OqhohwO5qaXtCviZkIff5Q= github.com/ccoveille/go-safecast v1.6.1/go.mod h1:QqwNjxQ7DAqY0C721OIO9InMk9zCwcsO7tnRuHytad8= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/emicklei/go-restful/v3 v3.11.3 h1:yagOQz/38xJmcNeZJtrUcKjkHRltIaIFXKWeG1SkWGE= github.com/emicklei/go-restful/v3 v3.11.3/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= -github.com/evanphx/json-patch v5.9.0+incompatible h1:fBXyNpNMuTTDdquAq/uisOr2lShz4oaXpDTX2bLe7ls= -github.com/evanphx/json-patch v5.9.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= -github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= -github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E= github.com/fxamacker/cbor/v2 v2.7.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ= -github.com/go-logr/logr v1.4.1 h1:pKouT5E8xu9zeFC39JXRDukb6JFQPXM5p5I91188VAQ= -github.com/go-logr/logr v1.4.1/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= -github.com/go-openapi/jsonpointer v0.20.2 h1:mQc3nmndL8ZBzStEo3JYF8wzmeWffDH4VbXz58sAx6Q= -github.com/go-openapi/jsonpointer v0.20.2/go.mod h1:bHen+N0u1KEO3YlmqOjTT9Adn1RfD91Ar825/PuiRVs= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= github.com/go-openapi/jsonreference v0.20.4 h1:bKlDxQxQJgwpUSgOENiMPzCTBVuc7vTdXSSgNeAhojU= github.com/go-openapi/jsonreference v0.20.4/go.mod h1:5pZJyJP2MnYCpoeoMAql78cCHauHj0V9Lhc506VOpw4= -github.com/go-openapi/swag v0.22.9 h1:XX2DssF+mQKM2DHsbgZK74y/zj4mo9I99+89xUmuZCE= -github.com/go-openapi/swag v0.22.9/go.mod h1:3/OXnFfnMAwBD099SwYRk7GD3xOrr1iL7d/XNLXVVwE= github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= -github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI= -github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572/go.mod h1:9Pwr4B2jHnOSGXyyzV8ROjYa2ojvAY6HCGYYfMoC3Ls= +github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= +github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= -github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= -github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= -github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I= -github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U= github.com/google/gnostic-models v0.6.9 h1:MU/8wDLif2qCXZmzncUQ/BOfxWfthHi63KqpoNbWqVw= github.com/google/gnostic-models v0.6.9/go.mod h1:CiWsm0s6BSQd1hRn8/QmxqB6BesYcbSZxsz9b0KuDBw= -github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= -github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= -github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= -github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 h1:K6RDEckDVWvDI9JAJYCmNdQXq6neHJOYx3V6jnqNEec= -github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= +github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J0b1vyeLSOYI8bm5wbJM/8yDe8= +github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 h1:EGx4pi6eqNxGaHF6qqu48+N2wcFQ5qg5FXgOdqsJ5d8= github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= -github.com/imdario/mergo v0.3.16 h1:wwQJbIsHYGMUyLSPrEq1CT16AhnhNJQ51+4fdHUnCl4= -github.com/imdario/mergo v0.3.16/go.mod h1:WBLT9ZmE3lPoWsEzCh9LPo3TiwVN+ZKEjmz+hD27ysY= github.com/influxdata/telegraf v1.26.3 h1:wawD3VTdnPDbHnJ1RBGgCf0YB7vlxREZ70rvEepHdGs= github.com/influxdata/telegraf v1.26.3/go.mod h1:w+VUZ4NRDzfhRmhEdBbbNZBNT7E8qRkLiL73j/pD0ug= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= @@ -89,88 +65,75 @@ github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9G github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= -github.com/onsi/ginkgo/v2 v2.17.1 h1:V++EzdbhI4ZV4ev0UTIj0PzhzOcReJFyJaLjtSF55M8= -github.com/onsi/ginkgo/v2 v2.17.1/go.mod h1:llBI3WDLL9Z6taip6f33H76YcWtJv+7R3HigUjbIBOs= +github.com/onsi/ginkgo/v2 v2.23.4 h1:ktYTpKJAVZnDT4VjxSbiBenUjmlL/5QkBEocaWXiQus= github.com/onsi/ginkgo/v2 v2.23.4/go.mod h1:Bt66ApGPBFzHyR+JO10Zbt0Gsp4uWxu5mIOTusL46e8= -github.com/onsi/gomega v1.32.0 h1:JRYU78fJ1LPxlckP6Txi/EYqJvjtMrDC04/MM5XRHPk= -github.com/onsi/gomega v1.32.0/go.mod h1:a4x4gW6Pz2yK1MAmvluYme5lvYTn61afQ2ETw/8n4Lg= +github.com/onsi/gomega v1.38.0 h1:c/WX+w8SLAinvuKKQFh77WEucCnPk4j2OTUr7lt7BeY= github.com/onsi/gomega v1.38.0/go.mod h1:OcXcwId0b9QsE7Y49u+BTrL4IdKOBOKnD6VQNTJEB6o= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= -github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= +github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= github.com/smartystreets/assertions v1.2.0 h1:42S6lae5dvLc7BrLu/0ugRtcFVjoJNMC/N3yZFZkDFs= github.com/smartystreets/assertions v1.2.0/go.mod h1:tcbTF8ujkAEcZ8TElKY+i30BzYlVhC/LOxJk7iOWnoo= github.com/smartystreets/goconvey v1.7.2 h1:9RBaZCeXEQ3UselpuwUQHltGVXvdwm6cv1hgR6gDIPg= github.com/smartystreets/goconvey v1.7.2/go.mod h1:Vw0tHAZW6lzCRk3xgdin6fKYcG+G3Pg9vgXWeJpQFMM= -github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= -github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/spf13/pflag v1.0.7 h1:vN6T9TfwStFPFM5XzjsvmzZkLuaLX+HS+0SeFLRgU6M= github.com/spf13/pflag v1.0.7/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= +github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= -github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= -github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= +github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= +go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= +go.opentelemetry.io/otel v1.37.0 h1:9zhNfelUvx0KBfu/gb+ZgeAfAgtWrfHJZcAqFC228wQ= +go.opentelemetry.io/otel v1.37.0/go.mod h1:ehE/umFRLnuLa/vSccNq9oS1ErUlkkK71gMcN34UG8I= +go.opentelemetry.io/otel/metric v1.37.0 h1:mvwbQS5m0tbmqML4NqK+e3aDiO02vsf/WgbsdpcPoZE= +go.opentelemetry.io/otel/metric v1.37.0/go.mod h1:04wGrZurHYKOc+RKeye86GwKiTb9FKm1WHtO+4EVr2E= +go.opentelemetry.io/otel/sdk v1.37.0 h1:ItB0QUqnjesGRvNcmAcU0LyvkVyGJ2xftD29bWdDvKI= +go.opentelemetry.io/otel/sdk v1.37.0/go.mod h1:VredYzxUvuo2q3WRcDnKDjbdvmO0sCzOvVAiY+yUkAg= +go.opentelemetry.io/otel/sdk/metric v1.37.0 h1:90lI228XrB9jCMuSdA0673aubgRobVZFhbjxHHspCPc= +go.opentelemetry.io/otel/sdk/metric v1.37.0/go.mod h1:cNen4ZWfiD37l5NhS+Keb5RXVWZWpRE+9WyVCpbo5ps= +go.opentelemetry.io/otel/trace v1.37.0 h1:HLdcFNbRQBE2imdSEgm/kwqmQj1Or1l/7bW6mxVK7z4= +go.opentelemetry.io/otel/trace v1.37.0/go.mod h1:TlgrlQ+PtQO5XFerSPUYG0JSgGyryXewPGyayAWSBS0= +go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs= +go.uber.org/automaxprocs v1.6.0/go.mod h1:ifeIMSnPZuznNm6jmdzmU3/bfk01Fe2fotchwEFJ8r8= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= -golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= -golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ= -golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE= golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= -golang.org/x/oauth2 v0.17.0 h1:6m3ZPmLEFdVxKKWnKq4VqZ60gutO35zm+zrAHVmHyDQ= -golang.org/x/oauth2 v0.17.0/go.mod h1:OzPDGQiuQMguemayvdylqddI7qcD9lnSDb+1FiwQ5HA= golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws= -golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= -golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= -golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/term v0.21.0 h1:WVXCp+/EBEHOj53Rvu+7KiT/iElMrO8ACK16SMZ3jaA= -golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0= golang.org/x/term v0.34.0 h1:O/2T7POpk0ZZ7MAzMeWFSg6S5IpWd/RXDlM9hgM3DR4= golang.org/x/term v0.34.0/go.mod h1:5jC53AEywhIVebHgPVeg0mj8OD3VO9OzclacVrqpaAw= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= -golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= -golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng= golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU= -golang.org/x/time v0.5.0 h1:o7cqy6amK/52YcAKIPlM3a+Fpj35zvRj2TP+e1xFSfk= -golang.org/x/time v0.5.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY= golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= @@ -178,27 +141,18 @@ golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3 golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= -golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d h1:vU5i/LfpvrRCpgM/VPfJLg5KjxD3E+hfT1SH+d9zLwg= -golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= +golang.org/x/tools v0.36.0 h1:kWS0uv/zsvHEle1LbV5LE8QujrxB3wfQyxHfhOk0Qkg= +golang.org/x/tools v0.36.0/go.mod h1:WBDiHKJK8YgLHlcQPYQzNCkUxUypCaa5ZegCVutKm+s= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -google.golang.org/appengine v1.6.8 h1:IhEN5q69dyKagZPYMSdIjS2HqprW324FRQZJcGqPAsM= -google.golang.org/appengine v1.6.8/go.mod h1:1jJ3jBArFh5pcgW8gCtRJnepW8FzD1V44FJffLiz/Ds= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240227224415-6ceb2ff114de h1:cZGRis4/ot9uVm639a+rHCUaG0JJHEsdyzSQTMX+suY= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240227224415-6ceb2ff114de/go.mod h1:H4O17MA/PE9BsGx3w+a+W2VOLLD1Qf7oJneAoU6WktY= +gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= +gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= google.golang.org/genproto/googleapis/rpc v0.0.0-20250707201910-8d1bb00bc6a7 h1:pFyd6EwwL2TqFf8emdthzeX+gZE1ElRq3iM8pui4KBY= google.golang.org/genproto/googleapis/rpc v0.0.0-20250707201910-8d1bb00bc6a7/go.mod h1:qQ0YXyHHx3XkvlzUtpXDkS29lDSafHMZBAZDc03LQ3A= -google.golang.org/grpc v1.63.2 h1:MUeiw1B2maTVZthpU5xvASfTh3LDbxHd6IJ6QQVU+xM= -google.golang.org/grpc v1.63.2/go.mod h1:WAX/8DgncnokcFUldAxq7GeB5DXHDbMF+lLvDomNkRA= google.golang.org/grpc v1.75.0 h1:+TW+dqTd2Biwe6KKfhE5JpiYIBWq865PhKGSXiivqt4= google.golang.org/grpc v1.75.0/go.mod h1:JtPAzKiq4v1xcAB2hydNlWI2RnF85XXcV0mhKXr2ecQ= -google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= -google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= -google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI= -google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= @@ -208,50 +162,29 @@ gopkg.in/evanphx/json-patch.v4 v4.12.0 h1:n6jtcsulIzXPJaxegRbvFNNrZDjbij7ny3gmSP gopkg.in/evanphx/json-patch.v4 v4.12.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= -gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= -gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= -gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -gotest.tools/v3 v3.5.1 h1:EENdUnS3pdur5nybKYIh2Vfgc8IUNBjxDPSjtiJcOzU= -gotest.tools/v3 v3.5.1/go.mod h1:isy3WKz7GK6uNw/sbHzfKBLvlvXwUyV06n6brMxxopU= -k8s.io/api v0.29.3 h1:2ORfZ7+bGC3YJqGpV0KSDDEVf8hdGQ6A03/50vj8pmw= -k8s.io/api v0.29.3/go.mod h1:y2yg2NTyHUUkIoTC+phinTnEa3KFM6RZ3szxt014a80= +gotest.tools/v3 v3.5.2 h1:7koQfIKdy+I8UTetycgUqXWSDwpgv193Ka+qRsmBY8Q= +gotest.tools/v3 v3.5.2/go.mod h1:LtdLGcnqToBH83WByAAi/wiwSFCArdFIUV/xxN4pcjA= k8s.io/api v0.33.0 h1:yTgZVn1XEe6opVpP1FylmNrIFWuDqe2H0V8CT5gxfIU= k8s.io/api v0.33.0/go.mod h1:CTO61ECK/KU7haa3qq8sarQ0biLq2ju405IZAd9zsiM= -k8s.io/apimachinery v0.29.3 h1:2tbx+5L7RNvqJjn7RIuIKu9XTsIZ9Z5wX2G22XAa5EU= -k8s.io/apimachinery v0.29.3/go.mod h1:hx/S4V2PNW4OMg3WizRrHutyB5la0iCUbZym+W0EQIU= k8s.io/apimachinery v0.33.0 h1:1a6kHrJxb2hs4t8EE5wuR/WxKDwGN1FKH3JvDtA0CIQ= k8s.io/apimachinery v0.33.0/go.mod h1:BHW0YOu7n22fFv/JkYOEfkUYNRN0fj0BlvMFWA7b+SM= -k8s.io/client-go v0.29.3 h1:R/zaZbEAxqComZ9FHeQwOh3Y1ZUs7FaHKZdQtIc2WZg= -k8s.io/client-go v0.29.3/go.mod h1:tkDisCvgPfiRpxGnOORfkljmS+UrW+WtXAy2fTvXJB0= k8s.io/client-go v0.33.0 h1:UASR0sAYVUzs2kYuKn/ZakZlcs2bEHaizrrHUZg0G98= k8s.io/client-go v0.33.0/go.mod h1:kGkd+l/gNGg8GYWAPr0xF1rRKvVWvzh9vmZAMXtaKOg= -k8s.io/klog/v2 v2.120.1 h1:QXU6cPEOIslTGvZaXvFWiP9VKyeet3sawzTOvdXb4Vw= -k8s.io/klog/v2 v2.120.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= -k8s.io/kube-openapi v0.0.0-20240227032403-f107216b40e2 h1:02WBxjyRwX4rJdl3XlWVjFbXT/kAKCsipoM8hQY3Dwo= -k8s.io/kube-openapi v0.0.0-20240227032403-f107216b40e2/go.mod h1:B7Huvd1LKZtTYmY+nC6rnmN8lyGYT9lifBcPD5epL6k= k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff h1:/usPimJzUKKu+m+TE36gUyGcf03XZEP0ZIKgKj35LS4= k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff/go.mod h1:5jIi+8yX4RIb8wk3XwBo5Pq2ccx4FP10ohkbSKCZoK8= -k8s.io/kubelet v0.29.3 h1:X9h0ZHzc+eUeNTaksbN0ItHyvGhQ7Z0HPjnQD2oHdwU= -k8s.io/kubelet v0.29.3/go.mod h1:jDiGuTkFOUynyBKzOoC1xRSWlgAZ9UPcTYeFyjr6vas= k8s.io/kubelet v0.31.3 h1:DIXRAmvVGp42mV2vpA1GCLU6oO8who0/vp3Oq6kSpbI= k8s.io/kubelet v0.31.3/go.mod h1:KSdbEfNy5VzqUlAHlytA/fH12s+sE1u8fb/8JY9sL/8= -k8s.io/utils v0.0.0-20240102154912-e7106e64919e h1:eQ/4ljkx21sObifjzXwlPKpdGLrCfRziVtos3ofG/sQ= -k8s.io/utils v0.0.0-20240102154912-e7106e64919e/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 h1:M3sRQVHv7vB20Xc2ybTt7ODCeFj6JSWYFzOFnYeS6Ro= k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= -sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= -sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1Ray9eB4DBr+K+/sCJ8= sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo= sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= -sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4= -sigs.k8s.io/structured-merge-diff/v4 v4.4.1/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08= sigs.k8s.io/structured-merge-diff/v4 v4.6.0 h1:IUA9nvMmnKWcj5jl84xn+T5MnlZKThmUW1TdblaLVAc= sigs.k8s.io/structured-merge-diff/v4 v4.6.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps= sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= diff --git a/internal/manager/manager.go b/internal/manager/manager.go index 8ce1df2..d83dba8 100644 --- a/internal/manager/manager.go +++ b/internal/manager/manager.go @@ -19,6 +19,7 @@ package manager import ( "fmt" "sort" + "sync" "ascend-common/devmanager" "ascend-common/devmanager/dcmi" @@ -39,17 +40,18 @@ type Device struct { } type AscendManager struct { - mgr *devmanager.DeviceManager - config internal.VNPUConfig + mu sync.RWMutex + mgr *devmanager.DeviceManager + config internal.VNPUConfig globalConfig internal.Config - devs []*Device - nodeConfig *internal.NodeConfig + devs []*Device + nodeConfig *internal.NodeConfig } func NewAscendManager() (*AscendManager, error) { mgr, err := devmanager.AutoInit("", 30) if err != nil { - return nil, err + return nil, fmt.Errorf("failed to auto-init device manager: %w", err) } return &AscendManager{ mgr: mgr, @@ -58,7 +60,7 @@ func NewAscendManager() (*AscendManager, error) { } func (am *AscendManager) LoadNodeConfig(nodePath string, nodeName string) error { - nodeConfigList, err := internal.LoadNodeConfig(nodePath) + nodeConfigList, err := internal.LoadNodeConfig(nodePath) if err != nil { klog.Warningf("Failed to load node config from %s: %v", nodePath, err) return err @@ -71,19 +73,40 @@ func (am *AscendManager) LoadNodeConfig(nodePath string, nodeName string) error return nil } } - + klog.Infof("No specific config found for node %s, will use default settings", nodeName) return nil } +func (am *AscendManager) filteredCardSet() map[int32]struct{} { + if am.nodeConfig == nil || len(am.nodeConfig.FilterDevices) == 0 { + return nil + } + + filtered := make(map[int32]struct{}, len(am.nodeConfig.FilterDevices)) + for _, cardID := range am.nodeConfig.FilterDevices { + filtered[cardID] = struct{}{} + } + return filtered +} + +func (am *AscendManager) shouldIncludeCard(cardID int32) bool { + filtered := am.filteredCardSet() + if len(filtered) == 0 { + return true + } + _, ok := filtered[cardID] + return ok +} + func (am *AscendManager) LoadConfig(path string) error { config, err := internal.LoadConfig(path) if err != nil { - return err + return fmt.Errorf("failed to load config from %s: %w", path, err) } chipInfo, err := am.mgr.GetValidChipInfo() if err != nil { - return err + return fmt.Errorf("failed to get valid chip info: %w", err) } if chipInfo.Type != "Ascend" { return fmt.Errorf("chip type is not Ascend") @@ -129,7 +152,7 @@ func (am *AscendManager) UpdateDevice() error { return err } - am.devs = make([]*Device, 0, len(IDs)) + newDevs := make([]*Device, 0, len(IDs)) for _, ID := range IDs { phyID, err := am.mgr.GetPhysicIDFromLogicID(ID) if err != nil { @@ -141,6 +164,10 @@ func (am *AscendManager) UpdateDevice() error { klog.Errorf("failed to get card id from device id: %v", err) return err } + if !am.shouldIncludeCard(cardID) { + klog.V(4).Infof("skip filtered cardID=%d logicID=%d phyID=%d deviceID=%d", cardID, ID, phyID, deviceID) + continue + } uuid, err := am.mgr.GetDieID(ID, dcmi.VDIE) if err != nil { klog.Errorf("failed to get uuid from device id: %v", err) @@ -151,7 +178,7 @@ func (am *AscendManager) UpdateDevice() error { klog.Errorf("failed to get device health: %v", err) return err } - am.devs = append(am.devs, &Device{ + newDevs = append(newDevs, &Device{ UUID: uuid, LogicID: ID, PhyID: phyID, @@ -162,14 +189,21 @@ func (am *AscendManager) UpdateDevice() error { Health: health == 0, }) } + am.mu.Lock() + am.devs = newDevs + am.mu.Unlock() return nil } func (am *AscendManager) GetDevices() []*Device { + am.mu.RLock() + defer am.mu.RUnlock() return am.devs } func (am *AscendManager) GetDeviceByUUID(UUID string) *Device { + am.mu.RLock() + defer am.mu.RUnlock() for _, dev := range am.devs { if dev.UUID == UUID { return dev @@ -181,9 +215,21 @@ func (am *AscendManager) GetDeviceByUUID(UUID string) *Device { func (am *AscendManager) GetIDs() []int32 { _, IDs, err := am.mgr.GetDeviceList() if err != nil { + klog.Errorf("failed to get device list: %v", err) return nil } - return IDs + filteredIDs := make([]int32, 0, len(IDs)) + for _, id := range IDs { + cardID, _, err := am.mgr.GetCardIDDeviceID(id) + if err != nil { + klog.Warningf("failed to get card/device ID for logic ID %d: %v", id, err) + continue + } + if am.shouldIncludeCard(cardID) { + filteredIDs = append(filteredIDs, id) + } + } + return filteredIDs } func (am *AscendManager) GetUnHealthIDs() []int32 { @@ -193,8 +239,17 @@ func (am *AscendManager) GetUnHealthIDs() []int32 { } var unhealthy []int32 for _, d := range IDs { + cardID, _, err := am.mgr.GetCardIDDeviceID(d) + if err != nil { + klog.Warningf("failed to get card/device ID for logic ID %d: %v", d, err) + continue + } + if !am.shouldIncludeCard(cardID) { + continue + } healthCode, err := am.mgr.GetDeviceHealth(d) if err != nil { + klog.Warningf("failed to get device health for %d: %v", d, err) continue } if healthCode != 0 { @@ -209,7 +264,7 @@ func (am *AscendManager) CleanupIdleVNPUs() error { _, IDs, err := am.mgr.GetDeviceList() if err != nil { - return fmt.Errorf("failed to get device list: %v", err) + return fmt.Errorf("failed to get device list: %w", err) } klog.Infof("Found %d devices to check for idle vNPUs,%+v", len(IDs), IDs) @@ -220,6 +275,10 @@ func (am *AscendManager) CleanupIdleVNPUs() error { klog.Warningf("failed to get card/device ID for logic ID %d: %v", logicID, err) continue } + if !am.shouldIncludeCard(cardID) { + klog.V(4).Infof("skip cleanup on filtered cardID=%d logicID=%d deviceID=%d", cardID, logicID, deviceID) + continue + } // Obtain all vNPU information on this device vDevInfos, err := am.mgr.GetVirtualDeviceInfo(logicID) if err != nil { @@ -254,9 +313,8 @@ func (am *AscendManager) CleanupIdleVNPUs() error { return nil } - func (am *AscendManager) GetNodeConfig() *internal.NodeConfig { - return am.nodeConfig + return am.nodeConfig } func (am *AscendManager) IsHamiVnpuCore() bool { @@ -264,4 +322,4 @@ func (am *AscendManager) IsHamiVnpuCore() bool { return am.nodeConfig.HamiVnpuCore } return am.globalConfig.VNPUs.HamiVnpuCore -} \ No newline at end of file +} diff --git a/internal/server/server.go b/internal/server/server.go index f610876..4fff85a 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -18,40 +18,43 @@ package server import ( "context" + "crypto/sha256" + "encoding/hex" + "encoding/json" + "errors" "flag" "fmt" + "io" "net" "os" "path" + "path/filepath" + "strconv" "strings" + "syscall" "time" - "strconv" - "github.com/Project-HAMi/HAMi/pkg/device" - // "github.com/Project-HAMi/HAMi/pkg/device/ascend" - "github.com/Project-HAMi/HAMi/pkg/util" - "github.com/Project-HAMi/HAMi/pkg/util/nodelock" - "github.com/Project-HAMi/ascend-device-plugin/internal/manager" "google.golang.org/grpc" "google.golang.org/grpc/credentials/insecure" v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/util/json" "k8s.io/klog/v2" "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" - "io" - "path/filepath" - "crypto/sha256" - "encoding/hex" + + // "github.com/Project-HAMi/HAMi/pkg/device/ascend" + "github.com/Project-HAMi/HAMi/pkg/device" + "github.com/Project-HAMi/HAMi/pkg/device-plugin/nvidiadevice/nvinternal/plugin" + "github.com/Project-HAMi/HAMi/pkg/util" + "github.com/Project-HAMi/ascend-device-plugin/internal/manager" ) const ( // RegisterAnnos = "hami.io/node-register-ascend" // PodAllocAnno = "huawei.com/AscendDevices" - NodeLockAscend = "hami.io/mutex.lock" - Ascend910Prefix = "Ascend910" - Ascend910CType = "Ascend910C" - VNPUModeAnnotation = "huawei.com/vnpu-mode" - VNPUModeHamiCore = "hami-core" + NodeLockAscend = "hami.io/mutex.lock" + Ascend910Prefix = "Ascend910" + Ascend910CType = "Ascend910C" + VNPUModeAnnotation = "huawei.com/vnpu-mode" + VNPUModeHamiCore = "hami-core" VNPUNodeSelectorAnnotation = "hami-vnpu-core" ) @@ -60,10 +63,12 @@ var ( ) type PluginServer struct { + commonWord string nodeName string registerAnno string handshakeAnno string allocAnno string + toAllocDeviceAnno string grpcServer *grpc.Server mgr *manager.AscendManager socket string @@ -73,139 +78,144 @@ type PluginServer struct { } type RuntimeInfo struct { - UUID string `json:"UUID,omitempty"` - Temp string `json:"temp,omitempty"` - Memory *int64 `json:"memory,omitempty"` - Core *int32 `json:"core,omitempty"` + UUID string `json:"UUID,omitempty"` + Temp string `json:"temp,omitempty"` + Memory *int64 `json:"memory,omitempty"` + Core *int32 `json:"core,omitempty"` } func NewPluginServer(mgr *manager.AscendManager, nodeName string, checkIdleVNPUInterval int) (*PluginServer, error) { - return &PluginServer{ + commonWord := mgr.CommonWord() + server := &PluginServer{ + commonWord: commonWord, nodeName: nodeName, - registerAnno: fmt.Sprintf("hami.io/node-register-%s", mgr.CommonWord()), - handshakeAnno: fmt.Sprintf("hami.io/node-handshake-%s", mgr.CommonWord()), - allocAnno: fmt.Sprintf("huawei.com/%s", mgr.CommonWord()), + registerAnno: fmt.Sprintf("hami.io/node-register-%s", commonWord), + handshakeAnno: fmt.Sprintf("hami.io/node-handshake-%s", commonWord), + allocAnno: fmt.Sprintf("huawei.com/%s", commonWord), + toAllocDeviceAnno: fmt.Sprintf("hami.io/%s-devices-to-allocate", commonWord), grpcServer: grpc.NewServer(), mgr: mgr, - socket: path.Join(v1beta1.DevicePluginPath, fmt.Sprintf("%s.sock", mgr.CommonWord())), + socket: path.Join(v1beta1.DevicePluginPath, fmt.Sprintf("%s.sock", commonWord)), stopCh: make(chan interface{}), healthCh: make(chan int32), checkIdleVNPUInterval: checkIdleVNPUInterval, - }, nil + } + // enable calling hami methods + device.InRequestDevices[commonWord] = server.toAllocDeviceAnno + return server, nil } // fileSHA256 calculates the SHA256 checksum of the specified file func fileSHA256(path string) (string, error) { - f, err := os.Open(path) - if err != nil { - return "", err - } - defer f.Close() - - h := sha256.New() - if _, err := io.Copy(h, f); err != nil { - return "", err - } - return hex.EncodeToString(h.Sum(nil)), nil + f, err := os.Open(path) + if err != nil { + return "", err + } + defer f.Close() + + h := sha256.New() + if _, err := io.Copy(h, f); err != nil { + return "", err + } + return hex.EncodeToString(h.Sum(nil)), nil } // Automatically creates directories, sets permissions, and copies core files on the host func prepareHostResources() error { - klog.Info("Starting host resource preparation for HAMi vNPU core...") - - // 1. Create shared memory directory - sharedRegionPath := "/usr/local/hami-shared-region" - if err := os.MkdirAll(sharedRegionPath, 0777); err != nil { - if !os.IsExist(err) { - return fmt.Errorf("failed to create %s: %v", sharedRegionPath, err) - } - } - if err := os.Chmod(sharedRegionPath, 0777); err != nil { - return fmt.Errorf("failed to chmod %s: %v", sharedRegionPath, err) - } - klog.Infof("Successfully prepared directory: %s", sharedRegionPath) - - // 2. Prepare /usr/local/hami-vnpu-core/ directory - targetDir := "/usr/local/hami-vnpu-core" - if err := os.MkdirAll(targetDir, 0775); err != nil { - return fmt.Errorf("failed to create %s: %v", targetDir, err) - } - - // Specify the in-container assets directory (can be overridden via environment variable, default follows standard DevicePlugin convention) - assetsDir := os.Getenv("HAMI_VNPU_ASSETS_PATH") - if assetsDir == "" { - assetsDir = "/usr/local/hami-vnpu-core-assets" - } - - // Define files to copy: source path in container -> target path on host - filesToCopy := map[string]string{ - "limiter": filepath.Join(targetDir, "limiter"), - "libvnpu.so": filepath.Join(targetDir, "libvnpu.so"), - "ld.so.preload": filepath.Join(targetDir, "ld.so.preload"), - } - - for srcName, destPath := range filesToCopy { - srcPath := filepath.Join(assetsDir, srcName) + klog.Info("Starting host resource preparation for HAMi vNPU core...") + + // 1. Create shared memory directory + sharedRegionPath := "/usr/local/hami-shared-region" + if err := os.MkdirAll(sharedRegionPath, 0777); err != nil { + if !os.IsExist(err) { + return fmt.Errorf("failed to create %s: %w", sharedRegionPath, err) + } + } + if err := os.Chmod(sharedRegionPath, 0777); err != nil { + return fmt.Errorf("failed to chmod %s: %w", sharedRegionPath, err) + } + klog.Infof("Successfully prepared directory: %s", sharedRegionPath) + + // 2. Prepare /usr/local/hami-vnpu-core/ directory + targetDir := "/usr/local/hami-vnpu-core" + if err := os.MkdirAll(targetDir, 0775); err != nil { + return fmt.Errorf("failed to create %s: %w", targetDir, err) + } + + // Specify the in-container assets directory (can be overridden via environment variable, default follows standard DevicePlugin convention) + assetsDir := os.Getenv("HAMI_VNPU_ASSETS_PATH") + if assetsDir == "" { + assetsDir = "/usr/local/hami-vnpu-core-assets" + } + + // Define files to copy: source path in container -> target path on host + filesToCopy := map[string]string{ + "limiter": filepath.Join(targetDir, "limiter"), + "libvnpu.so": filepath.Join(targetDir, "libvnpu.so"), + "ld.so.preload": filepath.Join(targetDir, "ld.so.preload"), + } + + for srcName, destPath := range filesToCopy { + srcPath := filepath.Join(assetsDir, srcName) // File already exists, skip if content is consistent if _, err := os.Stat(destPath); err == nil { - srcSum, err1 := fileSHA256(srcPath) - dstSum, err2 := fileSHA256(destPath) - - if err1 == nil && err2 == nil && srcSum == dstSum { - klog.Infof("✓ %s already up-to-date, skipping", destPath) - continue - } - } - - if err := copyFile(srcPath, destPath); err != nil { - if strings.Contains(err.Error(), "text file busy") { - klog.Warningf("⚠ %s is in use by running process, keeping existing version (safe)", destPath) - continue - } - return fmt.Errorf("failed to copy %s: %v", destPath, err) - } - klog.Infof("✓ Copied %s -> %s", srcPath, destPath) - } - - klog.Info("Host resource preparation completed successfully.") - return nil + srcSum, err1 := fileSHA256(srcPath) + dstSum, err2 := fileSHA256(destPath) + + if err1 == nil && err2 == nil && srcSum == dstSum { + klog.Infof("✓ %s already up-to-date, skipping", destPath) + continue + } + } + + if err := copyFile(srcPath, destPath); err != nil { + if errors.Is(err, syscall.ETXTBSY) { + klog.Warningf("⚠ %s is in use by running process, keeping existing version (safe)", destPath) + continue + } + return fmt.Errorf("failed to copy %s: %w", destPath, err) + } + klog.Infof("✓ Copied %s -> %s", srcPath, destPath) + } + + klog.Info("Host resource preparation completed successfully.") + return nil } // A standard file copy implementation that preserves the original file permissions func copyFile(src, dst string) error { - srcFile, err := os.Open(src) - if err != nil { - return err - } - defer srcFile.Close() - - dstFile, err := os.Create(dst) - if err != nil { - return err - } - defer dstFile.Close() - - if _, err = io.Copy(dstFile, srcFile); err != nil { - return err - } - - // Sync source file permissions (ensure the limiter binary retains executable permission) - srcInfo, err := srcFile.Stat() - if err != nil { - return err - } - return os.Chmod(dst, srcInfo.Mode()) -} + srcFile, err := os.Open(src) + if err != nil { + return err + } + defer srcFile.Close() + + dstFile, err := os.Create(dst) + if err != nil { + return err + } + defer dstFile.Close() + if _, err = io.Copy(dstFile, srcFile); err != nil { + return err + } + + // Sync source file permissions (ensure the limiter binary retains executable permission) + srcInfo, err := srcFile.Stat() + if err != nil { + return err + } + return os.Chmod(dst, srcInfo.Mode()) +} func (ps *PluginServer) Start() error { // Automatically prepare host environment when the plugin starts - if err := prepareHostResources(); err != nil { - klog.Errorf("Failed to prepare host resources: %v. vNPU core functionality will be impaired.", err) - return err - } - + if err := prepareHostResources(); err != nil { + klog.Errorf("Failed to prepare host resources: %v. vNPU core functionality will be impaired.", err) + return err + } + ps.stopCh = make(chan interface{}) err := ps.mgr.UpdateDevice() if err != nil { @@ -285,6 +295,11 @@ func (ps *PluginServer) serve() error { lastCrashTime := time.Now() restartCount := 0 for { + select { + case <-ps.stopCh: + return + default: + } klog.Infof("Starting GRPC server for '%s'", resourceName) err := ps.grpcServer.Serve(sock) if err == nil { @@ -314,7 +329,7 @@ func (ps *PluginServer) serve() error { // Wait for server to start by launching a blocking connexion conn, err := ps.dial(ps.socket, 5*time.Second) if err != nil { - return err + return fmt.Errorf("failed to dial device plugin socket: %w", err) } _ = conn.Close() @@ -324,7 +339,7 @@ func (ps *PluginServer) serve() error { func (ps *PluginServer) registerKubelet() error { conn, err := ps.dial(v1beta1.KubeletSocket, 5*time.Second) if err != nil { - return err + return fmt.Errorf("failed to dial kubelet socket: %w", err) } defer func(conn *grpc.ClientConn) { _ = conn.Close() @@ -341,7 +356,7 @@ func (ps *PluginServer) registerKubelet() error { _, err = client.Register(context.Background(), reqt) if err != nil { - return err + return fmt.Errorf("failed to register device plugin with kubelet: %w", err) } return nil } @@ -377,7 +392,7 @@ func (ps *PluginServer) registerHAMi() error { if strings.HasPrefix(device.Type, Ascend910Prefix) { NetworkID, err := ps.getDeviceNetworkID(i, device.Type) if err != nil { - return fmt.Errorf("get networkID error: %v", err) + return fmt.Errorf("get networkID error: %w", err) } device.CustomInfo = map[string]any{ "NetworkID": NetworkID, @@ -388,21 +403,21 @@ func (ps *PluginServer) registerHAMi() error { annos := make(map[string]string) annos[ps.registerAnno] = device.MarshalNodeDevices(apiDevices) annos[ps.handshakeAnno] = "Reported_" + time.Now().Add(time.Duration(*reportTimeOffset)*time.Second).Format("2006.01.02 15:04:05") - + if ps.mgr.IsHamiVnpuCore() { annos[VNPUNodeSelectorAnnotation] = "true" klog.V(4).Infof("Node %s has HamiVnpuCore enabled, patching annotation %s: true", ps.nodeName, VNPUNodeSelectorAnnotation) } else { annos[VNPUNodeSelectorAnnotation] = "false" } - + node, err := util.GetNode(ps.nodeName) if err != nil { - return fmt.Errorf("get node %s error: %v", ps.nodeName, err) + return fmt.Errorf("get node %s error: %w", ps.nodeName, err) } err = util.PatchNodeAnnotations(node, annos) if err != nil { - return fmt.Errorf("patch node %s annotations error: %v", ps.nodeName, err) + return fmt.Errorf("patch node %s annotations error: %w", ps.nodeName, err) } klog.V(5).Infof("patch node %s annotations: %v", ps.nodeName, annos) return nil @@ -437,43 +452,165 @@ func (ps *PluginServer) watchAndRegister() { } } +// buildContainerAllocateResponse builds the allocate response for a single container. +func (ps *PluginServer) buildContainerAllocateResponse(pod *v1.Pod, containerDevs device.ContainerDevices, rtInfoLookup map[string]RuntimeInfo) (*v1beta1.ContainerAllocateResponse, error) { + resp := &v1beta1.ContainerAllocateResponse{} + + var ( + IDs []int32 + memories []*int64 + cores []*int32 + ascendVNPUSpec string + ) + + for _, dev := range containerDevs { + d := ps.mgr.GetDeviceByUUID(dev.UUID) + if d == nil { + return nil, fmt.Errorf("unknown uuid: %s", dev.UUID) + } + IDs = append(IDs, d.PhyID) + + if info, ok := rtInfoLookup[dev.UUID]; ok { + if ascendVNPUSpec == "" && info.Temp != "" { + ascendVNPUSpec = info.Temp + } + if info.Memory != nil { + memories = append(memories, info.Memory) + } + if info.Core != nil { + cores = append(cores, info.Core) + } + } + } + + if len(IDs) == 0 { + return nil, fmt.Errorf("annotation %s value invalid", ps.allocAnno) + } + ascendVisibleDevices := fmt.Sprintf("%d", IDs[0]) + for i := 1; i < len(IDs); i++ { + ascendVisibleDevices = fmt.Sprintf("%s,%d", ascendVisibleDevices, IDs[i]) + } + resp.Envs = make(map[string]string) + resp.Envs["ASCEND_VISIBLE_DEVICES"] = ascendVisibleDevices + + vnpuMode := pod.Annotations[VNPUModeAnnotation] + klog.V(4).Infof("Pod %s vnpu mode: %s", pod.Name, vnpuMode) + if vnpuMode == VNPUModeHamiCore { + // 1. Handle volume mount injection + var mounts []*v1beta1.Mount + // A.Huawei driver and SMI toolchain (Read-Only) + driverPaths := []string{ + "/usr/local/bin/npu-smi", + "/etc/ascend_install.info", + "/usr/local/Ascend/driver/lib64/driver", + "/usr/local/Ascend/driver/version.info", + } + for _, p := range driverPaths { + mounts = append(mounts, &v1beta1.Mount{HostPath: p, ContainerPath: p, ReadOnly: true}) + } + + mounts = append(mounts, &v1beta1.Mount{ + HostPath: "/usr/local/hami-vnpu-core", + ContainerPath: "/hami-vnpu-core", + ReadOnly: true, + }) + // B. Inject HAMi library path by mounting /etc/ld.so.preload. + mounts = append(mounts, &v1beta1.Mount{ + HostPath: "/usr/local/hami-vnpu-core/ld.so.preload", // Template file on host + ContainerPath: "/etc/ld.so.preload", // Overwrites the target file in container + ReadOnly: true, + }) -func (ps *PluginServer) parsePodAnnotation(pod *v1.Pod) ([]int32, []string, []*int64, []*int32, error) { + // C. Shared directory for HAMi compute resource partitioning (Read/Write) + mounts = append(mounts, &v1beta1.Mount{ + HostPath: "/usr/local/hami-shared-region", + ContainerPath: "/hami-shared-region", + ReadOnly: false, + }) + resp.Mounts = mounts + + // Set NPU_MEM_QUOTA + if len(memories) > 0 && memories[0] != nil { + resp.Envs["NPU_MEM_QUOTA"] = strconv.FormatInt(*memories[0], 10) + klog.V(4).InfoS("Memory quota set", "value", *memories[0]) + } + + // Set NPU_PRIORITY + if len(cores) > 0 && cores[0] != nil { + resp.Envs["NPU_PRIORITY"] = strconv.FormatInt(int64(*cores[0]), 10) + klog.V(4).InfoS("Core priority set", "value", *cores[0]) + } + + // Set GLOBAL_SHM_PATH based on the first device ID. + resp.Envs["NPU_GLOBAL_SHM_PATH"] = fmt.Sprintf("/hami-shared-region/%d_global_registry", IDs[0]) + klog.V(5).Infof("Create %d_global_registry", IDs[0]) + } else { + if ascendVNPUSpec != "" { + resp.Envs["ASCEND_VNPU_SPECS"] = ascendVNPUSpec + } + } + return resp, nil +} + +// popNextContainerDevices finds and erases the first non-empty containerDevices +// from podSingleDev. It mutates podSingleDev in place. +func (ps *PluginServer) popNextContainerDevices(podSingleDev device.PodSingleDevice) (device.ContainerDevices, error) { + for i, ctrDevs := range podSingleDev { + if len(ctrDevs) > 0 { + podSingleDev[i] = device.ContainerDevices{} + return ctrDevs, nil + } + } + return nil, fmt.Errorf("no pending device allocation found") +} + +// decodeDeviceAnnotations decodes the pod's device allocation annotation +// (registered as hami.io/-devices-to-allocate in InRequestDevices) +// into a PodSingleDevice. +func (ps *PluginServer) decodeDeviceAnnotations(pod *v1.Pod) (device.PodSingleDevice, error) { + pdevices, err := device.DecodePodDevices(device.InRequestDevices, pod.Annotations) + if err != nil { + return nil, err + } + pd, ok := pdevices[ps.commonWord] + if !ok { + return nil, fmt.Errorf("device %s not found in pod annotations", ps.commonWord) + } + return pd, nil +} + +// buildRuntimeInfoLookup builds a UUID-to-RuntimeInfo lookup from the pod's allocAnno annotation. +func (ps *PluginServer) buildRuntimeInfoLookup(pod *v1.Pod) (map[string]RuntimeInfo, error) { anno, ok := pod.Annotations[ps.allocAnno] if !ok { - return nil, nil,nil, nil, fmt.Errorf("annotation %s not set", "huawei.com/Ascend") + return nil, fmt.Errorf("annotation %s not set", ps.allocAnno) } var rtInfo []RuntimeInfo - err := json.Unmarshal([]byte(anno), &rtInfo) - if err != nil { - return nil, nil,nil, nil, fmt.Errorf("annotation %s value %s invalid", ps.allocAnno, anno) + if err := json.Unmarshal([]byte(anno), &rtInfo); err != nil { + return nil, fmt.Errorf("annotation %s value %s invalid: %w", ps.allocAnno, anno, err) } - var IDs []int32 - var temps []string - var memories []*int64 - var cores []*int32 - + lookup := make(map[string]RuntimeInfo, len(rtInfo)) for _, info := range rtInfo { - if info.UUID == "" { - continue - } - d := ps.mgr.GetDeviceByUUID(info.UUID) - if d == nil { - return nil, nil, nil, nil, fmt.Errorf("unknown uuid: %s", info.UUID) + if info.UUID != "" { + lookup[info.UUID] = info } - IDs = append(IDs, d.PhyID) - temps = append(temps, info.Temp) - if info.Memory != nil { - memories = append(memories, info.Memory) - } - if info.Core != nil { - cores = append(cores, info.Core) - } } - if len(IDs) == 0 { - return nil, nil, nil, nil, fmt.Errorf("annotation %s value %s invalid", ps.allocAnno, anno) + return lookup, nil +} + +// patchErasedAnnotation patches the pod's device annotation with the given +// podSingleDev. It also updates pod.Annotations in place. +func (ps *PluginServer) patchErasedAnnotation(pod *v1.Pod, podSingleDev device.PodSingleDevice) error { + klog.V(5).Infof("After erase annotation, remaining devices: %v", podSingleDev) + newAnnoValue := device.EncodePodSingleDevice(podSingleDev) + newAnnos := map[string]string{ + ps.toAllocDeviceAnno: newAnnoValue, + } + if err := util.PatchPodAnnotations(pod, newAnnos); err != nil { + return err } - return IDs, temps, memories, cores, nil + pod.Annotations[ps.toAllocDeviceAnno] = newAnnoValue + return nil } func (ps *PluginServer) apiDevices() []*v1beta1.Device { @@ -522,106 +659,80 @@ func (ps *PluginServer) Allocate(ctx context.Context, reqs *v1beta1.AllocateRequ success := false var pod *v1.Pod defer func() { - lockerr := nodelock.ReleaseNodeLock(ps.nodeName, NodeLockAscend, pod, success) - if lockerr != nil { - klog.Errorf("failed to release lock:%s", lockerr.Error()) + if pod == nil { + return + } + if success { + ps.podAllocationTrySuccess(pod) + } else { + ps.podAllocationFailed(pod) } }() - pod, err := util.GetPendingPod(ctx, ps.nodeName) + + var err error + pod, err = util.GetPendingPod(ctx, ps.nodeName) if err != nil { klog.Errorf("get pending pod error: %v", err) - return nil, fmt.Errorf("get pending pod error: %v", err) + return nil, fmt.Errorf("get pending pod error: %w", err) } - resp := v1beta1.ContainerAllocateResponse{} - IDs, temps, memories, cores, err := ps.parsePodAnnotation(pod) + klog.Infof("allocating for pod %s/%s", pod.Namespace, pod.Name) + + rtInfoLookup, err := ps.buildRuntimeInfoLookup(pod) if err != nil { - return nil, fmt.Errorf("parse pod annotation error: %v", err) - } - - vnpuMode := pod.Annotations[VNPUModeAnnotation] - klog.V(4).Infof("Pod %s vnpu mode: %s", pod.Name, vnpuMode) - - if len(IDs) == 0 { - return nil, fmt.Errorf("empty id from pod annotation") - } - ascendVisibleDevices := fmt.Sprintf("%d", IDs[0]) - for i := 1; i < len(IDs); i++ { - ascendVisibleDevices = fmt.Sprintf("%s,%d", ascendVisibleDevices, IDs[i]) + return nil, fmt.Errorf("build runtimeInfo lookup: %w", err) } - resp.Envs = make(map[string]string) - resp.Envs["ASCEND_VISIBLE_DEVICES"] = ascendVisibleDevices + podSingleDev, err := ps.decodeDeviceAnnotations(pod) + if err != nil { + return nil, fmt.Errorf("decode device annotations: %w", err) + } - if vnpuMode == VNPUModeHamiCore { - // 1. Handle volume mount injection - var mounts []*v1beta1.Mount - // A.Huawei driver and SMI toolchain (Read-Only) - driverPaths := []string{ - "/usr/local/bin/npu-smi", - "/etc/ascend_install.info", - "/usr/local/Ascend/driver/lib64/driver", - "/usr/local/Ascend/driver/version.info", - } - for _, p := range driverPaths { - mounts = append(mounts, &v1beta1.Mount{HostPath: p, ContainerPath: p, ReadOnly: true}) + // kubelet may call Allocate multiple times for the same pod, each time with + // a subset of containers. Use pop semantics to match each request with its + // corresponding containerDevices. + responses := v1beta1.AllocateResponse{} + for _, req := range reqs.ContainerRequests { + containerDevs, err := ps.popNextContainerDevices(podSingleDev) + if err != nil { + return nil, fmt.Errorf("get next container devices: %w", err) } + klog.Infof("containerDevs: %+v", containerDevs) - mounts = append(mounts, &v1beta1.Mount{ - HostPath: "/usr/local/hami-vnpu-core", - ContainerPath: "/hami-vnpu-core", - ReadOnly: true, - }) - // B. Inject HAMi library path by mounting /etc/ld.so.preload. - mounts = append(mounts, &v1beta1.Mount{ - HostPath: "/usr/local/hami-vnpu-core/ld.so.preload", // Template file on host - ContainerPath: "/etc/ld.so.preload", // Overwrites the target file in container - ReadOnly: true, - }) - - // C. Shared directory for HAMi compute resource partitioning (Read/Write) - mounts = append(mounts, &v1beta1.Mount{ - HostPath: "/usr/local/hami-shared-region", - ContainerPath: "/hami-shared-region", - ReadOnly: false, - }) - resp.Mounts = mounts - - // Set NPU_MEM_QUOTA - if len(memories) > 0 && memories[0] != nil { - resp.Envs["NPU_MEM_QUOTA"] = strconv.FormatInt(*memories[0], 10) - klog.V(4).InfoS("Memory quota set", "value", *memories[0]) + if len(containerDevs) != len(req.DevicesIDs) { + return nil, fmt.Errorf("device number not matched: annotation has %d, request has %d", len(containerDevs), len(req.DevicesIDs)) } - // Set NPU_PRIORITY - if len(cores) > 0 && cores[0] != nil { - resp.Envs["NPU_PRIORITY"] = strconv.FormatInt(int64(*cores[0]), 10) - klog.V(4).InfoS("Core priority set", "value", *cores[0]) + resp, err := ps.buildContainerAllocateResponse(pod, containerDevs, rtInfoLookup) + if err != nil { + return nil, fmt.Errorf("build container allocate response: %w", err) } + responses.ContainerResponses = append(responses.ContainerResponses, resp) + } - // Set GLOBAL_SHM_PATH separated by device ID. - if len(IDs) > 0 { - resp.Envs["NPU_GLOBAL_SHM_PATH"] = fmt.Sprintf("/hami-shared-region/%d_global_registry", IDs[0]) - klog.V(5).Infof("Create %d_global_registry", IDs[0]) - } else { - klog.Warningf("No device IDs allocated") - } - } else { - ascendVNPUSpec := "" - for i := 0; i < len(temps); i++ { - if temps[i] != "" { - ascendVNPUSpec = temps[i] - break - } - } - if ascendVNPUSpec != "" { - resp.Envs["ASCEND_VNPU_SPECS"] = ascendVNPUSpec - } + // Patch the annotation with the in-memory erased podSingleDev. + if err := ps.patchErasedAnnotation(pod, podSingleDev); err != nil { + klog.Errorf("erase allocated containers annotation error: %v", err) + return nil, fmt.Errorf("erase allocated containers annotation: %w", err) } - klog.V(5).Infof("allocate response: %v", resp) + + klog.V(5).Infof("allocate response: %+v", responses.ContainerResponses) success = true - return &v1beta1.AllocateResponse{ContainerResponses: []*v1beta1.ContainerAllocateResponse{&resp}}, nil + return &responses, nil } func (ps *PluginServer) PreStartContainer(context.Context, *v1beta1.PreStartContainerRequest) (*v1beta1.PreStartContainerResponse, error) { return &v1beta1.PreStartContainerResponse{}, nil } + +// podAllocationTrySuccess checks if all containers of this pod have been +// allocated. If so, it sets bind-phase to "success" and releases the node +// lock; otherwise it returns without setting bind-phase or releasing the lock, +// waiting for the next Allocate call. +func (ps *PluginServer) podAllocationTrySuccess(pod *v1.Pod) { + plugin.PodAllocationTrySuccess(ps.nodeName, ps.commonWord, NodeLockAscend, pod) +} + +// podAllocationFailed sets bind-phase to "failed" and releases the node lock. +func (ps *PluginServer) podAllocationFailed(pod *v1.Pod) { + plugin.PodAllocationFailed(ps.nodeName, pod, NodeLockAscend) +} diff --git a/internal/vnpu.go b/internal/vnpu.go index 9872d72..e43aef2 100644 --- a/internal/vnpu.go +++ b/internal/vnpu.go @@ -63,15 +63,15 @@ func LoadConfig(path string) (*Config, error) { return &yamlData, nil } - type NodeConfig struct { - Name string `json:"name"` - HamiVnpuCore bool `json:"hami-vnpu-core"` - VDeviceCount int `json:"vDeviceCount"` + Name string `json:"name" yaml:"name"` + HamiVnpuCore bool `json:"hami-vnpu-core" yaml:"hami-vnpu-core"` + VDeviceCount int `json:"vDeviceCount" yaml:"vDeviceCount"` + FilterDevices []int32 `json:"filterDevices,omitempty" yaml:"filterDevices,omitempty"` } type NodeListConfig struct { - Nodes []NodeConfig `json:"nodes"` + Nodes []NodeConfig `json:"nodes" yaml:"nodes"` } func LoadNodeConfig(path string) (*NodeListConfig, error) { @@ -85,4 +85,4 @@ func LoadNodeConfig(path string) (*NodeListConfig, error) { return nil, err } return &yamlData, nil -} \ No newline at end of file +}