diff --git a/pkg/scheduler/nodes.go b/pkg/scheduler/nodes.go
index 286e0bca1..102e69b31 100644
--- a/pkg/scheduler/nodes.go
+++ b/pkg/scheduler/nodes.go
@@ -100,6 +100,15 @@ func (m *nodeManager) rmNodeDevices(nodeID string, deviceVendor string) {
 	klog.InfoS("Removing device from node", "nodeName", nodeID, "deviceVendor", deviceVendor, "remainingDevices", devices)
 }
 
+// removeNode deletes the entry for nodeID from m.nodes while holding the
+// manager's exclusive lock. Removing an ID that is not registered is a
+// no-op, as with any delete on a Go map.
+func (m *nodeManager) removeNode(nodeID string) {
+	m.mutex.Lock()
+	defer m.mutex.Unlock()
+	delete(m.nodes, nodeID)
+}
+
 func (m *nodeManager) GetNode(nodeID string) (*util.NodeInfo, error) {
 	m.mutex.RLock()
 	defer m.mutex.RUnlock()
diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go
index 070658178..1ea5dc7f5 100644
--- a/pkg/scheduler/scheduler.go
+++ b/pkg/scheduler/scheduler.go
@@ -300,6 +300,27 @@ func (s *Scheduler) RegisterFromNodeAnnotations() {
 				}
 			}
 		}
+		// Drop registered nodes whose names are missing from nodeNames so that
+		// the registry does not keep entries for nodes that are no longer listed.
+		activeNodes := make(map[string]struct{}, len(nodeNames))
+		for _, name := range nodeNames {
+			activeNodes[name] = struct{}{}
+		}
+		allRegistered, listErr := s.ListNodes()
+		if listErr != nil {
+			// Skip pruning this round rather than act on a result that came with an error.
+			klog.ErrorS(listErr, "Failed to list registered nodes, skipping stale node removal")
+		} else {
+			// NOTE(review): if ListNodes hands back the manager's own map rather than a
+			// copy, this range runs outside the manager lock - confirm against ListNodes.
+			for id := range allRegistered {
+				if _, exists := activeNodes[id]; !exists {
+					klog.InfoS("Removing stale node from scheduler", "nodeName", id)
+					s.removeNode(id)
+				}
+			}
+		}
+
 		_, _, err = s.getNodesUsage(&nodeNames, nil)
 		if err != nil {
 			klog.ErrorS(err, "Failed to get node usage", "nodeNames", nodeNames)