Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions data-models/pkg/model/health_event_extentions.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,14 @@ type HealthEventWithStatus struct {
HealthEvent *protos.HealthEvent `bson:"healthevent,omitempty"`
HealthEventStatus *protos.HealthEventStatus `bson:"healtheventstatus"`
}

// GetEffectiveActionName returns the action name to use for remediation routing.
// For built-in actions, it returns the enum string (e.g., "RESTART_BM").
// For CUSTOM actions, it returns the customRecommendedAction string as-is,
// which is empty when the event did not set one.
//
// The generated protobuf getters are used instead of direct field access so
// that a nil HealthEvent does not cause a nil-pointer panic; a nil event is
// treated as carrying the zero-value RecommendedAction.
func GetEffectiveActionName(he *protos.HealthEvent) string {
	action := he.GetRecommendedAction()
	if action == protos.RecommendedAction_CUSTOM {
		return he.GetCustomRecommendedAction()
	}

	return action.String()
}
62 changes: 38 additions & 24 deletions data-models/pkg/protos/health_event.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions data-models/protobufs/health_event.proto
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ enum RecommendedAction {
RESTART_BM = 24;
REPLACE_VM = 25;
RUN_DCGMEUD = 26;
CUSTOM = 27;

UNKNOWN = 99;
}
Expand Down Expand Up @@ -109,6 +110,7 @@ message HealthEvent {
BehaviourOverrides drainOverrides = 15;
ProcessingStrategy processingStrategy = 16;
string id = 17;
string customRecommendedAction = 18;
}

message BehaviourOverrides {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ spec:
- "REPLACE_VM"
- 26
- "RUN_DCGMEUD"
- 27
- "CUSTOM"
- 99
- "UNKNOWN"
format: enum
Expand Down Expand Up @@ -119,6 +121,8 @@ spec:
x-kubernetes-int-or-string: true
id:
type: string
customRecommendedAction:
type: string
status:
nullable: true
type: object
Expand Down
4 changes: 4 additions & 0 deletions event-exporter/pkg/transformer/cloudevents.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ func ToCloudEvent(event *pb.HealthEvent, metadata map[string]string) (*CloudEven
"processingStrategy": event.ProcessingStrategy.String(),
}

if event.CustomRecommendedAction != "" {
healthEventData["customRecommendedAction"] = event.CustomRecommendedAction
}
Comment thread
lalitadithya marked this conversation as resolved.

if len(event.Metadata) > 0 {
healthEventData["metadata"] = event.Metadata
}
Expand Down
24 changes: 24 additions & 0 deletions event-exporter/pkg/transformer/cloudevents_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,30 @@ func TestToCloudEvent(t *testing.T) {
}
},
},
{
name: "custom action includes customRecommendedAction",
event: &pb.HealthEvent{
Version: 1,
Agent: "disk-health-monitor",
ComponentClass: "Disk",
CheckName: "SmartCheck",
NodeName: "node-1",
GeneratedTimestamp: fixedTimestamp,
RecommendedAction: pb.RecommendedAction_CUSTOM,
CustomRecommendedAction: "REPLACE_DISK",
},
metadata: map[string]string{"cluster": "test-cluster"},
wantErr: false,
validateFunc: func(t *testing.T, ce *CloudEvent) {
healthEvent := ce.Data["healthEvent"].(map[string]any)
if healthEvent["recommendedAction"] != "CUSTOM" {
t.Errorf("recommendedAction = %v, want CUSTOM", healthEvent["recommendedAction"])
}
if healthEvent["customRecommendedAction"] != "REPLACE_DISK" {
t.Errorf("customRecommendedAction = %v, want REPLACE_DISK", healthEvent["customRecommendedAction"])
}
},
},
}

for _, tt := range tests {
Expand Down
9 changes: 5 additions & 4 deletions fault-quarantine/pkg/evaluator/rule_evaluator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -261,10 +261,11 @@ func TestRoundTrip(t *testing.T) {
"seconds": float64(eventTime.GetSeconds()),
"nanos": float64(eventTime.GetNanos()),
},
"nodeName": "test-node",
"processingStrategy": float64(0),
"quarantineOverrides": nil,
"drainOverrides": nil,
"nodeName": "test-node",
"processingStrategy": float64(0),
"quarantineOverrides": nil,
"drainOverrides": nil,
"customRecommendedAction": "",
}

if !reflect.DeepEqual(result, expectedMap) {
Expand Down
3 changes: 2 additions & 1 deletion fault-remediation/pkg/common/equivalence_groups.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"fmt"
"log/slog"

"github.com/nvidia/nvsentinel/data-models/pkg/model"
"github.com/nvidia/nvsentinel/data-models/pkg/protos"
"github.com/nvidia/nvsentinel/fault-remediation/pkg/annotation"
"github.com/nvidia/nvsentinel/fault-remediation/pkg/config"
Expand Down Expand Up @@ -61,7 +62,7 @@ custom resource template.
*/
func GetGroupConfigForEvent(remediationActions map[string]config.MaintenanceResource,
healthEvent *protos.HealthEvent) (*EquivalenceGroupConfig, error) {
actionName := healthEvent.RecommendedAction.String()
actionName := model.GetEffectiveActionName(healthEvent)

actionConfig, exists := remediationActions[actionName]
if !exists {
Expand Down
7 changes: 5 additions & 2 deletions fault-remediation/pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -191,8 +191,11 @@ func validateResourceImpactedEntityScope(actionName string, resource Maintenance
return nil
}

if actionName != protos.RecommendedAction_COMPONENT_RESET.String() {
return fmt.Errorf("action '%s' cannot have an ImpactedEntityScope defined", actionName)
_, isBuiltinAction := protos.RecommendedAction_value[actionName]
if isBuiltinAction && actionName != protos.RecommendedAction_COMPONENT_RESET.String() {
return fmt.Errorf(
"built-in action '%s' cannot have an ImpactedEntityScope; "+
"only COMPONENT_RESET and custom actions support this", actionName)
}

if _, ok := model.EntityTypeToResourceNames[resource.ImpactedEntityScope]; !ok {
Expand Down
36 changes: 29 additions & 7 deletions fault-remediation/pkg/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -233,11 +233,11 @@ func TestTomlConfig_Validate(t *testing.T) {
config: TomlConfig{
Template: Template{MountPath: tempDir},
RemediationActions: map[string]MaintenanceResource{
"ACTION_A": {
"RESTART_BM": {
TemplateFileName: "template-a.yaml",
Scope: "Cluster",
EquivalenceGroup: "restart",
ImpactedEntityScope: "PCI",
ImpactedEntityScope: "GPU_UUID",
},
"COMPONENT_RESET": {
TemplateFileName: "template-b.yaml",
Expand All @@ -250,19 +250,19 @@ func TestTomlConfig_Validate(t *testing.T) {
},
},
expectError: true,
errorSubstr: "cannot have an ImpactedEntityScope defined",
errorSubstr: "built-in action 'RESTART_BM' cannot have an ImpactedEntityScope",
},
{
name: "Only the COMPONENT_RESET action can have an ImpactedEntityScope",
name: "Built-in actions other than COMPONENT_RESET cannot have an ImpactedEntityScope",
config: TomlConfig{
Template: Template{MountPath: tempDir},
RemediationActions: map[string]MaintenanceResource{
"ACTION_A": {
"RESTART_VM": {
TemplateFileName: "template-a.yaml",
Scope: "Cluster",
EquivalenceGroup: "restart",
},
"ACTION_B": {
"RESTART_BM": {
TemplateFileName: "template-b.yaml",
Scope: "Namespaced",
Namespace: "test-namespace",
Expand All @@ -273,7 +273,29 @@ func TestTomlConfig_Validate(t *testing.T) {
},
},
expectError: true,
errorSubstr: "cannot have an ImpactedEntityScope defined",
errorSubstr: "built-in action 'RESTART_BM' cannot have an ImpactedEntityScope",
},
{
name: "Custom actions can have an ImpactedEntityScope with valid entity type",
config: TomlConfig{
Template: Template{MountPath: tempDir},
RemediationActions: map[string]MaintenanceResource{
"RESTART_VM": {
TemplateFileName: "template-a.yaml",
Scope: "Cluster",
EquivalenceGroup: "restart",
},
"REPLACE_DISK": {
TemplateFileName: "template-b.yaml",
Scope: "Namespaced",
Namespace: "test-namespace",
EquivalenceGroup: "reset",
SupersedingEquivalenceGroups: []string{"restart"},
ImpactedEntityScope: "GPU_UUID",
},
},
},
expectError: false,
},
}

Expand Down
5 changes: 3 additions & 2 deletions fault-remediation/pkg/reconciler/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,10 +151,11 @@ func (r *FaultRemediationReconciler) shouldSkipEvent(ctx context.Context,
}

// Unsupported action detected
actionName := model.GetEffectiveActionName(healthEventWithStatus.HealthEvent)
slog.Info("Unsupported recommended action for node",
"action", action.String(),
"action", actionName,
"node", nodeName)
metrics.TotalUnsupportedRemediationActions.WithLabelValues(action.String(), nodeName).Inc()
metrics.TotalUnsupportedRemediationActions.WithLabelValues(actionName, nodeName).Inc()

_, err := r.Config.StateManager.UpdateNVSentinelStateNodeLabel(ctx,
healthEventWithStatus.HealthEvent.NodeName,
Expand Down
Loading
Loading