11package dcgm
22
3- import (
4- "errors"
5- "fmt"
6- "sync"
7-
8- godcgm "github.com/NVIDIA/go-dcgm/pkg/dcgm"
9- )
10-
// HealthStatus is a string-based health state label.
// NOTE(review): its allowed values are presumably declared in the const block
// that follows — confirm against the full file.
113type HealthStatus string
124
135const (
@@ -30,88 +22,8 @@ type Health struct {
3022 Incidents []HealthIncident `json:"incidents"`
3123}
3224
33- // DCGMWrapper is a wrapper around go-dcgm (which, in turn, is a wrapper around libdcgm.so)
34- type DCGMWrapper struct {
35- group godcgm.GroupHandle
36- healthCheckEnabled bool
37-
38- mu * sync.Mutex
39- }
40-
41- // NewDCGMWrapper initializes and starts DCGM in the specific mode:
42- // - If address is empty, then libdcgm starts embedded hostengine within the current process.
43- // This is the main mode.
44- // - If address is not empty, then libdcgm connects to already running nv-hostengine service via TCP.
45- // This mode is useful for debugging, e.g., one can start nv-hostengine via systemd and inject
46- // errors via dcgmi:
47- // - systemctl start nvidia-dcgm.service
48- // - dcgmi test --inject --gpuid 0 -f 202 -v 99999
49- //
50- // Note: embedded hostengine is started in AUTO operation mode, which means that
51- // the library handles periodic tasks by itself executing them in additional threads.
52- func NewDCGMWrapper (address string ) (* DCGMWrapper , error ) {
53- var err error
54- if address == "" {
55- _ , err = godcgm .Init (godcgm .Embedded )
56- } else {
57- // "address is a unix socket filename (1) or a TCP/IP address (0)"
58- _ , err = godcgm .Init (godcgm .Standalone , address , "0" )
59- }
60- if err != nil {
61- return nil , fmt .Errorf ("failed to initialize or start DCGM: %w" , err )
62- }
63- return & DCGMWrapper {
64- group : godcgm .GroupAllGPUs (),
65- mu : new (sync.Mutex ),
66- }, nil
67- }
68-
69- func (w * DCGMWrapper ) Shutdown () error {
70- if err := godcgm .Shutdown (); err != nil {
71- return fmt .Errorf ("failed to shut down DCGM: %w" , err )
72- }
73- return nil
74- }
75-
76- func (w * DCGMWrapper ) EnableHealthChecks () error {
77- w .mu .Lock ()
78- defer w .mu .Unlock ()
79- if w .healthCheckEnabled {
80- return errors .New ("health check system already enabled" )
81- }
82- if err := godcgm .HealthSet (w .group , godcgm .DCGM_HEALTH_WATCH_ALL ); err != nil {
83- return fmt .Errorf ("failed to configure health watches: %w" , err )
84- }
85- // "On the first call, stateful information about all of the enabled watches within a group
86- // is created but no error results are provided. On subsequent calls, any error information
87- // will be returned."
88- if _ , err := godcgm .HealthCheck (w .group ); err != nil {
89- return fmt .Errorf ("failed to initialize health watches state: %w" , err )
90- }
91- w .healthCheckEnabled = true
92- return nil
93- }
94-
95- func (w * DCGMWrapper ) GetHealth () (Health , error ) {
96- health := Health {}
97- if ! w .healthCheckEnabled {
98- return health , errors .New ("health check system is not enabled" )
99- }
100- response , err := godcgm .HealthCheck (w .group )
101- if err != nil {
102- return health , fmt .Errorf ("failed to fetch health status: %w" , err )
103- }
104- health .OverallHealth = int (response .OverallHealth )
105- health .Incidents = make ([]HealthIncident , 0 , len (response .Incidents ))
106- for _ , incident := range response .Incidents {
107- health .Incidents = append (health .Incidents , HealthIncident {
108- System : int (incident .System ),
109- Health : int (incident .Health ),
110- ErrorMessage : incident .Error .Message ,
111- ErrorCode : int (incident .Error .Code ),
112- EntityGroupID : int (incident .EntityInfo .EntityGroupId ),
113- EntityID : int (incident .EntityInfo .EntityId ),
114- })
115- }
116- return health , nil
25+ type DCGMWrapperInterface interface {
26+ Shutdown () error
27+ EnableHealthChecks () error
28+ GetHealth () (Health , error )
11729}
0 commit comments