From 363253543a29707555db5fc569abf54b091d367f Mon Sep 17 00:00:00 2001 From: hlts2 Date: Mon, 13 Apr 2026 22:14:54 +0900 Subject: [PATCH 01/71] feat: add node recovery state machine with health checkers, executor, and metrics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a proper recovery state machine (Healthy → Unhealthy → WaitingReboot) with pluggable health checkers, Prometheus metrics, and monitor-only mode support. - Add pkg/health: HealthChecker interface with NodeReady and GPU checkers - Add pkg/operation: Executor interface with CivoExecutor (FOP-based) - Add pkg/metrics: Prometheus metrics (health checks, recovery actions, phase, duration) - Add pkg/watcher/state: NodePhase enum, NodeState (private fields), StateStore - Refactor watcher: replace polling with Node Informer, state machine reconcile loop - Remove legacy code: fake.go, inline reboot/check functions, sync.Map tracking - Update main.go: new env vars (CIVO_NODE_MONITOR_ONLY, CIVO_NODE_UNHEALTHY_THRESHOLD_MINUTES, CIVO_NODE_METRICS_PORT) Co-Authored-By: Claude Opus 4.6 (1M context) --- go.mod | 23 +- go.sum | 60 +- main.go | 75 +- pkg/health/gpu.go | 36 + pkg/health/gpu_test.go | 121 ++++ pkg/health/health.go | 23 + pkg/health/node_ready.go | 17 + pkg/health/node_ready_test.go | 79 +++ pkg/metrics/metrics.go | 65 ++ pkg/operation/operation.go | 8 + pkg/operation/reboot.go | 80 +++ pkg/operation/reboot_test.go | 93 +++ pkg/watcher/fake.go | 28 - pkg/watcher/options.go | 66 +- pkg/watcher/state.go | 182 +++++ pkg/watcher/state_test.go | 295 ++++++++ pkg/watcher/watcher.go | 291 ++++---- pkg/watcher/watcher_test.go | 1204 +++++++++++---------------------- 18 files changed, 1711 insertions(+), 1035 deletions(-) create mode 100644 pkg/health/gpu.go create mode 100644 pkg/health/gpu_test.go create mode 100644 pkg/health/health.go create mode 100644 pkg/health/node_ready.go create mode 100644 pkg/health/node_ready_test.go create mode 100644 
pkg/metrics/metrics.go create mode 100644 pkg/operation/operation.go create mode 100644 pkg/operation/reboot.go create mode 100644 pkg/operation/reboot_test.go delete mode 100644 pkg/watcher/fake.go create mode 100644 pkg/watcher/state.go create mode 100644 pkg/watcher/state_test.go diff --git a/go.mod b/go.mod index 21997ac..bf87abb 100644 --- a/go.mod +++ b/go.mod @@ -4,12 +4,15 @@ go 1.24.0 require ( github.com/civo/civogo v0.3.94 + github.com/prometheus/client_golang v1.23.2 k8s.io/api v0.32.2 k8s.io/apimachinery v0.32.2 k8s.io/client-go v0.32.2 ) require ( + github.com/beorn7/perks v1.0.1 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/emicklei/go-restful/v3 v3.11.0 // indirect github.com/fxamacker/cbor/v2 v2.7.0 // indirect @@ -20,7 +23,7 @@ require ( github.com/gogo/protobuf v1.3.2 // indirect github.com/golang/protobuf v1.5.4 // indirect github.com/google/gnostic-models v0.6.8 // indirect - github.com/google/go-cmp v0.6.0 // indirect + github.com/google/go-cmp v0.7.0 // indirect github.com/google/go-querystring v1.1.0 // indirect github.com/google/gofuzz v1.2.0 // indirect github.com/google/uuid v1.6.0 // indirect @@ -31,16 +34,20 @@ require ( github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pkg/errors v0.9.1 // indirect + github.com/prometheus/client_model v0.6.2 // indirect + github.com/prometheus/common v0.66.1 // indirect + github.com/prometheus/procfs v0.16.1 // indirect github.com/spf13/pflag v1.0.5 // indirect github.com/x448/float16 v0.8.4 // indirect - golang.org/x/mod v0.20.0 // indirect - golang.org/x/net v0.38.0 // indirect - golang.org/x/oauth2 v0.27.0 // indirect - golang.org/x/sys v0.31.0 // indirect - golang.org/x/term v0.30.0 // indirect - golang.org/x/text v0.23.0 // indirect + go.yaml.in/yaml/v2 v2.4.2 // indirect + golang.org/x/mod v0.26.0 // indirect + 
golang.org/x/net v0.43.0 // indirect + golang.org/x/oauth2 v0.30.0 // indirect + golang.org/x/sys v0.35.0 // indirect + golang.org/x/term v0.34.0 // indirect + golang.org/x/text v0.28.0 // indirect golang.org/x/time v0.7.0 // indirect - google.golang.org/protobuf v1.35.1 // indirect + google.golang.org/protobuf v1.36.8 // indirect gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect diff --git a/go.sum b/go.sum index efaae30..fb4ea1d 100644 --- a/go.sum +++ b/go.sum @@ -1,3 +1,7 @@ +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/civo/civogo v0.3.94 h1:VhdqaJ2m4z8Jz8arzyzVjokRnO8JQ3lGjLKLshJ1eJI= github.com/civo/civogo v0.3.94/go.mod h1:LaEbkszc+9nXSh4YNG0sYXFGYqdQFmXXzQg0gESs2hc= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= @@ -29,8 +33,8 @@ github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvR github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U= github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= -github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= -github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/go-querystring v1.1.0 h1:AnCroh3fv4ZBgVIf1Iwtovgjaw/GiKJo8M8yD/fhyJ8= github.com/google/go-querystring v1.1.0/go.mod 
h1:Kcdr2DB4koayq7X8pmAG4sNG59So17icRSOU623lUBU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= @@ -46,6 +50,8 @@ github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnr github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= +github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= @@ -53,6 +59,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= @@ -71,6 +79,14 @@ github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 
h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= +github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= +github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= +github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= +github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= +github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= +github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= +github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= @@ -82,55 +98,59 @@ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UV github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= -github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= -github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 
v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= +go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= +go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0= -golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.26.0 h1:EGMPT//Ezu+ylkCijjPc+f4Aih7sZvaAr+O3EHBxvZg= +golang.org/x/mod v0.26.0/go.mod h1:/j6NAhSk8iQ723BGAUyoAcn7SlD7s15Dp9Nd/SfeaFQ= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8= -golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8= -golang.org/x/oauth2 v0.27.0 
h1:da9Vo7/tDv5RH/7nZDz1eMGS/q1Vv1N/7FCrBhI9I3M= -golang.org/x/oauth2 v0.27.0/go.mod h1:onh5ek6nERTohokkhCD/y2cV4Do3fxFHFuAejCkRWT8= +golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= +golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= +golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= +golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.31.0 h1:ioabZlmFYtWhL+TRYpcnNlLwhyxaM9kWTDEmfnprqik= -golang.org/x/sys v0.31.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= -golang.org/x/term v0.30.0 h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y= -golang.org/x/term v0.30.0/go.mod h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g= +golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= +golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/term v0.34.0 h1:O/2T7POpk0ZZ7MAzMeWFSg6S5IpWd/RXDlM9hgM3DR4= +golang.org/x/term v0.34.0/go.mod h1:5jC53AEywhIVebHgPVeg0mj8OD3VO9OzclacVrqpaAw= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY= -golang.org/x/text v0.23.0/go.mod 
h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4= +golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng= +golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU= golang.org/x/time v0.7.0 h1:ntUhktv3OPE6TgYxXWv9vKvUSJyIFJlyohwbkEwPrKQ= golang.org/x/time v0.7.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.26.0 h1:v/60pFQmzmT9ExmjDv2gGIfi3OqfKoEP6I5+umXlbnQ= -golang.org/x/tools v0.26.0/go.mod h1:TPVVj70c7JJ3WCazhD8OdXcZg/og+b9+tH/KxylGwH0= +golang.org/x/tools v0.35.0 h1:mBffYraMEf7aa0sB+NuKnuCy8qI/9Bughn8dC2Gu5r0= +golang.org/x/tools v0.35.0/go.mod h1:NKdj5HkL/73byiZSJjqJgKn3ep7KjFkBOkR/Hps3VPw= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -google.golang.org/protobuf v1.35.1 h1:m3LfL6/Ca+fqnjnlqQXNpFPABW1UD7mjh8KO2mKFytA= -google.golang.org/protobuf v1.35.1/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= +google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= +google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod 
h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= diff --git a/main.go b/main.go index 94f678b..31c4bca 100644 --- a/main.go +++ b/main.go @@ -3,34 +3,80 @@ package main import ( "context" "flag" + "fmt" "log/slog" + "net/http" "os" "os/signal" + "strconv" "strings" "syscall" + "github.com/civo/node-agent/pkg/health" + "github.com/civo/node-agent/pkg/metrics" + "github.com/civo/node-agent/pkg/operation" "github.com/civo/node-agent/pkg/watcher" ) -var versionInfo = flag.Bool("version", false, "Print the driver version") +var ( + version = "0.0.1" + versionInfo = flag.Bool("version", false, "Print the driver version") +) var ( - apiURL = strings.TrimSpace(os.Getenv("CIVO_API_URL")) - apiKey = strings.TrimSpace(os.Getenv("CIVO_API_KEY")) - region = strings.TrimSpace(os.Getenv("CIVO_REGION")) - clusterID = strings.TrimSpace(os.Getenv("CIVO_CLUSTER_ID")) - nodePoolID = strings.TrimSpace(os.Getenv("CIVO_NODE_POOL_ID")) - nodeDesiredGPUCount = strings.TrimSpace(os.Getenv("CIVO_NODE_DESIRED_GPU_COUNT")) - rebootTimeWindowMinutes = strings.TrimSpace(os.Getenv("CIVO_NODE_REBOOT_TIME_WINDOW_MINUTES")) + apiURL = strings.TrimSpace(os.Getenv("CIVO_API_URL")) + apiKey = strings.TrimSpace(os.Getenv("CIVO_API_KEY")) + region = strings.TrimSpace(os.Getenv("CIVO_REGION")) + clusterID = strings.TrimSpace(os.Getenv("CIVO_CLUSTER_ID")) + nodePoolID = strings.TrimSpace(os.Getenv("CIVO_NODE_POOL_ID")) + nodeDesiredGPUCount = strings.TrimSpace(os.Getenv("CIVO_NODE_DESIRED_GPU_COUNT")) + rebootTimeWindowMinutes = strings.TrimSpace(os.Getenv("CIVO_NODE_REBOOT_TIME_WINDOW_MINUTES")) + monitorOnly = strings.TrimSpace(os.Getenv("CIVO_NODE_MONITOR_ONLY")) + unhealthyThresholdMinutes = strings.TrimSpace(os.Getenv("CIVO_NODE_UNHEALTHY_THRESHOLD_MINUTES")) + metricsPort = 
strings.TrimSpace(os.Getenv("CIVO_NODE_METRICS_PORT")) +) + +const ( + defaultMetricsPort = "9625" ) func run(ctx context.Context) error { ctx, stop := signal.NotifyContext(ctx, os.Interrupt, syscall.SIGTERM) defer stop() - w, err := watcher.NewWatcher(ctx, apiURL, apiKey, region, clusterID, nodePoolID, + executor, err := operation.NewCivoExecutor(clusterID, + operation.WithAPIConfig(apiKey, apiURL, region, version)) + if err != nil { + return fmt.Errorf("failed to initialise executor: %w", err) + } + checkers := health.NewDefaultCheckers(parseIntOrZero(nodeDesiredGPUCount)) + + monitorOnlyFlag := true + if v, err := strconv.ParseBool(monitorOnly); err == nil { + monitorOnlyFlag = v + } + + metrics.Register() + go func() { + port := defaultMetricsPort + // Exclude well known port and negative integers. + if v, err := strconv.Atoi(metricsPort); err == nil && v >= 1024 && v <= 65535 { + port = metricsPort + } + addr := ":" + port + slog.Info("Starting metrics server", "addr", addr) + if err := http.ListenAndServe(addr, metrics.Handler()); err != nil { + slog.Error("Metrics server failed", "error", err) + } + }() + + w, err := watcher.NewWatcher(ctx, clusterID, nodePoolID, + watcher.WithExecutor(executor), + watcher.WithCheckers(checkers), + watcher.WithMonitorOnly(monitorOnlyFlag), watcher.WithRebootTimeWindowMinutes(rebootTimeWindowMinutes), watcher.WithDesiredGPUCount(nodeDesiredGPUCount), + watcher.WithUnhealthyThresholdMinutes(unhealthyThresholdMinutes), ) if err != nil { return err @@ -41,7 +87,7 @@ func run(ctx context.Context) error { func main() { flag.Parse() if *versionInfo { - slog.Info("node-agent", "version", watcher.Version) + slog.Info("node-agent", "version", version) return } @@ -56,3 +102,12 @@ func main() { os.Exit(1) } } + +func parseIntOrZero(s string) int { + if s == "" { + return 0 + } + n := 0 + fmt.Sscanf(s, "%d", &n) + return n +} diff --git a/pkg/health/gpu.go b/pkg/health/gpu.go new file mode 100644 index 0000000..de81007 --- /dev/null +++ 
b/pkg/health/gpu.go @@ -0,0 +1,36 @@ +package health + +import corev1 "k8s.io/api/core/v1" + +const gpuResourceName = "nvidia.com/gpu" + +// gpuChecker reports healthy when the node's allocatable GPU count +// matches the desired count. If desiredCount is 0 the check always passes. +type gpuChecker struct { + desiredCount int +} + +// NewGPUChecker creates a HealthChecker that verifies the node's allocatable GPU count. +func NewGPUChecker(desiredCount int) HealthChecker { + return &gpuChecker{desiredCount: desiredCount} +} + +func (c *gpuChecker) Name() string { return "GPU" } + +func (c *gpuChecker) Check(node *corev1.Node) bool { + if c.desiredCount == 0 { + return true + } + + quantity, exists := node.Status.Allocatable[gpuResourceName] + if !exists || quantity.IsZero() { + return false + } + + gpuCount, ok := quantity.AsInt64() + if !ok { + return false + } + + return gpuCount == int64(c.desiredCount) +} diff --git a/pkg/health/gpu_test.go b/pkg/health/gpu_test.go new file mode 100644 index 0000000..f494054 --- /dev/null +++ b/pkg/health/gpu_test.go @@ -0,0 +1,121 @@ +package health + +import ( + "testing" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func TestGPUChecker_Name(t *testing.T) { + c := NewGPUChecker(8) + if got := c.Name(); got != "GPU" { + t.Errorf("got %q, want %q", got, "GPU") + } +} + +func TestGPUChecker_Check(t *testing.T) { + tests := []struct { + name string + desired int + node *corev1.Node + want bool + }{ + { + name: "Returns true when GPU count matches desired", + desired: 8, + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, + Status: corev1.NodeStatus{ + Allocatable: corev1.ResourceList{ + gpuResourceName: resource.MustParse("8"), + }, + }, + }, + want: true, + }, + { + name: "Returns true when desired is 0 (check skipped)", + desired: 0, + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, + Status: 
corev1.NodeStatus{ + Allocatable: corev1.ResourceList{}, + }, + }, + want: true, + }, + { + name: "Returns false when GPU count is less than desired", + desired: 8, + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, + Status: corev1.NodeStatus{ + Allocatable: corev1.ResourceList{ + gpuResourceName: resource.MustParse("7"), + }, + }, + }, + want: false, + }, + { + name: "Returns false when GPU count is zero", + desired: 8, + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, + Status: corev1.NodeStatus{ + Allocatable: corev1.ResourceList{ + gpuResourceName: resource.MustParse("0"), + }, + }, + }, + want: false, + }, + { + name: "Returns false when no GPU resource in allocatable", + desired: 8, + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, + Status: corev1.NodeStatus{ + Allocatable: corev1.ResourceList{}, + }, + }, + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + c := NewGPUChecker(tt.desired) + if got := c.Check(tt.node); got != tt.want { + t.Errorf("got %v, want %v", got, tt.want) + } + }) + } +} + +func TestNewDefaultCheckers(t *testing.T) { + t.Run("GPU disabled when desiredCount is 0", func(t *testing.T) { + checkers := NewDefaultCheckers(0) + if len(checkers) != 1 { + t.Fatalf("expected 1 checker, got %d", len(checkers)) + } + if checkers[0].Name() != "NodeReady" { + t.Errorf("expected NodeReady checker, got %q", checkers[0].Name()) + } + }) + + t.Run("GPU enabled when desiredCount is positive", func(t *testing.T) { + checkers := NewDefaultCheckers(8) + if len(checkers) != 2 { + t.Fatalf("expected 2 checkers, got %d", len(checkers)) + } + if checkers[0].Name() != "NodeReady" { + t.Errorf("expected NodeReady checker first, got %q", checkers[0].Name()) + } + if checkers[1].Name() != "GPU" { + t.Errorf("expected GPU checker second, got %q", checkers[1].Name()) + } + }) +} diff --git a/pkg/health/health.go b/pkg/health/health.go new file mode 100644 index 
0000000..d0c5308 --- /dev/null +++ b/pkg/health/health.go @@ -0,0 +1,23 @@ +package health + +import corev1 "k8s.io/api/core/v1" + +// HealthChecker determines whether a single aspect of a node is healthy. +type HealthChecker interface { + // Name returns a human-readable identifier for this checker (e.g. "NodeReady"). + Name() string + // Check returns true if the node is healthy for this checker's concern. + Check(node *corev1.Node) bool +} + +// NewDefaultCheckers returns the enabled health checkers for the MVP. +// GPUChecker is included only when desiredGPUCount > 0. +func NewDefaultCheckers(desiredGPUCount int) []HealthChecker { + checkers := []HealthChecker{ + &nodeReadyChecker{}, + } + if desiredGPUCount > 0 { + checkers = append(checkers, &gpuChecker{desiredCount: desiredGPUCount}) + } + return checkers +} diff --git a/pkg/health/node_ready.go b/pkg/health/node_ready.go new file mode 100644 index 0000000..d9ca7e7 --- /dev/null +++ b/pkg/health/node_ready.go @@ -0,0 +1,17 @@ +package health + +import corev1 "k8s.io/api/core/v1" + +// nodeReadyChecker reports healthy when the node's NodeReady condition is True. 
+type nodeReadyChecker struct{} + +func (c *nodeReadyChecker) Name() string { return "NodeReady" } + +func (c *nodeReadyChecker) Check(node *corev1.Node) bool { + for _, cond := range node.Status.Conditions { + if cond.Type == corev1.NodeReady { + return cond.Status == corev1.ConditionTrue + } + } + return false +} diff --git a/pkg/health/node_ready_test.go b/pkg/health/node_ready_test.go new file mode 100644 index 0000000..120ebd5 --- /dev/null +++ b/pkg/health/node_ready_test.go @@ -0,0 +1,79 @@ +package health + +import ( + "testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func TestNodeReadyChecker_Name(t *testing.T) { + c := &nodeReadyChecker{} + if got := c.Name(); got != "NodeReady" { + t.Errorf("got %q, want %q", got, "NodeReady") + } +} + +func TestNodeReadyChecker_Check(t *testing.T) { + tests := []struct { + name string + node *corev1.Node + want bool + }{ + { + name: "Returns true when NodeReady condition is True", + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, + Status: corev1.NodeStatus{ + Conditions: []corev1.NodeCondition{ + {Type: corev1.NodeReady, Status: corev1.ConditionTrue}, + }, + }, + }, + want: true, + }, + { + name: "Returns false when NodeReady condition is False", + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, + Status: corev1.NodeStatus{ + Conditions: []corev1.NodeCondition{ + {Type: corev1.NodeReady, Status: corev1.ConditionFalse}, + }, + }, + }, + want: false, + }, + { + name: "Returns false when no conditions present", + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, + Status: corev1.NodeStatus{ + Conditions: []corev1.NodeCondition{}, + }, + }, + want: false, + }, + { + name: "Returns false when only non-NodeReady conditions present", + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, + Status: corev1.NodeStatus{ + Conditions: []corev1.NodeCondition{ + {Type: corev1.NodeDiskPressure, Status: 
corev1.ConditionFalse}, + }, + }, + }, + want: false, + }, + } + + c := &nodeReadyChecker{} + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := c.Check(tt.node); got != tt.want { + t.Errorf("got %v, want %v", got, tt.want) + } + }) + } +} diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go new file mode 100644 index 0000000..e9ea34e --- /dev/null +++ b/pkg/metrics/metrics.go @@ -0,0 +1,65 @@ +package metrics + +import ( + "net/http" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" +) + +var ( + // HealthCheckTotal counts the number of health check executions per node, + // checker, and result (pass/fail). + HealthCheckTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "node_agent_health_check_total", + Help: "Total number of health check executions.", + }, + []string{"node", "checker", "result"}, + ) + + // RecoveryActionsTotal counts the number of recovery actions performed + // per node, action type (reboot), and mode (report/active). + RecoveryActionsTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "node_agent_recovery_actions_total", + Help: "Total number of recovery actions performed.", + }, + []string{"node", "action", "mode"}, + ) + + // NodeUnhealthyDurationSeconds tracks how long each node has been + // continuously unhealthy, in seconds. + NodeUnhealthyDurationSeconds = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "node_agent_node_unhealthy_duration_seconds", + Help: "Duration in seconds a node has been continuously unhealthy.", + }, + []string{"node"}, + ) + + // RecoveryPhase reports the current recovery phase for each node. + // The value is the numeric NodePhase (0=Healthy, 1=Unhealthy, etc.). 
+ RecoveryPhase = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "node_agent_recovery_phase", + Help: "Current recovery phase of a node.", + }, + []string{"node", "phase"}, + ) +) + +// Register registers all node-agent metrics with the default Prometheus registerer. +func Register() { + prometheus.MustRegister( + HealthCheckTotal, + RecoveryActionsTotal, + NodeUnhealthyDurationSeconds, + RecoveryPhase, + ) +} + +// Handler returns an http.Handler that serves Prometheus metrics. +func Handler() http.Handler { + return promhttp.Handler() +} diff --git a/pkg/operation/operation.go b/pkg/operation/operation.go new file mode 100644 index 0000000..4f00a21 --- /dev/null +++ b/pkg/operation/operation.go @@ -0,0 +1,8 @@ +package operation + +import "context" + +// Executor performs recovery operations on cluster nodes. +type Executor interface { + Reboot(ctx context.Context, nodeName string) error +} diff --git a/pkg/operation/reboot.go b/pkg/operation/reboot.go new file mode 100644 index 0000000..d817774 --- /dev/null +++ b/pkg/operation/reboot.go @@ -0,0 +1,80 @@ +package operation + +import ( + "context" + "fmt" + "log/slog" + + "github.com/civo/civogo" +) + +// Option represents a configuration function that modifies civoExecutor. +type Option func(*civoExecutor) + +// WithAPIConfig returns Option to configure the Civo API credentials and version. +// The client is created internally using these values. +func WithAPIConfig(apiKey, apiURL, region, version string) Option { + return func(e *civoExecutor) { + e.apiKey = apiKey + e.apiURL = apiURL + e.region = region + e.version = version + } +} + +// WithClient returns Option to inject a pre-built Civo client (for testing). +func WithClient(client civogo.Clienter) Option { + return func(e *civoExecutor) { + e.civoClient = client + } +} + +// civoExecutor implements Executor using the Civo API. 
+type civoExecutor struct { + civoClient civogo.Clienter + clusterID string + + apiKey string + apiURL string + region string + version string +} + +// NewCivoExecutor creates an Executor that performs recovery actions via the Civo API. +func NewCivoExecutor(clusterID string, opts ...Option) (Executor, error) { + e := &civoExecutor{clusterID: clusterID} + for _, opt := range opts { + opt(e) + } + + if e.civoClient != nil { + return e, nil + } + + client, err := civogo.NewClientWithURL(e.apiKey, e.apiURL, e.region) + if err != nil { + return nil, fmt.Errorf("failed to initialise civo client: %w", err) + } + client.SetUserAgent(&civogo.Component{ + ID: clusterID, + Name: "node-agent", + Version: e.version, + }) + e.civoClient = client + return e, nil +} + +func (e *civoExecutor) Reboot(_ context.Context, nodeName string) error { + instance, err := e.civoClient.FindKubernetesClusterInstance(e.clusterID, nodeName) + if err != nil { + return fmt.Errorf("failed to find instance, clusterID: %s, nodeName: %s: %w", e.clusterID, nodeName, err) + } + + _, err = e.civoClient.HardRebootInstance(instance.ID) + if err != nil { + return fmt.Errorf("failed to reboot instance, clusterID: %s, instanceID: %s: %w", e.clusterID, instance.ID, err) + } + + slog.Info("Instance is rebooting", "instanceID", instance.ID, "node", nodeName) + return nil +} diff --git a/pkg/operation/reboot_test.go b/pkg/operation/reboot_test.go new file mode 100644 index 0000000..c0977cb --- /dev/null +++ b/pkg/operation/reboot_test.go @@ -0,0 +1,93 @@ +package operation + +import ( + "errors" + "testing" + + "github.com/civo/civogo" +) + +// fakeClient overrides the Civo API methods needed by CivoExecutor. 
+type fakeClient struct { + findFunc func(clusterID, search string) (*civogo.Instance, error) + rebootFunc func(id string) (*civogo.SimpleResponse, error) + + *civogo.FakeClient +} + +func (f *fakeClient) FindKubernetesClusterInstance(clusterID, search string) (*civogo.Instance, error) { + if f.findFunc != nil { + return f.findFunc(clusterID, search) + } + return f.FakeClient.FindKubernetesClusterInstance(clusterID, search) +} + +func (f *fakeClient) HardRebootInstance(id string) (*civogo.SimpleResponse, error) { + if f.rebootFunc != nil { + return f.rebootFunc(id) + } + return f.FakeClient.HardRebootInstance(id) +} + +var _ civogo.Clienter = (*fakeClient)(nil) + +func TestCivoExecutor_Reboot(t *testing.T) { + tests := []struct { + name string + nodeName string + client *fakeClient + wantErr bool + }{ + { + name: "Returns nil on successful find and reboot", + nodeName: "node-01", + client: &fakeClient{ + findFunc: func(clusterID, search string) (*civogo.Instance, error) { + return &civogo.Instance{ID: "instance-01"}, nil + }, + rebootFunc: func(id string) (*civogo.SimpleResponse, error) { + if id != "instance-01" { + t.Errorf("instanceID mismatch: got %s, want instance-01", id) + } + return new(civogo.SimpleResponse), nil + }, + }, + }, + { + name: "Returns error when instance lookup fails", + nodeName: "node-01", + client: &fakeClient{ + findFunc: func(_, _ string) (*civogo.Instance, error) { + return nil, errors.New("not found") + }, + }, + wantErr: true, + }, + { + name: "Returns error when hard reboot fails", + nodeName: "node-01", + client: &fakeClient{ + findFunc: func(_, _ string) (*civogo.Instance, error) { + return &civogo.Instance{ID: "instance-01"}, nil + }, + rebootFunc: func(_ string) (*civogo.SimpleResponse, error) { + return nil, errors.New("reboot failed") + }, + }, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + exec, err := NewCivoExecutor("test-cluster", WithClient(tt.client)) + if err != nil { + 
t.Fatal(err) + } + err = exec.Reboot(t.Context(), tt.nodeName) + if (err != nil) != tt.wantErr { + t.Errorf("error = %v, wantErr %v", err, tt.wantErr) + } + }) + } +} diff --git a/pkg/watcher/fake.go b/pkg/watcher/fake.go deleted file mode 100644 index 40c37d9..0000000 --- a/pkg/watcher/fake.go +++ /dev/null @@ -1,28 +0,0 @@ -package watcher - -import "github.com/civo/civogo" - -// FakeClient is a test client used for more flexible behavior control -// when FakeClient alone is not sufficient. -type FakeClient struct { - HardRebootInstanceFunc func(id string) (*civogo.SimpleResponse, error) - FindKubernetesClusterInstanceFunc func(clusterID, search string) (*civogo.Instance, error) - - *civogo.FakeClient -} - -func (f *FakeClient) HardRebootInstance(id string) (*civogo.SimpleResponse, error) { - if f.HardRebootInstanceFunc != nil { - return f.HardRebootInstanceFunc(id) - } - return f.FakeClient.HardRebootInstance(id) -} - -func (f *FakeClient) FindKubernetesClusterInstance(clusterID, search string) (*civogo.Instance, error) { - if f.FindKubernetesClusterInstanceFunc != nil { - return f.FindKubernetesClusterInstanceFunc(clusterID, search) - } - return f.FakeClient.FindKubernetesClusterInstance(clusterID, search) -} - -var _ civogo.Clienter = (*FakeClient)(nil) diff --git a/pkg/watcher/options.go b/pkg/watcher/options.go index 904634d..cc3d677 100644 --- a/pkg/watcher/options.go +++ b/pkg/watcher/options.go @@ -5,8 +5,10 @@ import ( "strconv" "time" - "github.com/civo/civogo" + "github.com/civo/node-agent/pkg/health" + "github.com/civo/node-agent/pkg/operation" "k8s.io/client-go/kubernetes" + listerscorev1 "k8s.io/client-go/listers/core/v1" ) // Option represents a configuration function that modifies watcher object. @@ -15,6 +17,7 @@ type Option func(*watcher) var defaultOptions = []Option{ WithRebootTimeWindowMinutes("40"), WithDesiredGPUCount("0"), + WithUnhealthyThresholdMinutes("10"), } // WithKubernetesClient returns Option to set Kubernetes API client. 
@@ -35,15 +38,6 @@ func WithKubernetesClientConfigPath(path string) Option { } } -// WithCivoClient returns Option to set Civo API client. -func WithCivoClient(client civogo.Clienter) Option { - return func(w *watcher) { - if client != nil { - w.civoClient = client - } - } -} - // WithRebootTimeWindowMinutes returns Option to set reboot time window. func WithRebootTimeWindowMinutes(s string) Option { return func(w *watcher) { @@ -67,3 +61,55 @@ func WithDesiredGPUCount(s string) Option { } } } + +// WithMonitorOnly returns Option to enable or disable monitor-only mode. +// When true (default), recovery actions are logged but not executed. +func WithMonitorOnly(v bool) Option { + return func(w *watcher) { + w.monitorOnly = v + } +} + +// WithUnhealthyThresholdMinutes returns Option to set the duration a node +// must be continuously unhealthy before a recovery action is triggered. +func WithUnhealthyThresholdMinutes(s string) Option { + return func(w *watcher) { + n, err := strconv.Atoi(s) + if err == nil && n > 0 { + w.unhealthyThreshold = time.Duration(n) * time.Minute + } else { + slog.Info("UnhealthyThresholdMinutes is invalid", "value", s) + } + } +} + +// WithCheckers returns Option to set the health checkers. +func WithCheckers(checkers []health.HealthChecker) Option { + return func(w *watcher) { + w.checkers = checkers + } +} + +// WithExecutor returns Option to set the recovery executor. +func WithExecutor(exec operation.Executor) Option { + return func(w *watcher) { + w.executor = exec + } +} + +// WithNowFunc returns Option to override the time source (for testing). +func WithNowFunc(fn func() time.Time) Option { + return func(w *watcher) { + if fn != nil { + w.nowFunc = fn + } + } +} + +// WithNodeLister returns Option to inject a node lister (for testing). +// When set, the informer setup is skipped. 
+func WithNodeLister(lister listerscorev1.NodeLister) Option {
+	return func(w *watcher) {
+		w.nodeLister = lister
+	}
+}
diff --git a/pkg/watcher/state.go b/pkg/watcher/state.go
new file mode 100644
index 0000000..7b3a60a
--- /dev/null
+++ b/pkg/watcher/state.go
@@ -0,0 +1,182 @@
+package watcher
+
+import (
+	"sync"
+	"time"
+)
+
+// NodePhase represents the current recovery phase of a node.
+// The zero value is PhaseUnknown.
+type NodePhase int
+
+// Recovery phases. Values are assigned by iota in declaration order, so
+// reordering the list changes the numeric value of every later phase.
+const (
+	PhaseUnknown       NodePhase = iota // 0 - unknown/uninitialized
+	PhaseHealthy                        // 1 - node is healthy
+	PhaseUnhealthy                      // 2 - checker(s) failing, waiting for threshold
+	PhaseReboot                         // 3 - reboot command issued
+	PhaseWaitingReboot                  // 4 - waiting for reboot to take effect
+	PhaseDrain                          // 5 - future: draining pods
+	PhaseReplace                        // 6 - future: replace issued
+)
+
+// String returns the string representation of a NodePhase.
+// Values outside the declared set render as "Unknown", the same text used for
+// PhaseUnknown.
+func (p NodePhase) String() string {
+	switch p {
+	case PhaseUnknown:
+		return "Unknown"
+	case PhaseHealthy:
+		return "Healthy"
+	case PhaseUnhealthy:
+		return "Unhealthy"
+	case PhaseReboot:
+		return "Reboot"
+	case PhaseWaitingReboot:
+		return "WaitingReboot"
+	case PhaseDrain:
+		return "Drain"
+	case PhaseReplace:
+		return "Replace"
+	default:
+		return "Unknown"
+	}
+}
+
+// NodeState holds the recovery state for a single node.
+// All fields are private; read via getters, mutate via StateStore methods.
+type NodeState struct {
+	phase          NodePhase // current recovery phase
+	unhealthySince time.Time // set by StateStore.MarkUnhealthy
+	lastRebootTime time.Time // set by StateStore.MarkWaitingReboot
+	rebootCount    int       // incremented by StateStore.MarkWaitingReboot
+	failedCheckers []string  // set by StateStore.UpdateCheckerInfo
+	isGPUNode      bool      // set by StateStore.UpdateCheckerInfo
+}
+
+// Phase returns the node's current recovery phase.
+func (s *NodeState) Phase() NodePhase          { return s.phase }
+// UnhealthySince returns the time recorded when the node was marked unhealthy;
+// it is the zero time if the node has not been marked unhealthy.
+func (s *NodeState) UnhealthySince() time.Time { return s.unhealthySince }
+// LastRebootTime returns the time recorded for the most recent reboot;
+// it is the zero time if no reboot has been recorded.
+func (s *NodeState) LastRebootTime() time.Time { return s.lastRebootTime }
+// RebootCount returns how many reboots have been recorded for the node.
+func (s *NodeState) RebootCount() int          { return s.rebootCount }
+// IsGPUNode reports the GPU flag last stored for the node.
+func (s *NodeState) IsGPUNode() bool           { return s.isGPUNode }
+
+// FailedCheckers returns a copy of the failed checker names.
+func (s *NodeState) FailedCheckers() []string {
+	out := make([]string, len(s.failedCheckers))
+	copy(out, s.failedCheckers)
+	return out
+}
+
+// StateStore is a concurrency-safe store for per-node recovery state.
+//
+// Each method holds the store's lock for the duration of the call. The
+// *NodeState values handed out by GetOrCreate, Get and Range are the stored
+// entries themselves, not snapshots.
+type StateStore struct {
+	mu    sync.RWMutex          // guards nodes; held while entries are updated
+	nodes map[string]*NodeState // keyed by node name
+}
+
+// NewStateStore creates a new empty StateStore.
+func NewStateStore() *StateStore {
+	return &StateStore{
+		nodes: make(map[string]*NodeState),
+	}
+}
+
+// GetOrCreate returns the NodeState for the given node name,
+// creating a new one (PhaseHealthy) if it does not exist.
+// Repeated calls for the same name return the same pointer until the entry is
+// removed or replaced.
+func (s *StateStore) GetOrCreate(name string) *NodeState {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	if st, ok := s.nodes[name]; ok {
+		return st
+	}
+	st := &NodeState{phase: PhaseHealthy}
+	s.nodes[name] = st
+	return st
+}
+
+// Get returns the NodeState for the given node name and whether it was found.
+// Unlike GetOrCreate it never adds an entry.
+func (s *StateStore) Get(name string) (*NodeState, bool) {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+	st, ok := s.nodes[name]
+	return st, ok
+}
+
+// Delete removes the state entry for the given node name.
+// Deleting a name that has no entry is a no-op.
+func (s *StateStore) Delete(name string) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	delete(s.nodes, name)
+}
+
+// Range calls fn for each node state entry. If fn returns false, iteration stops.
+//
+// Entries are visited in map iteration order, which is unspecified. fn runs
+// while the store's read lock is held, so it must not call StateStore methods
+// that take the write lock (GetOrCreate, Delete, UpdateCheckerInfo, ...);
+// doing so from the same goroutine deadlocks.
+func (s *StateStore) Range(fn func(name string, state *NodeState) bool) {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	for name, state := range s.nodes {
+		if !fn(name, state) {
+			return
+		}
+	}
+}
+
+// UpdateCheckerInfo updates the failed checker names and GPU flag for a node.
+// It is a no-op when the node has no entry. The names are copied, so later
+// changes to the caller's slice do not affect the stored state.
+func (s *StateStore) UpdateCheckerInfo(name string, failedCheckers []string, isGPUNode bool) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	st, ok := s.nodes[name]
+	if !ok {
+		return
+	}
+	// Detach from the caller's backing array, mirroring the copy that
+	// FailedCheckers makes on the read side.
+	st.failedCheckers = append([]string(nil), failedCheckers...)
+	st.isGPUNode = isGPUNode
+}
+
+// MarkUnhealthy transitions a node to PhaseUnhealthy and records when it became unhealthy.
+func (s *StateStore) MarkUnhealthy(name string, now time.Time) { + s.mu.Lock() + defer s.mu.Unlock() + + st, ok := s.nodes[name] + if !ok { + return + } + st.phase = PhaseUnhealthy + st.unhealthySince = now +} + +// MarkWaitingReboot transitions a node to PhaseWaitingReboot, +// records the reboot time, and increments the reboot counter. +func (s *StateStore) MarkWaitingReboot(name string, now time.Time) { + s.mu.Lock() + defer s.mu.Unlock() + + st, ok := s.nodes[name] + if !ok { + return + } + st.phase = PhaseWaitingReboot + st.lastRebootTime = now + st.rebootCount++ +} + +// Reset replaces the node's state with a fresh PhaseHealthy entry. +func (s *StateStore) Reset(name string) { + s.mu.Lock() + defer s.mu.Unlock() + + if _, ok := s.nodes[name]; ok { + s.nodes[name] = &NodeState{phase: PhaseHealthy} + } +} + +// Cleanup removes state entries for nodes that are not in the activeNodes set. +func (s *StateStore) Cleanup(activeNodes map[string]struct{}) { + s.mu.Lock() + defer s.mu.Unlock() + + for name := range s.nodes { + if _, ok := activeNodes[name]; !ok { + delete(s.nodes, name) + } + } +} diff --git a/pkg/watcher/state_test.go b/pkg/watcher/state_test.go new file mode 100644 index 0000000..45fcbdb --- /dev/null +++ b/pkg/watcher/state_test.go @@ -0,0 +1,295 @@ +package watcher + +import ( + "testing" + "time" +) + +func TestNodePhaseString(t *testing.T) { + tests := []struct { + phase NodePhase + want string + }{ + {PhaseUnknown, "Unknown"}, + {PhaseHealthy, "Healthy"}, + {PhaseUnhealthy, "Unhealthy"}, + {PhaseReboot, "Reboot"}, + {PhaseWaitingReboot, "WaitingReboot"}, + {PhaseDrain, "Drain"}, + {PhaseReplace, "Replace"}, + {NodePhase(99), "Unknown"}, + } + + for _, tt := range tests { + t.Run(tt.want, func(t *testing.T) { + if got := tt.phase.String(); got != tt.want { + t.Errorf("got %q, want %q", got, tt.want) + } + }) + } +} + +func TestNodePhaseZeroValue(t *testing.T) { + var phase NodePhase + if phase != PhaseUnknown { + t.Errorf("zero value of 
NodePhase should be PhaseUnknown, got %v", phase) + } +} + +func TestStateStoreGetOrCreate(t *testing.T) { + s := NewStateStore() + + st := s.GetOrCreate("node-01") + if st.Phase() != PhaseHealthy { + t.Errorf("new state should be PhaseHealthy, got %v", st.Phase()) + } + + st2 := s.GetOrCreate("node-01") + if st != st2 { + t.Error("GetOrCreate should return the same pointer for existing node") + } +} + +func TestStateStoreGet(t *testing.T) { + s := NewStateStore() + + _, ok := s.Get("nonexistent") + if ok { + t.Error("Get should return false for nonexistent node") + } + + s.GetOrCreate("node-01") + st, ok := s.Get("node-01") + if !ok { + t.Error("Get should return true for existing node") + } + if st.Phase() != PhaseHealthy { + t.Errorf("got phase %v, want PhaseHealthy", st.Phase()) + } +} + +func TestStateStoreDelete(t *testing.T) { + s := NewStateStore() + s.GetOrCreate("node-01") + + s.Delete("node-01") + _, ok := s.Get("node-01") + if ok { + t.Error("node should be deleted") + } + + // Deleting nonexistent node should not panic. 
+ s.Delete("nonexistent") +} + +func TestStateStoreRange(t *testing.T) { + s := NewStateStore() + s.GetOrCreate("node-01") + s.GetOrCreate("node-02") + s.GetOrCreate("node-03") + + visited := make(map[string]bool) + s.Range(func(name string, _ *NodeState) bool { + visited[name] = true + return true + }) + + if len(visited) != 3 { + t.Errorf("Range should visit 3 nodes, visited %d", len(visited)) + } +} + +func TestStateStoreRangeEarlyStop(t *testing.T) { + s := NewStateStore() + s.GetOrCreate("node-01") + s.GetOrCreate("node-02") + s.GetOrCreate("node-03") + + count := 0 + s.Range(func(_ string, _ *NodeState) bool { + count++ + return false + }) + + if count != 1 { + t.Errorf("Range should stop after first call when fn returns false, visited %d", count) + } +} + +func TestStateStoreMarkUnhealthy(t *testing.T) { + s := NewStateStore() + s.GetOrCreate("node-01") + + now := time.Date(2026, 4, 13, 12, 0, 0, 0, time.UTC) + s.MarkUnhealthy("node-01", now) + + st, _ := s.Get("node-01") + if st.Phase() != PhaseUnhealthy { + t.Errorf("got phase %v, want PhaseUnhealthy", st.Phase()) + } + if !st.UnhealthySince().Equal(now) { + t.Errorf("got unhealthySince %v, want %v", st.UnhealthySince(), now) + } +} + +func TestStateStoreMarkUnhealthyNonexistent(t *testing.T) { + s := NewStateStore() + // Should not panic. + s.MarkUnhealthy("nonexistent", time.Now()) +} + +func TestStateStoreMarkWaitingReboot(t *testing.T) { + s := NewStateStore() + s.GetOrCreate("node-01") + + now := time.Date(2026, 4, 13, 12, 0, 0, 0, time.UTC) + s.MarkWaitingReboot("node-01", now) + + st, _ := s.Get("node-01") + if st.Phase() != PhaseWaitingReboot { + t.Errorf("got phase %v, want PhaseWaitingReboot", st.Phase()) + } + if !st.LastRebootTime().Equal(now) { + t.Errorf("got lastRebootTime %v, want %v", st.LastRebootTime(), now) + } + if st.RebootCount() != 1 { + t.Errorf("got rebootCount %d, want 1", st.RebootCount()) + } + + // Retry increments count. 
+ later := now.Add(time.Hour) + s.MarkWaitingReboot("node-01", later) + + st, _ = s.Get("node-01") + if st.RebootCount() != 2 { + t.Errorf("got rebootCount %d after retry, want 2", st.RebootCount()) + } + if !st.LastRebootTime().Equal(later) { + t.Errorf("got lastRebootTime %v after retry, want %v", st.LastRebootTime(), later) + } +} + +func TestStateStoreMarkWaitingRebootNonexistent(t *testing.T) { + s := NewStateStore() + // Should not panic. + s.MarkWaitingReboot("nonexistent", time.Now()) +} + +func TestStateStoreUpdateCheckerInfo(t *testing.T) { + s := NewStateStore() + s.GetOrCreate("node-01") + + checkers := []string{"NodeReady", "GPU"} + s.UpdateCheckerInfo("node-01", checkers, true) + + st, _ := s.Get("node-01") + got := st.FailedCheckers() + if len(got) != 2 || got[0] != "NodeReady" || got[1] != "GPU" { + t.Errorf("got failedCheckers %v, want %v", got, checkers) + } + if !st.IsGPUNode() { + t.Error("expected isGPUNode to be true") + } +} + +func TestStateStoreUpdateCheckerInfoNonexistent(t *testing.T) { + s := NewStateStore() + // Should not panic. 
+ s.UpdateCheckerInfo("nonexistent", []string{"NodeReady"}, false) +} + +func TestFailedCheckersReturnsCopy(t *testing.T) { + s := NewStateStore() + s.GetOrCreate("node-01") + s.UpdateCheckerInfo("node-01", []string{"NodeReady"}, false) + + st, _ := s.Get("node-01") + got := st.FailedCheckers() + got[0] = "mutated" + + original := st.FailedCheckers() + if original[0] != "NodeReady" { + t.Error("FailedCheckers should return a copy; mutation should not affect internal state") + } +} + +func TestStateStoreReset(t *testing.T) { + s := NewStateStore() + s.GetOrCreate("node-01") + + now := time.Date(2026, 4, 13, 12, 0, 0, 0, time.UTC) + s.MarkUnhealthy("node-01", now) + s.UpdateCheckerInfo("node-01", []string{"NodeReady"}, true) + s.MarkWaitingReboot("node-01", now) + + s.Reset("node-01") + + st, ok := s.Get("node-01") + if !ok { + t.Fatal("node should still exist after Reset") + } + if st.Phase() != PhaseHealthy { + t.Errorf("got phase %v, want PhaseHealthy", st.Phase()) + } + if st.RebootCount() != 0 { + t.Errorf("got rebootCount %d, want 0", st.RebootCount()) + } + if !st.UnhealthySince().IsZero() { + t.Error("unhealthySince should be zero after Reset") + } + if !st.LastRebootTime().IsZero() { + t.Error("lastRebootTime should be zero after Reset") + } + if len(st.FailedCheckers()) != 0 { + t.Error("failedCheckers should be empty after Reset") + } + if st.IsGPUNode() { + t.Error("isGPUNode should be false after Reset") + } +} + +func TestStateStoreResetNonexistent(t *testing.T) { + s := NewStateStore() + // Should not panic and should not create an entry. 
+ s.Reset("nonexistent") + + _, ok := s.Get("nonexistent") + if ok { + t.Error("Reset on nonexistent node should not create an entry") + } +} + +func TestStateStoreResetReplacesPointer(t *testing.T) { + s := NewStateStore() + old := s.GetOrCreate("node-01") + + s.Reset("node-01") + + current, _ := s.Get("node-01") + if old == current { + t.Error("Reset should replace the map entry with a new pointer") + } +} + +func TestStateStoreCleanup(t *testing.T) { + s := NewStateStore() + s.GetOrCreate("node-01") + s.GetOrCreate("node-02") + s.GetOrCreate("node-03") + + active := map[string]struct{}{ + "node-01": {}, + "node-03": {}, + } + s.Cleanup(active) + + if _, ok := s.Get("node-01"); !ok { + t.Error("node-01 should still exist") + } + if _, ok := s.Get("node-02"); ok { + t.Error("node-02 should be removed") + } + if _, ok := s.Get("node-03"); !ok { + t.Error("node-03 should still exist") + } +} diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index 5f28054..18d1373 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -4,21 +4,22 @@ import ( "context" "fmt" "log/slog" - "strconv" - "sync" "time" - "github.com/civo/civogo" + "github.com/civo/node-agent/pkg/health" + "github.com/civo/node-agent/pkg/metrics" + "github.com/civo/node-agent/pkg/operation" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" + listerscorev1 "k8s.io/client-go/listers/core/v1" "k8s.io/client-go/rest" + "k8s.io/client-go/tools/cache" "k8s.io/client-go/tools/clientcmd" ) -// Version is the current version of the this watcher -var Version string = "0.0.1" - const ( nodePoolLabelKey = "kubernetes.civo.com/civo-node-pool" gpuResourceName = "nvidia.com/gpu" @@ -30,28 +31,29 @@ type Watcher interface { type watcher struct { client kubernetes.Interface - civoClient civogo.Clienter clientCfgPath string clusterID string - region string - apiKey string - apiURL 
string nodeDesiredGPUCount int rebootTimeWindowMinutes time.Duration - // NOTE: This is only effective when running with a single node-agent. If we want to run multiple instances, additional logic modifications will be required. - lastRebootCmdTimes sync.Map - nodeSelector *metav1.LabelSelector + nodeLister listerscorev1.NodeLister + + monitorOnly bool + unhealthyThreshold time.Duration + checkers []health.HealthChecker + executor operation.Executor + states *StateStore + nowFunc func() time.Time } -func NewWatcher(ctx context.Context, apiURL, apiKey, region, clusterID, nodePoolID string, opts ...Option) (Watcher, error) { +func NewWatcher(ctx context.Context, clusterID, nodePoolID string, opts ...Option) (Watcher, error) { w := &watcher{ - clusterID: clusterID, - apiKey: apiKey, - apiURL: apiURL, - region: region, + clusterID: clusterID, + monitorOnly: true, + states: NewStateStore(), + nowFunc: time.Now, } for _, opt := range append(defaultOptions, opts...) { opt(w) @@ -63,9 +65,6 @@ func NewWatcher(ctx context.Context, apiURL, apiKey, region, clusterID, nodePool if nodePoolID == "" { return nil, fmt.Errorf("CIVO_NODE_POOL_ID not set") } - if w.civoClient == nil && apiKey == "" { - return nil, fmt.Errorf("CIVO_API_KEY not set") - } w.nodeSelector = &metav1.LabelSelector{ MatchLabels: map[string]string{ @@ -76,15 +75,12 @@ func NewWatcher(ctx context.Context, apiURL, apiKey, region, clusterID, nodePool if err := w.setupKubernetesClient(); err != nil { return nil, err } - if err := w.setupCivoClient(); err != nil { - return nil, err - } return w, nil } // setupKubernetesClient creates Kubernetes client based on the kubeconfig path. // If kubeconfig path is not empty, the client will be created using that path. -// Otherwise, if the kubeconfig path is empty, the client will be created using the in-clustetr config. +// Otherwise, if the kubeconfig path is empty, the client will be created using the in-cluster config. 
func (w *watcher) setupKubernetesClient() (err error) { if w.clientCfgPath != "" && w.client == nil { cfg, err := clientcmd.BuildConfigFromFlags("", w.clientCfgPath) @@ -111,28 +107,38 @@ func (w *watcher) setupKubernetesClient() (err error) { return nil } -func (w *watcher) setupCivoClient() error { - if w.civoClient != nil { +func (w *watcher) setupInformer(ctx context.Context) error { + if w.nodeLister != nil { return nil } - client, err := civogo.NewClientWithURL(w.apiKey, w.apiURL, w.region) - if err != nil { - return fmt.Errorf("failed to initialise civo client: %w", err) - } + labelSelector := metav1.FormatLabelSelector(w.nodeSelector) + factory := informers.NewSharedInformerFactoryWithOptions( + w.client, + 0, + informers.WithTweakListOptions(func(opts *metav1.ListOptions) { + opts.LabelSelector = labelSelector + }), + ) - userAgent := &civogo.Component{ - ID: w.clusterID, - Name: "node-agent", - Version: Version, + nodeInformer := factory.Core().V1().Nodes() + w.nodeLister = nodeInformer.Lister() + + factory.Start(ctx.Done()) + + if !cache.WaitForCacheSync(ctx.Done(), nodeInformer.Informer().HasSynced) { + return fmt.Errorf("failed to sync node informer cache") } - client.SetUserAgent(userAgent) - w.civoClient = client + slog.Info("Node informer cache synced") return nil } func (w *watcher) Run(ctx context.Context) error { + if err := w.setupInformer(ctx); err != nil { + return err + } + ticker := time.NewTicker(10 * time.Second) defer ticker.Stop() @@ -150,139 +156,122 @@ func (w *watcher) Run(ctx context.Context) error { } func (w *watcher) run(ctx context.Context) error { - nodes, err := w.client.CoreV1().Nodes().List(ctx, metav1.ListOptions{ - LabelSelector: metav1.FormatLabelSelector(w.nodeSelector), - }) + nodes, err := w.nodeLister.List(labels.Everything()) if err != nil { return err } - thresholdTime := time.Now().Add(-w.rebootTimeWindowMinutes * time.Minute) - - for _, node := range nodes.Items { - if !isNodeDesiredGPU(&node, 
w.nodeDesiredGPUCount) || !isNodeReady(&node) { - - // LTT: LastTransitionTime of node. - // LRCT: LastRebootCmdTimes - // 60: Threshold time (example) - // - LTT > 60 , LRCT < 60 dont reboot - // - LTT < 60 , LRCT < 60 dont reboot - // - LTT < 60 , LRCT > 60 dont reboot - // - LTT > 60, LRCT >. 60 reboot - slog.Info("Node is not ready, attempting to reboot", "node", node.GetName()) - if isReadyOrNotReadyStatusChangedAfter(&node, thresholdTime) { - slog.Info("Skipping reboot because Ready/NotReady status was updated recently", "node", node.GetName()) - continue - } - if w.isLastRebootCommandTimeAfter(node.GetName(), thresholdTime) { - slog.Info("Skipping reboot because Reboot command was executed recently", "node", node.GetName()) - continue - } - if err := w.rebootNode(node.GetName()); err != nil { - slog.Error("Failed to reboot Node", "node", node.GetName(), "error", err) - return fmt.Errorf("failed to reboot node: %w", err) + now := w.nowFunc() + activeNodes := make(map[string]struct{}, len(nodes)) + + for _, node := range nodes { + nodeName := node.GetName() + activeNodes[nodeName] = struct{}{} + + // Run all health checkers and collect failures. + var failedCheckers []string + for _, checker := range w.checkers { + healthy := checker.Check(node) + result := "pass" + if !healthy { + result = "fail" + failedCheckers = append(failedCheckers, checker.Name()) } + metrics.HealthCheckTotal.WithLabelValues(nodeName, checker.Name(), result).Inc() } - } - return nil -} -func isReadyOrNotReadyStatusChangedAfter(node *corev1.Node, thresholdTime time.Time) bool { - var lastChangedTime time.Time - for _, cond := range node.Status.Conditions { - if cond.Type == corev1.NodeReady { - if cond.LastTransitionTime.After(lastChangedTime) { - lastChangedTime = cond.LastTransitionTime.Time + state := w.states.GetOrCreate(nodeName) + + // All checkers pass → node is healthy. 
+ if len(failedCheckers) == 0 { + if state.Phase() != PhaseHealthy { + prevPhase := state.Phase() + slog.Info("Node recovered", + "node", nodeName, + "previousPhase", prevPhase.String()) + metrics.NodeUnhealthyDurationSeconds.WithLabelValues(nodeName).Set(0) + metrics.RecoveryPhase.WithLabelValues(nodeName, prevPhase.String()).Set(0) + metrics.RecoveryPhase.WithLabelValues(nodeName, PhaseHealthy.String()).Set(1) + w.states.Reset(nodeName) } + continue } - } - - slog.Info("Checking if Ready/NotReady status has changed recently", - "node", node.GetName(), - "lastTransitionTime", lastChangedTime.String(), - "thresholdTime", thresholdTime.String()) - if lastChangedTime.IsZero() { - slog.Error("Node is in an invalid state, NodeReady condition not found", "node", node.GetName()) - return false - } - return lastChangedTime.After(thresholdTime) -} - -// isLastRebootCommandTimeAfter checks if the last reboot command time for the specified node -// is after the given threshold time. In case of delays in reboot, the -// LastTransitionTime of node might not be updated, so it compares the latest reboot -// command time to prevent sending reboot commands multiple times. -// NOTE: This is only effective when running with a single node-agent. If we want to run multiple instances, additional logic modifications will be required. -func (w *watcher) isLastRebootCommandTimeAfter(nodeName string, thresholdTime time.Time) bool { - v, ok := w.lastRebootCmdTimes.Load(nodeName) - if !ok { - slog.Info("LastRebootCommandTime not found", "node", nodeName) - return false - } - lastRebootCmdTime, ok := v.(time.Time) - if !ok { - slog.Info("LastRebootCommandTime is invalid, so it will be removed from the records", "node", nodeName, "value", v) - w.lastRebootCmdTimes.Delete(nodeName) - return false + // At least one checker failed. 
+ isGPU := hasGPU(node) + w.states.UpdateCheckerInfo(nodeName, failedCheckers, isGPU) + + switch state.Phase() { + case PhaseHealthy: + w.states.MarkUnhealthy(nodeName, now) + slog.Info("Node unhealthy detected", + "node", nodeName, + "failedCheckers", failedCheckers) + metrics.NodeUnhealthyDurationSeconds.WithLabelValues(nodeName).Set(0) + metrics.RecoveryPhase.WithLabelValues(nodeName, PhaseHealthy.String()).Set(0) + metrics.RecoveryPhase.WithLabelValues(nodeName, PhaseUnhealthy.String()).Set(1) + + case PhaseUnhealthy: + metrics.NodeUnhealthyDurationSeconds.WithLabelValues(nodeName).Set( + now.Sub(state.UnhealthySince()).Seconds()) + if now.Sub(state.UnhealthySince()) < w.unhealthyThreshold { + continue + } + if !w.monitorOnly { + if err := w.executor.Reboot(ctx, nodeName); err != nil { + slog.Error("Failed to reboot node", "node", nodeName, "error", err) + continue + } + } + mode := modeLabel(w.monitorOnly) + slog.Info("Reboot initiated", + "node", nodeName, + "mode", mode, + "failedCheckers", failedCheckers) + metrics.RecoveryActionsTotal.WithLabelValues(nodeName, "reboot", mode).Inc() + metrics.RecoveryPhase.WithLabelValues(nodeName, PhaseUnhealthy.String()).Set(0) + metrics.RecoveryPhase.WithLabelValues(nodeName, PhaseWaitingReboot.String()).Set(1) + w.states.MarkWaitingReboot(nodeName, now) + + case PhaseWaitingReboot: + metrics.NodeUnhealthyDurationSeconds.WithLabelValues(nodeName).Set( + now.Sub(state.UnhealthySince()).Seconds()) + if now.Sub(state.LastRebootTime()) < w.rebootTimeWindowMinutes*time.Minute { + continue + } + if !w.monitorOnly { + if err := w.executor.Reboot(ctx, nodeName); err != nil { + slog.Error("Failed to reboot node (retry)", "node", nodeName, "error", err) + continue + } + } + mode := modeLabel(w.monitorOnly) + slog.Info("Reboot retry", + "node", nodeName, + "mode", mode, + "rebootCount", state.RebootCount()+1, + "failedCheckers", failedCheckers) + metrics.RecoveryActionsTotal.WithLabelValues(nodeName, "reboot", mode).Inc() + 
w.states.MarkWaitingReboot(nodeName, now) + } } - slog.Info("Checking if LastRebootCommandTime has changed recently", - "node", nodeName, - "lastRebootCommandTime", lastRebootCmdTime.String(), - "thresholdTime", thresholdTime.String()) - - return lastRebootCmdTime.After(thresholdTime) + w.states.Cleanup(activeNodes) + return nil } -func isNodeReady(node *corev1.Node) bool { - for _, cond := range node.Status.Conditions { - if cond.Type == corev1.NodeReady { - slog.Info("Current Node status", "node", node.GetName(), "type", corev1.NodeReady, "status", cond.Status) - return cond.Status == corev1.ConditionTrue - } +func modeLabel(monitorOnly bool) string { + if monitorOnly { + return "monitor" } - slog.Info("NodeReady condition not found", "node", node.GetName()) - return false + return "active" } -func isNodeDesiredGPU(node *corev1.Node, desired int) bool { - if desired == 0 { - slog.Info("Desired GPU count is set to 0, so the GPU count check is skipped", "node", node.GetName()) - return true - } - +func hasGPU(node *corev1.Node) bool { quantity, exists := node.Status.Allocatable[gpuResourceName] - if !exists || quantity.IsZero() { - slog.Info("Allocatable GPU not found", "node", node.GetName()) + if !exists { return false } - gpuCount, ok := quantity.AsInt64() - if !ok { - slog.Info("Failed to convert allocatable GPU quantity to int64", "node", node.GetName(), "quantity", quantity.String()) - return false - } - - slog.Info("Checking actual GPU count with desired", - "node", node.GetName(), - "actual", gpuCount, - "desired", strconv.Itoa(desired)) - - return gpuCount == int64(desired) -} - -func (w *watcher) rebootNode(name string) error { - instance, err := w.civoClient.FindKubernetesClusterInstance(w.clusterID, name) - if err != nil { - return fmt.Errorf("failed to find instance, clusterID: %s, nodeName: %s: %w", w.clusterID, name, err) - } - - _, err = w.civoClient.HardRebootInstance(instance.ID) - if err != nil { - return fmt.Errorf("failed to reboot instance, 
clusterID: %s, instanceID: %s: %w", w.clusterID, instance.ID, err) - } - slog.Info("Instance is rebooting", "instanceID", instance.ID, "node", name) - w.lastRebootCmdTimes.Store(name, time.Now()) - return nil + return ok && gpuCount > 0 } diff --git a/pkg/watcher/watcher_test.go b/pkg/watcher/watcher_test.go index c69d17c..6d877bd 100644 --- a/pkg/watcher/watcher_test.go +++ b/pkg/watcher/watcher_test.go @@ -1,37 +1,117 @@ package watcher import ( - "errors" + "context" "fmt" "strconv" "testing" "time" - "github.com/civo/civogo" + "github.com/civo/node-agent/pkg/health" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/labels" "k8s.io/client-go/kubernetes/fake" - k8stesting "k8s.io/client-go/testing" ) +// --- Test helpers --- + +// fakeNodeLister implements listerscorev1.NodeLister for testing. +type fakeNodeLister struct { + nodes []*corev1.Node + err error +} + +func (l *fakeNodeLister) List(selector labels.Selector) ([]*corev1.Node, error) { + if l.err != nil { + return nil, l.err + } + return l.nodes, nil +} + +func (l *fakeNodeLister) Get(name string) (*corev1.Node, error) { + for _, n := range l.nodes { + if n.Name == name { + return n, nil + } + } + return nil, fmt.Errorf("node %q not found", name) +} + +// mockExecutor implements operation.Executor for testing. +type mockExecutor struct { + rebootFunc func(ctx context.Context, nodeName string) error + calls []string +} + +func (m *mockExecutor) Reboot(ctx context.Context, nodeName string) error { + m.calls = append(m.calls, nodeName) + if m.rebootFunc != nil { + return m.rebootFunc(ctx, nodeName) + } + return nil +} + +// alwaysFailChecker is a HealthChecker that always reports unhealthy. 
+type alwaysFailChecker struct{ name string } + +func (c *alwaysFailChecker) Name() string { return c.name } +func (c *alwaysFailChecker) Check(*corev1.Node) bool { return false } + +// --- Test variables --- + var ( testClusterID = "test-cluster-123" - testRegion = "lon1" - testApiKey = "test-api-key" - testApiURL = "https://test.civo.com" testNodePoolID = "test-node-pool" testNodeDesiredGPUCount = "8" testRebootTimeWindowMinutes = time.Duration(40) ) +// newTestNode creates a node for testing with common defaults. +func newTestNode(name string, ready corev1.ConditionStatus, gpuCount int) *corev1.Node { + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: map[string]string{ + nodePoolLabelKey: testNodePoolID, + }, + }, + Status: corev1.NodeStatus{ + Conditions: []corev1.NodeCondition{ + {Type: corev1.NodeReady, Status: ready}, + }, + }, + } + if gpuCount > 0 { + node.Status.Allocatable = corev1.ResourceList{ + gpuResourceName: resource.MustParse(strconv.Itoa(gpuCount)), + } + } + return node +} + +// newTestWatcher creates a watcher with sensible test defaults and the given options. +func newTestWatcher(t *testing.T, opts ...Option) *watcher { + t.Helper() + baseOpts := []Option{ + WithKubernetesClient(fake.NewSimpleClientset()), + WithExecutor(&mockExecutor{}), + } + w, err := NewWatcher(t.Context(), + testClusterID, testNodePoolID, + append(baseOpts, opts...)...) 
+ if err != nil { + t.Fatal(err) + } + return w.(*watcher) +} + +// --- TestNew --- + func TestNew(t *testing.T) { type args struct { clusterID string - region string - apiKey string - apiURL string nodePoolID string opts []Option } @@ -47,13 +127,10 @@ func TestNew(t *testing.T) { name: "Returns no error when given valid input", args: args{ clusterID: testClusterID, - region: testRegion, - apiKey: testApiKey, - apiURL: testApiURL, nodePoolID: testNodePoolID, opts: []Option{ WithKubernetesClient(fake.NewSimpleClientset()), - WithCivoClient(&FakeClient{}), + WithExecutor(&mockExecutor{}), WithDesiredGPUCount(testNodeDesiredGPUCount), }, }, @@ -61,22 +138,12 @@ func TestNew(t *testing.T) { if w.clusterID != testClusterID { return fmt.Errorf("clusterID mismatch: got %s, want %s", w.clusterID, testClusterID) } - if w.region != testRegion { - return fmt.Errorf("region mismatch: got %s, want %s", w.region, testRegion) - } - if w.apiKey != testApiKey { - return fmt.Errorf("apiKey mismatch: got %s, want %s", w.apiKey, testApiKey) - } - if w.apiURL != testApiURL { - return fmt.Errorf("apiURL mismatch: got %s, want %s", w.apiURL, testApiURL) - } - cnt, err := strconv.Atoi(testNodeDesiredGPUCount) if err != nil { return err } if w.nodeDesiredGPUCount != cnt { - return fmt.Errorf("nodeDesiredGPUCount mismatch: got %d, want %s", w.nodeDesiredGPUCount, testNodeDesiredGPUCount) + return fmt.Errorf("nodeDesiredGPUCount mismatch: got %d, want %d", w.nodeDesiredGPUCount, cnt) } if w.nodeSelector == nil || w.nodeSelector.MatchLabels[nodePoolLabelKey] != testNodePoolID { return fmt.Errorf("nodeSelector mismatch: got %v, want %s", w.nodeSelector, testNodePoolID) @@ -84,11 +151,17 @@ func TestNew(t *testing.T) { if w.client == nil { return fmt.Errorf("client is nil") } - if w.civoClient == nil { - return fmt.Errorf("civoClient is nil") - } if w.rebootTimeWindowMinutes != testRebootTimeWindowMinutes { - return fmt.Errorf("w.rebootTimeWindowMinutes mismatch: got %v, want %s", 
w.nodeSelector, testNodePoolID) + return fmt.Errorf("rebootTimeWindowMinutes mismatch: got %v, want %v", w.rebootTimeWindowMinutes, testRebootTimeWindowMinutes) + } + if !w.monitorOnly { + return fmt.Errorf("monitorOnly should default to true") + } + if w.states == nil { + return fmt.Errorf("states is nil") + } + if w.nowFunc == nil { + return fmt.Errorf("nowFunc is nil") } return nil }, @@ -97,60 +170,44 @@ func TestNew(t *testing.T) { name: "Returns no error when input is invalid, but default value is set", args: args{ clusterID: testClusterID, - region: testRegion, - apiKey: testApiKey, - apiURL: testApiURL, nodePoolID: testNodePoolID, opts: []Option{ WithKubernetesClient(fake.NewSimpleClientset()), - WithCivoClient(&FakeClient{}), - WithDesiredGPUCount("invalid"), // It is invalid, but the default count (0) will be used. - WithDesiredGPUCount("-1"), // It is invalid, but the default count (0) will be used. - WithRebootTimeWindowMinutes("invalid time"), // It is invalid, but the default time (40) will be used. - WithRebootTimeWindowMinutes("0"), // It is invalid, but the default time (40) will be used. 
+ WithExecutor(&mockExecutor{}), + WithDesiredGPUCount("invalid"), + WithDesiredGPUCount("-1"), + WithRebootTimeWindowMinutes("invalid time"), + WithRebootTimeWindowMinutes("0"), }, }, checkFunc: func(w *watcher) error { if w.nodeDesiredGPUCount != 0 { - return fmt.Errorf("w.nodeDesiredGPUCount mismatch: got %d, want %d", w.nodeDesiredGPUCount, 0) + return fmt.Errorf("nodeDesiredGPUCount mismatch: got %d, want %d", w.nodeDesiredGPUCount, 0) } if w.rebootTimeWindowMinutes != testRebootTimeWindowMinutes { - return fmt.Errorf("w.rebootTimeWindowMinutes mismatch: got %v, want %s", w.nodeSelector, testNodePoolID) + return fmt.Errorf("rebootTimeWindowMinutes mismatch: got %v, want %v", w.rebootTimeWindowMinutes, testRebootTimeWindowMinutes) } return nil }, }, { - name: "Returns no error when nodeDesiredGPUCount is 0", + name: "Returns an error when clusterID is missing", args: args{ - clusterID: testClusterID, - region: testRegion, - apiKey: testApiKey, - apiURL: testApiURL, nodePoolID: testNodePoolID, opts: []Option{ WithKubernetesClient(fake.NewSimpleClientset()), - WithCivoClient(&FakeClient{}), - WithDesiredGPUCount("0"), + WithExecutor(&mockExecutor{}), }, }, - checkFunc: func(w *watcher) error { - if w.nodeDesiredGPUCount != 0 { - return fmt.Errorf("w.nodeDesiredGPUCount mismatch: got %d, want %d", w.nodeDesiredGPUCount, 0) - } - return nil - }, + wantErr: true, }, { - name: "Returns an error when clusterID is missing", + name: "Returns an error when nodePoolID is missing", args: args{ - region: testRegion, - apiKey: testApiKey, - apiURL: testApiURL, - nodePoolID: testNodePoolID, + clusterID: testClusterID, opts: []Option{ WithKubernetesClient(fake.NewSimpleClientset()), - WithCivoClient(&FakeClient{}), + WithExecutor(&mockExecutor{}), }, }, wantErr: true, @@ -160,9 +217,6 @@ func TestNew(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { w, err := NewWatcher(t.Context(), - test.args.apiURL, - test.args.apiKey, - test.args.region, 
test.args.clusterID, test.args.nodePoolID, test.args.opts...) @@ -186,804 +240,338 @@ func TestNew(t *testing.T) { } } -func TestRun(t *testing.T) { - type args struct { - opts []Option - nodePoolID string - } - type test struct { - name string - args args - beforeFunc func(*watcher) - wantErr bool - } +// --- State machine transition tests --- - tests := []test{ - { - name: "Returns nil when node GPU count is 8 and no reboot needed", - args: args{ - opts: []Option{ - WithKubernetesClient(fake.NewSimpleClientset()), - WithCivoClient(&FakeClient{}), - WithDesiredGPUCount(testNodeDesiredGPUCount), - }, - nodePoolID: testNodePoolID, - }, - beforeFunc: func(w *watcher) { - t.Helper() - client := w.client.(*fake.Clientset) - - nodes := &corev1.NodeList{ - Items: []corev1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node-01", - Labels: map[string]string{ - nodePoolLabelKey: testNodePoolID, - }, - }, - Status: corev1.NodeStatus{ - Conditions: []corev1.NodeCondition{ - { - Type: corev1.NodeReady, - Status: corev1.ConditionTrue, - }, - { - Type: corev1.NodeReady, - Status: corev1.ConditionFalse, - }, - }, - Allocatable: corev1.ResourceList{ - gpuResourceName: resource.MustParse("8"), - }, - }, - }, - }, - } - client.Fake.PrependReactor("list", "nodes", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { - return true, nodes, nil - }) - }, - }, - { - name: "Returns nil and triggers reboot when GPU count drops below desired (7 GPUs available)", - args: args{ - opts: []Option{ - WithKubernetesClient(fake.NewSimpleClientset()), - WithCivoClient(&FakeClient{}), - WithDesiredGPUCount(testNodeDesiredGPUCount), - }, - nodePoolID: testNodePoolID, - }, - beforeFunc: func(w *watcher) { - t.Helper() - client := w.client.(*fake.Clientset) - - nodes := &corev1.NodeList{ - Items: []corev1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node-01", - Labels: map[string]string{ - nodePoolLabelKey: testNodePoolID, - }, - }, - Status: corev1.NodeStatus{ - 
Conditions: []corev1.NodeCondition{ - { - Type: corev1.NodeReady, - Status: corev1.ConditionTrue, - }, - { - Type: corev1.NodeReady, - Status: corev1.ConditionFalse, - }, - }, - Allocatable: corev1.ResourceList{ - gpuResourceName: resource.MustParse("7"), - }, - }, - }, - }, - } - client.Fake.PrependReactor("list", "nodes", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { - return true, nodes, nil - }) +func TestRun_HealthyNodeStaysHealthy(t *testing.T) { + node := newTestNode("node-01", corev1.ConditionTrue, 8) + w := newTestWatcher(t, + WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), + WithCheckers(health.NewDefaultCheckers(8)), + ) - civoClient := w.civoClient.(*FakeClient) - instance := &civogo.Instance{ - ID: "instance-01", - } - civoClient.FindKubernetesClusterInstanceFunc = func(clusterID, search string) (*civogo.Instance, error) { - return instance, nil - } - civoClient.HardRebootInstanceFunc = func(id string) (*civogo.SimpleResponse, error) { - return new(civogo.SimpleResponse), nil - } - }, - }, - { - name: "Returns nil and triggers reboot when GPU count matches desired but node is not ready", - args: args{ - opts: []Option{ - WithKubernetesClient(fake.NewSimpleClientset()), - WithCivoClient(&FakeClient{}), - WithDesiredGPUCount(testNodeDesiredGPUCount), - }, - nodePoolID: testNodePoolID, - }, - beforeFunc: func(w *watcher) { - t.Helper() - client := w.client.(*fake.Clientset) - - nodes := &corev1.NodeList{ - Items: []corev1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node-01", - Labels: map[string]string{ - nodePoolLabelKey: testNodePoolID, - }, - }, - Status: corev1.NodeStatus{ - Conditions: []corev1.NodeCondition{ - { - Type: corev1.NodeReady, - Status: corev1.ConditionFalse, - }, - }, - Allocatable: corev1.ResourceList{ - gpuResourceName: resource.MustParse("8"), - }, - }, - }, - }, - } - client.Fake.PrependReactor("list", "nodes", func(action k8stesting.Action) (handled bool, ret runtime.Object, err 
error) { - return true, nodes, nil - }) + if err := w.run(t.Context()); err != nil { + t.Fatal(err) + } - civoClient := w.civoClient.(*FakeClient) - instance := &civogo.Instance{ - ID: "instance-01", - } - civoClient.FindKubernetesClusterInstanceFunc = func(clusterID, search string) (*civogo.Instance, error) { - return instance, nil - } - civoClient.HardRebootInstanceFunc = func(id string) (*civogo.SimpleResponse, error) { - return new(civogo.SimpleResponse), nil - } - }, - }, - { - name: "Returns nil and skips reboot when GPU count matches desired but node is not ready, and LastTransitionTime is more recent than thresholdTime", - args: args{ - opts: []Option{ - WithKubernetesClient(fake.NewSimpleClientset()), - WithCivoClient(&FakeClient{}), - WithDesiredGPUCount(testNodeDesiredGPUCount), - }, - nodePoolID: testNodePoolID, - }, - beforeFunc: func(w *watcher) { - t.Helper() - client := w.client.(*fake.Clientset) - - w.lastRebootCmdTimes.Store("node-01", time.Now()) - - nodes := &corev1.NodeList{ - Items: []corev1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node-01", - Labels: map[string]string{ - nodePoolLabelKey: testNodePoolID, - }, - }, - Status: corev1.NodeStatus{ - Conditions: []corev1.NodeCondition{ - { - Type: corev1.NodeReady, - Status: corev1.ConditionFalse, - }, - }, - Allocatable: corev1.ResourceList{ - gpuResourceName: resource.MustParse("8"), - }, - }, - }, - }, - } - client.Fake.PrependReactor("list", "nodes", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { - return true, nodes, nil - }) - }, - }, - { - name: "Returns nil and skips reboot when GPU count matches desired but node is not ready, and LastRebootCmdTime is more recent than thresholdTime", - args: args{ - opts: []Option{ - WithKubernetesClient(fake.NewSimpleClientset()), - WithCivoClient(&FakeClient{}), - WithDesiredGPUCount(testNodeDesiredGPUCount), - }, - nodePoolID: testNodePoolID, - }, - beforeFunc: func(w *watcher) { - t.Helper() - client := 
w.client.(*fake.Clientset) - - nodes := &corev1.NodeList{ - Items: []corev1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node-01", - Labels: map[string]string{ - nodePoolLabelKey: testNodePoolID, - }, - }, - Status: corev1.NodeStatus{ - Conditions: []corev1.NodeCondition{ - { - Type: corev1.NodeReady, - Status: corev1.ConditionFalse, - LastTransitionTime: metav1.NewTime(time.Now()), - }, - }, - Allocatable: corev1.ResourceList{ - gpuResourceName: resource.MustParse("8"), - }, - }, - }, - }, - } - client.Fake.PrependReactor("list", "nodes", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { - return true, nodes, nil - }) - }, - }, - { - name: "Returns an error when unable to list nodes", - args: args{ - opts: []Option{ - WithKubernetesClient(fake.NewSimpleClientset()), - WithCivoClient(&FakeClient{}), - WithDesiredGPUCount(testNodeDesiredGPUCount), - }, - nodePoolID: testNodePoolID, - }, - beforeFunc: func(w *watcher) { - t.Helper() - client := w.client.(*fake.Clientset) + state, ok := w.states.Get("node-01") + if !ok { + t.Fatal("state should exist for node-01") + } + if state.Phase() != PhaseHealthy { + t.Errorf("got phase %v, want PhaseHealthy", state.Phase()) + } +} - client.Fake.PrependReactor("list", "nodes", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { - return true, &corev1.NodeList{}, errors.New("invalid error") - }) - }, - wantErr: true, - }, +func TestRun_UnhealthyDetection(t *testing.T) { + now := time.Date(2026, 4, 13, 12, 0, 0, 0, time.UTC) + node := newTestNode("node-01", corev1.ConditionFalse, 8) + w := newTestWatcher(t, + WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), + WithCheckers(health.NewDefaultCheckers(8)), + WithNowFunc(func() time.Time { return now }), + ) + + if err := w.run(t.Context()); err != nil { + t.Fatal(err) + } - { - name: "Returns an error when finding the Kubernetes cluster instance fails during reboot", - args: args{ - opts: []Option{ - 
WithKubernetesClient(fake.NewSimpleClientset()), - WithCivoClient(&FakeClient{}), - WithDesiredGPUCount(testNodeDesiredGPUCount), - }, - nodePoolID: testNodePoolID, - }, - beforeFunc: func(w *watcher) { - t.Helper() - client := w.client.(*fake.Clientset) - - nodes := &corev1.NodeList{ - Items: []corev1.Node{ - { - ObjectMeta: metav1.ObjectMeta{ - Name: "node-01", - Labels: map[string]string{ - nodePoolLabelKey: testNodePoolID, - }, - }, - Status: corev1.NodeStatus{ - Conditions: []corev1.NodeCondition{ - { - Type: corev1.NodeReady, - Status: corev1.ConditionFalse, - }, - }, - Allocatable: corev1.ResourceList{ - gpuResourceName: resource.MustParse("8"), - }, - }, - }, - }, - } - client.Fake.PrependReactor("list", "nodes", func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { - return true, nodes, nil - }) + state, _ := w.states.Get("node-01") + if state.Phase() != PhaseUnhealthy { + t.Errorf("got phase %v, want PhaseUnhealthy", state.Phase()) + } + if !state.UnhealthySince().Equal(now) { + t.Errorf("got unhealthySince %v, want %v", state.UnhealthySince(), now) + } +} - civoClient := w.civoClient.(*FakeClient) - civoClient.FindKubernetesClusterInstanceFunc = func(clusterID, search string) (*civogo.Instance, error) { - return nil, errors.New("invalid error") - } - }, - wantErr: true, - }, +func TestRun_RebootTriggerActiveMode(t *testing.T) { + now := time.Date(2026, 4, 13, 12, 0, 0, 0, time.UTC) + node := newTestNode("node-01", corev1.ConditionFalse, 8) + exec := &mockExecutor{} + w := newTestWatcher(t, + WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), + WithCheckers(health.NewDefaultCheckers(8)), + WithExecutor(exec), + WithMonitorOnly(false), + WithUnhealthyThresholdMinutes("10"), + WithNowFunc(func() time.Time { return now }), + ) + + // First run: detect unhealthy. 
+ if err := w.run(t.Context()); err != nil { + t.Fatal(err) } - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - w, err := NewWatcher(t.Context(), - testApiURL, testApiKey, testRegion, testClusterID, test.args.nodePoolID, test.args.opts...) - if err != nil { - t.Fatal(err) - } + // Advance past threshold. + now = now.Add(11 * time.Minute) - obj := w.(*watcher) - if test.beforeFunc != nil { - test.beforeFunc(obj) - } + // Second run: should trigger reboot. + if err := w.run(t.Context()); err != nil { + t.Fatal(err) + } - err = obj.run(t.Context()) - if (err != nil) != test.wantErr { - t.Errorf("error = %v, wantErr %v", err, test.wantErr) - } - }) + state, _ := w.states.Get("node-01") + if state.Phase() != PhaseWaitingReboot { + t.Errorf("got phase %v, want PhaseWaitingReboot", state.Phase()) + } + if state.RebootCount() != 1 { + t.Errorf("got rebootCount %d, want 1", state.RebootCount()) + } + if len(exec.calls) != 1 || exec.calls[0] != "node-01" { + t.Errorf("expected 1 reboot call for node-01, got %v", exec.calls) } } -func TestIsReadyOrNotReadyStatusChangedAfter(t *testing.T) { - type test struct { - name string - node *corev1.Node - thresholdTime time.Time - want bool +func TestRun_RebootSkippedInReportMode(t *testing.T) { + now := time.Date(2026, 4, 13, 12, 0, 0, 0, time.UTC) + node := newTestNode("node-01", corev1.ConditionFalse, 8) + exec := &mockExecutor{} + w := newTestWatcher(t, + WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), + WithCheckers(health.NewDefaultCheckers(8)), + WithExecutor(exec), + WithMonitorOnly(true), + WithUnhealthyThresholdMinutes("10"), + WithNowFunc(func() time.Time { return now }), + ) + + // First run: detect unhealthy. 
+ if err := w.run(t.Context()); err != nil { + t.Fatal(err) } - tests := []test{ - { - name: "Returns true when NodeReady condition is true (Ready) and last transition time is after threshold", - node: &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node-01", - }, - Status: corev1.NodeStatus{ - Conditions: []corev1.NodeCondition{ - { - Type: corev1.NodeReady, - Status: corev1.ConditionTrue, - LastTransitionTime: metav1.NewTime(time.Now()), - }, - }, - }, - }, - thresholdTime: time.Now().Add(-time.Hour), - want: true, - }, - { - name: "Returns true when NodeReady condition is false (NotReady) and last transition time is after threshold", - node: &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node-01", - }, - Status: corev1.NodeStatus{ - Conditions: []corev1.NodeCondition{ - { - Type: corev1.NodeReady, - Status: corev1.ConditionFalse, - LastTransitionTime: metav1.NewTime(time.Now()), - }, - }, - }, - }, - thresholdTime: time.Now().Add(-time.Hour), - want: true, - }, - { - name: "Returns false when the latest NodeReady condition is older than thresholdTime", - node: &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node-01", - }, - Status: corev1.NodeStatus{ - Conditions: []corev1.NodeCondition{ - { - Type: corev1.NodeReady, - Status: corev1.ConditionFalse, - LastTransitionTime: metav1.NewTime(time.Now().Add(-time.Hour)), - }, - }, - }, - }, - thresholdTime: time.Now(), - want: false, - }, - { - name: "Returns false when no conditions are present on the node", - node: &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node-01", - }, - Status: corev1.NodeStatus{ - Conditions: []corev1.NodeCondition{}, - }, - }, - thresholdTime: time.Now().Add(-time.Hour), - want: false, - }, - { - name: "Returns false when there is only NodeDiskPressure condition", - node: &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node-01", - }, - Status: corev1.NodeStatus{ - Conditions: []corev1.NodeCondition{ - { - Type: corev1.NodeDiskPressure, - Status: 
corev1.ConditionFalse, - LastHeartbeatTime: metav1.NewTime(time.Now()), - }, - }, - }, - }, - thresholdTime: time.Now().Add(-time.Hour), - want: false, - }, + // Advance past threshold. + now = now.Add(11 * time.Minute) + + // Second run: should transition to WaitingReboot but NOT call executor. + if err := w.run(t.Context()); err != nil { + t.Fatal(err) } - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - got := isReadyOrNotReadyStatusChangedAfter(test.node, test.thresholdTime) - if got != test.want { - t.Errorf("got = %v, want %v", got, test.want) - } - }) + state, _ := w.states.Get("node-01") + if state.Phase() != PhaseWaitingReboot { + t.Errorf("got phase %v, want PhaseWaitingReboot", state.Phase()) + } + if len(exec.calls) != 0 { + t.Errorf("expected no reboot calls in report mode, got %v", exec.calls) } } -func TestIsLastRebootCommandTimeAfter(t *testing.T) { - type test struct { - name string - nodeName string - opts []Option - thresholdTime time.Time - beforeFunc func(*watcher) - want bool +func TestRun_RecoveryAfterReboot(t *testing.T) { + now := time.Date(2026, 4, 13, 12, 0, 0, 0, time.UTC) + node := newTestNode("node-01", corev1.ConditionFalse, 8) + w := newTestWatcher(t, + WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), + WithCheckers(health.NewDefaultCheckers(8)), + WithMonitorOnly(false), + WithUnhealthyThresholdMinutes("10"), + WithNowFunc(func() time.Time { return now }), + ) + + // Run 1: detect unhealthy. 
+ if err := w.run(t.Context()); err != nil { + t.Fatal(err) } - - tests := []test{ - { - name: "Return true when last reboot command time is after threshold", - opts: []Option{ - WithKubernetesClient(fake.NewSimpleClientset()), - WithCivoClient(&FakeClient{}), - }, - nodeName: "node-01", - thresholdTime: time.Now().Add(-time.Hour), - beforeFunc: func(w *watcher) { - w.lastRebootCmdTimes.Store("node-01", time.Now()) - }, - want: true, - }, - { - name: "Return false when last reboot command time is before threshold", - opts: []Option{ - WithKubernetesClient(fake.NewSimpleClientset()), - WithCivoClient(&FakeClient{}), - }, - nodeName: "node-01", - thresholdTime: time.Now().Add(-time.Hour), - beforeFunc: func(w *watcher) { - w.lastRebootCmdTimes.Store("nodde-01", time.Now().Add(-2*time.Hour)) - }, - want: false, - }, - { - name: "Return false when last reboot command time not found", - opts: []Option{ - WithKubernetesClient(fake.NewSimpleClientset()), - WithCivoClient(&FakeClient{}), - }, - nodeName: "node-01", - thresholdTime: time.Now().Add(-time.Hour), - want: false, - }, - { - name: "Return false when type of last reboot command time is invalid", - opts: []Option{ - WithKubernetesClient(fake.NewSimpleClientset()), - WithCivoClient(&FakeClient{}), - }, - nodeName: "node-01", - thresholdTime: time.Now().Add(-time.Hour), - beforeFunc: func(w *watcher) { - w.lastRebootCmdTimes.Store("nodde-01", "invalid-type") - }, - want: false, - }, + // Run 2: trigger reboot. + now = now.Add(11 * time.Minute) + if err := w.run(t.Context()); err != nil { + t.Fatal(err) } - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - w, err := NewWatcher(t.Context(), - testApiURL, testApiKey, testRegion, testClusterID, testNodePoolID, test.opts...) - if err != nil { - t.Fatal(err) - } + // Node recovers. 
+ node.Status.Conditions[0].Status = corev1.ConditionTrue + now = now.Add(5 * time.Minute) - obj := w.(*watcher) - if test.beforeFunc != nil { - test.beforeFunc(obj) - } - got := obj.isLastRebootCommandTimeAfter(test.nodeName, test.thresholdTime) - if got != test.want { - t.Errorf("got = %v, want %v", got, test.want) - } - }) + // Run 3: should detect recovery. + if err := w.run(t.Context()); err != nil { + t.Fatal(err) + } + + state, _ := w.states.Get("node-01") + if state.Phase() != PhaseHealthy { + t.Errorf("got phase %v, want PhaseHealthy", state.Phase()) + } + if state.RebootCount() != 0 { + t.Errorf("got rebootCount %d, want 0 after recovery", state.RebootCount()) } } -func TestIsNodeReady(t *testing.T) { - type test struct { - name string - node *corev1.Node - want bool +func TestRun_RebootRetry(t *testing.T) { + now := time.Date(2026, 4, 13, 12, 0, 0, 0, time.UTC) + node := newTestNode("node-01", corev1.ConditionFalse, 8) + exec := &mockExecutor{} + w := newTestWatcher(t, + WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), + WithCheckers(health.NewDefaultCheckers(8)), + WithExecutor(exec), + WithMonitorOnly(false), + WithUnhealthyThresholdMinutes("10"), + WithRebootTimeWindowMinutes("40"), + WithNowFunc(func() time.Time { return now }), + ) + + // Run 1: detect unhealthy. + if err := w.run(t.Context()); err != nil { + t.Fatal(err) + } + // Run 2: trigger first reboot. 
+ now = now.Add(11 * time.Minute) + if err := w.run(t.Context()); err != nil { + t.Fatal(err) } - tests := []test{ - { - name: "Returns true when Node is ready state", - node: &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node-01", - }, - Status: corev1.NodeStatus{ - Conditions: []corev1.NodeCondition{ - { - Type: corev1.NodeReady, - Status: corev1.ConditionTrue, - }, - { - Type: corev1.NodeReady, - Status: corev1.ConditionFalse, - }, - }, - }, - }, - want: true, - }, - { - name: "Returns false when Node is not ready state", - node: &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node-01", - }, - Status: corev1.NodeStatus{ - Conditions: []corev1.NodeCondition{ - { - Type: corev1.NodeReady, - Status: corev1.ConditionFalse, - }, - }, - }, - }, - }, - { - name: "Returns false when no conditions for the node", - node: &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node-01", - }, - Status: corev1.NodeStatus{ - Conditions: []corev1.NodeCondition{}, - }, - }, - }, + // Still unhealthy, but within reboot window → no retry. + now = now.Add(30 * time.Minute) + if err := w.run(t.Context()); err != nil { + t.Fatal(err) + } + if len(exec.calls) != 1 { + t.Fatalf("expected 1 reboot call before window expires, got %d", len(exec.calls)) } - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - got := isNodeReady(test.node) - if got != test.want { - t.Errorf("got = %v, want %v", got, test.want) - } - }) + // Advance past reboot window → retry. 
+ now = now.Add(11 * time.Minute) + if err := w.run(t.Context()); err != nil { + t.Fatal(err) } -} -func TestIsNodeDesiredGPU(t *testing.T) { - type test struct { - name string - node *corev1.Node - desired int - want bool + state, _ := w.states.Get("node-01") + if state.Phase() != PhaseWaitingReboot { + t.Errorf("got phase %v, want PhaseWaitingReboot", state.Phase()) + } + if state.RebootCount() != 2 { + t.Errorf("got rebootCount %d, want 2", state.RebootCount()) + } + if len(exec.calls) != 2 { + t.Errorf("expected 2 reboot calls, got %d", len(exec.calls)) } +} - tests := []test{ - { - name: "Returns true when GPU count matches desired value", - node: &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node-01", - }, - Status: corev1.NodeStatus{ - Allocatable: corev1.ResourceList{ - gpuResourceName: resource.MustParse("8"), - }, - }, - }, - desired: 8, - want: true, - }, - { - name: "Returns true when desired GPU count is 0, so count check is skipped", - node: &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node-01", - }, - Status: corev1.NodeStatus{ - Allocatable: corev1.ResourceList{}, - }, - }, - desired: 0, - want: true, - }, - { - name: "Returns false when GPU count is 0", - node: &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node-01", - }, - Status: corev1.NodeStatus{ - Allocatable: corev1.ResourceList{ - gpuResourceName: resource.MustParse("0"), - }, - }, - }, - desired: 8, - want: false, - }, - { - name: "Returns false when GPU count is less than desired value", - node: &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "node-01", - }, - Status: corev1.NodeStatus{ - Allocatable: corev1.ResourceList{ - gpuResourceName: resource.MustParse("7"), - }, - }, - }, - desired: 8, - want: false, - }, +func TestRun_GPUMismatchTriggersUnhealthy(t *testing.T) { + now := time.Date(2026, 4, 13, 12, 0, 0, 0, time.UTC) + node := newTestNode("node-01", corev1.ConditionTrue, 7) // 7 GPUs, desired 8 + w := newTestWatcher(t, + 
WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), + WithCheckers(health.NewDefaultCheckers(8)), + WithNowFunc(func() time.Time { return now }), + ) + + if err := w.run(t.Context()); err != nil { + t.Fatal(err) } - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - got := isNodeDesiredGPU(test.node, test.desired) - if got != test.want { - t.Errorf("got = %v, want %v", got, test.want) - } - }) + state, _ := w.states.Get("node-01") + if state.Phase() != PhaseUnhealthy { + t.Errorf("got phase %v, want PhaseUnhealthy", state.Phase()) + } + if !state.IsGPUNode() { + t.Error("expected isGPUNode to be true for node with 7 GPUs") } } -func TestRebootNode(t *testing.T) { - type args struct { - nodeName string - opts []Option +func TestRun_RebootErrorContinuesProcessing(t *testing.T) { + now := time.Date(2026, 4, 13, 12, 0, 0, 0, time.UTC) + node := newTestNode("node-01", corev1.ConditionFalse, 0) + exec := &mockExecutor{ + rebootFunc: func(_ context.Context, _ string) error { + return fmt.Errorf("reboot API error") + }, } - type test struct { - name string - args args - beforeFunc func(*testing.T, *watcher) - wantErr bool + w := newTestWatcher(t, + WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), + WithCheckers(health.NewDefaultCheckers(0)), + WithExecutor(exec), + WithMonitorOnly(false), + WithUnhealthyThresholdMinutes("10"), + WithNowFunc(func() time.Time { return now }), + ) + + // Run 1: detect unhealthy. + if err := w.run(t.Context()); err != nil { + t.Fatal(err) + } + // Run 2: threshold exceeded, reboot fails → should not error out, stays PhaseUnhealthy. 
+ now = now.Add(11 * time.Minute) + if err := w.run(t.Context()); err != nil { + t.Fatal("run should not return error on reboot failure") } - tests := []test{ - { - name: "Returns nil when there is no error finding and rebooting the instance", - args: args{ - nodeName: "node-01", - opts: []Option{ - WithKubernetesClient(fake.NewSimpleClientset()), - WithCivoClient(&FakeClient{}), - WithDesiredGPUCount(testNodeDesiredGPUCount), - }, - }, - beforeFunc: func(t *testing.T, w *watcher) { - t.Helper() - client := w.civoClient.(*FakeClient) + state, _ := w.states.Get("node-01") + if state.Phase() != PhaseUnhealthy { + t.Errorf("got phase %v, want PhaseUnhealthy (reboot failed, no transition)", state.Phase()) + } +} - instance := &civogo.Instance{ - ID: "instance-01", - } +func TestRun_NodeListError(t *testing.T) { + w := newTestWatcher(t, + WithNodeLister(&fakeNodeLister{err: fmt.Errorf("list error")}), + WithCheckers(health.NewDefaultCheckers(0)), + ) - client.FindKubernetesClusterInstanceFunc = func(clusterID, search string) (*civogo.Instance, error) { - return instance, nil - } - client.HardRebootInstanceFunc = func(id string) (*civogo.SimpleResponse, error) { - if instance.ID != id { - t.Errorf("instanceId dose not match. 
want: %s, but got: %s", instance.ID, id) - } - return new(civogo.SimpleResponse), nil - } - }, - }, - { - name: "Returns an error when instance lookup fails", - args: args{ - nodeName: "node-01", - opts: []Option{ - WithKubernetesClient(fake.NewSimpleClientset()), - WithCivoClient(&FakeClient{}), - WithDesiredGPUCount(testNodeDesiredGPUCount), - }, - }, - beforeFunc: func(t *testing.T, w *watcher) { - t.Helper() - client := w.civoClient.(*FakeClient) + if err := w.run(t.Context()); err == nil { + t.Error("expected error from node list failure") + } +} - client.FindKubernetesClusterInstanceFunc = func(clusterID, search string) (*civogo.Instance, error) { - return nil, errors.New("invalid error") - } - }, - wantErr: true, - }, - { - name: "Returns an error when instance reboot fails", - args: args{ - nodeName: "node-01", - opts: []Option{ - WithKubernetesClient(fake.NewSimpleClientset()), - WithCivoClient(&FakeClient{}), - WithDesiredGPUCount(testNodeDesiredGPUCount), - }, - }, - beforeFunc: func(t *testing.T, w *watcher) { - t.Helper() - client := w.civoClient.(*FakeClient) +func TestRun_StaleStateCleanup(t *testing.T) { + now := time.Date(2026, 4, 13, 12, 0, 0, 0, time.UTC) + node := newTestNode("node-01", corev1.ConditionFalse, 0) + lister := &fakeNodeLister{nodes: []*corev1.Node{node}} + w := newTestWatcher(t, + WithNodeLister(lister), + WithCheckers(health.NewDefaultCheckers(0)), + WithNowFunc(func() time.Time { return now }), + ) + + // Run 1: detect node-01 unhealthy. + if err := w.run(t.Context()); err != nil { + t.Fatal(err) + } + if _, ok := w.states.Get("node-01"); !ok { + t.Fatal("state should exist for node-01") + } - instance := &civogo.Instance{ - ID: "instance-01", - } + // Node removed from cluster. 
+ lister.nodes = nil - client.FindKubernetesClusterInstanceFunc = func(clusterID, search string) (*civogo.Instance, error) { - return instance, nil - } - client.HardRebootInstanceFunc = func(id string) (*civogo.SimpleResponse, error) { - if instance.ID != id { - t.Errorf("instanceId dose not match. want: %s, but got: %s", instance.ID, id) - } - return nil, errors.New("invalid error") - } - }, - wantErr: true, - }, + if err := w.run(t.Context()); err != nil { + t.Fatal(err) } - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - w, err := NewWatcher(t.Context(), - testApiURL, testApiKey, testRegion, testClusterID, testNodePoolID, test.args.opts...) - if err != nil { - t.Fatal(err) - } + if _, ok := w.states.Get("node-01"); ok { + t.Error("state for node-01 should be cleaned up after removal") + } +} - obj := w.(*watcher) - if test.beforeFunc != nil { - test.beforeFunc(t, obj) - } +func TestRun_UnhealthyWithinThresholdNoReboot(t *testing.T) { + now := time.Date(2026, 4, 13, 12, 0, 0, 0, time.UTC) + node := newTestNode("node-01", corev1.ConditionFalse, 0) + exec := &mockExecutor{} + w := newTestWatcher(t, + WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), + WithCheckers(health.NewDefaultCheckers(0)), + WithExecutor(exec), + WithMonitorOnly(false), + WithUnhealthyThresholdMinutes("10"), + WithNowFunc(func() time.Time { return now }), + ) + + // Run 1: detect unhealthy. + if err := w.run(t.Context()); err != nil { + t.Fatal(err) + } - err = obj.rebootNode(test.args.nodeName) - if (err != nil) != test.wantErr { - t.Errorf("error = %v, wantErr %v", err, test.wantErr) - } - }) + // Run 2: still within threshold → no reboot. 
+ now = now.Add(5 * time.Minute) + if err := w.run(t.Context()); err != nil { + t.Fatal(err) + } + + state, _ := w.states.Get("node-01") + if state.Phase() != PhaseUnhealthy { + t.Errorf("got phase %v, want PhaseUnhealthy", state.Phase()) + } + if len(exec.calls) != 0 { + t.Errorf("expected no reboot calls within threshold, got %v", exec.calls) } } From c64cd02c771aef9fda9cbff1f5bab6db934fa07f Mon Sep 17 00:00:00 2001 From: hlts2 Date: Mon, 13 Apr 2026 22:41:29 +0900 Subject: [PATCH 02/71] refactor: extract operation options to options.go and add input validation - Rename reboot.go to civo.go to better reflect the Civo executor scope - Extract FOP (Option, WithAPIConfig, WithClient) into options.go - Add validation for clusterID, apiKey, and apiURL before Civo client creation Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/operation/{reboot.go => civo.go} | 32 +++++++------------ .../{reboot_test.go => civo_test.go} | 0 pkg/operation/options.go | 24 ++++++++++++++ 3 files changed, 35 insertions(+), 21 deletions(-) rename pkg/operation/{reboot.go => civo.go} (70%) rename pkg/operation/{reboot_test.go => civo_test.go} (100%) create mode 100644 pkg/operation/options.go diff --git a/pkg/operation/reboot.go b/pkg/operation/civo.go similarity index 70% rename from pkg/operation/reboot.go rename to pkg/operation/civo.go index d817774..ef9cd67 100644 --- a/pkg/operation/reboot.go +++ b/pkg/operation/civo.go @@ -8,27 +8,6 @@ import ( "github.com/civo/civogo" ) -// Option represents a configuration function that modifies civoExecutor. -type Option func(*civoExecutor) - -// WithAPIConfig returns Option to configure the Civo API credentials and version. -// The client is created internally using these values. 
-func WithAPIConfig(apiKey, apiURL, region, version string) Option { - return func(e *civoExecutor) { - e.apiKey = apiKey - e.apiURL = apiURL - e.region = region - e.version = version - } -} - -// WithClient returns Option to inject a pre-built Civo client (for testing). -func WithClient(client civogo.Clienter) Option { - return func(e *civoExecutor) { - e.civoClient = client - } -} - // civoExecutor implements Executor using the Civo API. type civoExecutor struct { civoClient civogo.Clienter @@ -47,10 +26,21 @@ func NewCivoExecutor(clusterID string, opts ...Option) (Executor, error) { opt(e) } + if clusterID == "" { + return nil, fmt.Errorf("cluster ID must not be empty") + } + if e.civoClient != nil { return e, nil } + if e.apiKey == "" { + return nil, fmt.Errorf("API key must not be empty") + } + if e.apiURL == "" { + return nil, fmt.Errorf("API URL must not be empty") + } + client, err := civogo.NewClientWithURL(e.apiKey, e.apiURL, e.region) if err != nil { return nil, fmt.Errorf("failed to initialise civo client: %w", err) diff --git a/pkg/operation/reboot_test.go b/pkg/operation/civo_test.go similarity index 100% rename from pkg/operation/reboot_test.go rename to pkg/operation/civo_test.go diff --git a/pkg/operation/options.go b/pkg/operation/options.go new file mode 100644 index 0000000..49ca08a --- /dev/null +++ b/pkg/operation/options.go @@ -0,0 +1,24 @@ +package operation + +import "github.com/civo/civogo" + +// Option represents a configuration function that modifies civoExecutor. +type Option func(*civoExecutor) + +// WithAPIConfig returns Option to configure the Civo API credentials and version. +// The client is created internally using these values. +func WithAPIConfig(apiKey, apiURL, region, version string) Option { + return func(e *civoExecutor) { + e.apiKey = apiKey + e.apiURL = apiURL + e.region = region + e.version = version + } +} + +// WithClient returns Option to inject a pre-built Civo client (for testing). 
+func WithClient(client civogo.Clienter) Option { + return func(e *civoExecutor) { + e.civoClient = client + } +} From abc45452b58445d7db0a798452fa905aff844082 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Mon, 13 Apr 2026 22:44:12 +0900 Subject: [PATCH 03/71] fix: update godoc comment Signed-off-by: hlts2 Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/health/health.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/health/health.go b/pkg/health/health.go index d0c5308..4032d13 100644 --- a/pkg/health/health.go +++ b/pkg/health/health.go @@ -10,7 +10,7 @@ type HealthChecker interface { Check(node *corev1.Node) bool } -// NewDefaultCheckers returns the enabled health checkers for the MVP. +// NewDefaultCheckers returns the enabled health checkers. // GPUChecker is included only when desiredGPUCount > 0. func NewDefaultCheckers(desiredGPUCount int) []HealthChecker { checkers := []HealthChecker{ From 6e665856884d1edb32ede9517d21616c1302eb48 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Mon, 13 Apr 2026 23:33:27 +0900 Subject: [PATCH 04/71] feat: add check function to detect disk pressure issue Signed-off-by: hlts2 Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/health/disk_pressure.go | 17 +++++++ pkg/health/disk_pressure_test.go | 79 ++++++++++++++++++++++++++++++++ pkg/health/gpu_test.go | 20 +++++--- pkg/health/health.go | 1 + 4 files changed, 110 insertions(+), 7 deletions(-) create mode 100644 pkg/health/disk_pressure.go create mode 100644 pkg/health/disk_pressure_test.go diff --git a/pkg/health/disk_pressure.go b/pkg/health/disk_pressure.go new file mode 100644 index 0000000..ec1e4df --- /dev/null +++ b/pkg/health/disk_pressure.go @@ -0,0 +1,17 @@ +package health + +import corev1 "k8s.io/api/core/v1" + +// diskPressureChecker reports healthy when the node does not have disk pressure. 
+type diskPressureChecker struct{} + +func (c *diskPressureChecker) Name() string { return "DiskPressure" } + +func (c *diskPressureChecker) Check(node *corev1.Node) bool { + for _, cond := range node.Status.Conditions { + if cond.Type == corev1.NodeDiskPressure { + return cond.Status != corev1.ConditionTrue + } + } + return true +} diff --git a/pkg/health/disk_pressure_test.go b/pkg/health/disk_pressure_test.go new file mode 100644 index 0000000..5208f15 --- /dev/null +++ b/pkg/health/disk_pressure_test.go @@ -0,0 +1,79 @@ +package health + +import ( + "testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func TestDiskPressureChecker_Name(t *testing.T) { + c := &diskPressureChecker{} + if got := c.Name(); got != "DiskPressure" { + t.Errorf("got %q, want %q", got, "DiskPressure") + } +} + +func TestDiskPressureChecker_Check(t *testing.T) { + tests := []struct { + name string + node *corev1.Node + want bool + }{ + { + name: "Returns true when DiskPressure is False (no pressure)", + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, + Status: corev1.NodeStatus{ + Conditions: []corev1.NodeCondition{ + {Type: corev1.NodeDiskPressure, Status: corev1.ConditionFalse}, + }, + }, + }, + want: true, + }, + { + name: "Returns false when DiskPressure is True (under pressure)", + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, + Status: corev1.NodeStatus{ + Conditions: []corev1.NodeCondition{ + {Type: corev1.NodeDiskPressure, Status: corev1.ConditionTrue}, + }, + }, + }, + want: false, + }, + { + name: "Returns true when no conditions present", + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, + Status: corev1.NodeStatus{ + Conditions: []corev1.NodeCondition{}, + }, + }, + want: true, + }, + { + name: "Returns true when only non-DiskPressure conditions present", + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, + Status: corev1.NodeStatus{ + Conditions: 
[]corev1.NodeCondition{ + {Type: corev1.NodeReady, Status: corev1.ConditionTrue}, + }, + }, + }, + want: true, + }, + } + + c := &diskPressureChecker{} + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := c.Check(tt.node); got != tt.want { + t.Errorf("got %v, want %v", got, tt.want) + } + }) + } +} diff --git a/pkg/health/gpu_test.go b/pkg/health/gpu_test.go index f494054..0ff326c 100644 --- a/pkg/health/gpu_test.go +++ b/pkg/health/gpu_test.go @@ -98,24 +98,30 @@ func TestGPUChecker_Check(t *testing.T) { func TestNewDefaultCheckers(t *testing.T) { t.Run("GPU disabled when desiredCount is 0", func(t *testing.T) { checkers := NewDefaultCheckers(0) - if len(checkers) != 1 { - t.Fatalf("expected 1 checker, got %d", len(checkers)) + if len(checkers) != 2 { + t.Fatalf("expected 2 checkers, got %d", len(checkers)) } if checkers[0].Name() != "NodeReady" { - t.Errorf("expected NodeReady checker, got %q", checkers[0].Name()) + t.Errorf("expected NodeReady checker first, got %q", checkers[0].Name()) + } + if checkers[1].Name() != "DiskPressure" { + t.Errorf("expected DiskPressure checker second, got %q", checkers[1].Name()) } }) t.Run("GPU enabled when desiredCount is positive", func(t *testing.T) { checkers := NewDefaultCheckers(8) - if len(checkers) != 2 { - t.Fatalf("expected 2 checkers, got %d", len(checkers)) + if len(checkers) != 3 { + t.Fatalf("expected 3 checkers, got %d", len(checkers)) } if checkers[0].Name() != "NodeReady" { t.Errorf("expected NodeReady checker first, got %q", checkers[0].Name()) } - if checkers[1].Name() != "GPU" { - t.Errorf("expected GPU checker second, got %q", checkers[1].Name()) + if checkers[1].Name() != "DiskPressure" { + t.Errorf("expected DiskPressure checker second, got %q", checkers[1].Name()) + } + if checkers[2].Name() != "GPU" { + t.Errorf("expected GPU checker third, got %q", checkers[2].Name()) } }) } diff --git a/pkg/health/health.go b/pkg/health/health.go index 4032d13..2da0806 100644 --- 
a/pkg/health/health.go +++ b/pkg/health/health.go @@ -15,6 +15,7 @@ type HealthChecker interface { func NewDefaultCheckers(desiredGPUCount int) []HealthChecker { checkers := []HealthChecker{ &nodeReadyChecker{}, + &diskPressureChecker{}, } if desiredGPUCount > 0 { checkers = append(checkers, &gpuChecker{desiredCount: desiredGPUCount}) From 97732479f92c20bbc1fd5ae5298782803f5f1c6d Mon Sep 17 00:00:00 2001 From: hlts2 Date: Mon, 13 Apr 2026 23:44:22 +0900 Subject: [PATCH 05/71] fix: update code comment Signed-off-by: hlts2 Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/watcher/watcher.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index 18d1373..fe79de8 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -197,7 +197,9 @@ func (w *watcher) run(ctx context.Context) error { continue } - // At least one checker failed. + // At least one checker failed — enter the recovery judgment phase. + // The state machine decides the next action (wait, reboot, retry) + // regardless of which specific checker(s) failed. 
isGPU := hasGPU(node) w.states.UpdateCheckerInfo(nodeName, failedCheckers, isGPU) From 117931df2237a3566f087fd7baef40cffbb69b14 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 14 Apr 2026 01:54:18 +0900 Subject: [PATCH 06/71] feat: add per-checker thresholds and DiskPressure checker - Add Threshold() to HealthChecker interface for per-checker thresholds - NodeReady: 10min, GPU: 10min, DiskPressure: 30min - Remove single unhealthyThreshold from watcher; use min threshold of failed checkers - Remove WithUnhealthyThresholdMinutes option and CIVO_NODE_UNHEALTHY_THRESHOLD_MINUTES env var - Refactor main.go: parseUintOrZero, defaultMetricsPort as int Co-Authored-By: Claude Opus 4.6 (1M context) --- main.go | 41 ++++++++++++++++++++----------------- pkg/health/disk_pressure.go | 9 ++++++-- pkg/health/gpu.go | 9 ++++++-- pkg/health/health.go | 9 +++++++- pkg/health/node_ready.go | 9 ++++++-- pkg/watcher/options.go | 14 ------------- pkg/watcher/watcher.go | 17 ++++++++------- pkg/watcher/watcher_test.go | 16 +++++++-------- 8 files changed, 68 insertions(+), 56 deletions(-) diff --git a/main.go b/main.go index 31c4bca..3d109ce 100644 --- a/main.go +++ b/main.go @@ -24,20 +24,19 @@ var ( ) var ( - apiURL = strings.TrimSpace(os.Getenv("CIVO_API_URL")) - apiKey = strings.TrimSpace(os.Getenv("CIVO_API_KEY")) - region = strings.TrimSpace(os.Getenv("CIVO_REGION")) - clusterID = strings.TrimSpace(os.Getenv("CIVO_CLUSTER_ID")) - nodePoolID = strings.TrimSpace(os.Getenv("CIVO_NODE_POOL_ID")) - nodeDesiredGPUCount = strings.TrimSpace(os.Getenv("CIVO_NODE_DESIRED_GPU_COUNT")) - rebootTimeWindowMinutes = strings.TrimSpace(os.Getenv("CIVO_NODE_REBOOT_TIME_WINDOW_MINUTES")) - monitorOnly = strings.TrimSpace(os.Getenv("CIVO_NODE_MONITOR_ONLY")) - unhealthyThresholdMinutes = strings.TrimSpace(os.Getenv("CIVO_NODE_UNHEALTHY_THRESHOLD_MINUTES")) - metricsPort = strings.TrimSpace(os.Getenv("CIVO_NODE_METRICS_PORT")) + apiURL = strings.TrimSpace(os.Getenv("CIVO_API_URL")) + apiKey = 
strings.TrimSpace(os.Getenv("CIVO_API_KEY")) + region = strings.TrimSpace(os.Getenv("CIVO_REGION")) + clusterID = strings.TrimSpace(os.Getenv("CIVO_CLUSTER_ID")) + nodePoolID = strings.TrimSpace(os.Getenv("CIVO_NODE_POOL_ID")) + nodeDesiredGPUCount = strings.TrimSpace(os.Getenv("CIVO_NODE_DESIRED_GPU_COUNT")) + rebootTimeWindowMinutes = strings.TrimSpace(os.Getenv("CIVO_NODE_REBOOT_TIME_WINDOW_MINUTES")) + monitorOnly = strings.TrimSpace(os.Getenv("CIVO_NODE_MONITOR_ONLY")) + metricsPort = strings.TrimSpace(os.Getenv("CIVO_NODE_METRICS_PORT")) ) const ( - defaultMetricsPort = "9625" + defaultMetricsPort = 9625 ) func run(ctx context.Context) error { @@ -49,7 +48,7 @@ func run(ctx context.Context) error { if err != nil { return fmt.Errorf("failed to initialise executor: %w", err) } - checkers := health.NewDefaultCheckers(parseIntOrZero(nodeDesiredGPUCount)) + checkers := health.NewDefaultCheckers(parseUintOrZero(nodeDesiredGPUCount)) monitorOnlyFlag := true if v, err := strconv.ParseBool(monitorOnly); err == nil { @@ -61,9 +60,9 @@ func run(ctx context.Context) error { port := defaultMetricsPort // Exclude well known port and negative integers. 
if v, err := strconv.Atoi(metricsPort); err == nil && v >= 1024 && v <= 65535 { - port = metricsPort + port = v } - addr := ":" + port + addr := ":" + strconv.Itoa(port) slog.Info("Starting metrics server", "addr", addr) if err := http.ListenAndServe(addr, metrics.Handler()); err != nil { slog.Error("Metrics server failed", "error", err) @@ -76,7 +75,6 @@ func run(ctx context.Context) error { watcher.WithMonitorOnly(monitorOnlyFlag), watcher.WithRebootTimeWindowMinutes(rebootTimeWindowMinutes), watcher.WithDesiredGPUCount(nodeDesiredGPUCount), - watcher.WithUnhealthyThresholdMinutes(unhealthyThresholdMinutes), ) if err != nil { return err @@ -103,11 +101,16 @@ func main() { } } -func parseIntOrZero(s string) int { +func parseUintOrZero(s string) int { if s == "" { return 0 } - n := 0 - fmt.Sscanf(s, "%d", &n) - return n + v, err := strconv.Atoi(s) + if err != nil { + return 0 + } + if v < 0 { + return 0 + } + return v } diff --git a/pkg/health/disk_pressure.go b/pkg/health/disk_pressure.go index ec1e4df..3610375 100644 --- a/pkg/health/disk_pressure.go +++ b/pkg/health/disk_pressure.go @@ -1,11 +1,16 @@ package health -import corev1 "k8s.io/api/core/v1" +import ( + "time" + + corev1 "k8s.io/api/core/v1" +) // diskPressureChecker reports healthy when the node does not have disk pressure. 
type diskPressureChecker struct{} -func (c *diskPressureChecker) Name() string { return "DiskPressure" } +func (c *diskPressureChecker) Name() string { return "DiskPressure" } +func (c *diskPressureChecker) Threshold() time.Duration { return 30 * time.Minute } func (c *diskPressureChecker) Check(node *corev1.Node) bool { for _, cond := range node.Status.Conditions { diff --git a/pkg/health/gpu.go b/pkg/health/gpu.go index de81007..4afb8ae 100644 --- a/pkg/health/gpu.go +++ b/pkg/health/gpu.go @@ -1,6 +1,10 @@ package health -import corev1 "k8s.io/api/core/v1" +import ( + "time" + + corev1 "k8s.io/api/core/v1" +) const gpuResourceName = "nvidia.com/gpu" @@ -15,7 +19,8 @@ func NewGPUChecker(desiredCount int) HealthChecker { return &gpuChecker{desiredCount: desiredCount} } -func (c *gpuChecker) Name() string { return "GPU" } +func (c *gpuChecker) Name() string { return "GPU" } +func (c *gpuChecker) Threshold() time.Duration { return 10 * time.Minute } func (c *gpuChecker) Check(node *corev1.Node) bool { if c.desiredCount == 0 { diff --git a/pkg/health/health.go b/pkg/health/health.go index 2da0806..8096b42 100644 --- a/pkg/health/health.go +++ b/pkg/health/health.go @@ -1,6 +1,10 @@ package health -import corev1 "k8s.io/api/core/v1" +import ( + "time" + + corev1 "k8s.io/api/core/v1" +) // HealthChecker determines whether a single aspect of a node is healthy. type HealthChecker interface { @@ -8,6 +12,9 @@ type HealthChecker interface { Name() string // Check returns true if the node is healthy for this checker's concern. Check(node *corev1.Node) bool + // Threshold returns how long this checker must continuously fail + // before a recovery action is triggered. + Threshold() time.Duration } // NewDefaultCheckers returns the enabled health checkers. 
diff --git a/pkg/health/node_ready.go b/pkg/health/node_ready.go index d9ca7e7..a95b259 100644 --- a/pkg/health/node_ready.go +++ b/pkg/health/node_ready.go @@ -1,11 +1,16 @@ package health -import corev1 "k8s.io/api/core/v1" +import ( + "time" + + corev1 "k8s.io/api/core/v1" +) // nodeReadyChecker reports healthy when the node's NodeReady condition is True. type nodeReadyChecker struct{} -func (c *nodeReadyChecker) Name() string { return "NodeReady" } +func (c *nodeReadyChecker) Name() string { return "NodeReady" } +func (c *nodeReadyChecker) Threshold() time.Duration { return 10 * time.Minute } func (c *nodeReadyChecker) Check(node *corev1.Node) bool { for _, cond := range node.Status.Conditions { diff --git a/pkg/watcher/options.go b/pkg/watcher/options.go index cc3d677..748e6bf 100644 --- a/pkg/watcher/options.go +++ b/pkg/watcher/options.go @@ -17,7 +17,6 @@ type Option func(*watcher) var defaultOptions = []Option{ WithRebootTimeWindowMinutes("40"), WithDesiredGPUCount("0"), - WithUnhealthyThresholdMinutes("10"), } // WithKubernetesClient returns Option to set Kubernetes API client. @@ -70,19 +69,6 @@ func WithMonitorOnly(v bool) Option { } } -// WithUnhealthyThresholdMinutes returns Option to set the duration a node -// must be continuously unhealthy before a recovery action is triggered. -func WithUnhealthyThresholdMinutes(s string) Option { - return func(w *watcher) { - n, err := strconv.Atoi(s) - if err == nil && n > 0 { - w.unhealthyThreshold = time.Duration(n) * time.Minute - } else { - slog.Info("UnhealthyThresholdMinutes is invalid", "value", s) - } - } -} - // WithCheckers returns Option to set the health checkers. 
func WithCheckers(checkers []health.HealthChecker) Option { return func(w *watcher) { diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index fe79de8..476e48b 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -40,12 +40,11 @@ type watcher struct { nodeSelector *metav1.LabelSelector nodeLister listerscorev1.NodeLister - monitorOnly bool - unhealthyThreshold time.Duration - checkers []health.HealthChecker - executor operation.Executor - states *StateStore - nowFunc func() time.Time + monitorOnly bool + checkers []health.HealthChecker + executor operation.Executor + states *StateStore + nowFunc func() time.Time } func NewWatcher(ctx context.Context, clusterID, nodePoolID string, opts ...Option) (Watcher, error) { @@ -170,12 +169,16 @@ func (w *watcher) run(ctx context.Context) error { // Run all health checkers and collect failures. var failedCheckers []string + var minThreshold time.Duration for _, checker := range w.checkers { healthy := checker.Check(node) result := "pass" if !healthy { result = "fail" failedCheckers = append(failedCheckers, checker.Name()) + if minThreshold == 0 || checker.Threshold() < minThreshold { + minThreshold = checker.Threshold() + } } metrics.HealthCheckTotal.WithLabelValues(nodeName, checker.Name(), result).Inc() } @@ -216,7 +219,7 @@ func (w *watcher) run(ctx context.Context) error { case PhaseUnhealthy: metrics.NodeUnhealthyDurationSeconds.WithLabelValues(nodeName).Set( now.Sub(state.UnhealthySince()).Seconds()) - if now.Sub(state.UnhealthySince()) < w.unhealthyThreshold { + if now.Sub(state.UnhealthySince()) < minThreshold { continue } if !w.monitorOnly { diff --git a/pkg/watcher/watcher_test.go b/pkg/watcher/watcher_test.go index 6d877bd..41c6684 100644 --- a/pkg/watcher/watcher_test.go +++ b/pkg/watcher/watcher_test.go @@ -54,10 +54,14 @@ func (m *mockExecutor) Reboot(ctx context.Context, nodeName string) error { } // alwaysFailChecker is a HealthChecker that always reports unhealthy. 
-type alwaysFailChecker struct{ name string } +type alwaysFailChecker struct { + name string + threshold time.Duration +} -func (c *alwaysFailChecker) Name() string { return c.name } -func (c *alwaysFailChecker) Check(*corev1.Node) bool { return false } +func (c *alwaysFailChecker) Name() string { return c.name } +func (c *alwaysFailChecker) Check(*corev1.Node) bool { return false } +func (c *alwaysFailChecker) Threshold() time.Duration { return c.threshold } // --- Test variables --- @@ -293,7 +297,6 @@ func TestRun_RebootTriggerActiveMode(t *testing.T) { WithCheckers(health.NewDefaultCheckers(8)), WithExecutor(exec), WithMonitorOnly(false), - WithUnhealthyThresholdMinutes("10"), WithNowFunc(func() time.Time { return now }), ) @@ -331,7 +334,6 @@ func TestRun_RebootSkippedInReportMode(t *testing.T) { WithCheckers(health.NewDefaultCheckers(8)), WithExecutor(exec), WithMonitorOnly(true), - WithUnhealthyThresholdMinutes("10"), WithNowFunc(func() time.Time { return now }), ) @@ -364,7 +366,6 @@ func TestRun_RecoveryAfterReboot(t *testing.T) { WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), WithCheckers(health.NewDefaultCheckers(8)), WithMonitorOnly(false), - WithUnhealthyThresholdMinutes("10"), WithNowFunc(func() time.Time { return now }), ) @@ -405,7 +406,6 @@ func TestRun_RebootRetry(t *testing.T) { WithCheckers(health.NewDefaultCheckers(8)), WithExecutor(exec), WithMonitorOnly(false), - WithUnhealthyThresholdMinutes("10"), WithRebootTimeWindowMinutes("40"), WithNowFunc(func() time.Time { return now }), ) @@ -482,7 +482,6 @@ func TestRun_RebootErrorContinuesProcessing(t *testing.T) { WithCheckers(health.NewDefaultCheckers(0)), WithExecutor(exec), WithMonitorOnly(false), - WithUnhealthyThresholdMinutes("10"), WithNowFunc(func() time.Time { return now }), ) @@ -552,7 +551,6 @@ func TestRun_UnhealthyWithinThresholdNoReboot(t *testing.T) { WithCheckers(health.NewDefaultCheckers(0)), WithExecutor(exec), WithMonitorOnly(false), - 
WithUnhealthyThresholdMinutes("10"), WithNowFunc(func() time.Time { return now }), ) From b1bd9a066292c8ec9426f040d19734f985917689 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 14 Apr 2026 01:58:57 +0900 Subject: [PATCH 07/71] fix: replace environment variable names with parameter-level error messages in NewWatcher Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/watcher/watcher.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index 476e48b..e725b4b 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -59,10 +59,10 @@ func NewWatcher(ctx context.Context, clusterID, nodePoolID string, opts ...Optio } if clusterID == "" { - return nil, fmt.Errorf("CIVO_CLUSTER_ID not set") + return nil, fmt.Errorf("cluster ID must not be empty") } if nodePoolID == "" { - return nil, fmt.Errorf("CIVO_NODE_POOL_ID not set") + return nil, fmt.Errorf("node pool ID must not be empty") } w.nodeSelector = &metav1.LabelSelector{ From 0a456c8f690eac284b1a92e0e8e45088d7a9376c Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 14 Apr 2026 02:18:21 +0900 Subject: [PATCH 08/71] refactor: remove unused nodeDesiredGPUCount from watcher and add kubeconfig flag - Remove nodeDesiredGPUCount field, WithDesiredGPUCount option (GPU count is owned by checker) - Remove NewGPUChecker public constructor (only used internally by NewDefaultCheckers) - Add --kubeconfig flag with default /etc/rancher/k3s/k3s.yaml for CP VM Co-Authored-By: Claude Opus 4.6 (1M context) --- main.go | 7 ++++--- pkg/health/gpu.go | 5 ----- pkg/health/gpu_test.go | 4 ++-- pkg/watcher/options.go | 13 ------------- pkg/watcher/watcher.go | 1 - pkg/watcher/watcher_test.go | 14 -------------- 6 files changed, 6 insertions(+), 38 deletions(-) diff --git a/main.go b/main.go index 3d109ce..a5a3849 100644 --- a/main.go +++ b/main.go @@ -19,8 +19,9 @@ import ( ) var ( - version = "0.0.1" - versionInfo = flag.Bool("version", false, "Print the driver 
version") + version = "0.0.1" + versionInfo = flag.Bool("version", false, "Print the driver version") + kubeconfigPath = flag.String("kubeconfig", "/etc/rancher/k3s/k3s.yaml", "Path to kubeconfig file (empty for in-cluster config)") ) var ( @@ -70,11 +71,11 @@ func run(ctx context.Context) error { }() w, err := watcher.NewWatcher(ctx, clusterID, nodePoolID, + watcher.WithKubernetesClientConfigPath(*kubeconfigPath), watcher.WithExecutor(executor), watcher.WithCheckers(checkers), watcher.WithMonitorOnly(monitorOnlyFlag), watcher.WithRebootTimeWindowMinutes(rebootTimeWindowMinutes), - watcher.WithDesiredGPUCount(nodeDesiredGPUCount), ) if err != nil { return err diff --git a/pkg/health/gpu.go b/pkg/health/gpu.go index 4afb8ae..72405de 100644 --- a/pkg/health/gpu.go +++ b/pkg/health/gpu.go @@ -14,11 +14,6 @@ type gpuChecker struct { desiredCount int } -// NewGPUChecker creates a HealthChecker that verifies the node's allocatable GPU count. -func NewGPUChecker(desiredCount int) HealthChecker { - return &gpuChecker{desiredCount: desiredCount} -} - func (c *gpuChecker) Name() string { return "GPU" } func (c *gpuChecker) Threshold() time.Duration { return 10 * time.Minute } diff --git a/pkg/health/gpu_test.go b/pkg/health/gpu_test.go index 0ff326c..2288a54 100644 --- a/pkg/health/gpu_test.go +++ b/pkg/health/gpu_test.go @@ -9,7 +9,7 @@ import ( ) func TestGPUChecker_Name(t *testing.T) { - c := NewGPUChecker(8) + c := &gpuChecker{desiredCount: 8} if got := c.Name(); got != "GPU" { t.Errorf("got %q, want %q", got, "GPU") } @@ -87,7 +87,7 @@ func TestGPUChecker_Check(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - c := NewGPUChecker(tt.desired) + c := &gpuChecker{desiredCount: tt.desired} if got := c.Check(tt.node); got != tt.want { t.Errorf("got %v, want %v", got, tt.want) } diff --git a/pkg/watcher/options.go b/pkg/watcher/options.go index 748e6bf..ad30630 100644 --- a/pkg/watcher/options.go +++ b/pkg/watcher/options.go @@ -16,7 +16,6 @@ 
type Option func(*watcher) var defaultOptions = []Option{ WithRebootTimeWindowMinutes("40"), - WithDesiredGPUCount("0"), } // WithKubernetesClient returns Option to set Kubernetes API client. @@ -49,18 +48,6 @@ func WithRebootTimeWindowMinutes(s string) Option { } } -// WithDesiredGPUCount returns Option to set desired GPU count . -func WithDesiredGPUCount(s string) Option { - return func(w *watcher) { - n, err := strconv.Atoi(s) - if err == nil && n >= 0 { - w.nodeDesiredGPUCount = n - } else { - slog.Info("DesiredGPUCount is invalid", "value", s) - } - } -} - // WithMonitorOnly returns Option to enable or disable monitor-only mode. // When true (default), recovery actions are logged but not executed. func WithMonitorOnly(v bool) Option { diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index e725b4b..4c16083 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -34,7 +34,6 @@ type watcher struct { clientCfgPath string clusterID string - nodeDesiredGPUCount int rebootTimeWindowMinutes time.Duration nodeSelector *metav1.LabelSelector diff --git a/pkg/watcher/watcher_test.go b/pkg/watcher/watcher_test.go index 41c6684..d980d5b 100644 --- a/pkg/watcher/watcher_test.go +++ b/pkg/watcher/watcher_test.go @@ -68,7 +68,6 @@ func (c *alwaysFailChecker) Threshold() time.Duration { return c.threshold } var ( testClusterID = "test-cluster-123" testNodePoolID = "test-node-pool" - testNodeDesiredGPUCount = "8" testRebootTimeWindowMinutes = time.Duration(40) ) @@ -135,20 +134,12 @@ func TestNew(t *testing.T) { opts: []Option{ WithKubernetesClient(fake.NewSimpleClientset()), WithExecutor(&mockExecutor{}), - WithDesiredGPUCount(testNodeDesiredGPUCount), }, }, checkFunc: func(w *watcher) error { if w.clusterID != testClusterID { return fmt.Errorf("clusterID mismatch: got %s, want %s", w.clusterID, testClusterID) } - cnt, err := strconv.Atoi(testNodeDesiredGPUCount) - if err != nil { - return err - } - if w.nodeDesiredGPUCount != cnt { - return 
fmt.Errorf("nodeDesiredGPUCount mismatch: got %d, want %d", w.nodeDesiredGPUCount, cnt) - } if w.nodeSelector == nil || w.nodeSelector.MatchLabels[nodePoolLabelKey] != testNodePoolID { return fmt.Errorf("nodeSelector mismatch: got %v, want %s", w.nodeSelector, testNodePoolID) } @@ -178,16 +169,11 @@ func TestNew(t *testing.T) { opts: []Option{ WithKubernetesClient(fake.NewSimpleClientset()), WithExecutor(&mockExecutor{}), - WithDesiredGPUCount("invalid"), - WithDesiredGPUCount("-1"), WithRebootTimeWindowMinutes("invalid time"), WithRebootTimeWindowMinutes("0"), }, }, checkFunc: func(w *watcher) error { - if w.nodeDesiredGPUCount != 0 { - return fmt.Errorf("nodeDesiredGPUCount mismatch: got %d, want %d", w.nodeDesiredGPUCount, 0) - } if w.rebootTimeWindowMinutes != testRebootTimeWindowMinutes { return fmt.Errorf("rebootTimeWindowMinutes mismatch: got %v, want %v", w.rebootTimeWindowMinutes, testRebootTimeWindowMinutes) } From 43a495a865c058423fdfb1e76860b63754b7d9d1 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 14 Apr 2026 02:59:51 +0900 Subject: [PATCH 09/71] feat: support multiple node pool IDs and empty (all nodes) selector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Change NewWatcher signature: nodePoolID string → nodePoolIDs []string - Add buildNodeSelector: empty=all nodes, single=MatchLabels, multiple=In operator - Add parseNodePoolIDs in main.go for comma-separated CIVO_NODE_POOL_ID Co-Authored-By: Claude Opus 4.6 (1M context) --- main.go | 17 ++++++++++++++++- pkg/watcher/watcher.go | 38 ++++++++++++++++++++++++++++--------- pkg/watcher/watcher_test.go | 31 ++++++++++-------------------- 3 files changed, 55 insertions(+), 31 deletions(-) diff --git a/main.go b/main.go index a5a3849..016d37e 100644 --- a/main.go +++ b/main.go @@ -70,7 +70,7 @@ func run(ctx context.Context) error { } }() - w, err := watcher.NewWatcher(ctx, clusterID, nodePoolID, + w, err := watcher.NewWatcher(ctx, clusterID, 
parseNodePoolIDs(nodePoolID), watcher.WithKubernetesClientConfigPath(*kubeconfigPath), watcher.WithExecutor(executor), watcher.WithCheckers(checkers), @@ -102,6 +102,21 @@ func main() { } } +// parseNodePoolIDs splits a comma-separated string into node pool IDs. +// Empty string returns nil (all node pools). +func parseNodePoolIDs(s string) []string { + if s == "" { + return nil + } + var ids []string + for _, id := range strings.Split(s, ",") { + if v := strings.TrimSpace(id); v != "" { + ids = append(ids, v) + } + } + return ids +} + func parseUintOrZero(s string) int { if s == "" { return 0 diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index 4c16083..bef65b1 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -46,7 +46,7 @@ type watcher struct { nowFunc func() time.Time } -func NewWatcher(ctx context.Context, clusterID, nodePoolID string, opts ...Option) (Watcher, error) { +func NewWatcher(ctx context.Context, clusterID string, nodePoolIDs []string, opts ...Option) (Watcher, error) { w := &watcher{ clusterID: clusterID, monitorOnly: true, @@ -60,15 +60,8 @@ func NewWatcher(ctx context.Context, clusterID, nodePoolID string, opts ...Optio if clusterID == "" { return nil, fmt.Errorf("cluster ID must not be empty") } - if nodePoolID == "" { - return nil, fmt.Errorf("node pool ID must not be empty") - } - w.nodeSelector = &metav1.LabelSelector{ - MatchLabels: map[string]string{ - nodePoolLabelKey: nodePoolID, - }, - } + w.nodeSelector = buildNodeSelector(nodePoolIDs) if err := w.setupKubernetesClient(); err != nil { return nil, err @@ -271,6 +264,33 @@ func modeLabel(monitorOnly bool) string { return "active" } +// buildNodeSelector builds a LabelSelector based on the given node pool IDs. 
+// - empty: no selector (all nodes) +// - single: MatchLabels exact match +// - multiple: MatchExpressions In operator +func buildNodeSelector(nodePoolIDs []string) *metav1.LabelSelector { + switch len(nodePoolIDs) { + case 0: + return nil + case 1: + return &metav1.LabelSelector{ + MatchLabels: map[string]string{ + nodePoolLabelKey: nodePoolIDs[0], + }, + } + default: + return &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: nodePoolLabelKey, + Operator: metav1.LabelSelectorOpIn, + Values: nodePoolIDs, + }, + }, + } + } +} + func hasGPU(node *corev1.Node) bool { quantity, exists := node.Status.Allocatable[gpuResourceName] if !exists { diff --git a/pkg/watcher/watcher_test.go b/pkg/watcher/watcher_test.go index d980d5b..2d7cb42 100644 --- a/pkg/watcher/watcher_test.go +++ b/pkg/watcher/watcher_test.go @@ -102,7 +102,7 @@ func newTestWatcher(t *testing.T, opts ...Option) *watcher { WithExecutor(&mockExecutor{}), } w, err := NewWatcher(t.Context(), - testClusterID, testNodePoolID, + testClusterID, []string{testNodePoolID}, append(baseOpts, opts...)...) 
if err != nil { t.Fatal(err) @@ -114,9 +114,9 @@ func newTestWatcher(t *testing.T, opts ...Option) *watcher { func TestNew(t *testing.T) { type args struct { - clusterID string - nodePoolID string - opts []Option + clusterID string + nodePoolIDs []string + opts []Option } type test struct { name string @@ -129,8 +129,8 @@ func TestNew(t *testing.T) { { name: "Returns no error when given valid input", args: args{ - clusterID: testClusterID, - nodePoolID: testNodePoolID, + clusterID: testClusterID, + nodePoolIDs: []string{testNodePoolID}, opts: []Option{ WithKubernetesClient(fake.NewSimpleClientset()), WithExecutor(&mockExecutor{}), @@ -164,8 +164,8 @@ func TestNew(t *testing.T) { { name: "Returns no error when input is invalid, but default value is set", args: args{ - clusterID: testClusterID, - nodePoolID: testNodePoolID, + clusterID: testClusterID, + nodePoolIDs: []string{testNodePoolID}, opts: []Option{ WithKubernetesClient(fake.NewSimpleClientset()), WithExecutor(&mockExecutor{}), @@ -183,18 +183,7 @@ func TestNew(t *testing.T) { { name: "Returns an error when clusterID is missing", args: args{ - nodePoolID: testNodePoolID, - opts: []Option{ - WithKubernetesClient(fake.NewSimpleClientset()), - WithExecutor(&mockExecutor{}), - }, - }, - wantErr: true, - }, - { - name: "Returns an error when nodePoolID is missing", - args: args{ - clusterID: testClusterID, + nodePoolIDs: []string{testNodePoolID}, opts: []Option{ WithKubernetesClient(fake.NewSimpleClientset()), WithExecutor(&mockExecutor{}), @@ -208,7 +197,7 @@ func TestNew(t *testing.T) { t.Run(test.name, func(t *testing.T) { w, err := NewWatcher(t.Context(), test.args.clusterID, - test.args.nodePoolID, + test.args.nodePoolIDs, test.args.opts...) 
if (err != nil) != test.wantErr { t.Errorf("error = %v, wantErr %v", err, test.wantErr) From a0017b196467a66b038f2f3eb0138c2a2d553d43 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 14 Apr 2026 03:04:52 +0900 Subject: [PATCH 10/71] feat: detect expected GPU count from node label instead of static env var - GPU checker now compares nvidia.com/gpu.count label (expected) vs allocatable (actual) - Auto-skips non-GPU nodes when label is absent - Remove CIVO_NODE_DESIRED_GPU_COUNT env var and NewDefaultCheckers parameter - NewDefaultCheckers() always includes GPU checker (self-determines GPU node) Co-Authored-By: Claude Opus 4.6 (1M context) --- main.go | 17 +----- pkg/health/gpu.go | 33 ++++++++--- pkg/health/gpu_test.go | 113 +++++++++++++++++++++--------------- pkg/health/health.go | 12 ++-- pkg/watcher/watcher_test.go | 27 +++++---- 5 files changed, 111 insertions(+), 91 deletions(-) diff --git a/main.go b/main.go index 016d37e..4899d4e 100644 --- a/main.go +++ b/main.go @@ -30,7 +30,6 @@ var ( region = strings.TrimSpace(os.Getenv("CIVO_REGION")) clusterID = strings.TrimSpace(os.Getenv("CIVO_CLUSTER_ID")) nodePoolID = strings.TrimSpace(os.Getenv("CIVO_NODE_POOL_ID")) - nodeDesiredGPUCount = strings.TrimSpace(os.Getenv("CIVO_NODE_DESIRED_GPU_COUNT")) rebootTimeWindowMinutes = strings.TrimSpace(os.Getenv("CIVO_NODE_REBOOT_TIME_WINDOW_MINUTES")) monitorOnly = strings.TrimSpace(os.Getenv("CIVO_NODE_MONITOR_ONLY")) metricsPort = strings.TrimSpace(os.Getenv("CIVO_NODE_METRICS_PORT")) @@ -49,7 +48,7 @@ func run(ctx context.Context) error { if err != nil { return fmt.Errorf("failed to initialise executor: %w", err) } - checkers := health.NewDefaultCheckers(parseUintOrZero(nodeDesiredGPUCount)) + checkers := health.NewDefaultCheckers() monitorOnlyFlag := true if v, err := strconv.ParseBool(monitorOnly); err == nil { @@ -116,17 +115,3 @@ func parseNodePoolIDs(s string) []string { } return ids } - -func parseUintOrZero(s string) int { - if s == "" { - return 0 - } - v, err := 
strconv.Atoi(s) - if err != nil { - return 0 - } - if v < 0 { - return 0 - } - return v -} diff --git a/pkg/health/gpu.go b/pkg/health/gpu.go index 72405de..8033fbe 100644 --- a/pkg/health/gpu.go +++ b/pkg/health/gpu.go @@ -1,24 +1,28 @@ package health import ( + "strconv" "time" corev1 "k8s.io/api/core/v1" ) -const gpuResourceName = "nvidia.com/gpu" +const ( + gpuResourceName = "nvidia.com/gpu" + gpuCountLabel = "nvidia.com/gpu.count" +) // gpuChecker reports healthy when the node's allocatable GPU count -// matches the desired count. If desiredCount is 0 the check always passes. -type gpuChecker struct { - desiredCount int -} +// matches the expected count from the nvidia.com/gpu.count label. +// If the label is not present, the node is not a GPU node and the check is skipped. +type gpuChecker struct{} func (c *gpuChecker) Name() string { return "GPU" } func (c *gpuChecker) Threshold() time.Duration { return 10 * time.Minute } func (c *gpuChecker) Check(node *corev1.Node) bool { - if c.desiredCount == 0 { + expected, ok := expectedGPUCount(node) + if !ok || expected == 0 { return true } @@ -27,10 +31,23 @@ func (c *gpuChecker) Check(node *corev1.Node) bool { return false } - gpuCount, ok := quantity.AsInt64() + actual, ok := quantity.AsInt64() if !ok { return false } - return gpuCount == int64(c.desiredCount) + return actual == int64(expected) +} + +// expectedGPUCount reads the nvidia.com/gpu.count label from the node. 
+func expectedGPUCount(node *corev1.Node) (int, bool) { + v, exists := node.Labels[gpuCountLabel] + if !exists { + return 0, false + } + n, err := strconv.Atoi(v) + if err != nil || n < 0 { + return 0, false + } + return n, true } diff --git a/pkg/health/gpu_test.go b/pkg/health/gpu_test.go index 2288a54..6f7b7a9 100644 --- a/pkg/health/gpu_test.go +++ b/pkg/health/gpu_test.go @@ -9,7 +9,7 @@ import ( ) func TestGPUChecker_Name(t *testing.T) { - c := &gpuChecker{desiredCount: 8} + c := &gpuChecker{} if got := c.Name(); got != "GPU" { t.Errorf("got %q, want %q", got, "GPU") } @@ -17,16 +17,17 @@ func TestGPUChecker_Name(t *testing.T) { func TestGPUChecker_Check(t *testing.T) { tests := []struct { - name string - desired int - node *corev1.Node - want bool + name string + node *corev1.Node + want bool }{ { - name: "Returns true when GPU count matches desired", - desired: 8, + name: "Returns true when allocatable matches label count", node: &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, + ObjectMeta: metav1.ObjectMeta{ + Name: "node-01", + Labels: map[string]string{gpuCountLabel: "8"}, + }, Status: corev1.NodeStatus{ Allocatable: corev1.ResourceList{ gpuResourceName: resource.MustParse("8"), @@ -36,8 +37,7 @@ func TestGPUChecker_Check(t *testing.T) { want: true, }, { - name: "Returns true when desired is 0 (check skipped)", - desired: 0, + name: "Returns true when gpu.count label is absent (non-GPU node)", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, Status: corev1.NodeStatus{ @@ -47,10 +47,25 @@ func TestGPUChecker_Check(t *testing.T) { want: true, }, { - name: "Returns false when GPU count is less than desired", - desired: 8, + name: "Returns true when gpu.count label is 0", node: &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, + ObjectMeta: metav1.ObjectMeta{ + Name: "node-01", + Labels: map[string]string{gpuCountLabel: "0"}, + }, + Status: corev1.NodeStatus{ + Allocatable: corev1.ResourceList{}, + }, + }, + 
want: true, + }, + { + name: "Returns false when allocatable is less than label count", + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node-01", + Labels: map[string]string{gpuCountLabel: "8"}, + }, Status: corev1.NodeStatus{ Allocatable: corev1.ResourceList{ gpuResourceName: resource.MustParse("7"), @@ -60,10 +75,12 @@ func TestGPUChecker_Check(t *testing.T) { want: false, }, { - name: "Returns false when GPU count is zero", - desired: 8, + name: "Returns false when allocatable GPU is zero", node: &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, + ObjectMeta: metav1.ObjectMeta{ + Name: "node-01", + Labels: map[string]string{gpuCountLabel: "8"}, + }, Status: corev1.NodeStatus{ Allocatable: corev1.ResourceList{ gpuResourceName: resource.MustParse("0"), @@ -73,21 +90,36 @@ func TestGPUChecker_Check(t *testing.T) { want: false, }, { - name: "Returns false when no GPU resource in allocatable", - desired: 8, + name: "Returns false when allocatable GPU resource is missing", node: &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, + ObjectMeta: metav1.ObjectMeta{ + Name: "node-01", + Labels: map[string]string{gpuCountLabel: "8"}, + }, Status: corev1.NodeStatus{ Allocatable: corev1.ResourceList{}, }, }, want: false, }, + { + name: "Returns true when gpu.count label is invalid", + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node-01", + Labels: map[string]string{gpuCountLabel: "invalid"}, + }, + Status: corev1.NodeStatus{ + Allocatable: corev1.ResourceList{}, + }, + }, + want: true, + }, } + c := &gpuChecker{} for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - c := &gpuChecker{desiredCount: tt.desired} if got := c.Check(tt.node); got != tt.want { t.Errorf("got %v, want %v", got, tt.want) } @@ -96,32 +128,17 @@ func TestGPUChecker_Check(t *testing.T) { } func TestNewDefaultCheckers(t *testing.T) { - t.Run("GPU disabled when desiredCount is 0", func(t *testing.T) { - checkers := NewDefaultCheckers(0) 
- if len(checkers) != 2 { - t.Fatalf("expected 2 checkers, got %d", len(checkers)) - } - if checkers[0].Name() != "NodeReady" { - t.Errorf("expected NodeReady checker first, got %q", checkers[0].Name()) - } - if checkers[1].Name() != "DiskPressure" { - t.Errorf("expected DiskPressure checker second, got %q", checkers[1].Name()) - } - }) - - t.Run("GPU enabled when desiredCount is positive", func(t *testing.T) { - checkers := NewDefaultCheckers(8) - if len(checkers) != 3 { - t.Fatalf("expected 3 checkers, got %d", len(checkers)) - } - if checkers[0].Name() != "NodeReady" { - t.Errorf("expected NodeReady checker first, got %q", checkers[0].Name()) - } - if checkers[1].Name() != "DiskPressure" { - t.Errorf("expected DiskPressure checker second, got %q", checkers[1].Name()) - } - if checkers[2].Name() != "GPU" { - t.Errorf("expected GPU checker third, got %q", checkers[2].Name()) - } - }) + checkers := NewDefaultCheckers() + if len(checkers) != 3 { + t.Fatalf("expected 3 checkers, got %d", len(checkers)) + } + if checkers[0].Name() != "NodeReady" { + t.Errorf("expected NodeReady checker first, got %q", checkers[0].Name()) + } + if checkers[1].Name() != "DiskPressure" { + t.Errorf("expected DiskPressure checker second, got %q", checkers[1].Name()) + } + if checkers[2].Name() != "GPU" { + t.Errorf("expected GPU checker third, got %q", checkers[2].Name()) + } } diff --git a/pkg/health/health.go b/pkg/health/health.go index 8096b42..bdd8989 100644 --- a/pkg/health/health.go +++ b/pkg/health/health.go @@ -18,14 +18,12 @@ type HealthChecker interface { } // NewDefaultCheckers returns the enabled health checkers. -// GPUChecker is included only when desiredGPUCount > 0. -func NewDefaultCheckers(desiredGPUCount int) []HealthChecker { - checkers := []HealthChecker{ +// GPU checker is always included; it auto-skips non-GPU nodes +// by checking for the nvidia.com/gpu.count label. 
+func NewDefaultCheckers() []HealthChecker { + return []HealthChecker{ &nodeReadyChecker{}, &diskPressureChecker{}, + &gpuChecker{}, } - if desiredGPUCount > 0 { - checkers = append(checkers, &gpuChecker{desiredCount: desiredGPUCount}) - } - return checkers } diff --git a/pkg/watcher/watcher_test.go b/pkg/watcher/watcher_test.go index 2d7cb42..ca766da 100644 --- a/pkg/watcher/watcher_test.go +++ b/pkg/watcher/watcher_test.go @@ -87,6 +87,7 @@ func newTestNode(name string, ready corev1.ConditionStatus, gpuCount int) *corev }, } if gpuCount > 0 { + node.Labels["nvidia.com/gpu.count"] = strconv.Itoa(gpuCount) node.Status.Allocatable = corev1.ResourceList{ gpuResourceName: resource.MustParse(strconv.Itoa(gpuCount)), } @@ -225,7 +226,7 @@ func TestRun_HealthyNodeStaysHealthy(t *testing.T) { node := newTestNode("node-01", corev1.ConditionTrue, 8) w := newTestWatcher(t, WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), - WithCheckers(health.NewDefaultCheckers(8)), + WithCheckers(health.NewDefaultCheckers()), ) if err := w.run(t.Context()); err != nil { @@ -246,7 +247,7 @@ func TestRun_UnhealthyDetection(t *testing.T) { node := newTestNode("node-01", corev1.ConditionFalse, 8) w := newTestWatcher(t, WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), - WithCheckers(health.NewDefaultCheckers(8)), + WithCheckers(health.NewDefaultCheckers()), WithNowFunc(func() time.Time { return now }), ) @@ -269,7 +270,7 @@ func TestRun_RebootTriggerActiveMode(t *testing.T) { exec := &mockExecutor{} w := newTestWatcher(t, WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), - WithCheckers(health.NewDefaultCheckers(8)), + WithCheckers(health.NewDefaultCheckers()), WithExecutor(exec), WithMonitorOnly(false), WithNowFunc(func() time.Time { return now }), @@ -306,7 +307,7 @@ func TestRun_RebootSkippedInReportMode(t *testing.T) { exec := &mockExecutor{} w := newTestWatcher(t, WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), - 
WithCheckers(health.NewDefaultCheckers(8)), + WithCheckers(health.NewDefaultCheckers()), WithExecutor(exec), WithMonitorOnly(true), WithNowFunc(func() time.Time { return now }), @@ -339,7 +340,7 @@ func TestRun_RecoveryAfterReboot(t *testing.T) { node := newTestNode("node-01", corev1.ConditionFalse, 8) w := newTestWatcher(t, WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), - WithCheckers(health.NewDefaultCheckers(8)), + WithCheckers(health.NewDefaultCheckers()), WithMonitorOnly(false), WithNowFunc(func() time.Time { return now }), ) @@ -378,7 +379,7 @@ func TestRun_RebootRetry(t *testing.T) { exec := &mockExecutor{} w := newTestWatcher(t, WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), - WithCheckers(health.NewDefaultCheckers(8)), + WithCheckers(health.NewDefaultCheckers()), WithExecutor(exec), WithMonitorOnly(false), WithRebootTimeWindowMinutes("40"), @@ -424,10 +425,12 @@ func TestRun_RebootRetry(t *testing.T) { func TestRun_GPUMismatchTriggersUnhealthy(t *testing.T) { now := time.Date(2026, 4, 13, 12, 0, 0, 0, time.UTC) - node := newTestNode("node-01", corev1.ConditionTrue, 7) // 7 GPUs, desired 8 + node := newTestNode("node-01", corev1.ConditionTrue, 8) + // Simulate GPU failure: label says 8 but only 7 allocatable. 
+ node.Status.Allocatable[gpuResourceName] = resource.MustParse("7") w := newTestWatcher(t, WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), - WithCheckers(health.NewDefaultCheckers(8)), + WithCheckers(health.NewDefaultCheckers()), WithNowFunc(func() time.Time { return now }), ) @@ -454,7 +457,7 @@ func TestRun_RebootErrorContinuesProcessing(t *testing.T) { } w := newTestWatcher(t, WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), - WithCheckers(health.NewDefaultCheckers(0)), + WithCheckers(health.NewDefaultCheckers()), WithExecutor(exec), WithMonitorOnly(false), WithNowFunc(func() time.Time { return now }), @@ -479,7 +482,7 @@ func TestRun_RebootErrorContinuesProcessing(t *testing.T) { func TestRun_NodeListError(t *testing.T) { w := newTestWatcher(t, WithNodeLister(&fakeNodeLister{err: fmt.Errorf("list error")}), - WithCheckers(health.NewDefaultCheckers(0)), + WithCheckers(health.NewDefaultCheckers()), ) if err := w.run(t.Context()); err == nil { @@ -493,7 +496,7 @@ func TestRun_StaleStateCleanup(t *testing.T) { lister := &fakeNodeLister{nodes: []*corev1.Node{node}} w := newTestWatcher(t, WithNodeLister(lister), - WithCheckers(health.NewDefaultCheckers(0)), + WithCheckers(health.NewDefaultCheckers()), WithNowFunc(func() time.Time { return now }), ) @@ -523,7 +526,7 @@ func TestRun_UnhealthyWithinThresholdNoReboot(t *testing.T) { exec := &mockExecutor{} w := newTestWatcher(t, WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), - WithCheckers(health.NewDefaultCheckers(0)), + WithCheckers(health.NewDefaultCheckers()), WithExecutor(exec), WithMonitorOnly(false), WithNowFunc(func() time.Time { return now }), From 7e7e3167f06e5a31df29eeac9bddc74a80e9e1c1 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 14 Apr 2026 03:18:25 +0900 Subject: [PATCH 11/71] refactor: move nodePoolIDs from NewWatcher parameter to functional option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - NewWatcher signature: 
(ctx, clusterID, nodePoolIDs, opts...) → (ctx, clusterID, opts...) - Add WithNodePoolIDs option with append semantics (empty is no-op) Co-Authored-By: Claude Opus 4.6 (1M context) --- main.go | 3 ++- pkg/watcher/options.go | 11 +++++++++++ pkg/watcher/watcher.go | 5 +++-- pkg/watcher/watcher_test.go | 18 ++++++++---------- 4 files changed, 24 insertions(+), 13 deletions(-) diff --git a/main.go b/main.go index 4899d4e..41ba7a4 100644 --- a/main.go +++ b/main.go @@ -69,7 +69,8 @@ func run(ctx context.Context) error { } }() - w, err := watcher.NewWatcher(ctx, clusterID, parseNodePoolIDs(nodePoolID), + w, err := watcher.NewWatcher(ctx, clusterID, + watcher.WithNodePoolIDs(parseNodePoolIDs(nodePoolID)), watcher.WithKubernetesClientConfigPath(*kubeconfigPath), watcher.WithExecutor(executor), watcher.WithCheckers(checkers), diff --git a/pkg/watcher/options.go b/pkg/watcher/options.go index ad30630..9d27aa2 100644 --- a/pkg/watcher/options.go +++ b/pkg/watcher/options.go @@ -36,6 +36,17 @@ func WithKubernetesClientConfigPath(path string) Option { } } +// WithNodePoolIDs returns Option to append node pool IDs to watch. +// Can be called multiple times to accumulate IDs. +// If no IDs are provided across all calls, all nodes are watched. +func WithNodePoolIDs(ids []string) Option { + return func(w *watcher) { + if len(ids) > 0 { + w.nodePoolIDs = append(w.nodePoolIDs, ids...) + } + } +} + // WithRebootTimeWindowMinutes returns Option to set reboot time window. 
func WithRebootTimeWindowMinutes(s string) Option { return func(w *watcher) { diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index bef65b1..f0cbfbb 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -34,6 +34,7 @@ type watcher struct { clientCfgPath string clusterID string + nodePoolIDs []string rebootTimeWindowMinutes time.Duration nodeSelector *metav1.LabelSelector @@ -46,7 +47,7 @@ type watcher struct { nowFunc func() time.Time } -func NewWatcher(ctx context.Context, clusterID string, nodePoolIDs []string, opts ...Option) (Watcher, error) { +func NewWatcher(ctx context.Context, clusterID string, opts ...Option) (Watcher, error) { w := &watcher{ clusterID: clusterID, monitorOnly: true, @@ -61,7 +62,7 @@ func NewWatcher(ctx context.Context, clusterID string, nodePoolIDs []string, opt return nil, fmt.Errorf("cluster ID must not be empty") } - w.nodeSelector = buildNodeSelector(nodePoolIDs) + w.nodeSelector = buildNodeSelector(w.nodePoolIDs) if err := w.setupKubernetesClient(); err != nil { return nil, err diff --git a/pkg/watcher/watcher_test.go b/pkg/watcher/watcher_test.go index ca766da..9cab637 100644 --- a/pkg/watcher/watcher_test.go +++ b/pkg/watcher/watcher_test.go @@ -103,7 +103,7 @@ func newTestWatcher(t *testing.T, opts ...Option) *watcher { WithExecutor(&mockExecutor{}), } w, err := NewWatcher(t.Context(), - testClusterID, []string{testNodePoolID}, + testClusterID, append(baseOpts, opts...)...) 
if err != nil { t.Fatal(err) @@ -115,9 +115,8 @@ func newTestWatcher(t *testing.T, opts ...Option) *watcher { func TestNew(t *testing.T) { type args struct { - clusterID string - nodePoolIDs []string - opts []Option + clusterID string + opts []Option } type test struct { name string @@ -130,11 +129,11 @@ func TestNew(t *testing.T) { { name: "Returns no error when given valid input", args: args{ - clusterID: testClusterID, - nodePoolIDs: []string{testNodePoolID}, + clusterID: testClusterID, opts: []Option{ WithKubernetesClient(fake.NewSimpleClientset()), WithExecutor(&mockExecutor{}), + WithNodePoolIDs([]string{testNodePoolID}), }, }, checkFunc: func(w *watcher) error { @@ -165,8 +164,8 @@ func TestNew(t *testing.T) { { name: "Returns no error when input is invalid, but default value is set", args: args{ - clusterID: testClusterID, - nodePoolIDs: []string{testNodePoolID}, + clusterID: testClusterID, + opts: []Option{ WithKubernetesClient(fake.NewSimpleClientset()), WithExecutor(&mockExecutor{}), @@ -184,7 +183,7 @@ func TestNew(t *testing.T) { { name: "Returns an error when clusterID is missing", args: args{ - nodePoolIDs: []string{testNodePoolID}, + opts: []Option{ WithKubernetesClient(fake.NewSimpleClientset()), WithExecutor(&mockExecutor{}), @@ -198,7 +197,6 @@ func TestNew(t *testing.T) { t.Run(test.name, func(t *testing.T) { w, err := NewWatcher(t.Context(), test.args.clusterID, - test.args.nodePoolIDs, test.args.opts...) if (err != nil) != test.wantErr { t.Errorf("error = %v, wantErr %v", err, test.wantErr) From 9c864547d6b003b411289d06af43847cc20fdd22 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 14 Apr 2026 03:21:35 +0900 Subject: [PATCH 12/71] refactor: remove unused clusterID from watcher MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit clusterID was only used by the old rebootNode() which moved to the executor. NewWatcher signature simplified: (ctx, clusterID, opts...) → (ctx, opts...) 
Co-Authored-By: Claude Opus 4.6 (1M context) --- main.go | 2 +- pkg/watcher/watcher.go | 8 +------- pkg/watcher/watcher_test.go | 23 +---------------------- 3 files changed, 3 insertions(+), 30 deletions(-) diff --git a/main.go b/main.go index 41ba7a4..28c50b6 100644 --- a/main.go +++ b/main.go @@ -69,7 +69,7 @@ func run(ctx context.Context) error { } }() - w, err := watcher.NewWatcher(ctx, clusterID, + w, err := watcher.NewWatcher(ctx, watcher.WithNodePoolIDs(parseNodePoolIDs(nodePoolID)), watcher.WithKubernetesClientConfigPath(*kubeconfigPath), watcher.WithExecutor(executor), diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index f0cbfbb..475ca17 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -33,7 +33,6 @@ type watcher struct { client kubernetes.Interface clientCfgPath string - clusterID string nodePoolIDs []string rebootTimeWindowMinutes time.Duration @@ -47,9 +46,8 @@ type watcher struct { nowFunc func() time.Time } -func NewWatcher(ctx context.Context, clusterID string, opts ...Option) (Watcher, error) { +func NewWatcher(ctx context.Context, opts ...Option) (Watcher, error) { w := &watcher{ - clusterID: clusterID, monitorOnly: true, states: NewStateStore(), nowFunc: time.Now, @@ -58,10 +56,6 @@ func NewWatcher(ctx context.Context, clusterID string, opts ...Option) (Watcher, opt(w) } - if clusterID == "" { - return nil, fmt.Errorf("cluster ID must not be empty") - } - w.nodeSelector = buildNodeSelector(w.nodePoolIDs) if err := w.setupKubernetesClient(); err != nil { diff --git a/pkg/watcher/watcher_test.go b/pkg/watcher/watcher_test.go index 9cab637..89b701a 100644 --- a/pkg/watcher/watcher_test.go +++ b/pkg/watcher/watcher_test.go @@ -66,7 +66,6 @@ func (c *alwaysFailChecker) Threshold() time.Duration { return c.threshold } // --- Test variables --- var ( - testClusterID = "test-cluster-123" testNodePoolID = "test-node-pool" testRebootTimeWindowMinutes = time.Duration(40) ) @@ -103,7 +102,6 @@ func newTestWatcher(t *testing.T, 
opts ...Option) *watcher { WithExecutor(&mockExecutor{}), } w, err := NewWatcher(t.Context(), - testClusterID, append(baseOpts, opts...)...) if err != nil { t.Fatal(err) @@ -115,8 +113,7 @@ func newTestWatcher(t *testing.T, opts ...Option) *watcher { func TestNew(t *testing.T) { type args struct { - clusterID string - opts []Option + opts []Option } type test struct { name string @@ -129,7 +126,6 @@ func TestNew(t *testing.T) { { name: "Returns no error when given valid input", args: args{ - clusterID: testClusterID, opts: []Option{ WithKubernetesClient(fake.NewSimpleClientset()), WithExecutor(&mockExecutor{}), @@ -137,9 +133,6 @@ func TestNew(t *testing.T) { }, }, checkFunc: func(w *watcher) error { - if w.clusterID != testClusterID { - return fmt.Errorf("clusterID mismatch: got %s, want %s", w.clusterID, testClusterID) - } if w.nodeSelector == nil || w.nodeSelector.MatchLabels[nodePoolLabelKey] != testNodePoolID { return fmt.Errorf("nodeSelector mismatch: got %v, want %s", w.nodeSelector, testNodePoolID) } @@ -164,8 +157,6 @@ func TestNew(t *testing.T) { { name: "Returns no error when input is invalid, but default value is set", args: args{ - clusterID: testClusterID, - opts: []Option{ WithKubernetesClient(fake.NewSimpleClientset()), WithExecutor(&mockExecutor{}), @@ -180,23 +171,11 @@ func TestNew(t *testing.T) { return nil }, }, - { - name: "Returns an error when clusterID is missing", - args: args{ - - opts: []Option{ - WithKubernetesClient(fake.NewSimpleClientset()), - WithExecutor(&mockExecutor{}), - }, - }, - wantErr: true, - }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { w, err := NewWatcher(t.Context(), - test.args.clusterID, test.args.opts...) 
if (err != nil) != test.wantErr { t.Errorf("error = %v, wantErr %v", err, test.wantErr) From baee4e0f23c7ebe42cde99be2282a646e3d68ee1 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 14 Apr 2026 03:30:02 +0900 Subject: [PATCH 13/71] fix: use nvidia.com/gpu.count label for hasGPU instead of allocatable The label is static (set by GFD) and correctly identifies GPU nodes even when all GPUs are unhealthy and allocatable drops to 0. Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/watcher/watcher.go | 15 ++++++++------- pkg/watcher/watcher_test.go | 4 ++-- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index 475ca17..84101be 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "log/slog" + "strconv" "time" "github.com/civo/node-agent/pkg/health" @@ -20,10 +21,7 @@ import ( "k8s.io/client-go/tools/clientcmd" ) -const ( - nodePoolLabelKey = "kubernetes.civo.com/civo-node-pool" - gpuResourceName = "nvidia.com/gpu" -) +const nodePoolLabelKey = "kubernetes.civo.com/civo-node-pool" type Watcher interface { Run(ctx context.Context) error @@ -286,11 +284,14 @@ func buildNodeSelector(nodePoolIDs []string) *metav1.LabelSelector { } } +// hasGPU returns true if the node has the nvidia.com/gpu.count label +// with a positive value, indicating it is a GPU node regardless of +// current GPU health. 
func hasGPU(node *corev1.Node) bool { - quantity, exists := node.Status.Allocatable[gpuResourceName] + v, exists := node.Labels["nvidia.com/gpu.count"] if !exists { return false } - gpuCount, ok := quantity.AsInt64() - return ok && gpuCount > 0 + n, err := strconv.Atoi(v) + return err == nil && n > 0 } diff --git a/pkg/watcher/watcher_test.go b/pkg/watcher/watcher_test.go index 89b701a..a87e2de 100644 --- a/pkg/watcher/watcher_test.go +++ b/pkg/watcher/watcher_test.go @@ -88,7 +88,7 @@ func newTestNode(name string, ready corev1.ConditionStatus, gpuCount int) *corev if gpuCount > 0 { node.Labels["nvidia.com/gpu.count"] = strconv.Itoa(gpuCount) node.Status.Allocatable = corev1.ResourceList{ - gpuResourceName: resource.MustParse(strconv.Itoa(gpuCount)), + "nvidia.com/gpu": resource.MustParse(strconv.Itoa(gpuCount)), } } return node @@ -404,7 +404,7 @@ func TestRun_GPUMismatchTriggersUnhealthy(t *testing.T) { now := time.Date(2026, 4, 13, 12, 0, 0, 0, time.UTC) node := newTestNode("node-01", corev1.ConditionTrue, 8) // Simulate GPU failure: label says 8 but only 7 allocatable. 
- node.Status.Allocatable[gpuResourceName] = resource.MustParse("7") + node.Status.Allocatable["nvidia.com/gpu"] = resource.MustParse("7") w := newTestWatcher(t, WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), WithCheckers(health.NewDefaultCheckers()), From 32046733cb83ebd6c1fa5958231f3debba1d9a32 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 14 Apr 2026 03:33:49 +0900 Subject: [PATCH 14/71] fix: rename env vars to CIVO_NODE_AGENT_ prefix for agent-specific settings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - CIVO_NODE_MONITOR_ONLY → CIVO_NODE_AGENT_MONITOR_ONLY - CIVO_NODE_METRICS_PORT → CIVO_NODE_AGENT_METRICS_PORT Co-Authored-By: Claude Opus 4.6 (1M context) --- main.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.go b/main.go index 28c50b6..5b3bd83 100644 --- a/main.go +++ b/main.go @@ -31,8 +31,8 @@ var ( clusterID = strings.TrimSpace(os.Getenv("CIVO_CLUSTER_ID")) nodePoolID = strings.TrimSpace(os.Getenv("CIVO_NODE_POOL_ID")) rebootTimeWindowMinutes = strings.TrimSpace(os.Getenv("CIVO_NODE_REBOOT_TIME_WINDOW_MINUTES")) - monitorOnly = strings.TrimSpace(os.Getenv("CIVO_NODE_MONITOR_ONLY")) - metricsPort = strings.TrimSpace(os.Getenv("CIVO_NODE_METRICS_PORT")) + monitorOnly = strings.TrimSpace(os.Getenv("CIVO_NODE_AGENT_MONITOR_ONLY")) + metricsPort = strings.TrimSpace(os.Getenv("CIVO_NODE_AGENT_METRICS_PORT")) ) const ( From 1cc2c874674f3f65c6b5c7376090590a7cc427b0 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 14 Apr 2026 04:20:14 +0900 Subject: [PATCH 15/71] feat: differentiate reboot wait times for standard and GPU nodes - Standard nodes: reboot wait 10min (CIVO_NODE_REBOOT_WAIT_MINUTES) - GPU nodes: reboot wait 40min (CIVO_GPU_NODE_REBOOT_WAIT_MINUTES) - NodeReady threshold changed from 10min to 5min per TDD - Replace single rebootTimeWindowMinutes with rebootWaitMinutes + gpuRebootWaitMinutes - WaitingReboot phase checks state.IsGPUNode() to select appropriate 
wait time Co-Authored-By: Claude Opus 4.6 (1M context) --- main.go | 20 +++++++++++--------- pkg/health/node_ready.go | 2 +- pkg/watcher/options.go | 23 ++++++++++++++++++----- pkg/watcher/watcher.go | 11 ++++++++--- pkg/watcher/watcher_test.go | 22 +++++++++++----------- 5 files changed, 49 insertions(+), 29 deletions(-) diff --git a/main.go b/main.go index 5b3bd83..94336d2 100644 --- a/main.go +++ b/main.go @@ -25,14 +25,15 @@ var ( ) var ( - apiURL = strings.TrimSpace(os.Getenv("CIVO_API_URL")) - apiKey = strings.TrimSpace(os.Getenv("CIVO_API_KEY")) - region = strings.TrimSpace(os.Getenv("CIVO_REGION")) - clusterID = strings.TrimSpace(os.Getenv("CIVO_CLUSTER_ID")) - nodePoolID = strings.TrimSpace(os.Getenv("CIVO_NODE_POOL_ID")) - rebootTimeWindowMinutes = strings.TrimSpace(os.Getenv("CIVO_NODE_REBOOT_TIME_WINDOW_MINUTES")) - monitorOnly = strings.TrimSpace(os.Getenv("CIVO_NODE_AGENT_MONITOR_ONLY")) - metricsPort = strings.TrimSpace(os.Getenv("CIVO_NODE_AGENT_METRICS_PORT")) + apiURL = strings.TrimSpace(os.Getenv("CIVO_API_URL")) + apiKey = strings.TrimSpace(os.Getenv("CIVO_API_KEY")) + region = strings.TrimSpace(os.Getenv("CIVO_REGION")) + clusterID = strings.TrimSpace(os.Getenv("CIVO_CLUSTER_ID")) + nodePoolID = strings.TrimSpace(os.Getenv("CIVO_NODE_POOL_ID")) + rebootWaitMinutes = strings.TrimSpace(os.Getenv("CIVO_NODE_REBOOT_WAIT_MINUTES")) + gpuRebootWaitMinutes = strings.TrimSpace(os.Getenv("CIVO_GPU_NODE_REBOOT_WAIT_MINUTES")) + monitorOnly = strings.TrimSpace(os.Getenv("CIVO_NODE_AGENT_MONITOR_ONLY")) + metricsPort = strings.TrimSpace(os.Getenv("CIVO_NODE_AGENT_METRICS_PORT")) ) const ( @@ -75,7 +76,8 @@ func run(ctx context.Context) error { watcher.WithExecutor(executor), watcher.WithCheckers(checkers), watcher.WithMonitorOnly(monitorOnlyFlag), - watcher.WithRebootTimeWindowMinutes(rebootTimeWindowMinutes), + watcher.WithRebootWaitMinutes(rebootWaitMinutes), + watcher.WithGPURebootWaitMinutes(gpuRebootWaitMinutes), ) if err != nil { return err diff 
--git a/pkg/health/node_ready.go b/pkg/health/node_ready.go index a95b259..efaddbc 100644 --- a/pkg/health/node_ready.go +++ b/pkg/health/node_ready.go @@ -10,7 +10,7 @@ import ( type nodeReadyChecker struct{} func (c *nodeReadyChecker) Name() string { return "NodeReady" } -func (c *nodeReadyChecker) Threshold() time.Duration { return 10 * time.Minute } +func (c *nodeReadyChecker) Threshold() time.Duration { return 5 * time.Minute } func (c *nodeReadyChecker) Check(node *corev1.Node) bool { for _, cond := range node.Status.Conditions { diff --git a/pkg/watcher/options.go b/pkg/watcher/options.go index 9d27aa2..7933e65 100644 --- a/pkg/watcher/options.go +++ b/pkg/watcher/options.go @@ -15,7 +15,8 @@ import ( type Option func(*watcher) var defaultOptions = []Option{ - WithRebootTimeWindowMinutes("40"), + WithRebootWaitMinutes("10"), + WithGPURebootWaitMinutes("40"), } // WithKubernetesClient returns Option to set Kubernetes API client. @@ -47,14 +48,26 @@ func WithNodePoolIDs(ids []string) Option { } } -// WithRebootTimeWindowMinutes returns Option to set reboot time window. -func WithRebootTimeWindowMinutes(s string) Option { +// WithRebootWaitMinutes returns Option to set the reboot wait time for standard nodes. +func WithRebootWaitMinutes(s string) Option { return func(w *watcher) { n, err := strconv.Atoi(s) if err == nil && n > 0 { - w.rebootTimeWindowMinutes = time.Duration(n) + w.rebootWaitMinutes = time.Duration(n) } else { - slog.Info("RebootTimeWindowMinutes is invalid", "value", s) + slog.Info("RebootWaitMinutes is invalid", "value", s) + } + } +} + +// WithGPURebootWaitMinutes returns Option to set the reboot wait time for GPU nodes. 
+func WithGPURebootWaitMinutes(s string) Option { + return func(w *watcher) { + n, err := strconv.Atoi(s) + if err == nil && n > 0 { + w.gpuRebootWaitMinutes = time.Duration(n) + } else { + slog.Info("GPURebootWaitMinutes is invalid", "value", s) } } } diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index 84101be..3503c6b 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -31,8 +31,9 @@ type watcher struct { client kubernetes.Interface clientCfgPath string - nodePoolIDs []string - rebootTimeWindowMinutes time.Duration + nodePoolIDs []string + rebootWaitMinutes time.Duration // Standard nodes (default: 10) + gpuRebootWaitMinutes time.Duration // GPU nodes (default: 40) nodeSelector *metav1.LabelSelector nodeLister listerscorev1.NodeLister @@ -226,7 +227,11 @@ func (w *watcher) run(ctx context.Context) error { case PhaseWaitingReboot: metrics.NodeUnhealthyDurationSeconds.WithLabelValues(nodeName).Set( now.Sub(state.UnhealthySince()).Seconds()) - if now.Sub(state.LastRebootTime()) < w.rebootTimeWindowMinutes*time.Minute { + rebootWait := w.rebootWaitMinutes + if state.IsGPUNode() { + rebootWait = w.gpuRebootWaitMinutes + } + if now.Sub(state.LastRebootTime()) < rebootWait*time.Minute { continue } if !w.monitorOnly { diff --git a/pkg/watcher/watcher_test.go b/pkg/watcher/watcher_test.go index a87e2de..8ff5cf1 100644 --- a/pkg/watcher/watcher_test.go +++ b/pkg/watcher/watcher_test.go @@ -66,8 +66,8 @@ func (c *alwaysFailChecker) Threshold() time.Duration { return c.threshold } // --- Test variables --- var ( - testNodePoolID = "test-node-pool" - testRebootTimeWindowMinutes = time.Duration(40) + testNodePoolID = "test-node-pool" + testRebootWaitMinutes = time.Duration(10) ) // newTestNode creates a node for testing with common defaults. 
@@ -139,8 +139,8 @@ func TestNew(t *testing.T) { if w.client == nil { return fmt.Errorf("client is nil") } - if w.rebootTimeWindowMinutes != testRebootTimeWindowMinutes { - return fmt.Errorf("rebootTimeWindowMinutes mismatch: got %v, want %v", w.rebootTimeWindowMinutes, testRebootTimeWindowMinutes) + if w.rebootWaitMinutes != testRebootWaitMinutes { + return fmt.Errorf("rebootTimeWindowMinutes mismatch: got %v, want %v", w.rebootWaitMinutes, testRebootWaitMinutes) } if !w.monitorOnly { return fmt.Errorf("monitorOnly should default to true") @@ -160,13 +160,13 @@ func TestNew(t *testing.T) { opts: []Option{ WithKubernetesClient(fake.NewSimpleClientset()), WithExecutor(&mockExecutor{}), - WithRebootTimeWindowMinutes("invalid time"), - WithRebootTimeWindowMinutes("0"), + WithRebootWaitMinutes("invalid time"), + WithRebootWaitMinutes("0"), }, }, checkFunc: func(w *watcher) error { - if w.rebootTimeWindowMinutes != testRebootTimeWindowMinutes { - return fmt.Errorf("rebootTimeWindowMinutes mismatch: got %v, want %v", w.rebootTimeWindowMinutes, testRebootTimeWindowMinutes) + if w.rebootWaitMinutes != testRebootWaitMinutes { + return fmt.Errorf("rebootTimeWindowMinutes mismatch: got %v, want %v", w.rebootWaitMinutes, testRebootWaitMinutes) } return nil }, @@ -359,7 +359,7 @@ func TestRun_RebootRetry(t *testing.T) { WithCheckers(health.NewDefaultCheckers()), WithExecutor(exec), WithMonitorOnly(false), - WithRebootTimeWindowMinutes("40"), + WithGPURebootWaitMinutes("40"), WithNowFunc(func() time.Time { return now }), ) @@ -367,7 +367,7 @@ func TestRun_RebootRetry(t *testing.T) { if err := w.run(t.Context()); err != nil { t.Fatal(err) } - // Run 2: trigger first reboot. + // Run 2: trigger first reboot (GPU checker threshold is 10min). now = now.Add(11 * time.Minute) if err := w.run(t.Context()); err != nil { t.Fatal(err) @@ -515,7 +515,7 @@ func TestRun_UnhealthyWithinThresholdNoReboot(t *testing.T) { } // Run 2: still within threshold → no reboot. 
- now = now.Add(5 * time.Minute) + now = now.Add(3 * time.Minute) if err := w.run(t.Context()); err != nil { t.Fatal(err) } From d364bb72d394701e92f1db1fa6ce449aeb589c7b Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 14 Apr 2026 04:24:15 +0900 Subject: [PATCH 16/71] refactor: move node pool ID parsing into WithNodePoolIDs option WithNodePoolIDs now accepts a comma-separated string and parses internally. Remove parseNodePoolIDs from main.go. Co-Authored-By: Claude Opus 4.6 (1M context) --- main.go | 17 +---------------- pkg/watcher/options.go | 12 ++++++++---- pkg/watcher/watcher_test.go | 2 +- 3 files changed, 10 insertions(+), 21 deletions(-) diff --git a/main.go b/main.go index 94336d2..b33d4b8 100644 --- a/main.go +++ b/main.go @@ -71,7 +71,7 @@ func run(ctx context.Context) error { }() w, err := watcher.NewWatcher(ctx, - watcher.WithNodePoolIDs(parseNodePoolIDs(nodePoolID)), + watcher.WithNodePoolIDs(nodePoolID), watcher.WithKubernetesClientConfigPath(*kubeconfigPath), watcher.WithExecutor(executor), watcher.WithCheckers(checkers), @@ -103,18 +103,3 @@ func main() { os.Exit(1) } } - -// parseNodePoolIDs splits a comma-separated string into node pool IDs. -// Empty string returns nil (all node pools). -func parseNodePoolIDs(s string) []string { - if s == "" { - return nil - } - var ids []string - for _, id := range strings.Split(s, ",") { - if v := strings.TrimSpace(id); v != "" { - ids = append(ids, v) - } - } - return ids -} diff --git a/pkg/watcher/options.go b/pkg/watcher/options.go index 7933e65..c6bac14 100644 --- a/pkg/watcher/options.go +++ b/pkg/watcher/options.go @@ -3,6 +3,7 @@ package watcher import ( "log/slog" "strconv" + "strings" "time" "github.com/civo/node-agent/pkg/health" @@ -38,12 +39,15 @@ func WithKubernetesClientConfigPath(path string) Option { } // WithNodePoolIDs returns Option to append node pool IDs to watch. +// Accepts a comma-separated string (e.g. "pool-1,pool-2"). // Can be called multiple times to accumulate IDs. 
-// If no IDs are provided across all calls, all nodes are watched. -func WithNodePoolIDs(ids []string) Option { +// Empty string is a no-op. If no IDs are provided across all calls, all nodes are watched. +func WithNodePoolIDs(s string) Option { return func(w *watcher) { - if len(ids) > 0 { - w.nodePoolIDs = append(w.nodePoolIDs, ids...) + for _, id := range strings.Split(s, ",") { + if v := strings.TrimSpace(id); v != "" { + w.nodePoolIDs = append(w.nodePoolIDs, v) + } } } } diff --git a/pkg/watcher/watcher_test.go b/pkg/watcher/watcher_test.go index 8ff5cf1..571d242 100644 --- a/pkg/watcher/watcher_test.go +++ b/pkg/watcher/watcher_test.go @@ -129,7 +129,7 @@ func TestNew(t *testing.T) { opts: []Option{ WithKubernetesClient(fake.NewSimpleClientset()), WithExecutor(&mockExecutor{}), - WithNodePoolIDs([]string{testNodePoolID}), + WithNodePoolIDs(testNodePoolID), }, }, checkFunc: func(w *watcher) error { From 6167e5a1426c5b1c9665363bdd4d623fe9a92086 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 14 Apr 2026 04:53:01 +0900 Subject: [PATCH 17/71] =?UTF-8?q?docs:=20add=20TODO=20for=20standard=20nod?= =?UTF-8?q?e=20Drain=20=E2=86=92=20Replace=20recovery=20flow?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Standard nodes should transition to PhaseDrain → PhaseReplace instead of retrying reboot indefinitely. GPU nodes keep reboot-only. Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/watcher/watcher.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index 3503c6b..fec591d 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -234,6 +234,12 @@ func (w *watcher) run(ctx context.Context) error { if now.Sub(state.LastRebootTime()) < rebootWait*time.Minute { continue } + + // TODO: Standard nodes should transition to PhaseDrain → PhaseReplace + // instead of retrying reboot indefinitely. + // GPU nodes must never be replaced; they retry reboot only. 
+ // See: Recovery Flow — Standard Nodes (Drain → timeout 30min → Replace) + if !w.monitorOnly { if err := w.executor.Reboot(ctx, nodeName); err != nil { slog.Error("Failed to reboot node (retry)", "node", nodeName, "error", err) From 2bd48982af83ce52bd0a0ee4380c6155f854b7b9 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 14 Apr 2026 05:25:37 +0900 Subject: [PATCH 18/71] refactor: move monitor-only flag parsing into WithMonitorOnly option WithMonitorOnly now accepts a string and parses internally via strconv.ParseBool. Empty or unparsable values preserve the default (true). Co-Authored-By: Claude Opus 4.6 (1M context) --- main.go | 7 +------ pkg/watcher/options.go | 9 ++++++--- pkg/watcher/watcher_test.go | 12 ++++++------ 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/main.go b/main.go index b33d4b8..93c78f2 100644 --- a/main.go +++ b/main.go @@ -51,11 +51,6 @@ func run(ctx context.Context) error { } checkers := health.NewDefaultCheckers() - monitorOnlyFlag := true - if v, err := strconv.ParseBool(monitorOnly); err == nil { - monitorOnlyFlag = v - } - metrics.Register() go func() { port := defaultMetricsPort @@ -75,7 +70,7 @@ func run(ctx context.Context) error { watcher.WithKubernetesClientConfigPath(*kubeconfigPath), watcher.WithExecutor(executor), watcher.WithCheckers(checkers), - watcher.WithMonitorOnly(monitorOnlyFlag), + watcher.WithMonitorOnly(monitorOnly), watcher.WithRebootWaitMinutes(rebootWaitMinutes), watcher.WithGPURebootWaitMinutes(gpuRebootWaitMinutes), ) diff --git a/pkg/watcher/options.go b/pkg/watcher/options.go index c6bac14..4e66bc0 100644 --- a/pkg/watcher/options.go +++ b/pkg/watcher/options.go @@ -77,10 +77,13 @@ func WithGPURebootWaitMinutes(s string) Option { } // WithMonitorOnly returns Option to enable or disable monitor-only mode. -// When true (default), recovery actions are logged but not executed. -func WithMonitorOnly(v bool) Option { +// Accepts a string parsable by strconv.ParseBool (e.g. 
"true", "false", "1", "0"). +// Empty or unparsable values are ignored (default: true). +func WithMonitorOnly(s string) Option { return func(w *watcher) { - w.monitorOnly = v + if v, err := strconv.ParseBool(s); err == nil { + w.monitorOnly = v + } } } diff --git a/pkg/watcher/watcher_test.go b/pkg/watcher/watcher_test.go index 571d242..6aab977 100644 --- a/pkg/watcher/watcher_test.go +++ b/pkg/watcher/watcher_test.go @@ -249,7 +249,7 @@ func TestRun_RebootTriggerActiveMode(t *testing.T) { WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), WithCheckers(health.NewDefaultCheckers()), WithExecutor(exec), - WithMonitorOnly(false), + WithMonitorOnly("false"), WithNowFunc(func() time.Time { return now }), ) @@ -286,7 +286,7 @@ func TestRun_RebootSkippedInReportMode(t *testing.T) { WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), WithCheckers(health.NewDefaultCheckers()), WithExecutor(exec), - WithMonitorOnly(true), + WithMonitorOnly("true"), WithNowFunc(func() time.Time { return now }), ) @@ -318,7 +318,7 @@ func TestRun_RecoveryAfterReboot(t *testing.T) { w := newTestWatcher(t, WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), WithCheckers(health.NewDefaultCheckers()), - WithMonitorOnly(false), + WithMonitorOnly("false"), WithNowFunc(func() time.Time { return now }), ) @@ -358,7 +358,7 @@ func TestRun_RebootRetry(t *testing.T) { WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), WithCheckers(health.NewDefaultCheckers()), WithExecutor(exec), - WithMonitorOnly(false), + WithMonitorOnly("false"), WithGPURebootWaitMinutes("40"), WithNowFunc(func() time.Time { return now }), ) @@ -436,7 +436,7 @@ func TestRun_RebootErrorContinuesProcessing(t *testing.T) { WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), WithCheckers(health.NewDefaultCheckers()), WithExecutor(exec), - WithMonitorOnly(false), + WithMonitorOnly("false"), WithNowFunc(func() time.Time { return now }), ) @@ -505,7 +505,7 @@ func 
TestRun_UnhealthyWithinThresholdNoReboot(t *testing.T) { WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), WithCheckers(health.NewDefaultCheckers()), WithExecutor(exec), - WithMonitorOnly(false), + WithMonitorOnly("false"), WithNowFunc(func() time.Time { return now }), ) From b061f026403eb04f310c3d1518a19afd3ccabeea Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 14 Apr 2026 05:27:51 +0900 Subject: [PATCH 19/71] fix: add WithMonitorOnly to defaultOptions Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/watcher/options.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/watcher/options.go b/pkg/watcher/options.go index 4e66bc0..6ecc239 100644 --- a/pkg/watcher/options.go +++ b/pkg/watcher/options.go @@ -16,6 +16,7 @@ import ( type Option func(*watcher) var defaultOptions = []Option{ + WithMonitorOnly("true"), WithRebootWaitMinutes("10"), WithGPURebootWaitMinutes("40"), } From 8cd0e350dff6de1a5cb3fca31bceed0b4cf60dd2 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 14 Apr 2026 10:30:49 +0900 Subject: [PATCH 20/71] refactor: remove unused FailedCheckers getter from NodeState Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/watcher/state.go | 7 ------- pkg/watcher/state_test.go | 22 ---------------------- 2 files changed, 29 deletions(-) diff --git a/pkg/watcher/state.go b/pkg/watcher/state.go index 7b3a60a..2621eac 100644 --- a/pkg/watcher/state.go +++ b/pkg/watcher/state.go @@ -57,13 +57,6 @@ func (s *NodeState) LastRebootTime() time.Time { return s.lastRebootTime } func (s *NodeState) RebootCount() int { return s.rebootCount } func (s *NodeState) IsGPUNode() bool { return s.isGPUNode } -// FailedCheckers returns a copy of the failed checker names. -func (s *NodeState) FailedCheckers() []string { - out := make([]string, len(s.failedCheckers)) - copy(out, s.failedCheckers) - return out -} - // StateStore is a concurrency-safe store for per-node recovery state. 
type StateStore struct { mu sync.RWMutex diff --git a/pkg/watcher/state_test.go b/pkg/watcher/state_test.go index 45fcbdb..f696f94 100644 --- a/pkg/watcher/state_test.go +++ b/pkg/watcher/state_test.go @@ -183,10 +183,6 @@ func TestStateStoreUpdateCheckerInfo(t *testing.T) { s.UpdateCheckerInfo("node-01", checkers, true) st, _ := s.Get("node-01") - got := st.FailedCheckers() - if len(got) != 2 || got[0] != "NodeReady" || got[1] != "GPU" { - t.Errorf("got failedCheckers %v, want %v", got, checkers) - } if !st.IsGPUNode() { t.Error("expected isGPUNode to be true") } @@ -198,21 +194,6 @@ func TestStateStoreUpdateCheckerInfoNonexistent(t *testing.T) { s.UpdateCheckerInfo("nonexistent", []string{"NodeReady"}, false) } -func TestFailedCheckersReturnsCopy(t *testing.T) { - s := NewStateStore() - s.GetOrCreate("node-01") - s.UpdateCheckerInfo("node-01", []string{"NodeReady"}, false) - - st, _ := s.Get("node-01") - got := st.FailedCheckers() - got[0] = "mutated" - - original := st.FailedCheckers() - if original[0] != "NodeReady" { - t.Error("FailedCheckers should return a copy; mutation should not affect internal state") - } -} - func TestStateStoreReset(t *testing.T) { s := NewStateStore() s.GetOrCreate("node-01") @@ -240,9 +221,6 @@ func TestStateStoreReset(t *testing.T) { if !st.LastRebootTime().IsZero() { t.Error("lastRebootTime should be zero after Reset") } - if len(st.FailedCheckers()) != 0 { - t.Error("failedCheckers should be empty after Reset") - } if st.IsGPUNode() { t.Error("isGPUNode should be false after Reset") } From 9f8a7548898f9bae06af65a28dc76646b63d4999 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 14 Apr 2026 12:47:43 +0900 Subject: [PATCH 21/71] feat: add CiliumAgent health checker Monitors CiliumAgentIsReady node condition. Auto-skips nodes where the condition is absent (Cilium not installed). Threshold: 10min. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/health/cilium.go | 26 +++++++++++++ pkg/health/cilium_test.go | 79 +++++++++++++++++++++++++++++++++++++++ pkg/health/gpu_test.go | 17 ++++----- pkg/health/health.go | 5 ++- 4 files changed, 115 insertions(+), 12 deletions(-) create mode 100644 pkg/health/cilium.go create mode 100644 pkg/health/cilium_test.go diff --git a/pkg/health/cilium.go b/pkg/health/cilium.go new file mode 100644 index 0000000..3972852 --- /dev/null +++ b/pkg/health/cilium.go @@ -0,0 +1,26 @@ +package health + +import ( + "time" + + corev1 "k8s.io/api/core/v1" +) + +const ciliumAgentConditionType corev1.NodeConditionType = "CiliumAgentIsReady" + +// ciliumChecker reports healthy when the Cilium agent is ready. +// If the CiliumAgentIsReady condition is not present (Cilium not installed), +// the check is skipped and returns healthy. +type ciliumChecker struct{} + +func (c *ciliumChecker) Name() string { return "CiliumAgent" } +func (c *ciliumChecker) Threshold() time.Duration { return 10 * time.Minute } + +func (c *ciliumChecker) Check(node *corev1.Node) bool { + for _, cond := range node.Status.Conditions { + if cond.Type == ciliumAgentConditionType { + return cond.Status == corev1.ConditionTrue + } + } + return true +} diff --git a/pkg/health/cilium_test.go b/pkg/health/cilium_test.go new file mode 100644 index 0000000..76cbcaa --- /dev/null +++ b/pkg/health/cilium_test.go @@ -0,0 +1,79 @@ +package health + +import ( + "testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func TestCiliumChecker_Name(t *testing.T) { + c := &ciliumChecker{} + if got := c.Name(); got != "CiliumAgent" { + t.Errorf("got %q, want %q", got, "CiliumAgent") + } +} + +func TestCiliumChecker_Check(t *testing.T) { + tests := []struct { + name string + node *corev1.Node + want bool + }{ + { + name: "Returns true when CiliumAgentIsReady is True", + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, + Status: 
corev1.NodeStatus{ + Conditions: []corev1.NodeCondition{ + {Type: ciliumAgentConditionType, Status: corev1.ConditionTrue}, + }, + }, + }, + want: true, + }, + { + name: "Returns false when CiliumAgentIsReady is False", + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, + Status: corev1.NodeStatus{ + Conditions: []corev1.NodeCondition{ + {Type: ciliumAgentConditionType, Status: corev1.ConditionFalse}, + }, + }, + }, + want: false, + }, + { + name: "Returns true when condition is absent (Cilium not installed)", + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, + Status: corev1.NodeStatus{ + Conditions: []corev1.NodeCondition{}, + }, + }, + want: true, + }, + { + name: "Returns true when only other conditions present", + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, + Status: corev1.NodeStatus{ + Conditions: []corev1.NodeCondition{ + {Type: corev1.NodeReady, Status: corev1.ConditionTrue}, + }, + }, + }, + want: true, + }, + } + + c := &ciliumChecker{} + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := c.Check(tt.node); got != tt.want { + t.Errorf("got %v, want %v", got, tt.want) + } + }) + } +} diff --git a/pkg/health/gpu_test.go b/pkg/health/gpu_test.go index 6f7b7a9..7a05d88 100644 --- a/pkg/health/gpu_test.go +++ b/pkg/health/gpu_test.go @@ -129,16 +129,13 @@ func TestGPUChecker_Check(t *testing.T) { func TestNewDefaultCheckers(t *testing.T) { checkers := NewDefaultCheckers() - if len(checkers) != 3 { - t.Fatalf("expected 3 checkers, got %d", len(checkers)) + if len(checkers) != 4 { + t.Fatalf("expected 4 checkers, got %d", len(checkers)) } - if checkers[0].Name() != "NodeReady" { - t.Errorf("expected NodeReady checker first, got %q", checkers[0].Name()) - } - if checkers[1].Name() != "DiskPressure" { - t.Errorf("expected DiskPressure checker second, got %q", checkers[1].Name()) - } - if checkers[2].Name() != "GPU" { - t.Errorf("expected GPU checker third, got %q", 
checkers[2].Name()) + expected := []string{"NodeReady", "DiskPressure", "CiliumAgent", "GPU"} + for i, name := range expected { + if checkers[i].Name() != name { + t.Errorf("checkers[%d]: expected %q, got %q", i, name, checkers[i].Name()) + } } } diff --git a/pkg/health/health.go b/pkg/health/health.go index bdd8989..031bb1f 100644 --- a/pkg/health/health.go +++ b/pkg/health/health.go @@ -18,12 +18,13 @@ type HealthChecker interface { } // NewDefaultCheckers returns the enabled health checkers. -// GPU checker is always included; it auto-skips non-GPU nodes -// by checking for the nvidia.com/gpu.count label. +// GPU checker auto-skips non-GPU nodes by checking the nvidia.com/gpu.count label. +// Cilium checker auto-skips nodes without CiliumAgentIsReady condition. func NewDefaultCheckers() []HealthChecker { return []HealthChecker{ &nodeReadyChecker{}, &diskPressureChecker{}, + &ciliumChecker{}, &gpuChecker{}, } } From 6dd821883e6142bc5138bcafac7f8ba227ac57a5 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 14 Apr 2026 12:59:10 +0900 Subject: [PATCH 22/71] fix: use NetworkUnavailable condition with CiliumIsUp reason for Cilium checker Cilium sets NetworkUnavailable=False with reason CiliumIsUp, not a custom CiliumAgentIsReady condition. Skip if reason is not CiliumIsUp (other CNI). Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/health/cilium.go | 15 +++++++++------ pkg/health/cilium_test.go | 32 ++++++++++++++++++++++---------- 2 files changed, 31 insertions(+), 16 deletions(-) diff --git a/pkg/health/cilium.go b/pkg/health/cilium.go index 3972852..c2694b4 100644 --- a/pkg/health/cilium.go +++ b/pkg/health/cilium.go @@ -6,11 +6,11 @@ import ( corev1 "k8s.io/api/core/v1" ) -const ciliumAgentConditionType corev1.NodeConditionType = "CiliumAgentIsReady" +const ciliumReadyReason = "CiliumIsUp" -// ciliumChecker reports healthy when the Cilium agent is ready. 
-// If the CiliumAgentIsReady condition is not present (Cilium not installed), -// the check is skipped and returns healthy. +// ciliumChecker reports healthy when the Cilium-managed NetworkUnavailable +// condition is False. If the condition's reason is not "CiliumIsUp" +// (i.e. a different CNI manages the condition), the check is skipped. type ciliumChecker struct{} func (c *ciliumChecker) Name() string { return "CiliumAgent" } @@ -18,8 +18,11 @@ func (c *ciliumChecker) Threshold() time.Duration { return 10 * time.Minute } func (c *ciliumChecker) Check(node *corev1.Node) bool { for _, cond := range node.Status.Conditions { - if cond.Type == ciliumAgentConditionType { - return cond.Status == corev1.ConditionTrue + if cond.Type == corev1.NodeNetworkUnavailable { + if cond.Reason != ciliumReadyReason { + return true + } + return cond.Status == corev1.ConditionFalse } } return true diff --git a/pkg/health/cilium_test.go b/pkg/health/cilium_test.go index 76cbcaa..88662ef 100644 --- a/pkg/health/cilium_test.go +++ b/pkg/health/cilium_test.go @@ -21,47 +21,59 @@ func TestCiliumChecker_Check(t *testing.T) { want bool }{ { - name: "Returns true when CiliumAgentIsReady is True", + name: "Returns true when NetworkUnavailable is False with CiliumIsUp", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, Status: corev1.NodeStatus{ Conditions: []corev1.NodeCondition{ - {Type: ciliumAgentConditionType, Status: corev1.ConditionTrue}, + { + Type: corev1.NodeNetworkUnavailable, + Status: corev1.ConditionFalse, + Reason: ciliumReadyReason, + }, }, }, }, want: true, }, { - name: "Returns false when CiliumAgentIsReady is False", + name: "Returns false when NetworkUnavailable is True with CiliumIsUp", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, Status: corev1.NodeStatus{ Conditions: []corev1.NodeCondition{ - {Type: ciliumAgentConditionType, Status: corev1.ConditionFalse}, + { + Type: corev1.NodeNetworkUnavailable, + Status: 
corev1.ConditionTrue, + Reason: ciliumReadyReason, + }, }, }, }, want: false, }, { - name: "Returns true when condition is absent (Cilium not installed)", + name: "Returns true when NetworkUnavailable has non-Cilium reason (skip)", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, Status: corev1.NodeStatus{ - Conditions: []corev1.NodeCondition{}, + Conditions: []corev1.NodeCondition{ + { + Type: corev1.NodeNetworkUnavailable, + Status: corev1.ConditionFalse, + Reason: "FlannelIsUp", + }, + }, }, }, want: true, }, { - name: "Returns true when only other conditions present", + name: "Returns true when condition is absent", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, Status: corev1.NodeStatus{ - Conditions: []corev1.NodeCondition{ - {Type: corev1.NodeReady, Status: corev1.ConditionTrue}, - }, + Conditions: []corev1.NodeCondition{}, }, }, want: true, From d6e27ab5d49f9a4abcf7bced5758daa39302e9c4 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 14 Apr 2026 13:29:02 +0900 Subject: [PATCH 23/71] feat: return reason from HealthChecker.Check for metrics observability Check now returns (bool, string) where the string is the reason. Condition-based checkers pass through cond.Reason directly. GPU checker returns a descriptive reason (e.g. "expected 8 but got 7"). The reason is used as the result label in HealthCheckTotal metric. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/health/cilium.go | 8 ++++---- pkg/health/cilium_test.go | 2 +- pkg/health/disk_pressure.go | 6 +++--- pkg/health/disk_pressure_test.go | 2 +- pkg/health/gpu.go | 14 +++++++++----- pkg/health/gpu_test.go | 2 +- pkg/health/health.go | 5 +++-- pkg/health/node_ready.go | 6 +++--- pkg/health/node_ready_test.go | 2 +- pkg/watcher/watcher.go | 6 ++---- pkg/watcher/watcher_test.go | 6 +++--- 11 files changed, 31 insertions(+), 28 deletions(-) diff --git a/pkg/health/cilium.go b/pkg/health/cilium.go index c2694b4..d0ef88a 100644 --- a/pkg/health/cilium.go +++ b/pkg/health/cilium.go @@ -16,14 +16,14 @@ type ciliumChecker struct{} func (c *ciliumChecker) Name() string { return "CiliumAgent" } func (c *ciliumChecker) Threshold() time.Duration { return 10 * time.Minute } -func (c *ciliumChecker) Check(node *corev1.Node) bool { +func (c *ciliumChecker) Check(node *corev1.Node) (bool, string) { for _, cond := range node.Status.Conditions { if cond.Type == corev1.NodeNetworkUnavailable { if cond.Reason != ciliumReadyReason { - return true + return true, cond.Reason } - return cond.Status == corev1.ConditionFalse + return cond.Status == corev1.ConditionFalse, cond.Reason } } - return true + return true, "NetworkUnavailable condition not found" } diff --git a/pkg/health/cilium_test.go b/pkg/health/cilium_test.go index 88662ef..ad0044d 100644 --- a/pkg/health/cilium_test.go +++ b/pkg/health/cilium_test.go @@ -83,7 +83,7 @@ func TestCiliumChecker_Check(t *testing.T) { c := &ciliumChecker{} for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - if got := c.Check(tt.node); got != tt.want { + if got, _ := c.Check(tt.node); got != tt.want { t.Errorf("got %v, want %v", got, tt.want) } }) diff --git a/pkg/health/disk_pressure.go b/pkg/health/disk_pressure.go index 3610375..eba79ca 100644 --- a/pkg/health/disk_pressure.go +++ b/pkg/health/disk_pressure.go @@ -12,11 +12,11 @@ type diskPressureChecker struct{} func (c 
*diskPressureChecker) Name() string { return "DiskPressure" } func (c *diskPressureChecker) Threshold() time.Duration { return 30 * time.Minute } -func (c *diskPressureChecker) Check(node *corev1.Node) bool { +func (c *diskPressureChecker) Check(node *corev1.Node) (bool, string) { for _, cond := range node.Status.Conditions { if cond.Type == corev1.NodeDiskPressure { - return cond.Status != corev1.ConditionTrue + return cond.Status != corev1.ConditionTrue, cond.Reason } } - return true + return true, "DiskPressure condition not found" } diff --git a/pkg/health/disk_pressure_test.go b/pkg/health/disk_pressure_test.go index 5208f15..e1a9284 100644 --- a/pkg/health/disk_pressure_test.go +++ b/pkg/health/disk_pressure_test.go @@ -71,7 +71,7 @@ func TestDiskPressureChecker_Check(t *testing.T) { c := &diskPressureChecker{} for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - if got := c.Check(tt.node); got != tt.want { + if got, _ := c.Check(tt.node); got != tt.want { t.Errorf("got %v, want %v", got, tt.want) } }) diff --git a/pkg/health/gpu.go b/pkg/health/gpu.go index 8033fbe..6bee234 100644 --- a/pkg/health/gpu.go +++ b/pkg/health/gpu.go @@ -1,6 +1,7 @@ package health import ( + "fmt" "strconv" "time" @@ -20,23 +21,26 @@ type gpuChecker struct{} func (c *gpuChecker) Name() string { return "GPU" } func (c *gpuChecker) Threshold() time.Duration { return 10 * time.Minute } -func (c *gpuChecker) Check(node *corev1.Node) bool { +func (c *gpuChecker) Check(node *corev1.Node) (bool, string) { expected, ok := expectedGPUCount(node) if !ok || expected == 0 { - return true + return true, "non-GPU node" } quantity, exists := node.Status.Allocatable[gpuResourceName] if !exists || quantity.IsZero() { - return false + return false, fmt.Sprintf("expected %d but got 0", expected) } actual, ok := quantity.AsInt64() if !ok { - return false + return false, "failed to read allocatable GPU count" } - return actual == int64(expected) + if actual == int64(expected) { + return 
true, fmt.Sprintf("%d/%d", actual, expected) + } + return false, fmt.Sprintf("expected %d but got %d", expected, actual) } // expectedGPUCount reads the nvidia.com/gpu.count label from the node. diff --git a/pkg/health/gpu_test.go b/pkg/health/gpu_test.go index 7a05d88..e4dcac4 100644 --- a/pkg/health/gpu_test.go +++ b/pkg/health/gpu_test.go @@ -120,7 +120,7 @@ func TestGPUChecker_Check(t *testing.T) { c := &gpuChecker{} for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - if got := c.Check(tt.node); got != tt.want { + if got, _ := c.Check(tt.node); got != tt.want { t.Errorf("got %v, want %v", got, tt.want) } }) diff --git a/pkg/health/health.go b/pkg/health/health.go index 031bb1f..3efd618 100644 --- a/pkg/health/health.go +++ b/pkg/health/health.go @@ -10,8 +10,9 @@ import ( type HealthChecker interface { // Name returns a human-readable identifier for this checker (e.g. "NodeReady"). Name() string - // Check returns true if the node is healthy for this checker's concern. - Check(node *corev1.Node) bool + // Check returns whether the node is healthy and a reason string. + // The reason is informational and may be non-empty on success; on failure it describes what went wrong. + Check(node *corev1.Node) (healthy bool, reason string) // Threshold returns how long this checker must continuously fail // before a recovery action is triggered.
Threshold() time.Duration diff --git a/pkg/health/node_ready.go b/pkg/health/node_ready.go index efaddbc..d002473 100644 --- a/pkg/health/node_ready.go +++ b/pkg/health/node_ready.go @@ -12,11 +12,11 @@ type nodeReadyChecker struct{} func (c *nodeReadyChecker) Name() string { return "NodeReady" } func (c *nodeReadyChecker) Threshold() time.Duration { return 5 * time.Minute } -func (c *nodeReadyChecker) Check(node *corev1.Node) bool { +func (c *nodeReadyChecker) Check(node *corev1.Node) (bool, string) { for _, cond := range node.Status.Conditions { if cond.Type == corev1.NodeReady { - return cond.Status == corev1.ConditionTrue + return cond.Status == corev1.ConditionTrue, cond.Reason } } - return false + return false, "NodeReady condition not found" } diff --git a/pkg/health/node_ready_test.go b/pkg/health/node_ready_test.go index 120ebd5..c131b48 100644 --- a/pkg/health/node_ready_test.go +++ b/pkg/health/node_ready_test.go @@ -71,7 +71,7 @@ func TestNodeReadyChecker_Check(t *testing.T) { c := &nodeReadyChecker{} for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - if got := c.Check(tt.node); got != tt.want { + if got, _ := c.Check(tt.node); got != tt.want { t.Errorf("got %v, want %v", got, tt.want) } }) diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index fec591d..9a9c973 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -157,16 +157,14 @@ func (w *watcher) run(ctx context.Context) error { var failedCheckers []string var minThreshold time.Duration for _, checker := range w.checkers { - healthy := checker.Check(node) - result := "pass" + healthy, reason := checker.Check(node) if !healthy { - result = "fail" failedCheckers = append(failedCheckers, checker.Name()) if minThreshold == 0 || checker.Threshold() < minThreshold { minThreshold = checker.Threshold() } } - metrics.HealthCheckTotal.WithLabelValues(nodeName, checker.Name(), result).Inc() + metrics.HealthCheckTotal.WithLabelValues(nodeName, checker.Name(), 
reason).Inc() } state := w.states.GetOrCreate(nodeName) diff --git a/pkg/watcher/watcher_test.go b/pkg/watcher/watcher_test.go index 6aab977..8f2d126 100644 --- a/pkg/watcher/watcher_test.go +++ b/pkg/watcher/watcher_test.go @@ -59,9 +59,9 @@ type alwaysFailChecker struct { threshold time.Duration } -func (c *alwaysFailChecker) Name() string { return c.name } -func (c *alwaysFailChecker) Check(*corev1.Node) bool { return false } -func (c *alwaysFailChecker) Threshold() time.Duration { return c.threshold } +func (c *alwaysFailChecker) Name() string { return c.name } +func (c *alwaysFailChecker) Check(*corev1.Node) (bool, string) { return false, "always fail" } +func (c *alwaysFailChecker) Threshold() time.Duration { return c.threshold } // --- Test variables --- From 953abd332795e854646802efaa04af8937bb69e1 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 14 Apr 2026 13:33:35 +0900 Subject: [PATCH 24/71] fix: capitalize GPU checker reason messages for consistency Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/health/gpu.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/health/gpu.go b/pkg/health/gpu.go index 6bee234..7faba80 100644 --- a/pkg/health/gpu.go +++ b/pkg/health/gpu.go @@ -24,23 +24,23 @@ func (c *gpuChecker) Threshold() time.Duration { return 10 * time.Minute } func (c *gpuChecker) Check(node *corev1.Node) (bool, string) { expected, ok := expectedGPUCount(node) if !ok || expected == 0 { - return true, "non-GPU node" + return true, "Non-GPU node" } quantity, exists := node.Status.Allocatable[gpuResourceName] if !exists || quantity.IsZero() { - return false, fmt.Sprintf("expected %d but got 0", expected) + return false, fmt.Sprintf("Expected %d but got 0", expected) } actual, ok := quantity.AsInt64() if !ok { - return false, "failed to read allocatable GPU count" + return false, "No allocatable GPU count" } if actual == int64(expected) { return true, fmt.Sprintf("%d/%d", actual, expected) } - return false, 
fmt.Sprintf("expected %d but got %d", expected, actual) + return false, fmt.Sprintf("Expected %d but got %d", expected, actual) } // expectedGPUCount reads the nvidia.com/gpu.count label from the node. From 5041abd11e0541c49847086bfd1e83ef5fe6fdcb Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 14 Apr 2026 13:43:25 +0900 Subject: [PATCH 25/71] refactor: move hasGPU to health.HasGPU GPU node detection belongs in the health package where gpuCountLabel is defined. Removes strconv and corev1 imports from watcher. Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/health/gpu.go | 8 ++++++++ pkg/watcher/watcher.go | 18 ++---------------- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/pkg/health/gpu.go b/pkg/health/gpu.go index 7faba80..323bcc7 100644 --- a/pkg/health/gpu.go +++ b/pkg/health/gpu.go @@ -43,6 +43,14 @@ func (c *gpuChecker) Check(node *corev1.Node) (bool, string) { return false, fmt.Sprintf("Expected %d but got %d", expected, actual) } +// HasGPU returns true if the node has the nvidia.com/gpu.count label +// with a positive value, indicating it is a GPU node regardless of +// current GPU health. +func HasGPU(node *corev1.Node) bool { + n, ok := expectedGPUCount(node) + return ok && n > 0 +} + // expectedGPUCount reads the nvidia.com/gpu.count label from the node. 
func expectedGPUCount(node *corev1.Node) (int, bool) { v, exists := node.Labels[gpuCountLabel] diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index 9a9c973..635ae70 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -4,13 +4,11 @@ import ( "context" "fmt" "log/slog" - "strconv" "time" "github.com/civo/node-agent/pkg/health" "github.com/civo/node-agent/pkg/metrics" "github.com/civo/node-agent/pkg/operation" - corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/client-go/informers" @@ -187,8 +185,8 @@ func (w *watcher) run(ctx context.Context) error { // At least one checker failed — enter the recovery judgment phase. // The state machine decides the next action (wait, reboot, retry) // regardless of which specific checker(s) failed. - isGPU := hasGPU(node) - w.states.UpdateCheckerInfo(nodeName, failedCheckers, isGPU) + isGPUNode := health.HasGPU(node) + w.states.UpdateCheckerInfo(nodeName, failedCheckers, isGPUNode) switch state.Phase() { case PhaseHealthy: @@ -292,15 +290,3 @@ func buildNodeSelector(nodePoolIDs []string) *metav1.LabelSelector { } } } - -// hasGPU returns true if the node has the nvidia.com/gpu.count label -// with a positive value, indicating it is a GPU node regardless of -// current GPU health. 
-func hasGPU(node *corev1.Node) bool { - v, exists := node.Labels["nvidia.com/gpu.count"] - if !exists { - return false - } - n, err := strconv.Atoi(v) - return err == nil && n > 0 -} From 96e68668d8cc6a2a0d184644e5d378efb4bf66e2 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 14 Apr 2026 15:07:01 +0900 Subject: [PATCH 26/71] test: add missing tests for Threshold, HasGPU, validation, and buildNodeSelector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Health: Threshold tests for all checkers, HasGPU tests - Operation: NewCivoExecutor validation tests (empty clusterID, apiKey, apiURL) - Watcher: buildNodeSelector tests (nil, single, multiple) - Coverage: health 84→98%, operation 50→81%, total 71→76% Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/health/cilium_test.go | 8 +++++ pkg/health/disk_pressure_test.go | 8 +++++ pkg/health/gpu_test.go | 62 ++++++++++++++++++++++++++++++++ pkg/health/node_ready_test.go | 8 +++++ pkg/operation/civo_test.go | 48 +++++++++++++++++++++++++ pkg/watcher/watcher_test.go | 59 ++++++++++++++++++++++++++++++ 6 files changed, 193 insertions(+) diff --git a/pkg/health/cilium_test.go b/pkg/health/cilium_test.go index ad0044d..1632a6f 100644 --- a/pkg/health/cilium_test.go +++ b/pkg/health/cilium_test.go @@ -2,11 +2,19 @@ package health import ( "testing" + "time" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +func TestCiliumChecker_Threshold(t *testing.T) { + c := &ciliumChecker{} + if got := c.Threshold(); got != 10*time.Minute { + t.Errorf("got %v, want %v", got, 10*time.Minute) + } +} + func TestCiliumChecker_Name(t *testing.T) { c := &ciliumChecker{} if got := c.Name(); got != "CiliumAgent" { diff --git a/pkg/health/disk_pressure_test.go b/pkg/health/disk_pressure_test.go index e1a9284..2b9dbd0 100644 --- a/pkg/health/disk_pressure_test.go +++ b/pkg/health/disk_pressure_test.go @@ -2,11 +2,19 @@ package health import ( "testing" + "time" corev1 
"k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +func TestDiskPressureChecker_Threshold(t *testing.T) { + c := &diskPressureChecker{} + if got := c.Threshold(); got != 30*time.Minute { + t.Errorf("got %v, want %v", got, 30*time.Minute) + } +} + func TestDiskPressureChecker_Name(t *testing.T) { c := &diskPressureChecker{} if got := c.Name(); got != "DiskPressure" { diff --git a/pkg/health/gpu_test.go b/pkg/health/gpu_test.go index e4dcac4..f8abbde 100644 --- a/pkg/health/gpu_test.go +++ b/pkg/health/gpu_test.go @@ -2,12 +2,74 @@ package health import ( "testing" + "time" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +func TestGPUChecker_Threshold(t *testing.T) { + c := &gpuChecker{} + if got := c.Threshold(); got != 10*time.Minute { + t.Errorf("got %v, want %v", got, 10*time.Minute) + } +} + +func TestHasGPU(t *testing.T) { + tests := []struct { + name string + node *corev1.Node + want bool + }{ + { + name: "Returns true when gpu.count label is positive", + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node-01", + Labels: map[string]string{gpuCountLabel: "8"}, + }, + }, + want: true, + }, + { + name: "Returns false when gpu.count label is absent", + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, + }, + want: false, + }, + { + name: "Returns false when gpu.count label is 0", + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node-01", + Labels: map[string]string{gpuCountLabel: "0"}, + }, + }, + want: false, + }, + { + name: "Returns false when gpu.count label is invalid", + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node-01", + Labels: map[string]string{gpuCountLabel: "invalid"}, + }, + }, + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := HasGPU(tt.node); got != tt.want { + t.Errorf("got %v, want %v", got, tt.want) + } + }) + } +} + func 
TestGPUChecker_Name(t *testing.T) { c := &gpuChecker{} if got := c.Name(); got != "GPU" { diff --git a/pkg/health/node_ready_test.go b/pkg/health/node_ready_test.go index c131b48..fcd3a91 100644 --- a/pkg/health/node_ready_test.go +++ b/pkg/health/node_ready_test.go @@ -2,11 +2,19 @@ package health import ( "testing" + "time" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +func TestNodeReadyChecker_Threshold(t *testing.T) { + c := &nodeReadyChecker{} + if got := c.Threshold(); got != 5*time.Minute { + t.Errorf("got %v, want %v", got, 5*time.Minute) + } +} + func TestNodeReadyChecker_Name(t *testing.T) { c := &nodeReadyChecker{} if got := c.Name(); got != "NodeReady" { diff --git a/pkg/operation/civo_test.go b/pkg/operation/civo_test.go index c0977cb..1056678 100644 --- a/pkg/operation/civo_test.go +++ b/pkg/operation/civo_test.go @@ -91,3 +91,51 @@ func TestCivoExecutor_Reboot(t *testing.T) { }) } } + +func TestNewCivoExecutor_Validation(t *testing.T) { + tests := []struct { + name string + id string + opts []Option + wantErr bool + }{ + { + name: "Returns no error with injected client", + id: "test-cluster", + opts: []Option{WithClient(&fakeClient{})}, + }, + { + name: "Returns error when clusterID is empty without injected client", + id: "", + opts: []Option{WithAPIConfig("key", "https://api.civo.com", "lon1", "0.0.1")}, + wantErr: true, + }, + { + name: "Returns error when apiKey is empty", + id: "test-cluster", + opts: []Option{WithAPIConfig("", "https://api.civo.com", "lon1", "0.0.1")}, + wantErr: true, + }, + { + name: "Returns error when apiURL is empty", + id: "test-cluster", + opts: []Option{WithAPIConfig("key", "", "lon1", "0.0.1")}, + wantErr: true, + }, + { + name: "Returns error when clusterID is empty even with injected client", + id: "", + opts: []Option{WithClient(&fakeClient{})}, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + _, err := NewCivoExecutor(tt.id, tt.opts...) 
+ if (err != nil) != tt.wantErr { + t.Errorf("error = %v, wantErr %v", err, tt.wantErr) + } + }) + } +} diff --git a/pkg/watcher/watcher_test.go b/pkg/watcher/watcher_test.go index 8f2d126..0872f2e 100644 --- a/pkg/watcher/watcher_test.go +++ b/pkg/watcher/watcher_test.go @@ -528,3 +528,62 @@ func TestRun_UnhealthyWithinThresholdNoReboot(t *testing.T) { t.Errorf("expected no reboot calls within threshold, got %v", exec.calls) } } + +func TestBuildNodeSelector(t *testing.T) { + tests := []struct { + name string + nodePoolIDs []string + wantNil bool + wantLabels map[string]string + wantInExpr bool + }{ + { + name: "Returns nil for empty IDs", + wantNil: true, + }, + { + name: "Returns MatchLabels for single ID", + nodePoolIDs: []string{"pool-1"}, + wantLabels: map[string]string{nodePoolLabelKey: "pool-1"}, + }, + { + name: "Returns MatchExpressions In for multiple IDs", + nodePoolIDs: []string{"pool-1", "pool-2"}, + wantInExpr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + sel := buildNodeSelector(tt.nodePoolIDs) + if tt.wantNil { + if sel != nil { + t.Errorf("expected nil selector, got %v", sel) + } + return + } + if sel == nil { + t.Fatal("expected non-nil selector") + } + if tt.wantLabels != nil { + for k, v := range tt.wantLabels { + if sel.MatchLabels[k] != v { + t.Errorf("MatchLabels[%s] = %q, want %q", k, sel.MatchLabels[k], v) + } + } + } + if tt.wantInExpr { + if len(sel.MatchExpressions) != 1 { + t.Fatalf("expected 1 MatchExpression, got %d", len(sel.MatchExpressions)) + } + expr := sel.MatchExpressions[0] + if expr.Key != nodePoolLabelKey { + t.Errorf("key = %q, want %q", expr.Key, nodePoolLabelKey) + } + if len(expr.Values) != len(tt.nodePoolIDs) { + t.Errorf("values count = %d, want %d", len(expr.Values), len(tt.nodePoolIDs)) + } + } + }) + } +} From c583f7c36c9650fb062a72c04053f16779f4d1c7 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 14 Apr 2026 15:11:36 +0900 Subject: [PATCH 27/71] refactor: extract threshold 
values into local constants Each checker now defines its threshold as a named constant (nodeReadyThreshold, gpuThreshold, diskPressureThreshold, ciliumThreshold). Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/health/cilium.go | 7 +++++-- pkg/health/disk_pressure.go | 4 +++- pkg/health/gpu.go | 3 ++- pkg/health/node_ready.go | 4 +++- 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/pkg/health/cilium.go b/pkg/health/cilium.go index d0ef88a..44ce71d 100644 --- a/pkg/health/cilium.go +++ b/pkg/health/cilium.go @@ -6,7 +6,10 @@ import ( corev1 "k8s.io/api/core/v1" ) -const ciliumReadyReason = "CiliumIsUp" +const ( + ciliumReadyReason = "CiliumIsUp" + ciliumThreshold = 10 * time.Minute +) // ciliumChecker reports healthy when the Cilium-managed NetworkUnavailable // condition is False. If the condition's reason is not "CiliumIsUp" @@ -14,7 +17,7 @@ const ciliumReadyReason = "CiliumIsUp" type ciliumChecker struct{} func (c *ciliumChecker) Name() string { return "CiliumAgent" } -func (c *ciliumChecker) Threshold() time.Duration { return 10 * time.Minute } +func (c *ciliumChecker) Threshold() time.Duration { return ciliumThreshold } func (c *ciliumChecker) Check(node *corev1.Node) (bool, string) { for _, cond := range node.Status.Conditions { diff --git a/pkg/health/disk_pressure.go b/pkg/health/disk_pressure.go index eba79ca..d092aa3 100644 --- a/pkg/health/disk_pressure.go +++ b/pkg/health/disk_pressure.go @@ -6,11 +6,13 @@ import ( corev1 "k8s.io/api/core/v1" ) +const diskPressureThreshold = 30 * time.Minute + // diskPressureChecker reports healthy when the node does not have disk pressure. 
type diskPressureChecker struct{} func (c *diskPressureChecker) Name() string { return "DiskPressure" } -func (c *diskPressureChecker) Threshold() time.Duration { return 30 * time.Minute } +func (c *diskPressureChecker) Threshold() time.Duration { return diskPressureThreshold } func (c *diskPressureChecker) Check(node *corev1.Node) (bool, string) { for _, cond := range node.Status.Conditions { diff --git a/pkg/health/gpu.go b/pkg/health/gpu.go index 323bcc7..8d4c996 100644 --- a/pkg/health/gpu.go +++ b/pkg/health/gpu.go @@ -11,6 +11,7 @@ import ( const ( gpuResourceName = "nvidia.com/gpu" gpuCountLabel = "nvidia.com/gpu.count" + gpuThreshold = 10 * time.Minute ) // gpuChecker reports healthy when the node's allocatable GPU count @@ -19,7 +20,7 @@ const ( type gpuChecker struct{} func (c *gpuChecker) Name() string { return "GPU" } -func (c *gpuChecker) Threshold() time.Duration { return 10 * time.Minute } +func (c *gpuChecker) Threshold() time.Duration { return gpuThreshold } func (c *gpuChecker) Check(node *corev1.Node) (bool, string) { expected, ok := expectedGPUCount(node) diff --git a/pkg/health/node_ready.go b/pkg/health/node_ready.go index d002473..edaf2a5 100644 --- a/pkg/health/node_ready.go +++ b/pkg/health/node_ready.go @@ -6,11 +6,13 @@ import ( corev1 "k8s.io/api/core/v1" ) +const nodeReadyThreshold = 5 * time.Minute + // nodeReadyChecker reports healthy when the node's NodeReady condition is True. 
type nodeReadyChecker struct{} func (c *nodeReadyChecker) Name() string { return "NodeReady" } -func (c *nodeReadyChecker) Threshold() time.Duration { return 5 * time.Minute } +func (c *nodeReadyChecker) Threshold() time.Duration { return nodeReadyThreshold } func (c *nodeReadyChecker) Check(node *corev1.Node) (bool, string) { for _, cond := range node.Status.Conditions { From de779152645b597ba0eb5cb6511adf201a7eb81f Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 14 Apr 2026 17:32:46 +0900 Subject: [PATCH 28/71] feat: add NopExecutor as default to prevent nil pointer dereference - Add NewNopExecutor() that performs no operations - Set as default in defaultOptions - Add nil check in WithExecutor to preserve default when nil is passed Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/operation/operation.go | 11 +++++++++++ pkg/watcher/options.go | 5 ++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/pkg/operation/operation.go b/pkg/operation/operation.go index 4f00a21..1498a6b 100644 --- a/pkg/operation/operation.go +++ b/pkg/operation/operation.go @@ -6,3 +6,14 @@ import "context" type Executor interface { Reboot(ctx context.Context, nodeName string) error } + +// nopExecutor is a no-op Executor that does nothing. +// Used as a safe default to prevent nil pointer dereference. +type nopExecutor struct{} + +func (e *nopExecutor) Reboot(_ context.Context, _ string) error { return nil } + +// NewNopExecutor returns an Executor that performs no operations. 
+func NewNopExecutor() Executor { + return &nopExecutor{} +} diff --git a/pkg/watcher/options.go b/pkg/watcher/options.go index 6ecc239..c10d3db 100644 --- a/pkg/watcher/options.go +++ b/pkg/watcher/options.go @@ -17,6 +17,7 @@ type Option func(*watcher) var defaultOptions = []Option{ WithMonitorOnly("true"), + WithExecutor(operation.NewNopExecutor()), WithRebootWaitMinutes("10"), WithGPURebootWaitMinutes("40"), } @@ -98,7 +99,9 @@ func WithCheckers(checkers []health.HealthChecker) Option { // WithExecutor returns Option to set the recovery executor. func WithExecutor(exec operation.Executor) Option { return func(w *watcher) { - w.executor = exec + if exec != nil { + w.executor = exec + } } } From edefca80f7f380c84fbc4fd8555b930e1c2b557b Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 14 Apr 2026 22:59:23 +0900 Subject: [PATCH 29/71] refactor: rename nodeSelector to nodeLabelSelector and inline FormatLabelSelector Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/watcher/watcher.go | 9 ++++----- pkg/watcher/watcher_test.go | 4 ++-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index 635ae70..bf42704 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -33,8 +33,8 @@ type watcher struct { rebootWaitMinutes time.Duration // Standard nodes (default: 10) gpuRebootWaitMinutes time.Duration // GPU nodes (default: 40) - nodeSelector *metav1.LabelSelector - nodeLister listerscorev1.NodeLister + nodeLabelSelector *metav1.LabelSelector + nodeLister listerscorev1.NodeLister monitorOnly bool checkers []health.HealthChecker @@ -53,7 +53,7 @@ func NewWatcher(ctx context.Context, opts ...Option) (Watcher, error) { opt(w) } - w.nodeSelector = buildNodeSelector(w.nodePoolIDs) + w.nodeLabelSelector = buildNodeSelector(w.nodePoolIDs) if err := w.setupKubernetesClient(); err != nil { return nil, err @@ -95,12 +95,11 @@ func (w *watcher) setupInformer(ctx context.Context) error { return nil } - 
labelSelector := metav1.FormatLabelSelector(w.nodeSelector) factory := informers.NewSharedInformerFactoryWithOptions( w.client, 0, informers.WithTweakListOptions(func(opts *metav1.ListOptions) { - opts.LabelSelector = labelSelector + opts.LabelSelector = metav1.FormatLabelSelector(w.nodeLabelSelector) }), ) diff --git a/pkg/watcher/watcher_test.go b/pkg/watcher/watcher_test.go index 0872f2e..09c1a44 100644 --- a/pkg/watcher/watcher_test.go +++ b/pkg/watcher/watcher_test.go @@ -133,8 +133,8 @@ func TestNew(t *testing.T) { }, }, checkFunc: func(w *watcher) error { - if w.nodeSelector == nil || w.nodeSelector.MatchLabels[nodePoolLabelKey] != testNodePoolID { - return fmt.Errorf("nodeSelector mismatch: got %v, want %s", w.nodeSelector, testNodePoolID) + if w.nodeLabelSelector == nil || w.nodeLabelSelector.MatchLabels[nodePoolLabelKey] != testNodePoolID { + return fmt.Errorf("nodeLabelSelector mismatch: got %v, want %s", w.nodeLabelSelector, testNodePoolID) } if w.client == nil { return fmt.Errorf("client is nil") From 53a4799c08d309a53c9a865ae857df1ace93509a Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 14 Apr 2026 23:04:40 +0900 Subject: [PATCH 30/71] refactor: unexport test-only options WithNowFunc and WithNodeLister These options are only used within the watcher package tests, so they don't need to be exported. Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/watcher/options.go | 8 ++++---- pkg/watcher/watcher_test.go | 40 ++++++++++++++++++------------------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/pkg/watcher/options.go b/pkg/watcher/options.go index c10d3db..693f7e5 100644 --- a/pkg/watcher/options.go +++ b/pkg/watcher/options.go @@ -105,8 +105,8 @@ func WithExecutor(exec operation.Executor) Option { } } -// WithNowFunc returns Option to override the time source (for testing). -func WithNowFunc(fn func() time.Time) Option { +// withNowFunc returns Option to override the time source (for testing). 
+func withNowFunc(fn func() time.Time) Option { return func(w *watcher) { if fn != nil { w.nowFunc = fn @@ -114,9 +114,9 @@ func WithNowFunc(fn func() time.Time) Option { } } -// WithNodeLister returns Option to inject a node lister (for testing). +// withNodeLister returns Option to inject a node lister (for testing). // When set, the informer setup is skipped. -func WithNodeLister(lister listerscorev1.NodeLister) Option { +func withNodeLister(lister listerscorev1.NodeLister) Option { return func(w *watcher) { w.nodeLister = lister } diff --git a/pkg/watcher/watcher_test.go b/pkg/watcher/watcher_test.go index 09c1a44..50e4922 100644 --- a/pkg/watcher/watcher_test.go +++ b/pkg/watcher/watcher_test.go @@ -202,7 +202,7 @@ func TestNew(t *testing.T) { func TestRun_HealthyNodeStaysHealthy(t *testing.T) { node := newTestNode("node-01", corev1.ConditionTrue, 8) w := newTestWatcher(t, - WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), + withNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), WithCheckers(health.NewDefaultCheckers()), ) @@ -223,9 +223,9 @@ func TestRun_UnhealthyDetection(t *testing.T) { now := time.Date(2026, 4, 13, 12, 0, 0, 0, time.UTC) node := newTestNode("node-01", corev1.ConditionFalse, 8) w := newTestWatcher(t, - WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), + withNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), WithCheckers(health.NewDefaultCheckers()), - WithNowFunc(func() time.Time { return now }), + withNowFunc(func() time.Time { return now }), ) if err := w.run(t.Context()); err != nil { @@ -246,11 +246,11 @@ func TestRun_RebootTriggerActiveMode(t *testing.T) { node := newTestNode("node-01", corev1.ConditionFalse, 8) exec := &mockExecutor{} w := newTestWatcher(t, - WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), + withNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), WithCheckers(health.NewDefaultCheckers()), WithExecutor(exec), WithMonitorOnly("false"), - WithNowFunc(func() 
time.Time { return now }), + withNowFunc(func() time.Time { return now }), ) // First run: detect unhealthy. @@ -283,11 +283,11 @@ func TestRun_RebootSkippedInReportMode(t *testing.T) { node := newTestNode("node-01", corev1.ConditionFalse, 8) exec := &mockExecutor{} w := newTestWatcher(t, - WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), + withNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), WithCheckers(health.NewDefaultCheckers()), WithExecutor(exec), WithMonitorOnly("true"), - WithNowFunc(func() time.Time { return now }), + withNowFunc(func() time.Time { return now }), ) // First run: detect unhealthy. @@ -316,10 +316,10 @@ func TestRun_RecoveryAfterReboot(t *testing.T) { now := time.Date(2026, 4, 13, 12, 0, 0, 0, time.UTC) node := newTestNode("node-01", corev1.ConditionFalse, 8) w := newTestWatcher(t, - WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), + withNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), WithCheckers(health.NewDefaultCheckers()), WithMonitorOnly("false"), - WithNowFunc(func() time.Time { return now }), + withNowFunc(func() time.Time { return now }), ) // Run 1: detect unhealthy. @@ -355,12 +355,12 @@ func TestRun_RebootRetry(t *testing.T) { node := newTestNode("node-01", corev1.ConditionFalse, 8) exec := &mockExecutor{} w := newTestWatcher(t, - WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), + withNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), WithCheckers(health.NewDefaultCheckers()), WithExecutor(exec), WithMonitorOnly("false"), WithGPURebootWaitMinutes("40"), - WithNowFunc(func() time.Time { return now }), + withNowFunc(func() time.Time { return now }), ) // Run 1: detect unhealthy. @@ -406,9 +406,9 @@ func TestRun_GPUMismatchTriggersUnhealthy(t *testing.T) { // Simulate GPU failure: label says 8 but only 7 allocatable. 
node.Status.Allocatable["nvidia.com/gpu"] = resource.MustParse("7") w := newTestWatcher(t, - WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), + withNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), WithCheckers(health.NewDefaultCheckers()), - WithNowFunc(func() time.Time { return now }), + withNowFunc(func() time.Time { return now }), ) if err := w.run(t.Context()); err != nil { @@ -433,11 +433,11 @@ func TestRun_RebootErrorContinuesProcessing(t *testing.T) { }, } w := newTestWatcher(t, - WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), + withNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), WithCheckers(health.NewDefaultCheckers()), WithExecutor(exec), WithMonitorOnly("false"), - WithNowFunc(func() time.Time { return now }), + withNowFunc(func() time.Time { return now }), ) // Run 1: detect unhealthy. @@ -458,7 +458,7 @@ func TestRun_RebootErrorContinuesProcessing(t *testing.T) { func TestRun_NodeListError(t *testing.T) { w := newTestWatcher(t, - WithNodeLister(&fakeNodeLister{err: fmt.Errorf("list error")}), + withNodeLister(&fakeNodeLister{err: fmt.Errorf("list error")}), WithCheckers(health.NewDefaultCheckers()), ) @@ -472,9 +472,9 @@ func TestRun_StaleStateCleanup(t *testing.T) { node := newTestNode("node-01", corev1.ConditionFalse, 0) lister := &fakeNodeLister{nodes: []*corev1.Node{node}} w := newTestWatcher(t, - WithNodeLister(lister), + withNodeLister(lister), WithCheckers(health.NewDefaultCheckers()), - WithNowFunc(func() time.Time { return now }), + withNowFunc(func() time.Time { return now }), ) // Run 1: detect node-01 unhealthy. 
@@ -502,11 +502,11 @@ func TestRun_UnhealthyWithinThresholdNoReboot(t *testing.T) { node := newTestNode("node-01", corev1.ConditionFalse, 0) exec := &mockExecutor{} w := newTestWatcher(t, - WithNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), + withNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), WithCheckers(health.NewDefaultCheckers()), WithExecutor(exec), WithMonitorOnly("false"), - WithNowFunc(func() time.Time { return now }), + withNowFunc(func() time.Time { return now }), ) // Run 1: detect unhealthy. From cad145f65100d2d215694602f1f615dde849e9f2 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 14 Apr 2026 23:22:40 +0900 Subject: [PATCH 31/71] fix: use UTC for all internal timestamps Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/watcher/watcher.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index bf42704..e3f308f 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -47,7 +47,7 @@ func NewWatcher(ctx context.Context, opts ...Option) (Watcher, error) { w := &watcher{ monitorOnly: true, states: NewStateStore(), - nowFunc: time.Now, + nowFunc: func() time.Time { return time.Now().UTC() }, } for _, opt := range append(defaultOptions, opts...) { opt(w) From 98f80c5c9a338bfca0892322056139bf2a286234 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 14 Apr 2026 23:29:45 +0900 Subject: [PATCH 32/71] refactor: apply Civo Go testing conventions to all test files - Rename 'name' to 'description' in test struct fields - Use verb-driven descriptions ("returns", "detects", "skips", etc.) 
- Use 'test' instead of 'tt' in range loops - Initialize mocks inside t.Run for isolation (operation tests) - Use test.description in t.Run calls Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/health/cilium_test.go | 22 ++--- pkg/health/disk_pressure_test.go | 22 ++--- pkg/health/gpu_test.go | 50 +++++------ pkg/health/node_ready_test.go | 22 ++--- pkg/operation/civo_test.go | 137 ++++++++++++++++--------------- pkg/watcher/state_test.go | 8 +- pkg/watcher/watcher_test.go | 42 +++++----- 7 files changed, 155 insertions(+), 148 deletions(-) diff --git a/pkg/health/cilium_test.go b/pkg/health/cilium_test.go index 1632a6f..a146788 100644 --- a/pkg/health/cilium_test.go +++ b/pkg/health/cilium_test.go @@ -24,12 +24,12 @@ func TestCiliumChecker_Name(t *testing.T) { func TestCiliumChecker_Check(t *testing.T) { tests := []struct { - name string - node *corev1.Node - want bool + description string + node *corev1.Node + want bool }{ { - name: "Returns true when NetworkUnavailable is False with CiliumIsUp", + description: "returns true when NetworkUnavailable is False with CiliumIsUp", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, Status: corev1.NodeStatus{ @@ -45,7 +45,7 @@ func TestCiliumChecker_Check(t *testing.T) { want: true, }, { - name: "Returns false when NetworkUnavailable is True with CiliumIsUp", + description: "returns false when NetworkUnavailable is True with CiliumIsUp", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, Status: corev1.NodeStatus{ @@ -61,7 +61,7 @@ func TestCiliumChecker_Check(t *testing.T) { want: false, }, { - name: "Returns true when NetworkUnavailable has non-Cilium reason (skip)", + description: "skips check when NetworkUnavailable has non-Cilium reason", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, Status: corev1.NodeStatus{ @@ -77,7 +77,7 @@ func TestCiliumChecker_Check(t *testing.T) { want: true, }, { - name: "Returns true when condition is absent", + description: 
"returns true when condition is absent", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, Status: corev1.NodeStatus{ @@ -89,10 +89,10 @@ func TestCiliumChecker_Check(t *testing.T) { } c := &ciliumChecker{} - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got, _ := c.Check(tt.node); got != tt.want { - t.Errorf("got %v, want %v", got, tt.want) + for _, test := range tests { + t.Run(test.description, func(t *testing.T) { + if got, _ := c.Check(test.node); got != test.want { + t.Errorf("got %v, want %v", got, test.want) } }) } diff --git a/pkg/health/disk_pressure_test.go b/pkg/health/disk_pressure_test.go index 2b9dbd0..bfc66aa 100644 --- a/pkg/health/disk_pressure_test.go +++ b/pkg/health/disk_pressure_test.go @@ -24,12 +24,12 @@ func TestDiskPressureChecker_Name(t *testing.T) { func TestDiskPressureChecker_Check(t *testing.T) { tests := []struct { - name string - node *corev1.Node - want bool + description string + node *corev1.Node + want bool }{ { - name: "Returns true when DiskPressure is False (no pressure)", + description: "returns true when DiskPressure is False (no pressure)", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, Status: corev1.NodeStatus{ @@ -41,7 +41,7 @@ func TestDiskPressureChecker_Check(t *testing.T) { want: true, }, { - name: "Returns false when DiskPressure is True (under pressure)", + description: "returns false when DiskPressure is True (under pressure)", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, Status: corev1.NodeStatus{ @@ -53,7 +53,7 @@ func TestDiskPressureChecker_Check(t *testing.T) { want: false, }, { - name: "Returns true when no conditions present", + description: "returns true when no conditions present", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, Status: corev1.NodeStatus{ @@ -63,7 +63,7 @@ func TestDiskPressureChecker_Check(t *testing.T) { want: true, }, { - name: "Returns true when only non-DiskPressure conditions 
present", + description: "returns true when only non-DiskPressure conditions present", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, Status: corev1.NodeStatus{ @@ -77,10 +77,10 @@ func TestDiskPressureChecker_Check(t *testing.T) { } c := &diskPressureChecker{} - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got, _ := c.Check(tt.node); got != tt.want { - t.Errorf("got %v, want %v", got, tt.want) + for _, test := range tests { + t.Run(test.description, func(t *testing.T) { + if got, _ := c.Check(test.node); got != test.want { + t.Errorf("got %v, want %v", got, test.want) } }) } diff --git a/pkg/health/gpu_test.go b/pkg/health/gpu_test.go index f8abbde..7a48c3f 100644 --- a/pkg/health/gpu_test.go +++ b/pkg/health/gpu_test.go @@ -18,12 +18,12 @@ func TestGPUChecker_Threshold(t *testing.T) { func TestHasGPU(t *testing.T) { tests := []struct { - name string - node *corev1.Node - want bool + description string + node *corev1.Node + want bool }{ { - name: "Returns true when gpu.count label is positive", + description: "returns true when gpu.count label is positive", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ Name: "node-01", @@ -33,14 +33,14 @@ func TestHasGPU(t *testing.T) { want: true, }, { - name: "Returns false when gpu.count label is absent", + description: "returns false when gpu.count label is absent", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, }, want: false, }, { - name: "Returns false when gpu.count label is 0", + description: "returns false when gpu.count label is 0", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ Name: "node-01", @@ -50,7 +50,7 @@ func TestHasGPU(t *testing.T) { want: false, }, { - name: "Returns false when gpu.count label is invalid", + description: "returns false when gpu.count label is invalid", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ Name: "node-01", @@ -61,10 +61,10 @@ func TestHasGPU(t *testing.T) { }, } - for _, tt := range tests { - 
t.Run(tt.name, func(t *testing.T) { - if got := HasGPU(tt.node); got != tt.want { - t.Errorf("got %v, want %v", got, tt.want) + for _, test := range tests { + t.Run(test.description, func(t *testing.T) { + if got := HasGPU(test.node); got != test.want { + t.Errorf("got %v, want %v", got, test.want) } }) } @@ -79,12 +79,12 @@ func TestGPUChecker_Name(t *testing.T) { func TestGPUChecker_Check(t *testing.T) { tests := []struct { - name string - node *corev1.Node - want bool + description string + node *corev1.Node + want bool }{ { - name: "Returns true when allocatable matches label count", + description: "returns true when allocatable matches label count", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ Name: "node-01", @@ -99,7 +99,7 @@ func TestGPUChecker_Check(t *testing.T) { want: true, }, { - name: "Returns true when gpu.count label is absent (non-GPU node)", + description: "returns true when gpu.count label is absent (non-GPU node)", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, Status: corev1.NodeStatus{ @@ -109,7 +109,7 @@ func TestGPUChecker_Check(t *testing.T) { want: true, }, { - name: "Returns true when gpu.count label is 0", + description: "returns true when gpu.count label is 0", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ Name: "node-01", @@ -122,7 +122,7 @@ func TestGPUChecker_Check(t *testing.T) { want: true, }, { - name: "Returns false when allocatable is less than label count", + description: "returns false when allocatable is less than label count", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ Name: "node-01", @@ -137,7 +137,7 @@ func TestGPUChecker_Check(t *testing.T) { want: false, }, { - name: "Returns false when allocatable GPU is zero", + description: "returns false when allocatable GPU is zero", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ Name: "node-01", @@ -152,7 +152,7 @@ func TestGPUChecker_Check(t *testing.T) { want: false, }, { - name: "Returns false when allocatable GPU resource is 
missing", + description: "returns false when allocatable GPU resource is missing", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ Name: "node-01", @@ -165,7 +165,7 @@ func TestGPUChecker_Check(t *testing.T) { want: false, }, { - name: "Returns true when gpu.count label is invalid", + description: "returns true when gpu.count label is invalid", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ Name: "node-01", @@ -180,10 +180,10 @@ func TestGPUChecker_Check(t *testing.T) { } c := &gpuChecker{} - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got, _ := c.Check(tt.node); got != tt.want { - t.Errorf("got %v, want %v", got, tt.want) + for _, test := range tests { + t.Run(test.description, func(t *testing.T) { + if got, _ := c.Check(test.node); got != test.want { + t.Errorf("got %v, want %v", got, test.want) } }) } diff --git a/pkg/health/node_ready_test.go b/pkg/health/node_ready_test.go index fcd3a91..9b7747d 100644 --- a/pkg/health/node_ready_test.go +++ b/pkg/health/node_ready_test.go @@ -24,12 +24,12 @@ func TestNodeReadyChecker_Name(t *testing.T) { func TestNodeReadyChecker_Check(t *testing.T) { tests := []struct { - name string - node *corev1.Node - want bool + description string + node *corev1.Node + want bool }{ { - name: "Returns true when NodeReady condition is True", + description: "returns true when NodeReady condition is True", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, Status: corev1.NodeStatus{ @@ -41,7 +41,7 @@ func TestNodeReadyChecker_Check(t *testing.T) { want: true, }, { - name: "Returns false when NodeReady condition is False", + description: "returns false when NodeReady condition is False", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, Status: corev1.NodeStatus{ @@ -53,7 +53,7 @@ func TestNodeReadyChecker_Check(t *testing.T) { want: false, }, { - name: "Returns false when no conditions present", + description: "returns false when no conditions present", node: &corev1.Node{ 
ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, Status: corev1.NodeStatus{ @@ -63,7 +63,7 @@ func TestNodeReadyChecker_Check(t *testing.T) { want: false, }, { - name: "Returns false when only non-NodeReady conditions present", + description: "returns false when only non-NodeReady conditions present", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{Name: "node-01"}, Status: corev1.NodeStatus{ @@ -77,10 +77,10 @@ func TestNodeReadyChecker_Check(t *testing.T) { } c := &nodeReadyChecker{} - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got, _ := c.Check(tt.node); got != tt.want { - t.Errorf("got %v, want %v", got, tt.want) + for _, test := range tests { + t.Run(test.description, func(t *testing.T) { + if got, _ := c.Check(test.node); got != test.want { + t.Errorf("got %v, want %v", got, test.want) } }) } diff --git a/pkg/operation/civo_test.go b/pkg/operation/civo_test.go index 1056678..4dcf06d 100644 --- a/pkg/operation/civo_test.go +++ b/pkg/operation/civo_test.go @@ -33,60 +33,67 @@ var _ civogo.Clienter = (*fakeClient)(nil) func TestCivoExecutor_Reboot(t *testing.T) { tests := []struct { - name string - nodeName string - client *fakeClient - wantErr bool + description string + nodeName string + setupClient func(t *testing.T) *fakeClient + wantErr bool }{ { - name: "Returns nil on successful find and reboot", - nodeName: "node-01", - client: &fakeClient{ - findFunc: func(clusterID, search string) (*civogo.Instance, error) { - return &civogo.Instance{ID: "instance-01"}, nil - }, - rebootFunc: func(id string) (*civogo.SimpleResponse, error) { - if id != "instance-01" { - t.Errorf("instanceID mismatch: got %s, want instance-01", id) - } - return new(civogo.SimpleResponse), nil - }, + description: "returns nil on successful find and reboot", + nodeName: "node-01", + setupClient: func(t *testing.T) *fakeClient { + return &fakeClient{ + findFunc: func(clusterID, search string) (*civogo.Instance, error) { + return &civogo.Instance{ID: 
"instance-01"}, nil + }, + rebootFunc: func(id string) (*civogo.SimpleResponse, error) { + if id != "instance-01" { + t.Errorf("instanceID mismatch: got %s, want instance-01", id) + } + return new(civogo.SimpleResponse), nil + }, + } }, }, { - name: "Returns error when instance lookup fails", - nodeName: "node-01", - client: &fakeClient{ - findFunc: func(_, _ string) (*civogo.Instance, error) { - return nil, errors.New("not found") - }, + description: "returns error when instance lookup fails", + nodeName: "node-01", + setupClient: func(t *testing.T) *fakeClient { + return &fakeClient{ + findFunc: func(_, _ string) (*civogo.Instance, error) { + return nil, errors.New("not found") + }, + } }, wantErr: true, }, { - name: "Returns error when hard reboot fails", - nodeName: "node-01", - client: &fakeClient{ - findFunc: func(_, _ string) (*civogo.Instance, error) { - return &civogo.Instance{ID: "instance-01"}, nil - }, - rebootFunc: func(_ string) (*civogo.SimpleResponse, error) { - return nil, errors.New("reboot failed") - }, + description: "returns error when hard reboot fails", + nodeName: "node-01", + setupClient: func(t *testing.T) *fakeClient { + return &fakeClient{ + findFunc: func(_, _ string) (*civogo.Instance, error) { + return &civogo.Instance{ID: "instance-01"}, nil + }, + rebootFunc: func(_ string) (*civogo.SimpleResponse, error) { + return nil, errors.New("reboot failed") + }, + } }, wantErr: true, }, } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - exec, err := NewCivoExecutor("test-cluster", WithClient(tt.client)) + for _, test := range tests { + t.Run(test.description, func(t *testing.T) { + client := test.setupClient(t) + exec, err := NewCivoExecutor("test-cluster", WithClient(client)) if err != nil { t.Fatal(err) } - err = exec.Reboot(t.Context(), tt.nodeName) - if (err != nil) != tt.wantErr { - t.Errorf("error = %v, wantErr %v", err, tt.wantErr) + err = exec.Reboot(t.Context(), test.nodeName) + if (err != nil) != test.wantErr { 
+ t.Errorf("error = %v, wantErr %v", err, test.wantErr) } }) } @@ -94,47 +101,47 @@ func TestCivoExecutor_Reboot(t *testing.T) { func TestNewCivoExecutor_Validation(t *testing.T) { tests := []struct { - name string - id string - opts []Option - wantErr bool + description string + id string + opts []Option + wantErr bool }{ { - name: "Returns no error with injected client", - id: "test-cluster", - opts: []Option{WithClient(&fakeClient{})}, + description: "returns no error with injected client", + id: "test-cluster", + opts: []Option{WithClient(&fakeClient{})}, }, { - name: "Returns error when clusterID is empty without injected client", - id: "", - opts: []Option{WithAPIConfig("key", "https://api.civo.com", "lon1", "0.0.1")}, - wantErr: true, + description: "returns error when clusterID is empty without injected client", + id: "", + opts: []Option{WithAPIConfig("key", "https://api.civo.com", "lon1", "0.0.1")}, + wantErr: true, }, { - name: "Returns error when apiKey is empty", - id: "test-cluster", - opts: []Option{WithAPIConfig("", "https://api.civo.com", "lon1", "0.0.1")}, - wantErr: true, + description: "returns error when apiKey is empty", + id: "test-cluster", + opts: []Option{WithAPIConfig("", "https://api.civo.com", "lon1", "0.0.1")}, + wantErr: true, }, { - name: "Returns error when apiURL is empty", - id: "test-cluster", - opts: []Option{WithAPIConfig("key", "", "lon1", "0.0.1")}, - wantErr: true, + description: "returns error when apiURL is empty", + id: "test-cluster", + opts: []Option{WithAPIConfig("key", "", "lon1", "0.0.1")}, + wantErr: true, }, { - name: "Returns error when clusterID is empty even with injected client", - id: "", - opts: []Option{WithClient(&fakeClient{})}, - wantErr: true, + description: "returns error when clusterID is empty even with injected client", + id: "", + opts: []Option{WithClient(&fakeClient{})}, + wantErr: true, }, } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - _, err := NewCivoExecutor(tt.id, 
tt.opts...) - if (err != nil) != tt.wantErr { - t.Errorf("error = %v, wantErr %v", err, tt.wantErr) + for _, test := range tests { + t.Run(test.description, func(t *testing.T) { + _, err := NewCivoExecutor(test.id, test.opts...) + if (err != nil) != test.wantErr { + t.Errorf("error = %v, wantErr %v", err, test.wantErr) } }) } diff --git a/pkg/watcher/state_test.go b/pkg/watcher/state_test.go index f696f94..ccfe3a6 100644 --- a/pkg/watcher/state_test.go +++ b/pkg/watcher/state_test.go @@ -20,10 +20,10 @@ func TestNodePhaseString(t *testing.T) { {NodePhase(99), "Unknown"}, } - for _, tt := range tests { - t.Run(tt.want, func(t *testing.T) { - if got := tt.phase.String(); got != tt.want { - t.Errorf("got %q, want %q", got, tt.want) + for _, test := range tests { + t.Run(test.want, func(t *testing.T) { + if got := test.phase.String(); got != test.want { + t.Errorf("got %q, want %q", got, test.want) } }) } diff --git a/pkg/watcher/watcher_test.go b/pkg/watcher/watcher_test.go index 50e4922..9744453 100644 --- a/pkg/watcher/watcher_test.go +++ b/pkg/watcher/watcher_test.go @@ -116,15 +116,15 @@ func TestNew(t *testing.T) { opts []Option } type test struct { - name string - args args - checkFunc func(*watcher) error - wantErr bool + description string + args args + checkFunc func(*watcher) error + wantErr bool } tests := []test{ { - name: "Returns no error when given valid input", + description: "returns no error when given valid input", args: args{ opts: []Option{ WithKubernetesClient(fake.NewSimpleClientset()), @@ -155,7 +155,7 @@ func TestNew(t *testing.T) { }, }, { - name: "Returns no error when input is invalid, but default value is set", + description: "returns no error when input is invalid, but default value is set", args: args{ opts: []Option{ WithKubernetesClient(fake.NewSimpleClientset()), @@ -174,7 +174,7 @@ func TestNew(t *testing.T) { } for _, test := range tests { - t.Run(test.name, func(t *testing.T) { + t.Run(test.description, func(t *testing.T) { w, err 
:= NewWatcher(t.Context(), test.args.opts...) if (err != nil) != test.wantErr { @@ -531,32 +531,32 @@ func TestRun_UnhealthyWithinThresholdNoReboot(t *testing.T) { func TestBuildNodeSelector(t *testing.T) { tests := []struct { - name string + description string nodePoolIDs []string wantNil bool wantLabels map[string]string wantInExpr bool }{ { - name: "Returns nil for empty IDs", - wantNil: true, + description: "returns nil for empty IDs", + wantNil: true, }, { - name: "Returns MatchLabels for single ID", + description: "returns MatchLabels for single ID", nodePoolIDs: []string{"pool-1"}, wantLabels: map[string]string{nodePoolLabelKey: "pool-1"}, }, { - name: "Returns MatchExpressions In for multiple IDs", + description: "returns MatchExpressions In for multiple IDs", nodePoolIDs: []string{"pool-1", "pool-2"}, wantInExpr: true, }, } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - sel := buildNodeSelector(tt.nodePoolIDs) - if tt.wantNil { + for _, test := range tests { + t.Run(test.description, func(t *testing.T) { + sel := buildNodeSelector(test.nodePoolIDs) + if test.wantNil { if sel != nil { t.Errorf("expected nil selector, got %v", sel) } @@ -565,14 +565,14 @@ func TestBuildNodeSelector(t *testing.T) { if sel == nil { t.Fatal("expected non-nil selector") } - if tt.wantLabels != nil { - for k, v := range tt.wantLabels { + if test.wantLabels != nil { + for k, v := range test.wantLabels { if sel.MatchLabels[k] != v { t.Errorf("MatchLabels[%s] = %q, want %q", k, sel.MatchLabels[k], v) } } } - if tt.wantInExpr { + if test.wantInExpr { if len(sel.MatchExpressions) != 1 { t.Fatalf("expected 1 MatchExpression, got %d", len(sel.MatchExpressions)) } @@ -580,8 +580,8 @@ func TestBuildNodeSelector(t *testing.T) { if expr.Key != nodePoolLabelKey { t.Errorf("key = %q, want %q", expr.Key, nodePoolLabelKey) } - if len(expr.Values) != len(tt.nodePoolIDs) { - t.Errorf("values count = %d, want %d", len(expr.Values), len(tt.nodePoolIDs)) + if len(expr.Values) 
!= len(test.nodePoolIDs) { + t.Errorf("values count = %d, want %d", len(expr.Values), len(test.nodePoolIDs)) } } }) From df5b4289496061f9bf6a2213baffc2bbb3b2e730 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Wed, 15 Apr 2026 03:43:44 +0900 Subject: [PATCH 33/71] refactor: remove redundant state.Phase() call in recovery check Use if-init statement to call Phase() once. Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/watcher/watcher.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index e3f308f..bcbbd2c 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -168,8 +168,7 @@ func (w *watcher) run(ctx context.Context) error { // All checkers pass → node is healthy. if len(failedCheckers) == 0 { - if state.Phase() != PhaseHealthy { - prevPhase := state.Phase() + if prevPhase := state.Phase(); prevPhase != PhaseHealthy { slog.Info("Node recovered", "node", nodeName, "previousPhase", prevPhase.String()) From 4943091f1ca6048abc8c8362500c19bdc2af853d Mon Sep 17 00:00:00 2001 From: hlts2 Date: Wed, 15 Apr 2026 03:48:56 +0900 Subject: [PATCH 34/71] docs: add state transition comments to reconcile loop Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/watcher/watcher.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index bcbbd2c..85b6408 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -187,6 +187,7 @@ func (w *watcher) run(ctx context.Context) error { w.states.UpdateCheckerInfo(nodeName, failedCheckers, isGPUNode) switch state.Phase() { + // Healthy → Unhealthy: health check failed for the first time, start tracking. 
case PhaseHealthy: w.states.MarkUnhealthy(nodeName, now) slog.Info("Node unhealthy detected", @@ -196,6 +197,7 @@ func (w *watcher) run(ctx context.Context) error { metrics.RecoveryPhase.WithLabelValues(nodeName, PhaseHealthy.String()).Set(0) metrics.RecoveryPhase.WithLabelValues(nodeName, PhaseUnhealthy.String()).Set(1) + // Unhealthy → WaitingReboot: health check still failing and threshold exceeded, issue reboot. case PhaseUnhealthy: metrics.NodeUnhealthyDurationSeconds.WithLabelValues(nodeName).Set( now.Sub(state.UnhealthySince()).Seconds()) @@ -218,6 +220,7 @@ func (w *watcher) run(ctx context.Context) error { metrics.RecoveryPhase.WithLabelValues(nodeName, PhaseWaitingReboot.String()).Set(1) w.states.MarkWaitingReboot(nodeName, now) + // WaitingReboot: health check still failing after reboot, retry after wait window. case PhaseWaitingReboot: metrics.NodeUnhealthyDurationSeconds.WithLabelValues(nodeName).Set( now.Sub(state.UnhealthySince()).Seconds()) From 4025e6804eaa135058772438c4a42a9daaf5caf9 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Wed, 15 Apr 2026 04:16:35 +0900 Subject: [PATCH 35/71] fix: add graceful shutdown for metrics server and fix godoc typo - Metrics server now uses http.Server with Shutdown() on context cancellation - Ignores http.ErrServerClosed on normal shutdown - Fix WithKubernetesClientConfigPath godoc (was copy of WithKubernetesClient) Co-Authored-By: Claude Opus 4.6 (1M context) --- main.go | 29 +++++++++++++++++++++-------- pkg/watcher/options.go | 2 +- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/main.go b/main.go index 93c78f2..35050de 100644 --- a/main.go +++ b/main.go @@ -11,6 +11,7 @@ import ( "strconv" "strings" "syscall" + "time" "github.com/civo/node-agent/pkg/health" "github.com/civo/node-agent/pkg/metrics" @@ -52,18 +53,23 @@ func run(ctx context.Context) error { checkers := health.NewDefaultCheckers() metrics.Register() + metricsServer := &http.Server{ + Addr: ":" + metricsPortValue(metricsPort), + Handler: 
metrics.Handler(), + } go func() { - port := defaultMetricsPort - // Exclude well known port and negative integers. - if v, err := strconv.Atoi(metricsPort); err == nil && v >= 1024 && v <= 65535 { - port = v - } - addr := ":" + strconv.Itoa(port) - slog.Info("Starting metrics server", "addr", addr) - if err := http.ListenAndServe(addr, metrics.Handler()); err != nil { + slog.Info("Starting metrics server", "addr", metricsServer.Addr) + if err := metricsServer.ListenAndServe(); err != nil && err != http.ErrServerClosed { slog.Error("Metrics server failed", "error", err) } }() + defer func() { + shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if err := metricsServer.Shutdown(shutdownCtx); err != nil { + slog.Error("Metrics server shutdown failed", "error", err) + } + }() w, err := watcher.NewWatcher(ctx, watcher.WithNodePoolIDs(nodePoolID), @@ -98,3 +104,10 @@ func main() { os.Exit(1) } } + +func metricsPortValue(s string) string { + if v, err := strconv.Atoi(s); err == nil && v >= 1024 && v <= 65535 { + return s + } + return strconv.Itoa(defaultMetricsPort) +} diff --git a/pkg/watcher/options.go b/pkg/watcher/options.go index 693f7e5..5bd8dd9 100644 --- a/pkg/watcher/options.go +++ b/pkg/watcher/options.go @@ -31,7 +31,7 @@ func WithKubernetesClient(client kubernetes.Interface) Option { } } -// WithKubernetesClient returns Option to set Kubernetes config path. +// WithKubernetesClientConfigPath returns Option to set Kubernetes config path. func WithKubernetesClientConfigPath(path string) Option { return func(w *watcher) { if path != "" { From d05744bf941160240bff6c82db1a5f115861e48b Mon Sep 17 00:00:00 2001 From: hlts2 Date: Wed, 15 Apr 2026 04:22:05 +0900 Subject: [PATCH 36/71] fix: use clientCfgPath instead of cfg in kubeconfig error message cfg may be nil when BuildConfigFromFlags fails. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/watcher/watcher.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index 85b6408..58b9ddd 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -68,7 +68,7 @@ func (w *watcher) setupKubernetesClient() (err error) { if w.clientCfgPath != "" && w.client == nil { cfg, err := clientcmd.BuildConfigFromFlags("", w.clientCfgPath) if err != nil { - return fmt.Errorf("failed to build kubeconfig from path %q: %w", cfg, err) + return fmt.Errorf("failed to build kubeconfig from path %q: %w", w.clientCfgPath, err) } w.client, err = kubernetes.NewForConfig(cfg) if err != nil { From 45a4bad846abd7f2000dab1908e93219cc5587bf Mon Sep 17 00:00:00 2001 From: hlts2 Date: Wed, 15 Apr 2026 04:42:47 +0900 Subject: [PATCH 37/71] fix: use fixed reason strings in GPU checker to avoid high-cardinality metrics Replace dynamic formatted reasons (e.g. "Expected 8 but got 7") with enumerable constants: GPUCountMatch, GPUCountMismatch, NoAllocatableGPU, NonGPUNode. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/health/gpu.go | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pkg/health/gpu.go b/pkg/health/gpu.go index 8d4c996..48bd351 100644 --- a/pkg/health/gpu.go +++ b/pkg/health/gpu.go @@ -1,7 +1,6 @@ package health import ( - "fmt" "strconv" "time" @@ -25,23 +24,23 @@ func (c *gpuChecker) Threshold() time.Duration { return gpuThreshold } func (c *gpuChecker) Check(node *corev1.Node) (bool, string) { expected, ok := expectedGPUCount(node) if !ok || expected == 0 { - return true, "Non-GPU node" + return true, "NonGPUNode" } quantity, exists := node.Status.Allocatable[gpuResourceName] if !exists || quantity.IsZero() { - return false, fmt.Sprintf("Expected %d but got 0", expected) + return false, "GPUCountMismatch" } actual, ok := quantity.AsInt64() if !ok { - return false, "No allocatable GPU count" + return false, "NoAllocatableGPU" } if actual == int64(expected) { - return true, fmt.Sprintf("%d/%d", actual, expected) + return true, "GPUCountMatch" } - return false, fmt.Sprintf("Expected %d but got %d", expected, actual) + return false, "GPUCountMismatch" } // HasGPU returns true if the node has the nvidia.com/gpu.count label From cad1c3eb6618e3aa6b43d961fe4bd5d4f9a05485 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Wed, 15 Apr 2026 04:50:43 +0900 Subject: [PATCH 38/71] fix: clean up Prometheus gauge metrics when nodes are removed When a node is removed from the cluster, Cleanup only deleted the state entry but left gauge metrics (recovery_phase, unhealthy_duration) with stale values. Now watcher.run() deletes all metric labels for removed nodes before calling Cleanup. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/watcher/watcher.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index 58b9ddd..032c52a 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -9,6 +9,7 @@ import ( "github.com/civo/node-agent/pkg/health" "github.com/civo/node-agent/pkg/metrics" "github.com/civo/node-agent/pkg/operation" + "github.com/prometheus/client_golang/prometheus" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/client-go/informers" @@ -254,6 +255,16 @@ func (w *watcher) run(ctx context.Context) error { } } + // Clean up state and metrics for nodes no longer in the cluster. + w.states.Range(func(name string, _ *NodeState) bool { + if _, ok := activeNodes[name]; !ok { + metrics.NodeUnhealthyDurationSeconds.DeleteLabelValues(name) + metrics.HealthCheckTotal.DeletePartialMatch(prometheus.Labels{"node": name}) + metrics.RecoveryActionsTotal.DeletePartialMatch(prometheus.Labels{"node": name}) + metrics.RecoveryPhase.DeletePartialMatch(prometheus.Labels{"node": name}) + } + return true + }) w.states.Cleanup(activeNodes) return nil } From 6bb50ca43f78a42000d02971b5b6878a86ff3d9d Mon Sep 17 00:00:00 2001 From: hlts2 Date: Wed, 15 Apr 2026 16:16:30 +0900 Subject: [PATCH 39/71] fix: skip label selector when no node pool IDs are configured FormatLabelSelector(nil) returns "<none>" which is an invalid selector. Only apply the label selector option when nodeLabelSelector is non-nil.
Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/watcher/watcher.go | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index 032c52a..6ca318f 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -96,13 +96,14 @@ func (w *watcher) setupInformer(ctx context.Context) error { return nil } - factory := informers.NewSharedInformerFactoryWithOptions( - w.client, - 0, - informers.WithTweakListOptions(func(opts *metav1.ListOptions) { - opts.LabelSelector = metav1.FormatLabelSelector(w.nodeLabelSelector) - }), - ) + var informerOpts []informers.SharedInformerOption + if w.nodeLabelSelector != nil { + labelSelector := metav1.FormatLabelSelector(w.nodeLabelSelector) + informerOpts = append(informerOpts, informers.WithTweakListOptions(func(opts *metav1.ListOptions) { + opts.LabelSelector = labelSelector + })) + } + factory := informers.NewSharedInformerFactoryWithOptions(w.client, 0, informerOpts...) nodeInformer := factory.Core().V1().Nodes() w.nodeLister = nodeInformer.Lister() From 9a900cde36af2e248fdb444a285e5f5fddadb6e5 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Wed, 15 Apr 2026 16:18:33 +0900 Subject: [PATCH 40/71] fix: log node label selector on informer setup Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/watcher/watcher.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index 6ca318f..e0d155a 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -99,9 +99,12 @@ func (w *watcher) setupInformer(ctx context.Context) error { var informerOpts []informers.SharedInformerOption if w.nodeLabelSelector != nil { labelSelector := metav1.FormatLabelSelector(w.nodeLabelSelector) + slog.Info("Using node label selector", "selector", labelSelector) informerOpts = append(informerOpts, informers.WithTweakListOptions(func(opts *metav1.ListOptions) { opts.LabelSelector = labelSelector })) + } else { + slog.Info("No node 
label selector configured, watching all nodes") } factory := informers.NewSharedInformerFactoryWithOptions(w.client, 0, informerOpts...) From fb830cc7caa254107f4c7f97d4cd65b094d70b65 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Wed, 15 Apr 2026 16:40:40 +0900 Subject: [PATCH 41/71] fix: add skip logs for unhealthy threshold wait and reboot wait Without these logs, it was hard to trace why the watcher was not taking action on unhealthy nodes during the wait periods. Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/watcher/watcher.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index e0d155a..7d40d31 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -207,6 +207,11 @@ func (w *watcher) run(ctx context.Context) error { metrics.NodeUnhealthyDurationSeconds.WithLabelValues(nodeName).Set( now.Sub(state.UnhealthySince()).Seconds()) if now.Sub(state.UnhealthySince()) < minThreshold { + slog.Info("Waiting for unhealthy threshold", + "node", nodeName, + "elapsed", now.Sub(state.UnhealthySince()).String(), + "threshold", minThreshold.String(), + "failedCheckers", failedCheckers) continue } if !w.monitorOnly { @@ -234,6 +239,12 @@ func (w *watcher) run(ctx context.Context) error { rebootWait = w.gpuRebootWaitMinutes } if now.Sub(state.LastRebootTime()) < rebootWait*time.Minute { + slog.Info("Waiting for reboot effect", + "node", nodeName, + "elapsed", now.Sub(state.LastRebootTime()).String(), + "rebootWait", (rebootWait * time.Minute).String(), + "rebootCount", state.RebootCount(), + "isGPUNode", state.IsGPUNode()) continue } From 73d943b73dc21babb9d300883c31fec7f1d38c74 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Mon, 20 Apr 2026 12:02:32 +0900 Subject: [PATCH 42/71] feat: update Helm chart env vars and bump version to 0.2.0 - Remove obsolete CIVO_NODE_DESIRED_GPU_COUNT, CIVO_NODE_REBOOT_TIME_WINDOW_MINUTES - Add CIVO_NODE_REBOOT_WAIT_MINUTES, CIVO_GPU_NODE_REBOOT_WAIT_MINUTES, 
CIVO_NODE_AGENT_MONITOR_ONLY, CIVO_NODE_AGENT_METRICS_PORT - Move non-sensitive config from secrets to values.yaml (nodePoolIDs, reboot wait times, monitor-only, metrics port) - Keep CIVO_API_KEY/URL/REGION/CLUSTER_ID as secret references - Add --kubeconfig="" arg to force in-cluster config - Expose metrics port via container ports - Bump app version to 0.2.0 Co-Authored-By: Claude Opus 4.6 (1M context) --- charts/templates/deployment.yaml | 31 +++++++++++++++++-------------- charts/values.yaml | 15 +++++++++++++++ main.go | 2 +- 3 files changed, 33 insertions(+), 15 deletions(-) diff --git a/charts/templates/deployment.yaml b/charts/templates/deployment.yaml index a6f6e07..2bc9c00 100644 --- a/charts/templates/deployment.yaml +++ b/charts/templates/deployment.yaml @@ -56,27 +56,30 @@ spec: secretKeyRef: name: civo-api-access key: region + {{- with .Values.nodePoolIDs }} - name: CIVO_NODE_POOL_ID - valueFrom: - secretKeyRef: - name: civo-node-agent - key: node-pool-id - - name: CIVO_NODE_DESIRED_GPU_COUNT - valueFrom: - secretKeyRef: - name: civo-node-agent - key: desired-gpu-count - - name: CIVO_NODE_REBOOT_TIME_WINDOW_MINUTES - valueFrom: - secretKeyRef: - name: civo-node-agent - key: time-window + value: {{ . | quote }} + {{- end }} + - name: CIVO_NODE_REBOOT_WAIT_MINUTES + value: {{ .Values.rebootWaitMinutes | quote }} + - name: CIVO_GPU_NODE_REBOOT_WAIT_MINUTES + value: {{ .Values.gpuRebootWaitMinutes | quote }} + - name: CIVO_NODE_AGENT_MONITOR_ONLY + value: {{ .Values.monitorOnly | quote }} + - name: CIVO_NODE_AGENT_METRICS_PORT + value: {{ .Values.metricsPort | quote }} {{- with .Values.securityContext }} securityContext: {{- toYaml . 
| nindent 12 }} {{- end }} image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" imagePullPolicy: {{ .Values.image.pullPolicy }} + args: + - "--kubeconfig=" + ports: + - name: metrics + containerPort: {{ .Values.metricsPort | default 9625 }} + protocol: TCP {{- with .Values.resources }} resources: {{- toYaml . | nindent 12 }} diff --git a/charts/values.yaml b/charts/values.yaml index 92ec8b1..9563079 100644 --- a/charts/values.yaml +++ b/charts/values.yaml @@ -6,6 +6,21 @@ image: pullPolicy: IfNotPresent tag: "6b8426a" +# Comma-separated node pool IDs to watch (empty = all nodes). +nodePoolIDs: "" + +# Reboot wait time for standard nodes (minutes). +rebootWaitMinutes: 10 + +# Reboot wait time for GPU nodes (minutes). +gpuRebootWaitMinutes: 40 + +# Monitor-only mode: log recovery actions without executing them. +monitorOnly: true + +# Port for Prometheus metrics endpoint. +metricsPort: 9625 + imagePullSecrets: [] nameOverride: "" fullnameOverride: "" diff --git a/main.go b/main.go index 35050de..2bf4c46 100644 --- a/main.go +++ b/main.go @@ -20,7 +20,7 @@ import ( ) var ( - version = "0.0.1" + version = "0.2.0" versionInfo = flag.Bool("version", false, "Print the driver version") kubeconfigPath = flag.String("kubeconfig", "/etc/rancher/k3s/k3s.yaml", "Path to kubeconfig file (empty for in-cluster config)") ) From bc11770caeff5274ec48360e192cf2aaeee6c1a3 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Mon, 20 Apr 2026 12:13:13 +0900 Subject: [PATCH 43/71] docs: update README for new config and civo-api-access secret - Replace civo-node-agent secret instructions with civo-api-access (auto-provisioned) - Remove obsolete env vars (DESIRED_GPU_COUNT, REBOOT_TIME_WINDOW_MINUTES) - Add Helm values table and health checkers table - Document monitor-only default mode Co-Authored-By: Claude Opus 4.6 (1M context) --- README.md | 59 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 36 insertions(+), 23 deletions(-) diff 
--git a/README.md b/README.md index fb740b7..ab1a8a4 100644 --- a/README.md +++ b/README.md @@ -1,29 +1,27 @@ # Node Agent -`node-agent` monitors the health of Kubernetes nodes and can automatically restart VM instances when necessary. It triggers a restart under the following conditions: +`node-agent` monitors the health of Kubernetes nodes and can automatically reboot VM instances when necessary. A reboot is triggered when a node fails one or more health checks (e.g. `NodeReady`, GPU count, Cilium, DiskPressure) for a configured threshold. -- A node enters the **NotReady** state. -- The number of available GPUs per node falls below a configured threshold. +By default it runs in **monitor-only** mode, logging recovery actions without executing them. Set `monitorOnly=false` to enable actual reboots. +## Prerequisites: `civo-api-access` Secret -## Set Your `civo-node-agent` Secret +The `civo-api-access` secret is automatically provisioned by Civo in the `kube-system` namespace of every Civo Kubernetes cluster. It contains the API credentials and cluster identity used by `node-agent`: -``` -export CIVO_DESIRED_GPU_COUNT="8" -export CIVO_NODE_POOL_ID="xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxxxxxx" -export CIVO_API_KEY="xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" -export CIVO_NODE_REBOOT_TIME_WINDOW_MINUTES="xxxx" -kubectl -n kube-system delete secret civo-node-agent --ignore-not-found -kubectl -n kube-system create secret generic civo-node-agent -kubectl -n kube-system patch secret civo-node-agent -n kube-system --type='merge' \ - -p='{"stringData": {"civo-api-key": "'"$CIVO_API_KEY"'", "node-pool-id": "'"$CIVO_NODE_POOL_ID"'", "desired-gpu-count": "'"$CIVO_DESIRED_GPU_COUNT"'", "time-window": "'"$CIVO_NODE_REBOOT_TIME_WINDOW_MINUTES"'" }}' -``` +| Key | Description | +|-----|-------------| +| `civo-api-key` | Civo API key used for reboot operations. | +| `api-url` | Civo API URL. | +| `cluster-id` | The ID of this Civo Kubernetes cluster. 
| +| `region` | The Civo region this cluster runs in. | + +No manual setup is required — `node-agent` reads these values directly from the existing secret. -## Nvidia Device Plugin Install +## Nvidia Device Plugin Install ```bash kubectl create ns gpu-operator -kubectl label namespace gpu-operator pod-security.kubernetes.io/enforce=privileged +kubectl label namespace gpu-operator pod-security.kubernetes.io/enforce=privileged kubectl label namespace gpu-operator pod-security.kubernetes.io/warn=privileged kubectl label namespace gpu-operator pod-security.kubernetes.io/audit=privileged ``` @@ -45,20 +43,35 @@ helm install --namespace gpu-operator nvidia-device-plugin nvdp/nvidia-device-pl ## Install `node-agent` chart -You will need to clone this repository in order to have access to the charts directory that is used for installation. In your terminal, please change directory to your cloned `node-agent` repo directory, and then run: +You will need to clone this repository in order to have access to the charts directory. In your terminal, change directory to your cloned `node-agent` repo directory, then run: ```bash helm upgrade -n kube-system --install node-agent ./charts ``` -## Configuration Details +To enable active recovery (actually reboot nodes): + +```bash +helm upgrade -n kube-system --install node-agent ./charts --set monitorOnly=false +``` -The following configurations are stored in the `node-agent` secret in the `kube-system` namespace. +## Configuration -`node-pool-id`: The ID of your Kubernetes node pool which you want monitored. To collect this value, go to the [civo kubernetes dashboard](https://dashboard.civo.com/kubernetes), select your cluster, and click copy next to your pool id. +### Helm values (`values.yaml`) -`desired-gpu-count`: This value is intended to match the number of GPUs per node. If you had a 2-node cluster with 8 GPU total, you would set this value to 4 to represent the number of GPUs per node. 
+| Value | Default | Description | +|-------|---------|-------------| +| `nodePoolIDs` | `""` | Comma-separated node pool IDs to watch. Empty means all nodes. | +| `rebootWaitMinutes` | `10` | Minutes to wait after rebooting a standard node before retrying. | +| `gpuRebootWaitMinutes` | `40` | Minutes to wait after rebooting a GPU node before retrying. | +| `monitorOnly` | `true` | If `true`, log recovery actions without executing them. Set `false` to enable reboots. | +| `metricsPort` | `9625` | Port for the Prometheus metrics endpoint. | -`civo-api-key`: The civo api key to use when automatically rebooting nodes. To collect this value, go to toue [civo settings security tab](https://dashboard.civo.com/security). +### Health checkers -`time-window`: The time-window is the time we need to give a node after a reboot happens +| Checker | Condition | Threshold | +|---------|-----------|-----------| +| `NodeReady` | `NodeReady == True` | 5 min | +| `DiskPressure` | `DiskPressure != True` | 30 min | +| `CiliumAgent` | `NetworkUnavailable == False` with reason `CiliumIsUp` (skipped for non-Cilium CNI) | 10 min | +| `GPU` | `allocatable["nvidia.com/gpu"]` equals `nvidia.com/gpu.count` label (skipped for non-GPU nodes) | 10 min | From ebec5b4c790472e72d68f335658b41f6f4440fbe Mon Sep 17 00:00:00 2001 From: hlts2 Date: Mon, 20 Apr 2026 12:23:13 +0900 Subject: [PATCH 44/71] fix: always render CIVO_NODE_POOL_ID env var even when empty Empty value is valid (watches all nodes), no need for conditional wrapping. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- charts/templates/deployment.yaml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/charts/templates/deployment.yaml b/charts/templates/deployment.yaml index 2bc9c00..843aac3 100644 --- a/charts/templates/deployment.yaml +++ b/charts/templates/deployment.yaml @@ -39,7 +39,7 @@ spec: - name: CIVO_API_KEY valueFrom: secretKeyRef: - name: civo-node-agent + name: civo-api-access key: civo-api-key - name: CIVO_API_URL valueFrom: @@ -56,10 +56,8 @@ spec: secretKeyRef: name: civo-api-access key: region - {{- with .Values.nodePoolIDs }} - name: CIVO_NODE_POOL_ID - value: {{ . | quote }} - {{- end }} + value: {{ .Values.nodePoolIDs | quote }} - name: CIVO_NODE_REBOOT_WAIT_MINUTES value: {{ .Values.rebootWaitMinutes | quote }} - name: CIVO_GPU_NODE_REBOOT_WAIT_MINUTES From 7ee2fa5fe47313b865014a2583ff520c9a80fc8a Mon Sep 17 00:00:00 2001 From: hlts2 Date: Mon, 20 Apr 2026 12:25:29 +0900 Subject: [PATCH 45/71] fix: rename CIVO_NODE_POOL_ID to CIVO_NODE_POOL_IDS The variable accepts a comma-separated list, so the plural form is more accurate. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- charts/templates/deployment.yaml | 2 +- main.go | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/charts/templates/deployment.yaml b/charts/templates/deployment.yaml index 843aac3..ddf7892 100644 --- a/charts/templates/deployment.yaml +++ b/charts/templates/deployment.yaml @@ -56,7 +56,7 @@ spec: secretKeyRef: name: civo-api-access key: region - - name: CIVO_NODE_POOL_ID + - name: CIVO_NODE_POOL_IDS value: {{ .Values.nodePoolIDs | quote }} - name: CIVO_NODE_REBOOT_WAIT_MINUTES value: {{ .Values.rebootWaitMinutes | quote }} diff --git a/main.go b/main.go index 2bf4c46..7ad5007 100644 --- a/main.go +++ b/main.go @@ -30,7 +30,7 @@ var ( apiKey = strings.TrimSpace(os.Getenv("CIVO_API_KEY")) region = strings.TrimSpace(os.Getenv("CIVO_REGION")) clusterID = strings.TrimSpace(os.Getenv("CIVO_CLUSTER_ID")) - nodePoolID = strings.TrimSpace(os.Getenv("CIVO_NODE_POOL_ID")) + nodePoolIDs = strings.TrimSpace(os.Getenv("CIVO_NODE_POOL_IDS")) rebootWaitMinutes = strings.TrimSpace(os.Getenv("CIVO_NODE_REBOOT_WAIT_MINUTES")) gpuRebootWaitMinutes = strings.TrimSpace(os.Getenv("CIVO_GPU_NODE_REBOOT_WAIT_MINUTES")) monitorOnly = strings.TrimSpace(os.Getenv("CIVO_NODE_AGENT_MONITOR_ONLY")) @@ -72,7 +72,7 @@ func run(ctx context.Context) error { }() w, err := watcher.NewWatcher(ctx, - watcher.WithNodePoolIDs(nodePoolID), + watcher.WithNodePoolIDs(nodePoolIDs), watcher.WithKubernetesClientConfigPath(*kubeconfigPath), watcher.WithExecutor(executor), watcher.WithCheckers(checkers), @@ -96,7 +96,7 @@ func main() { slog.SetDefault(slog.New(slog.NewJSONHandler(os.Stdout, nil).WithAttrs([]slog.Attr{ slog.String("clusterID", clusterID), slog.String("region", region), - slog.String("nodePoolID", nodePoolID), + slog.String("nodePoolIDs", nodePoolIDs), }))) if err := run(context.Background()); err != nil { From 89d88ed0604fa70277b191dde19aeaf6cf1d8c47 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Mon, 20 Apr 2026 12:32:39 +0900 
Subject: [PATCH 46/71] fix: correct civo-api-access secret key for CIVO_API_KEY The actual key in civo-api-access secret is "api-key", not "civo-api-key". Co-Authored-By: Claude Opus 4.6 (1M context) --- charts/templates/deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/templates/deployment.yaml b/charts/templates/deployment.yaml index ddf7892..a8299dd 100644 --- a/charts/templates/deployment.yaml +++ b/charts/templates/deployment.yaml @@ -40,7 +40,7 @@ spec: valueFrom: secretKeyRef: name: civo-api-access - key: civo-api-key + key: api-key - name: CIVO_API_URL valueFrom: secretKeyRef: From bbfd189a7df070558a5e0a1cab5977050714ba21 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Mon, 20 Apr 2026 12:34:18 +0900 Subject: [PATCH 47/71] docs: correct civo-api-access secret key name Signed-off-by: hlts2 Co-Authored-By: Claude Opus 4.6 (1M context) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ab1a8a4..62a812b 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ The `civo-api-access` secret is automatically provisioned by Civo in the `kube-s | Key | Description | |-----|-------------| -| `civo-api-key` | Civo API key used for reboot operations. | +| `api-key` | Civo API key used for reboot operations. | | `api-url` | Civo API URL. | | `cluster-id` | The ID of this Civo Kubernetes cluster. | | `region` | The Civo region this cluster runs in. 
| From 4e295c410bebd952daee132d75fb965f63aca8bc Mon Sep 17 00:00:00 2001 From: hlts2 Date: Mon, 20 Apr 2026 15:36:34 +0900 Subject: [PATCH 48/71] fix: add civo_ prefix to all Prometheus metric names - civo_node_agent_health_check_total - civo_node_agent_recovery_actions_total - civo_node_agent_node_unhealthy_duration_seconds - civo_node_agent_recovery_phase Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/metrics/metrics.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index e9ea34e..e452fb7 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -12,7 +12,7 @@ var ( // checker, and result (pass/fail). HealthCheckTotal = prometheus.NewCounterVec( prometheus.CounterOpts{ - Name: "node_agent_health_check_total", + Name: "civo_node_agent_health_check_total", Help: "Total number of health check executions.", }, []string{"node", "checker", "result"}, @@ -22,7 +22,7 @@ var ( // per node, action type (reboot), and mode (report/active). RecoveryActionsTotal = prometheus.NewCounterVec( prometheus.CounterOpts{ - Name: "node_agent_recovery_actions_total", + Name: "civo_node_agent_recovery_actions_total", Help: "Total number of recovery actions performed.", }, []string{"node", "action", "mode"}, @@ -32,7 +32,7 @@ var ( // continuously unhealthy, in seconds. NodeUnhealthyDurationSeconds = prometheus.NewGaugeVec( prometheus.GaugeOpts{ - Name: "node_agent_node_unhealthy_duration_seconds", + Name: "civo_node_agent_node_unhealthy_duration_seconds", Help: "Duration in seconds a node has been continuously unhealthy.", }, []string{"node"}, @@ -42,7 +42,7 @@ var ( // The value is the numeric NodePhase (0=Healthy, 1=Unhealthy, etc.). 
RecoveryPhase = prometheus.NewGaugeVec( prometheus.GaugeOpts{ - Name: "node_agent_recovery_phase", + Name: "civo_node_agent_recovery_phase", Help: "Current recovery phase of a node.", }, []string{"node", "phase"}, From 86a245c90cd6b1e1f44e436e316a629e85025198 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Mon, 20 Apr 2026 17:00:19 +0900 Subject: [PATCH 49/71] docs: link to Civo GPU operator docs instead of inline NVIDIA device plugin instructions Co-Authored-By: Claude Opus 4.6 (1M context) --- README.md | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 62a812b..a3716b3 100644 --- a/README.md +++ b/README.md @@ -17,29 +17,11 @@ The `civo-api-access` secret is automatically provisioned by Civo in the `kube-s No manual setup is required — `node-agent` reads these values directly from the existing secret. -## Nvidia Device Plugin Install +## NVIDIA GPU Operator (GPU clusters only) -```bash -kubectl create ns gpu-operator -kubectl label namespace gpu-operator pod-security.kubernetes.io/enforce=privileged -kubectl label namespace gpu-operator pod-security.kubernetes.io/warn=privileged -kubectl label namespace gpu-operator pod-security.kubernetes.io/audit=privileged -``` +The GPU health check relies on the `nvidia.com/gpu.count` label added by the NVIDIA GPU Feature Discovery component. 
Follow the Civo documentation to install the NVIDIA GPU Operator on your cluster: -```bash -helm repo add nvdp https://nvidia.github.io/k8s-device-plugin \ -&& helm repo update -``` - -```bash -helm install --namespace gpu-operator nvidia-device-plugin nvdp/nvidia-device-plugin --create-namespace \ - --version=0.17.0 \ - --set gfd.enabled=true \ - --set devicePlugin.enabled=true \ - --set dcgm.enabled=true \ - --set nfd.enableNodeFeatureApi=true \ - --wait -``` +[Installing the NVIDIA GPU Operator](https://github.com/civo/docs/blob/main/content/docs/kubernetes/advanced/gpu-config.md#installing-the-nvidia-gpu-operator) ## Install `node-agent` chart From 4a66d5d601b7836f8f2c5ab8a2ee21b18441b69c Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 21 Apr 2026 01:35:49 +0900 Subject: [PATCH 50/71] feat: add civo_node_agent_recovery_failures_total metric Tracks failed recovery actions (e.g. Civo API errors during reboot), separating failures from successful attempts in recovery_actions_total. Cleaned up along with other per-node metrics on node removal. Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/metrics/metrics.go | 11 +++++++++++ pkg/watcher/watcher.go | 3 +++ 2 files changed, 14 insertions(+) diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index e452fb7..fce2ffa 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -28,6 +28,16 @@ var ( []string{"node", "action", "mode"}, ) + // RecoveryFailuresTotal counts the number of recovery actions that failed + // (e.g. Civo API errors). + RecoveryFailuresTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "civo_node_agent_recovery_failures_total", + Help: "Total number of recovery actions that failed.", + }, + []string{"node", "action"}, + ) + // NodeUnhealthyDurationSeconds tracks how long each node has been // continuously unhealthy, in seconds. 
NodeUnhealthyDurationSeconds = prometheus.NewGaugeVec( @@ -54,6 +64,7 @@ func Register() { prometheus.MustRegister( HealthCheckTotal, RecoveryActionsTotal, + RecoveryFailuresTotal, NodeUnhealthyDurationSeconds, RecoveryPhase, ) diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index 7d40d31..a2e293e 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -217,6 +217,7 @@ func (w *watcher) run(ctx context.Context) error { if !w.monitorOnly { if err := w.executor.Reboot(ctx, nodeName); err != nil { slog.Error("Failed to reboot node", "node", nodeName, "error", err) + metrics.RecoveryFailuresTotal.WithLabelValues(nodeName, "reboot").Inc() continue } } @@ -256,6 +257,7 @@ func (w *watcher) run(ctx context.Context) error { if !w.monitorOnly { if err := w.executor.Reboot(ctx, nodeName); err != nil { slog.Error("Failed to reboot node (retry)", "node", nodeName, "error", err) + metrics.RecoveryFailuresTotal.WithLabelValues(nodeName, "reboot").Inc() continue } } @@ -276,6 +278,7 @@ func (w *watcher) run(ctx context.Context) error { metrics.NodeUnhealthyDurationSeconds.DeleteLabelValues(name) metrics.HealthCheckTotal.DeletePartialMatch(prometheus.Labels{"node": name}) metrics.RecoveryActionsTotal.DeletePartialMatch(prometheus.Labels{"node": name}) + metrics.RecoveryFailuresTotal.DeletePartialMatch(prometheus.Labels{"node": name}) metrics.RecoveryPhase.DeletePartialMatch(prometheus.Labels{"node": name}) } return true From b4203b9e3ab12552d89cc0f6b7f0a9a107df840f Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 21 Apr 2026 01:47:26 +0900 Subject: [PATCH 51/71] feat: add civo_node_agent_info metric with version and cluster_id labels Standard Prometheus info metric pattern for tracking deployments and distinguishing clusters. Use with group_left to enrich other metrics. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- main.go | 1 + pkg/metrics/metrics.go | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/main.go b/main.go index 7ad5007..7004a38 100644 --- a/main.go +++ b/main.go @@ -53,6 +53,7 @@ func run(ctx context.Context) error { checkers := health.NewDefaultCheckers() metrics.Register() + metrics.Info.WithLabelValues(version, clusterID).Set(1) metricsServer := &http.Server{ Addr: ":" + metricsPortValue(metricsPort), Handler: metrics.Handler(), diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index fce2ffa..cd09620 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -57,6 +57,16 @@ var ( }, []string{"node", "phase"}, ) + + // Info exposes build and cluster identity as a constant gauge (value is always 1). + // Use PromQL joins (group_left) to enrich other metrics with version/cluster_id. + Info = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "civo_node_agent_info", + Help: "Build and cluster identity for the node-agent.", + }, + []string{"version", "cluster_id"}, + ) ) // Register registers all node-agent metrics with the default Prometheus registerer. @@ -67,6 +77,7 @@ func Register() { RecoveryFailuresTotal, NodeUnhealthyDurationSeconds, RecoveryPhase, + Info, ) } From 1a90b310a59f8d3628694f3353f418144ac2d9f1 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 21 Apr 2026 01:54:08 +0900 Subject: [PATCH 52/71] feat: delete info metric on graceful shutdown Ensures civo_node_agent_info is removed when the agent stops, signaling to Prometheus that this version/cluster instance is no longer active. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- main.go | 1 + 1 file changed, 1 insertion(+) diff --git a/main.go b/main.go index 7004a38..af28bb8 100644 --- a/main.go +++ b/main.go @@ -54,6 +54,7 @@ func run(ctx context.Context) error { metrics.Register() metrics.Info.WithLabelValues(version, clusterID).Set(1) + defer metrics.Info.DeleteLabelValues(version, clusterID) metricsServer := &http.Server{ Addr: ":" + metricsPortValue(metricsPort), Handler: metrics.Handler(), From 8ff99e2cb2add3001f074f4b80eaa874c69e2be4 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 21 Apr 2026 01:59:15 +0900 Subject: [PATCH 53/71] feat: add civo_node_agent_reconcile_errors_total metric Tracks reconcile loop errors (e.g. node list failures from the API server) to distinguish "agent alive but cannot do its job" from "agent down". Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/metrics/metrics.go | 11 +++++++++++ pkg/watcher/watcher.go | 1 + 2 files changed, 12 insertions(+) diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index cd09620..c099678 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -38,6 +38,16 @@ var ( []string{"node", "action"}, ) + // ReconcileErrorsTotal counts errors encountered during the reconcile loop, + // labeled by reason (e.g. "list_nodes"). + ReconcileErrorsTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "civo_node_agent_reconcile_errors_total", + Help: "Total number of errors encountered during the reconcile loop.", + }, + []string{"reason"}, + ) + // NodeUnhealthyDurationSeconds tracks how long each node has been // continuously unhealthy, in seconds. 
NodeUnhealthyDurationSeconds = prometheus.NewGaugeVec( @@ -75,6 +85,7 @@ func Register() { HealthCheckTotal, RecoveryActionsTotal, RecoveryFailuresTotal, + ReconcileErrorsTotal, NodeUnhealthyDurationSeconds, RecoveryPhase, Info, diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index a2e293e..9c4e28a 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -145,6 +145,7 @@ func (w *watcher) Run(ctx context.Context) error { func (w *watcher) run(ctx context.Context) error { nodes, err := w.nodeLister.List(labels.Everything()) if err != nil { + metrics.ReconcileErrorsTotal.WithLabelValues("list_nodes").Inc() return err } From 38a0657e3d1398d42f4f52618fd989373f89b2fa Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 21 Apr 2026 02:03:14 +0900 Subject: [PATCH 54/71] refactor: move watcher start log out of ticker loop Emit "Watcher reconcile loop started" once before the loop, not on every tick. The per-tick log was misleading (implied startup) and added noise. Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/watcher/watcher.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index 9c4e28a..0c97af4 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -129,10 +129,10 @@ func (w *watcher) Run(ctx context.Context) error { ticker := time.NewTicker(10 * time.Second) defer ticker.Stop() + slog.Info("Watcher reconcile loop started") for { select { case <-ticker.C: - slog.Info("Started the watcher process...") if err := w.run(ctx); err != nil { slog.Error("An error occurred while running the watcher process", "error", err) } From 4c0d7f3a859dec7e164821e25ae9e127d5528b96 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 21 Apr 2026 14:34:17 +0900 Subject: [PATCH 55/71] docs: update GPU operator link to Civo docs site Co-Authored-By: Claude Opus 4.6 (1M context) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 
a3716b3..205d1e0 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ No manual setup is required — `node-agent` reads these values directly from th The GPU health check relies on the `nvidia.com/gpu.count` label added by the NVIDIA GPU Feature Discovery component. Follow the Civo documentation to install the NVIDIA GPU Operator on your cluster: -[Installing the NVIDIA GPU Operator](https://github.com/civo/docs/blob/main/content/docs/kubernetes/advanced/gpu-config.md#installing-the-nvidia-gpu-operator) +[Installing the NVIDIA GPU Operator](https://www.civo.com/docs/kubernetes/advanced/gpu-config#installing-the-nvidia-gpu-operator) ## Install `node-agent` chart From 2dc6d69ce66a0b446768dee74811c8ab38da2750 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 21 Apr 2026 17:52:46 +0900 Subject: [PATCH 56/71] fix: suppress "Waiting for reboot effect" log in monitor-only mode In monitor-only mode no reboot actually happens, so logging per-tick "waiting for reboot effect" is misleading and noisy (6 logs/min per unhealthy node). "Reboot retry" still fires every rebootWait cycle as a liveness signal. Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/watcher/watcher.go | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index 0c97af4..0814375 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -241,12 +241,17 @@ func (w *watcher) run(ctx context.Context) error { rebootWait = w.gpuRebootWaitMinutes } if now.Sub(state.LastRebootTime()) < rebootWait*time.Minute { - slog.Info("Waiting for reboot effect", - "node", nodeName, - "elapsed", now.Sub(state.LastRebootTime()).String(), - "rebootWait", (rebootWait * time.Minute).String(), - "rebootCount", state.RebootCount(), - "isGPUNode", state.IsGPUNode()) + // In monitor-only mode no reboot actually happened, so logging + // "waiting for reboot effect" every tick would be misleading and noisy. 
+ // The "Reboot retry" log still fires once per rebootWait cycle as a liveness signal. + if !w.monitorOnly { + slog.Info("Waiting for reboot effect", + "node", nodeName, + "elapsed", now.Sub(state.LastRebootTime()).String(), + "rebootWait", (rebootWait * time.Minute).String(), + "rebootCount", state.RebootCount(), + "isGPUNode", state.IsGPUNode()) + } continue } From 9dfb942e244616b7aa926aef4e07bfb16ced62a0 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 21 Apr 2026 20:57:24 +0900 Subject: [PATCH 57/71] docs: sync AGENTS.md with current implementation Previous version described the pre-refactor implementation (sync.Map, single time window, CIVO_NODE_DESIRED_GPU_COUNT etc.). Updated to reflect: - State machine (pkg/watcher/state.go) - Health checker package and per-checker thresholds - Executor/NopExecutor abstraction - Current env var names (CIVO_NODE_POOL_IDS, reboot wait, monitor-only) - Prometheus metrics - Known limitations Also simplify the "Waiting for reboot effect" comment. Co-Authored-By: Claude Opus 4.6 (1M context) --- AGENTS.md | 120 +++++++++++++++++++++++++++++++++-------- pkg/watcher/watcher.go | 2 +- 2 files changed, 99 insertions(+), 23 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 9ddd3b3..5768080 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -4,13 +4,15 @@ This file provides guidance to AI coding agents when working with code in this r ## Project Overview -Kubernetes node agent for Civo cloud that monitors cluster nodes and triggers automatic hard reboots via the Civo API when nodes become NotReady or lose expected GPU capacity. Deployed as a single-replica Deployment in kube-system via Helm. +Kubernetes node agent for Civo cloud that monitors cluster nodes and triggers automatic recovery actions (currently hard reboot via the Civo API) when nodes fail health checks. Deployed as a single-replica Deployment in `kube-system` via Helm. + +By default the agent runs in **monitor-only mode** (logs recovery actions without executing them). 
Set `CIVO_NODE_AGENT_MONITOR_ONLY=false` (or `monitorOnly: false` in Helm) to enable actual reboots. ## Build & Test Commands ```bash -# Build -go build -o node-agent ./ +# Build (CGO disabled — no C dependencies) +CGO_ENABLED=0 go build -o civo-node-agent ./ # Run all tests go test ./... @@ -18,6 +20,11 @@ go test ./... # Run a single test go test ./pkg/watcher/ -run TestName +# Before completing any task +go fmt ./... +go vet ./... +go test ./... + # Build Docker image (dry-run) goreleaser release --snapshot --skip=publish --clean ``` @@ -26,34 +33,103 @@ No linter is configured in CI. ## Architecture -**Entrypoint** (`main.go`): Reads env vars, sets up JSON structured logging (slog), creates a Watcher, and runs it with graceful SIGTERM/SIGINT shutdown. - -**Core package** (`pkg/watcher/`): -- `watcher.go` — Main loop polls every 10 seconds. For each node matching the node pool label (`kubernetes.civo.com/civo-node-pool={nodePoolID}`), checks if the node is NotReady or has fewer GPUs than desired. If a reboot is warranted (and cooldown window hasn't elapsed), calls `HardRebootInstance` via the Civo API. -- `options.go` — Functional options pattern (`WithKubernetesClient`, `WithCivoClient`, etc.) for dependency injection and configuration. -- `fake.go` — `FakeClient` implementing `civogo.Clienter` for testing. -- `watcher_test.go` — Tests use fake Kubernetes client (`k8s.io/client-go/kubernetes/fake`) and `FakeClient` for Civo API. - -**Reboot safeguards**: Tracks last reboot time per node in a `sync.Map`. Skips reboot if the node's Ready/NotReady condition transitioned recently or a reboot command was sent within the configurable time window (default 40 minutes). - -## Required Environment Variables - -`CIVO_API_KEY`, `CIVO_REGION`, `CIVO_CLUSTER_ID`, `CIVO_NODE_POOL_ID` — see `.env.example`. - -Optional: `CIVO_API_URL`, `CIVO_NODE_DESIRED_GPU_COUNT`, `CIVO_NODE_REBOOT_TIME_WINDOW_MINUTES`. 
+**Entrypoint** (`main.go`): Reads env vars + `--kubeconfig` flag, sets up JSON structured logging (slog), registers Prometheus metrics, starts the metrics HTTP server, constructs an Executor + Checkers + Watcher, and runs the watcher with graceful SIGTERM/SIGINT shutdown. + +### Packages + +- **`pkg/watcher/`** — Orchestrator. + - `watcher.go` — Sets up a Node Informer (filtered by optional node pool label selector), runs a 10s ticker reconcile loop. + - `state.go` — `NodePhase` enum (`Unknown`, `Healthy`, `Unhealthy`, `WaitingReboot`, future: `Drain`, `Replace`), `NodeState` with private fields + getters, `StateStore` with transition methods (`MarkUnhealthy`, `MarkWaitingReboot`, `Reset`, `Cleanup`). + - `options.go` — Functional options (`WithExecutor`, `WithCheckers`, `WithMonitorOnly`, `WithNodePoolIDs`, etc.). Test-only options are unexported (`withNowFunc`, `withNodeLister`). +- **`pkg/health/`** — Health checkers. + - `HealthChecker` interface returns `(healthy bool, reason string)` plus `Threshold()`. + - `nodeReadyChecker` (5min), `diskPressureChecker` (30min), `ciliumChecker` (10min, skips non-Cilium CNI via `NetworkUnavailable` reason), `gpuChecker` (10min, uses `nvidia.com/gpu.count` label vs allocatable GPU count; auto-skips non-GPU nodes). + - `HasGPU(node)` helper — reads `nvidia.com/gpu.count` label; used by watcher to mark nodes as GPU for reboot-wait differentiation. +- **`pkg/operation/`** — Recovery executors. + - `Executor` interface with `Reboot(ctx, nodeName)`. + - `civoExecutor` implements via Civo API (`FindKubernetesClusterInstance` + `HardRebootInstance`). + - `nopExecutor` is the safe default to prevent nil-pointer dereference. +- **`pkg/metrics/`** — Prometheus metrics (all `civo_` prefixed). + +### Reconcile loop + +For each node matched by the label selector (or all nodes if `nodePoolIDs` is empty): + +1. Run each `HealthChecker`; record `civo_node_agent_health_check_total{node, checker, result}` with the checker's reason as result. 
+2. If all checkers pass and the node was previously unhealthy → `Reset`, log `Node recovered`, update phase metrics. +3. If any checker failed: + - Track `isGPUNode` (from `nvidia.com/gpu.count` label). + - Compute `minThreshold` across failed checkers. + - State transitions: + - `Healthy → Unhealthy`: mark + log `Node unhealthy detected`. + - `Unhealthy → WaitingReboot`: after `minThreshold` elapsed, optionally call `executor.Reboot` (skipped in monitor-only), log `Reboot initiated`, increment `recovery_actions_total`. + - `WaitingReboot → WaitingReboot`: after `rebootWaitMinutes` (standard) or `gpuRebootWaitMinutes` (GPU), retry reboot, log `Reboot retry`. In monitor-only the per-tick `Waiting for reboot effect` log is suppressed. +4. Cleanup state and Prometheus labels for nodes no longer in the cluster. + +### Reboot safeguards + +- `monitorOnly=true` by default (fail-safe). +- Per-node cooldown via `rebootWaitMinutes` / `gpuRebootWaitMinutes`. +- `NopExecutor` default prevents accidental reboots when no executor is configured. + +**Known limitations (tracked as TODOs / future PRs):** + +- No cluster-wide blast-radius protection (no concurrent-reboot cap, no unhealthy-rate circuit breaker, no PDB awareness). Required before enabling `monitorOnly=false` in production. +- Standard nodes retry reboot indefinitely; `PhaseDrain → PhaseReplace` is defined but not wired up. +- Civo API calls do not propagate `context.Context` (civogo library limitation). `Reboot()` discards the ctx; a hung API call blocks the reconcile tick. +- No retry/backoff on Civo API errors. + +## Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `CIVO_API_KEY` | — | Civo API key (read from `civo-api-access` secret in Helm deployment). | +| `CIVO_API_URL` | — | Civo API URL. | +| `CIVO_CLUSTER_ID` | — | Civo cluster ID (exposed as label in `civo_node_agent_info`). | +| `CIVO_REGION` | — | Civo region. 
| +| `CIVO_NODE_POOL_IDS` | empty | Comma-separated node pool IDs. Empty = watch all nodes. | +| `CIVO_NODE_AGENT_MONITOR_ONLY` | `true` | Monitor-only mode (log but don't reboot). | +| `CIVO_NODE_AGENT_METRICS_PORT` | `9625` | Prometheus metrics HTTP port (validated to 1024–65535). | +| `CIVO_NODE_REBOOT_WAIT_MINUTES` | `10` | Reboot wait between retries for standard nodes. | +| `CIVO_GPU_NODE_REBOOT_WAIT_MINUTES` | `40` | Reboot wait between retries for GPU nodes. | + +## Command Line Flags + +| Flag | Default | Description | +|------|---------|-------------| +| `--kubeconfig` | `/etc/rancher/k3s/k3s.yaml` | Path to kubeconfig. Empty = in-cluster config. The Helm Deployment passes `--kubeconfig=""`. | +| `--version` | | Print the agent version and exit. | + +## Prometheus Metrics + +All metrics use the `civo_` prefix. + +| Metric | Type | Labels | +|--------|------|--------| +| `civo_node_agent_info` | Gauge (always 1) | `version`, `cluster_id` | +| `civo_node_agent_health_check_total` | Counter | `node`, `checker`, `result` (low-cardinality reason) | +| `civo_node_agent_recovery_actions_total` | Counter | `node`, `action`, `mode` (`monitor` / `active`) | +| `civo_node_agent_recovery_failures_total` | Counter | `node`, `action` | +| `civo_node_agent_reconcile_errors_total` | Counter | `reason` | +| `civo_node_agent_node_unhealthy_duration_seconds` | Gauge | `node` | +| `civo_node_agent_recovery_phase` | Gauge | `node`, `phase` (value = 1 for current phase, 0 for others) | + +Per-node metric labels are cleaned up (`DeletePartialMatch`) when a node is removed from the cluster. `civo_node_agent_info` is explicitly deleted on graceful shutdown. ## Deployment -Helm chart in `charts/`. Secrets are expected in `civo-node-agent` and `civo-api-access` Kubernetes secrets. - ```bash helm upgrade -n kube-system --install node-agent ./charts ``` +- API credentials come from the existing `civo-api-access` secret (auto-provisioned by Civo on every Civo Kubernetes cluster). 
+- Non-sensitive config lives in `values.yaml` (`nodePoolIDs`, `rebootWaitMinutes`, `gpuRebootWaitMinutes`, `monitorOnly`, `metricsPort`). + ## Key Dependencies -- `github.com/civo/civogo` — Civo cloud API client -- `k8s.io/client-go` — Kubernetes client (in-cluster config by default) +- `github.com/civo/civogo` — Civo cloud API client (no context support, used with goroutine+timeout workaround TBD). +- `k8s.io/client-go` — Kubernetes client. Uses a SharedInformer filtered by `kubernetes.civo.com/civo-node-pool` label when `CIVO_NODE_POOL_IDS` is set. +- `github.com/prometheus/client_golang` — Prometheus instrumentation. ## Release diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index 0814375..14501c4 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -242,7 +242,7 @@ func (w *watcher) run(ctx context.Context) error { } if now.Sub(state.LastRebootTime()) < rebootWait*time.Minute { // In monitor-only mode no reboot actually happened, so logging - // "waiting for reboot effect" every tick would be misleading and noisy. + // "waiting for reboot effect" every tick would be noisy. // The "Reboot retry" log still fires once per rebootWait cycle as a liveness signal. if !w.monitorOnly { slog.Info("Waiting for reboot effect", From e64cb2a7f081c704d22f0e643bd07f90d2fe9275 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 21 Apr 2026 21:00:15 +0900 Subject: [PATCH 58/71] fix: log invalid MonitorOnly value instead of silently ignoring Matches the pattern used by WithRebootWaitMinutes and WithGPURebootWaitMinutes: log an Info message when the input cannot be parsed, so misconfigurations are visible at startup. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/watcher/options.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/watcher/options.go b/pkg/watcher/options.go index 5bd8dd9..1adcf21 100644 --- a/pkg/watcher/options.go +++ b/pkg/watcher/options.go @@ -85,6 +85,8 @@ func WithMonitorOnly(s string) Option { return func(w *watcher) { if v, err := strconv.ParseBool(s); err == nil { w.monitorOnly = v + } else { + slog.Info("MonitorOnly is invalid", "value", s) } } } From 375215838791feca66e64823f53b985fa71aed2e Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 21 Apr 2026 21:09:10 +0900 Subject: [PATCH 59/71] docs: simplify AGENTS.md and clarify deployment target - Remove sections that duplicate what's in the code (env vars, flags, metrics, reconcile loop detail, dependencies, etc.). Readers can find these by reading the source. - Clarify that the recommended deployment is a daemon process on the control plane VM, not a Kubernetes Pod. The Helm chart is available but secondary. - Keep: project overview, build/test commands, package-level architecture, design conventions, known limitations (important for AI agents to avoid premature enabling of active reboots), release process. Co-Authored-By: Claude Opus 4.6 (1M context) --- AGENTS.md | 128 ++++++++++++------------------------------------------ 1 file changed, 28 insertions(+), 100 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 5768080..eb5a823 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -4,29 +4,29 @@ This file provides guidance to AI coding agents when working with code in this r ## Project Overview -Kubernetes node agent for Civo cloud that monitors cluster nodes and triggers automatic recovery actions (currently hard reboot via the Civo API) when nodes fail health checks. Deployed as a single-replica Deployment in `kube-system` via Helm. 
+`civo-node-agent` monitors Kubernetes nodes in a Civo cluster and triggers automatic recovery actions (currently hard reboot via the Civo API) when nodes fail health checks. -By default the agent runs in **monitor-only mode** (logs recovery actions without executing them). Set `CIVO_NODE_AGENT_MONITOR_ONLY=false` (or `monitorOnly: false` in Helm) to enable actual reboots. +### Deployment target + +**The recommended deployment model is a daemon process running on the control plane VM**, installed and managed outside this repository (e.g. by a provisioning script that downloads the binary to the VM and registers it with the init system). + +A Helm chart (`charts/`) is included for running the agent as a single-replica Deployment in `kube-system`, but **this is not the primary or recommended deployment mode**. When making design decisions, prioritize the CP VM daemon use case (e.g. kubeconfig path, logging, filesystem assumptions). + +By default the agent runs in **monitor-only mode** (logs recovery actions without executing them). Set `CIVO_NODE_AGENT_MONITOR_ONLY=false` to enable actual reboots. ## Build & Test Commands ```bash -# Build (CGO disabled — no C dependencies) +# Build (CGO disabled — no C dependencies; required for static binary on the CP VM) CGO_ENABLED=0 go build -o civo-node-agent ./ # Run all tests go test ./... -# Run a single test -go test ./pkg/watcher/ -run TestName - -# Before completing any task +# Before completing any task, always run: go fmt ./... go vet ./... go test ./... - -# Build Docker image (dry-run) -goreleaser release --snapshot --skip=publish --clean ``` No linter is configured in CI. @@ -37,100 +37,28 @@ No linter is configured in CI. ### Packages -- **`pkg/watcher/`** — Orchestrator. - - `watcher.go` — Sets up a Node Informer (filtered by optional node pool label selector), runs a 10s ticker reconcile loop. 
- - `state.go` — `NodePhase` enum (`Unknown`, `Healthy`, `Unhealthy`, `WaitingReboot`, future: `Drain`, `Replace`), `NodeState` with private fields + getters, `StateStore` with transition methods (`MarkUnhealthy`, `MarkWaitingReboot`, `Reset`, `Cleanup`). - - `options.go` — Functional options (`WithExecutor`, `WithCheckers`, `WithMonitorOnly`, `WithNodePoolIDs`, etc.). Test-only options are unexported (`withNowFunc`, `withNodeLister`). -- **`pkg/health/`** — Health checkers. - - `HealthChecker` interface returns `(healthy bool, reason string)` plus `Threshold()`. - - `nodeReadyChecker` (5min), `diskPressureChecker` (30min), `ciliumChecker` (10min, skips non-Cilium CNI via `NetworkUnavailable` reason), `gpuChecker` (10min, uses `nvidia.com/gpu.count` label vs allocatable GPU count; auto-skips non-GPU nodes). - - `HasGPU(node)` helper — reads `nvidia.com/gpu.count` label; used by watcher to mark nodes as GPU for reboot-wait differentiation. -- **`pkg/operation/`** — Recovery executors. - - `Executor` interface with `Reboot(ctx, nodeName)`. - - `civoExecutor` implements via Civo API (`FindKubernetesClusterInstance` + `HardRebootInstance`). - - `nopExecutor` is the safe default to prevent nil-pointer dereference. -- **`pkg/metrics/`** — Prometheus metrics (all `civo_` prefixed). - -### Reconcile loop - -For each node matched by the label selector (or all nodes if `nodePoolIDs` is empty): - -1. Run each `HealthChecker`; record `civo_node_agent_health_check_total{node, checker, result}` with the checker's reason as result. -2. If all checkers pass and the node was previously unhealthy → `Reset`, log `Node recovered`, update phase metrics. -3. If any checker failed: - - Track `isGPUNode` (from `nvidia.com/gpu.count` label). - - Compute `minThreshold` across failed checkers. - - State transitions: - - `Healthy → Unhealthy`: mark + log `Node unhealthy detected`. 
- - `Unhealthy → WaitingReboot`: after `minThreshold` elapsed, optionally call `executor.Reboot` (skipped in monitor-only), log `Reboot initiated`, increment `recovery_actions_total`. - - `WaitingReboot → WaitingReboot`: after `rebootWaitMinutes` (standard) or `gpuRebootWaitMinutes` (GPU), retry reboot, log `Reboot retry`. In monitor-only the per-tick `Waiting for reboot effect` log is suppressed. -4. Cleanup state and Prometheus labels for nodes no longer in the cluster. - -### Reboot safeguards - -- `monitorOnly=true` by default (fail-safe). -- Per-node cooldown via `rebootWaitMinutes` / `gpuRebootWaitMinutes`. -- `NopExecutor` default prevents accidental reboots when no executor is configured. - -**Known limitations (tracked as TODOs / future PRs):** - -- No cluster-wide blast-radius protection (no concurrent-reboot cap, no unhealthy-rate circuit breaker, no PDB awareness). Required before enabling `monitorOnly=false` in production. -- Standard nodes retry reboot indefinitely; `PhaseDrain → PhaseReplace` is defined but not wired up. -- Civo API calls do not propagate `context.Context` (civogo library limitation). `Reboot()` discards the ctx; a hung API call blocks the reconcile tick. -- No retry/backoff on Civo API errors. - -## Environment Variables - -| Variable | Default | Description | -|----------|---------|-------------| -| `CIVO_API_KEY` | — | Civo API key (read from `civo-api-access` secret in Helm deployment). | -| `CIVO_API_URL` | — | Civo API URL. | -| `CIVO_CLUSTER_ID` | — | Civo cluster ID (exposed as label in `civo_node_agent_info`). | -| `CIVO_REGION` | — | Civo region. | -| `CIVO_NODE_POOL_IDS` | empty | Comma-separated node pool IDs. Empty = watch all nodes. | -| `CIVO_NODE_AGENT_MONITOR_ONLY` | `true` | Monitor-only mode (log but don't reboot). | -| `CIVO_NODE_AGENT_METRICS_PORT` | `9625` | Prometheus metrics HTTP port (validated to 1024–65535). | -| `CIVO_NODE_REBOOT_WAIT_MINUTES` | `10` | Reboot wait between retries for standard nodes. 
| -| `CIVO_GPU_NODE_REBOOT_WAIT_MINUTES` | `40` | Reboot wait between retries for GPU nodes. | - -## Command Line Flags - -| Flag | Default | Description | -|------|---------|-------------| -| `--kubeconfig` | `/etc/rancher/k3s/k3s.yaml` | Path to kubeconfig. Empty = in-cluster config. The Helm Deployment passes `--kubeconfig=""`. | -| `--version` | | Print the agent version and exit. | - -## Prometheus Metrics - -All metrics use the `civo_` prefix. - -| Metric | Type | Labels | -|--------|------|--------| -| `civo_node_agent_info` | Gauge (always 1) | `version`, `cluster_id` | -| `civo_node_agent_health_check_total` | Counter | `node`, `checker`, `result` (low-cardinality reason) | -| `civo_node_agent_recovery_actions_total` | Counter | `node`, `action`, `mode` (`monitor` / `active`) | -| `civo_node_agent_recovery_failures_total` | Counter | `node`, `action` | -| `civo_node_agent_reconcile_errors_total` | Counter | `reason` | -| `civo_node_agent_node_unhealthy_duration_seconds` | Gauge | `node` | -| `civo_node_agent_recovery_phase` | Gauge | `node`, `phase` (value = 1 for current phase, 0 for others) | - -Per-node metric labels are cleaned up (`DeletePartialMatch`) when a node is removed from the cluster. `civo_node_agent_info` is explicitly deleted on graceful shutdown. - -## Deployment +- **`pkg/watcher/`** — Orchestrator. Sets up a Node Informer (filtered by optional node pool label selector) and runs a 10s ticker reconcile loop driving a state machine (`Unknown → Healthy → Unhealthy → WaitingReboot`). +- **`pkg/health/`** — Health checkers (`HealthChecker` interface: `Name()`, `Check() (healthy, reason)`, `Threshold()`). +- **`pkg/operation/`** — Recovery executors (`Executor` interface; `civoExecutor` for Civo API, `nopExecutor` as safe default). +- **`pkg/metrics/`** — Prometheus metrics (all `civo_` prefixed). Defined once in `metrics.go`. 
-```bash -helm upgrade -n kube-system --install node-agent ./charts -``` +### Design conventions + +- Package-boundary types are exposed via interfaces; concrete structs are unexported. +- `NodeState` fields are private, mutated only through `StateStore` transition methods. +- Functional options for configuration. Test-only options are unexported (`withNowFunc`, `withNodeLister`). +- All timestamps stored in UTC (`nowFunc` defaults to `time.Now().UTC()`). +- Tests follow the Civo Go testing conventions (`description` field, verb-driven descriptions, `test` iterator, mock init inside `t.Run`). -- API credentials come from the existing `civo-api-access` secret (auto-provisioned by Civo on every Civo Kubernetes cluster). -- Non-sensitive config lives in `values.yaml` (`nodePoolIDs`, `rebootWaitMinutes`, `gpuRebootWaitMinutes`, `monitorOnly`, `metricsPort`). +## Known Limitations -## Key Dependencies +The following are intentionally not implemented in the current state and must be addressed before enabling `monitorOnly=false` in production or expanding recovery beyond reboot: -- `github.com/civo/civogo` — Civo cloud API client (no context support, used with goroutine+timeout workaround TBD). -- `k8s.io/client-go` — Kubernetes client. Uses a SharedInformer filtered by `kubernetes.civo.com/civo-node-pool` label when `CIVO_NODE_POOL_IDS` is set. -- `github.com/prometheus/client_golang` — Prometheus instrumentation. +- **No cluster-wide blast-radius protection.** There is no concurrent-reboot cap, no unhealthy-rate circuit breaker, and no PDB awareness. A cluster-wide outage (CNI glitch, object storage failure, region issue) could cause every node to be rebooted simultaneously. +- **Standard nodes retry reboot indefinitely.** `PhaseDrain → PhaseReplace` is defined in the state machine but not wired up. A persistently broken node will be rebooted forever with no upper bound on `rebootCount`. 
+- **Civo API calls do not propagate `context.Context`.** The civogo library does not accept a context, and `Reboot()` currently discards it. A hung API call will block the reconcile tick. +- **No retry/backoff on Civo API errors.** A failed reboot is retried immediately on the next tick, which can hammer the Civo API during an outage. ## Release -Tags matching `v*.*.*` trigger `.github/workflows/release-image.yaml`, which builds multi-arch Docker images via goreleaser and publishes to Docker Hub. +Tags matching `v*.*.*` trigger `.github/workflows/release-image.yaml`, which builds multi-arch Docker images via goreleaser and publishes to Docker Hub. The same binary is also uploaded to Civo object storage for CP VM installations (handled outside this repository). From 1207ff503e7aacd2d8c10e2756c9f37bdba5782a Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 21 Apr 2026 21:15:36 +0900 Subject: [PATCH 60/71] docs: trim AGENTS.md to essentials MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove Known Limitations and Design Conventions sections — these belong in issue tracker / PR descriptions, not in the agent guide. Soften the deployment description: daemon on CP VM is preferred, Helm Deployment is also supported. Co-Authored-By: Claude Opus 4.6 (1M context) --- AGENTS.md | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index eb5a823..220ed90 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -6,11 +6,9 @@ This file provides guidance to AI coding agents when working with code in this r `civo-node-agent` monitors Kubernetes nodes in a Civo cluster and triggers automatic recovery actions (currently hard reboot via the Civo API) when nodes fail health checks. -### Deployment target +### Deployment -**The recommended deployment model is a daemon process running on the control plane VM**, installed and managed outside this repository (e.g. 
by a provisioning script that downloads the binary to the VM and registers it with the init system). - -A Helm chart (`charts/`) is included for running the agent as a single-replica Deployment in `kube-system`, but **this is not the primary or recommended deployment mode**. When making design decisions, prioritize the CP VM daemon use case (e.g. kubeconfig path, logging, filesystem assumptions). +`civo-node-agent` is designed to run as a daemon process on the control plane VM, which is the preferred deployment. A Helm chart (`charts/`) is also provided so it can run as a single-replica Deployment in `kube-system` if needed. By default the agent runs in **monitor-only mode** (logs recovery actions without executing them). Set `CIVO_NODE_AGENT_MONITOR_ONLY=false` to enable actual reboots. @@ -42,23 +40,6 @@ No linter is configured in CI. - **`pkg/operation/`** — Recovery executors (`Executor` interface; `civoExecutor` for Civo API, `nopExecutor` as safe default). - **`pkg/metrics/`** — Prometheus metrics (all `civo_` prefixed). Defined once in `metrics.go`. -### Design conventions - -- Package-boundary types are exposed via interfaces; concrete structs are unexported. -- `NodeState` fields are private, mutated only through `StateStore` transition methods. -- Functional options for configuration. Test-only options are unexported (`withNowFunc`, `withNodeLister`). -- All timestamps stored in UTC (`nowFunc` defaults to `time.Now().UTC()`). -- Tests follow the Civo Go testing conventions (`description` field, verb-driven descriptions, `test` iterator, mock init inside `t.Run`). - -## Known Limitations - -The following are intentionally not implemented in the current state and must be addressed before enabling `monitorOnly=false` in production or expanding recovery beyond reboot: - -- **No cluster-wide blast-radius protection.** There is no concurrent-reboot cap, no unhealthy-rate circuit breaker, and no PDB awareness. 
A cluster-wide outage (CNI glitch, object storage failure, region issue) could cause every node to be rebooted simultaneously. -- **Standard nodes retry reboot indefinitely.** `PhaseDrain → PhaseReplace` is defined in the state machine but not wired up. A persistently broken node will be rebooted forever with no upper bound on `rebootCount`. -- **Civo API calls do not propagate `context.Context`.** The civogo library does not accept a context, and `Reboot()` currently discards it. A hung API call will block the reconcile tick. -- **No retry/backoff on Civo API errors.** A failed reboot is retried immediately on the next tick, which can hammer the Civo API during an outage. - ## Release Tags matching `v*.*.*` trigger `.github/workflows/release-image.yaml`, which builds multi-arch Docker images via goreleaser and publishes to Docker Hub. The same binary is also uploaded to Civo object storage for CP VM installations (handled outside this repository). From 640596cd06be7894ce1be1c87678e6c94b7cca72 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 21 Apr 2026 22:03:04 +0900 Subject: [PATCH 61/71] feat: add PhaseFailed and reboot retry limit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After maxRebootRetries reboots fail to recover a node, the state machine transitions to PhaseFailed and stops retrying. The node remains in Failed until it naturally recovers (all health checkers pass), at which point the existing recovery path resets it to Healthy. - Add PhaseFailed to NodePhase enum and StateStore.MarkFailed() - MarkWaitingReboot now takes a countReboot bool; monitor-only mode passes false so rebootCount tracks only real reboots issued to the Civo API - Add WithMaxRebootRetries option (default 5) and CIVO_NODE_MAX_REBOOT_RETRIES env var - Update Helm chart (maxRebootRetries in values.yaml and deployment env) - Update README with the new value Applied to both standard and GPU nodes to avoid infinite reboot loops. 
A future PR can wire up PhaseDrain → PhaseReplace for standard nodes. Co-Authored-By: Claude Opus 4.6 (1M context) --- README.md | 1 + charts/templates/deployment.yaml | 2 ++ charts/values.yaml | 3 +++ main.go | 2 ++ pkg/watcher/options.go | 14 ++++++++++++++ pkg/watcher/state.go | 26 ++++++++++++++++++++++---- pkg/watcher/state_test.go | 15 +++++++++++---- pkg/watcher/watcher.go | 31 +++++++++++++++++++++++++------ 8 files changed, 80 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 205d1e0..ff50953 100644 --- a/README.md +++ b/README.md @@ -46,6 +46,7 @@ helm upgrade -n kube-system --install node-agent ./charts --set monitorOnly=fals | `nodePoolIDs` | `""` | Comma-separated node pool IDs to watch. Empty means all nodes. | | `rebootWaitMinutes` | `10` | Minutes to wait after rebooting a standard node before retrying. | | `gpuRebootWaitMinutes` | `40` | Minutes to wait after rebooting a GPU node before retrying. | +| `maxRebootRetries` | `5` | Maximum reboot attempts before the node transitions to `Failed` (no further reboots). | | `monitorOnly` | `true` | If `true`, log recovery actions without executing them. Set `false` to enable reboots. | | `metricsPort` | `9625` | Port for the Prometheus metrics endpoint. 
| diff --git a/charts/templates/deployment.yaml b/charts/templates/deployment.yaml index a8299dd..3c2626a 100644 --- a/charts/templates/deployment.yaml +++ b/charts/templates/deployment.yaml @@ -62,6 +62,8 @@ spec: value: {{ .Values.rebootWaitMinutes | quote }} - name: CIVO_GPU_NODE_REBOOT_WAIT_MINUTES value: {{ .Values.gpuRebootWaitMinutes | quote }} + - name: CIVO_NODE_MAX_REBOOT_RETRIES + value: {{ .Values.maxRebootRetries | quote }} - name: CIVO_NODE_AGENT_MONITOR_ONLY value: {{ .Values.monitorOnly | quote }} - name: CIVO_NODE_AGENT_METRICS_PORT diff --git a/charts/values.yaml b/charts/values.yaml index 9563079..3551e88 100644 --- a/charts/values.yaml +++ b/charts/values.yaml @@ -15,6 +15,9 @@ rebootWaitMinutes: 10 # Reboot wait time for GPU nodes (minutes). gpuRebootWaitMinutes: 40 +# Maximum number of reboot attempts before a node transitions to PhaseFailed. +maxRebootRetries: 5 + # Monitor-only mode: log recovery actions without executing them. monitorOnly: true diff --git a/main.go b/main.go index af28bb8..9b71d9d 100644 --- a/main.go +++ b/main.go @@ -33,6 +33,7 @@ var ( nodePoolIDs = strings.TrimSpace(os.Getenv("CIVO_NODE_POOL_IDS")) rebootWaitMinutes = strings.TrimSpace(os.Getenv("CIVO_NODE_REBOOT_WAIT_MINUTES")) gpuRebootWaitMinutes = strings.TrimSpace(os.Getenv("CIVO_GPU_NODE_REBOOT_WAIT_MINUTES")) + maxRebootRetries = strings.TrimSpace(os.Getenv("CIVO_NODE_MAX_REBOOT_RETRIES")) monitorOnly = strings.TrimSpace(os.Getenv("CIVO_NODE_AGENT_MONITOR_ONLY")) metricsPort = strings.TrimSpace(os.Getenv("CIVO_NODE_AGENT_METRICS_PORT")) ) @@ -81,6 +82,7 @@ func run(ctx context.Context) error { watcher.WithMonitorOnly(monitorOnly), watcher.WithRebootWaitMinutes(rebootWaitMinutes), watcher.WithGPURebootWaitMinutes(gpuRebootWaitMinutes), + watcher.WithMaxRebootRetries(maxRebootRetries), ) if err != nil { return err diff --git a/pkg/watcher/options.go b/pkg/watcher/options.go index 1adcf21..9400bae 100644 --- a/pkg/watcher/options.go +++ b/pkg/watcher/options.go @@ 
-20,6 +20,7 @@ var defaultOptions = []Option{ WithExecutor(operation.NewNopExecutor()), WithRebootWaitMinutes("10"), WithGPURebootWaitMinutes("40"), + WithMaxRebootRetries("5"), } // WithKubernetesClient returns Option to set Kubernetes API client. @@ -78,6 +79,19 @@ func WithGPURebootWaitMinutes(s string) Option { } } +// WithMaxRebootRetries returns Option to set the maximum number of reboot +// attempts before a node transitions to PhaseFailed. +func WithMaxRebootRetries(s string) Option { + return func(w *watcher) { + n, err := strconv.Atoi(s) + if err == nil && n > 0 { + w.maxRebootRetries = n + } else { + slog.Info("MaxRebootRetries is invalid", "value", s) + } + } +} + // WithMonitorOnly returns Option to enable or disable monitor-only mode. // Accepts a string parsable by strconv.ParseBool (e.g. "true", "false", "1", "0"). // Empty or unparsable values are ignored (default: true). diff --git a/pkg/watcher/state.go b/pkg/watcher/state.go index 2621eac..0e18d76 100644 --- a/pkg/watcher/state.go +++ b/pkg/watcher/state.go @@ -16,6 +16,7 @@ const ( PhaseWaitingReboot // 4 - waiting for reboot to take effect PhaseDrain // 5 - future: draining pods PhaseReplace // 6 - future: replace issued + PhaseFailed // 7 - recovery gave up (exceeded retries); awaits manual intervention or natural recovery ) // String returns the string representation of a NodePhase. @@ -35,6 +36,8 @@ func (p NodePhase) String() string { return "Drain" case PhaseReplace: return "Replace" + case PhaseFailed: + return "Failed" default: return "Unknown" } @@ -137,9 +140,10 @@ func (s *StateStore) MarkUnhealthy(name string, now time.Time) { st.unhealthySince = now } -// MarkWaitingReboot transitions a node to PhaseWaitingReboot, -// records the reboot time, and increments the reboot counter. -func (s *StateStore) MarkWaitingReboot(name string, now time.Time) { +// MarkWaitingReboot transitions a node to PhaseWaitingReboot and records the reboot time. 
+// When countReboot is true, the reboot counter is incremented. Pass false in monitor-only +// mode where no actual reboot was issued. +func (s *StateStore) MarkWaitingReboot(name string, now time.Time, countReboot bool) { s.mu.Lock() defer s.mu.Unlock() @@ -149,7 +153,21 @@ func (s *StateStore) MarkWaitingReboot(name string, now time.Time) { } st.phase = PhaseWaitingReboot st.lastRebootTime = now - st.rebootCount++ + if countReboot { + st.rebootCount++ + } +} + +// MarkFailed transitions a node to PhaseFailed after recovery attempts were exhausted. +func (s *StateStore) MarkFailed(name string) { + s.mu.Lock() + defer s.mu.Unlock() + + st, ok := s.nodes[name] + if !ok { + return + } + st.phase = PhaseFailed } // Reset replaces the node's state with a fresh PhaseHealthy entry. diff --git a/pkg/watcher/state_test.go b/pkg/watcher/state_test.go index ccfe3a6..7426988 100644 --- a/pkg/watcher/state_test.go +++ b/pkg/watcher/state_test.go @@ -143,7 +143,7 @@ func TestStateStoreMarkWaitingReboot(t *testing.T) { s.GetOrCreate("node-01") now := time.Date(2026, 4, 13, 12, 0, 0, 0, time.UTC) - s.MarkWaitingReboot("node-01", now) + s.MarkWaitingReboot("node-01", now, true) st, _ := s.Get("node-01") if st.Phase() != PhaseWaitingReboot { @@ -158,7 +158,7 @@ func TestStateStoreMarkWaitingReboot(t *testing.T) { // Retry increments count. later := now.Add(time.Hour) - s.MarkWaitingReboot("node-01", later) + s.MarkWaitingReboot("node-01", later, true) st, _ = s.Get("node-01") if st.RebootCount() != 2 { @@ -167,12 +167,19 @@ func TestStateStoreMarkWaitingReboot(t *testing.T) { if !st.LastRebootTime().Equal(later) { t.Errorf("got lastRebootTime %v after retry, want %v", st.LastRebootTime(), later) } + + // countReboot=false keeps the counter stable (monitor-only mode). 
+ s.MarkWaitingReboot("node-01", later.Add(time.Hour), false) + st, _ = s.Get("node-01") + if st.RebootCount() != 2 { + t.Errorf("rebootCount should not increment when countReboot=false; got %d, want 2", st.RebootCount()) + } } func TestStateStoreMarkWaitingRebootNonexistent(t *testing.T) { s := NewStateStore() // Should not panic. - s.MarkWaitingReboot("nonexistent", time.Now()) + s.MarkWaitingReboot("nonexistent", time.Now(), true) } func TestStateStoreUpdateCheckerInfo(t *testing.T) { @@ -201,7 +208,7 @@ func TestStateStoreReset(t *testing.T) { now := time.Date(2026, 4, 13, 12, 0, 0, 0, time.UTC) s.MarkUnhealthy("node-01", now) s.UpdateCheckerInfo("node-01", []string{"NodeReady"}, true) - s.MarkWaitingReboot("node-01", now) + s.MarkWaitingReboot("node-01", now, true) s.Reset("node-01") diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index 14501c4..e0ae701 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -33,6 +33,7 @@ type watcher struct { nodePoolIDs []string rebootWaitMinutes time.Duration // Standard nodes (default: 10) gpuRebootWaitMinutes time.Duration // GPU nodes (default: 40) + maxRebootRetries int // Give up and transition to PhaseFailed after this many reboots nodeLabelSelector *metav1.LabelSelector nodeLister listerscorev1.NodeLister @@ -230,7 +231,7 @@ func (w *watcher) run(ctx context.Context) error { metrics.RecoveryActionsTotal.WithLabelValues(nodeName, "reboot", mode).Inc() metrics.RecoveryPhase.WithLabelValues(nodeName, PhaseUnhealthy.String()).Set(0) metrics.RecoveryPhase.WithLabelValues(nodeName, PhaseWaitingReboot.String()).Set(1) - w.states.MarkWaitingReboot(nodeName, now) + w.states.MarkWaitingReboot(nodeName, now, !w.monitorOnly) // WaitingReboot: health check still failing after reboot, retry after wait window. 
case PhaseWaitingReboot: @@ -255,10 +256,22 @@ func (w *watcher) run(ctx context.Context) error { continue } - // TODO: Standard nodes should transition to PhaseDrain → PhaseReplace - // instead of retrying reboot indefinitely. - // GPU nodes must never be replaced; they retry reboot only. - // See: Recovery Flow — Standard Nodes (Drain → timeout 30min → Replace) + // Retry budget exhausted → give up and transition to PhaseFailed. + // The node stays in Failed until it naturally recovers (all checkers pass). + // TODO: Standard nodes could transition to PhaseDrain → PhaseReplace here + // once that flow is wired up. GPU nodes must stay in Failed (never replaced). + if state.RebootCount() >= w.maxRebootRetries { + slog.Warn("Reboot retry limit exceeded, giving up", + "node", nodeName, + "rebootCount", state.RebootCount(), + "maxRebootRetries", w.maxRebootRetries, + "isGPUNode", state.IsGPUNode(), + "failedCheckers", failedCheckers) + metrics.RecoveryPhase.WithLabelValues(nodeName, PhaseWaitingReboot.String()).Set(0) + metrics.RecoveryPhase.WithLabelValues(nodeName, PhaseFailed.String()).Set(1) + w.states.MarkFailed(nodeName) + continue + } if !w.monitorOnly { if err := w.executor.Reboot(ctx, nodeName); err != nil { @@ -274,7 +287,13 @@ func (w *watcher) run(ctx context.Context) error { "rebootCount", state.RebootCount()+1, "failedCheckers", failedCheckers) metrics.RecoveryActionsTotal.WithLabelValues(nodeName, "reboot", mode).Inc() - w.states.MarkWaitingReboot(nodeName, now) + w.states.MarkWaitingReboot(nodeName, now, !w.monitorOnly) + + // Failed: recovery attempts exhausted. Wait for natural recovery (all checkers pass). + // If the node recovers the "all checkers pass" branch above will Reset it back to Healthy. 
+ case PhaseFailed: + metrics.NodeUnhealthyDurationSeconds.WithLabelValues(nodeName).Set( + now.Sub(state.UnhealthySince()).Seconds()) } } From ffe8d7a6916c4ddce08f677a0e746edc39b6e605 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 21 Apr 2026 22:34:37 +0900 Subject: [PATCH 62/71] docs: update AGENTS.md for PhaseFailed and trim interface details - Include Failed in the state machine summary - Drop interface / type-level details from the Packages section; they can be read from the code directly Co-Authored-By: Claude Opus 4.6 (1M context) --- AGENTS.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 220ed90..74b5f3b 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -35,9 +35,9 @@ No linter is configured in CI. ### Packages -- **`pkg/watcher/`** — Orchestrator. Sets up a Node Informer (filtered by optional node pool label selector) and runs a 10s ticker reconcile loop driving a state machine (`Unknown → Healthy → Unhealthy → WaitingReboot`). -- **`pkg/health/`** — Health checkers (`HealthChecker` interface: `Name()`, `Check() (healthy, reason)`, `Threshold()`). -- **`pkg/operation/`** — Recovery executors (`Executor` interface; `civoExecutor` for Civo API, `nopExecutor` as safe default). +- **`pkg/watcher/`** — Orchestrator. Sets up a Node Informer (filtered by optional node pool label selector) and runs a 10s ticker reconcile loop driving the state machine (`Unknown → Healthy → Unhealthy → WaitingReboot → Failed`). +- **`pkg/health/`** — Health checkers. +- **`pkg/operation/`** — Recovery executors (Civo API reboot; nop executor used as safe default). - **`pkg/metrics/`** — Prometheus metrics (all `civo_` prefixed). Defined once in `metrics.go`. 
## Release From 3ae7bf9464b08bcc8f83479d98d8302c95ec863a Mon Sep 17 00:00:00 2001 From: hlts2 Date: Tue, 21 Apr 2026 23:25:31 +0900 Subject: [PATCH 63/71] test: cover PhaseFailed, retry limit, and monitor-only count semantics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - TestNodePhaseString: add PhaseFailed - TestStateStoreMarkFailed / _Nonexistent: verify the transition - TestRun_RebootRetryLimitExceeded_TransitionsToFailed: verify active-mode flow from Unhealthy → 3 reboots → WaitingReboot → Failed (no further reboots) - TestRun_MonitorOnlyDoesNotIncrementRebootCount: rebootCount stays 0 and Failed is never reached in monitor-only mode - TestRun_RecoverFromFailed: Failed → Healthy via Reset on successful checks Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/watcher/state_test.go | 20 ++++++ pkg/watcher/watcher_test.go | 133 ++++++++++++++++++++++++++++++++++++ 2 files changed, 153 insertions(+) diff --git a/pkg/watcher/state_test.go b/pkg/watcher/state_test.go index 7426988..6f66c13 100644 --- a/pkg/watcher/state_test.go +++ b/pkg/watcher/state_test.go @@ -17,6 +17,7 @@ func TestNodePhaseString(t *testing.T) { {PhaseWaitingReboot, "WaitingReboot"}, {PhaseDrain, "Drain"}, {PhaseReplace, "Replace"}, + {PhaseFailed, "Failed"}, {NodePhase(99), "Unknown"}, } @@ -182,6 +183,25 @@ func TestStateStoreMarkWaitingRebootNonexistent(t *testing.T) { s.MarkWaitingReboot("nonexistent", time.Now(), true) } +func TestStateStoreMarkFailed(t *testing.T) { + s := NewStateStore() + s.GetOrCreate("node-01") + s.MarkWaitingReboot("node-01", time.Now(), true) + + s.MarkFailed("node-01") + + st, _ := s.Get("node-01") + if st.Phase() != PhaseFailed { + t.Errorf("got phase %v, want PhaseFailed", st.Phase()) + } +} + +func TestStateStoreMarkFailedNonexistent(t *testing.T) { + s := NewStateStore() + // Should not panic. 
+ s.MarkFailed("nonexistent") +} + func TestStateStoreUpdateCheckerInfo(t *testing.T) { s := NewStateStore() s.GetOrCreate("node-01") diff --git a/pkg/watcher/watcher_test.go b/pkg/watcher/watcher_test.go index 9744453..3de03c3 100644 --- a/pkg/watcher/watcher_test.go +++ b/pkg/watcher/watcher_test.go @@ -529,6 +529,139 @@ func TestRun_UnhealthyWithinThresholdNoReboot(t *testing.T) { } } +func TestRun_RebootRetryLimitExceeded_TransitionsToFailed(t *testing.T) { + now := time.Date(2026, 4, 13, 12, 0, 0, 0, time.UTC) + node := newTestNode("node-01", corev1.ConditionFalse, 0) + exec := &mockExecutor{} + w := newTestWatcher(t, + withNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), + WithCheckers(health.NewDefaultCheckers()), + WithExecutor(exec), + WithMonitorOnly("false"), + WithRebootWaitMinutes("10"), + WithMaxRebootRetries("3"), + withNowFunc(func() time.Time { return now }), + ) + + // Run 1: detect unhealthy. + if err := w.run(t.Context()); err != nil { + t.Fatal(err) + } + // Run 2: threshold exceeded → first reboot (rebootCount=1). + now = now.Add(6 * time.Minute) + if err := w.run(t.Context()); err != nil { + t.Fatal(err) + } + // Reboot retries 2 and 3. + for i := 0; i < 2; i++ { + now = now.Add(11 * time.Minute) + if err := w.run(t.Context()); err != nil { + t.Fatal(err) + } + } + + state, _ := w.states.Get("node-01") + if state.RebootCount() != 3 { + t.Fatalf("expected rebootCount=3 after 3 reboots, got %d", state.RebootCount()) + } + if state.Phase() != PhaseWaitingReboot { + t.Fatalf("expected PhaseWaitingReboot after %d reboots, got %v", state.RebootCount(), state.Phase()) + } + + // Next retry should exceed the limit → PhaseFailed. 
+ now = now.Add(11 * time.Minute) + if err := w.run(t.Context()); err != nil { + t.Fatal(err) + } + + state, _ = w.states.Get("node-01") + if state.Phase() != PhaseFailed { + t.Errorf("got phase %v, want PhaseFailed", state.Phase()) + } + if len(exec.calls) != 3 { + t.Errorf("expected exactly 3 reboot calls (no further reboots after Failed), got %d", len(exec.calls)) + } +} + +func TestRun_MonitorOnlyDoesNotIncrementRebootCount(t *testing.T) { + now := time.Date(2026, 4, 13, 12, 0, 0, 0, time.UTC) + node := newTestNode("node-01", corev1.ConditionFalse, 0) + w := newTestWatcher(t, + withNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), + WithCheckers(health.NewDefaultCheckers()), + WithMonitorOnly("true"), + WithRebootWaitMinutes("10"), + WithMaxRebootRetries("3"), + withNowFunc(func() time.Time { return now }), + ) + + // Run the state machine through several reboot cycles. + if err := w.run(t.Context()); err != nil { + t.Fatal(err) + } + for i := 0; i < 5; i++ { + now = now.Add(11 * time.Minute) + if err := w.run(t.Context()); err != nil { + t.Fatal(err) + } + } + + state, _ := w.states.Get("node-01") + if state.RebootCount() != 0 { + t.Errorf("rebootCount should stay 0 in monitor-only mode, got %d", state.RebootCount()) + } + if state.Phase() == PhaseFailed { + t.Errorf("monitor-only mode must not transition to PhaseFailed; got phase %v", state.Phase()) + } +} + +func TestRun_RecoverFromFailed(t *testing.T) { + now := time.Date(2026, 4, 13, 12, 0, 0, 0, time.UTC) + node := newTestNode("node-01", corev1.ConditionFalse, 0) + exec := &mockExecutor{} + w := newTestWatcher(t, + withNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), + WithCheckers(health.NewDefaultCheckers()), + WithExecutor(exec), + WithMonitorOnly("false"), + WithRebootWaitMinutes("10"), + WithMaxRebootRetries("1"), + withNowFunc(func() time.Time { return now }), + ) + + // Drive to Failed: detect → first reboot (rebootCount=1) → retry exceeds limit. 
+ if err := w.run(t.Context()); err != nil { + t.Fatal(err) + } + now = now.Add(6 * time.Minute) + if err := w.run(t.Context()); err != nil { + t.Fatal(err) + } + now = now.Add(11 * time.Minute) + if err := w.run(t.Context()); err != nil { + t.Fatal(err) + } + + if st, _ := w.states.Get("node-01"); st.Phase() != PhaseFailed { + t.Fatalf("expected PhaseFailed, got %v", st.Phase()) + } + + // Node recovers. + node.Status.Conditions[0].Status = corev1.ConditionTrue + now = now.Add(time.Minute) + if err := w.run(t.Context()); err != nil { + t.Fatal(err) + } + + st, _ := w.states.Get("node-01") + if st.Phase() != PhaseHealthy { + t.Errorf("expected PhaseHealthy after recovery, got %v", st.Phase()) + } + if st.RebootCount() != 0 { + t.Errorf("rebootCount should reset to 0 after recovery, got %d", st.RebootCount()) + } +} + func TestBuildNodeSelector(t *testing.T) { tests := []struct { description string From a4e96583f8be7b1bcd85648b6fc1f39b2953c780 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Wed, 22 Apr 2026 01:31:50 +0900 Subject: [PATCH 64/71] fix: address review feedback (RBAC, time.Duration, error handling) - RBAC: add roleRef.apiGroup to ClusterRoleBinding (worked via apiserver defaulting but should be explicit) - time.Duration: multiply by time.Minute at option-set time so the field truly holds a duration (was nanoseconds) - metrics server: use errors.Is for http.ErrServerClosed and trigger the signal-based stop() when ListenAndServe fails unexpectedly, so the agent shuts down cleanly instead of running without observability - watcher: drop the unused named return on setupKubernetesClient - health.HealthChecker.Threshold godoc: document zero value as "trigger immediately on failure" Co-Authored-By: Claude Opus 4.6 (1M context) --- charts/templates/rbac.yaml | 3 ++- main.go | 4 +++- pkg/health/health.go | 1 + pkg/watcher/options.go | 4 ++-- pkg/watcher/watcher.go | 4 ++-- pkg/watcher/watcher_test.go | 2 +- 6 files changed, 11 insertions(+), 7 deletions(-) diff --git 
a/charts/templates/rbac.yaml b/charts/templates/rbac.yaml index e3a295f..0bb2b6a 100644 --- a/charts/templates/rbac.yaml +++ b/charts/templates/rbac.yaml @@ -16,5 +16,6 @@ subjects: name: {{ .Chart.Name }} namespace: kube-system roleRef: - kind: ClusterRole + kind: ClusterRole name: {{ .Chart.Name }} + apiGroup: rbac.authorization.k8s.io diff --git a/main.go b/main.go index 9b71d9d..8fafb13 100644 --- a/main.go +++ b/main.go @@ -2,6 +2,7 @@ package main import ( "context" + "errors" "flag" "fmt" "log/slog" @@ -62,8 +63,9 @@ func run(ctx context.Context) error { } go func() { slog.Info("Starting metrics server", "addr", metricsServer.Addr) - if err := metricsServer.ListenAndServe(); err != nil && err != http.ErrServerClosed { + if err := metricsServer.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) { slog.Error("Metrics server failed", "error", err) + stop() } }() defer func() { diff --git a/pkg/health/health.go b/pkg/health/health.go index 3efd618..58ff759 100644 --- a/pkg/health/health.go +++ b/pkg/health/health.go @@ -15,6 +15,7 @@ type HealthChecker interface { Check(node *corev1.Node) (healthy bool, reason string) // Threshold returns how long this checker must continuously fail // before a recovery action is triggered. + // A zero value means "trigger immediately on failure" (no wait period). 
Threshold() time.Duration } diff --git a/pkg/watcher/options.go b/pkg/watcher/options.go index 9400bae..bdfb546 100644 --- a/pkg/watcher/options.go +++ b/pkg/watcher/options.go @@ -60,7 +60,7 @@ func WithRebootWaitMinutes(s string) Option { return func(w *watcher) { n, err := strconv.Atoi(s) if err == nil && n > 0 { - w.rebootWaitMinutes = time.Duration(n) + w.rebootWaitMinutes = time.Duration(n) * time.Minute } else { slog.Info("RebootWaitMinutes is invalid", "value", s) } @@ -72,7 +72,7 @@ func WithGPURebootWaitMinutes(s string) Option { return func(w *watcher) { n, err := strconv.Atoi(s) if err == nil && n > 0 { - w.gpuRebootWaitMinutes = time.Duration(n) + w.gpuRebootWaitMinutes = time.Duration(n) * time.Minute } else { slog.Info("GPURebootWaitMinutes is invalid", "value", s) } diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index e0ae701..578d7ae 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -66,7 +66,7 @@ func NewWatcher(ctx context.Context, opts ...Option) (Watcher, error) { // setupKubernetesClient creates Kubernetes client based on the kubeconfig path. // If kubeconfig path is not empty, the client will be created using that path. // Otherwise, if the kubeconfig path is empty, the client will be created using the in-cluster config. -func (w *watcher) setupKubernetesClient() (err error) { +func (w *watcher) setupKubernetesClient() error { if w.clientCfgPath != "" && w.client == nil { cfg, err := clientcmd.BuildConfigFromFlags("", w.clientCfgPath) if err != nil { @@ -241,7 +241,7 @@ func (w *watcher) run(ctx context.Context) error { if state.IsGPUNode() { rebootWait = w.gpuRebootWaitMinutes } - if now.Sub(state.LastRebootTime()) < rebootWait*time.Minute { + if now.Sub(state.LastRebootTime()) < rebootWait { // In monitor-only mode no reboot actually happened, so logging // "waiting for reboot effect" every tick would be noisy. // The "Reboot retry" log still fires once per rebootWait cycle as a liveness signal. 
diff --git a/pkg/watcher/watcher_test.go b/pkg/watcher/watcher_test.go index 3de03c3..9048f67 100644 --- a/pkg/watcher/watcher_test.go +++ b/pkg/watcher/watcher_test.go @@ -67,7 +67,7 @@ func (c *alwaysFailChecker) Threshold() time.Duration { return c.thresh var ( testNodePoolID = "test-node-pool" - testRebootWaitMinutes = time.Duration(10) + testRebootWaitMinutes = time.Duration(10) * time.Minute ) // newTestNode creates a node for testing with common defaults. From 51c5d38a1d6674cac4ef2c95667691a4b5bd3ef9 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Wed, 22 Apr 2026 02:56:41 +0900 Subject: [PATCH 65/71] fix: NodeState concurrency, Healthy metric, log fixes - NodeState: add sync.RWMutex; getters RLock; mutators take Get() then NodeState.mu.Lock() so the Store lock is released before the per-state lock - StateStore.Range: iterate over a snapshot taken under the read lock so fn can safely call mutators without self-deadlocking - Set RecoveryPhase{phase=Healthy}=1 every tick while healthy, not only on transition, so always-healthy nodes appear in the metric - Call MarkWaitingReboot before the "Reboot retry" log and drop the stale "+1" in the rebootCount log; monitor mode now reports the actual (unincremented) count - Drop leftover rebootWait*time.Minute in the waiting-for-reboot-effect log - Remove unused ctx parameter from NewWatcher Co-Authored-By: Claude Opus 4.7 (1M context) --- main.go | 2 +- pkg/watcher/state.go | 69 +++++++++++++++++++++++-------------- pkg/watcher/watcher.go | 10 +++--- pkg/watcher/watcher_test.go | 6 ++-- 4 files changed, 51 insertions(+), 36 deletions(-) diff --git a/main.go b/main.go index 8fafb13..361d50d 100644 --- a/main.go +++ b/main.go @@ -76,7 +76,7 @@ func run(ctx context.Context) error { } }() - w, err := watcher.NewWatcher(ctx, + w, err := watcher.NewWatcher( watcher.WithNodePoolIDs(nodePoolIDs), watcher.WithKubernetesClientConfigPath(*kubeconfigPath), watcher.WithExecutor(executor), diff --git a/pkg/watcher/state.go 
b/pkg/watcher/state.go index 0e18d76..47f441f 100644 --- a/pkg/watcher/state.go +++ b/pkg/watcher/state.go @@ -46,6 +46,7 @@ func (p NodePhase) String() string { // NodeState holds the recovery state for a single node. // All fields are private; read via getters, mutate via StateStore methods. type NodeState struct { + mu sync.RWMutex phase NodePhase unhealthySince time.Time lastRebootTime time.Time @@ -54,11 +55,31 @@ type NodeState struct { isGPUNode bool } -func (s *NodeState) Phase() NodePhase { return s.phase } -func (s *NodeState) UnhealthySince() time.Time { return s.unhealthySince } -func (s *NodeState) LastRebootTime() time.Time { return s.lastRebootTime } -func (s *NodeState) RebootCount() int { return s.rebootCount } -func (s *NodeState) IsGPUNode() bool { return s.isGPUNode } +func (s *NodeState) Phase() NodePhase { + s.mu.RLock() + defer s.mu.RUnlock() + return s.phase +} +func (s *NodeState) UnhealthySince() time.Time { + s.mu.RLock() + defer s.mu.RUnlock() + return s.unhealthySince +} +func (s *NodeState) LastRebootTime() time.Time { + s.mu.RLock() + defer s.mu.RUnlock() + return s.lastRebootTime +} +func (s *NodeState) RebootCount() int { + s.mu.RLock() + defer s.mu.RUnlock() + return s.rebootCount +} +func (s *NodeState) IsGPUNode() bool { + s.mu.RLock() + defer s.mu.RUnlock() + return s.isGPUNode +} // StateStore is a concurrency-safe store for per-node recovery state. type StateStore struct { @@ -78,7 +99,6 @@ func NewStateStore() *StateStore { func (s *StateStore) GetOrCreate(name string) *NodeState { s.mu.Lock() defer s.mu.Unlock() - if st, ok := s.nodes[name]; ok { return st } @@ -105,9 +125,12 @@ func (s *StateStore) Delete(name string) { // Range calls fn for each node state entry. If fn returns false, iteration stops. 
func (s *StateStore) Range(fn func(name string, state *NodeState) bool) { s.mu.RLock() - defer s.mu.RUnlock() - + snapshot := make(map[string]*NodeState, len(s.nodes)) for name, state := range s.nodes { + snapshot[name] = state + } + s.mu.RUnlock() + for name, state := range snapshot { if !fn(name, state) { return } @@ -116,65 +139,60 @@ func (s *StateStore) Range(fn func(name string, state *NodeState) bool) { // UpdateCheckerInfo updates the failed checker names and GPU flag for a node. func (s *StateStore) UpdateCheckerInfo(name string, failedCheckers []string, isGPUNode bool) { - s.mu.Lock() - defer s.mu.Unlock() - - st, ok := s.nodes[name] + st, ok := s.Get(name) if !ok { return } + st.mu.Lock() st.failedCheckers = failedCheckers st.isGPUNode = isGPUNode + st.mu.Unlock() } // MarkUnhealthy transitions a node to PhaseUnhealthy and records when it became unhealthy. func (s *StateStore) MarkUnhealthy(name string, now time.Time) { - s.mu.Lock() - defer s.mu.Unlock() - - st, ok := s.nodes[name] + st, ok := s.Get(name) if !ok { return } + st.mu.Lock() st.phase = PhaseUnhealthy st.unhealthySince = now + st.mu.Unlock() } // MarkWaitingReboot transitions a node to PhaseWaitingReboot and records the reboot time. // When countReboot is true, the reboot counter is incremented. Pass false in monitor-only // mode where no actual reboot was issued. func (s *StateStore) MarkWaitingReboot(name string, now time.Time, countReboot bool) { - s.mu.Lock() - defer s.mu.Unlock() - - st, ok := s.nodes[name] + st, ok := s.Get(name) if !ok { return } + st.mu.Lock() st.phase = PhaseWaitingReboot st.lastRebootTime = now if countReboot { st.rebootCount++ } + st.mu.Unlock() } // MarkFailed transitions a node to PhaseFailed after recovery attempts were exhausted. 
func (s *StateStore) MarkFailed(name string) { - s.mu.Lock() - defer s.mu.Unlock() - - st, ok := s.nodes[name] + st, ok := s.Get(name) if !ok { return } + st.mu.Lock() st.phase = PhaseFailed + st.mu.Unlock() } // Reset replaces the node's state with a fresh PhaseHealthy entry. func (s *StateStore) Reset(name string) { s.mu.Lock() defer s.mu.Unlock() - if _, ok := s.nodes[name]; ok { s.nodes[name] = &NodeState{phase: PhaseHealthy} } @@ -184,7 +202,6 @@ func (s *StateStore) Reset(name string) { func (s *StateStore) Cleanup(activeNodes map[string]struct{}) { s.mu.Lock() defer s.mu.Unlock() - for name := range s.nodes { if _, ok := activeNodes[name]; !ok { delete(s.nodes, name) diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index 578d7ae..97e8930 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -45,7 +45,7 @@ type watcher struct { nowFunc func() time.Time } -func NewWatcher(ctx context.Context, opts ...Option) (Watcher, error) { +func NewWatcher(opts ...Option) (Watcher, error) { w := &watcher{ monitorOnly: true, states: NewStateStore(), @@ -175,13 +175,13 @@ func (w *watcher) run(ctx context.Context) error { // All checkers pass → node is healthy. 
if len(failedCheckers) == 0 { + metrics.RecoveryPhase.WithLabelValues(nodeName, PhaseHealthy.String()).Set(1) if prevPhase := state.Phase(); prevPhase != PhaseHealthy { slog.Info("Node recovered", "node", nodeName, "previousPhase", prevPhase.String()) metrics.NodeUnhealthyDurationSeconds.WithLabelValues(nodeName).Set(0) metrics.RecoveryPhase.WithLabelValues(nodeName, prevPhase.String()).Set(0) - metrics.RecoveryPhase.WithLabelValues(nodeName, PhaseHealthy.String()).Set(1) w.states.Reset(nodeName) } continue @@ -249,7 +249,7 @@ func (w *watcher) run(ctx context.Context) error { slog.Info("Waiting for reboot effect", "node", nodeName, "elapsed", now.Sub(state.LastRebootTime()).String(), - "rebootWait", (rebootWait * time.Minute).String(), + "rebootWait", rebootWait.String(), "rebootCount", state.RebootCount(), "isGPUNode", state.IsGPUNode()) } @@ -280,14 +280,14 @@ func (w *watcher) run(ctx context.Context) error { continue } } + w.states.MarkWaitingReboot(nodeName, now, !w.monitorOnly) mode := modeLabel(w.monitorOnly) slog.Info("Reboot retry", "node", nodeName, "mode", mode, - "rebootCount", state.RebootCount()+1, + "rebootCount", state.RebootCount(), "failedCheckers", failedCheckers) metrics.RecoveryActionsTotal.WithLabelValues(nodeName, "reboot", mode).Inc() - w.states.MarkWaitingReboot(nodeName, now, !w.monitorOnly) // Failed: recovery attempts exhausted. Wait for natural recovery (all checkers pass). // If the node recovers the "all checkers pass" branch above will Reset it back to Healthy. diff --git a/pkg/watcher/watcher_test.go b/pkg/watcher/watcher_test.go index 9048f67..f478498 100644 --- a/pkg/watcher/watcher_test.go +++ b/pkg/watcher/watcher_test.go @@ -101,8 +101,7 @@ func newTestWatcher(t *testing.T, opts ...Option) *watcher { WithKubernetesClient(fake.NewSimpleClientset()), WithExecutor(&mockExecutor{}), } - w, err := NewWatcher(t.Context(), - append(baseOpts, opts...)...) + w, err := NewWatcher(append(baseOpts, opts...)...) 
if err != nil { t.Fatal(err) } @@ -175,8 +174,7 @@ func TestNew(t *testing.T) { for _, test := range tests { t.Run(test.description, func(t *testing.T) { - w, err := NewWatcher(t.Context(), - test.args.opts...) + w, err := NewWatcher(test.args.opts...) if (err != nil) != test.wantErr { t.Errorf("error = %v, wantErr %v", err, test.wantErr) } From 8debc95a2069a1a7d89617e0e206bc95fe9ada3e Mon Sep 17 00:00:00 2001 From: hlts2 Date: Wed, 22 Apr 2026 04:13:24 +0900 Subject: [PATCH 66/71] refactor: monitor mode fully simulates the recovery lifecycle The only difference from active mode is now the absence of the executor.Reboot() call. rebootCount increments every retry, maxRebootRetries eventually drives the node to PhaseFailed, and the "Reboot retry" log stops instead of firing every rebootWait cycle forever. - state.MarkWaitingReboot drops the countReboot flag and always increments the counter - watcher.go drops the !monitorOnly guard around state updates, logs, and metrics in the retry path - watcher_test: invert TestRun_MonitorOnlyDoesNotIncrementRebootCount into TestRun_MonitorOnlySimulatesFullLifecycle Co-Authored-By: Claude Opus 4.7 (1M context) --- pkg/watcher/state.go | 11 ++++------- pkg/watcher/state_test.go | 17 +++++------------ pkg/watcher/watcher.go | 4 ++-- pkg/watcher/watcher_test.go | 17 +++++++++++------ 4 files changed, 22 insertions(+), 27 deletions(-) diff --git a/pkg/watcher/state.go b/pkg/watcher/state.go index 47f441f..df9e91f 100644 --- a/pkg/watcher/state.go +++ b/pkg/watcher/state.go @@ -161,10 +161,9 @@ func (s *StateStore) MarkUnhealthy(name string, now time.Time) { st.mu.Unlock() } -// MarkWaitingReboot transitions a node to PhaseWaitingReboot and records the reboot time. -// When countReboot is true, the reboot counter is incremented. Pass false in monitor-only -// mode where no actual reboot was issued. 
-func (s *StateStore) MarkWaitingReboot(name string, now time.Time, countReboot bool) { +// MarkWaitingReboot transitions a node to PhaseWaitingReboot, records the +// reboot time, and increments the reboot counter. +func (s *StateStore) MarkWaitingReboot(name string, now time.Time) { st, ok := s.Get(name) if !ok { return @@ -172,9 +171,7 @@ func (s *StateStore) MarkWaitingReboot(name string, now time.Time, countReboot b st.mu.Lock() st.phase = PhaseWaitingReboot st.lastRebootTime = now - if countReboot { - st.rebootCount++ - } + st.rebootCount++ st.mu.Unlock() } diff --git a/pkg/watcher/state_test.go b/pkg/watcher/state_test.go index 6f66c13..1a47a72 100644 --- a/pkg/watcher/state_test.go +++ b/pkg/watcher/state_test.go @@ -144,7 +144,7 @@ func TestStateStoreMarkWaitingReboot(t *testing.T) { s.GetOrCreate("node-01") now := time.Date(2026, 4, 13, 12, 0, 0, 0, time.UTC) - s.MarkWaitingReboot("node-01", now, true) + s.MarkWaitingReboot("node-01", now) st, _ := s.Get("node-01") if st.Phase() != PhaseWaitingReboot { @@ -159,7 +159,7 @@ func TestStateStoreMarkWaitingReboot(t *testing.T) { // Retry increments count. later := now.Add(time.Hour) - s.MarkWaitingReboot("node-01", later, true) + s.MarkWaitingReboot("node-01", later) st, _ = s.Get("node-01") if st.RebootCount() != 2 { @@ -168,25 +168,18 @@ func TestStateStoreMarkWaitingReboot(t *testing.T) { if !st.LastRebootTime().Equal(later) { t.Errorf("got lastRebootTime %v after retry, want %v", st.LastRebootTime(), later) } - - // countReboot=false keeps the counter stable (monitor-only mode). - s.MarkWaitingReboot("node-01", later.Add(time.Hour), false) - st, _ = s.Get("node-01") - if st.RebootCount() != 2 { - t.Errorf("rebootCount should not increment when countReboot=false; got %d, want 2", st.RebootCount()) - } } func TestStateStoreMarkWaitingRebootNonexistent(t *testing.T) { s := NewStateStore() // Should not panic. 
- s.MarkWaitingReboot("nonexistent", time.Now(), true) + s.MarkWaitingReboot("nonexistent", time.Now()) } func TestStateStoreMarkFailed(t *testing.T) { s := NewStateStore() s.GetOrCreate("node-01") - s.MarkWaitingReboot("node-01", time.Now(), true) + s.MarkWaitingReboot("node-01", time.Now()) s.MarkFailed("node-01") @@ -228,7 +221,7 @@ func TestStateStoreReset(t *testing.T) { now := time.Date(2026, 4, 13, 12, 0, 0, 0, time.UTC) s.MarkUnhealthy("node-01", now) s.UpdateCheckerInfo("node-01", []string{"NodeReady"}, true) - s.MarkWaitingReboot("node-01", now, true) + s.MarkWaitingReboot("node-01", now) s.Reset("node-01") diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index 97e8930..8e26a72 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -231,7 +231,7 @@ func (w *watcher) run(ctx context.Context) error { metrics.RecoveryActionsTotal.WithLabelValues(nodeName, "reboot", mode).Inc() metrics.RecoveryPhase.WithLabelValues(nodeName, PhaseUnhealthy.String()).Set(0) metrics.RecoveryPhase.WithLabelValues(nodeName, PhaseWaitingReboot.String()).Set(1) - w.states.MarkWaitingReboot(nodeName, now, !w.monitorOnly) + w.states.MarkWaitingReboot(nodeName, now) // WaitingReboot: health check still failing after reboot, retry after wait window. 
case PhaseWaitingReboot: @@ -280,7 +280,7 @@ func (w *watcher) run(ctx context.Context) error { continue } } - w.states.MarkWaitingReboot(nodeName, now, !w.monitorOnly) + w.states.MarkWaitingReboot(nodeName, now) mode := modeLabel(w.monitorOnly) slog.Info("Reboot retry", "node", nodeName, diff --git a/pkg/watcher/watcher_test.go b/pkg/watcher/watcher_test.go index f478498..6e42798 100644 --- a/pkg/watcher/watcher_test.go +++ b/pkg/watcher/watcher_test.go @@ -581,19 +581,21 @@ func TestRun_RebootRetryLimitExceeded_TransitionsToFailed(t *testing.T) { } } -func TestRun_MonitorOnlyDoesNotIncrementRebootCount(t *testing.T) { +func TestRun_MonitorOnlySimulatesFullLifecycle(t *testing.T) { now := time.Date(2026, 4, 13, 12, 0, 0, 0, time.UTC) node := newTestNode("node-01", corev1.ConditionFalse, 0) + exec := &mockExecutor{} w := newTestWatcher(t, withNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), WithCheckers(health.NewDefaultCheckers()), + WithExecutor(exec), WithMonitorOnly("true"), WithRebootWaitMinutes("10"), WithMaxRebootRetries("3"), withNowFunc(func() time.Time { return now }), ) - // Run the state machine through several reboot cycles. + // Drive the state machine through detection + three reboot cycles + retry-limit check. 
if err := w.run(t.Context()); err != nil { t.Fatal(err) } @@ -605,11 +607,14 @@ func TestRun_MonitorOnlyDoesNotIncrementRebootCount(t *testing.T) { } state, _ := w.states.Get("node-01") - if state.RebootCount() != 0 { - t.Errorf("rebootCount should stay 0 in monitor-only mode, got %d", state.RebootCount()) + if state.Phase() != PhaseFailed { + t.Errorf("expected PhaseFailed after monitor-only simulation, got %v", state.Phase()) } - if state.Phase() == PhaseFailed { - t.Errorf("monitor-only mode must not transition to PhaseFailed; got phase %v", state.Phase()) + if state.RebootCount() != 3 { + t.Errorf("expected rebootCount=3 (maxRebootRetries), got %d", state.RebootCount()) + } + if len(exec.calls) != 0 { + t.Errorf("expected no executor calls in monitor-only mode, got %d", len(exec.calls)) } } From 47239f72bbbc47384bd57af18443e15bef0ebf27 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Wed, 22 Apr 2026 04:31:49 +0900 Subject: [PATCH 67/71] fix: align HealthCheckTotal result label with godoc and drop no-op Info defer - watcher.go: the HealthCheckTotal "result" label godoc promises pass/fail, but the code was passing the free-form reason string. Emit "pass"/"fail" and rely on logs for failure reason detail (keeps the metric's label cardinality bounded) - main.go: remove the defer DeleteLabelValues for Info. 
Defers run LIFO after metricsServer.Shutdown, so nothing can scrape the delete; Prometheus already handles process exit via up{}=0 Co-Authored-By: Claude Opus 4.7 (1M context) --- main.go | 1 - pkg/watcher/watcher.go | 6 ++++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/main.go b/main.go index 361d50d..22f0fcc 100644 --- a/main.go +++ b/main.go @@ -56,7 +56,6 @@ func run(ctx context.Context) error { metrics.Register() metrics.Info.WithLabelValues(version, clusterID).Set(1) - defer metrics.Info.DeleteLabelValues(version, clusterID) metricsServer := &http.Server{ Addr: ":" + metricsPortValue(metricsPort), Handler: metrics.Handler(), diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index 8e26a72..04e872b 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -161,14 +161,16 @@ func (w *watcher) run(ctx context.Context) error { var failedCheckers []string var minThreshold time.Duration for _, checker := range w.checkers { - healthy, reason := checker.Check(node) + healthy, _ := checker.Check(node) + result := "pass" if !healthy { + result = "fail" failedCheckers = append(failedCheckers, checker.Name()) if minThreshold == 0 || checker.Threshold() < minThreshold { minThreshold = checker.Threshold() } } - metrics.HealthCheckTotal.WithLabelValues(nodeName, checker.Name(), reason).Inc() + metrics.HealthCheckTotal.WithLabelValues(nodeName, checker.Name(), result).Inc() } state := w.states.GetOrCreate(nodeName) From 6620955d9890693b8f6ffad81e03aa33be9f3109 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Wed, 22 Apr 2026 04:43:44 +0900 Subject: [PATCH 68/71] refactor: log checker reasons per-check and drop unused PhaseReboot - watcher.go: emit a structured "Health check failed" log inside the checker loop with the checker name and reason. The HealthCheckTotal metric stays bounded (pass/fail) while the reason is preserved for on-call debugging via logs - state.go: remove PhaseReboot. 
The state machine transitions directly from Unhealthy to WaitingReboot; the enum value was never assigned. Subsequent iota values shift down by one but all comparisons use names and metric labels use String(), so there is no external impact - state_test.go: drop the corresponding table entry Co-Authored-By: Claude Opus 4.7 (1M context) --- pkg/watcher/state.go | 11 ++++------- pkg/watcher/state_test.go | 1 - pkg/watcher/watcher.go | 6 +++++- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pkg/watcher/state.go b/pkg/watcher/state.go index df9e91f..5fc8025 100644 --- a/pkg/watcher/state.go +++ b/pkg/watcher/state.go @@ -12,11 +12,10 @@ const ( PhaseUnknown NodePhase = iota // 0 - unknown/uninitialized PhaseHealthy // 1 - node is healthy PhaseUnhealthy // 2 - checker(s) failing, waiting for threshold - PhaseReboot // 3 - reboot command issued - PhaseWaitingReboot // 4 - waiting for reboot to take effect - PhaseDrain // 5 - future: draining pods - PhaseReplace // 6 - future: replace issued - PhaseFailed // 7 - recovery gave up (exceeded retries); awaits manual intervention or natural recovery + PhaseWaitingReboot // 3 - waiting for reboot to take effect + PhaseDrain // 4 - future: draining pods + PhaseReplace // 5 - future: replace issued + PhaseFailed // 6 - recovery gave up (exceeded retries); awaits manual intervention or natural recovery ) // String returns the string representation of a NodePhase. 
@@ -28,8 +27,6 @@ func (p NodePhase) String() string { return "Healthy" case PhaseUnhealthy: return "Unhealthy" - case PhaseReboot: - return "Reboot" case PhaseWaitingReboot: return "WaitingReboot" case PhaseDrain: diff --git a/pkg/watcher/state_test.go b/pkg/watcher/state_test.go index 1a47a72..dbc9aba 100644 --- a/pkg/watcher/state_test.go +++ b/pkg/watcher/state_test.go @@ -13,7 +13,6 @@ func TestNodePhaseString(t *testing.T) { {PhaseUnknown, "Unknown"}, {PhaseHealthy, "Healthy"}, {PhaseUnhealthy, "Unhealthy"}, - {PhaseReboot, "Reboot"}, {PhaseWaitingReboot, "WaitingReboot"}, {PhaseDrain, "Drain"}, {PhaseReplace, "Replace"}, diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index 04e872b..764253f 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -161,10 +161,14 @@ func (w *watcher) run(ctx context.Context) error { var failedCheckers []string var minThreshold time.Duration for _, checker := range w.checkers { - healthy, _ := checker.Check(node) + healthy, reason := checker.Check(node) result := "pass" if !healthy { result = "fail" + slog.Info("Health check failed", + "node", nodeName, + "checker", checker.Name(), + "reason", reason) failedCheckers = append(failedCheckers, checker.Name()) if minThreshold == 0 || checker.Threshold() < minThreshold { minThreshold = checker.Threshold() From 0a5c02c517409e8f0528a2bbfbc732126ee6b998 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Wed, 22 Apr 2026 04:51:08 +0900 Subject: [PATCH 69/71] refactor: drop per-check "Health check failed" log The reason string is left unused in the watcher but the HealthChecker interface still returns it for future consumers. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- pkg/watcher/watcher.go | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index 764253f..04e872b 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -161,14 +161,10 @@ func (w *watcher) run(ctx context.Context) error { var failedCheckers []string var minThreshold time.Duration for _, checker := range w.checkers { - healthy, reason := checker.Check(node) + healthy, _ := checker.Check(node) result := "pass" if !healthy { result = "fail" - slog.Info("Health check failed", - "node", nodeName, - "checker", checker.Name(), - "reason", reason) failedCheckers = append(failedCheckers, checker.Name()) if minThreshold == 0 || checker.Threshold() < minThreshold { minThreshold = checker.Threshold() From 94fc280e2c738874126149f1c960ca9992f4e304 Mon Sep 17 00:00:00 2001 From: hlts2 Date: Wed, 22 Apr 2026 05:08:13 +0900 Subject: [PATCH 70/71] docs: rename build output to node-agent Co-Authored-By: Claude Opus 4.7 (1M context) --- AGENTS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AGENTS.md b/AGENTS.md index 74b5f3b..503e902 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -16,7 +16,7 @@ By default the agent runs in **monitor-only mode** (logs recovery actions withou ```bash # Build (CGO disabled — no C dependencies; required for static binary on the CP VM) -CGO_ENABLED=0 go build -o civo-node-agent ./ +CGO_ENABLED=0 go build -o node-agent ./ # Run all tests go test ./... From 3826bd2205c3753d79dfa96b29d10e9035fd6b0d Mon Sep 17 00:00:00 2001 From: hlts2 Date: Thu, 23 Apr 2026 15:05:16 +0900 Subject: [PATCH 71/71] feat: cap reboot API failures to bound Civo API load Adds a second give-up condition alongside maxRebootRetries: once executor.Reboot has returned errors maxRebootFailures times (failedRebootCount >= maxRebootFailures), the node transitions to PhaseFailed on the next reconcile from either PhaseUnhealthy (initial reboot) or PhaseWaitingReboot (retry), without issuing another reboot call.
Prevents infinite retries when the Civo reboot API is unhealthy. maxRebootFailures is intentionally kept as an internal default option (not an env var) since failures have no wait window between attempts, so a user-set high value would let the agent hammer the API. Co-Authored-By: Claude Opus 4.7 (1M context) --- pkg/watcher/options.go | 19 +++++++++++++ pkg/watcher/state.go | 29 ++++++++++++++++--- pkg/watcher/state_test.go | 29 +++++++++++++++++++ pkg/watcher/watcher.go | 28 +++++++++++++++++++ pkg/watcher/watcher_test.go | 55 +++++++++++++++++++++++++++++++++++++ 5 files changed, 156 insertions(+), 4 deletions(-) diff --git a/pkg/watcher/options.go b/pkg/watcher/options.go index bdfb546..f1edcd4 100644 --- a/pkg/watcher/options.go +++ b/pkg/watcher/options.go @@ -21,6 +21,7 @@ var defaultOptions = []Option{ WithRebootWaitMinutes("10"), WithGPURebootWaitMinutes("40"), WithMaxRebootRetries("5"), + WithMaxRebootFailures("30"), } // WithKubernetesClient returns Option to set Kubernetes API client. @@ -92,6 +93,24 @@ func WithMaxRebootRetries(s string) Option { } } +// WithMaxRebootFailures returns Option to set the maximum number of reboot +// call failures tolerated before a node transitions to PhaseFailed. +// +// Intentionally not exposed as an env var: reboot call failures are not +// followed by a wait window, so a high value would let the agent hammer +// the Civo API on sustained failures. The bound is controlled here via +// the default option to cap the blast radius. +func WithMaxRebootFailures(s string) Option { + return func(w *watcher) { + n, err := strconv.Atoi(s) + if err == nil && n > 0 { + w.maxRebootFailures = n + } else { + slog.Info("MaxRebootFailures is invalid", "value", s) + } + } +} + // WithMonitorOnly returns Option to enable or disable monitor-only mode. // Accepts a string parsable by strconv.ParseBool (e.g. "true", "false", "1", "0"). // Empty or unparsable values are ignored (default: true). 
diff --git a/pkg/watcher/state.go b/pkg/watcher/state.go index 5fc8025..5b643da 100644 --- a/pkg/watcher/state.go +++ b/pkg/watcher/state.go @@ -43,11 +43,15 @@ func (p NodePhase) String() string { // NodeState holds the recovery state for a single node. // All fields are private; read via getters, mutate via StateStore methods. type NodeState struct { - mu sync.RWMutex - phase NodePhase + mu sync.RWMutex + phase NodePhase + unhealthySince time.Time - lastRebootTime time.Time - rebootCount int + + lastRebootTime time.Time + rebootCount int + failedRebootCount int + failedCheckers []string isGPUNode bool } @@ -72,6 +76,11 @@ func (s *NodeState) RebootCount() int { defer s.mu.RUnlock() return s.rebootCount } +func (s *NodeState) FailedRebootCount() int { + s.mu.RLock() + defer s.mu.RUnlock() + return s.failedRebootCount +} func (s *NodeState) IsGPUNode() bool { s.mu.RLock() defer s.mu.RUnlock() @@ -172,6 +181,18 @@ func (s *StateStore) MarkWaitingReboot(name string, now time.Time) { st.mu.Unlock() } +// RecordRebootFailure increments the failed reboot counter for a node. +// The node's phase is not changed; the caller decides whether to transition. +func (s *StateStore) RecordRebootFailure(name string) { + st, ok := s.Get(name) + if !ok { + return + } + st.mu.Lock() + st.failedRebootCount++ + st.mu.Unlock() +} + // MarkFailed transitions a node to PhaseFailed after recovery attempts were exhausted. 
func (s *StateStore) MarkFailed(name string) { st, ok := s.Get(name) diff --git a/pkg/watcher/state_test.go b/pkg/watcher/state_test.go index dbc9aba..b21e1ee 100644 --- a/pkg/watcher/state_test.go +++ b/pkg/watcher/state_test.go @@ -175,6 +175,31 @@ func TestStateStoreMarkWaitingRebootNonexistent(t *testing.T) { s.MarkWaitingReboot("nonexistent", time.Now()) } +func TestStateStoreRecordRebootFailure(t *testing.T) { + s := NewStateStore() + s.GetOrCreate("node-01") + + s.RecordRebootFailure("node-01") + st, _ := s.Get("node-01") + if st.FailedRebootCount() != 1 { + t.Errorf("got failedRebootCount %d, want 1", st.FailedRebootCount()) + } + if st.Phase() != PhaseHealthy { + t.Errorf("RecordRebootFailure should not change phase, got %v", st.Phase()) + } + + s.RecordRebootFailure("node-01") + if st.FailedRebootCount() != 2 { + t.Errorf("got failedRebootCount %d after second call, want 2", st.FailedRebootCount()) + } +} + +func TestStateStoreRecordRebootFailureNonexistent(t *testing.T) { + s := NewStateStore() + // Should not panic. 
+ s.RecordRebootFailure("nonexistent") +} + func TestStateStoreMarkFailed(t *testing.T) { s := NewStateStore() s.GetOrCreate("node-01") @@ -221,6 +246,7 @@ func TestStateStoreReset(t *testing.T) { s.MarkUnhealthy("node-01", now) s.UpdateCheckerInfo("node-01", []string{"NodeReady"}, true) s.MarkWaitingReboot("node-01", now) + s.RecordRebootFailure("node-01") s.Reset("node-01") @@ -234,6 +260,9 @@ func TestStateStoreReset(t *testing.T) { if st.RebootCount() != 0 { t.Errorf("got rebootCount %d, want 0", st.RebootCount()) } + if st.FailedRebootCount() != 0 { + t.Errorf("got failedRebootCount %d, want 0", st.FailedRebootCount()) + } if !st.UnhealthySince().IsZero() { t.Error("unhealthySince should be zero after Reset") } diff --git a/pkg/watcher/watcher.go b/pkg/watcher/watcher.go index 04e872b..c388289 100644 --- a/pkg/watcher/watcher.go +++ b/pkg/watcher/watcher.go @@ -34,6 +34,7 @@ type watcher struct { rebootWaitMinutes time.Duration // Standard nodes (default: 10) gpuRebootWaitMinutes time.Duration // GPU nodes (default: 40) maxRebootRetries int // Give up and transition to PhaseFailed after this many reboots + maxRebootFailures int // Give up and transition to PhaseFailed after this many reboot call failures nodeLabelSelector *metav1.LabelSelector nodeLister listerscorev1.NodeLister @@ -218,10 +219,24 @@ func (w *watcher) run(ctx context.Context) error { "failedCheckers", failedCheckers) continue } + // Reboot call failure budget exhausted before the first successful + // reboot → give up without attempting another reboot. 
+ if state.FailedRebootCount() >= w.maxRebootFailures { + slog.Warn("Reboot call failure limit exceeded, giving up", + "node", nodeName, + "failedRebootCount", state.FailedRebootCount(), + "maxRebootFailures", w.maxRebootFailures, + "failedCheckers", failedCheckers) + metrics.RecoveryPhase.WithLabelValues(nodeName, PhaseUnhealthy.String()).Set(0) + metrics.RecoveryPhase.WithLabelValues(nodeName, PhaseFailed.String()).Set(1) + w.states.MarkFailed(nodeName) + continue + } if !w.monitorOnly { if err := w.executor.Reboot(ctx, nodeName); err != nil { slog.Error("Failed to reboot node", "node", nodeName, "error", err) metrics.RecoveryFailuresTotal.WithLabelValues(nodeName, "reboot").Inc() + w.states.RecordRebootFailure(nodeName) continue } } @@ -274,11 +289,24 @@ func (w *watcher) run(ctx context.Context) error { w.states.MarkFailed(nodeName) continue } + // Reboot call failure budget exhausted → give up to cap Civo API load. + if state.FailedRebootCount() >= w.maxRebootFailures { + slog.Warn("Reboot call failure limit exceeded, giving up", + "node", nodeName, + "failedRebootCount", state.FailedRebootCount(), + "maxRebootFailures", w.maxRebootFailures, + "failedCheckers", failedCheckers) + metrics.RecoveryPhase.WithLabelValues(nodeName, PhaseWaitingReboot.String()).Set(0) + metrics.RecoveryPhase.WithLabelValues(nodeName, PhaseFailed.String()).Set(1) + w.states.MarkFailed(nodeName) + continue + } if !w.monitorOnly { if err := w.executor.Reboot(ctx, nodeName); err != nil { slog.Error("Failed to reboot node (retry)", "node", nodeName, "error", err) metrics.RecoveryFailuresTotal.WithLabelValues(nodeName, "reboot").Inc() + w.states.RecordRebootFailure(nodeName) continue } } diff --git a/pkg/watcher/watcher_test.go b/pkg/watcher/watcher_test.go index 6e42798..3367e29 100644 --- a/pkg/watcher/watcher_test.go +++ b/pkg/watcher/watcher_test.go @@ -581,6 +581,61 @@ func TestRun_RebootRetryLimitExceeded_TransitionsToFailed(t *testing.T) { } } +func 
TestRun_RebootFailureLimitExceeded_TransitionsToFailed(t *testing.T) { + now := time.Date(2026, 4, 13, 12, 0, 0, 0, time.UTC) + node := newTestNode("node-01", corev1.ConditionFalse, 0) + exec := &mockExecutor{ + rebootFunc: func(_ context.Context, _ string) error { + return fmt.Errorf("reboot API error") + }, + } + w := newTestWatcher(t, + withNodeLister(&fakeNodeLister{nodes: []*corev1.Node{node}}), + WithCheckers(health.NewDefaultCheckers()), + WithExecutor(exec), + WithMonitorOnly("false"), + WithRebootWaitMinutes("10"), + WithMaxRebootRetries("100"), + WithMaxRebootFailures("3"), + withNowFunc(func() time.Time { return now }), + ) + + // Run 1: detect unhealthy. + if err := w.run(t.Context()); err != nil { + t.Fatal(err) + } + // Advance past the unhealthy threshold; reboots will now be attempted and fail. + now = now.Add(11 * time.Minute) + + // Runs 2-4: reboot fails 3 times → failedRebootCount=3, still PhaseUnhealthy. + for i := 0; i < 3; i++ { + if err := w.run(t.Context()); err != nil { + t.Fatal(err) + } + } + + state, _ := w.states.Get("node-01") + if state.FailedRebootCount() != 3 { + t.Fatalf("expected failedRebootCount=3 after 3 failures, got %d", state.FailedRebootCount()) + } + if state.Phase() != PhaseUnhealthy { + t.Fatalf("expected still PhaseUnhealthy at limit, got %v", state.Phase()) + } + + // Next run: failedRebootCount=3 >= max=3 → PhaseFailed. + if err := w.run(t.Context()); err != nil { + t.Fatal(err) + } + + state, _ = w.states.Get("node-01") + if state.Phase() != PhaseFailed { + t.Errorf("got phase %v, want PhaseFailed", state.Phase()) + } + if len(exec.calls) != 3 { + t.Errorf("expected exactly 3 reboot calls (no further reboots after Failed), got %d", len(exec.calls)) + } +} + func TestRun_MonitorOnlySimulatesFullLifecycle(t *testing.T) { now := time.Date(2026, 4, 13, 12, 0, 0, 0, time.UTC) node := newTestNode("node-01", corev1.ConditionFalse, 0)