Skip to content

Commit

Permalink
Fix discrepency metrics <> rebootAsRequired
Browse files Browse the repository at this point in the history
Without this patch, one metric could say "reboot is required"
while the rebootAsRequired tick did not run (long period for
example).

This is a problem, as it leads to misexpectations: "Why
did the system not reboot, while the metrics indicate a reboot
was required".

This solves it by inlining the metrics management within the
rebootAsRequired goroutine.
  • Loading branch information
evrardjp committed Nov 7, 2024
1 parent 594d3df commit a0d6cec
Showing 1 changed file with 4 additions and 12 deletions.
16 changes: 4 additions & 12 deletions cmd/kured/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,6 @@ func main() {
lock := daemonsetlock.New(client, nodeID, dsNamespace, dsName, lockAnnotation, lockTTL, concurrency, lockReleaseDelay)

go rebootAsRequired(nodeID, rebooter, rebootChecker, blockCheckers, window, lock, client)
go maintainRebootRequiredMetric(nodeID, rebootChecker)

http.Handle("/metrics", promhttp.Handler())
log.Fatal(http.ListenAndServe(fmt.Sprintf("%s:%d", metricsHost, metricsPort), nil))
Expand Down Expand Up @@ -468,17 +467,6 @@ func uncordon(client *kubernetes.Clientset, node *v1.Node) error {
return nil
}

func maintainRebootRequiredMetric(nodeID string, checker checkers.Checker) {
for {
if checker.RebootRequired() {
rebootRequiredGauge.WithLabelValues(nodeID).Set(1)
} else {
rebootRequiredGauge.WithLabelValues(nodeID).Set(0)
}
time.Sleep(time.Minute)
}
}

func addNodeAnnotations(client *kubernetes.Clientset, nodeID string, annotations map[string]string) error {
node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{})
if err != nil {
Expand Down Expand Up @@ -610,7 +598,9 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers.
preferNoScheduleTaint := taints.New(client, nodeID, preferNoScheduleTaintName, v1.TaintEffectPreferNoSchedule)

// Remove taint immediately during startup to quickly allow scheduling again.
// Also update the metric at startup (after lock shenanigans)
if !checker.RebootRequired() {
rebootRequiredGauge.WithLabelValues(nodeID).Set(0)
preferNoScheduleTaint.Disable()
}

Expand All @@ -625,9 +615,11 @@ func rebootAsRequired(nodeID string, rebooter reboot.Rebooter, checker checkers.

if !checker.RebootRequired() {
log.Infof("Reboot not required")
rebootRequiredGauge.WithLabelValues(nodeID).Set(0)
preferNoScheduleTaint.Disable()
continue
}
rebootRequiredGauge.WithLabelValues(nodeID).Set(1)

node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{})
if err != nil {
Expand Down

0 comments on commit a0d6cec

Please sign in to comment.