From 0965d2317fe4dab422893a76ff0a22297bb9dfae Mon Sep 17 00:00:00 2001 From: James Warner Date: Thu, 25 Jul 2024 20:43:45 +1200 Subject: [PATCH] fix(leader-election): introduce delay before attempting to reacquire leadership (#794) Fixes #785. Introduce a retry delay before attempting to reacquire leadership, fixing the log noise generated from the API server being unavailable. --- .../LeaderElection/LeaderElectionBackgroundService.cs | 11 ++++++++++- .../LeaderElectionBackgroundService.Test.cs | 4 ++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/KubeOps.Operator/LeaderElection/LeaderElectionBackgroundService.cs b/src/KubeOps.Operator/LeaderElection/LeaderElectionBackgroundService.cs index fd9e1b07..73a459be 100644 --- a/src/KubeOps.Operator/LeaderElection/LeaderElectionBackgroundService.cs +++ b/src/KubeOps.Operator/LeaderElection/LeaderElectionBackgroundService.cs @@ -82,6 +82,8 @@ static async ValueTask CastAndDispose(IDisposable resource) private async Task RunAndTryToHoldLeadershipForeverAsync() { + uint leadershipRetries = 0; + while (!_cts.IsCancellationRequested) { try @@ -94,7 +96,14 @@ private async Task RunAndTryToHoldLeadershipForeverAsync() } catch (Exception exception) { - logger.LogError(exception, "Failed to hold leadership."); + leadershipRetries++; + + var delay = TimeSpan + .FromSeconds(Math.Pow(2, Math.Clamp(leadershipRetries, 0, 5))) + .Add(TimeSpan.FromMilliseconds(new Random().Next(0, 1000))); + + logger.LogError(exception, "Failed to hold leadership. Wait {Seconds}s before attempting to reacquire leadership.", delay.TotalSeconds); + await Task.Delay(delay); } } } diff --git a/test/KubeOps.Operator.Test/LeaderElector/LeaderElectionBackgroundService.Test.cs b/test/KubeOps.Operator.Test/LeaderElector/LeaderElectionBackgroundService.Test.cs index a6316c70..913bef81 100644 --- a/test/KubeOps.Operator.Test/LeaderElector/LeaderElectionBackgroundService.Test.cs +++ b/test/KubeOps.Operator.Test/LeaderElector/LeaderElectionBackgroundService.Test.cs @@ -50,8 +50,8 @@ public async Task Elector_Throws_Should_Retry() await leaderElectionBackgroundService.StartAsync(CancellationToken.None); // Starting the background service should result in the lock attempt throwing, and then a subsequent attempt being made. - // Wait for the subsequent event to be signalled, if we time out the test fails. - electionLockSubsequentCallEvent.WaitOne(TimeSpan.FromMilliseconds(500)).Should().BeTrue(); + // Wait for the subsequent event to be signalled, if we time out the test fails. The retry delay requires us to wait at least 3 seconds. + electionLockSubsequentCallEvent.WaitOne(TimeSpan.FromMilliseconds(3100)).Should().BeTrue(); await leaderElectionBackgroundService.StopAsync(CancellationToken.None); }