Merge pull request #29183 from eggfoobar/add-sno-monitoring-test-update
OCPBUGS-43059: fix: sno issues with alerts and pathological events
openshift-merge-bot[bot] authored Oct 11, 2024
2 parents 3d638d0 + 2759a8f commit eadf775
Showing 8 changed files with 112 additions and 9 deletions.
6 changes: 6 additions & 0 deletions pkg/monitortestlibrary/allowedalerts/basic_alert.go
@@ -355,6 +355,12 @@ func (a *basicAlertTest) InvariantCheck(allEventIntervals monitorapi.Intervals,
state, message := a.failOrFlake(firingIntervals, pendingIntervals)

switch a.alertName {
case "KubeAPIErrorBudgetBurn":
// This is a known issue that happens less often on HA topologies but more frequently on single node
// TODO: Remove this flake when OCPBUGS-42083 is resolved
if state == fail && a.jobType.Topology == "single" {
state = flake
}
case "KubePodNotReady":
if state == fail && (kubePodNotReadyDueToImagePullBackoff(resourcesMap["events"], firingIntervals) || kubePodNotReadyDueToErrParsingSignature(resourcesMap["events"], firingIntervals)) {
// Since this is due to imagePullBackoff, change the state to flake instead of fail
@@ -733,6 +733,32 @@ func IsEventAfterInstallation(monitorEvent monitorapi.Interval, kubeClientConfig
return true, nil
}

func IsDuringAPIServerProgressingOnSNO(topology string, events monitorapi.Intervals) monitorapi.EventIntervalMatchesFunc {
if topology != "single" {
return func(eventInterval monitorapi.Interval) bool { return false }
}
ocpKubeAPIServerProgressingInterval := events.Filter(func(interval monitorapi.Interval) bool {
isNodeInstaller := interval.Message.Reason == monitorapi.NodeInstallerReason
isOperatorSource := interval.Source == monitorapi.SourceOperatorState
isKubeAPI := interval.Locator.Keys[monitorapi.LocatorClusterOperatorKey] == "kube-apiserver"

isKubeAPIInstaller := isNodeInstaller && isOperatorSource && isKubeAPI
isKubeAPIInstallProgressing := isKubeAPIInstaller && interval.Message.Annotations[monitorapi.AnnotationCondition] == "Progressing"

return isKubeAPIInstallProgressing
})

return func(i monitorapi.Interval) bool {
for _, progressingInterval := range ocpKubeAPIServerProgressingInterval {
// Before and After are not inclusive, so we buffer by 1 second here.
if progressingInterval.From.Before(i.From.Add(time.Second)) && progressingInterval.To.After(i.To.Add(-1*time.Second)) {
return true
}
}
return false
}
}

func getInstallCompletionTime(kubeClientConfig *rest.Config) *metav1.Time {
configClient, err := configclient.NewForConfig(kubeClientConfig)
if err != nil {
@@ -976,13 +1002,19 @@ func newSingleNodeKubeAPIProgressingEventMatcher(finalIntervals monitorapi.Inter
return isKubeAPIInstallProgressing
})

// We buffer 1 second since Before and After are not inclusive for time comparisons.
for i := range ocpKubeAPIServerProgressingInterval {
ocpKubeAPIServerProgressingInterval[i].From = ocpKubeAPIServerProgressingInterval[i].From.Add(time.Second * -1)
ocpKubeAPIServerProgressingInterval[i].To = ocpKubeAPIServerProgressingInterval[i].To.Add(time.Second * 1)
}

if len(ocpKubeAPIServerProgressingInterval) > 0 {
logrus.Infof("found %d OCP Kube APIServer Progressing intervals", len(ocpKubeAPIServerProgressingInterval))
}
return &OverlapOtherIntervalsPathologicalEventMatcher{
delegate: &SimplePathologicalEventMatcher{
name: "KubeAPIServerProgressingDuringSingleNodeUpgrade",
messageHumanRegex: regexp.MustCompile(`^(clusteroperator/kube-apiserver version .* changed from |Back-off restarting failed container)`),
messageHumanRegex: regexp.MustCompile(`^(clusteroperator/kube-apiserver version .* changed from |Back-off restarting failed container|.*Client\.Timeout exceeded while awaiting headers)`),
topology: &snoTopology,
},
allowIfWithinIntervals: ocpKubeAPIServerProgressingInterval,
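The containment check in IsDuringAPIServerProgressingOnSNO above (and the equivalent one-second widening of the intervals handed to the pathological-event matcher) is easiest to see with concrete timestamps. Below is a minimal, self-contained sketch of that comparison, reusing the Unix timestamps from the new unit test later in this diff; it is illustrative only and not part of the change.

package main

import (
	"fmt"
	"time"
)

// within mirrors the check above: the event interval (eFrom, eTo) counts as
// happening "during" the Progressing interval (pFrom, pTo) when it fits inside
// it, with a one-second buffer because Before/After are exclusive comparisons.
func within(pFrom, pTo, eFrom, eTo time.Time) bool {
	return pFrom.Before(eFrom.Add(time.Second)) && pTo.After(eTo.Add(-1*time.Second))
}

func main() {
	progFrom, progTo := time.Unix(1728589200, 0), time.Unix(1728589400, 0)

	// Back-off event fully inside the kube-apiserver rollout window: excluded (true).
	fmt.Println(within(progFrom, progTo, time.Unix(1728589240, 0), time.Unix(1728589345, 0)))

	// Event that started before the rollout began: still counted (false).
	fmt.Println(within(progFrom, progTo, time.Unix(1728589100, 0), time.Unix(1728589345, 0)))
}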
@@ -159,7 +159,7 @@ func TestAllowedRepeatedEvents(t *testing.T) {
msg: monitorapi.NewMessage().HumanMessage("Readiness probe failed: Get \"https://10.0.30.87:9980/readyz\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)").
Reason("ProbeError").Build(),
expectedAllowName: "",
expectedMatchName: "EtcdReadinessProbeFailuresPerRevisionChange",
expectedMatchName: "EtcdReadinessProbeFailuresPerRevisionChange,KubeAPIServerProgressingDuringSingleNodeUpgrade",
},
{
name: "multiple versions found probably in transition",
@@ -192,7 +192,7 @@ func TestAllowedRepeatedEvents(t *testing.T) {
matches, matcher := registry.MatchesAny(i)
assert.True(t, matches, "event should have matched")
require.NotNil(t, matcher, "a matcher should have been returned")
assert.Equal(t, test.expectedMatchName, matcher.Name(), "event was not matched by the correct matcher")
assert.Contains(t, test.expectedMatchName, matcher.Name(), "event was not matched by the correct matcher")
}

if test.expectedAllowName != "" {
6 changes: 6 additions & 0 deletions pkg/monitortests/node/legacynodemonitortests/exclusions.go
@@ -35,6 +35,12 @@ func isThisContainerRestartExcluded(locator string, exclusion Exclusion) bool {
containerName: "container/ingress-operator", // https://issues.redhat.com/browse/OCPBUGS-39315
topologyToExclude: "single",
},
{
// The snapshot controller operator seems to fail on SNO during kube-apiserver upgrades:
// the pod's error is that it cannot reach the kube-apiserver to get volumesnapshots on startup.
containerName: "container/snapshot-controller", // https://issues.redhat.com/browse/OCPBUGS-43113
topologyToExclude: "single",
},
{
containerName: "container/kube-multus", // https://issues.redhat.com/browse/OCPBUGS-42267
},
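For context, each exclusion entry above names a container and, optionally, a topology to which the exclusion is restricted. The following is a rough, hypothetical illustration of how such an entry could be evaluated; it is an assumption for clarity, not the actual isThisContainerRestartExcluded implementation.

package main

import (
	"fmt"
	"strings"
)

type exclusion struct {
	containerName     string
	topologyToExclude string
}

// isExcluded excuses a container restart only when the locator names an
// excluded container and the topology restriction (if any) matches.
func isExcluded(locator, topology string, exclusions []exclusion) bool {
	for _, e := range exclusions {
		if !strings.Contains(locator, e.containerName) {
			continue
		}
		if e.topologyToExclude != "" && e.topologyToExclude != topology {
			continue
		}
		return true
	}
	return false
}

func main() {
	exclusions := []exclusion{
		{containerName: "container/snapshot-controller", topologyToExclude: "single"},
	}
	// Excused on single-node, still reported on HA topologies.
	fmt.Println(isExcluded("ns/openshift-cluster-storage-operator container/snapshot-controller", "single", exclusions)) // true
	fmt.Println(isExcluded("ns/openshift-cluster-storage-operator container/snapshot-controller", "ha", exclusions))     // false
}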
5 changes: 4 additions & 1 deletion pkg/monitortests/node/legacynodemonitortests/monitortest.go
@@ -35,6 +35,9 @@ func (*legacyMonitorTests) ConstructComputedIntervals(ctx context.Context, start
}

func (w *legacyMonitorTests) EvaluateTestsFromConstructedIntervals(ctx context.Context, finalIntervals monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error) {

clusterData, _ := platformidentification.BuildClusterData(context.Background(), w.adminRESTConfig)

containerFailures, err := testContainerFailures(w.adminRESTConfig, finalIntervals)
if err != nil {
return nil, err
@@ -64,7 +67,7 @@ func (w *legacyMonitorTests) EvaluateTestsFromConstructedIntervals(ctx context.C
junits = append(junits, testNodeHasSufficientMemory(finalIntervals)...)
junits = append(junits, testNodeHasSufficientPID(finalIntervals)...)
junits = append(junits, testBackoffPullingRegistryRedhatImage(finalIntervals)...)
junits = append(junits, testBackoffStartingFailedContainer(finalIntervals)...)
junits = append(junits, testBackoffStartingFailedContainer(clusterData, finalIntervals)...)
junits = append(junits, testConfigOperatorReadinessProbe(finalIntervals)...)
junits = append(junits, testConfigOperatorProbeErrorReadinessProbe(finalIntervals)...)
junits = append(junits, testConfigOperatorProbeErrorLivenessProbe(finalIntervals)...)
@@ -5,6 +5,7 @@ import (

"github.com/openshift/origin/pkg/monitor/monitorapi"
"github.com/openshift/origin/pkg/monitortestlibrary/pathologicaleventlibrary"
"github.com/openshift/origin/pkg/monitortestlibrary/platformidentification"
"github.com/openshift/origin/pkg/test/ginkgo/junitapi"
)

@@ -55,9 +56,13 @@ func testBackoffPullingRegistryRedhatImage(events monitorapi.Intervals) []*junit
// testBackoffStartingFailedContainer looks for this symptom in core namespaces:
//
// reason/BackOff Back-off restarting failed container
func testBackoffStartingFailedContainer(events monitorapi.Intervals) []*junitapi.JUnitTestCase {
func testBackoffStartingFailedContainer(clusterData platformidentification.ClusterData, events monitorapi.Intervals) []*junitapi.JUnitTestCase {
testName := "[sig-cluster-lifecycle] pathological event should not see excessive Back-off restarting failed containers"

events = events.Filter(
monitorapi.Not(pathologicaleventlibrary.IsDuringAPIServerProgressingOnSNO(clusterData.Topology, events)),
)

return pathologicaleventlibrary.NewSingleEventThresholdCheck(testName, pathologicaleventlibrary.AllowBackOffRestartingFailedContainer,
pathologicaleventlibrary.DuplicateEventThreshold, pathologicaleventlibrary.BackoffRestartingFlakeThreshold).
NamespacedTest(events.Filter(monitorapi.Not(monitorapi.IsInE2ENamespace)))
@@ -1,11 +1,14 @@
package legacynodemonitortests

import (
"fmt"
"strings"
"testing"
"time"

"github.com/openshift/origin/pkg/monitor/monitorapi"
"github.com/openshift/origin/pkg/monitortestlibrary/pathologicaleventlibrary"
"github.com/openshift/origin/pkg/monitortestlibrary/platformidentification"
"github.com/openshift/origin/pkg/test/ginkgo/junitapi"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
@@ -80,9 +83,11 @@ func Test_testBackoffStartingFailedContainer(t *testing.T) {
samplePod := "etcd-operator-6f9b4d9d4f-4q9q8"

tests := []struct {
name string
interval monitorapi.Interval
kind string
name string
interval monitorapi.Interval
extraIntervals monitorapi.Intervals
kind string
clusterData platformidentification.ClusterData
}{
{
name: "Test pass case",
@@ -108,11 +113,49 @@ func Test_testBackoffStartingFailedContainer(t *testing.T) {
11),
kind: "flake",
},
{
name: "Test sno case",
interval: monitorapi.Interval{
Condition: monitorapi.Condition{
Locator: monitorapi.NewLocator().PodFromNames(namespace, samplePod, ""),
Message: monitorapi.NewMessage().
Reason(monitorapi.IntervalReason("BackOff")).
HumanMessage("Back-off restarting failed container").
WithAnnotation(monitorapi.AnnotationCount, fmt.Sprintf("%d", 66)).Build(),
},
Source: monitorapi.SourceOperatorState,
From: time.Unix(1728589240, 0),
To: time.Unix(1728589345, 0),
},
extraIntervals: monitorapi.Intervals{
monitorapi.Interval{
Condition: monitorapi.Condition{
Locator: monitorapi.Locator{
Keys: map[monitorapi.LocatorKey]string{
monitorapi.LocatorClusterOperatorKey: "kube-apiserver",
},
},
Message: monitorapi.Message{
Reason: monitorapi.NodeInstallerReason,
Annotations: map[monitorapi.AnnotationKey]string{
monitorapi.AnnotationCondition: "Progressing",
},
},
},
Source: monitorapi.SourceOperatorState,
From: time.Unix(1728589200, 0),
To: time.Unix(1728589400, 0),
},
},
kind: "pass",
clusterData: platformidentification.ClusterData{JobType: platformidentification.JobType{Topology: "single"}},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
e := monitorapi.Intervals{tt.interval}
junits := testBackoffStartingFailedContainer(e)
e = append(e, tt.extraIntervals...)
junits := testBackoffStartingFailedContainer(tt.clusterData, e)

// Find the junit with the namespace openshift-etcd-operator in the test name
var testJunits []*junitapi.JUnitTestCase
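To exercise just the updated unit test locally, the standard go test invocation should suffice (package path taken from the file headers above):

go test ./pkg/monitortests/node/legacynodemonitortests/... -run Test_testBackoffStartingFailedContainer -v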
8 changes: 8 additions & 0 deletions test/e2e/upgrade/dns/dns.go
@@ -10,6 +10,7 @@ import (

"github.com/onsi/ginkgo/v2"
"github.com/onsi/gomega"
exutil "github.com/openshift/origin/test/extended/util"
kappsv1 "k8s.io/api/apps/v1"
kapiv1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -64,6 +65,13 @@ func (t *UpgradeTest) Test(ctx context.Context, f *framework.Framework, done <-c
ginkgo.By("Sleeping for a minute to give it time for verifying DNS after upgrade")
time.Sleep(1 * time.Minute)

// TODO: Remove once OCPBUGS-42777 is resolved
ex := exutil.NewCLIWithFramework(f)
if isSNO, err := exutil.IsSingleNode(ctx, ex.AdminConfigClient()); err == nil && isSNO {
// Add one minute for more data to be collected for validation
time.Sleep(1 * time.Minute)
}

ginkgo.By("Validating DNS results after upgrade")
t.validateDNSResults(f)
}
