Merge pull request #25 from treydock/status-labels
Enumerate all possible states for mmces and statuses for mmhealth
treydock authored Nov 19, 2020
2 parents 5e97d1e + a0a583e commit 4b6464f
Showing 5 changed files with 262 additions and 15 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -18,7 +18,8 @@
* gpfs_fs_metadata_free_percent
* gpfs_fs_free_percent
* Remove nodename label from gpfs_perf_* metrics, replace with gpfs_perf_info metric
* mmces state metrics will have one metric per possible state, with the active state having value 1
* mmhealth status metrics will have one metric per possible status, with the active status having value 1

### Changes

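The changelog entries above describe the new layout: every possible state or status becomes its own time series, and only the one that is currently active carries the value 1. As a rough, standalone illustration (not code from this commit; the service name and observed state are invented for the example), this Go snippet prints what the mmces series for one service would look like under that scheme:

```go
package main

import "fmt"

func main() {
	// Possible CES states, matching the list enumerated in collectors/mmces.go.
	states := []string{"DEGRADED", "DEPEND", "DISABLED", "FAILED", "HEALTHY", "STARTING", "STOPPED", "SUSPENDED"}
	observed := "HEALTHY" // hypothetical state reported by mmces for the NFS service

	// One series per possible state; only the active state is 1.
	for _, s := range states {
		value := 0
		if s == observed {
			value = 1
		}
		fmt.Printf("gpfs_ces_state{service=\"NFS\",state=%q} %d\n", s, value)
	}
}
```

Any state outside the enumerated list is reported through a separate UNKNOWN series instead, as the collector changes below show.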
15 changes: 14 additions & 1 deletion collectors/mmces.go
@@ -32,6 +32,7 @@ var (
	configNodeName = kingpin.Flag("collector.mmces.nodename", "CES node name to check, defaults to FQDN").Default("").String()
	mmcesTimeout   = kingpin.Flag("collector.mmces.timeout", "Timeout for mmces execution").Default("5").Int()
	cesServices    = []string{"AUTH", "BLOCK", "NETWORK", "AUTH_OBJ", "NFS", "OBJ", "SMB", "CES"}
	cesStates      = []string{"DEGRADED", "DEPEND", "DISABLED", "FAILED", "HEALTHY", "STARTING", "STOPPED", "SUSPENDED"}
	mmcesExec      = mmces
)

@@ -94,7 +95,19 @@ func (c *MmcesCollector) Collect(ch chan<- prometheus.Metric) {
		errorMetric = 1
	}
	for _, m := range metrics {
		for _, s := range cesStates {
			var value float64
			if s == m.State {
				value = 1
			}
			ch <- prometheus.MustNewConstMetric(c.State, prometheus.GaugeValue, value, m.Service, s)
		}
		var unknown float64
		if !SliceContains(cesStates, m.State) {
			unknown = 1
			level.Warn(c.logger).Log("msg", "Unknown state encountered", "state", m.State, "service", m.Service)
		}
		ch <- prometheus.MustNewConstMetric(c.State, prometheus.GaugeValue, unknown, m.Service, "UNKNOWN")
	}
	ch <- prometheus.MustNewConstMetric(collectError, prometheus.GaugeValue, float64(errorMetric), "mmces")
	ch <- prometheus.MustNewConstMetric(collecTimeout, prometheus.GaugeValue, float64(timeout), "mmces")
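The UNKNOWN handling above depends on a SliceContains helper that is defined elsewhere in the collectors package and does not appear in this diff. A minimal sketch of what such a helper typically looks like (an assumption for illustration, not the repository's actual implementation):

```go
package collectors

// SliceContains reports whether str is present in slice.
// Sketch only; the exporter's real helper lives outside this diff.
func SliceContains(slice []string, str string) bool {
	for _, s := range slice {
		if s == str {
			return true
		}
	}
	return false
}
```

With that check in place, any state mmces reports outside the enumerated list is logged as a warning and surfaced only through the UNKNOWN series, so dashboards and alerts never silently miss a value.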
143 changes: 136 additions & 7 deletions collectors/mmces_test.go
@@ -29,7 +29,7 @@ import (
var (
mmcesStdout = `
mmcesstate::HEADER:version:reserved:reserved:NODE:AUTH:BLOCK:NETWORK:AUTH_OBJ:NFS:OBJ:SMB:CES:
mmcesstate::0:1:::ib-protocol01.domain:HEALTHY:DISABLED:HEALTHY:DISABLED:HEALTHY:DISABLED:FOO:HEALTHY:
`
)
@@ -121,24 +121,89 @@ func TestMMcesCollector(t *testing.T) {
	mmcesExec = func(nodename string, ctx context.Context) (string, error) {
		return mmcesStdout, nil
	}

expected := `
# HELP gpfs_ces_state GPFS CES health status
# TYPE gpfs_ces_state gauge
gpfs_ces_state{service="AUTH",state="DEGRADED"} 0
gpfs_ces_state{service="AUTH",state="DEPEND"} 0
gpfs_ces_state{service="AUTH",state="DISABLED"} 0
gpfs_ces_state{service="AUTH",state="FAILED"} 0
gpfs_ces_state{service="AUTH",state="HEALTHY"} 1
gpfs_ces_state{service="AUTH",state="STARTING"} 0
gpfs_ces_state{service="AUTH",state="STOPPED"} 0
gpfs_ces_state{service="AUTH",state="SUSPENDED"} 0
gpfs_ces_state{service="AUTH",state="UNKNOWN"} 0
gpfs_ces_state{service="AUTH_OBJ",state="DEGRADED"} 0
gpfs_ces_state{service="AUTH_OBJ",state="DEPEND"} 0
gpfs_ces_state{service="AUTH_OBJ",state="DISABLED"} 1
gpfs_ces_state{service="AUTH_OBJ",state="FAILED"} 0
gpfs_ces_state{service="AUTH_OBJ",state="HEALTHY"} 0
gpfs_ces_state{service="AUTH_OBJ",state="STARTING"} 0
gpfs_ces_state{service="AUTH_OBJ",state="STOPPED"} 0
gpfs_ces_state{service="AUTH_OBJ",state="SUSPENDED"} 0
gpfs_ces_state{service="AUTH_OBJ",state="UNKNOWN"} 0
gpfs_ces_state{service="BLOCK",state="DEGRADED"} 0
gpfs_ces_state{service="BLOCK",state="DEPEND"} 0
gpfs_ces_state{service="BLOCK",state="DISABLED"} 1
gpfs_ces_state{service="BLOCK",state="FAILED"} 0
gpfs_ces_state{service="BLOCK",state="HEALTHY"} 0
gpfs_ces_state{service="BLOCK",state="STARTING"} 0
gpfs_ces_state{service="BLOCK",state="STOPPED"} 0
gpfs_ces_state{service="BLOCK",state="SUSPENDED"} 0
gpfs_ces_state{service="BLOCK",state="UNKNOWN"} 0
gpfs_ces_state{service="CES",state="DEGRADED"} 0
gpfs_ces_state{service="CES",state="DEPEND"} 0
gpfs_ces_state{service="CES",state="DISABLED"} 0
gpfs_ces_state{service="CES",state="FAILED"} 0
gpfs_ces_state{service="CES",state="HEALTHY"} 1
gpfs_ces_state{service="CES",state="STARTING"} 0
gpfs_ces_state{service="CES",state="STOPPED"} 0
gpfs_ces_state{service="CES",state="SUSPENDED"} 0
gpfs_ces_state{service="CES",state="UNKNOWN"} 0
gpfs_ces_state{service="NETWORK",state="DEGRADED"} 0
gpfs_ces_state{service="NETWORK",state="DEPEND"} 0
gpfs_ces_state{service="NETWORK",state="DISABLED"} 0
gpfs_ces_state{service="NETWORK",state="FAILED"} 0
gpfs_ces_state{service="NETWORK",state="HEALTHY"} 1
gpfs_ces_state{service="NETWORK",state="STARTING"} 0
gpfs_ces_state{service="NETWORK",state="STOPPED"} 0
gpfs_ces_state{service="NETWORK",state="SUSPENDED"} 0
gpfs_ces_state{service="NETWORK",state="UNKNOWN"} 0
gpfs_ces_state{service="NFS",state="DEGRADED"} 0
gpfs_ces_state{service="NFS",state="DEPEND"} 0
gpfs_ces_state{service="NFS",state="DISABLED"} 0
gpfs_ces_state{service="NFS",state="FAILED"} 0
gpfs_ces_state{service="NFS",state="HEALTHY"} 1
gpfs_ces_state{service="NFS",state="STARTING"} 0
gpfs_ces_state{service="NFS",state="STOPPED"} 0
gpfs_ces_state{service="NFS",state="SUSPENDED"} 0
gpfs_ces_state{service="NFS",state="UNKNOWN"} 0
gpfs_ces_state{service="OBJ",state="DEGRADED"} 0
gpfs_ces_state{service="OBJ",state="DEPEND"} 0
gpfs_ces_state{service="OBJ",state="DISABLED"} 1
gpfs_ces_state{service="SMB",state="HEALTHY"} 1
gpfs_ces_state{service="OBJ",state="FAILED"} 0
gpfs_ces_state{service="OBJ",state="HEALTHY"} 0
gpfs_ces_state{service="OBJ",state="STARTING"} 0
gpfs_ces_state{service="OBJ",state="STOPPED"} 0
gpfs_ces_state{service="OBJ",state="SUSPENDED"} 0
gpfs_ces_state{service="OBJ",state="UNKNOWN"} 0
gpfs_ces_state{service="SMB",state="DEGRADED"} 0
gpfs_ces_state{service="SMB",state="DEPEND"} 0
gpfs_ces_state{service="SMB",state="DISABLED"} 0
gpfs_ces_state{service="SMB",state="FAILED"} 0
gpfs_ces_state{service="SMB",state="HEALTHY"} 0
gpfs_ces_state{service="SMB",state="STARTING"} 0
gpfs_ces_state{service="SMB",state="STOPPED"} 0
gpfs_ces_state{service="SMB",state="SUSPENDED"} 0
gpfs_ces_state{service="SMB",state="UNKNOWN"} 1
`
	collector := NewMmcesCollector(log.NewNopLogger())
	gatherers := setupGatherer(collector)
	if val, err := testutil.GatherAndCount(gatherers); err != nil {
		t.Errorf("Unexpected error: %v", err)
	} else if val != 75 {
		t.Errorf("Unexpected collection count %d, expected 75", val)
	}
	if err := testutil.GatherAndCompare(gatherers, strings.NewReader(expected), "gpfs_ces_state"); err != nil {
		t.Errorf("unexpected collecting result:\n%s", err)
@@ -155,21 +220,85 @@ func TestMMcesCollectorHostname(t *testing.T) {
expected := `
# HELP gpfs_ces_state GPFS CES health status
# TYPE gpfs_ces_state gauge
gpfs_ces_state{service="AUTH",state="DEGRADED"} 0
gpfs_ces_state{service="AUTH",state="DEPEND"} 0
gpfs_ces_state{service="AUTH",state="DISABLED"} 0
gpfs_ces_state{service="AUTH",state="FAILED"} 0
gpfs_ces_state{service="AUTH",state="HEALTHY"} 1
gpfs_ces_state{service="AUTH",state="STARTING"} 0
gpfs_ces_state{service="AUTH",state="STOPPED"} 0
gpfs_ces_state{service="AUTH",state="SUSPENDED"} 0
gpfs_ces_state{service="AUTH",state="UNKNOWN"} 0
gpfs_ces_state{service="AUTH_OBJ",state="DEGRADED"} 0
gpfs_ces_state{service="AUTH_OBJ",state="DEPEND"} 0
gpfs_ces_state{service="AUTH_OBJ",state="DISABLED"} 1
gpfs_ces_state{service="AUTH_OBJ",state="FAILED"} 0
gpfs_ces_state{service="AUTH_OBJ",state="HEALTHY"} 0
gpfs_ces_state{service="AUTH_OBJ",state="STARTING"} 0
gpfs_ces_state{service="AUTH_OBJ",state="STOPPED"} 0
gpfs_ces_state{service="AUTH_OBJ",state="SUSPENDED"} 0
gpfs_ces_state{service="AUTH_OBJ",state="UNKNOWN"} 0
gpfs_ces_state{service="BLOCK",state="DEGRADED"} 0
gpfs_ces_state{service="BLOCK",state="DEPEND"} 0
gpfs_ces_state{service="BLOCK",state="DISABLED"} 1
gpfs_ces_state{service="BLOCK",state="FAILED"} 0
gpfs_ces_state{service="BLOCK",state="HEALTHY"} 0
gpfs_ces_state{service="BLOCK",state="STARTING"} 0
gpfs_ces_state{service="BLOCK",state="STOPPED"} 0
gpfs_ces_state{service="BLOCK",state="SUSPENDED"} 0
gpfs_ces_state{service="BLOCK",state="UNKNOWN"} 0
gpfs_ces_state{service="CES",state="DEGRADED"} 0
gpfs_ces_state{service="CES",state="DEPEND"} 0
gpfs_ces_state{service="CES",state="DISABLED"} 0
gpfs_ces_state{service="CES",state="FAILED"} 0
gpfs_ces_state{service="CES",state="HEALTHY"} 1
gpfs_ces_state{service="CES",state="STARTING"} 0
gpfs_ces_state{service="CES",state="STOPPED"} 0
gpfs_ces_state{service="CES",state="SUSPENDED"} 0
gpfs_ces_state{service="CES",state="UNKNOWN"} 0
gpfs_ces_state{service="NETWORK",state="DEGRADED"} 0
gpfs_ces_state{service="NETWORK",state="DEPEND"} 0
gpfs_ces_state{service="NETWORK",state="DISABLED"} 0
gpfs_ces_state{service="NETWORK",state="FAILED"} 0
gpfs_ces_state{service="NETWORK",state="HEALTHY"} 1
gpfs_ces_state{service="NETWORK",state="STARTING"} 0
gpfs_ces_state{service="NETWORK",state="STOPPED"} 0
gpfs_ces_state{service="NETWORK",state="SUSPENDED"} 0
gpfs_ces_state{service="NETWORK",state="UNKNOWN"} 0
gpfs_ces_state{service="NFS",state="DEGRADED"} 0
gpfs_ces_state{service="NFS",state="DEPEND"} 0
gpfs_ces_state{service="NFS",state="DISABLED"} 0
gpfs_ces_state{service="NFS",state="FAILED"} 0
gpfs_ces_state{service="NFS",state="HEALTHY"} 1
gpfs_ces_state{service="NFS",state="STARTING"} 0
gpfs_ces_state{service="NFS",state="STOPPED"} 0
gpfs_ces_state{service="NFS",state="SUSPENDED"} 0
gpfs_ces_state{service="NFS",state="UNKNOWN"} 0
gpfs_ces_state{service="OBJ",state="DEGRADED"} 0
gpfs_ces_state{service="OBJ",state="DEPEND"} 0
gpfs_ces_state{service="OBJ",state="DISABLED"} 1
gpfs_ces_state{service="SMB",state="HEALTHY"} 1
gpfs_ces_state{service="OBJ",state="FAILED"} 0
gpfs_ces_state{service="OBJ",state="HEALTHY"} 0
gpfs_ces_state{service="OBJ",state="STARTING"} 0
gpfs_ces_state{service="OBJ",state="STOPPED"} 0
gpfs_ces_state{service="OBJ",state="SUSPENDED"} 0
gpfs_ces_state{service="OBJ",state="UNKNOWN"} 0
gpfs_ces_state{service="SMB",state="DEGRADED"} 0
gpfs_ces_state{service="SMB",state="DEPEND"} 0
gpfs_ces_state{service="SMB",state="DISABLED"} 0
gpfs_ces_state{service="SMB",state="FAILED"} 0
gpfs_ces_state{service="SMB",state="HEALTHY"} 0
gpfs_ces_state{service="SMB",state="STARTING"} 0
gpfs_ces_state{service="SMB",state="STOPPED"} 0
gpfs_ces_state{service="SMB",state="SUSPENDED"} 0
gpfs_ces_state{service="SMB",state="UNKNOWN"} 1
`
	collector := NewMmcesCollector(log.NewNopLogger())
	gatherers := setupGatherer(collector)
	if val, err := testutil.GatherAndCount(gatherers); err != nil {
		t.Errorf("Unexpected error: %v", err)
	} else if val != 75 {
		t.Errorf("Unexpected collection count %d, expected 75", val)
	}
	if err := testutil.GatherAndCompare(gatherers, strings.NewReader(expected), "gpfs_ces_state"); err != nil {
		t.Errorf("unexpected collecting result:\n%s", err)
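Both tests now expect GatherAndCount to return 75: 8 CES services times 9 series each (the 8 enumerated states plus UNKNOWN) gives 72 gpfs_ces_state series, and the remaining 3 are presumably the exporter's per-collector bookkeeping metrics (collect error, collect timeout, and collector duration), which are registered outside this diff, so that split is an assumption. A small standalone check of the arithmetic:

```go
package main

import "fmt"

func main() {
	cesServices := []string{"AUTH", "BLOCK", "NETWORK", "AUTH_OBJ", "NFS", "OBJ", "SMB", "CES"}
	cesStates := []string{"DEGRADED", "DEPEND", "DISABLED", "FAILED", "HEALTHY", "STARTING", "STOPPED", "SUSPENDED"}

	perService := len(cesStates) + 1 // enumerated states plus the synthetic UNKNOWN series
	stateSeries := len(cesServices) * perService

	// Assumption: three extra per-collector metrics (error, timeout, duration)
	// are gathered alongside gpfs_ces_state.
	fmt.Println(stateSeries, stateSeries+3) // prints: 72 75
}
```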
18 changes: 16 additions & 2 deletions collectors/mmhealth.go
@@ -36,7 +36,8 @@ var (
"entitytype": "EntityType",
"status": "Status",
}
mmhealthExec = mmhealth
mmhealthStatuses = []string{"CHECKING", "DEGRADED", "DEPEND", "DISABLED", "FAILED", "HEALTHY", "STARTING", "STOPPED", "SUSPENDED", "TIPS"}
mmhealthExec = mmhealth
)

type HealthMetric struct {
@@ -81,7 +82,20 @@ func (c *MmhealthCollector) Collect(ch chan<- prometheus.Metric) {
		errorMetric = 1
	}
	for _, m := range metrics {
		for _, s := range mmhealthStatuses {
			var value float64
			if s == m.Status {
				value = 1
			}
			ch <- prometheus.MustNewConstMetric(c.State, prometheus.GaugeValue, value, m.Component, m.EntityName, m.EntityType, s)
		}
		var unknown float64
		if !SliceContains(mmhealthStatuses, m.Status) {
			unknown = 1
			level.Warn(c.logger).Log("msg", "Unknown status encountered", "status", m.Status,
				"component", m.Component, "entityname", m.EntityName, "entitytype", m.EntityType)
		}
		ch <- prometheus.MustNewConstMetric(c.State, prometheus.GaugeValue, unknown, m.Component, m.EntityName, m.EntityType, "UNKNOWN")
	}
	ch <- prometheus.MustNewConstMetric(collectError, prometheus.GaugeValue, float64(errorMetric), "mmhealth")
	ch <- prometheus.MustNewConstMetric(collecTimeout, prometheus.GaugeValue, float64(timeout), "mmhealth")
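The mmhealth change mirrors the mmces one: emit a series for every enumerated status, log anything unexpected, and surface it only through an UNKNOWN series. Purely as a hypothetical refactoring (not part of this commit, and the function name is invented), the shared pattern could be factored into a single helper along these lines, assuming the state or status label is declared last in the metric's Desc, as both collectors do:

```go
package collectors

import "github.com/prometheus/client_golang/prometheus"

// emitEnumerated sends one gauge per possible value (1 for the observed value,
// 0 otherwise) plus an UNKNOWN series that is 1 only when the observed value
// is not in the enumerated list. Hypothetical sketch, not repository code.
func emitEnumerated(ch chan<- prometheus.Metric, desc *prometheus.Desc, possible []string, observed string, labels ...string) {
	known := false
	for _, v := range possible {
		var value float64
		if v == observed {
			value = 1
			known = true
		}
		lv := append(append([]string{}, labels...), v) // copy to avoid sharing a backing array
		ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, value, lv...)
	}
	var unknown float64
	if !known {
		unknown = 1
	}
	lv := append(append([]string{}, labels...), "UNKNOWN")
	ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, unknown, lv...)
}
```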