From 8c9d06f4a483d177f0ce370f923bce2b6e7198ab Mon Sep 17 00:00:00 2001 From: treydock Date: Thu, 21 Dec 2023 10:58:02 -0500 Subject: [PATCH] Add infiniband_switch_uptime_seconds from ibswinfo (#19) --- collectors/ibswinfo.go | 31 +++++++++++++++++++++++++++++++ collectors/ibswinfo_test.go | 17 ++++++++++++----- 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/collectors/ibswinfo.go b/collectors/ibswinfo.go index a28b439..b1beb79 100644 --- a/collectors/ibswinfo.go +++ b/collectors/ibswinfo.go @@ -43,6 +43,7 @@ type IbswinfoCollector struct { logger log.Logger collector string HardwareInfo *prometheus.Desc + Uptime *prometheus.Desc PowerSupplyStatus *prometheus.Desc PowerSupplyDCPower *prometheus.Desc PowerSupplyFanStatus *prometheus.Desc @@ -58,6 +59,7 @@ type Ibswinfo struct { SerialNumber string PSID string FirmwareVersion string + Uptime float64 PowerSupplies []SwitchPowerSupply Temp float64 FanStatus string @@ -88,6 +90,8 @@ func NewIbswinfoCollector(devices *[]InfinibandDevice, runonce bool, logger log. collector: collector, HardwareInfo: prometheus.NewDesc(prometheus.BuildFQName(namespace, "switch", "hardware_info"), "Infiniband switch hardware info", []string{"guid", "firmware_version", "psid", "part_number", "serial_number", "switch"}, nil), + Uptime: prometheus.NewDesc(prometheus.BuildFQName(namespace, "switch", "uptime_seconds"), + "Infiniband switch uptime in seconds", []string{"guid"}, nil), PowerSupplyStatus: prometheus.NewDesc(prometheus.BuildFQName(namespace, "switch", "power_supply_status_info"), "Infiniband switch power supply status", []string{"guid", "psu", "status"}, nil), PowerSupplyDCPower: prometheus.NewDesc(prometheus.BuildFQName(namespace, "switch", "power_supply_dc_power_status_info"), @@ -107,6 +111,7 @@ func NewIbswinfoCollector(devices *[]InfinibandDevice, runonce bool, logger log. func (s *IbswinfoCollector) Describe(ch chan<- *prometheus.Desc) { ch <- s.HardwareInfo + ch <- s.Uptime ch <- s.PowerSupplyStatus ch <- s.PowerSupplyDCPower ch <- s.PowerSupplyFanStatus @@ -122,6 +127,7 @@ func (s *IbswinfoCollector) Collect(ch chan<- prometheus.Metric) { for _, swinfo := range swinfos { ch <- prometheus.MustNewConstMetric(s.HardwareInfo, prometheus.GaugeValue, 1, swinfo.device.GUID, swinfo.FirmwareVersion, swinfo.PSID, swinfo.PartNumber, swinfo.SerialNumber, swinfo.device.Name) + ch <- prometheus.MustNewConstMetric(s.Uptime, prometheus.GaugeValue, swinfo.Uptime, swinfo.device.GUID) for _, psu := range swinfo.PowerSupplies { if psu.Status != "" { ch <- prometheus.MustNewConstMetric(s.PowerSupplyStatus, prometheus.GaugeValue, 1, swinfo.device.GUID, psu.ID, psu.Status) @@ -234,6 +240,7 @@ func parse_ibswinfo(out string, logger log.Logger) (Ibswinfo, error) { data.Temp = math.NaN() lines := strings.Split(out, "\n") psus := make(map[string]SwitchPowerSupply) + var err error var powerSupplies []SwitchPowerSupply var fans []SwitchFan var psuID string @@ -260,6 +267,30 @@ func parse_ibswinfo(out string, logger log.Logger) (Ibswinfo, error) { case "firmware version": data.FirmwareVersion = value } + if strings.HasPrefix(key, "uptime") { + // Convert Nd-H:M:S to time that ParseDuration understands + var days float64 + uptimeHMS := "" + uptime_s1 := strings.Split(value, "-") + if len(uptime_s1) == 2 { + daysStr := strings.Replace(uptime_s1[0], "d", "", 1) + days, err = strconv.ParseFloat(daysStr, 64) + if err != nil { + level.Error(logger).Log("msg", "Unable to parse uptime duration", "err", err, "value", value) + continue + } + uptimeHMS = uptime_s1[1] + } else { + uptimeHMS = value + } + t1, err := time.Parse("15:04:05", uptimeHMS) + if err != nil { + level.Error(logger).Log("msg", "Unable to parse uptime duration", "err", err, "value", value) + continue + } + t2, _ := time.Parse("15:04:05", "00:00:00") + data.Uptime = (days * 86400) + t1.Sub(t2).Seconds() + } var psu SwitchPowerSupply psu.PowerW = math.NaN() matchesPSU := rePSU.FindStringSubmatch(key) diff --git a/collectors/ibswinfo_test.go b/collectors/ibswinfo_test.go index 406353e..0c21d38 100644 --- a/collectors/ibswinfo_test.go +++ b/collectors/ibswinfo_test.go @@ -50,6 +50,9 @@ func TestParseIBSWInfo(t *testing.T) { if data.FirmwareVersion != "11.2008.2102" { t.Errorf("Unexpected firmware version, got %s", data.FirmwareVersion) } + if data.Uptime != 13862333 { + t.Errorf("Unexpected uptime, got %f", data.Uptime) + } if len(data.PowerSupplies) != 2 { t.Errorf("Unexpected number of power supplies, got %d", len(data.PowerSupplies)) } @@ -270,19 +273,23 @@ func TestIbswinfoCollector(t *testing.T) { # TYPE infiniband_switch_temperature_celsius gauge infiniband_switch_temperature_celsius{guid="0x506b4b03005c2740"} 53 infiniband_switch_temperature_celsius{guid="0x7cfe9003009ce5b0"} 45 + # HELP infiniband_switch_uptime_seconds Infiniband switch uptime in seconds + # TYPE infiniband_switch_uptime_seconds gauge + infiniband_switch_uptime_seconds{guid="0x506b4b03005c2740"} 8301347 + infiniband_switch_uptime_seconds{guid="0x7cfe9003009ce5b0"} 13862333 ` collector := NewIbswinfoCollector(&switchDevices, false, log.NewNopLogger()) gatherers := setupGatherer(collector) if val, err := testutil.GatherAndCount(gatherers); err != nil { t.Errorf("Unexpected error: %v", err) - } else if val != 42 { - t.Errorf("Unexpected collection count %d, expected 42", val) + } else if val != 44 { + t.Errorf("Unexpected collection count %d, expected 44", val) } if err := testutil.GatherAndCompare(gatherers, strings.NewReader(expected), "infiniband_switch_power_supply_status_info", "infiniband_switch_power_supply_dc_power_status_info", "infiniband_switch_power_supply_fan_status_info", "infiniband_switch_power_supply_watts", "infiniband_switch_temperature_celsius", "infiniband_switch_fan_status_info", "infiniband_switch_fan_rpm", - "infiniband_switch_hardware_info", + "infiniband_switch_hardware_info", "infiniband_switch_uptime_seconds", "infiniband_exporter_collect_errors", "infiniband_exporter_collect_timeouts"); err != nil { t.Errorf("unexpected collecting result:\n%s", err) } @@ -307,8 +314,8 @@ func TestIbswinfoCollectorMissingStatus(t *testing.T) { gatherers := setupGatherer(collector) if val, err := testutil.GatherAndCount(gatherers); err != nil { t.Errorf("Unexpected error: %v", err) - } else if val != 35 { - t.Errorf("Unexpected collection count %d, expected 42", val) + } else if val != 37 { + t.Errorf("Unexpected collection count %d, expected 37", val) } }