Skip to content

Commit

Permalink
feat:device add raw rate & FDR effective lane rate accurate to 13.64 (#…
Browse files Browse the repository at this point in the history
…16)

* feat:device add raw rate & FDR effective lane rate accurate to 13.64

* chore: add raw rate metric description

* perf(ibnetdiscoverParse): handle names that contain spaces and are therefore split into multiple items

* perf(ibnetdiscoverParse): handle names that contain spaces and are therefore split into multiple items

* fix: test

* fix:test
  • Loading branch information
3th1nk authored Dec 3, 2023
1 parent 49299f5 commit 60572b5
Show file tree
Hide file tree
Showing 10 changed files with 204 additions and 81 deletions.
4 changes: 2 additions & 2 deletions collectors/collectors_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,12 @@ var (
mockedStdout string
_, cancel = context.WithTimeout(context.Background(), 5*time.Second)
switchDevices = []InfinibandDevice{
{Type: "SW", LID: "2052", GUID: "0x506b4b03005c2740", Rate: (25 * 4 * 125000000), Name: "ib-i4l1s01",
{Type: "SW", LID: "2052", GUID: "0x506b4b03005c2740", Rate: (25 * 4 * 125000000), RawRate: 1.2890625e+10, Name: "ib-i4l1s01",
Uplinks: map[string]InfinibandUplink{
"35": {Type: "CA", LID: "1432", PortNumber: "1", GUID: "0x506b4b0300cc02a6", Name: "p0001"},
},
},
{Type: "SW", LID: "1719", GUID: "0x7cfe9003009ce5b0", Rate: (25 * 4 * 125000000), Name: "ib-i1l1s01",
{Type: "SW", LID: "1719", GUID: "0x7cfe9003009ce5b0", Rate: (25 * 4 * 125000000), RawRate: 1.2890625e+10, Name: "ib-i1l1s01",
Uplinks: map[string]InfinibandUplink{
"1": {Type: "SW", LID: "1516", PortNumber: "1", GUID: "0x7cfe900300b07320", Name: "ib-i1l2s01"},
"10": {Type: "CA", LID: "134", PortNumber: "1", GUID: "0x7cfe9003003b4bde", Name: "o0001"},
Expand Down
5 changes: 5 additions & 0 deletions collectors/fixtures/ibnetdiscover/test2.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
CA 78 1 0x946dae0300630bfe 4x HDR - SW 51 81 0x946dae0300630bf6 ( 'Mellanox Technologies Aggregation Node' - '5FB0405-leaf-IB01 ' )
CA 88 1 0xb83fd20300da1138 4x HDR - SW 51 79 0x946dae0300630bf6 ( 'worker20 mlx5_3' - '5FB0405-leaf-IB01 ' )
SW 24 1 0x946dae0300618c82 4x SDR '5FB0406-spine-IB01 '
SW 25 1 0x946dae0300618c83 4x SDR ' 5FB0406-spine-IB02 '
SW 9 81 0x946dae030053ec1a 4x HDR - CA 60 1 0x946dae0300630bfe ( ' 5FB0406-spine-IB03 ' - 'Mellanox Technologies Aggregation Node' )
12 changes: 9 additions & 3 deletions collectors/hca.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ type HCACollector struct {
PortVLMappingErrors *prometheus.Desc
PortLoopingErrors *prometheus.Desc
Rate *prometheus.Desc
RawRate *prometheus.Desc
Uplink *prometheus.Desc
Info *prometheus.Desc
}
Expand Down Expand Up @@ -137,6 +138,8 @@ func NewHCACollector(devices *[]InfinibandDevice, runonce bool, logger log.Logge
"Infiniband HCA port PortLoopingErrors", labels, nil),
Rate: prometheus.NewDesc(prometheus.BuildFQName(namespace, "hca", "rate_bytes_per_second"),
"Infiniband HCA rate", []string{"guid"}, nil),
RawRate: prometheus.NewDesc(prometheus.BuildFQName(namespace, "hca", "raw_rate_bytes_per_second"),
"Infiniband HCA raw rate", []string{"guid"}, nil),
Uplink: prometheus.NewDesc(prometheus.BuildFQName(namespace, "hca", "uplink_info"),
"Infiniband HCA uplink information", append(labels, []string{"hca", "uplink", "uplink_guid", "uplink_type", "uplink_port", "uplink_lid"}...), nil),
Info: prometheus.NewDesc(prometheus.BuildFQName(namespace, "hca", "info"),
Expand Down Expand Up @@ -174,6 +177,7 @@ func (h *HCACollector) Describe(ch chan<- *prometheus.Desc) {
ch <- h.PortVLMappingErrors
ch <- h.PortLoopingErrors
ch <- h.Rate
ch <- h.RawRate
ch <- h.Uplink
ch <- h.Info
}
Expand Down Expand Up @@ -270,6 +274,7 @@ func (h *HCACollector) Collect(ch chan<- prometheus.Metric) {
if *hcaCollectBase {
for _, device := range *h.devices {
ch <- prometheus.MustNewConstMetric(h.Rate, prometheus.GaugeValue, device.Rate, device.GUID)
ch <- prometheus.MustNewConstMetric(h.RawRate, prometheus.GaugeValue, device.RawRate, device.GUID)
ch <- prometheus.MustNewConstMetric(h.Info, prometheus.GaugeValue, 1, device.GUID, device.Name, device.LID)
for port, uplink := range device.Uplinks {
ch <- prometheus.MustNewConstMetric(h.Uplink, prometheus.GaugeValue, 1, device.GUID, port, device.Name, uplink.Name, uplink.GUID, uplink.Type, uplink.PortNumber, uplink.LID)
Expand All @@ -294,7 +299,10 @@ func (h *HCACollector) collect() ([]PerfQueryCounters, float64, float64) {
limit <- 1
wg.Add(1)
go func(device InfinibandDevice) {
defer wg.Done()
defer func() {
<-limit
wg.Done()
}()
ctxExtended, cancelExtended := context.WithTimeout(context.Background(), *perfqueryTimeout)
defer cancelExtended()
ports := getDevicePorts(device.Uplinks)
Expand All @@ -308,7 +316,6 @@ func (h *HCACollector) collect() ([]PerfQueryCounters, float64, float64) {
errors++
}
if err != nil {
<-limit
return
}
deviceCounters, errs := perfqueryParse(device, extendedOut, h.logger)
Expand Down Expand Up @@ -340,7 +347,6 @@ func (h *HCACollector) collect() ([]PerfQueryCounters, float64, float64) {
countersLock.Unlock()
}
}
<-limit
}(device)
}
wg.Wait()
Expand Down
36 changes: 22 additions & 14 deletions collectors/hca_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,12 @@ import (

var (
hcaDevices = []InfinibandDevice{
{Type: "CA", LID: "133", GUID: "0x7cfe9003003b4b96", Rate: (25 * 4 * 125000000), Name: "o0002",
{Type: "CA", LID: "133", GUID: "0x7cfe9003003b4b96", Rate: (25 * 4 * 125000000), RawRate: 1.2890625e+10, Name: "o0002",
Uplinks: map[string]InfinibandUplink{
"1": {Type: "SW", LID: "1719", PortNumber: "11", GUID: "0x7cfe9003009ce5b0", Name: "ib-i1l1s01"},
},
},
{Type: "CA", LID: "134", GUID: "0x7cfe9003003b4bde", Rate: (25 * 4 * 125000000), Name: "o0001",
{Type: "CA", LID: "134", GUID: "0x7cfe9003003b4bde", Rate: (25 * 4 * 125000000), RawRate: 1.2890625e+10, Name: "o0001",
Uplinks: map[string]InfinibandUplink{
"1": {Type: "SW", LID: "1719", PortNumber: "10", GUID: "0x7cfe9003009ce5b0", Name: "ib-i1l1s01"},
},
Expand Down Expand Up @@ -145,6 +145,10 @@ func TestHCACollector(t *testing.T) {
# TYPE infiniband_hca_rate_bytes_per_second gauge
infiniband_hca_rate_bytes_per_second{guid="0x7cfe9003003b4b96"} 1.25e+10
infiniband_hca_rate_bytes_per_second{guid="0x7cfe9003003b4bde"} 1.25e+10
# HELP infiniband_hca_raw_rate_bytes_per_second Infiniband HCA raw rate
# TYPE infiniband_hca_raw_rate_bytes_per_second gauge
infiniband_hca_raw_rate_bytes_per_second{guid="0x7cfe9003003b4b96"} 1.2890625e+10
infiniband_hca_raw_rate_bytes_per_second{guid="0x7cfe9003003b4bde"} 1.2890625e+10
# HELP infiniband_hca_uplink_info Infiniband HCA uplink information
# TYPE infiniband_hca_uplink_info gauge
infiniband_hca_uplink_info{guid="0x7cfe9003003b4b96",hca="o0002",port="1",uplink="ib-i1l1s01",uplink_guid="0x7cfe9003009ce5b0",uplink_lid="1719",uplink_port="11",uplink_type="SW"} 1
Expand All @@ -154,8 +158,8 @@ func TestHCACollector(t *testing.T) {
gatherers := setupGatherer(collector)
if val, err := testutil.GatherAndCount(gatherers); err != nil {
t.Errorf("Unexpected error: %v", err)
} else if val != 53 {
t.Errorf("Unexpected collection count %d, expected 53", val)
} else if val != 55 {
t.Errorf("Unexpected collection count %d, expected 55", val)
}
if err := testutil.GatherAndCompare(gatherers, strings.NewReader(expected),
"infiniband_hca_port_excessive_buffer_overrun_errors_total", "infiniband_hca_port_link_downed_total",
Expand All @@ -170,7 +174,7 @@ func TestHCACollector(t *testing.T) {
"infiniband_hca_port_transmit_wait_total", "infiniband_hca_port_unicast_receive_packets_total",
"infiniband_hca_port_unicast_transmit_packets_total", "infiniband_hca_port_vl15_dropped_total",
"infiniband_hca_port_buffer_overrun_errors_total",
"infiniband_hca_info", "infiniband_hca_rate_bytes_per_second", "infiniband_hca_uplink_info",
"infiniband_hca_info", "infiniband_hca_rate_bytes_per_second", "infiniband_hca_raw_rate_bytes_per_second", "infiniband_hca_uplink_info",
"infiniband_exporter_collect_errors", "infiniband_exporter_collect_timeouts"); err != nil {
t.Errorf("unexpected collecting result:\n%s", err)
}
Expand Down Expand Up @@ -308,6 +312,10 @@ func TestHCACollectorFull(t *testing.T) {
# TYPE infiniband_hca_rate_bytes_per_second gauge
infiniband_hca_rate_bytes_per_second{guid="0x7cfe9003003b4b96"} 1.25e+10
infiniband_hca_rate_bytes_per_second{guid="0x7cfe9003003b4bde"} 1.25e+10
# HELP infiniband_hca_raw_rate_bytes_per_second Infiniband HCA raw rate
# TYPE infiniband_hca_raw_rate_bytes_per_second gauge
infiniband_hca_raw_rate_bytes_per_second{guid="0x7cfe9003003b4b96"} 1.2890625e+10
infiniband_hca_raw_rate_bytes_per_second{guid="0x7cfe9003003b4bde"} 1.2890625e+10
# HELP infiniband_hca_uplink_info Infiniband HCA uplink information
# TYPE infiniband_hca_uplink_info gauge
infiniband_hca_uplink_info{guid="0x7cfe9003003b4b96",hca="o0002",port="1",uplink="ib-i1l1s01",uplink_guid="0x7cfe9003009ce5b0",uplink_lid="1719",uplink_port="11",uplink_type="SW"} 1
Expand All @@ -317,8 +325,8 @@ func TestHCACollectorFull(t *testing.T) {
gatherers := setupGatherer(collector)
if val, err := testutil.GatherAndCount(gatherers); err != nil {
t.Errorf("Unexpected error: %v", err)
} else if val != 65 {
t.Errorf("Unexpected collection count %d, expected 65", val)
} else if val != 67 {
t.Errorf("Unexpected collection count %d, expected 67", val)
}
if err := testutil.GatherAndCompare(gatherers, strings.NewReader(expected),
"infiniband_hca_port_excessive_buffer_overrun_errors_total", "infiniband_hca_port_link_downed_total",
Expand All @@ -335,7 +343,7 @@ func TestHCACollectorFull(t *testing.T) {
"infiniband_hca_port_buffer_overrun_errors_total", "infiniband_hca_port_dli_mapping_errors_total",
"infiniband_hca_port_local_physical_errors_total", "infiniband_hca_port_looping_errors_total",
"infiniband_hca_port_malformed_packet_errors_total", "infiniband_hca_port_vl_mapping_errors_total",
"infiniband_hca_info", "infiniband_hca_rate_bytes_per_second", "infiniband_hca_uplink_info",
"infiniband_hca_info", "infiniband_hca_rate_bytes_per_second", "infiniband_hca_raw_rate_bytes_per_second", "infiniband_hca_uplink_info",
"infiniband_exporter_collect_errors", "infiniband_exporter_collect_timeouts"); err != nil {
t.Errorf("unexpected collecting result:\n%s", err)
}
Expand All @@ -358,8 +366,8 @@ func TestHCACollectorError(t *testing.T) {
gatherers := setupGatherer(collector)
if val, err := testutil.GatherAndCount(gatherers); err != nil {
t.Errorf("Unexpected error: %v", err)
} else if val != 9 {
t.Errorf("Unexpected collection count %d, expected 9", val)
} else if val != 11 {
t.Errorf("Unexpected collection count %d, expected 11", val)
}
if err := testutil.GatherAndCompare(gatherers, strings.NewReader(expected),
"infiniband_hca_port_excessive_buffer_overrun_errors_total", "infiniband_hca_port_link_downed_total",
Expand All @@ -386,8 +394,8 @@ func TestHCACollectorErrorRunonce(t *testing.T) {
gatherers := setupGatherer(collector)
if val, err := testutil.GatherAndCount(gatherers); err != nil {
t.Errorf("Unexpected error: %v", err)
} else if val != 10 {
t.Errorf("Unexpected collection count %d, expected 10", val)
} else if val != 12 {
t.Errorf("Unexpected collection count %d, expected 12", val)
}
if err := testutil.GatherAndCompare(gatherers, strings.NewReader(expected),
"infiniband_hca_port_excessive_buffer_overrun_errors_total", "infiniband_hca_port_link_downed_total",
Expand All @@ -414,8 +422,8 @@ func TestHCACollectorTimeout(t *testing.T) {
gatherers := setupGatherer(collector)
if val, err := testutil.GatherAndCount(gatherers); err != nil {
t.Errorf("Unexpected error: %v", err)
} else if val != 9 {
t.Errorf("Unexpected collection count %d, expected 9", val)
} else if val != 11 {
t.Errorf("Unexpected collection count %d, expected 11", val)
}
if err := testutil.GatherAndCompare(gatherers, strings.NewReader(expected),
"infiniband_hca_port_excessive_buffer_overrun_errors_total", "infiniband_hca_port_link_downed_total",
Expand Down
67 changes: 45 additions & 22 deletions collectors/ibnetdiscover.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,16 +35,18 @@ var (
ibnetdiscoverPath = kingpin.Flag("ibnetdiscover.path", "Path to ibnetdiscover").Default("ibnetdiscover").String()
nodeNameMap = kingpin.Flag("ibnetdiscover.node-name-map", "Path to node name map file").Default("").String()
ibnetdiscoverTimeout = kingpin.Flag("ibnetdiscover.timeout", "Timeout for ibnetdiscover execution").Default("20s").Duration()
rates = map[string]float64{
"SDR": 2,
"DDR": 4,
"QDR": 8,
"FDR10": 10,
"FDR": 14,
"EDR": 25,
"HDR": 50,
"NDR": 100,
"XDR": 250,
// IB Lane Rate Specification: {signaling rate, effective rate}, Gbps
// https://en.wikipedia.org/wiki/InfiniBand#Performance
laneRates = map[string][]float64{
"SDR": {2.5, 2},
"DDR": {5, 4},
"QDR": {10, 8},
"FDR10": {10.3125, 10},
"FDR": {14.0625, 13.64},
"EDR": {25.78125, 25},
"HDR": {50, 50},
"NDR": {100, 100},
"XDR": {250, 250},
}
)

Expand All @@ -53,6 +55,7 @@ type InfinibandDevice struct {
LID string
GUID string
Rate float64
RawRate float64
Name string
Uplinks map[string]InfinibandUplink
}
Expand Down Expand Up @@ -135,6 +138,17 @@ func ibnetdiscoverParse(out string, logger log.Logger) (*[]InfinibandDevice, *[]
level.Debug(logger).Log("msg", "Skipping line that is not connected", "line", line)
continue
}
// Check the last item: a name may contain spaces, in which case it has been split into multiple items.
name := items[len(items)-1]
if strings.HasSuffix(name, `'`) && !isPairedQuotesName(name) {
for i := len(items) - 2; i > 5; i-- {
name = items[i] + name
if isPairedQuotesName(name) {
items = append(items[:i], name)
break
}
}
}
if items[5] == "SDR" && len(items) == 7 {
level.Debug(logger).Log("msg", "Skipping split mode port", "line", line)
continue
Expand All @@ -151,12 +165,13 @@ func ibnetdiscoverParse(out string, logger log.Logger) (*[]InfinibandDevice, *[]
device.Type = items[0]
device.LID = items[1]
device.GUID = guid
rate, err := parseRate(items[4], items[5])
rawRate, effectiveRate, err := parseRate(items[4], items[5])
if err != nil {
level.Error(logger).Log("msg", "Unable to parse speed", "width", items[4], "rate", items[5], "type", device.Type, "guid", device.GUID)
return nil, nil, err
} else {
device.Rate = rate
device.Rate = effectiveRate
device.RawRate = rawRate
}
portName, uplinkName, err := parseNames(line)
if err != nil {
Expand Down Expand Up @@ -186,20 +201,28 @@ func ibnetdiscoverParse(out string, logger log.Logger) (*[]InfinibandDevice, *[]
return &switches, &hcas, nil
}

func parseRate(width string, rateStr string) (float64, error) {
var rate float64
func parseRate(width string, rateStr string) (float64, float64, error) {
widthRe := regexp.MustCompile("[0-9]+")
widthMatch := widthRe.FindAllString(width, 1)
if len(widthMatch) != 1 {
return 0, fmt.Errorf("Unable to find match for %s: %v", width, widthMatch)
return 0, 0, fmt.Errorf("Unable to find match for %s: %v", width, widthMatch)
}
widthMultipler, _ := strconv.ParseFloat(widthMatch[0], 64)
if baseRate, ok := rates[rateStr]; ok {
rate = widthMultipler * baseRate * math.Pow(1000, 3) / 8
} else {
return 0, fmt.Errorf("Unknown rate %s", rateStr)
if laneRate, ok := laneRates[rateStr]; ok {
baseRate := widthMultipler * math.Pow(1000, 3) / 8
rawRate := laneRate[0] * baseRate
effectiveRate := laneRate[1] * baseRate
return rawRate, effectiveRate, nil
}
return 0, 0, fmt.Errorf("Unknown rate %s", rateStr)
}

// isPairedQuotesName reports whether name both starts and ends with a single
// quote character, i.e. it is a complete quoted token such as 'node01'.
//
// Strings shorter than two bytes (the empty string or a lone quote) cannot
// contain a pair of quotes and yield false; the length check also prevents an
// index-out-of-range panic on empty input.
func isPairedQuotesName(name string) bool {
	if len(name) < 2 {
		return false
	}
	return name[0] == '\'' && name[len(name)-1] == '\''
}

func parseNames(line string) (string, string, error) {
Expand All @@ -208,8 +231,8 @@ func parseNames(line string) (string, string, error) {
if len(matches) != 3 {
return "", "", fmt.Errorf("Unable to extract names using regexp")
}
portName := matches[1]
uplinkName := matches[2]
portName := strings.TrimSpace(matches[1])
uplinkName := strings.TrimSpace(matches[2])
if strings.Contains(portName, " HCA") {
portName = strings.Split(portName, " ")[0]
}
Expand Down
Loading

0 comments on commit 60572b5

Please sign in to comment.