Skip to content

Commit

Permalink
Expose PSI metrics with prometheus
Browse files Browse the repository at this point in the history
This adds support for reading PSI metrics via Prometheus. We expose the
following for `psi_total`:

```
container_cpu_psi_total_seconds
container_memory_psi_total_seconds
container_io_psi_total_seconds
```

And for `psi_avg`:

```
container_cpu_psi_avg10_ratio
container_cpu_psi_avg60_ratio
container_cpu_psi_avg300_ratio

container_memory_psi_avg10_ratio
container_memory_psi_avg60_ratio
container_memory_psi_avg300_ratio

container_io_psi_avg10_ratio
container_io_psi_avg60_ratio
container_io_psi_avg300_ratio
```

Signed-off-by: Daniel Dao <[email protected]>
  • Loading branch information
dqminh committed May 23, 2022
1 parent ab9bb9e commit 5a27e49
Show file tree
Hide file tree
Showing 4 changed files with 216 additions and 0 deletions.
78 changes: 78 additions & 0 deletions metrics/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -1768,6 +1768,64 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri
})
}

if includedMetrics.Has(container.PSITotalMetrics) {
c.containerMetrics = append(c.containerMetrics, []containerMetric{
{
name: "container_cpu_psi_total_seconds",
help: "Total time spent under cpu pressure in seconds.",
valueType: prometheus.CounterValue,
extraLabels: []string{"kind"},
getValues: func(s *info.ContainerStats) metricValues {
return getPSIValues(s, &s.Cpu.PSI, "total")
},
}, {
name: "container_memory_psi_total_seconds",
help: "Total container time spent under memory pressure in seconds.",
valueType: prometheus.CounterValue,
extraLabels: []string{"kind"},
getValues: func(s *info.ContainerStats) metricValues {
return getPSIValues(s, &s.Memory.PSI, "total")
},
}, {
name: "container_io_psi_total_seconds",
help: "Total time spent under io pressure in seconds.",
valueType: prometheus.CounterValue,
extraLabels: []string{"kind"},
getValues: func(s *info.ContainerStats) metricValues {
return getPSIValues(s, &s.DiskIo.PSI, "total")
},
},
}...)
}

if includedMetrics.Has(container.PSIAvgMetrics) {
makePSIAvgMetric := func(controller, window string) containerMetric {
return containerMetric{
name: fmt.Sprintf("container_%s_psi_avg%s_ratio", controller, window),
help: fmt.Sprintf("Ratio of time spent under %s pressure over time window of %s seconds", controller, window),
valueType: prometheus.GaugeValue,
extraLabels: []string{"kind"},
getValues: func(s *info.ContainerStats) metricValues {
switch controller {
case "cpu":
return getPSIValues(s, &s.Cpu.PSI, "avg"+window)
case "memory":
return getPSIValues(s, &s.Memory.PSI, "avg"+window)
case "io":
return getPSIValues(s, &s.DiskIo.PSI, "avg"+window)
default:
return nil
}
},
}
}
for _, controller := range []string{"cpu", "memory", "io"} {
for _, window := range []string{"10", "60", "300"} {
c.containerMetrics = append(c.containerMetrics, makePSIAvgMetric(controller, window))
}
}
}

return c
}

Expand Down Expand Up @@ -2060,3 +2118,23 @@ func getMinCoreScalingRatio(s *info.ContainerStats) metricValues {
}
return values
}

// getPSIValues builds the pair of samples (labelled "some" and "full") for a
// single PSI metric taken from psi, stamping both with s.Timestamp.
//
// psiMetric selects which field is reported: "avg10", "avg60", "avg300" or
// "total". Averages are passed through unchanged; totals are converted from
// microseconds to seconds. Any other selector yields an empty, non-nil result.
func getPSIValues(s *info.ContainerStats, psi *info.PSIStats, psiMetric string) metricValues {
	var someValue, fullValue float64

	switch psiMetric {
	case "avg10":
		someValue, fullValue = psi.Some.Avg10, psi.Full.Avg10
	case "avg60":
		someValue, fullValue = psi.Some.Avg60, psi.Full.Avg60
	case "avg300":
		someValue, fullValue = psi.Some.Avg300, psi.Full.Avg300
	case "total":
		// total is measured as microseconds; express it in seconds.
		someValue = float64(time.Duration(psi.Some.Total)*time.Microsecond) / float64(time.Second)
		fullValue = float64(time.Duration(psi.Full.Total)*time.Microsecond) / float64(time.Second)
	default:
		// Unrecognised selector: nothing to report.
		return make(metricValues, 0, 2)
	}

	return metricValues{
		metricValue{value: someValue, timestamp: s.Timestamp, labels: []string{"some"}},
		metricValue{value: fullValue, timestamp: s.Timestamp, labels: []string{"full"}},
	}
}
42 changes: 42 additions & 0 deletions metrics/prometheus_fake.go
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,20 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req
RunPeriods: 984285,
},
LoadAverage: 2,
PSI: info.PSIStats{
Some: info.PSIData{
Avg10: 0.1,
Avg60: 0.2,
Avg300: 0.3,
Total: 100,
},
Full: info.PSIData{
Avg10: 0.4,
Avg60: 0.5,
Avg300: 0.6,
Total: 200,
},
},
},
Memory: info.MemoryStats{
Usage: 8,
Expand Down Expand Up @@ -346,6 +360,20 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req
RSS: 15,
MappedFile: 16,
Swap: 8192,
PSI: info.PSIStats{
Some: info.PSIData{
Avg10: 0.01,
Avg60: 0.02,
Avg300: 0.03,
Total: 1000,
},
Full: info.PSIData{
Avg10: 0.04,
Avg60: 0.05,
Avg300: 0.06,
Total: 2000,
},
},
},
Hugetlb: map[string]info.HugetlbStats{
"2Mi": {
Expand Down Expand Up @@ -538,6 +566,20 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req
"Write": 6,
},
}},
PSI: info.PSIStats{
Some: info.PSIData{
Avg10: 0.11,
Avg60: 0.12,
Avg300: 0.13,
Total: 1111,
},
Full: info.PSIData{
Avg10: 0.14,
Avg60: 0.15,
Avg300: 0.16,
Total: 2222,
},
},
},
Filesystem: []info.FsStats{
{
Expand Down
48 changes: 48 additions & 0 deletions metrics/testdata/prometheus_metrics
Original file line number Diff line number Diff line change
Expand Up @@ -433,3 +433,51 @@ container_memory_bandwidth_bytes{container_env_foo_env="prod",container_label_fo
# TYPE container_memory_bandwidth_local_bytes gauge
container_memory_bandwidth_local_bytes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node_id="0",zone_name="hello"} 2.390393e+06 1395066363000
container_memory_bandwidth_local_bytes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node_id="1",zone_name="hello"} 1.231233e+06 1395066363000
# HELP container_cpu_psi_avg10_ratio Ratio of time spent under cpu pressure over time window of 10 seconds
# TYPE container_cpu_psi_avg10_ratio gauge
container_cpu_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.4 1395066363000
container_cpu_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.1 1395066363000
# HELP container_cpu_psi_avg300_ratio Ratio of time spent under cpu pressure over time window of 300 seconds
# TYPE container_cpu_psi_avg300_ratio gauge
container_cpu_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.6 1395066363000
container_cpu_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.3 1395066363000
# HELP container_cpu_psi_avg60_ratio Ratio of time spent under cpu pressure over time window of 60 seconds
# TYPE container_cpu_psi_avg60_ratio gauge
container_cpu_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.5 1395066363000
container_cpu_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.2 1395066363000
# HELP container_cpu_psi_total_seconds Total time spent under cpu pressure in seconds.
# TYPE container_cpu_psi_total_seconds counter
container_cpu_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.0002 1395066363000
container_cpu_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.0001 1395066363000
# HELP container_io_psi_avg10_ratio Ratio of time spent under io pressure over time window of 10 seconds
# TYPE container_io_psi_avg10_ratio gauge
container_io_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.14 1395066363000
container_io_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.11 1395066363000
# HELP container_io_psi_avg300_ratio Ratio of time spent under io pressure over time window of 300 seconds
# TYPE container_io_psi_avg300_ratio gauge
container_io_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.16 1395066363000
container_io_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.13 1395066363000
# HELP container_io_psi_avg60_ratio Ratio of time spent under io pressure over time window of 60 seconds
# TYPE container_io_psi_avg60_ratio gauge
container_io_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.15 1395066363000
container_io_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.12 1395066363000
# HELP container_io_psi_total_seconds Total time spent under io pressure in seconds.
# TYPE container_io_psi_total_seconds counter
container_io_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.002222 1395066363000
container_io_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.001111 1395066363000
# HELP container_memory_psi_avg10_ratio Ratio of time spent under memory pressure over time window of 10 seconds
# TYPE container_memory_psi_avg10_ratio gauge
container_memory_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.04 1395066363000
container_memory_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.01 1395066363000
# HELP container_memory_psi_avg300_ratio Ratio of time spent under memory pressure over time window of 300 seconds
# TYPE container_memory_psi_avg300_ratio gauge
container_memory_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.06 1395066363000
container_memory_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.03 1395066363000
# HELP container_memory_psi_avg60_ratio Ratio of time spent under memory pressure over time window of 60 seconds
# TYPE container_memory_psi_avg60_ratio gauge
container_memory_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.05 1395066363000
container_memory_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.02 1395066363000
# HELP container_memory_psi_total_seconds Total container time spent under memory pressure in seconds.
# TYPE container_memory_psi_total_seconds counter
container_memory_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.002 1395066363000
container_memory_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.001 1395066363000
48 changes: 48 additions & 0 deletions metrics/testdata/prometheus_metrics_whitelist_filtered
Original file line number Diff line number Diff line change
Expand Up @@ -433,3 +433,51 @@ container_memory_bandwidth_bytes{container_env_foo_env="prod",id="testcontainer"
# TYPE container_memory_bandwidth_local_bytes gauge
container_memory_bandwidth_local_bytes{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",node_id="0",zone_name="hello"} 2.390393e+06 1395066363000
container_memory_bandwidth_local_bytes{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",node_id="1",zone_name="hello"} 1.231233e+06 1395066363000
# HELP container_cpu_psi_avg10_ratio Ratio of time spent under cpu pressure over time window of 10 seconds
# TYPE container_cpu_psi_avg10_ratio gauge
container_cpu_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.4 1395066363000
container_cpu_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.1 1395066363000
# HELP container_cpu_psi_avg300_ratio Ratio of time spent under cpu pressure over time window of 300 seconds
# TYPE container_cpu_psi_avg300_ratio gauge
container_cpu_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.6 1395066363000
container_cpu_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.3 1395066363000
# HELP container_cpu_psi_avg60_ratio Ratio of time spent under cpu pressure over time window of 60 seconds
# TYPE container_cpu_psi_avg60_ratio gauge
container_cpu_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.5 1395066363000
container_cpu_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.2 1395066363000
# HELP container_cpu_psi_total_seconds Total time spent under cpu pressure in seconds.
# TYPE container_cpu_psi_total_seconds counter
container_cpu_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.0002 1395066363000
container_cpu_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.0001 1395066363000
# HELP container_io_psi_avg10_ratio Ratio of time spent under io pressure over time window of 10 seconds
# TYPE container_io_psi_avg10_ratio gauge
container_io_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.14 1395066363000
container_io_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.11 1395066363000
# HELP container_io_psi_avg300_ratio Ratio of time spent under io pressure over time window of 300 seconds
# TYPE container_io_psi_avg300_ratio gauge
container_io_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.16 1395066363000
container_io_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.13 1395066363000
# HELP container_io_psi_avg60_ratio Ratio of time spent under io pressure over time window of 60 seconds
# TYPE container_io_psi_avg60_ratio gauge
container_io_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.15 1395066363000
container_io_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.12 1395066363000
# HELP container_io_psi_total_seconds Total time spent under io pressure in seconds.
# TYPE container_io_psi_total_seconds counter
container_io_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.002222 1395066363000
container_io_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.001111 1395066363000
# HELP container_memory_psi_avg10_ratio Ratio of time spent under memory pressure over time window of 10 seconds
# TYPE container_memory_psi_avg10_ratio gauge
container_memory_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.04 1395066363000
container_memory_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.01 1395066363000
# HELP container_memory_psi_avg300_ratio Ratio of time spent under memory pressure over time window of 300 seconds
# TYPE container_memory_psi_avg300_ratio gauge
container_memory_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.06 1395066363000
container_memory_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.03 1395066363000
# HELP container_memory_psi_avg60_ratio Ratio of time spent under memory pressure over time window of 60 seconds
# TYPE container_memory_psi_avg60_ratio gauge
container_memory_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.05 1395066363000
container_memory_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.02 1395066363000
# HELP container_memory_psi_total_seconds Total container time spent under memory pressure in seconds.
# TYPE container_memory_psi_total_seconds counter
container_memory_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.002 1395066363000
container_memory_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.001 1395066363000

0 comments on commit 5a27e49

Please sign in to comment.