From 4aeebc853c87e170eeafa16b58d72ae9051a168f Mon Sep 17 00:00:00 2001 From: Mikkel Oscar Lyderik Larsen Date: Wed, 15 Nov 2023 17:23:07 +0100 Subject: [PATCH 1/2] Add support for scaling based on Nakadi Signed-off-by: Mikkel Oscar Lyderik Larsen --- README.md | 86 ++++++++++++++++++ pkg/collector/nakadi_collector.go | 120 +++++++++++++++++++++++++ pkg/nakadi/nakadi.go | 124 ++++++++++++++++++++++++++ pkg/nakadi/nakadi_test.go | 141 ++++++++++++++++++++++++++++++ pkg/server/start.go | 33 ++++++- 5 files changed, 503 insertions(+), 1 deletion(-) create mode 100644 pkg/collector/nakadi_collector.go create mode 100644 pkg/nakadi/nakadi.go create mode 100644 pkg/nakadi/nakadi_test.go diff --git a/README.md b/README.md index 3f3552a4..72e2b9d8 100644 --- a/README.md +++ b/README.md @@ -702,6 +702,92 @@ you need to define a `key` or other `tag` with a "star" query syntax like metric label definitions. If both annotations and corresponding label is defined, then the annotation takes precedence. + +## Nakadi collector + +The Nakadi collector allows scaling based on [Nakadi](https://nakadi.io/) +`consumer_lag_seconds` or `unconsumed_events`. 
+ +### Supported metrics + +| Metric Type | Description | Type | K8s Versions | +| ------------ | ------- | -- | -- | +| `unconsumed-events` | Scale based on number of unconsumed events for a Nakadi subscription | External | `>=1.24` | +| `consumer-lag-seconds` | Scale based on number of max consumer lag seconds for a Nakadi subscription | External | `>=1.24` | + +```yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: myapp-hpa + annotations: + # metric-config.../ + metric-config.external.my-nakadi-consumer.nakadi/interval: "60s" # optional +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: custom-metrics-consumer + minReplicas: 0 + maxReplicas: 8 # should match number of partitions for the event type + metrics: + - type: External + external: + metric: + name: my-nakadi-consumer + selector: + matchLabels: + type: nakadi + subscription-id: "708095f6-cece-4d02-840e-ee488d710b29" + metric-type: "consumer-lag-seconds|unconsumed-events" + target: + # value is compatible with the consumer-lag-seconds metric type. + # It describes the amount of consumer lag in seconds before scaling + # additionally up. + # if an event-type has multiple partitions the value of + # consumer-lag-seconds is the max of all the partitions. + value: "600" # 10m + # averageValue is compatible with unconsumed-events metric type. + # This means for every 30 unconsumed events a pod is scaled up. + # unconsumed-events is the sum of of unconsumed_events over all + # partitions. + averageValue: "30" + type: AverageValue +``` + +The `subscription-id` is the ID of the relevant consumer subscription. The +`metric-type` indicates whether to scale on `consumer-lag-seconds` or +`unconsumed-events` as outlined below. + +`unconsumed-events` - is the total number of unconsumed events over all +partitions. When using this `metric-type` you should also use the target +`averageValue` which indicates the number of events which can be handled per +pod. 
To best estimate the number of events per pod, you need to understand the +average time for processing an event as well as the rate of events. + +*Example*: You have an event type producing 100 events per second between 00:00 +and 08:00. Between 08:01 and 23:59 it produces 400 events per second. +Let's assume that on average a single pod can consume 100 events per second, +then we can define 100 as `averageValue` and the HPA would scale to 1 between +00:00 and 08:00, and scale to 4 between 08:01 and 23:59. If for some +reason there is a short spike of 800 events per second, then it would scale to 8 pods +to process those events until the rate goes down again. + +`consumer-lag-seconds` - describes the age of the oldest unconsumed event for +a subscription. If the event type has multiple partitions the lag is defined as +the max age over all partitions. When using this `metric-type` you should use +the target `value` to indicate the max lag (in seconds) before the HPA should +scale. + +*Example*: You have a subscription with a defined SLO of "99.99% of events are +consumed within 30 min.". In this case you can define a target `value` of e.g. +20 min. (1200s) (to include a safety buffer) such that the HPA only scales up +from 1 to 2 if the target of 20 min. is breached and it needs to work faster +with more consumers. +For this case you should also account for the average time for processing an +event when defining the target. + + ## HTTP Collector The http collector allows collecting metrics from an external endpoint specified in the HPA. 
diff --git a/pkg/collector/nakadi_collector.go b/pkg/collector/nakadi_collector.go new file mode 100644 index 00000000..39d87ef7 --- /dev/null +++ b/pkg/collector/nakadi_collector.go @@ -0,0 +1,120 @@ +package collector + +import ( + "context" + "fmt" + "time" + + "github.com/zalando-incubator/kube-metrics-adapter/pkg/nakadi" + autoscalingv2 "k8s.io/api/autoscaling/v2" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/metrics/pkg/apis/external_metrics" +) + +const ( + // NakadiMetricType defines the metric type for metrics based on Nakadi + // subscriptions. + NakadiMetricType = "nakadi" + nakadiSubscriptionIDKey = "subscription-id" + nakadiMetricTypeKey = "metric-type" + nakadiMetricTypeConsumerLagSeconds = "consumer-lag-seconds" + nakadiMetricTypeUnconsumedEvents = "unconsumed-events" +) + +// NakadiCollectorPlugin defines a plugin for creating collectors that can get +// unconsumed events from Nakadi. +type NakadiCollectorPlugin struct { + nakadi nakadi.Nakadi +} + +// NewNakadiCollectorPlugin initializes a new NakadiCollectorPlugin. +func NewNakadiCollectorPlugin(nakadi nakadi.Nakadi) (*NakadiCollectorPlugin, error) { + return &NakadiCollectorPlugin{ + nakadi: nakadi, + }, nil +} + +// NewCollector initializes a new Nakadi collector from the specified HPA. +func (c *NakadiCollectorPlugin) NewCollector(hpa *autoscalingv2.HorizontalPodAutoscaler, config *MetricConfig, interval time.Duration) (Collector, error) { + return NewNakadiCollector(c.nakadi, hpa, config, interval) +} + +// NakadiCollector defines a collector that is able to collect metrics from +// Nakadi. +type NakadiCollector struct { + nakadi nakadi.Nakadi + interval time.Duration + subscriptionID string + nakadiMetricType string + metric autoscalingv2.MetricIdentifier + metricType autoscalingv2.MetricSourceType + namespace string +} + +// NewNakadiCollector initializes a new NakadiCollector. 
+func NewNakadiCollector(nakadi nakadi.Nakadi, hpa *autoscalingv2.HorizontalPodAutoscaler, config *MetricConfig, interval time.Duration) (*NakadiCollector, error) { + if config.Metric.Selector == nil { + return nil, fmt.Errorf("selector for nakadi is not specified") + } + + subscriptionID, ok := config.Config[nakadiSubscriptionIDKey] + if !ok { + return nil, fmt.Errorf("subscription-id not specified on metric") + } + + metricType, ok := config.Config[nakadiMetricTypeKey] + if !ok { + return nil, fmt.Errorf("metric-type not specified on metric") + } + + if metricType != nakadiMetricTypeConsumerLagSeconds && metricType != nakadiMetricTypeUnconsumedEvents { + return nil, fmt.Errorf("metric-type must be either '%s' or '%s', was '%s'", nakadiMetricTypeConsumerLagSeconds, nakadiMetricTypeUnconsumedEvents, metricType) + } + + return &NakadiCollector{ + nakadi: nakadi, + interval: interval, + subscriptionID: subscriptionID, + nakadiMetricType: metricType, + metric: config.Metric, + metricType: config.Type, + namespace: hpa.Namespace, + }, nil +} + +// GetMetrics returns a list of collected metrics for the Nakadi subscription ID. 
+func (c *NakadiCollector) GetMetrics() ([]CollectedMetric, error) { + var value int64 + var err error + switch c.nakadiMetricType { + case nakadiMetricTypeConsumerLagSeconds: + value, err = c.nakadi.ConsumerLagSeconds(context.TODO(), c.subscriptionID) + if err != nil { + return nil, err + } + case nakadiMetricTypeUnconsumedEvents: + value, err = c.nakadi.UnconsumedEvents(context.TODO(), c.subscriptionID) + if err != nil { + return nil, err + } + } + + metricValue := CollectedMetric{ + Namespace: c.namespace, + Type: c.metricType, + External: external_metrics.ExternalMetricValue{ + MetricName: c.metric.Name, + MetricLabels: c.metric.Selector.MatchLabels, + Timestamp: metav1.Now(), + Value: *resource.NewQuantity(value, resource.DecimalSI), + }, + } + + return []CollectedMetric{metricValue}, nil +} + +// Interval returns the interval at which the collector should run. +func (c *NakadiCollector) Interval() time.Duration { + return c.interval +} diff --git a/pkg/nakadi/nakadi.go b/pkg/nakadi/nakadi.go new file mode 100644 index 00000000..a4e97282 --- /dev/null +++ b/pkg/nakadi/nakadi.go @@ -0,0 +1,124 @@ +package nakadi + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "io" + "net/http" + "net/url" +) + +// Nakadi defines an interface for talking to the Nakadi API. +type Nakadi interface { + ConsumerLagSeconds(ctx context.Context, subscriptionID string) (int64, error) + UnconsumedEvents(ctx context.Context, subscriptionID string) (int64, error) +} + +// Client defines client for interfacing with the Nakadi API. +type Client struct { + nakadiEndpoint string + http *http.Client +} + +// NewNakadiClient initializes a new Nakadi Client. 
+func NewNakadiClient(nakadiEndpoint string, client *http.Client) *Client { + return &Client{ + nakadiEndpoint: nakadiEndpoint, + http: client, + } +} + +func (c *Client) ConsumerLagSeconds(ctx context.Context, subscriptionID string) (int64, error) { + stats, err := c.stats(ctx, subscriptionID) + if err != nil { + return 0, err + } + + var maxConsumerLagSeconds int64 + for _, eventType := range stats { + for _, partition := range eventType.Partitions { + maxConsumerLagSeconds = max(maxConsumerLagSeconds, partition.ConsumerLagSeconds) + } + } + + return maxConsumerLagSeconds, nil +} + +func (c *Client) UnconsumedEvents(ctx context.Context, subscriptionID string) (int64, error) { + stats, err := c.stats(ctx, subscriptionID) + if err != nil { + return 0, err + } + + var unconsumedEvents int64 + for _, eventType := range stats { + for _, partition := range eventType.Partitions { + unconsumedEvents += partition.UnconsumedEvents + } + } + + return unconsumedEvents, nil +} + +type statsResp struct { + Items []statsEventType `json:"items"` +} + +type statsEventType struct { + EventType string `json:"event_type"` + Partitions []statsPartition `json:"partitions"` +} + +type statsPartition struct { + Partiton string `json:"partition"` + State string `json:"state"` + UnconsumedEvents int64 `json:"unconsumed_events"` + ConsumerLagSeconds int64 `json:"consumer_lag_seconds"` + StreamID string `json:"stream_id"` + AssignmentType string `json:"assignment_type"` +} + +// stats returns the Nakadi stats for a given subscription ID. 
+// +// https://nakadi.io/manual.html#/subscriptions/subscription_id/stats_get +func (c *Client) stats(ctx context.Context, subscriptionID string) ([]statsEventType, error) { + endpoint, err := url.Parse(c.nakadiEndpoint) + if err != nil { + return nil, err + } + + endpoint.Path = fmt.Sprintf("/subscriptions/%s/stats", subscriptionID) + + q := endpoint.Query() + q.Set("show_time_lag", "true") + endpoint.RawQuery = q.Encode() + + resp, err := c.http.Get(endpoint.String()) + if err != nil { + return nil, err + } + defer resp.Body.Close() + + d, err := io.ReadAll(resp.Body) + if err != nil { + return nil, err + } + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("[nakadi stats] unexpected response code: %d (%s)", resp.StatusCode, string(d)) + } + + var result statsResp + err = json.Unmarshal(d, &result) + if err != nil { + return nil, err + } + + if len(result.Items) == 0 { + return nil, errors.New("expected at least 1 event-type, 0 returned") + } + + return result.Items, nil +} diff --git a/pkg/nakadi/nakadi_test.go b/pkg/nakadi/nakadi_test.go new file mode 100644 index 00000000..2a14e0a8 --- /dev/null +++ b/pkg/nakadi/nakadi_test.go @@ -0,0 +1,141 @@ +package nakadi + +import ( + "context" + "errors" + "net/http" + "net/http/httptest" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestQuery(tt *testing.T) { + client := &http.Client{} + for _, ti := range []struct { + msg string + status int + responseBody string + err error + unconsumedEvents int64 + consumerLagSeconds int64 + }{ + { + msg: "test getting a single event-type", + status: http.StatusOK, + responseBody: `{ + "items": [ + { + "event_type": "example-event", + "partitions": [ + { + "partition": "0", + "state": "assigned", + "unconsumed_events": 4, + "consumer_lag_seconds": 2, + "stream_id": "example-id", + "assignment_type": "auto" + }, + { + "partition": "0", + "state": "assigned", + "unconsumed_events": 5, + "consumer_lag_seconds": 1, + "stream_id": "example-id", + 
"assignment_type": "auto" + } + ] + } + ] + }`, + unconsumedEvents: 9, + consumerLagSeconds: 2, + }, + { + msg: "test getting multiple event-types", + status: http.StatusOK, + responseBody: `{ + "items": [ + { + "event_type": "example-event", + "partitions": [ + { + "partition": "0", + "state": "assigned", + "unconsumed_events": 4, + "consumer_lag_seconds": 2, + "stream_id": "example-id", + "assignment_type": "auto" + }, + { + "partition": "0", + "state": "assigned", + "unconsumed_events": 5, + "consumer_lag_seconds": 1, + "stream_id": "example-id", + "assignment_type": "auto" + } + ] + }, + { + "event_type": "example-event-2", + "partitions": [ + { + "partition": "0", + "state": "assigned", + "unconsumed_events": 4, + "consumer_lag_seconds": 6, + "stream_id": "example-id", + "assignment_type": "auto" + }, + { + "partition": "0", + "state": "assigned", + "unconsumed_events": 5, + "consumer_lag_seconds": 1, + "stream_id": "example-id", + "assignment_type": "auto" + } + ] + } + ] + }`, + unconsumedEvents: 18, + consumerLagSeconds: 6, + }, + { + msg: "test call with invalid response", + status: http.StatusInternalServerError, + responseBody: `{"error": 500}`, + err: errors.New("[nakadi stats] unexpected response code: 500 ({\"error\": 500})"), + }, + { + msg: "test getting back a single data point", + status: http.StatusOK, + responseBody: `{ + "items": [] + }`, + err: errors.New("expected at least 1 event-type, 0 returned"), + }, + } { + tt.Run(ti.msg, func(t *testing.T) { + ts := httptest.NewServer(http.HandlerFunc( + func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(ti.status) + _, err := w.Write([]byte(ti.responseBody)) + assert.NoError(t, err) + }), + ) + defer ts.Close() + + nakadiClient := NewNakadiClient(ts.URL, client) + consumerLagSeconds, err := nakadiClient.ConsumerLagSeconds(context.Background(), "id") + assert.Equal(t, ti.err, err) + assert.Equal(t, ti.consumerLagSeconds, consumerLagSeconds) + unconsumedEvents, err := 
nakadiClient.UnconsumedEvents(context.Background(), "id") + assert.Equal(t, ti.err, err) + assert.Equal(t, ti.unconsumedEvents, unconsumedEvents) + }) + } + +} diff --git a/pkg/server/start.go b/pkg/server/start.go index 3cf62b34..15dc7d42 100644 --- a/pkg/server/start.go +++ b/pkg/server/start.go @@ -35,6 +35,7 @@ import ( "github.com/zalando-incubator/kube-metrics-adapter/pkg/client/clientset/versioned" "github.com/zalando-incubator/kube-metrics-adapter/pkg/collector" "github.com/zalando-incubator/kube-metrics-adapter/pkg/controller/scheduledscaling" + "github.com/zalando-incubator/kube-metrics-adapter/pkg/nakadi" "github.com/zalando-incubator/kube-metrics-adapter/pkg/provider" "github.com/zalando-incubator/kube-metrics-adapter/pkg/zmon" "golang.org/x/oauth2" @@ -64,6 +65,7 @@ func NewCommandStartAdapterServer(stopCh <-chan struct{}) *cobra.Command { EnableExternalMetricsAPI: true, MetricsAddress: ":7979", ZMONTokenName: "zmon", + NakadiTokenName: "nakadi", CredentialsDir: "/meta/credentials", ExternalRPSMetricName: "skipper_serve_host_duration_seconds_count", } @@ -110,8 +112,12 @@ func NewCommandStartAdapterServer(stopCh <-chan struct{}) *cobra.Command { "url of ZMON KariosDB endpoint to query for ZMON checks") flags.StringVar(&o.ZMONTokenName, "zmon-token-name", o.ZMONTokenName, ""+ "name of the token used to query ZMON") + flags.StringVar(&o.NakadiEndpoint, "nakadi-endpoint", o.NakadiEndpoint, ""+ + "url of Nakadi endpoint to for nakadi subscription stats") + flags.StringVar(&o.NakadiTokenName, "nakadi-token-name", o.NakadiTokenName, ""+ + "name of the token used to call nakadi subscription API") flags.StringVar(&o.Token, "token", o.Token, ""+ - "static oauth2 token to use when calling external services like ZMON") + "static oauth2 token to use when calling external services like ZMON and Nakadi") flags.StringVar(&o.CredentialsDir, "credentials-dir", o.CredentialsDir, ""+ "path to the credentials dir where tokens are stored") 
flags.BoolVar(&o.SkipperIngressMetrics, "skipper-ingress-metrics", o.SkipperIngressMetrics, ""+ @@ -274,6 +280,27 @@ func (o AdapterServerOptions) RunCustomMetricsAdapterServer(stopCh <-chan struct collectorFactory.RegisterExternalCollector([]string{collector.ZMONMetricType, collector.ZMONCheckMetricLegacy}, zmonPlugin) } + // enable Nakadi based metrics + if o.NakadiEndpoint != "" { + var tokenSource oauth2.TokenSource + if o.Token != "" { + tokenSource = oauth2.StaticTokenSource(&oauth2.Token{AccessToken: o.Token}) + } else { + tokenSource = platformiam.NewTokenSource(o.NakadiTokenName, o.CredentialsDir) + } + + httpClient := newOauth2HTTPClient(ctx, tokenSource) + + nakadiClient := nakadi.NewNakadiClient(o.NakadiEndpoint, httpClient) + + nakadiPlugin, err := collector.NewNakadiCollectorPlugin(nakadiClient) + if err != nil { + return fmt.Errorf("failed to initialize Nakadi collector plugin: %v", err) + } + + collectorFactory.RegisterExternalCollector([]string{collector.NakadiMetricType}, nakadiPlugin) + } + awsSessions := make(map[string]*session.Session, len(o.AWSRegions)) for _, region := range o.AWSRegions { awsSessions[region], err = session.NewSessionWithOptions(session.Options{ @@ -427,6 +454,10 @@ type AdapterServerOptions struct { ZMONKariosDBEndpoint string // ZMONTokenName is the name of the token used to query ZMON ZMONTokenName string + // NakadiEndpoint enables Nakadi metrics from the specified endpoint + NakadiEndpoint string + // NakadiTokenName is the name of the token used to call Nakadi + NakadiTokenName string // Token is an oauth2 token used to authenticate with services like // ZMON. 
Token string From 43dd2e97413b04bfdc44c9a3f3fc568be7580e77 Mon Sep 17 00:00:00 2001 From: Mikkel Oscar Lyderik Larsen Date: Thu, 7 Dec 2023 10:30:12 +0100 Subject: [PATCH 2/2] Update wording in README Co-authored-by: Andrey Signed-off-by: Mikkel Oscar Lyderik Larsen --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 72e2b9d8..45331643 100644 --- a/README.md +++ b/README.md @@ -706,14 +706,14 @@ defined, then the annotation takes precedence. ## Nakadi collector The Nakadi collector allows scaling based on [Nakadi](https://nakadi.io/) -`consumer_lag_seconds` or `unconsumed_events`. + Subscription API stats metrics `consumer_lag_seconds` or `unconsumed_events`. ### Supported metrics | Metric Type | Description | Type | K8s Versions | -| ------------ | ------- | -- | -- | -| `unconsumed-events` | Scale based on number of unconsumed events for a Nakadi subscription | External | `>=1.24` | -| `consumer-lag-seconds` | Scale based on number of max consumer lag seconds for a Nakadi subscription | External | `>=1.24` | +|------------------------|-----------------------------------------------------------------------------|----------|--------------| +| `unconsumed-events` | Scale based on number of unconsumed events for a Nakadi Subscription | External | `>=1.24` | +| `consumer-lag-seconds` | Scale based on number of max consumer lag seconds for a Nakadi Subscription | External | `>=1.24` | ```yaml apiVersion: autoscaling/v2 @@ -748,14 +748,14 @@ spec: # consumer-lag-seconds is the max of all the partitions. value: "600" # 10m # averageValue is compatible with unconsumed-events metric type. 
- # This means for every 30 unconsumed events a pod is scaled up. + # This means for every 30 unconsumed events a pod is added. - # unconsumed-events is the sum of of unconsumed_events over all + # unconsumed-events is the sum of unconsumed_events over all # partitions. averageValue: "30" type: AverageValue ``` -The `subscription-id` is the ID of the relevant consumer subscription. The +The `subscription-id` is the Subscription ID of the relevant consumer. The `metric-type` indicates whether to scale on `consumer-lag-seconds` or `unconsumed-events` as outlined below.