fix(metric): update committed epoch metrics (#19959) (#19961)

Co-authored-by: zwang28 <[email protected]>
risingwavelabs · Dec 31, 2024 · 336bbea
1 parent d2640de
commit 336bbea
Show file tree

Hide file tree

Showing 8 changed files with 35 additions and 198 deletions.
diff --git a/docker/dashboards/risingwave-dev-dashboard.json b/docker/dashboards/risingwave-dev-dashboard.json
diff --git a/docker/dashboards/risingwave-user-dashboard.json b/docker/dashboards/risingwave-user-dashboard.json
diff --git a/grafana/risingwave-dev-dashboard.dashboard.py b/grafana/risingwave-dev-dashboard.dashboard.py
@@ -3327,9 +3327,9 @@ def section_hummock_manager(outer_panels):
                             f"{metric('storage_max_committed_epoch')}",
                             "max committed epoch",
                         ),
-                        panels.target(f"{metric('storage_safe_epoch')}", "safe epoch"),
                         panels.target(
-                            f"{metric('storage_min_pinned_epoch')}", "min pinned epoch"
+                            f"{metric('storage_min_committed_epoch')}",
+                            "min committed epoch",
                         ),
                     ],
                 ),
@@ -3676,182 +3676,6 @@ def section_grpc_meta_stream_manager(outer_panels):
         ),
     ]
 
-
-def section_grpc_meta_hummock_manager(outer_panels):
-    panels = outer_panels.sub_panel()
-    return [
-        outer_panels.row_collapsed(
-            "gRPC Meta: Hummock Manager",
-            [
-                grpc_metrics_target(
-                    panels,
-                    "UnpinVersionBefore",
-                    "path='/meta.HummockManagerService/UnpinVersionBefore'",
-                ),
-                grpc_metrics_target(
-                    panels,
-                    "ReportCompactionTasks",
-                    "path='/meta.HummockManagerService/ReportCompactionTasks'",
-                ),
-                grpc_metrics_target(
-                    panels,
-                    "GetNewSstIds",
-                    "path='/meta.HummockManagerService/GetNewSstIds'",
-                ),
-            ],
-        ),
-    ]
-
-
-def section_grpc_hummock_meta_client(outer_panels):
-    panels = outer_panels.sub_panel()
-    return [
-        outer_panels.row_collapsed(
-            "gRPC: Hummock Meta Client",
-            [
-                panels.timeseries_count(
-                    "compaction_count",
-                    "",
-                    [
-                        panels.target(
-                            f"sum(irate({metric('state_store_report_compaction_task_counts')}[$__rate_interval])) by({COMPONENT_LABEL}, {NODE_LABEL})",
-                            "report_compaction_task_counts - {{%s}}" % NODE_LABEL,
-                        ),
-                    ],
-                ),
-                panels.timeseries_latency(
-                    "version_latency",
-                    "",
-                    [
-                        panels.target(
-                            f"histogram_quantile(0.5, sum(irate({metric('state_store_unpin_version_before_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
-                            "unpin_version_before_latency_p50 - {{%s}}" % NODE_LABEL,
-                        ),
-                        panels.target(
-                            f"histogram_quantile(0.99, sum(irate({metric('state_store_unpin_version_before_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
-                            "unpin_version_before_latency_p99 - {{%s}}" % NODE_LABEL,
-                        ),
-                        panels.target(
-                            f"sum(irate({metric('state_store_unpin_version_before_latency_sum')}[$__rate_interval])) / sum(irate({metric('state_store_unpin_version_before_latency_count')}[$__rate_interval])) > 0",
-                            "unpin_version_before_latency_avg",
-                        ),
-                        panels.target(
-                            f"histogram_quantile(0.90, sum(irate({metric('state_store_unpin_version_before_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
-                            "unpin_version_before_latency_p90 - {{%s}}" % NODE_LABEL,
-                        ),
-                    ],
-                ),
-                panels.timeseries_latency(
-                    "snapshot_latency",
-                    "",
-                    [
-                        panels.target(
-                            f"histogram_quantile(0.5, sum(irate({metric('state_store_pin_snapshot_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
-                            "pin_snapshot_latency_p50 - {{%s}}" % NODE_LABEL,
-                        ),
-                        panels.target(
-                            f"histogram_quantile(0.99, sum(irate({metric('state_store_pin_snapshot_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
-                            "pin_snapshot_latency_p99 - {{%s}}" % NODE_LABEL,
-                        ),
-                        panels.target(
-                            f"histogram_quantile(0.9, sum(irate({metric('state_store_pin_snapshot_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
-                            "pin_snapshot_latencyp90 - {{%s}}" % NODE_LABEL,
-                        ),
-                        panels.target(
-                            f"sum(irate({metric('state_store_pin_snapshot_latency_sum')}[$__rate_interval])) / sum(irate(state_store_pin_snapshot_latency_count[$__rate_interval])) > 0",
-                            "pin_snapshot_latency_avg",
-                        ),
-                        panels.target(
-                            f"histogram_quantile(0.5, sum(irate({metric('state_store_unpin_version_snapshot_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
-                            "unpin_snapshot_latency_p50 - {{%s}}" % NODE_LABEL,
-                        ),
-                        panels.target(
-                            f"histogram_quantile(0.99, sum(irate({metric('state_store_unpin_version_snapshot_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
-                            "unpin_snapshot_latency_p99 - {{%s}}" % NODE_LABEL,
-                        ),
-                        panels.target(
-                            f"sum(irate({metric('state_store_unpin_snapshot_latency_sum')}[$__rate_interval])) / sum(irate(state_store_unpin_snapshot_latency_count[$__rate_interval])) > 0",
-                            "unpin_snapshot_latency_avg",
-                        ),
-                        panels.target(
-                            f"histogram_quantile(0.90, sum(irate({metric('state_store_unpin_snapshot_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
-                            "unpin_snapshot_latency_p90 - {{%s}}" % NODE_LABEL,
-                        ),
-                    ],
-                ),
-                panels.timeseries_count(
-                    "snapshot_count",
-                    "",
-                    [
-                        panels.target(
-                            f"sum(irate({metric('state_store_pin_snapshot_counts')}[$__rate_interval])) by({COMPONENT_LABEL}, {NODE_LABEL})",
-                            "pin_snapshot_counts - {{%s}}" % NODE_LABEL,
-                        ),
-                        panels.target(
-                            f"sum(irate({metric('state_store_unpin_snapshot_counts')}[$__rate_interval])) by({COMPONENT_LABEL}, {NODE_LABEL})",
-                            "unpin_snapshot_counts - {{%s}}" % NODE_LABEL,
-                        ),
-                    ],
-                ),
-                panels.timeseries_latency(
-                    "table_latency",
-                    "",
-                    [
-                        panels.target(
-                            f"histogram_quantile(0.5, sum(irate({metric('state_store_get_new_sst_ids_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
-                            "get_new_sst_ids_latency_latency_p50 - {{%s}}" % NODE_LABEL,
-                        ),
-                        panels.target(
-                            f"histogram_quantile(0.99, sum(irate({metric('state_store_get_new_sst_ids_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
-                            "get_new_sst_ids_latency_latency_p99 - {{%s}}" % NODE_LABEL,
-                        ),
-                        panels.target(
-                            f"sum(irate({metric('state_store_get_new_sst_ids_latency_sum')}[$__rate_interval])) / sum(irate({metric('state_store_get_new_sst_ids_latency_count')}[$__rate_interval])) > 0",
-                            "get_new_sst_ids_latency_latency_avg",
-                        ),
-                        panels.target(
-                            f"histogram_quantile(0.90, sum(irate({metric('state_store_get_new_sst_ids_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
-                            "get_new_sst_ids_latency_latency_p90 - {{%s}}" % NODE_LABEL,
-                        ),
-                    ],
-                ),
-                panels.timeseries_count(
-                    "table_count",
-                    "",
-                    [
-                        panels.target(
-                            f"sum(irate({metric('state_store_get_new_sst_ids_latency_counts')}[$__rate_interval]))by({COMPONENT_LABEL}, {NODE_LABEL})",
-                            "get_new_sst_ids_latency_counts - {{%s}}" % NODE_LABEL,
-                        ),
-                    ],
-                ),
-                panels.timeseries_latency(
-                    "compaction_latency",
-                    "",
-                    [
-                        panels.target(
-                            f"histogram_quantile(0.5, sum(irate({metric('state_store_report_compaction_task_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
-                            "report_compaction_task_latency_p50 - {{%s}}" % NODE_LABEL,
-                        ),
-                        panels.target(
-                            f"histogram_quantile(0.99, sum(irate({metric('state_store_report_compaction_task_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
-                            "report_compaction_task_latency_p99 - {{%s}}" % NODE_LABEL,
-                        ),
-                        panels.target(
-                            f"sum(irate({metric('state_store_report_compaction_task_latency_sum')}[$__rate_interval])) / sum(irate(state_store_report_compaction_task_latency_count[$__rate_interval])) > 0",
-                            "report_compaction_task_latency_avg",
-                        ),
-                        panels.target(
-                            f"histogram_quantile(0.90, sum(irate({metric('state_store_report_compaction_task_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
-                            "report_compaction_task_latency_p90 - {{%s}}" % NODE_LABEL,
-                        ),
-                    ],
-                ),
-            ],
-        ),
-    ]
-
-
 def section_kafka_metrics(outer_panels):
     panels = outer_panels.sub_panel()
     return [
@@ -5009,8 +4833,6 @@ def section_udf(outer_panels):
         *section_grpc_meta_catalog_service(panels),
         *section_grpc_meta_cluster_service(panels),
         *section_grpc_meta_stream_manager(panels),
-        *section_grpc_meta_hummock_manager(panels),
-        *section_grpc_hummock_meta_client(panels),
         *section_frontend(panels),
         *section_memory_manager(panels),
         *section_sink_metrics(panels),

diff --git a/grafana/risingwave-dev-dashboard.json b/grafana/risingwave-dev-dashboard.json
diff --git a/grafana/risingwave-user-dashboard.json b/grafana/risingwave-user-dashboard.json
diff --git a/src/meta/src/hummock/manager/commit_epoch.rs b/src/meta/src/hummock/manager/commit_epoch.rs
@@ -41,7 +41,7 @@ use crate::hummock::manager::transaction::{
 };
 use crate::hummock::manager::versioning::Versioning;
 use crate::hummock::metrics_utils::{
-    get_or_create_local_table_stat, trigger_local_table_stat, trigger_sst_stat,
+    get_or_create_local_table_stat, trigger_epoch_stat, trigger_local_table_stat, trigger_sst_stat,
 };
 use crate::hummock::model::CompactionGroup;
 use crate::hummock::sequence::{next_compaction_group_id, next_sstable_object_id};
@@ -293,6 +293,7 @@ impl HummockManager {
                 *compaction_group_id,
             );
         }
+        trigger_epoch_stat(&self.metrics, &versioning.current_version);
 
         drop(versioning_guard);
 

diff --git a/src/meta/src/hummock/metrics_utils.rs b/src/meta/src/hummock/metrics_utils.rs
@@ -311,6 +311,27 @@ pub fn trigger_sst_stat(
     }
 }
 
+pub fn trigger_epoch_stat(metrics: &MetaMetrics, version: &HummockVersion) {
+    metrics.max_committed_epoch.set(
+        version
+            .state_table_info
+            .info()
+            .values()
+            .map(|i| i.committed_epoch)
+            .max()
+            .unwrap_or(0) as _,
+    );
+    metrics.min_committed_epoch.set(
+        version
+            .state_table_info
+            .info()
+            .values()
+            .map(|i| i.committed_epoch)
+            .min()
+            .unwrap_or(0) as _,
+    );
+}
+
 pub fn remove_compaction_group_in_sst_stat(
     metrics: &MetaMetrics,
     compaction_group_id: CompactionGroupId,

diff --git a/src/meta/src/rpc/metrics.rs b/src/meta/src/rpc/metrics.rs
@@ -95,10 +95,8 @@ pub struct MetaMetrics {
     // ********************************** Hummock ************************************
     /// Max committed epoch
     pub max_committed_epoch: IntGauge,
-    /// The smallest epoch that has not been `GCed`.
-    pub safe_epoch: IntGauge,
-    /// The smallest epoch that is being pinned.
-    pub min_pinned_epoch: IntGauge,
+    /// Min committed epoch
+    pub min_committed_epoch: IntGauge,
     /// The number of SSTs in each level
     pub level_sst_num: IntGaugeVec,
     /// The number of SSTs to be merged to next level in each level
@@ -309,13 +307,9 @@ impl MetaMetrics {
         )
         .unwrap();
 
-        let safe_epoch =
-            register_int_gauge_with_registry!("storage_safe_epoch", "safe epoch", registry)
-                .unwrap();
-
-        let min_pinned_epoch = register_int_gauge_with_registry!(
-            "storage_min_pinned_epoch",
-            "min pinned epoch",
+        let min_committed_epoch = register_int_gauge_with_registry!(
+            "storage_min_committed_epoch",
+            "min committed epoch",
             registry
         )
         .unwrap();
@@ -794,8 +788,7 @@ impl MetaMetrics {
             recovery_latency,
 
             max_committed_epoch,
-            safe_epoch,
-            min_pinned_epoch,
+            min_committed_epoch,
             level_sst_num,
             level_compact_cnt,
             compact_frequency,