Skip to content

Commit

Permalink
fix(metric): update committed epoch metrics (#19959) (#19961)
Browse files Browse the repository at this point in the history
Co-authored-by: zwang28 <[email protected]>
  • Loading branch information
github-actions[bot] and zwang28 authored Dec 31, 2024
1 parent d2640de commit 336bbea
Show file tree
Hide file tree
Showing 8 changed files with 35 additions and 198 deletions.
2 changes: 1 addition & 1 deletion docker/dashboards/risingwave-dev-dashboard.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docker/dashboards/risingwave-user-dashboard.json

Large diffs are not rendered by default.

182 changes: 2 additions & 180 deletions grafana/risingwave-dev-dashboard.dashboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -3327,9 +3327,9 @@ def section_hummock_manager(outer_panels):
f"{metric('storage_max_committed_epoch')}",
"max committed epoch",
),
panels.target(f"{metric('storage_safe_epoch')}", "safe epoch"),
panels.target(
f"{metric('storage_min_pinned_epoch')}", "min pinned epoch"
f"{metric('storage_min_committed_epoch')}",
"min committed epoch",
),
],
),
Expand Down Expand Up @@ -3676,182 +3676,6 @@ def section_grpc_meta_stream_manager(outer_panels):
),
]


def section_grpc_meta_hummock_manager(outer_panels):
    """Build the collapsed "gRPC Meta: Hummock Manager" dashboard row.

    One panel is created through ``grpc_metrics_target`` for each
    ``HummockManagerService`` path listed below, in the listed order.

    Args:
        outer_panels: panel factory of the enclosing dashboard; supplies
            ``sub_panel()`` and ``row_collapsed()``.

    Returns:
        A single-element list holding the collapsed row.
    """
    panels = outer_panels.sub_panel()

    # (panel title, third argument handed to grpc_metrics_target) per RPC.
    rpc_targets = (
        (
            "UnpinVersionBefore",
            "path='/meta.HummockManagerService/UnpinVersionBefore'",
        ),
        (
            "ReportCompactionTasks",
            "path='/meta.HummockManagerService/ReportCompactionTasks'",
        ),
        (
            "GetNewSstIds",
            "path='/meta.HummockManagerService/GetNewSstIds'",
        ),
    )

    row_panels = [
        grpc_metrics_target(panels, title, path_filter)
        for title, path_filter in rpc_targets
    ]
    return [outer_panels.row_collapsed("gRPC Meta: Hummock Manager", row_panels)]


def section_grpc_hummock_meta_client(outer_panels):
    """Build the collapsed "gRPC: Hummock Meta Client" dashboard row.

    The row contains count and latency panels for the
    ``state_store_*`` series queried below: report-compaction-task,
    unpin-version-before, pin/unpin-snapshot and get-new-sst-ids.
    Latency panels plot p50/p90/p99 via ``histogram_quantile`` plus an
    average computed as ``rate(sum) / rate(count)``.

    Every series name is passed through ``metric(...)``, including the
    ``*_count`` denominators of the average queries, so numerator and
    denominator are built the same way.

    Args:
        outer_panels: panel factory of the enclosing dashboard; supplies
            ``sub_panel()`` and ``row_collapsed()``.

    Returns:
        A single-element list holding the collapsed row.
    """
    panels = outer_panels.sub_panel()
    return [
        outer_panels.row_collapsed(
            "gRPC: Hummock Meta Client",
            [
                panels.timeseries_count(
                    "compaction_count",
                    "",
                    [
                        panels.target(
                            f"sum(irate({metric('state_store_report_compaction_task_counts')}[$__rate_interval])) by({COMPONENT_LABEL}, {NODE_LABEL})",
                            "report_compaction_task_counts - {{%s}}" % NODE_LABEL,
                        ),
                    ],
                ),
                panels.timeseries_latency(
                    "version_latency",
                    "",
                    [
                        panels.target(
                            f"histogram_quantile(0.5, sum(irate({metric('state_store_unpin_version_before_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
                            "unpin_version_before_latency_p50 - {{%s}}" % NODE_LABEL,
                        ),
                        panels.target(
                            f"histogram_quantile(0.99, sum(irate({metric('state_store_unpin_version_before_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
                            "unpin_version_before_latency_p99 - {{%s}}" % NODE_LABEL,
                        ),
                        panels.target(
                            f"sum(irate({metric('state_store_unpin_version_before_latency_sum')}[$__rate_interval])) / sum(irate({metric('state_store_unpin_version_before_latency_count')}[$__rate_interval])) > 0",
                            "unpin_version_before_latency_avg",
                        ),
                        panels.target(
                            f"histogram_quantile(0.90, sum(irate({metric('state_store_unpin_version_before_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
                            "unpin_version_before_latency_p90 - {{%s}}" % NODE_LABEL,
                        ),
                    ],
                ),
                panels.timeseries_latency(
                    "snapshot_latency",
                    "",
                    [
                        panels.target(
                            f"histogram_quantile(0.5, sum(irate({metric('state_store_pin_snapshot_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
                            "pin_snapshot_latency_p50 - {{%s}}" % NODE_LABEL,
                        ),
                        panels.target(
                            f"histogram_quantile(0.99, sum(irate({metric('state_store_pin_snapshot_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
                            "pin_snapshot_latency_p99 - {{%s}}" % NODE_LABEL,
                        ),
                        panels.target(
                            f"histogram_quantile(0.9, sum(irate({metric('state_store_pin_snapshot_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
                            # Legend previously read "pin_snapshot_latencyp90".
                            "pin_snapshot_latency_p90 - {{%s}}" % NODE_LABEL,
                        ),
                        panels.target(
                            # The denominator goes through metric() just like the numerator.
                            f"sum(irate({metric('state_store_pin_snapshot_latency_sum')}[$__rate_interval])) / sum(irate({metric('state_store_pin_snapshot_latency_count')}[$__rate_interval])) > 0",
                            "pin_snapshot_latency_avg",
                        ),
                        # p50/p99 read the same histogram as the avg/p90 targets
                        # below (state_store_unpin_snapshot_latency_*); they
                        # previously named state_store_unpin_version_snapshot_bucket.
                        # NOTE(review): confirm against the exported metric name.
                        panels.target(
                            f"histogram_quantile(0.5, sum(irate({metric('state_store_unpin_snapshot_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
                            "unpin_snapshot_latency_p50 - {{%s}}" % NODE_LABEL,
                        ),
                        panels.target(
                            f"histogram_quantile(0.99, sum(irate({metric('state_store_unpin_snapshot_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
                            "unpin_snapshot_latency_p99 - {{%s}}" % NODE_LABEL,
                        ),
                        panels.target(
                            f"sum(irate({metric('state_store_unpin_snapshot_latency_sum')}[$__rate_interval])) / sum(irate({metric('state_store_unpin_snapshot_latency_count')}[$__rate_interval])) > 0",
                            "unpin_snapshot_latency_avg",
                        ),
                        panels.target(
                            f"histogram_quantile(0.90, sum(irate({metric('state_store_unpin_snapshot_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
                            "unpin_snapshot_latency_p90 - {{%s}}" % NODE_LABEL,
                        ),
                    ],
                ),
                panels.timeseries_count(
                    "snapshot_count",
                    "",
                    [
                        panels.target(
                            f"sum(irate({metric('state_store_pin_snapshot_counts')}[$__rate_interval])) by({COMPONENT_LABEL}, {NODE_LABEL})",
                            "pin_snapshot_counts - {{%s}}" % NODE_LABEL,
                        ),
                        panels.target(
                            f"sum(irate({metric('state_store_unpin_snapshot_counts')}[$__rate_interval])) by({COMPONENT_LABEL}, {NODE_LABEL})",
                            "unpin_snapshot_counts - {{%s}}" % NODE_LABEL,
                        ),
                    ],
                ),
                panels.timeseries_latency(
                    "table_latency",
                    "",
                    [
                        # Legends previously repeated the word "latency"
                        # ("get_new_sst_ids_latency_latency_*").
                        panels.target(
                            f"histogram_quantile(0.5, sum(irate({metric('state_store_get_new_sst_ids_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
                            "get_new_sst_ids_latency_p50 - {{%s}}" % NODE_LABEL,
                        ),
                        panels.target(
                            f"histogram_quantile(0.99, sum(irate({metric('state_store_get_new_sst_ids_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
                            "get_new_sst_ids_latency_p99 - {{%s}}" % NODE_LABEL,
                        ),
                        panels.target(
                            f"sum(irate({metric('state_store_get_new_sst_ids_latency_sum')}[$__rate_interval])) / sum(irate({metric('state_store_get_new_sst_ids_latency_count')}[$__rate_interval])) > 0",
                            "get_new_sst_ids_latency_avg",
                        ),
                        panels.target(
                            f"histogram_quantile(0.90, sum(irate({metric('state_store_get_new_sst_ids_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
                            "get_new_sst_ids_latency_p90 - {{%s}}" % NODE_LABEL,
                        ),
                    ],
                ),
                panels.timeseries_count(
                    "table_count",
                    "",
                    [
                        panels.target(
                            f"sum(irate({metric('state_store_get_new_sst_ids_latency_counts')}[$__rate_interval]))by({COMPONENT_LABEL}, {NODE_LABEL})",
                            "get_new_sst_ids_latency_counts - {{%s}}" % NODE_LABEL,
                        ),
                    ],
                ),
                panels.timeseries_latency(
                    "compaction_latency",
                    "",
                    [
                        panels.target(
                            f"histogram_quantile(0.5, sum(irate({metric('state_store_report_compaction_task_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
                            "report_compaction_task_latency_p50 - {{%s}}" % NODE_LABEL,
                        ),
                        panels.target(
                            f"histogram_quantile(0.99, sum(irate({metric('state_store_report_compaction_task_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
                            "report_compaction_task_latency_p99 - {{%s}}" % NODE_LABEL,
                        ),
                        panels.target(
                            f"sum(irate({metric('state_store_report_compaction_task_latency_sum')}[$__rate_interval])) / sum(irate({metric('state_store_report_compaction_task_latency_count')}[$__rate_interval])) > 0",
                            "report_compaction_task_latency_avg",
                        ),
                        panels.target(
                            f"histogram_quantile(0.90, sum(irate({metric('state_store_report_compaction_task_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
                            "report_compaction_task_latency_p90 - {{%s}}" % NODE_LABEL,
                        ),
                    ],
                ),
            ],
        ),
    ]


def section_kafka_metrics(outer_panels):
panels = outer_panels.sub_panel()
return [
Expand Down Expand Up @@ -5009,8 +4833,6 @@ def section_udf(outer_panels):
*section_grpc_meta_catalog_service(panels),
*section_grpc_meta_cluster_service(panels),
*section_grpc_meta_stream_manager(panels),
*section_grpc_meta_hummock_manager(panels),
*section_grpc_hummock_meta_client(panels),
*section_frontend(panels),
*section_memory_manager(panels),
*section_sink_metrics(panels),
Expand Down
2 changes: 1 addition & 1 deletion grafana/risingwave-dev-dashboard.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion grafana/risingwave-user-dashboard.json

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion src/meta/src/hummock/manager/commit_epoch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ use crate::hummock::manager::transaction::{
};
use crate::hummock::manager::versioning::Versioning;
use crate::hummock::metrics_utils::{
get_or_create_local_table_stat, trigger_local_table_stat, trigger_sst_stat,
get_or_create_local_table_stat, trigger_epoch_stat, trigger_local_table_stat, trigger_sst_stat,
};
use crate::hummock::model::CompactionGroup;
use crate::hummock::sequence::{next_compaction_group_id, next_sstable_object_id};
Expand Down Expand Up @@ -293,6 +293,7 @@ impl HummockManager {
*compaction_group_id,
);
}
trigger_epoch_stat(&self.metrics, &versioning.current_version);

drop(versioning_guard);

Expand Down
21 changes: 21 additions & 0 deletions src/meta/src/hummock/metrics_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,27 @@ pub fn trigger_sst_stat(
}
}

/// Publishes the committed-epoch range of `version` to `metrics`.
///
/// Reads `committed_epoch` from every entry of `version.state_table_info`
/// and sets `max_committed_epoch` to the largest value and
/// `min_committed_epoch` to the smallest. Both gauges are set to `0` when
/// there are no entries.
pub fn trigger_epoch_stat(metrics: &MetaMetrics, version: &HummockVersion) {
    // Fresh iterator over every table's committed epoch; invoked once per gauge.
    let committed_epochs = || {
        version
            .state_table_info
            .info()
            .values()
            .map(|table_info| table_info.committed_epoch)
    };

    let newest = committed_epochs().max().unwrap_or(0);
    metrics.max_committed_epoch.set(newest as _);

    let oldest = committed_epochs().min().unwrap_or(0);
    metrics.min_committed_epoch.set(oldest as _);
}

pub fn remove_compaction_group_in_sst_stat(
metrics: &MetaMetrics,
compaction_group_id: CompactionGroupId,
Expand Down
19 changes: 6 additions & 13 deletions src/meta/src/rpc/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,10 +95,8 @@ pub struct MetaMetrics {
// ********************************** Hummock ************************************
/// Max committed epoch
pub max_committed_epoch: IntGauge,
/// The smallest epoch that has not been `GCed`.
pub safe_epoch: IntGauge,
/// The smallest epoch that is being pinned.
pub min_pinned_epoch: IntGauge,
/// Min committed epoch
pub min_committed_epoch: IntGauge,
/// The number of SSTs in each level
pub level_sst_num: IntGaugeVec,
/// The number of SSTs to be merged to next level in each level
Expand Down Expand Up @@ -309,13 +307,9 @@ impl MetaMetrics {
)
.unwrap();

let safe_epoch =
register_int_gauge_with_registry!("storage_safe_epoch", "safe epoch", registry)
.unwrap();

let min_pinned_epoch = register_int_gauge_with_registry!(
"storage_min_pinned_epoch",
"min pinned epoch",
let min_committed_epoch = register_int_gauge_with_registry!(
"storage_min_committed_epoch",
"min committed epoch",
registry
)
.unwrap();
Expand Down Expand Up @@ -794,8 +788,7 @@ impl MetaMetrics {
recovery_latency,

max_committed_epoch,
safe_epoch,
min_pinned_epoch,
min_committed_epoch,
level_sst_num,
level_compact_cnt,
compact_frequency,
Expand Down

0 comments on commit 336bbea

Please sign in to comment.