Skip to content

Commit

Permalink
pageserver: add disk consistent and remote lsn metrics (#10005)
Browse files Browse the repository at this point in the history
## Problem

There's no metrics for disk consistent LSN and remote LSN. This stuff is
useful when looking at ingest performance.

## Summary of changes

Two per timeline metrics are added: `pageserver_disk_consistent_lsn` and
`pageserver_projected_remote_consistent_lsn`. I went for the projected
remote lsn instead of the visible one
because that more closely matches remote storage write tput. Ideally we
would have both, but these metrics are expensive.
  • Loading branch information
VladLazar authored Dec 6, 2024
1 parent ec4072f commit 3f1c542
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 5 deletions.
46 changes: 43 additions & 3 deletions pageserver/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,24 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});

static DISK_CONSISTENT_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_disk_consistent_lsn",
"Disk consistent LSN grouped by timeline",
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});

pub(crate) static PROJECTED_REMOTE_CONSISTENT_LSN: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_projected_remote_consistent_lsn",
"Projected remote consistent LSN grouped by timeline",
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});

static PITR_HISTORY_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_pitr_history_size",
Expand Down Expand Up @@ -2394,7 +2412,8 @@ pub(crate) struct TimelineMetrics {
pub load_layer_map_histo: StorageTimeMetrics,
pub garbage_collect_histo: StorageTimeMetrics,
pub find_gc_cutoffs_histo: StorageTimeMetrics,
pub last_record_gauge: IntGauge,
pub last_record_lsn_gauge: IntGauge,
pub disk_consistent_lsn_gauge: IntGauge,
pub pitr_history_size: UIntGauge,
pub archival_size: UIntGauge,
pub(crate) layer_size_image: UIntGauge,
Expand Down Expand Up @@ -2475,7 +2494,11 @@ impl TimelineMetrics {
&shard_id,
&timeline_id,
);
let last_record_gauge = LAST_RECORD_LSN
let last_record_lsn_gauge = LAST_RECORD_LSN
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();

let disk_consistent_lsn_gauge = DISK_CONSISTENT_LSN
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();

Expand Down Expand Up @@ -2578,7 +2601,8 @@ impl TimelineMetrics {
garbage_collect_histo,
find_gc_cutoffs_histo,
load_layer_map_histo,
last_record_gauge,
last_record_lsn_gauge,
disk_consistent_lsn_gauge,
pitr_history_size,
archival_size,
layer_size_image,
Expand Down Expand Up @@ -2642,6 +2666,7 @@ impl TimelineMetrics {
let timeline_id = &self.timeline_id;
let shard_id = &self.shard_id;
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = DISK_CONSISTENT_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = FLUSH_WAIT_UPLOAD_TIME.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]);
{
Expand Down Expand Up @@ -2805,6 +2830,7 @@ pub(crate) struct RemoteTimelineClientMetrics {
calls: Mutex<HashMap<(&'static str, &'static str), IntCounterPair>>,
bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
pub(crate) projected_remote_consistent_lsn_gauge: UIntGauge,
}

impl RemoteTimelineClientMetrics {
Expand All @@ -2819,6 +2845,10 @@ impl RemoteTimelineClientMetrics {
.unwrap(),
);

let projected_remote_consistent_lsn_gauge = PROJECTED_REMOTE_CONSISTENT_LSN
.get_metric_with_label_values(&[&tenant_id_str, &shard_id_str, &timeline_id_str])
.unwrap();

RemoteTimelineClientMetrics {
tenant_id: tenant_id_str,
shard_id: shard_id_str,
Expand All @@ -2827,6 +2857,7 @@ impl RemoteTimelineClientMetrics {
bytes_started_counter: Mutex::new(HashMap::default()),
bytes_finished_counter: Mutex::new(HashMap::default()),
remote_physical_size_gauge,
projected_remote_consistent_lsn_gauge,
}
}

Expand Down Expand Up @@ -3040,6 +3071,7 @@ impl Drop for RemoteTimelineClientMetrics {
calls,
bytes_started_counter,
bytes_finished_counter,
projected_remote_consistent_lsn_gauge,
} = self;
for ((a, b), _) in calls.get_mut().unwrap().drain() {
let mut res = [Ok(()), Ok(())];
Expand Down Expand Up @@ -3069,6 +3101,14 @@ impl Drop for RemoteTimelineClientMetrics {
let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in desctructuring above
let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
}
{
let _ = projected_remote_consistent_lsn_gauge;
let _ = PROJECTED_REMOTE_CONSISTENT_LSN.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
]);
}
}
}

Expand Down
3 changes: 3 additions & 0 deletions pageserver/src/tenant/remote_timeline_client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2192,6 +2192,9 @@ impl RemoteTimelineClient {
upload_queue.clean.1 = Some(task.task_id);

let lsn = upload_queue.clean.0.metadata.disk_consistent_lsn();
self.metrics
.projected_remote_consistent_lsn_gauge
.set(lsn.0);

if self.generation.is_none() {
// Legacy mode: skip validating generation
Expand Down
8 changes: 6 additions & 2 deletions pageserver/src/tenant/timeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2392,7 +2392,7 @@ impl Timeline {

result
.metrics
.last_record_gauge
.last_record_lsn_gauge
.set(disk_consistent_lsn.0 as i64);
result
})
Expand Down Expand Up @@ -3514,7 +3514,7 @@ impl Timeline {
pub(crate) fn finish_write(&self, new_lsn: Lsn) {
assert!(new_lsn.is_aligned());

self.metrics.last_record_gauge.set(new_lsn.0 as i64);
self.metrics.last_record_lsn_gauge.set(new_lsn.0 as i64);
self.last_record_lsn.advance(new_lsn);
}

Expand Down Expand Up @@ -3882,6 +3882,10 @@ impl Timeline {
fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool {
let old_value = self.disk_consistent_lsn.fetch_max(new_value);
assert!(new_value >= old_value, "disk_consistent_lsn must be growing monotonously at runtime; current {old_value}, offered {new_value}");

self.metrics
.disk_consistent_lsn_gauge
.set(new_value.0 as i64);
new_value != old_value
}

Expand Down
2 changes: 2 additions & 0 deletions test_runner/fixtures/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,8 @@ def counter(name: str) -> str:
"pageserver_resident_physical_size",
"pageserver_io_operations_bytes_total",
"pageserver_last_record_lsn",
"pageserver_disk_consistent_lsn",
"pageserver_projected_remote_consistent_lsn",
"pageserver_standby_horizon",
"pageserver_smgr_query_seconds_bucket",
"pageserver_smgr_query_seconds_count",
Expand Down

1 comment on commit 3f1c542

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

7166 tests run: 6846 passed, 1 failed, 319 skipped (full report)


Failures on Postgres 16

  • test_ingest_logical_message[github-actions-selfhosted-fsync-1024]: release-x86-64
# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_ingest_logical_message[release-pg16-github-actions-selfhosted-fsync-1024]"
Flaky tests (16)

Postgres 17

Postgres 16

Postgres 15

Postgres 14

Code coverage* (full report)

  • functions: 31.4% (8324 of 26506 functions)
  • lines: 47.8% (65472 of 137071 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
3f1c542 at 2024-12-06T12:19:17.754Z :recycle:

Please sign in to comment.