diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index b4590fe3e5d6..df68f8a68e5f 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -262,14 +262,6 @@ async fn timeline_snapshot_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
     // so create the chan and write to it in another task.
diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs
index c7f5165f90c9..c700e18cc7d7 100644
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -8,6 +8,7 @@ use serde::{Deserialize, Serialize};
 use std::{
     cmp::min,
     io::{self, ErrorKind},
+    sync::Arc,
 };
 use tokio::{fs::OpenOptions, io::AsyncWrite, sync::mpsc, task};
 use tokio_tar::{Archive, Builder, Header};
@@ -25,8 +26,8 @@ use crate::{
         routes::TimelineStatus,
     },
     safekeeper::Term,
-    state::TimelinePersistentState,
-    timeline::WalResidentTimeline,
+    state::{EvictionState, TimelinePersistentState},
+    timeline::{Timeline, WalResidentTimeline},
     timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline},
     wal_backup,
     wal_storage::open_wal_file,
@@ -43,18 +44,33 @@ use utils::{
 /// Stream tar archive of timeline to tx.
 #[instrument(name = "snapshot", skip_all, fields(ttid = %tli.ttid))]
 pub async fn stream_snapshot(
-    tli: WalResidentTimeline,
+    tli: Arc<Timeline>,
     source: NodeId,
     destination: NodeId,
     tx: mpsc::Sender<Result<Bytes>>,
 ) {
-    if let Err(e) = stream_snapshot_guts(tli, source, destination, tx.clone()).await {
-        // Error type/contents don't matter as they won't/can't reach the client
-        // (hyper likely doesn't do anything with it), but the http stream will be
-        // prematurely terminated. It would be nice to try to send the error in
-        // trailers though.
-        tx.send(Err(anyhow!("snapshot failed"))).await.ok();
-        error!("snapshot failed: {:#}", e);
+    match tli.try_wal_residence_guard().await {
+        Err(e) => {
+            tx.send(Err(anyhow!("Error checking residence: {:#}", e)))
+                .await
+                .ok();
+        }
+        Ok(maybe_resident_tli) => {
+            if let Err(e) = match maybe_resident_tli {
+                Some(resident_tli) => {
+                    stream_snapshot_resident_guts(resident_tli, source, destination, tx.clone())
+                        .await
+                }
+                None => stream_snapshot_offloaded_guts(tli, source, destination, tx.clone()).await,
+            } {
+                // Error type/contents don't matter as they won't/can't reach the client
+                // (hyper likely doesn't do anything with it), but the http stream will be
+                // prematurely terminated. It would be nice to try to send the error in
+                // trailers though.
+                tx.send(Err(anyhow!("snapshot failed"))).await.ok();
+                error!("snapshot failed: {:#}", e);
+            }
+        }
     }
 }
@@ -80,12 +96,10 @@ impl Drop for SnapshotContext {
     }
 }
 
-pub async fn stream_snapshot_guts(
-    tli: WalResidentTimeline,
-    source: NodeId,
-    destination: NodeId,
+/// Build a tokio_tar stream that sends encoded bytes into a Bytes channel.
+fn prepare_tar_stream(
     tx: mpsc::Sender<Result<Bytes>>,
-) -> Result<()> {
+) -> tokio_tar::Builder<impl AsyncWrite + Unpin + Send> {
     // tokio-tar wants Write implementor, but we have mpsc tx <Result<Bytes>>;
     // use SinkWriter as a Write impl. That is,
     // - create Sink from the tx. It returns PollSendError if chan is closed.
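The comments above describe the adapter chain that turns the mpsc sender into something tokio-tar can write to. The following standalone sketch (not part of the patch) shows the same PollSender -> CopyToBytes -> SinkWriter composition in isolation; it uses a plain `Bytes` payload and an illustrative error mapping rather than the `Result<Bytes>` channel the handler appears to use:

```rust
use std::io;

use bytes::Bytes;
use futures::SinkExt;
use tokio::io::AsyncWriteExt;
use tokio::sync::mpsc;
use tokio_util::io::{CopyToBytes, SinkWriter};
use tokio_util::sync::PollSender;

#[tokio::main]
async fn main() -> io::Result<()> {
    // Channel that will receive the encoded archive chunks.
    let (tx, mut rx) = mpsc::channel::<Bytes>(16);

    // PollSender turns the mpsc Sender into a futures Sink<Bytes>; its
    // PollSendError has to be mapped into io::Error for SinkWriter below.
    let sink = PollSender::new(tx)
        .sink_map_err(|_| io::Error::new(io::ErrorKind::BrokenPipe, "receiver dropped"));

    // CopyToBytes adapts Sink<Bytes> into Sink<&[u8]> (one copy per write),
    // and SinkWriter exposes that sink as a tokio AsyncWrite.
    let mut writer = SinkWriter::new(CopyToBytes::new(sink));

    // Anything written to `writer` now shows up on `rx` as Bytes chunks.
    let printer = tokio::spawn(async move {
        while let Some(chunk) = rx.recv().await {
            println!("received {} bytes", chunk.len());
        }
    });

    writer.write_all(b"pretend this is tar output").await?;
    writer.shutdown().await?;
    drop(writer); // drop the sending side so the receiver task finishes
    printer.await.expect("receiver task panicked");
    Ok(())
}
```

The receiving half of the channel can then be consumed independently, for example forwarded chunk by chunk as an HTTP response body, which appears to be the role of the channel created in timeline_snapshot_handler above.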
@@ -100,12 +114,38 @@ pub async fn stream_snapshot_guts(
     // - SinkWriter (not surprisingly) wants sink of &[u8], not bytes, so wrap
     //   into CopyToBytes. This is a data copy.
     let copy_to_bytes = CopyToBytes::new(oksink);
-    let mut writer = SinkWriter::new(copy_to_bytes);
-    let pinned_writer = std::pin::pin!(writer);
+    let writer = SinkWriter::new(copy_to_bytes);
+    let pinned_writer = Box::pin(writer);
 
     // Note that tokio_tar append_* funcs use tokio::io::copy with 8KB buffer
     // which is also likely suboptimal.
-    let mut ar = Builder::new_non_terminated(pinned_writer);
+    Builder::new_non_terminated(pinned_writer)
+}
+
+/// Implementation of snapshot for an offloaded timeline, which only reads the control file
+pub(crate) async fn stream_snapshot_offloaded_guts(
+    tli: Arc<Timeline>,
+    source: NodeId,
+    destination: NodeId,
+    tx: mpsc::Sender<Result<Bytes>>,
+) -> Result<()> {
+    let mut ar = prepare_tar_stream(tx);
+
+    tli.snapshot_offloaded(&mut ar, source, destination).await?;
+
+    ar.finish().await?;
+
+    Ok(())
+}
+
+/// Implementation of snapshot for a timeline which is resident (includes some segment data)
+pub async fn stream_snapshot_resident_guts(
+    tli: WalResidentTimeline,
+    source: NodeId,
+    destination: NodeId,
+    tx: mpsc::Sender<Result<Bytes>>,
+) -> Result<()> {
+    let mut ar = prepare_tar_stream(tx);
 
     let bctx = tli.start_snapshot(&mut ar, source, destination).await?;
     pausable_failpoint!("sk-snapshot-after-list-pausable");
@@ -138,6 +178,70 @@ pub async fn stream_snapshot_guts(
     Ok(())
 }
 
+impl Timeline {
+    /// Simple snapshot for an offloaded timeline: we will only upload a renamed partial segment and
+    /// pass a modified control file into the provided tar stream (nothing to do with data segments on
+    /// disk, since we are offloaded and there aren't any).
+    async fn snapshot_offloaded<W: AsyncWrite + Unpin + Send>(
+        self: &Arc<Self>,
+        ar: &mut tokio_tar::Builder<W>,
+        source: NodeId,
+        destination: NodeId,
+    ) -> Result<()> {
+        // Take an initial copy of the control file, then release the state lock.
+        let mut control_file = {
+            let shared_state = self.write_shared_state().await;
+
+            let control_file = TimelinePersistentState::clone(shared_state.sk.state());
+
+            // Rare race: we got unevicted between entering this function and reading the control file.
+            // We error out and let the API caller retry.
+            if !matches!(control_file.eviction_state, EvictionState::Offloaded(_)) {
+                bail!("Timeline was un-evicted during snapshot, please retry");
+            }
+
+            control_file
+        };
+
+        // Modify the partial segment of the in-memory copy of the control file to
+        // point to the destination safekeeper.
+        let replace = control_file
+            .partial_backup
+            .replace_uploaded_segment(source, destination)?;
+
+        let Some(replace) = replace else {
+            // In Manager::ready_for_eviction, we do not permit eviction unless the timeline
+            // has a partial segment, so it is unexpected to get here without one.
+            anyhow::bail!("Timeline has no partial segment, cannot generate snapshot");
+        };
+
+        tracing::info!("Replacing uploaded partial segment in in-mem control file: {replace:?}");
+
+        // Optimistically try to copy the partial segment to the destination's path: this
+        // can fail if the timeline was un-evicted and modified in the background.
+        let remote_timeline_path = &self.remote_path;
+        wal_backup::copy_partial_segment(
+            &replace.previous.remote_path(remote_timeline_path),
+            &replace.current.remote_path(remote_timeline_path),
+        )
+        .await?;
+
+        // Since the S3 copy succeeded with the path given in our control file snapshot, and
+        // we are sending that snapshot in our response, we are giving the caller a consistent
+        // snapshot even if our local Timeline was unevicted or otherwise modified in the meantime.
+        let buf = control_file
+            .write_to_buf()
+            .with_context(|| "failed to serialize control store")?;
+        let mut header = Header::new_gnu();
+        header.set_size(buf.len().try_into().expect("never breaches u64"));
+        ar.append_data(&mut header, CONTROL_FILE_NAME, buf.as_slice())
+            .await
+            .with_context(|| "failed to append to archive")?;
+
+        Ok(())
+    }
+}
+
 impl WalResidentTimeline {
     /// Start streaming tar archive with timeline:
     /// 1) stream control file under lock;
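The control-file entry above is written with plain tokio_tar primitives: build a GNU header, set its size, then `append_data` with an in-memory reader and `finish` the archive. A minimal standalone sketch of that pattern (not part of the patch; the entry name and payload are invented, and a file stands in for the SinkWriter-backed writer returned by prepare_tar_stream):

```rust
use anyhow::Context;
use tokio_tar::{Builder, Header};

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // Any AsyncWrite destination works; a file keeps the sketch self-contained.
    let out = tokio::fs::File::create("example.tar").await?;
    let mut ar = Builder::new(out);

    // Serialize some in-memory state and append it as a single tar entry,
    // mirroring how the control file snapshot is appended above.
    let buf: Vec<u8> = b"pretend this is a serialized control file".to_vec();
    let mut header = Header::new_gnu();
    header.set_size(buf.len() as u64);
    ar.append_data(&mut header, "safekeeper.control", buf.as_slice())
        .await
        .with_context(|| "failed to append to archive")?;

    // finish() writes the tar trailer and flushes the underlying writer.
    ar.finish().await?;
    Ok(())
}
```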
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index c737dfcf9b99..f0113978c469 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -797,14 +797,17 @@ impl Timeline {
         state.sk.term_bump(to).await
     }
 
-    /// Get the timeline guard for reading/writing WAL files.
-    /// If WAL files are not present on disk (evicted), they will be automatically
-    /// downloaded from remote storage. This is done in the manager task, which is
-    /// responsible for issuing all guards.
-    ///
-    /// NB: don't use this function from timeline_manager, it will deadlock.
-    /// NB: don't use this function while holding shared_state lock.
-    pub async fn wal_residence_guard(self: &Arc<Self>) -> Result<WalResidentTimeline> {
+    /// Guts of [`Self::wal_residence_guard`] and [`Self::try_wal_residence_guard`]
+    async fn do_wal_residence_guard(
+        self: &Arc<Self>,
+        block: bool,
+    ) -> Result<Option<WalResidentTimeline>> {
+        let op_label = if block {
+            "wal_residence_guard"
+        } else {
+            "try_wal_residence_guard"
+        };
+
         if self.is_cancelled() {
             bail!(TimelineError::Cancelled(self.ttid));
         }
@@ -816,10 +819,13 @@ impl Timeline {
         // Wait 30 seconds for the guard to be acquired. It can time out if someone is
         // holding the lock (e.g. during `SafeKeeper::process_msg()`) or manager task
         // is stuck.
-        let res = tokio::time::timeout_at(
-            started_at + Duration::from_secs(30),
-            self.manager_ctl.wal_residence_guard(),
-        )
+        let res = tokio::time::timeout_at(started_at + Duration::from_secs(30), async {
+            if block {
+                self.manager_ctl.wal_residence_guard().await.map(Some)
+            } else {
+                self.manager_ctl.try_wal_residence_guard().await
+            }
+        })
         .await;
 
         let guard = match res {
@@ -827,14 +833,14 @@ impl Timeline {
                 let finished_at = Instant::now();
                 let elapsed = finished_at - started_at;
                 MISC_OPERATION_SECONDS
-                    .with_label_values(&["wal_residence_guard"])
+                    .with_label_values(&[op_label])
                     .observe(elapsed.as_secs_f64());
 
                 guard
             }
             Ok(Err(e)) => {
                 warn!(
-                    "error while acquiring WalResidentTimeline guard, statuses {:?} => {:?}",
+                    "error while acquiring {op_label}, statuses {:?} => {:?}",
                     status_before,
                     self.mgr_status.get()
                 );
@@ -842,7 +848,7 @@ impl Timeline {
             }
             Err(_) => {
                 warn!(
-                    "timeout while acquiring WalResidentTimeline guard, statuses {:?} => {:?}",
+                    "timeout while acquiring {op_label}, statuses {:?} => {:?}",
                     status_before,
                     self.mgr_status.get()
                 );
@@ -850,7 +856,28 @@ impl Timeline {
             }
         };
 
-        Ok(WalResidentTimeline::new(self.clone(), guard))
+        Ok(guard.map(|g| WalResidentTimeline::new(self.clone(), g)))
+    }
+
+    /// Get the timeline guard for reading/writing WAL files.
+    /// If WAL files are not present on disk (evicted), they will be automatically
+    /// downloaded from remote storage. This is done in the manager task, which is
+    /// responsible for issuing all guards.
+    ///
+    /// NB: don't use this function from timeline_manager, it will deadlock.
+    /// NB: don't use this function while holding shared_state lock.
+    pub async fn wal_residence_guard(self: &Arc<Self>) -> Result<WalResidentTimeline> {
+        self.do_wal_residence_guard(true)
+            .await
+            .map(|m| m.expect("Always get Some in block=true mode"))
+    }
+
+    /// Get the timeline guard for reading/writing WAL files if the timeline is resident,
+    /// else return None.
+    pub(crate) async fn try_wal_residence_guard(
+        self: &Arc<Self>,
+    ) -> Result<Option<WalResidentTimeline>> {
+        self.do_wal_residence_guard(false).await
     }
 
     pub async fn backup_partial_reset(self: &Arc<Self>) -> Result<Vec<String>> {
diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs
index f5363ae9b01d..303421c83758 100644
--- a/safekeeper/src/timeline_eviction.rs
+++ b/safekeeper/src/timeline_eviction.rs
@@ -56,6 +56,9 @@ impl Manager {
             // This also works for the first segment despite last_removed_segno
             // being 0 on init because this 0 triggers run of wal_removal_task
             // on success of which manager updates the horizon.
+            //
+            // **Note** pull_timeline functionality assumes that evicted timelines always have
+            // a partial segment: if we ever change this condition, we must also update that code.
             && self
                 .partial_backup_uploaded
                 .as_ref()
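The timeline_manager.rs changes below add a `TryGuardRequest` control message next to the existing blocking `GuardRequest`: each request carries a oneshot sender, and the manager task replies on it, with `None` meaning "currently offloaded, no guard issued". A self-contained toy sketch of that request/response pattern follows (the message names, the `u64` standing in for a guard, and the `is_offloaded` flag are illustrative, not the patch's real types):

```rust
use tokio::sync::{mpsc, oneshot};

// Toy stand-ins for the manager control messages: a blocking guard request
// and a "try" variant that answers None while the resource is offloaded.
enum CtlMessage {
    GuardRequest(oneshot::Sender<u64>),
    TryGuardRequest(oneshot::Sender<Option<u64>>),
}

#[tokio::main]
async fn main() {
    let (ctl_tx, mut ctl_rx) = mpsc::channel::<CtlMessage>(8);

    // Manager task: owns the state and answers every request over its oneshot.
    let manager = tokio::spawn(async move {
        let is_offloaded = true;
        let mut next_guard_id = 0u64;
        while let Some(msg) = ctl_rx.recv().await {
            match msg {
                CtlMessage::GuardRequest(reply) => {
                    // A real manager would restore evicted data before replying.
                    next_guard_id += 1;
                    reply.send(next_guard_id).ok();
                }
                CtlMessage::TryGuardRequest(reply) => {
                    // Never triggers a restore: offloaded simply means None.
                    let guard = if is_offloaded {
                        None
                    } else {
                        next_guard_id += 1;
                        Some(next_guard_id)
                    };
                    reply.send(guard).ok();
                }
            }
        }
    });

    // The "try" flavour lets pull_timeline-style callers take the cheap
    // offloaded path instead of forcing data back onto local disk.
    let (tx, rx) = oneshot::channel();
    ctl_tx.send(CtlMessage::TryGuardRequest(tx)).await.unwrap();
    match rx.await.unwrap() {
        Some(id) => println!("resident, got guard {id}"),
        None => println!("offloaded, serve the snapshot from the control file alone"),
    }

    // The blocking flavour always yields a guard (after a restore if needed).
    let (tx, rx) = oneshot::channel();
    ctl_tx.send(CtlMessage::GuardRequest(tx)).await.unwrap();
    println!("blocking request got guard {}", rx.await.unwrap());

    drop(ctl_tx);
    manager.await.unwrap();
}
```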
diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs
index f0583dd3ff77..79200fff8d84 100644
--- a/safekeeper/src/timeline_manager.rs
+++ b/safekeeper/src/timeline_manager.rs
@@ -100,6 +100,8 @@ const REFRESH_INTERVAL: Duration = Duration::from_millis(300);
 pub enum ManagerCtlMessage {
     /// Request to get a guard for WalResidentTimeline, with WAL files available locally.
     GuardRequest(tokio::sync::oneshot::Sender>),
+    /// Get a guard for WalResidentTimeline if the timeline is not currently offloaded, else None.
+    TryGuardRequest(tokio::sync::oneshot::Sender>),
     /// Request to drop the guard.
     GuardDrop(GuardId),
     /// Request to reset uploaded partial backup state.
@@ -110,6 +112,7 @@ impl std::fmt::Debug for ManagerCtlMessage {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
             ManagerCtlMessage::GuardRequest(_) => write!(f, "GuardRequest"),
+            ManagerCtlMessage::TryGuardRequest(_) => write!(f, "TryGuardRequest"),
             ManagerCtlMessage::GuardDrop(id) => write!(f, "GuardDrop({:?})", id),
             ManagerCtlMessage::BackupPartialReset(_) => write!(f, "BackupPartialReset"),
         }
@@ -152,6 +155,19 @@ impl ManagerCtl {
             .and_then(std::convert::identity)
     }
 
+    /// Issue a new guard if the timeline is currently not offloaded, else return None.
+    /// Sends a message to the manager and waits for the response.
+    /// Can be blocked indefinitely if the manager is stuck.
+    pub async fn try_wal_residence_guard(&self) -> anyhow::Result> {
+        let (tx, rx) = tokio::sync::oneshot::channel();
+        self.manager_tx
+            .send(ManagerCtlMessage::TryGuardRequest(tx))?;
+
+        // wait for the manager to respond with the guard
+        rx.await
+            .map_err(|e| anyhow::anyhow!("response read fail: {:?}", e))
+    }
+
     /// Request timeline manager to reset uploaded partial segment state and
     /// wait for the result.
     pub async fn backup_partial_reset(&self) -> anyhow::Result<Vec<String>> {
@@ -674,6 +690,17 @@ impl Manager {
                     warn!("failed to reply with a guard, receiver dropped");
                 }
            }
+            Some(ManagerCtlMessage::TryGuardRequest(tx)) => {
+                let result = if self.is_offloaded {
+                    None
+                } else {
+                    Some(self.access_service.create_guard())
+                };
+
+                if tx.send(result).is_err() {
+                    warn!("failed to reply with a guard, receiver dropped");
+                }
+            }
             Some(ManagerCtlMessage::GuardDrop(guard_id)) => {
                 self.access_service.drop_guard(guard_id);
             }
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index d803cd7c789f..157390c01c0e 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -1998,6 +1998,109 @@ def test_pull_timeline_term_change(neon_env_builder: NeonEnvBuilder):
     pt_handle.join()
 
 
+def test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder):
+    """
+    Verify that when pull_timeline is used on an evicted timeline, it does not result in
+    promoting any segments to local disk on the source, and the timeline is correctly instantiated
+    in evicted state on the destination. This behavior is important to avoid ballooning disk
+    usage when doing mass migration of timelines.
+    """
+    neon_env_builder.num_safekeepers = 4
+    neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage())
+
+    # Configure safekeepers with an ultra-fast eviction policy
+    neon_env_builder.safekeeper_extra_opts = [
+        "--enable-offload",
+        "--partial-backup-timeout",
+        "50ms",
+        "--control-file-save-interval",
+        "1s",
+        # Safekeepers usually wait a while before evicting something: for this test we want them to
+        # evict things as soon as they are inactive.
+        "--eviction-min-resident=100ms",
+        "--delete-offloaded-wal",
+    ]
+
+    initial_tenant_conf = {"lagging_wal_timeout": "1s", "checkpoint_timeout": "100ms"}
+    env = neon_env_builder.init_start(initial_tenant_conf=initial_tenant_conf)
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    (src_sk, dst_sk) = (env.safekeepers[0], env.safekeepers[-1])
+    log.info(f"Will pull_timeline on destination {dst_sk.id} from source {src_sk.id}")
+
+    ep = env.endpoints.create("main")
+    ep.active_safekeepers = [s.id for s in env.safekeepers if s.id != dst_sk.id]
+    log.info(f"Compute writing initially to safekeepers: {ep.active_safekeepers}")
+    ep.active_safekeepers = [1, 2, 3]  # Exclude dst_sk from set written by compute initially
+    ep.start()
+    ep.safe_psql("CREATE TABLE t(i int)")
+    ep.safe_psql("INSERT INTO t VALUES (0)")
+    ep.stop()
+
+    wait_lsn_force_checkpoint_at_sk(src_sk, tenant_id, timeline_id, env.pageserver)
+
+    src_http = src_sk.http_client()
+    dst_http = dst_sk.http_client()
+
+    def evicted_on_source():
+        # Wait for timeline to go into evicted state
+        assert src_http.get_eviction_state(timeline_id) != "Present"
+        assert (
+            src_http.get_metric_value(
+                "safekeeper_eviction_events_completed_total", {"kind": "evict"}
+            )
+            or 0
+        ) > 0
+        assert (src_http.get_metric_value("safekeeper_evicted_timelines") or 0) > 0
+        # Check that on source no segment files are present
+        assert src_sk.list_segments(tenant_id, timeline_id) == []
+
+    wait_until(60, 1, evicted_on_source)
+
+    # Invoke pull_timeline: source should serve snapshot request without promoting anything to local disk,
+    # destination should import the control file only & go into evicted mode immediately
+    dst_sk.pull_timeline([src_sk], tenant_id, timeline_id)
+
+    # Check that on source and destination no segment files are present
+    assert src_sk.list_segments(tenant_id, timeline_id) == []
+    assert dst_sk.list_segments(tenant_id, timeline_id) == []
+
+    # Check that the timeline on the destination is in the expected evicted state.
+    evicted_on_source()  # It should still be evicted on the source
+
+    def evicted_on_destination():
+        assert dst_http.get_eviction_state(timeline_id) != "Present"
+        assert (dst_http.get_metric_value("safekeeper_evicted_timelines") or 0) > 0
+
+    # This should be fast: it is a wait_until because eviction state is updated
+    # in the background wrt pull_timeline.
+    wait_until(10, 0.1, evicted_on_destination)
+
+    # Delete the timeline on the source, to prove that deletion works on an
+    # evicted timeline _and_ that the final compute test is really not using
+    # the original location
+    src_sk.http_client().timeline_delete(tenant_id, timeline_id, only_local=True)
+
+    # Check that using the timeline correctly un-evicts it on the new location
+    ep.active_safekeepers = [2, 3, 4]
+    ep.start()
+    ep.safe_psql("INSERT INTO t VALUES (0)")
+    ep.stop()
+
+    def unevicted_on_dest():
+        assert (
+            dst_http.get_metric_value(
+                "safekeeper_eviction_events_completed_total", {"kind": "restore"}
+            )
+            or 0
+        ) > 0
+        n_evicted = dst_sk.http_client().get_metric_value("safekeeper_evicted_timelines")
+        assert n_evicted == 0
+
+    wait_until(10, 1, unevicted_on_dest)
+
+
 # In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries
 # when compute is active, but there are no writes to the timeline. In that case
 # pageserver should maintain a single connection to safekeeper and don't attempt