Skip to content

Commit

Permalink
pageserver: make local timeline deletion infallible
Browse files Browse the repository at this point in the history
  • Loading branch information
jcsp committed Oct 31, 2024
1 parent 51fda11 commit 9e91f6e
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 24 deletions.
39 changes: 16 additions & 23 deletions pageserver/src/tenant/timeline/delete.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ use crate::{
CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant,
TimelineOrOffloaded,
},
virtual_file::MaybeFatalIo,
};

use super::{Timeline, TimelineResources};
Expand Down Expand Up @@ -62,18 +63,18 @@ pub(super) async fn delete_local_timeline_directory(
conf: &PageServerConf,
tenant_shard_id: TenantShardId,
timeline: &Timeline,
) -> anyhow::Result<()> {
) {
// Always ensure the lock order is compaction -> gc.
let compaction_lock = timeline.compaction_lock.lock();
let compaction_lock = crate::timed(
let _compaction_lock = crate::timed(
compaction_lock,
"acquires compaction lock",
std::time::Duration::from_secs(5),
)
.await;

let gc_lock = timeline.gc_lock.lock();
let gc_lock = crate::timed(
let _gc_lock = crate::timed(
gc_lock,
"acquires gc lock",
std::time::Duration::from_secs(5),
Expand All @@ -85,24 +86,15 @@ pub(super) async fn delete_local_timeline_directory(

let local_timeline_directory = conf.timeline_path(&tenant_shard_id, &timeline.timeline_id);

fail::fail_point!("timeline-delete-before-rm", |_| {
Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
});

// NB: This need not be atomic because the deleted flag in the IndexPart
// will be observed during tenant/timeline load. The deletion will be resumed there.
//
// Note that here we do not bail out on std::io::ErrorKind::NotFound.
// This can happen if we're called a second time, e.g.,
// because of a previous failure/cancellation at/after
// failpoint timeline-delete-after-rm.
//
// ErrorKind::NotFound can also happen if we race with tenant detach, because,
// no locks are shared.
tokio::fs::remove_dir_all(local_timeline_directory)
.await
.or_else(fs_ext::ignore_not_found)
.context("remove local timeline directory")?;
.fatal_err("Removing timeline directory");

// Make sure previous deletions are ordered before mark removal.
// Otherwise there is no guarantee that they reach the disk before mark deletion.
Expand All @@ -113,17 +105,9 @@ pub(super) async fn delete_local_timeline_directory(
let timeline_path = conf.timelines_path(&tenant_shard_id);
crashsafe::fsync_async(timeline_path)
.await
.context("fsync_pre_mark_remove")?;
.fatal_err("Fsync after removing timeline directory");

info!("finished deleting layer files, releasing locks");
drop(gc_lock);
drop(compaction_lock);

fail::fail_point!("timeline-delete-after-rm", |_| {
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
});

Ok(())
}

/// Removes remote layers and an index file after them.
Expand Down Expand Up @@ -440,12 +424,21 @@ impl DeleteTimelineFlow {
timeline: &TimelineOrOffloaded,
remote_client: Arc<RemoteTimelineClient>,
) -> Result<(), DeleteTimelineError> {
fail::fail_point!("timeline-delete-before-rm", |_| {
Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
});

// Offloaded timelines have no local state
// TODO: once we persist offloaded information, delete the timeline from there, too
if let TimelineOrOffloaded::Timeline(timeline) = timeline {
delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await?;
delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await;
}

fail::fail_point!("timeline-delete-after-rm", |_| {
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
});


delete_remote_layers_and_index(&remote_client).await?;

pausable_failpoint!("in_progress_delete");
Expand Down
2 changes: 1 addition & 1 deletion pageserver/src/tenant/timeline/offload.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ pub(crate) async fn offload_timeline(
// to make deletions possible while offloading is in progress

let conf = &tenant.conf;
delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline).await?;
delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline).await;

remove_timeline_from_tenant(tenant, &timeline, &guard).await?;

Expand Down

0 comments on commit 9e91f6e

Please sign in to comment.