Skip to content

Commit

Permalink
pageserver: add no_sync for use in regression tests (1/2) (#9677)
Browse files Browse the repository at this point in the history
## Problem

In test environments, the `syncfs` that the pageserver does on startup
can take a long time, as other tests running concurrently might have
many gigabytes of dirty pages.

## Summary of changes

- Add a `no_sync` option to the pageserver's config.
- Skip syncfs on startup if this is set
- A subsequent PR (#9678) will
enable this by default in tests. We need to wait until after the next
release to avoid breaking compat tests, which would fail if we set
no_sync & use an old pageserver binary.

Q: Why is this a different mechanism than safekeeper, which as a
--no-sync CLI?
A: Because the way we manage pageservers in neon_local depends on the
pageserver.toml containing the full configuration, whereas safekeepers
have a config file which is neon-local-specific and can drive a CLI
flag.

Q: Why is the option no_sync rather than sync?
A: For boolean configs with a dangerous value, it's preferable to make
"false" the safe option, so that any downstream future config tooling
that might have a "booleans are false by default" behavior (e.g. golang
structs) is safe by default.

Q: Why only skip the syncfs, and not all fsyncs?
A: Skipping all fsyncs would require more code changes, and the most
acute problem isn't fsyncs themselves (these just slow down a running
test), it's the syncfs (which makes a pageserver startup slow as a
result of _other_ tests)
  • Loading branch information
jcsp authored Nov 8, 2024
1 parent 027889b commit aa9112e
Show file tree
Hide file tree
Showing 6 changed files with 33 additions and 7 deletions.
3 changes: 3 additions & 0 deletions control_plane/src/bin/neon_local.rs
Original file line number Diff line number Diff line change
Expand Up @@ -944,6 +944,9 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
pg_auth_type: AuthType::Trust,
http_auth_type: AuthType::Trust,
other: Default::default(),
// Typical developer machines use disks with slow fsync, and we don't care
// about data integrity: disable disk syncs.
no_sync: true,
}
})
.collect(),
Expand Down
10 changes: 10 additions & 0 deletions control_plane/src/local_env.rs
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@ pub struct PageServerConf {
pub listen_http_addr: String,
pub pg_auth_type: AuthType,
pub http_auth_type: AuthType,
pub no_sync: bool,
}

impl Default for PageServerConf {
Expand All @@ -235,6 +236,7 @@ impl Default for PageServerConf {
listen_http_addr: String::new(),
pg_auth_type: AuthType::Trust,
http_auth_type: AuthType::Trust,
no_sync: false,
}
}
}
Expand All @@ -249,6 +251,8 @@ pub struct NeonLocalInitPageserverConf {
pub listen_http_addr: String,
pub pg_auth_type: AuthType,
pub http_auth_type: AuthType,
#[serde(default, skip_serializing_if = "std::ops::Not::not")]
pub no_sync: bool,
#[serde(flatten)]
pub other: HashMap<String, toml::Value>,
}
Expand All @@ -261,6 +265,7 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf {
listen_http_addr,
pg_auth_type,
http_auth_type,
no_sync,
other: _,
} = conf;
Self {
Expand All @@ -269,6 +274,7 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf {
listen_http_addr: listen_http_addr.clone(),
pg_auth_type: *pg_auth_type,
http_auth_type: *http_auth_type,
no_sync: *no_sync,
}
}
}
Expand Down Expand Up @@ -569,6 +575,8 @@ impl LocalEnv {
listen_http_addr: String,
pg_auth_type: AuthType,
http_auth_type: AuthType,
#[serde(default)]
no_sync: bool,
}
let config_toml_path = dentry.path().join("pageserver.toml");
let config_toml: PageserverConfigTomlSubset = toml_edit::de::from_str(
Expand All @@ -591,6 +599,7 @@ impl LocalEnv {
listen_http_addr,
pg_auth_type,
http_auth_type,
no_sync,
} = config_toml;
let IdentityTomlSubset {
id: identity_toml_id,
Expand All @@ -607,6 +616,7 @@ impl LocalEnv {
listen_http_addr,
pg_auth_type,
http_auth_type,
no_sync,
};
pageservers.push(conf);
}
Expand Down
1 change: 1 addition & 0 deletions control_plane/src/pageserver.rs
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,7 @@ impl PageServerNode {
)
})?;
let args = vec!["-D", datadir_path_str];

background_process::start_process(
"pageserver",
&datadir,
Expand Down
3 changes: 3 additions & 0 deletions libs/pageserver_api/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,8 @@ pub struct ConfigToml {
pub ephemeral_bytes_per_memory_kb: usize,
pub l0_flush: Option<crate::models::L0FlushConfig>,
pub virtual_file_io_mode: Option<crate::models::virtual_file::IoMode>,
#[serde(skip_serializing_if = "Option::is_none")]
pub no_sync: Option<bool>,
}

#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
Expand Down Expand Up @@ -389,6 +391,7 @@ impl Default for ConfigToml {
l0_flush: None,
virtual_file_io_mode: None,
tenant_config: TenantConfigToml::default(),
no_sync: None,
}
}
}
Expand Down
18 changes: 11 additions & 7 deletions pageserver/src/bin/pageserver.rs
Original file line number Diff line number Diff line change
Expand Up @@ -154,13 +154,17 @@ fn main() -> anyhow::Result<()> {
},
};

let started = Instant::now();
syncfs(dirfd)?;
let elapsed = started.elapsed();
info!(
elapsed_ms = elapsed.as_millis(),
"made tenant directory contents durable"
);
if conf.no_sync {
info!("Skipping syncfs on startup");
} else {
let started = Instant::now();
syncfs(dirfd)?;
let elapsed = started.elapsed();
info!(
elapsed_ms = elapsed.as_millis(),
"made tenant directory contents durable"
);
}
}

// Initialize up failpoints support
Expand Down
5 changes: 5 additions & 0 deletions pageserver/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,9 @@ pub struct PageServerConf {

/// Direct IO settings
pub virtual_file_io_mode: virtual_file::IoMode,

/// Optionally disable disk syncs (unsafe!)
pub no_sync: bool,
}

/// Token for authentication to safekeepers
Expand Down Expand Up @@ -332,6 +335,7 @@ impl PageServerConf {
concurrent_tenant_size_logical_size_queries,
virtual_file_io_engine,
tenant_config,
no_sync,
} = config_toml;

let mut conf = PageServerConf {
Expand Down Expand Up @@ -409,6 +413,7 @@ impl PageServerConf {
.map(crate::l0_flush::L0FlushConfig::from)
.unwrap_or_default(),
virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()),
no_sync: no_sync.unwrap_or(false),
};

// ------------------------------------------------------------
Expand Down

1 comment on commit aa9112e

@github-actions
Copy link

@github-actions github-actions bot commented on aa9112e Nov 8, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

5454 tests run: 5220 passed, 2 failed, 232 skipped (full report)


Failures on Postgres 16

  • test_sharded_ingest[github-actions-selfhosted-1]: release-x86-64
  • test_storage_controller_many_tenants[github-actions-selfhosted]: release-x86-64
# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_sharded_ingest[release-pg16-github-actions-selfhosted-1] or test_storage_controller_many_tenants[release-pg16-github-actions-selfhosted]"
Flaky tests (5)

Postgres 17

Postgres 16

Postgres 14

Code coverage* (full report)

  • functions: 31.7% (7864 of 24802 functions)
  • lines: 49.4% (62224 of 125958 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
aa9112e at 2024-11-08T14:23:19.437Z :recycle:

Please sign in to comment.