Support lz4 page images compression #5913

Closed · wants to merge 9 commits
10 changes: 10 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions Cargo.toml
@@ -100,6 +100,7 @@ jsonwebtoken = "9"
lasso = "0.7"
leaky-bucket = "1.0.1"
libc = "0.2"
lz4_flex = "0.11.1"
md5 = "0.7.0"
memoffset = "0.8"
native-tls = "0.2"
4 changes: 2 additions & 2 deletions Makefile
@@ -23,12 +23,12 @@ endif
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Linux)
# Seccomp BPF is only available for Linux
-PG_CONFIGURE_OPTS += --with-libseccomp
PG_CONFIGURE_OPTS += --with-lz4 --with-libseccomp
else ifeq ($(UNAME_S),Darwin)
# macOS with brew-installed openssl requires explicit paths
# It can be configured with OPENSSL_PREFIX variable
OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
-PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
PG_CONFIGURE_OPTS += --with-lz4 --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
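The new --with-lz4 configure flag builds PostgreSQL against liblz4, which is what makes lz4-compressed full-page images possible in the first place. On PostgreSQL 15 and later a compute could then enable them with a setting along these lines (an illustrative postgresql.conf snippet, not part of this diff; v14 never emits lz4 FPIs, which is why its stub later in this diff always returns false):

# postgresql.conf (PG 15+): compress full-page images in the WAL with lz4
wal_compression = lz4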
8 changes: 8 additions & 0 deletions libs/pageserver_api/src/models.rs
@@ -757,6 +757,7 @@ pub enum PagestreamBeMessage {
Error(PagestreamErrorResponse),
DbSize(PagestreamDbSizeResponse),
GetSlruSegment(PagestreamGetSlruSegmentResponse),
GetCompressedPage(PagestreamGetPageResponse),
}

// Keep in sync with `pagestore_client.h`
@@ -996,6 +997,12 @@ impl PagestreamBeMessage {
bytes.put(&resp.page[..]);
}

Self::GetCompressedPage(resp) => {
bytes.put_u8(105); /* tag from pagestore_client.h */
bytes.put_u16(resp.page.len() as u16);
bytes.put(&resp.page[..]);
}

Self::Error(resp) => {
bytes.put_u8(Tag::Error as u8);
bytes.put(resp.message.as_bytes());
@@ -1078,6 +1085,7 @@ impl PagestreamBeMessage {
Self::Error(_) => "Error",
Self::DbSize(_) => "DbSize",
Self::GetSlruSegment(_) => "GetSlruSegment",
Self::GetCompressedPage(_) => "GetCompressedPage",
}
}
}
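On the wire, the new response is the tag byte 105 followed by a 2-byte big-endian payload length (written by put_u16) and the lz4 block data. A client-side decoder could look roughly like this (a sketch, not part of this PR: decode_get_compressed_page is a hypothetical helper, and the 8192-byte PostgreSQL page size is assumed):

use bytes::{Buf, Bytes};

const BLCKSZ: usize = 8192; // PostgreSQL page size

// Hypothetical helper: `body` holds everything after the tag byte (105).
fn decode_get_compressed_page(mut body: Bytes) -> anyhow::Result<Bytes> {
    let len = body.get_u16() as usize; // length was written big-endian by put_u16
    let compressed = body.split_to(len);
    let page = lz4_flex::block::decompress(&compressed, BLCKSZ)
        .map_err(|e| anyhow::anyhow!("lz4 decompress failed: {e}"))?;
    Ok(Bytes::from(page))
}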
7 changes: 7 additions & 0 deletions libs/postgres_ffi/src/lib.rs
@@ -144,6 +144,13 @@ pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> anyhow::Result<boo
dispatch_pgversion!(version, Ok(pgv::bindings::bkpimg_is_compressed(bimg_info)))
}

pub fn bkpimage_is_compressed_lz4(bimg_info: u8, version: u32) -> anyhow::Result<bool> {
dispatch_pgversion!(
version,
Ok(pgv::bindings::bkpimg_is_compressed_lz4(bimg_info))
)
}

pub fn generate_wal_segment(
segno: u64,
system_id: u64,
4 changes: 4 additions & 0 deletions libs/postgres_ffi/src/pg_constants_v14.rs
@@ -8,3 +8,7 @@ pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c *
pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
(bimg_info & BKPIMAGE_IS_COMPRESSED) != 0
}

pub fn bkpimg_is_compressed_lz4(_bimg_info: u8) -> bool {
false
}
4 changes: 4 additions & 0 deletions libs/postgres_ffi/src/pg_constants_v15.rs
@@ -16,3 +16,7 @@ pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {

(bimg_info & ANY_COMPRESS_FLAG) != 0
}

pub fn bkpimg_is_compressed_lz4(bimg_info: u8) -> bool {
(bimg_info & BKPIMAGE_COMPRESS_LZ4) != 0
}
4 changes: 4 additions & 0 deletions libs/postgres_ffi/src/pg_constants_v16.rs
@@ -16,3 +16,7 @@ pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {

(bimg_info & ANY_COMPRESS_FLAG) != 0
}

pub fn bkpimg_is_compressed_lz4(bimg_info: u8) -> bool {
(bimg_info & BKPIMAGE_COMPRESS_LZ4) != 0
}
1 change: 1 addition & 0 deletions pageserver/Cargo.toml
@@ -37,6 +37,7 @@ humantime-serde.workspace = true
hyper.workspace = true
itertools.workspace = true
leaky-bucket.workspace = true
lz4_flex.workspace = true
md5.workspace = true
nix.workspace = true
# hack to get the number of worker threads tokio uses
1 change: 1 addition & 0 deletions pageserver/client/src/page_service.rs
@@ -157,6 +157,7 @@ impl PagestreamClient {
PagestreamBeMessage::Exists(_)
| PagestreamBeMessage::Nblocks(_)
| PagestreamBeMessage::DbSize(_)
| PagestreamBeMessage::GetCompressedPage(_)
| PagestreamBeMessage::GetSlruSegment(_) => {
anyhow::bail!(
"unexpected be message kind in response to getpage request: {}",
9 changes: 8 additions & 1 deletion pageserver/src/lib.rs
@@ -40,7 +40,14 @@ use tracing::info;
/// format, bump this!
/// Note that TimelineMetadata uses its own version number to track
/// backwards-compatible changes to the metadata format.
-pub const STORAGE_FORMAT_VERSION: u16 = 3;
pub const STORAGE_FORMAT_VERSION: u16 = 4;

/// Minimum storage format version with compression support
pub const COMPRESSED_STORAGE_FORMAT_VERSION: u16 = 4;

/// Page image compression algorithms. The two tags must differ, since the
/// blob reader uses them to decide whether to decompress.
pub const NO_COMPRESSION: u8 = 0;
pub const LZ4_COMPRESSION: u8 = 1;

pub const DEFAULT_PG_VERSION: u32 = 15;

15 changes: 12 additions & 3 deletions pageserver/src/page_service.rs
@@ -1155,9 +1155,18 @@ impl PageServerHandler {
.get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx)
.await?;

-Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
-page,
-}))
let compressed = lz4_flex::block::compress(&page);
if compressed.len() < page.len() {
Ok(PagestreamBeMessage::GetCompressedPage(
PagestreamGetPageResponse {
page: Bytes::from(compressed),
},
))
} else {
Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
page,
}))
}
}

#[instrument(skip_all, fields(shard_id))]
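The server now answers a getpage request with GetCompressedPage only when compression actually shrinks the page, and with plain GetPage otherwise, so any consumer has to accept both variants. A sketch of the normalization a caller might do (hypothetical helper, not in this PR):

use bytes::Bytes;
use pageserver_api::models::PagestreamBeMessage;
use postgres_ffi::BLCKSZ;

// Hypothetical helper: turn either getpage response variant into a raw page.
fn page_from_response(msg: PagestreamBeMessage) -> anyhow::Result<Bytes> {
    match msg {
        PagestreamBeMessage::GetPage(r) => Ok(r.page),
        PagestreamBeMessage::GetCompressedPage(r) => {
            let raw = lz4_flex::block::decompress(&r.page, BLCKSZ as usize)
                .map_err(|e| anyhow::anyhow!("lz4 decompress failed: {e}"))?;
            Ok(Bytes::from(raw))
        }
        _ => anyhow::bail!("not a getpage response"),
    }
}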
15 changes: 14 additions & 1 deletion pageserver/src/pgdatadir_mapping.rs
@@ -16,6 +16,7 @@ use anyhow::{ensure, Context};
use bytes::{Buf, Bytes, BytesMut};
use enum_map::Enum;
use itertools::Itertools;
use lz4_flex;
use pageserver_api::key::{
dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
@@ -992,7 +993,15 @@ impl<'a> DatadirModification<'a> {
img: Bytes,
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
-self.put(rel_block_to_key(rel, blknum), Value::Image(img));
let compressed = lz4_flex::block::compress(&img);
if compressed.len() < img.len() {
self.put(
rel_block_to_key(rel, blknum),
Value::CompressedImage(Bytes::from(compressed)),
);
} else {
self.put(rel_block_to_key(rel, blknum), Value::Image(img));
}
Ok(())
}

@@ -1597,6 +1606,10 @@ impl<'a> DatadirModification<'a> {
if let Some((_, value)) = values.last() {
return if let Value::Image(img) = value {
Ok(img.clone())
} else if let Value::CompressedImage(img) = value {
let decompressed = lz4_flex::block::decompress(&img, BLCKSZ as usize)
.map_err(|msg| PageReconstructError::Other(anyhow::anyhow!(msg)))?;
Ok(Bytes::from(decompressed))
} else {
// Currently, we never need to read back a WAL record that we
// inserted in the same "transaction". All the metadata updates
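The put/read pair above relies on lz4_flex block compression round-tripping exactly, with BLCKSZ as the decompression bound. A quick sanity check of that assumption (an illustrative test, not part of this PR):

#[test]
fn lz4_page_roundtrip() {
    let page = vec![0xABu8; 8192]; // a trivially compressible page image
    let compressed = lz4_flex::block::compress(&page);
    assert!(compressed.len() < page.len());
    let restored = lz4_flex::block::decompress(&compressed, 8192).unwrap();
    assert_eq!(restored, page);
}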
9 changes: 8 additions & 1 deletion pageserver/src/repository.rs
@@ -13,6 +13,8 @@ pub use pageserver_api::key::{Key, KEY_SIZE};
pub enum Value {
/// An Image value contains a full copy of the value
Image(Bytes),
/// A compressed page image contains a full copy of the page
CompressedImage(Bytes),
/// A WalRecord value contains a WAL record that needs to be
/// replayed to get the full value. Replaying the WAL record
/// might need a previous version of the value (if will_init()
@@ -22,12 +24,17 @@ pub enum Value {

impl Value {
pub fn is_image(&self) -> bool {
-matches!(self, Value::Image(_))
match self {
Value::Image(_) => true,
Value::CompressedImage(_) => true,
Value::WalRecord(_) => false,
}
}

pub fn will_init(&self) -> bool {
match self {
Value::Image(_) => true,
Value::CompressedImage(_) => true,
Value::WalRecord(rec) => rec.will_init(),
}
}
83 changes: 80 additions & 3 deletions pageserver/src/tenant/blob_io.rs
@@ -11,13 +11,16 @@
//! len < 128: 0XXXXXXX
//! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
//!
-use bytes::{BufMut, BytesMut};
use bytes::{BufMut, Bytes, BytesMut};
use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};

use crate::context::RequestContext;
use crate::page_cache::PAGE_SZ;
use crate::tenant::block_io::BlockCursor;
use crate::virtual_file::VirtualFile;
use crate::{LZ4_COMPRESSION, NO_COMPRESSION};
use lz4_flex;
use postgres_ffi::BLCKSZ;
use std::cmp::min;
use std::io::{Error, ErrorKind};

@@ -32,6 +35,29 @@ impl<'a> BlockCursor<'a> {
self.read_blob_into_buf(offset, &mut buf, ctx).await?;
Ok(buf)
}
/// Read a blob that starts with a one-byte compression tag,
/// decompressing the payload if needed.
pub async fn read_compressed_blob(
&self,
offset: u64,
ctx: &RequestContext,
) -> Result<Vec<u8>, std::io::Error> {
let blknum = (offset / PAGE_SZ as u64) as u32;
let off = (offset % PAGE_SZ as u64) as usize;

let buf = self.read_blk(blknum, ctx).await?;
let compression_alg = buf[off];
let res = self.read_blob(offset + 1, ctx).await?;
if compression_alg == LZ4_COMPRESSION {
lz4_flex::block::decompress(&res, BLCKSZ as usize).map_err(|_| {
std::io::Error::new(std::io::ErrorKind::InvalidData, "decompress error")
})
} else {
assert_eq!(compression_alg, NO_COMPRESSION);
Ok(res)
}
}

/// Read blob into the given buffer. Any previous contents in the buffer
/// are overwritten.
pub async fn read_blob_into_buf(
@@ -211,6 +237,58 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
(src_buf, Ok(()))
}

/// Write a blob of data, prefixed with a one-byte compression tag.
/// A BLCKSZ-sized blob is stored lz4-compressed if that makes it smaller.
/// Returns the offset that it was written to, which can be used to
/// retrieve the data later.
pub async fn write_compressed_blob(&mut self, srcbuf: Bytes) -> Result<u64, Error> {
let offset = self.offset;

let len = srcbuf.len();

let mut io_buf = self.io_buf.take().expect("we always put it back below");
io_buf.clear();
let mut is_compressed = false;
if len < 128 {
// Short blob. Write the compression tag and a 1-byte length header
io_buf.put_u8(NO_COMPRESSION);
io_buf.put_u8(len as u8);
} else {
// Write the compression tag and a 4-byte length header
if len > 0x7fff_ffff {
return Err(Error::new(
ErrorKind::Other,
format!("blob too large ({} bytes)", len),
));
}
if len == BLCKSZ as usize {
// Full-size page image: try to compress it
let compressed = lz4_flex::block::compress(&srcbuf);
if compressed.len() < len {
io_buf.put_u8(LZ4_COMPRESSION);
let mut len_buf = (compressed.len() as u32).to_be_bytes();
len_buf[0] |= 0x80;
io_buf.extend_from_slice(&len_buf[..]);
io_buf.extend_from_slice(&compressed[..]);
is_compressed = true;
}
}
if !is_compressed {
// Compression was not attempted or did not help: store uncompressed
io_buf.put_u8(NO_COMPRESSION);
let mut len_buf = (len as u32).to_be_bytes();
len_buf[0] |= 0x80;
io_buf.extend_from_slice(&len_buf[..]);
}
}
let (io_buf, hdr_res) = self.write_all(io_buf).await;
if let Err(e) = hdr_res {
return Err(e);
}
self.io_buf = Some(io_buf);
if is_compressed {
// The compressed payload was already written together with the header
Ok(offset)
} else {
let (_buf, res) = self.write_all(srcbuf).await;
res.map(|_| offset)
}
}

/// Write a blob of data. Returns the offset that it was written to,
/// which can be used to retrieve the data later.
pub async fn write_blob<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
@@ -227,7 +305,6 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
if len < 128 {
// Short blob. Write a 1-byte length header
io_buf.put_u8(len as u8);
-self.write_all(io_buf).await
} else {
// Write a 4-byte length header
if len > 0x7fff_ffff {
@@ -242,8 +319,8 @@
let mut len_buf = (len as u32).to_be_bytes();
len_buf[0] |= 0x80;
io_buf.extend_from_slice(&len_buf[..]);
-self.write_all(io_buf).await
}
self.write_all(io_buf).await
}
.await;
self.io_buf = Some(io_buf);
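Taken together, read_compressed_blob and write_compressed_blob define the following on-disk layout for a tagged blob (a summary in the style of the module comment above, assuming the constants NO_COMPRESSION = 0 and LZ4_COMPRESSION = 1):

tag: 1 byte, 0 = NO_COMPRESSION, 1 = LZ4_COMPRESSION
len < 128: 0XXXXXXX
len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
payload: lz4 block data if the tag is LZ4_COMPRESSION, raw bytes otherwise

Only BLCKSZ-sized blobs are ever considered for compression, and they are stored compressed only when that actually saves space, so the worst-case overhead is the single tag byte per blob.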