Binary file format evolution support (#551)

* Binary file format evolution support This PR implements design document 004. New features: * Compressing snapshots and tx logs (manifest were already implemented) * Metadata keys in object store use underscore instead of hyphen to support google. We also do a big refactoring: * `Storage` instances know much less about Icechunk, they mostly speak in terms of bytes and ids * New `AssetManager` has the responsibility of encoding fetching and writing the binary files * `Assetmanager` also does asset caching, replacing the previous `CachingStorage` * test --all passing * resolver -> manager * asset manager handles transaction logs * asset manager handles chunks * caching tests * cleanup * more space for implementation name * cleanup * adapt python api * regenerate test repo * cleanup * Update icechunk-python/src/config.rs Co-authored-by: Matthew Iannucci <matthew@earthmover.io> * Update icechunk-python/src/config.rs Co-authored-by: Matthew Iannucci <matthew@earthmover.io> * Update icechunk-python/src/config.rs Co-authored-by: Matthew Iannucci <matthew@earthmover.io> --------- Co-authored-by: Matthew Iannucci <matthew@earthmover.io>
earth-mover · Jan 9, 2025 · 0a17588 · 0a17588
1 parent 480108f
commit 0a17588
Show file tree

Hide file tree

Showing 71 changed files with 1,313 additions and 1,088 deletions.
diff --git a/design-docs/004-binary-format.md b/design-docs/004-binary-format.md
@@ -20,14 +20,14 @@ All these files start with 12 magic bytes:
 
 Which correspond to the string `ICE🧊CHUNK` in UTF-8.
 
-The next 12 bytes, with indexes 12-23 (zero based) identify the Icechunk client that wrote the file.
+The next 24 bytes, with indexes 12-35 (zero based) identify the Icechunk client that wrote the file.
 Different implementations will use different strings in UTF-8 encoding, padding them with ASCII spaces on the right. The official Icechunk implementation uses "ic-" followed by the version:
 
 * "ic-0.1.0-a.8"
 * "ic-1.0.0"
 * etc.
 
-Byte 24 identifies Icechunk spec version used to write the file.
+Byte 36 identifies Icechunk spec version used to write the file.
 
 * 0x00 - Reserved for future use
 * 0x01 - Spec version 1
@@ -36,14 +36,14 @@ Byte 24 identifies Icechunk spec version used to write the file.
 
 If more than 255 version are needed, we will set this byte to 0 and modify the following bytes according to a new spec.
 
-The next byte (index 25) identifies file type :
+The next byte (index 37) identifies file type :
 
 * 0x01 - Snapshot
 * 0x02 - Manifest
 * 0x03 - Attributes file
 * 0x04 - Transaction log
 
-Byte 26 identifies compression type:
+Byte 38 identifies compression type:
 
 * 0x00 - No compression
 * 0x01 - Zstd
@@ -57,8 +57,8 @@ In object stores that support it, the file information (spec version, compressio
 | Byte index    | Content               | Example |
 | ------------- | --------------- |--------- |
 | 0 - 11        | Magic bytes           | ICE🧊CHUNK              |
-| 12 - 23       | Implementation id     | 'ic-1.0.0    '          |
-| 24            | Spec version          | 0x01                    |
-| 25            | File type             | 0x01                    |
-| 26            | Compression           | 0x01                    |
-| 27..end       | Msgpack serializanion |                         |
+| 12 - 35       | Implementation id     | 'ic-1.0.0    '          |
+| 36            | Spec version          | 0x01                    |
+| 37            | File type             | 0x01                    |
+| 38            | Compression           | 0x01                    |
+| 39..end       | Msgpack serializanion |                         |
diff --git a/icechunk-python/python/icechunk/__init__.py b/icechunk-python/python/icechunk/__init__.py
@@ -2,6 +2,9 @@
 
 from icechunk._icechunk_python import (
     BasicConflictSolver,
+    CachingConfig,
+    CompressionAlgorithm,
+    CompressionConfig,
     Conflict,
     ConflictDetector,
     ConflictErrorData,
@@ -19,8 +22,6 @@
     S3StaticCredentials,
     SnapshotMetadata,
     Storage,
-    StorageCompressionAlgorithm,
-    StorageCompressionSettings,
     StorageConcurrencySettings,
     StorageSettings,
     VersionSelection,
@@ -61,6 +62,9 @@
     "AnyObjectStoreConfig",
     "AnyS3Credential",
     "BasicConflictSolver",
+    "CachingConfig",
+    "CompressionAlgorithm",
+    "CompressionConfig",
     "Conflict",
     "ConflictDetector",
     "ConflictError",
@@ -83,8 +87,6 @@
     "Session",
     "SnapshotMetadata",
     "Storage",
-    "StorageCompressionAlgorithm",
-    "StorageCompressionSettings",
     "StorageConcurrencySettings",
     "StorageSettings",
     "VersionSelection",

diff --git a/icechunk-python/python/icechunk/_icechunk_python.pyi b/icechunk-python/python/icechunk/_icechunk_python.pyi
@@ -51,21 +51,51 @@ class VirtualChunkContainer:
 
     def __init__(self, name: str, url_prefix: str, store: AnyObjectStoreConfig): ...
 
-class StorageCompressionAlgorithm(Enum):
+class CompressionAlgorithm(Enum):
     """Enum for selecting the compression algorithm used by Icechunk to write its metadata files"""
 
     Zstd = 0
 
-class StorageCompressionSettings:
+    @staticmethod
+    def default() -> CompressionAlgorithm: ...
+
+class CompressionConfig:
     """Configuration for how Icechunk compresses its metadata files"""
     @property
-    def algorithm(self) -> StorageCompressionAlgorithm: ...
+    def algorithm(self) -> CompressionAlgorithm: ...
     @algorithm.setter
-    def algorithm(self, value: StorageCompressionAlgorithm) -> None: ...
+    def algorithm(self, value: CompressionAlgorithm) -> None: ...
     @property
     def level(self) -> int: ...
     @level.setter
     def level(self, value: int) -> None: ...
+    @staticmethod
+    def default() -> CompressionConfig: ...
+
+class CachingConfig:
+    """Configuration for how Icechunk caches its metadata files"""
+    @property
+    def snapshots_cache_size(self) -> int: ...
+    @snapshots_cache_size.setter
+    def snapshots_cache_size(self, value: int) -> None: ...
+    @property
+    def manifests_cache_size(self) -> int: ...
+    @manifests_cache_size.setter
+    def manifests_cache_size(self, value: int) -> None: ...
+    @property
+    def transactions_cache_size(self) -> int: ...
+    @transactions_cache_size.setter
+    def transactions_cache_size(self, value: int) -> None: ...
+    @property
+    def attributes_cache_size(self) -> int: ...
+    @attributes_cache_size.setter
+    def attributes_cache_size(self, value: int) -> None: ...
+    @property
+    def chunks_cache_size(self) -> int: ...
+    @chunks_cache_size.setter
+    def chunks_cache_size(self, value: int) -> None: ...
+    @staticmethod
+    def default() -> CachingConfig: ...
 
 class StorageConcurrencySettings:
     """Configuration for how Icechunk uses its Storage instance"""
@@ -86,29 +116,9 @@ class StorageSettings:
     def concurrency(self) -> StorageConcurrencySettings: ...
     @concurrency.setter
     def concurrency(self, value: StorageConcurrencySettings) -> None: ...
-    @property
-    def compression(self) -> StorageCompressionSettings: ...
-    @compression.setter
-    def compression(self, value: StorageCompressionSettings) -> None: ...
 
 class RepositoryConfig:
     """Configuration for an Icechunk repository"""
-    def __init__(
-        self,
-        *,
-        inline_chunk_threshold_bytes: int = 512,
-        unsafe_overwrite_refs: bool = False,
-    ) -> None:
-        """
-        Create a RepositoryConfig object with the given configuration options
-
-        Parameters
-        ----------
-        inline_chunk_threshold_bytes: int
-            The threshold in bytes for when to inline chunks instead of storing them as references
-        unsafe_overwrite_refs: bool
-        """
-        ...
 
     @staticmethod
     def default() -> RepositoryConfig: ...
@@ -121,6 +131,22 @@ class RepositoryConfig:
     @unsafe_overwrite_refs.setter
     def unsafe_overwrite_refs(self, value: bool) -> None: ...
     @property
+    def get_partial_values_concurrency(self) -> int: ...
+    @get_partial_values_concurrency.setter
+    def get_partial_values_concurrency(self, value: int) -> None: ...
+    @property
+    def compression(self) -> CompressionConfig: ...
+    @compression.setter
+    def compression(self, value: CompressionConfig) -> None: ...
+    @property
+    def caching(self) -> CachingConfig: ...
+    @caching.setter
+    def caching(self, value: CachingConfig) -> None: ...
+    @property
+    def storage(self) -> Storage: ...
+    @storage.setter
+    def storage(self, value: Storage) -> None: ...
+    @property
     def virtual_chunk_containers(self) -> dict[str, VirtualChunkContainer]: ...
     @virtual_chunk_containers.setter
     def virtual_chunk_containers(

diff --git a/icechunk-python/src/config.rs b/icechunk-python/src/config.rs
@@ -10,10 +10,11 @@ use std::{
 
 use icechunk::{
     config::{
-        Credentials, CredentialsFetcher, GcsCredentials, GcsStaticCredentials,
-        S3Credentials, S3Options, S3StaticCredentials,
+        CachingConfig, CompressionAlgorithm, CompressionConfig, Credentials,
+        CredentialsFetcher, GcsCredentials, GcsStaticCredentials, S3Credentials,
+        S3Options, S3StaticCredentials,
     },
-    storage::{self, CompressionAlgorithm, CompressionSettings, ConcurrencySettings},
+    storage::{self, ConcurrencySettings},
     virtual_chunks::VirtualChunkContainer,
     ObjectStoreConfig, RepositoryConfig, Storage,
 };
@@ -318,49 +319,115 @@ impl From<VirtualChunkContainer> for PyVirtualChunkContainer {
     }
 }
 
-#[pyclass(name = "StorageCompressionAlgorithm", eq, eq_int)]
+#[pyclass(name = "CompressionAlgorithm", eq, eq_int)]
 #[derive(Clone, Debug, PartialEq, Eq)]
-pub enum PyStorageCompressionAlgorithm {
+pub enum PyCompressionAlgorithm {
     Zstd,
 }
 
-impl From<CompressionAlgorithm> for PyStorageCompressionAlgorithm {
+#[pymethods]
+impl PyCompressionAlgorithm {
+    #[staticmethod]
+    /// Create a default `CompressionAlgorithm` instance
+    fn default() -> Self {
+        CompressionAlgorithm::default().into()
+    }
+}
+
+impl From<CompressionAlgorithm> for PyCompressionAlgorithm {
     fn from(value: CompressionAlgorithm) -> Self {
         match value {
-            CompressionAlgorithm::Zstd => PyStorageCompressionAlgorithm::Zstd,
+            CompressionAlgorithm::Zstd => PyCompressionAlgorithm::Zstd,
         }
     }
 }
 
-impl From<PyStorageCompressionAlgorithm> for CompressionAlgorithm {
-    fn from(value: PyStorageCompressionAlgorithm) -> Self {
+impl From<PyCompressionAlgorithm> for CompressionAlgorithm {
+    fn from(value: PyCompressionAlgorithm) -> Self {
         match value {
-            PyStorageCompressionAlgorithm::Zstd => CompressionAlgorithm::Zstd,
+            PyCompressionAlgorithm::Zstd => CompressionAlgorithm::Zstd,
         }
     }
 }
 
-#[pyclass(name = "StorageCompressionSettings", eq)]
+#[pyclass(name = "CompressionConfig", eq)]
 #[derive(Clone, Debug, PartialEq, Eq)]
-pub struct PyStorageCompressionSettings {
+pub struct PyCompressionConfig {
     #[pyo3(get, set)]
-    pub algorithm: PyStorageCompressionAlgorithm,
+    pub algorithm: PyCompressionAlgorithm,
     #[pyo3(get, set)]
     pub level: u8,
 }
 
-impl From<CompressionSettings> for PyStorageCompressionSettings {
-    fn from(value: CompressionSettings) -> Self {
+#[pymethods]
+impl PyCompressionConfig {
+    #[staticmethod]
+    /// Create a default `CompressionConfig` instance
+    fn default() -> Self {
+        CompressionConfig::default().into()
+    }
+}
+
+impl From<CompressionConfig> for PyCompressionConfig {
+    fn from(value: CompressionConfig) -> Self {
         Self { algorithm: value.algorithm.into(), level: value.level }
     }
 }
 
-impl From<PyStorageCompressionSettings> for CompressionSettings {
-    fn from(value: PyStorageCompressionSettings) -> Self {
+impl From<PyCompressionConfig> for CompressionConfig {
+    fn from(value: PyCompressionConfig) -> Self {
         Self { algorithm: value.algorithm.into(), level: value.level }
     }
 }
 
+#[pyclass(name = "CachingConfig", eq)]
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub struct PyCachingConfig {
+    #[pyo3(get, set)]
+    pub snapshots_cache_size: u16,
+    #[pyo3(get, set)]
+    pub manifests_cache_size: u16,
+    #[pyo3(get, set)]
+    pub transactions_cache_size: u16,
+    #[pyo3(get, set)]
+    pub attributes_cache_size: u16,
+    #[pyo3(get, set)]
+    pub chunks_cache_size: u16,
+}
+
+#[pymethods]
+impl PyCachingConfig {
+    #[staticmethod]
+    /// Create a default `CachingConfig` instance
+    fn default() -> Self {
+        CachingConfig::default().into()
+    }
+}
+
+impl From<PyCachingConfig> for CachingConfig {
+    fn from(value: PyCachingConfig) -> Self {
+        Self {
+            snapshots_cache_size: value.snapshots_cache_size,
+            manifests_cache_size: value.manifests_cache_size,
+            transactions_cache_size: value.transactions_cache_size,
+            attributes_cache_size: value.attributes_cache_size,
+            chunks_cache_size: value.chunks_cache_size,
+        }
+    }
+}
+
+impl From<CachingConfig> for PyCachingConfig {
+    fn from(value: CachingConfig) -> Self {
+        Self {
+            snapshots_cache_size: value.snapshots_cache_size,
+            manifests_cache_size: value.manifests_cache_size,
+            transactions_cache_size: value.transactions_cache_size,
+            attributes_cache_size: value.attributes_cache_size,
+            chunks_cache_size: value.chunks_cache_size,
+        }
+    }
+}
+
 #[pyclass(name = "StorageConcurrencySettings", eq)]
 #[derive(Clone, Debug, PartialEq, Eq)]
 pub struct PyStorageConcurrencySettings {
@@ -393,25 +460,17 @@ impl From<PyStorageConcurrencySettings> for ConcurrencySettings {
 pub struct PyStorageSettings {
     #[pyo3(get, set)]
     pub concurrency: PyStorageConcurrencySettings,
-    #[pyo3(get, set)]
-    pub compression: PyStorageCompressionSettings,
 }
 
 impl From<storage::Settings> for PyStorageSettings {
     fn from(value: storage::Settings) -> Self {
-        Self {
-            concurrency: value.concurrency.into(),
-            compression: value.compression.into(),
-        }
+        Self { concurrency: value.concurrency.into() }
     }
 }
 
 impl From<PyStorageSettings> for storage::Settings {
     fn from(value: PyStorageSettings) -> Self {
-        Self {
-            concurrency: value.concurrency.into(),
-            compression: value.compression.into(),
-        }
+        Self { concurrency: value.concurrency.into() }
     }
 }
 
@@ -425,6 +484,10 @@ pub struct PyRepositoryConfig {
     #[pyo3(get, set)]
     pub get_partial_values_concurrency: u16,
     #[pyo3(get, set)]
+    pub compression: PyCompressionConfig,
+    #[pyo3(get, set)]
+    pub caching: PyCachingConfig,
+    #[pyo3(get, set)]
     pub storage: Option<PyStorageSettings>,
     #[pyo3(get, set)]
     pub virtual_chunk_containers: HashMap<String, PyVirtualChunkContainer>,
@@ -437,6 +500,8 @@ impl From<PyRepositoryConfig> for RepositoryConfig {
             unsafe_overwrite_refs: value.unsafe_overwrite_refs,
             get_partial_values_concurrency: value.get_partial_values_concurrency,
             storage: value.storage.map(|s| s.into()),
+            compression: value.compression.into(),
+            caching: value.caching.into(),
             virtual_chunk_containers: value
                 .virtual_chunk_containers
                 .into_iter()
@@ -453,6 +518,8 @@ impl From<RepositoryConfig> for PyRepositoryConfig {
             unsafe_overwrite_refs: value.unsafe_overwrite_refs,
             get_partial_values_concurrency: value.get_partial_values_concurrency,
             storage: value.storage.map(|s| s.into()),
+            compression: value.compression.into(),
+            caching: value.caching.into(),
             virtual_chunk_containers: value
                 .virtual_chunk_containers
                 .into_iter()