Skip to content

Commit

Permalink
Bring 'test_can_read_old' test back to life (#511)
Browse files Browse the repository at this point in the history
  • Loading branch information
paraseba authored Dec 26, 2024
1 parent a55a562 commit 355fbaf
Show file tree
Hide file tree
Showing 48 changed files with 123 additions and 94 deletions.
30 changes: 30 additions & 0 deletions icechunk-python/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import Literal

import boto3
import pytest

from icechunk import ObjectStoreConfig, Repository, Storage
Expand All @@ -21,3 +22,32 @@ def repo(request: pytest.FixtureRequest, tmpdir: str) -> tuple[Repository, str]:
param = request.param
repo = parse_repo(param, tmpdir)
return repo, tmpdir


minio_client = None


def get_minio_client():
    """Return the shared boto3 S3 client used to talk to local MinIO.

    The client is built lazily on the first call and stored in the
    module-level ``minio_client`` variable; every later call returns that
    same object instead of constructing a new one.

    Returns:
        The object produced by ``boto3.client("s3", ...)``.
    """
    global minio_client
    if minio_client is None:
        # Fixed endpoint and credentials: the tests expect a MinIO server
        # listening on localhost:9000 over plain HTTP.
        minio_client = boto3.client(
            "s3",
            endpoint_url="http://localhost:9000",
            use_ssl=False,
            aws_access_key_id="minio123",
            aws_secret_access_key="minio123",
        )
    return minio_client


def write_chunks_to_minio(
    chunks: list[tuple[str, bytes]], bucket: str = "testbucket"
) -> list[str]:
    """Write chunks to local minio returning their etags.

    Args:
        chunks: ``(key, data)`` pairs; each ``data`` payload is uploaded
            under the object key ``key``.
        bucket: Name of the destination bucket.

    Returns:
        The ``"ETag"`` entry of each ``put_object`` response, in the same
        order as ``chunks``. An empty ``chunks`` list yields an empty list.
    """
    s3 = get_minio_client()
    # Uploads happen sequentially, one put_object call per chunk.
    return [
        s3.put_object(Bucket=bucket, Key=key, Body=data)["ETag"]
        for key, data in chunks
    ]
28 changes: 28 additions & 0 deletions icechunk-python/tests/data/test-repo/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
inline_chunk_threshold_bytes: 12
unsafe_overwrite_refs: false
get_partial_values_concurrency: 10
virtual_chunk_containers:
az:
name: az
url_prefix: az
store: !Azure {}
file:
name: file
url_prefix: file
store: !LocalFileSystem ''
gcs:
name: gcs
url_prefix: gcs
store: !Gcs {}
s3:
name: s3
url_prefix: s3://
store: !S3Compatible
region: us-east-1
endpoint_url: http://localhost:9000
anonymous: false
allow_http: true
tigris:
name: tigris
url_prefix: tigris
store: !Tigris {}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"snapshot":"GF5BY4RDJMVN8B0HC79G"}
{"snapshot":"76SEKSN1REBS625050KG"}
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"snapshot":"JD3Q1S9WHETRN5RQWPEG"}
{"snapshot":"RY94BWP577R3AC5NYA2G"}
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"snapshot":"R87MM00MRYAHDJRBMRK0"}
{"snapshot":"XW5N59QFBRA8NX854AWG"}
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"snapshot":"M2VM3ZTGNFFHHNY97J20"}
{"snapshot":"8R50R5QATH5283VSKQXG"}
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"snapshot":"RWZQ2ZEXNA7HPE1XR9CG"}
{"snapshot":"E0GK0CMQYN1JDV75EDV0"}
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"snapshot":"D6PWEBGAPSQCD6Q1DAZG"}
{"snapshot":"TBXXQSCAS9VJQG86T9H0"}
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"snapshot":"GF5BY4RDJMVN8B0HC79G"}
{"snapshot":"76SEKSN1REBS625050KG"}
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"snapshot":"RWZQ2ZEXNA7HPE1XR9CG"}
{"snapshot":"E0GK0CMQYN1JDV75EDV0"}
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"snapshot":"D6PWEBGAPSQCD6Q1DAZG"}
{"snapshot":"TBXXQSCAS9VJQG86T9H0"}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
93 changes: 50 additions & 43 deletions icechunk-python/tests/test_can_read_old.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,46 +11,44 @@
file as a python script: `python ./tests/test_can_read_old.py`.
"""

from datetime import UTC, datetime
from typing import cast

import pytest
from numpy.testing import assert_array_equal
from object_store import ClientOptions, ObjectStore

import icechunk as ic
import zarr


def write_chunks_to_minio(chunks: list[tuple[str, bytes]]):
client_options = ClientOptions(
allow_http=True, # type: ignore
)
store = ObjectStore(
"s3://testbucket",
{
"access_key_id": "minio123",
"secret_access_key": "minio123",
"aws_region": "us-east-1",
"aws_endpoint": "http://localhost:9000",
},
client_options=client_options,
)

for key, data in chunks:
store.put(key, data)


def mk_repo():
def mk_repo(create: bool):
"""Create a store that can access virtual chunks in localhost MinIO"""
store_path = "./tests/data/test-repo"

config = ic.RepositoryConfig(
inline_chunk_threshold_bytes=12,
config = ic.RepositoryConfig.default()
config.inline_chunk_threshold_bytes = 12

virtual_store_config = ic.ObjectStoreConfig.S3Compatible(
ic.S3CompatibleOptions(
region="us-east-1",
endpoint_url="http://localhost:9000",
allow_http=True,
)
)
store = ic.Repository.open_or_create(
storage=ic.ObjectStoreConfig.LocalFileSystem(store_path), config=config
container = ic.VirtualChunkContainer("s3", "s3://", virtual_store_config)
config.set_virtual_chunk_container(container)
credentials = {
"s3": ic.Credentials.Static(ic.StaticCredentials("minio123", "minio123"))
}

operation = ic.Repository.create if create else ic.Repository.open
repo = operation(
storage=ic.Storage.create(ic.ObjectStoreConfig.LocalFileSystem(store_path)),
config=config,
virtual_chunk_credentials=credentials,
)
return store

return repo


async def write_a_test_repo():
Expand All @@ -64,7 +62,7 @@ async def write_a_test_repo():
"""

print("Writing repository to ./tests/data/test-repo")
repo = mk_repo()
repo = mk_repo(True)
session = repo.writable_session("main")
store = session.store

Expand Down Expand Up @@ -93,6 +91,8 @@ async def write_a_test_repo():
attributes={"this": "is a nice array", "icechunk": 1, "size": 42.0},
)
session.commit("empty structure")
session = repo.writable_session("main")
store = session.store

session = repo.writable_session("main")
store = session.store
Expand All @@ -103,14 +103,19 @@ async def write_a_test_repo():
big_chunks[:] = 42.0
small_chunks[:] = 84
snapshot = session.commit("fill data")
session = repo.writable_session("main")
store = session.store

# store.set_virtual_ref(
# "group1/big_chunks/c/0/0",
# "s3://testbucket/path/to/python/chunk-1",
# offset=0,
# length=5 * 5 * 4,
# )
# store.commit("set virtual chunk")
store.set_virtual_ref(
"group1/big_chunks/c/0/0",
"s3://testbucket/can_read_old/chunk-1",
offset=0,
length=5 * 5 * 4,
checksum=datetime(9999, 12, 31, tzinfo=UTC),
)
snapshot = session.commit("set virtual chunk")
session = repo.writable_session("main")
store = session.store

repo.create_branch("my-branch", snapshot_id=snapshot)
session = repo.writable_session("my-branch")
Expand Down Expand Up @@ -152,13 +157,12 @@ async def write_a_test_repo():
store.close()


@pytest.mark.filterwarnings("ignore:datetime.datetime.utcnow")
async def test_icechunk_can_read_old_repo():
# FIXME
pytest.xfail(
"Temporary flagged as failing while we implement new virtual chunk mechanism"
)
# we import here so it works when the script is run by pytest
from tests.conftest import write_chunks_to_minio

repo = mk_repo()
repo = mk_repo(False)
main_snapshot = repo.lookup_branch("main")

expected_main_history = [
Expand All @@ -174,14 +178,15 @@ async def test_icechunk_can_read_old_repo():
"some more structure",
"delete a chunk",
] + expected_main_history

assert [
p.message for p in repo.ancestry(my_branch_snapshot)
] == expected_branch_history

tag_snapshot = repo.tag("it also works!")
tag_snapshot = repo.lookup_tag("it also works!")
assert [p.message for p in repo.ancestry(tag_snapshot)] == expected_branch_history

tag_snapshot = repo.tag("it works!")
tag_snapshot = repo.lookup_tag("it works!")
assert [p.message for p in repo.ancestry(tag_snapshot)] == expected_branch_history[1:]

session = repo.writable_session("my-branch")
Expand Down Expand Up @@ -213,7 +218,7 @@ async def test_icechunk_can_read_old_repo():
[p async for p in store.list_dir("group2/group3/group4/group5/inner")]
) == ["c", "zarr.json"]

root = zarr.group(store=store.as_writable())
root = zarr.group(store=store)
# inner is not initialized, so it's all fill values
inner = root["group2/group3/group4/group5/inner"]
assert_array_equal(inner[:], float("nan"))
Expand All @@ -232,7 +237,7 @@ async def test_icechunk_can_read_old_repo():
# big chunks array has a virtual chunk pointing here
write_chunks_to_minio(
[
("path/to/python/chunk-1", chunk_data),
("can_read_old/chunk-1", chunk_data),
]
)

Expand All @@ -243,4 +248,6 @@ async def test_icechunk_can_read_old_repo():
if __name__ == "__main__":
import asyncio

# we import here so it works when the script is run by pytest

asyncio.run(write_a_test_repo())
23 changes: 3 additions & 20 deletions icechunk-python/tests/test_regressions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import cast

from object_store import ClientOptions, ObjectStore
import pytest

import zarr
import zarr.core
Expand All @@ -15,25 +15,7 @@
VirtualChunkContainer,
)
from icechunk.repository import Repository


def write_chunks_to_minio(chunks: list[tuple[str, bytes]]):
client_options = ClientOptions(
allow_http=True, # type: ignore
)
store = ObjectStore(
"s3://testbucket",
{
"access_key_id": "minio123",
"secret_access_key": "minio123",
"aws_region": "us-east-1",
"aws_endpoint": "http://localhost:9000",
},
client_options=client_options,
)

for key, data in chunks:
store.put(key, data)
from tests.conftest import write_chunks_to_minio


async def write_minio_virtual_refs():
Expand All @@ -48,6 +30,7 @@ async def write_minio_virtual_refs():
)


@pytest.mark.filterwarnings("ignore:datetime.datetime.utcnow")
async def test_issue_418():
# See https://github.com/earth-mover/icechunk/issues/418
await write_minio_virtual_refs()
Expand Down
25 changes: 3 additions & 22 deletions icechunk-python/tests/test_virtual_ref.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import os
import uuid
from datetime import UTC, datetime, timedelta

import boto3
import numpy as np
import pytest

Expand All @@ -20,33 +18,16 @@
VirtualChunkContainer,
)
from icechunk.repository import Repository


def write_chunks_to_minio(prefix: str, chunks: list[tuple[str, bytes]]) -> list[str]:
s3 = boto3.client(
"s3",
endpoint_url="http://localhost:9000",
use_ssl=False,
aws_access_key_id="minio123",
aws_secret_access_key="minio123",
)
etags = []
for key, data in chunks:
key = os.path.join(prefix, key)
etag = s3.put_object(Bucket="testbucket", Key=key, Body=data)["ETag"]
etags.append(etag)

return etags
from tests.conftest import write_chunks_to_minio


@pytest.mark.filterwarnings("ignore:datetime.datetime.utcnow")
async def test_write_minio_virtual_refs():
prefix = str(uuid.uuid4())
etags = write_chunks_to_minio(
prefix,
[
("chunk-1", b"first"),
("chunk-2", b"second"),
(f"{prefix}/chunk-1", b"first"),
(f"{prefix}/chunk-2", b"second"),
],
)

Expand Down

0 comments on commit 355fbaf

Please sign in to comment.