Skip to content

Commit

Permalink
enh: add get_dc_instance convenience method
Browse files Browse the repository at this point in the history
  • Loading branch information
paulmueller committed Feb 14, 2024
1 parent 5521e2b commit ecc8007
Show file tree
Hide file tree
Showing 4 changed files with 136 additions and 2 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
0.7.3
- enh: add get_dc_instance convenience method
0.7.2
- tests: fix invalid make_dataset method
0.7.1
Expand Down
2 changes: 1 addition & 1 deletion dcor_shared/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# flake8: noqa: F401
from .ckan import get_ckan_config_option, get_resource_path
from .data import DUMMY_BYTES, sha256sum, wait_for_resource
from .data import DUMMY_BYTES, get_dc_instance, sha256sum, wait_for_resource
from .mime import DC_MIME_TYPES, VALID_FORMATS
from . import paths
from ._version import version as __version__
17 changes: 17 additions & 0 deletions dcor_shared/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,30 @@
import time
import warnings

from dclab.rtdc_dataset import RTDC_HDF5

from .ckan import get_resource_path
from . import s3cc


#: Content of the dummy file created when importing data; acts as a
#: placeholder until the actual resource data becomes available.
DUMMY_BYTES = b"[Data import pending]"


def get_dc_instance(rid):
    """Return an instance of dclab's `RTDCBase` for a resource identifier

    The local block storage is preferred; if no file exists there,
    the resource is looked up on S3.

    Raises
    ------
    ValueError
        If the resource can be found neither locally nor on S3.
    """
    # Prefer the file on local block storage.
    local_path = get_resource_path(rid)
    if local_path.is_file():
        return RTDC_HDF5(local_path)
    # Fall back to the object store.
    if s3cc.object_exists(rid):
        return s3cc.get_s3_dc_handle(rid)
    raise ValueError(f"Could not find resource {rid} anywhere")


def sha256sum(path):
"""Compute the SHA256 hash of a file in 1MB chunks"""
file_hash = hashlib.sha256()
Expand Down
117 changes: 116 additions & 1 deletion tests/test_data.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,126 @@
import atexit
import pathlib
import shutil
from unittest import mock
import uuid

from ckan import logic
import ckanext.dcor_schemas.plugin

from dcor_shared import (
get_dc_instance, get_resource_path, s3, sha256sum, wait_for_resource
)

import pytest
import ckan.tests.factories as factories
from dcor_shared.testing import make_dataset, synchronous_enqueue_job

# Directory containing the .rtdc fixture files used by the tests below.
data_path = pathlib.Path(__file__).parent / "data"


@pytest.mark.ckan_config('ckan.plugins', 'dcor_depot dcor_schemas')
@pytest.mark.usefixtures('clean_db', 'with_request_context')
@mock.patch('ckan.plugins.toolkit.enqueue_job',
            side_effect=synchronous_enqueue_job)
def test_get_dc_instance_file(enqueue_job_mock, create_with_upload,
                              monkeypatch):
    """`get_dc_instance` opens the local resource file when it exists

    Creates an activated dataset with one uploaded .rtdc resource and
    checks that the returned instance is backed by the file on block
    storage (its ``path`` equals the local resource path).
    """
    # Run background jobs synchronously so the resource is fully
    # processed before the assertions below.
    monkeypatch.setattr(
        ckanext.dcor_schemas.plugin,
        'DISABLE_AFTER_DATASET_CREATE_FOR_CONCURRENT_JOB_TESTS',
        True)

    # Set up a user-owned organization and an activated dataset with
    # one uploaded resource.
    user = factories.User()
    owner_org = factories.Organization(users=[{
        'name': user['id'],
        'capacity': 'admin'
    }])
    create_context = {'ignore_auth': False,
                      'user': user['name'],
                      'api_version': 3}
    ds_dict, _ = make_dataset(
        create_context, owner_org,
        create_with_upload=create_with_upload,
        resource_path=data_path / "calibration_beads_47.rtdc",
        activate=True)
    rid = ds_dict["resources"][0]["id"]
    resource_path = pathlib.Path(get_resource_path(rid))
    assert resource_path.exists(), "sanity check"
    # The instance must be backed by the file on disk, not S3.
    with get_dc_instance(rid) as ds:
        assert str(ds.path) == str(resource_path)


@pytest.mark.ckan_config('ckan.storage_path',
                         '/tmp/test_dcor_shared/test_get_dc_instance')
def test_get_dc_instance_file_fails_without_actual_resource():
    """A file on disk alone is not enough; CKAN must know the resource"""
    storage_root = pathlib.Path("/tmp/test_dcor_shared/test_get_dc_instance")
    # Remove the temporary storage directory at interpreter exit.
    atexit.register(shutil.rmtree, storage_root, ignore_errors=True)
    # Fabricate a resource ID unknown to CKAN and place a real .rtdc
    # file at the corresponding storage location.
    rid = str(uuid.uuid4())
    res_path = storage_root / rid[:3] / rid[3:6] / rid[6:]
    res_path.parent.mkdir(parents=True)
    shutil.copy2(data_path / "calibration_beads_47.rtdc", res_path)
    # Resolving the resource in the CKAN database raises NotFound.
    with pytest.raises(logic.NotFound):
        get_dc_instance(rid)


@pytest.mark.ckan_config('ckan.plugins', 'dcor_depot dcor_schemas')
@pytest.mark.usefixtures('clean_db', 'with_request_context')
@mock.patch('ckan.plugins.toolkit.enqueue_job',
            side_effect=synchronous_enqueue_job)
def test_get_dc_instance_s3(enqueue_job_mock, create_with_upload,
                            monkeypatch):
    """`get_dc_instance` falls back to S3 when the local file is gone

    Creates a dataset with an uploaded resource, deletes the local
    file, and checks that the returned instance points at the S3 URL
    advertised in the resource dictionary.
    """
    # Run background jobs synchronously so the resource is fully
    # processed (including the S3 upload) before the assertions below.
    monkeypatch.setattr(
        ckanext.dcor_schemas.plugin,
        'DISABLE_AFTER_DATASET_CREATE_FOR_CONCURRENT_JOB_TESTS',
        True)

    # Set up a user-owned organization and an activated dataset with
    # one uploaded resource.
    user = factories.User()
    owner_org = factories.Organization(users=[{
        'name': user['id'],
        'capacity': 'admin'
    }])
    create_context = {'ignore_auth': False,
                      'user': user['name'],
                      'api_version': 3}
    ds_dict, _ = make_dataset(
        create_context, owner_org,
        create_with_upload=create_with_upload,
        resource_path=data_path / "calibration_beads_47.rtdc",
        activate=True)
    res_dict = ds_dict["resources"][0]
    rid = res_dict["id"]
    resource_path = pathlib.Path(get_resource_path(rid))
    # remove the file, so DCOR falls back to the S3 resource
    resource_path.unlink()
    assert not resource_path.exists(), "sanity check"
    # The instance should now be served from S3 (an HTTP URL).
    with get_dc_instance(rid) as ds:
        assert ds.path.startswith("http")
        assert res_dict["s3_available"]
        assert res_dict["s3_url"] == ds.path


from dcor_shared import get_resource_path, sha256sum, wait_for_resource
@pytest.mark.ckan_config('ckan.plugins', 'dcor_depot dcor_schemas')
@pytest.mark.usefixtures('clean_db', 'with_request_context')
@mock.patch('ckan.plugins.toolkit.enqueue_job',
            side_effect=synchronous_enqueue_job)
def test_get_dc_instance_s3_fails_without_actual_resource(enqueue_job_mock):
    """An object on S3 alone is not enough; CKAN must know the resource"""
    user = factories.User()
    owner_org = factories.Organization(users=[{
        'name': user['id'],
        'capacity': 'admin'
    }])
    # Upload a real .rtdc file to the S3 location belonging to a
    # made-up resource ID that CKAN does not know about.
    rid = str(uuid.uuid4())
    src = data_path / "calibration_beads_47.rtdc"
    s3.upload_file(bucket_name=f"circle-{owner_org['id']}",
                   object_name=f"resource/{rid[:3]}/{rid[3:6]}/{rid[6:]}",
                   path=src,
                   sha256=sha256sum(src))
    # Resolving the resource in the CKAN database raises NotFound.
    with pytest.raises(logic.NotFound):
        get_dc_instance(rid)


def test_sha256sum(tmp_path):
Expand Down

0 comments on commit ecc8007

Please sign in to comment.