Skip to content

Commit

Permalink
Merge pull request #129 from DataONEorg/feature-126-objectmetadata-da…
Browse files Browse the repository at this point in the history
…taclass

Feature-126: Refactor `ObjectMetadata` to be a dataclass
  • Loading branch information
doulikecookiedough authored Sep 9, 2024
2 parents 5443b39 + 6193329 commit 00c9dcd
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 64 deletions.
4 changes: 2 additions & 2 deletions src/hashstore/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,6 @@
system.
"""

from hashstore.hashstore import HashStore, HashStoreFactory, ObjectMetadata
from hashstore.hashstore import HashStore, HashStoreFactory

__all__ = ("HashStore", "HashStoreFactory", "ObjectMetadata")
__all__ = ("HashStore", "HashStoreFactory")
29 changes: 26 additions & 3 deletions src/hashstore/filehashstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,13 @@
import os
import logging
import inspect
import fcntl
import yaml
from dataclasses import dataclass
from pathlib import Path
from contextlib import closing
from tempfile import NamedTemporaryFile
import fcntl
import yaml
from hashstore import HashStore, ObjectMetadata
from hashstore import HashStore
from hashstore.filehashstore_exceptions import (
CidRefsContentError,
CidRefsDoesNotExist,
Expand Down Expand Up @@ -2611,3 +2612,25 @@ def close(self):
self._obj.close()
else:
self._obj.seek(self._pos)


@dataclass
class ObjectMetadata:
    """Represents metadata associated with an object.

    The `ObjectMetadata` class represents metadata associated with an object, including
    a persistent or authority-based identifier (`pid`), a content identifier (`cid`),
    the size of the object in bytes (`obj_size`), and an optional dictionary of hex
    digests (`hex_digests`) to assist with validating objects.

    :param str pid: An authority-based or persistent identifier
    :param str cid: A unique identifier for the object (Hash ID, hex digest).
    :param int obj_size: The size of the object in bytes.
    :param dict hex_digests: A dictionary mapping algorithm names to hex digests,
        used to validate objects (md5, sha1, sha256, sha384, sha512) (optional,
        defaults to `None`).
    """

    pid: str
    cid: str
    obj_size: int
    # Defaults to None (not a shared mutable dict) so that `hex_digests` stays
    # optional, matching the namedtuple-based class this dataclass replaces.
    hex_digests: dict = None
47 changes: 13 additions & 34 deletions src/hashstore/hashstore.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""Hashstore Interface"""

from abc import ABC, abstractmethod
from collections import namedtuple
import importlib.metadata
import importlib.util

Expand All @@ -28,19 +27,20 @@ def store_object(
):
"""Atomic storage of objects to disk using a given stream. Upon successful storage,
it returns an `ObjectMetadata` object containing relevant file information, such as
the file's id, the file's size, and a hex digest dictionary of algorithms and checksums.
The method also tags the object, creating references for discoverability.
a persistent identifier that references the data file, the file's size, and a hex digest
dictionary of algorithms and checksums. The method also tags the object, creating
references for discoverability.
`store_object` ensures that an object is stored only once by synchronizing multiple calls
and rejecting attempts to store duplicate objects. If called without a pid, it stores the
object without tagging, and it becomes the caller's responsibility to finalize the process
by calling `tag_object` after verifying the correct object is stored.
The file's id is determined by calculating the object's content identifier based on the
store's default algorithm, which is also the permanent address of the file. The file's
identifier is then sharded using the store's configured depth and width, delimited by '/',
and concatenated to produce the final permanent address. This address is stored in the
`/store_directory/objects/` directory.
The file's permanent address is determined by calculating the object's content identifier
based on the store's default algorithm, which is also the permanent address of the file.
The content identifier is then sharded using the store's configured depth and width,
delimited by '/', and concatenated to produce the final permanent address. This address
is stored in the `/store_directory/objects/` directory.
By default, the hex digest map includes common hash algorithms (md5, sha1, sha256, sha384,
sha512). If an additional algorithm is provided, the method checks if it is supported and
Expand All @@ -59,8 +59,8 @@ def store_object(
:param str checksum_algorithm: Algorithm of the supplied checksum.
:param int expected_object_size: Size of the object to verify.
:return: ObjectMetadata - Object containing the permanent address, file size, and
hex digest dictionary.
:return: ObjectMetadata - Object containing the persistent identifier (pid),
content identifier (cid), object size and hex digests dictionary (checksums).
"""
raise NotImplementedError()

Expand Down Expand Up @@ -113,8 +113,9 @@ def store_metadata(self, pid, metadata, format_id):
"""Add or update metadata, such as `sysmeta`, to disk using the given path/stream. The
`store_metadata` method uses a persistent identifier `pid` and a metadata `format_id`
to determine the permanent address of the metadata object. All metadata documents for a
given `pid` will be stored in a directory (under ../metadata) that is determined by
calculating the hash of the given pid, with the document name being the hash of the pid
given `pid` will be stored in a directory that follows the HashStore configuration
settings (under ../metadata) that is determined by calculating the hash of the given pid.
Metadata documents are stored in this directory, and each is named using the hash of the pid
and metadata format (`pid` + `format_id`).
Upon successful storage of metadata, the method returns a string representing the file's
Expand Down Expand Up @@ -239,25 +240,3 @@ def get_hashstore(module_name, class_name, properties=None):
raise AttributeError(
f"Class name '{class_name}' is not an attribute of module '{module_name}'"
)


class ObjectMetadata(
    namedtuple("ObjectMetadata", ["pid", "cid", "obj_size", "hex_digests"])
):
    """Immutable record describing a stored object.

    Bundles together a persistent or authority-based identifier (`pid`), a content
    identifier (`cid`), the object's size in bytes (`obj_size`) and, optionally, the
    hex digests (`hex_digests`) that can be used to validate the object.

    :param str pid: An authority-based or persistent identifier
    :param str cid: A unique identifier for the object (Hash ID, hex digest).
    :param int obj_size: The size of the object in bytes.
    :param dict hex_digests: Hex digests keyed by algorithm name, used to validate
        objects (md5, sha1, sha256, sha384, sha512) (optional).
    """

    def __new__(cls, pid, cid, obj_size, hex_digests=None):
        # `hex_digests` defaults to None rather than a mutable container so
        # that no default object is shared between instances.
        return super().__new__(cls, pid, cid, obj_size, hex_digests)
25 changes: 24 additions & 1 deletion tests/test_filehashstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import os
from pathlib import Path
import pytest
from hashstore.filehashstore import FileHashStore
from hashstore.filehashstore import FileHashStore, ObjectMetadata
from hashstore.filehashstore_exceptions import (
NonMatchingChecksum,
NonMatchingObjSize,
Expand Down Expand Up @@ -1135,3 +1135,26 @@ def test_check_string(store):
tab_line = "\t"
with pytest.raises(ValueError):
store._check_string(tab_line, "tab_line")


def test_objectmetadata():
    """Verify that ObjectMetadata exposes every field through attribute access."""
    expected_digests = {
        "md5": "md5value",
        "sha1": "sha1value",
        "sha224": "sha224value",
        "sha256": "sha256value",
        "sha512": "sha512value",
    }
    expected_pid, expected_cid, expected_size = "hashstore", "hashstoretest", 1234

    metadata = ObjectMetadata(
        expected_pid, expected_cid, expected_size, expected_digests
    )

    assert metadata.pid == expected_pid
    assert metadata.cid == expected_cid
    assert metadata.obj_size == expected_size
    # Each digest must be retrievable under its algorithm name.
    for algorithm in ("md5", "sha1", "sha224", "sha256", "sha512"):
        assert metadata.hex_digests.get(algorithm) == expected_digests[algorithm]
25 changes: 1 addition & 24 deletions tests/test_hashstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import os
import pytest
from hashstore.hashstore import ObjectMetadata, HashStoreFactory
from hashstore.hashstore import HashStoreFactory
from hashstore.filehashstore import FileHashStore


Expand Down Expand Up @@ -156,26 +156,3 @@ def test_factory_get_hashstore_filehashstore_nonconflicting_dir(factory, tmp_pat
}

factory.get_hashstore(module_name, class_name, properties)


def test_objectmetadata():
    """Verify that ObjectMetadata exposes every field through attribute access."""
    expected_digests = {
        "md5": "md5value",
        "sha1": "sha1value",
        "sha224": "sha224value",
        "sha256": "sha256value",
        "sha512": "sha512value",
    }
    expected_pid, expected_cid, expected_size = "hashstore", "hashstoretest", 1234

    metadata = ObjectMetadata(
        expected_pid, expected_cid, expected_size, expected_digests
    )

    assert metadata.pid == expected_pid
    assert metadata.cid == expected_cid
    assert metadata.obj_size == expected_size
    # Each digest must be retrievable under its algorithm name.
    for algorithm in ("md5", "sha1", "sha224", "sha256", "sha512"):
        assert metadata.hex_digests.get(algorithm) == expected_digests[algorithm]

0 comments on commit 00c9dcd

Please sign in to comment.