diff --git a/src/hashstore/__init__.py b/src/hashstore/__init__.py index 14a40b8c..a841efa3 100644 --- a/src/hashstore/__init__.py +++ b/src/hashstore/__init__.py @@ -16,6 +16,6 @@ system. """ -from hashstore.hashstore import HashStore, HashStoreFactory, ObjectMetadata +from hashstore.hashstore import HashStore, HashStoreFactory -__all__ = ("HashStore", "HashStoreFactory", "ObjectMetadata") +__all__ = ("HashStore", "HashStoreFactory") diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index b9f7addf..2be657c4 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -9,12 +9,13 @@ import os import logging import inspect +import fcntl +import yaml +from dataclasses import dataclass from pathlib import Path from contextlib import closing from tempfile import NamedTemporaryFile -import fcntl -import yaml -from hashstore import HashStore, ObjectMetadata +from hashstore import HashStore from hashstore.filehashstore_exceptions import ( CidRefsContentError, CidRefsDoesNotExist, @@ -2611,3 +2612,25 @@ def close(self): self._obj.close() else: self._obj.seek(self._pos) + + +@dataclass +class ObjectMetadata: + """Represents metadata associated with an object. + + The `ObjectMetadata` class represents metadata associated with an object, including + a persistent or authority-based identifier (`pid`), a content identifier (`cid`), + the size of the object in bytes (`obj_size`), and an optional dictionary of hex digests + (`hex_digests`) to assist with validating objects. + + :param str pid: An authority-based or persistent identifier + :param str cid: A unique identifier for the object (Hash ID, hex digest). + :param int obj_size: The size of the object in bytes. + :param dict hex_digests: A dictionary of hex digests to validate objects + (md5, sha1, sha256, sha384, sha512) (optional).
+ """ + + pid: str + cid: str + obj_size: int + hex_digests: dict diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index a40c7735..f683fb0f 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -1,7 +1,6 @@ """Hashstore Interface""" from abc import ABC, abstractmethod -from collections import namedtuple import importlib.metadata import importlib.util @@ -28,19 +27,20 @@ def store_object( ): """Atomic storage of objects to disk using a given stream. Upon successful storage, it returns an `ObjectMetadata` object containing relevant file information, such as - the file's id, the file's size, and a hex digest dictionary of algorithms and checksums. - The method also tags the object, creating references for discoverability. + a persistent identifier that references the data file, the file's size, and a hex digest + dictionary of algorithms and checksums. The method also tags the object, creating + references for discoverability. `store_object` ensures that an object is stored only once by synchronizing multiple calls and rejecting attempts to store duplicate objects. If called without a pid, it stores the object without tagging, and it becomes the caller's responsibility to finalize the process by calling `tag_object` after verifying the correct object is stored. - The file's id is determined by calculating the object's content identifier based on the - store's default algorithm, which is also the permanent address of the file. The file's - identifier is then sharded using the store's configured depth and width, delimited by '/', - and concatenated to produce the final permanent address. This address is stored in the - `/store_directory/objects/` directory. + The file's permanent address is determined by calculating the object's content identifier + based on the store's default algorithm, which is also the permanent address of the file. 
+ The content identifier is then sharded using the store's configured depth and width, + delimited by '/', and concatenated to produce the final permanent address. This address + is stored in the `/store_directory/objects/` directory. By default, the hex digest map includes common hash algorithms (md5, sha1, sha256, sha384, sha512). If an additional algorithm is provided, the method checks if it is supported and @@ -59,8 +59,8 @@ def store_object( :param str checksum_algorithm: Algorithm of the supplied checksum. :param int expected_object_size: Size of the object to verify. - :return: ObjectMetadata - Object containing the permanent address, file size, and - hex digest dictionary. + :return: ObjectMetadata - Object containing the persistent identifier (pid), + content identifier (cid), object size and hex digests dictionary (checksums). """ raise NotImplementedError() @@ -113,8 +113,9 @@ def store_metadata(self, pid, metadata, format_id): """Add or update metadata, such as `sysmeta`, to disk using the given path/stream. The `store_metadata` method uses a persistent identifier `pid` and a metadata `format_id` to determine the permanent address of the metadata object. All metadata documents for a - given `pid` will be stored in a directory (under ../metadata) that is determined by - calculating the hash of the given pid, with the document name being the hash of the pid + given `pid` will be stored in a directory that follows the HashStore configuration + settings (under ../metadata) that is determined by calculating the hash of the given pid. + Metadata documents are stored in this directory, and each is named using the hash of the pid and metadata format (`pid` + `format_id`).
Upon successful storage of metadata, the method returns a string representing the file's @@ -239,25 +240,3 @@ def get_hashstore(module_name, class_name, properties=None): raise AttributeError( f"Class name '{class_name}' is not an attribute of module '{module_name}'" ) - - -class ObjectMetadata( - namedtuple("ObjectMetadata", ["pid", "cid", "obj_size", "hex_digests"]) -): - """Represents metadata associated with an object. - - The `ObjectMetadata` class represents metadata associated with an object, including - a persistent or authority-based identifier (`pid`), a content identifier (`cid`), - the size of the object in bytes (`obj_size`), and an optional list of hex digests - (`hex_digests`) to assist with validating objects. - - :param str pid: An authority-based or persistent identifier - :param str cid: A unique identifier for the object (Hash ID, hex digest). - :param int obj_size: The size of the object in bytes. - :param list hex_digests: A list of hex digests to validate objects - (md5, sha1, sha256, sha384, sha512) (optional). 
- """ - - # Default value to prevent dangerous default value - def __new__(cls, pid, cid, obj_size, hex_digests=None): - return super(ObjectMetadata, cls).__new__(cls, pid, cid, obj_size, hex_digests) diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 26881976..825b9273 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -4,7 +4,7 @@ import os from pathlib import Path import pytest -from hashstore.filehashstore import FileHashStore +from hashstore.filehashstore import FileHashStore, ObjectMetadata from hashstore.filehashstore_exceptions import ( NonMatchingChecksum, NonMatchingObjSize, @@ -1135,3 +1135,26 @@ def test_check_string(store): tab_line = "\t" with pytest.raises(ValueError): store._check_string(tab_line, "tab_line") + + +def test_objectmetadata(): + """Test ObjectMetadata class returns correct values via dot notation.""" + pid = "hashstore" + ab_id = "hashstoretest" + obj_size = 1234 + hex_digest_dict = { + "md5": "md5value", + "sha1": "sha1value", + "sha224": "sha224value", + "sha256": "sha256value", + "sha512": "sha512value", + } + object_metadata = ObjectMetadata(pid, ab_id, obj_size, hex_digest_dict) + assert object_metadata.pid == pid + assert object_metadata.cid == ab_id + assert object_metadata.obj_size == obj_size + assert object_metadata.hex_digests.get("md5") == hex_digest_dict["md5"] + assert object_metadata.hex_digests.get("sha1") == hex_digest_dict["sha1"] + assert object_metadata.hex_digests.get("sha224") == hex_digest_dict["sha224"] + assert object_metadata.hex_digests.get("sha256") == hex_digest_dict["sha256"] + assert object_metadata.hex_digests.get("sha512") == hex_digest_dict["sha512"] diff --git a/tests/test_hashstore.py b/tests/test_hashstore.py index bb2c1ac5..140d473a 100644 --- a/tests/test_hashstore.py +++ b/tests/test_hashstore.py @@ -2,7 +2,7 @@ import os import pytest -from hashstore.hashstore import ObjectMetadata, HashStoreFactory +from hashstore.hashstore import 
HashStoreFactory from hashstore.filehashstore import FileHashStore @@ -156,26 +156,3 @@ def test_factory_get_hashstore_filehashstore_nonconflicting_dir(factory, tmp_pat } factory.get_hashstore(module_name, class_name, properties) - - -def test_objectmetadata(): - """Test ObjectMetadata class returns correct values via dot notation.""" - pid = "hashstore" - ab_id = "hashstoretest" - obj_size = 1234 - hex_digest_dict = { - "md5": "md5value", - "sha1": "sha1value", - "sha224": "sha224value", - "sha256": "sha256value", - "sha512": "sha512value", - } - object_metadata = ObjectMetadata(pid, ab_id, obj_size, hex_digest_dict) - assert object_metadata.pid == pid - assert object_metadata.cid == ab_id - assert object_metadata.obj_size == obj_size - assert object_metadata.hex_digests.get("md5") == hex_digest_dict["md5"] - assert object_metadata.hex_digests.get("sha1") == hex_digest_dict["sha1"] - assert object_metadata.hex_digests.get("sha224") == hex_digest_dict["sha224"] - assert object_metadata.hex_digests.get("sha256") == hex_digest_dict["sha256"] - assert object_metadata.hex_digests.get("sha512") == hex_digest_dict["sha512"]