Merge pull request #74 from DataONEorg/feature-73-refs-refactor
Feature-73: `store_object` Refactor (with References)
doulikecookiedough authored Dec 6, 2023
2 parents dd0d030 + f9a96d7 commit 059d3b9
Showing 11 changed files with 1,826 additions and 502 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -131,6 +131,7 @@ venv/
ENV/
env.bak/
venv.bak/
+.idea

# Spyder project settings
.spyderproject
2 changes: 1 addition & 1 deletion src/hashstore/client.py
@@ -392,7 +392,7 @@ def validate_object(self, obj_tuple):
obj_db_checksum = obj_tuple[2]

with self.hashstore.retrieve_object(pid_guid) as obj_stream:
-computed_digest = self.hashstore.computehash(obj_stream, algo)
+computed_digest = self.hashstore.get_hex_digest(obj_stream, algo)
obj_stream.close()

if computed_digest != obj_db_checksum:
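For context, the renamed call sits in a checksum-validation flow; a condensed sketch of that flow follows (the mismatch handling shown here is hypothetical, since the diff is truncated at the comparison):

    # Recompute the object's digest with get_hex_digest (formerly computehash)
    # and compare it against the checksum recorded in the database.
    with self.hashstore.retrieve_object(pid_guid) as obj_stream:
        computed_digest = self.hashstore.get_hex_digest(obj_stream, algo)
    if computed_digest != obj_db_checksum:
        raise ValueError(f"Digest mismatch for {pid_guid}")  # hypothetical handling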
1,126 changes: 837 additions & 289 deletions src/hashstore/filehashstore.py

Large diffs are not rendered by default.

106 changes: 76 additions & 30 deletions src/hashstore/hashstore.py
@@ -2,12 +2,12 @@
from abc import ABC, abstractmethod
from collections import namedtuple
import importlib.metadata
+import importlib.util


class HashStore(ABC):
"""HashStore is a content-addressable file management system that
utilizes a persistent identifier (PID) in the form of a hex digest
value to address files."""
"""HashStore is a content-addressable file management system that utilizes
an object's content identifier (hex digest/checksum) to address files."""

@staticmethod
def version():
@@ -26,28 +26,32 @@ def store_object(
expected_object_size,
):
"""The `store_object` method is responsible for the atomic storage of objects to
-disk using a given InputStream and a persistent identifier (pid). Upon
-successful storage, the method returns a ObjectMetadata object containing
-relevant file information, such as the file's id (which can be used to locate the
-object on disk), the file's size, and a hex digest map of algorithms and checksums.
-`store_object` also ensures that an object is stored only once by synchronizing
-multiple calls and rejecting calls to store duplicate objects.
-The file's id is determined by calculating the SHA-256 hex digest of the
-provided pid, which is also used as the permanent address of the file. The
-file's identifier is then sharded using a depth of 3 and width of 2,
+disk using a given stream. Upon successful storage, the method returns an ObjectMetadata
+object containing relevant file information, such as the file's id (which can be
+used to locate the object on disk), the file's size, and a hex digest dict of algorithms
+and checksums. Storing an object with `store_object` also tags an object (creating
+references) which allow the object to be discoverable.
+`store_object` also ensures that an object is stored only once by synchronizing multiple
+calls and rejecting calls to store duplicate objects. Note: calling `store_object` without
+a pid is possible, but this only stores the object without tagging it.
+It is then the caller's responsibility to finalize the process by calling `tag_object`
+after verifying the correct object is stored.
+The file's id is determined by calculating the object's content identifier based on
+the store's default algorithm, which is also used as the permanent address of the file.
+The file's identifier is then sharded using the store's configured depth and width,
delimited by '/' and concatenated to produce the final permanent address
and is stored in the `/store_directory/objects/` directory.
-By default, the hex digest map includes the following hash algorithms:
-Default algorithms and hex digests to return: md5, sha1, sha256, sha384, sha512,
-which are the most commonly used algorithms in dataset submissions to DataONE
-and the Arctic Data Center. If an additional algorithm is provided, the
-`store_object` method checks if it is supported and adds it to the map along
-with its corresponding hex digest. An algorithm is considered "supported" if it
-is recognized as a valid hash algorithm in the `hashlib` library.
-Similarly, if a file size and/or checksum & checksumAlgorithm value are provided,
+md5, sha1, sha256, sha384, sha512 - which are the most commonly used algorithms in
+dataset submissions to DataONE and the Arctic Data Center. If an additional algorithm
+is provided, the `store_object` method checks if it is supported and adds it to the
+hex digests dict along with its corresponding hex digest. An algorithm is considered
+"supported" if it is recognized as a valid hash algorithm in the `hashlib` library.
+Similarly, if a file size and/or checksum & checksum_algorithm value are provided,
`store_object` validates the object to ensure it matches the given arguments
before moving the file to its permanent address.
@@ -61,7 +65,50 @@ def store_object(
Returns:
object_metadata (ObjectMetadata): Object that contains the permanent address,
-file size, duplicate file boolean and hex digest dictionary.
+file size and hex digest dictionary.
"""
raise NotImplementedError()

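For orientation, here is a minimal usage sketch of `store_object` (not part of the diff). It assumes the `get_hashstore` factory shown later in this file lives on a `HashStoreFactory` class, and it borrows the `FileHashStore` class and store property names from the project README; treat all concrete values as hypothetical:

    # Hedged sketch: factory, class and property names are assumptions drawn
    # from this file and the project README, not guaranteed by the diff itself.
    from hashstore import HashStoreFactory

    properties = {
        "store_path": "/tmp/hashstore",  # hypothetical store location
        "store_depth": 3,  # directory sharding depth
        "store_width": 2,  # directory sharding width
        "store_algorithm": "SHA-256",  # default content-identifier algorithm
        "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0",
    }
    store = HashStoreFactory().get_hashstore(
        "hashstore.filehashstore", "FileHashStore", properties
    )

    # Storing with a pid both stores and tags the object in a single call.
    with open("mydata.csv", "rb") as data:
        object_metadata = store.store_object(pid="doi:10.18739/A2901ZH2M", data=data)
    print(object_metadata.id, object_metadata.obj_size)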
+@abstractmethod
+def tag_object(self, pid, cid):
+"""The `tag_object` method creates references that allow objects stored in HashStore
+to be discoverable. Retrieving, deleting or calculating a hex digest of an object is
+based on a pid argument; to proceed, we must be able to find the object associated
+with the pid.
+Args:
+pid (string): Authority-based or persistent identifier of object
+cid (string): Content identifier of object
+Returns:
+boolean: `True` upon successful tagging.
+"""
+raise NotImplementedError()

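The pid-less flow described in `store_object` above then looks roughly like this (continuing the earlier sketch; the `store` object and pid are hypothetical):

    # Store without a pid: the object is written but not yet tagged/discoverable.
    with open("mydata.csv", "rb") as data:
        object_metadata = store.store_object(data=data)

    # The caller finalizes by binding the pid to the content identifier (cid),
    # which store_object returns as the ObjectMetadata id field.
    store.tag_object("doi:10.18739/A2901ZH2M", object_metadata.id)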
+@abstractmethod
+def verify_object(
+self, object_metadata, checksum, checksum_algorithm, expected_file_size
+):
+"""Confirms that an object_metadata's content is equal to the given values.
+Args:
+object_metadata (ObjectMetadata): object_metadata object
+checksum (string): Value of checksum
+checksum_algorithm (string): Algorithm of checksum
+expected_file_size (int): Size of the tmp file
+"""
+raise NotImplementedError()

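Continuing the sketch, `verify_object` can confirm the stored content before tagging; the checksum and size below are the md5 and byte count this PR's conftest.py lists for the same test pid (the algorithm-name format is an assumption):

    # Validate the returned ObjectMetadata against caller-supplied values.
    store.verify_object(
        object_metadata,
        checksum="db91c910a3202478c8def1071c54aae5",  # md5 from tests/conftest.py
        checksum_algorithm="md5",  # assumed algorithm-name format
        expected_file_size=39993,  # file_size_bytes from tests/conftest.py
    )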
+@abstractmethod
+def find_object(self, pid):
+"""The `find_object` method checks whether an object referenced by a pid exists
+and returns the content identifier.
+Args:
+pid (string): Authority-based or persistent identifier of object
+Returns:
+cid (string): Content identifier of the object
+"""
+raise NotImplementedError()

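Resolving a pid to its cid is then a single call (sketch, continuing the example above):

    # Returns the content identifier the pid was tagged with; the pid must
    # already have been tagged for the lookup to succeed.
    cid = store.find_object("doi:10.18739/A2901ZH2M")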
@@ -89,9 +136,8 @@ def store_metadata(self, pid, metadata, format_id):
@abstractmethod
def retrieve_object(self, pid):
"""The `retrieve_object` method retrieves an object from disk using a given
-persistent identifier (pid). If the object exists (determined by calculating
-the object's permanent address using the SHA-256 hash of the given pid), the
-method will open and return a buffered object stream ready to read from.
+persistent identifier (pid). If the object exists, the method will open and return
+a buffered object stream ready to read from.
Args:
pid (string): Authority-based identifier.
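Retrieval pairs naturally with `get_hex_digest`, mirroring the client.py change above (sketch; the stream-based `get_hex_digest` usage is taken from that diff):

    # Open a buffered stream for the pid, then recompute a digest from it.
    with store.retrieve_object("doi:10.18739/A2901ZH2M") as obj_stream:
        sha256_digest = store.get_hex_digest(obj_stream, "sha256")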
@@ -211,12 +257,12 @@ def get_hashstore(module_name, class_name, properties=None):


class ObjectMetadata(namedtuple("ObjectMetadata", ["id", "obj_size", "hex_digests"])):
"""File address containing file's path on disk and its content hash ID.
"""Represents metadata associated with an object.
Args:
ab_id (str): Hash ID (hexdigest) of file contents.
obj_size (bytes): Size of the object
hex_digests (dict, optional): A list of hex digests to validate objects
Attributes:
id (str): A unique identifier for the object (Hash ID, hex digest).
obj_size (bytes): The size of the object in bytes.
hex_digests (list, optional): A list of hex digests to validate objects
(md5, sha1, sha256, sha384, sha512)
"""

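Since ObjectMetadata is a namedtuple, its fields unpack positionally or by name; per the `store_object` docstring, the hex digests come back keyed by algorithm (sketch, continuing the earlier example):

    # Unpack or access by field name; hex_digests maps algorithm -> hex digest.
    obj_id, obj_size, hex_digests = object_metadata
    assert obj_id == object_metadata.id
    sha256_digest = hex_digests["sha256"]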
3 changes: 0 additions & 3 deletions tests/conftest.py
@@ -47,7 +47,6 @@ def init_pids():
test_pids = {
"doi:10.18739/A2901ZH2M": {
"file_size_bytes": 39993,
"object_cid": "0d555ed77052d7e166017f779cbc193357c3a5006ee8b8457230bcf7abcef65e",
"metadata_cid": "323e0799524cec4c7e14d31289cefd884b563b5c052f154a066de5ec1e477da7",
"md5": "db91c910a3202478c8def1071c54aae5",
"sha1": "1fe86e3c8043afa4c70857ca983d740ad8501ccd",
@@ -58,7 +57,6 @@
},
"jtao.1700.1": {
"file_size_bytes": 8724,
"object_cid": "a8241925740d5dcd719596639e780e0a090c9d55a5d0372b0eaf55ed711d4edf",
"metadata_cid": "ddf07952ef28efc099d10d8b682480f7d2da60015f5d8873b6e1ea75b4baf689",
"md5": "f4ea2d07db950873462a064937197b0f",
"sha1": "3d25436c4490b08a2646e283dada5c60e5c0539d",
@@ -69,7 +67,6 @@
},
"urn:uuid:1b35d0a5-b17a-423b-a2ed-de2b18dc367a": {
"file_size_bytes": 18699,
"object_cid": "7f5cc18f0b04e812a3b4c8f686ce34e6fec558804bf61e54b176742a7f6368d6",
"metadata_cid": "9a2e08c666b728e6cbd04d247b9e556df3de5b2ca49f7c5a24868eb27cddbff2",
"md5": "e1932fc75ca94de8b64f1d73dc898079",
"sha1": "c6d2a69a3f5adaf478ba796c114f57b990cf7ad1",
Expand Down