From 20a1c32154109576dcf048566af02df2cf5f2994 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 31 Jan 2024 13:59:43 -0800 Subject: [PATCH 01/45] Add new parameter 'id_type' to 'delete_object' method --- src/hashstore/filehashstore.py | 141 +++++++++++++++++---------------- 1 file changed, 72 insertions(+), 69 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index cf6a77ef..1cefbb24 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -770,80 +770,83 @@ def retrieve_metadata(self, pid, format_id=None): ) return metadata_stream - def delete_object(self, pid): + def delete_object(self, pid, id_type=None): logging.debug( "FileHashStore - delete_object: Request to delete object for pid: %s", pid ) self._check_string(pid, "pid", "delete_object") - try: - cid = self.find_object(pid) - except FileNotFoundError as fnfe: - if "pid refs file not found" in fnfe: - # Nothing to delete - return - if "cid refs file not found" in fnfe: - # Delete pid refs file - pid_ref_abs_path = self._resolve_path("pid", pid) - self._delete("pid", pid_ref_abs_path) - return - if "object referenced does not exist" in fnfe: - # Delete pid refs file - pid_ref_abs_path = self._resolve_path("pid", pid) - self._delete("pid", pid_ref_abs_path) - return - except ValueError as ve: - if "is missing from cid refs file" in ve: - # Delete pid refs file - pid_ref_abs_path = self._resolve_path("pid", pid) - self._delete("pid", pid_ref_abs_path) - return - - # Proceed with next steps - cid has been retrieved without any errors - while cid in self.reference_locked_cids: - logging.debug( - "FileHashStore - delete_object: (cid) %s is currently locked. Waiting", - cid, - ) - time.sleep(self.time_out_sec) - # Modify reference_locked_cids consecutively - with self.reference_lock: - logging.debug( - "FileHashStore - delete_object: Adding cid: %s to reference_locked_cids.", - cid, - ) - self.reference_locked_cids.append(cid) - try: - cid_ref_abs_path = self._resolve_path("cid", cid) - pid_ref_abs_path = self._resolve_path("pid", pid) - # First delete the pid refs file immediately - self._delete_pid_refs_file(pid_ref_abs_path) - # Remove pid from cid reference file - self._delete_cid_refs_pid(cid_ref_abs_path, pid) - # Delete cid reference file and object only if the cid refs file is empty - if os.path.getsize(cid_ref_abs_path) == 0: - self._delete("cid", cid_ref_abs_path) - self._delete("objects", cid) - info_string = ( - "FileHashStore - delete_object: Successfully deleted references and" - + f" object associated with pid: {pid}" - ) - logging.info(info_string) - else: - info_string = ( - "FileHashStore - delete_object: Successfully deleted pid refs file but" - + f" not object with cid ({cid}), cid refs file not empty." 
- ) - logging.info(info_string) + if id_type is "cid": + # TODO: Delete object only return True - - finally: - # Release cid + else: + # id_type is "pid" + try: + cid = self.find_object(pid) + except FileNotFoundError as fnfe: + if "pid refs file not found" in fnfe: + # Nothing to delete + return + if "cid refs file not found" in fnfe: + # Delete pid refs file + self._delete("pid", pid) + return + if "object referenced does not exist" in fnfe: + # Delete pid refs file + self._delete("pid", pid) + return + except ValueError as ve: + if "is missing from cid refs file" in ve: + # Delete pid refs file + self._delete("pid", pid) + return + + # Proceed with next steps - cid has been retrieved without any errors + while cid in self.reference_locked_cids: + logging.debug( + "FileHashStore - delete_object: (cid) %s is currently locked. Waiting", + cid, + ) + time.sleep(self.time_out_sec) + # Modify reference_locked_cids consecutively with self.reference_lock: logging.debug( - "FileHashStore - delete_object: Removing cid: %s from reference_locked_cids.", + "FileHashStore - delete_object: Adding cid: %s to reference_locked_cids.", cid, ) - self.reference_locked_cids.remove(cid) + self.reference_locked_cids.append(cid) + try: + cid_ref_abs_path = self._resolve_path("cid", cid) + pid_ref_abs_path = self._resolve_path("pid", pid) + # First delete the pid refs file immediately + self._delete_pid_refs_file(pid_ref_abs_path) + # Remove pid from cid reference file + self._delete_cid_refs_pid(cid_ref_abs_path, pid) + # Delete cid reference file and object only if the cid refs file is empty + if os.path.getsize(cid_ref_abs_path) == 0: + self._delete("cid", cid_ref_abs_path) + self._delete("objects", cid) + info_string = ( + "FileHashStore - delete_object: Successfully deleted references and" + + f" object associated with pid: {pid}" + ) + logging.info(info_string) + else: + info_string = ( + "FileHashStore - delete_object: Successfully deleted pid refs file but" + + f" not object with cid ({cid}), cid refs file not empty." + ) + logging.info(info_string) + return True + + finally: + # Release cid + with self.reference_lock: + debug_msg = ( + "FileHashStore - delete_object:" + + f" Removing cid: {cid} from reference_locked_cids." + ) + logging.debug(debug_msg) + self.reference_locked_cids.remove(cid) def delete_metadata(self, pid, format_id=None): logging.debug( @@ -1993,13 +1996,13 @@ def _resolve_path(self, entity, file): # Check for sharded path. 
if entity == "cid": # Note, we skip checking whether the file exists for refs - ref_file_abs_path = self._build_path(entity, file) - return ref_file_abs_path + cid_ref_file_abs_path = self._build_path(entity, file) + return cid_ref_file_abs_path elif entity == "pid": # Note, we skip checking whether the file exists for refs hash_id = self._computehash(file, self.algorithm) - ref_file_abs_path = self._build_path(entity, hash_id) - return ref_file_abs_path + pid_ref_file_abs_path = self._build_path(entity, hash_id) + return pid_ref_file_abs_path else: abspath = self._build_path(entity, file) if os.path.isfile(abspath): From 93b65260c660e6f0e37d0d9436a67c28e423b6a7 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 31 Jan 2024 14:27:59 -0800 Subject: [PATCH 02/45] Update HashStore interface docstring for 'delete_object', implement 'delete_object' for when 'id_type' is 'cid' and add new pytests --- src/hashstore/filehashstore.py | 15 +++++++++----- src/hashstore/hashstore.py | 15 +++++++------- tests/test_filehashstore_interface.py | 30 ++++++++++++++++++++++++++- 3 files changed, 47 insertions(+), 13 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 1cefbb24..a59bf56f 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -770,16 +770,21 @@ def retrieve_metadata(self, pid, format_id=None): ) return metadata_stream - def delete_object(self, pid, id_type=None): + def delete_object(self, ab_id, id_type=None): logging.debug( - "FileHashStore - delete_object: Request to delete object for pid: %s", pid + "FileHashStore - delete_object: Request to delete object for id: %s", ab_id ) - self._check_string(pid, "pid", "delete_object") + self._check_string(ab_id, "ab_id", "delete_object") if id_type is "cid": - # TODO: Delete object only - return True + cid_refs_abs_path = self._resolve_path("objects", ab_id) + if os.path.exists(cid_refs_abs_path): + self._delete("objects", ab_id) + return True + else: + return False else: # id_type is "pid" + pid = ab_id try: cid = self.find_object(pid) except FileNotFoundError as fnfe: diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index c5019825..5bfabde2 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -162,13 +162,14 @@ def retrieve_metadata(self, pid, format_id): raise NotImplementedError() @abstractmethod - def delete_object(self, pid): - """Delete an object permanently from disk using a persistent identifier (pid). - - The `delete_object` method removes the object associated with the provided `pid` from - disk, resulting in the permanent deletion of the object. - - :param str pid: Authority-based identifier. + def delete_object(self, ab_id, id_type): + """Delete an object and its related data permanently from disk using a given identifier + and 'id_type'. When 'id_type' is 'pid', HashStore will attempt to delete the object and + its associated references files. When 'id_type' is 'cid', HashStore will only attempt to + delete the object. If 'id_type' is not supplied, `delete_object` will assume it is 'pid'. + + :param str ab_id: Authority-based identifier. + :param str id_type: "pid" or "Cid :return: bool - `True` upon successful deletion. 
""" diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 40dc99f0..9405e62e 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -42,6 +42,21 @@ def test_store_object(pids, store): object_metadata = store.store_object(pid, path) assert object_metadata.cid == pids[pid][store.algorithm] assert store._count(entity) == 3 + assert store._count("pid") == 3 + assert store._count("cid") == 3 + + +def test_store_object_only_object(pids, store): + """Test store object stores an object only (no reference files will be created)""" + test_dir = "tests/testdata/" + entity = "objects" + for pid in pids.keys(): + path = Path(test_dir + pid.replace("/", "_")) + object_metadata = store.store_object(data=path) + assert object_metadata.cid == pids[pid][store.algorithm] + assert store._count(entity) == 3 + assert store._count("pid") == 0 + assert store._count("cid") == 0 def test_store_object_files_path(pids, store): @@ -882,7 +897,7 @@ def test_retrieve_metadata_format_id_empty_spaces(store): def test_delete_object(pids, store): - """Test delete_object successfully deletes objects from /objects.""" + """Test delete_object successfully deletes objects from /objects and all refs files.""" test_dir = "tests/testdata/" entity = "objects" format_id = "http://ns.dataone.org/service/types/v2.0" @@ -894,6 +909,8 @@ def test_delete_object(pids, store): _metadata_cid = store.store_metadata(pid, syspath, format_id) store.delete_object(pid) assert store._count(entity) == 0 + assert store._count("pid") == 0 + assert store._count("cid") == 0 def test_delete_object_pid_refs_file(pids, store): @@ -942,6 +959,17 @@ def test_delete_object_cid_refs_file_with_pid_refs_remaining(pids, store): assert os.path.exists(cid_refs_file_path) +def test_delete_object_id_type_cid(pids, store): + """Test delete_object successfully deletes only object.""" + test_dir = "tests/testdata/" + entity = "objects" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid=None, data=path) + store.delete_object(object_metadata.cid, "cid") + assert store._count(entity) == 0 + + def test_delete_object_pid_empty(store): """Test delete_object raises error when empty pid supplied.""" pid = " " From 4480fba7e9355126e494bf914cd08e31d6928750 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 31 Jan 2024 14:41:01 -0800 Subject: [PATCH 03/45] Update README.md for updated 'delete_object' functionality and add TODO items --- README.md | 4 +++- src/hashstore/filehashstore.py | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b864c157..3f5681ec 100644 --- a/README.md +++ b/README.md @@ -112,7 +112,9 @@ tag_object(pid, cid) - If desired, this cid can then be used to locate the object on disk by following HashStore's store configuration. **How do I delete an object if I have the pid?** -- To delete an object, call the Public API method `delete_object` which will delete the object and its associated references and reference files where relevant. +- To delete an object and all its associated reference files, call the Public API method `delete_object` with `id_type` 'pid'. +- To delete only an object, call `delete_object` with `id_type` 'cid' which will remove the object if it it is not referenced by any pids. +- To delete an object and all its related data (reference files and system metadata), call the Public API method `delete_object` with `id_type` 'clear'. 
- Note, `delete_object` and `tag_object` calls are synchronized on their content identifier values so that the shared reference files are not unintentionally modified concurrently. An object that is in the process of being deleted should not be tagged, and vice versa. These calls have been implemented to occur sequentially to improve clarity in the event of an unexpected conflict or issue. diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index a59bf56f..e7952450 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -701,6 +701,7 @@ def store_metadata(self, pid, metadata, format_id=None): "FileHashStore - store_metadata: Attempting to store metadata for pid: %s", pid, ) + # TODO: Refactor the way we store metadata and add pytests to confirm metadata_cid = self._put_metadata(metadata, pid, checked_format_id) logging.info( @@ -841,6 +842,7 @@ def delete_object(self, ab_id, id_type=None): + f" not object with cid ({cid}), cid refs file not empty." ) logging.info(info_string) + # TODO:: Check 'id_type' for 'clear' & attempt to remove all metadata docs if so return True finally: @@ -1264,6 +1266,8 @@ def delete_tmp_file(): os.umask(oldmask) return tmp + # TODO: Clean up refs file methods, a lot of redundant code + def _write_cid_refs_file(self, path, pid): """Write the CID reference file in the supplied path to a file. A reference file contains every PID that references a CID, each on its own line. This method will From d26b1c32ad0647e35f9875c9a3e2516cf2d279a4 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 5 Feb 2024 16:31:11 -0800 Subject: [PATCH 04/45] Update 'ObjectMetadata' class with new attribute 'pid' and revise affected code and docstrings --- src/hashstore/filehashstore.py | 6 ++++-- src/hashstore/hashstore.py | 15 +++++++++------ tests/test_hashstore.py | 4 +++- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index e7952450..2aa4cf48 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -948,7 +948,9 @@ def _store_and_validate_data( file_size_to_validate, ) - object_metadata = ObjectMetadata(object_cid, obj_file_size, hex_digest_dict) + object_metadata = ObjectMetadata( + pid, object_cid, obj_file_size, hex_digest_dict + ) logging.debug( "FileHashStore - put_object: Successfully put object for pid: %s", pid, @@ -985,7 +987,7 @@ def _store_data_only(self, data): ) = self._move_and_get_checksums(None, stream) object_metadata = ObjectMetadata( - object_ref_pid_location, obj_file_size, hex_digest_dict + None, object_ref_pid_location, obj_file_size, hex_digest_dict ) # The permanent address of the data stored is based on the data's checksum cid = hex_digest_dict.get(self.algorithm) diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 5bfabde2..0236c453 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -255,12 +255,15 @@ def get_hashstore(module_name, class_name, properties=None): ) -class ObjectMetadata(namedtuple("ObjectMetadata", ["cid", "obj_size", "hex_digests"])): +class ObjectMetadata( + namedtuple("ObjectMetadata", ["pid", "cid", "obj_size", "hex_digests"]) +): """Represents metadata associated with an object. - The `ObjectMetadata` class represents metadata associated with an object, - including a content identifier (`cid`), the size of the object in bytes (`obj_size`), - and an optional list of hex digests (`hex_digests`) to assist with validating objects. 
+ The `ObjectMetadata` class represents metadata associated with an object, including + a persistent or authority-based identifier (`pid`), a content identifier (`cid`), + the size of the object in bytes (`obj_size`), and an optional list of hex digests + (`hex_digests`) to assist with validating objects. :param str cid: A unique identifier for the object (Hash ID, hex digest). :param bytes obj_size: The size of the object in bytes. @@ -269,5 +272,5 @@ class ObjectMetadata(namedtuple("ObjectMetadata", ["cid", "obj_size", "hex_diges """ # Default value to prevent dangerous default value - def __new__(cls, cid, obj_size, hex_digests=None): - return super(ObjectMetadata, cls).__new__(cls, cid, obj_size, hex_digests) + def __new__(cls, pid, cid, obj_size, hex_digests=None): + return super(ObjectMetadata, cls).__new__(cls, pid, cid, obj_size, hex_digests) diff --git a/tests/test_hashstore.py b/tests/test_hashstore.py index a2d42398..eb79caa8 100644 --- a/tests/test_hashstore.py +++ b/tests/test_hashstore.py @@ -77,6 +77,7 @@ def test_factory_get_hashstore_filehashstore_incorrect_algorithm_format(factory) def test_objectmetadata(): """Test ObjectMetadata class returns correct values via dot notation.""" + pid = "hashstore" ab_id = "hashstoretest" obj_size = 1234 hex_digest_dict = { @@ -86,7 +87,8 @@ def test_objectmetadata(): "sha256": "sha256value", "sha512": "sha512value", } - object_metadata = ObjectMetadata(ab_id, obj_size, hex_digest_dict) + object_metadata = ObjectMetadata(pid, ab_id, obj_size, hex_digest_dict) + assert object_metadata.pid == pid assert object_metadata.cid == ab_id assert object_metadata.obj_size == obj_size assert object_metadata.hex_digests.get("md5") == hex_digest_dict["md5"] From 371afd2a875019dcbefb1902f7fb358639ef1984 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 5 Feb 2024 16:41:54 -0800 Subject: [PATCH 05/45] Update HashStore interface docstrings --- src/hashstore/filehashstore.py | 2 +- src/hashstore/hashstore.py | 76 ++++++++++++++-------------------- 2 files changed, 33 insertions(+), 45 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 2aa4cf48..b9b6012e 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -842,7 +842,7 @@ def delete_object(self, ab_id, id_type=None): + f" not object with cid ({cid}), cid refs file not empty." ) logging.info(info_string) - # TODO:: Check 'id_type' for 'clear' & attempt to remove all metadata docs if so + # TODO: Check 'id_type' for 'clear' & attempt to remove all metadata docs if so return True finally: diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 0236c453..068f9039 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -25,13 +25,12 @@ def store_object( checksum_algorithm, expected_object_size, ): - """Atomic storage of objects to disk using a given stream. - - The `store_object` method ensures atomic storage of objects to disk. Upon successful - storage, it returns an ObjectMetadata object containing relevant file information, - such as the file's id (used to locate the object on disk), the file's size, and a hex digest - dictionary of algorithms and checksums. The method also tags the object, creating references - for discoverability. + """Atomic storage of objects to disk using a given stream. The `store_object` method + ensures atomic storage of objects to disk. 
Upon successful storage, it returns an + `ObjectMetadata` object containing relevant file information, such as the file's id + (used to locate the object on disk), the file's size, and a hex digest dictionary of + algorithms and checksums. The method also tags the object, creating references for + discoverability. `store_object` ensures that an object is stored only once by synchronizing multiple calls and rejecting attempts to store duplicate objects. If called without a pid, it stores the @@ -68,11 +67,9 @@ def store_object( @abstractmethod def tag_object(self, pid, cid): - """Create references to make objects discoverable in HashStore. - - The `tag_object` method enables operations such as retrieving, deleting, or calculating - a hex digest based on the provided pid argument. To perform these actions, it's crucial - to locate the object associated with the given pid. + """Creates references that allow objects stored in HashStore to be discoverable. Retrieving, + deleting or calculating a hex digest of an object is based on a pid argument; and to + proceed, we must be able to find the object associated with the pid. :param str pid: Authority-based or persistent identifier of the object. :param str cid: Content identifier of the object. @@ -85,10 +82,8 @@ def tag_object(self, pid, cid): def verify_object( self, object_metadata, checksum, checksum_algorithm, expected_file_size ): - """Confirm equality of content in an ObjectMetadata. - - The `verify_object` method verifies that the content in the provided `object_metadata` - matches the specified values. + """Confirm equality of content in an ObjectMetadata. The `verify_object` method verifies + that the content in the provided `object_metadata` matches the specified values. :param ObjectMetadata object_metadata: ObjectMetadata object. :param str checksum: Value of the checksum. @@ -102,7 +97,6 @@ def verify_object( @abstractmethod def find_object(self, pid): """Check if an object referenced by a pid exists and retrieve its content identifier. - The `find_object` method validates the existence of an object based on the provided pid and returns the associated content identifier. @@ -114,11 +108,12 @@ def find_object(self, pid): @abstractmethod def store_metadata(self, pid, metadata, format_id): - """Add or update metadata, such as `sysmeta`, to disk using the given path/stream. - - The `store_metadata` method uses a persistent identifier `pid` and a metadata `format_id` - to determine the permanent address of the metadata object. The permanent address is - calculated by obtaining the SHA-256 hex digest of the concatenation of `pid` & `format_id`. + """Add or update metadata, such as `sysmeta`, to disk using the given path/stream. The + `store_metadata` method uses a persistent identifier `pid` and a metadata `format_id` + to determine the permanent address of the metadata object. All metadata documents for a + given `pid` will be stored in a directory (under ../metadata) that is determined by + calculating the hash of the given pid, with the document name being the hash of the + metadata format (`format_id`). Upon successful storage of metadata, the method returns a string representing the file's permanent address. Metadata objects are stored in parallel to objects in the @@ -134,10 +129,9 @@ def store_metadata(self, pid, metadata, format_id): @abstractmethod def retrieve_object(self, pid): - """Retrieve an object from disk using a persistent identifier (pid). 
- - The `retrieve_object` method opens and returns a buffered object stream ready for reading - if the object associated with the provided `pid` exists on disk. + """Retrieve an object from disk using a persistent identifier (pid). The `retrieve_object` + method opens and returns a buffered object stream ready for reading if the object + associated with the provided `pid` exists on disk. :param str pid: Authority-based identifier. @@ -148,11 +142,8 @@ def retrieve_object(self, pid): @abstractmethod def retrieve_metadata(self, pid, format_id): """Retrieve the metadata object from disk using a persistent identifier (pid) - and metadata namespace (format_id). - - The `retrieve_metadata` method calculates the metadata object's permanent address - by hashing the concatenation of the given `pid` and `format_id`. If the object - exists, the method opens and returns a buffered metadata stream ready for reading. + and metadata namespace (format_id). If the metadata document exists, the method opens + and returns a buffered metadata stream ready for reading. :param str pid: Authority-based identifier. :param str format_id: Metadata format. @@ -163,10 +154,11 @@ def retrieve_metadata(self, pid, format_id): @abstractmethod def delete_object(self, ab_id, id_type): - """Delete an object and its related data permanently from disk using a given identifier - and 'id_type'. When 'id_type' is 'pid', HashStore will attempt to delete the object and - its associated references files. When 'id_type' is 'cid', HashStore will only attempt to - delete the object. If 'id_type' is not supplied, `delete_object` will assume it is 'pid'. + """Deletes an object and its related data permanently from HashStore using a given + persistent identifier. If the `id_type` is 'pid', the object associated with the pid will + be deleted if it is not referenced by any other pids, along with its reference files and + all metadata documents found in its respective metadata directory. If the `id_type` is + 'cid', only the object will be deleted if it is not referenced by other pids. :param str ab_id: Authority-based identifier. :param str id_type: "pid" or "Cid @@ -177,11 +169,9 @@ def delete_object(self, ab_id, id_type): @abstractmethod def delete_metadata(self, pid, format_id): - """Delete a metadata document permanently from disk using a persistent identifier (pid) - and metadata namespace (format_id). - - The `delete_metadata` method removes the metadata document associated with the provided - `pid` and `format_id` from disk, resulting in its permanent deletion. + """Deletes a metadata document (ex. `sysmeta`) permanently from HashStore using a given + persistent identifier and its respective metadata namespace. If a `format_id` is supplied, + only the metadata document associated with the `format_id` will be deleted. :param str pid: Authority-based identifier. :param str format_id: Metadata format. @@ -192,10 +182,8 @@ def delete_metadata(self, pid, format_id): @abstractmethod def get_hex_digest(self, pid, algorithm): - """Calculate the hex digest of an object in HashStore. - - The `get_hex_digest` method calculates the hex digest of an object that exists - in HashStore using a given persistent identifier and hash algorithm. + """Calculates the hex digest of an object that exists in HashStore using a given persistent + identifier and hash algorithm. :param str pid: Authority-based identifier. :param str algorithm: Algorithm of hex digest to generate. 
From 826ca4deb24c9154fe29047838a46052b9f4b0d6 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 5 Feb 2024 18:16:11 -0800 Subject: [PATCH 06/45] Refactor 'store_metadata' to store documents under a directory formed by sharding the hash of the 'pid', with metadata doc name being the hash of the 'format_id' and update related pytests --- src/hashstore/filehashstore.py | 9 +++++---- tests/test_filehashstore.py | 19 ++++++++++++++---- tests/test_filehashstore_interface.py | 29 ++++++++++++++++++++++++--- 3 files changed, 46 insertions(+), 11 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index b9b6012e..acab3bb1 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1472,9 +1472,10 @@ def _put_metadata(self, metadata, pid, format_id): metadata_tmp = self._mktmpmetadata(metadata_stream) # Get target and related paths (permanent location) - metadata_cid = self._computehash(pid + format_id) - rel_path = "/".join(self._shard(metadata_cid)) - full_path = self._get_store_path("metadata") / rel_path + metadata_directory = self._computehash(pid) + metadata_document_name = self._computehash(format_id) + rel_path = "/".join(self._shard(metadata_directory)) + full_path = self._get_store_path("metadata") / rel_path / metadata_document_name # Move metadata to target path if os.path.exists(metadata_tmp): @@ -1487,7 +1488,7 @@ def _put_metadata(self, metadata, pid, format_id): "FileHashStore - _put_metadata: Successfully put metadata for pid: %s", pid, ) - return metadata_cid + return full_path except Exception as err: exception_string = ( f"FileHashStore - _put_metadata: Unexpected {err=}, {type(err)=}" diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index c11726ef..14b6adf0 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -650,7 +650,15 @@ def test_put_metadata_cid(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename metadata_cid = store._put_metadata(syspath, pid, format_id) - assert metadata_cid == pids[pid]["metadata_cid"] + + # Manually calculate expected path + metadata_directory = store._computehash(pid) + metadata_document_name = store._computehash(format_id) + rel_path = "/".join(store._shard(metadata_directory)) + full_path = ( + store._get_store_path("metadata") / rel_path / metadata_document_name + ) + assert metadata_cid == full_path def test_mktmpmetadata(pids, store): @@ -817,7 +825,7 @@ def test_get_store_path_refs(store): assert path_metadata_string.endswith("/metacat/refs") -def test_exists_with_object_metadata_id(pids, store): +def test_exists_object_with_object_metadata_id(pids, store): """Test exists method with an absolute file path.""" test_dir = "tests/testdata/" entity = "objects" @@ -827,7 +835,7 @@ def test_exists_with_object_metadata_id(pids, store): assert store._exists(entity, object_metadata.cid) -def test_exists_with_sharded_path(pids, store): +def test_exists_object_with_sharded_path(pids, store): """Test exists method with an absolute file path.""" test_dir = "tests/testdata/" entity = "objects" @@ -839,7 +847,10 @@ def test_exists_with_sharded_path(pids, store): assert store._exists(entity, object_metadata_shard_path) -def test_exists_with_nonexistent_file(store): +# TODO: Test exists for metadata + + +def test_exists_object_with_nonexistent_file(store): """Test exists method with a nonexistent file.""" entity = "objects" non_existent_file = "tests/testdata/filedoesnotexist" diff --git 
a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 9405e62e..293fe264 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -654,17 +654,32 @@ def test_store_metadata(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename metadata_cid = store.store_metadata(pid, syspath, format_id) - assert metadata_cid == pids[pid]["metadata_cid"] + # Manually calculate expected path + metadata_directory = store._computehash(pid) + metadata_document_name = store._computehash(format_id) + rel_path = "/".join(store._shard(metadata_directory)) + full_path = ( + store._get_store_path("metadata") / rel_path / metadata_document_name + ) + assert metadata_cid == full_path def test_store_metadata_default_format_id(pids, store): """Test store metadata returns expected id when storing with default format_id.""" test_dir = "tests/testdata/" + format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename metadata_cid = store.store_metadata(pid, syspath) - assert metadata_cid == pids[pid]["metadata_cid"] + # Manually calculate expected path + metadata_directory = store._computehash(pid) + metadata_document_name = store._computehash(format_id) + rel_path = "/".join(store._shard(metadata_directory)) + full_path = ( + store._get_store_path("metadata") / rel_path / metadata_document_name + ) + assert metadata_cid == full_path def test_store_metadata_files_path(pids, store): @@ -676,8 +691,16 @@ def test_store_metadata_files_path(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename metadata_cid = store.store_metadata(pid, syspath, format_id) + # TODO: Ensure exists works as expected for metadata assert store._exists(entity, metadata_cid) - assert metadata_cid == pids[pid]["metadata_cid"] + # Manually calculate expected path + metadata_directory = store._computehash(pid) + metadata_document_name = store._computehash(format_id) + rel_path = "/".join(store._shard(metadata_directory)) + full_path = ( + store._get_store_path("metadata") / rel_path / metadata_document_name + ) + assert metadata_cid == full_path assert store._count(entity) == 3 From 07cb891d14357cfe98725aad1edba51c31355595 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 6 Feb 2024 09:49:22 -0800 Subject: [PATCH 07/45] Update HashStore client module and revise pytests --- src/hashstore/hashstoreclient.py | 2 +- tests/test_filehashstore.py | 11 +++++++++- tests/test_filehashstore_interface.py | 29 +++++---------------------- tests/test_hashstore_client.py | 17 ++++++++++++++-- 4 files changed, 31 insertions(+), 28 deletions(-) diff --git a/src/hashstore/hashstoreclient.py b/src/hashstore/hashstoreclient.py index d786682f..c4e26474 100644 --- a/src/hashstore/hashstoreclient.py +++ b/src/hashstore/hashstoreclient.py @@ -850,7 +850,7 @@ def main(): raise ValueError("'-path' option is required") # Store metadata to HashStore metadata_cid = hashstore_c.hashstore.store_metadata(pid, path, formatid) - print(f"Metadata ID: {metadata_cid}") + print(f"Metadata Path: {metadata_cid}") elif getattr(args, "client_retrieveobject"): if pid is None: diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 14b6adf0..05afb37d 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -847,7 +847,16 @@ def test_exists_object_with_sharded_path(pids, store): assert store._exists(entity, 
object_metadata_shard_path) -# TODO: Test exists for metadata +def test_exists_metadata_files_path(pids, store): + """Test exists works as expected for metadata.""" + test_dir = "tests/testdata/" + entity = "metadata" + format_id = "http://ns.dataone.org/service/types/v2.0" + for pid in pids.keys(): + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + metadata_cid = store.store_metadata(pid, syspath, format_id) + assert store._exists(entity, metadata_cid) def test_exists_object_with_nonexistent_file(store): diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 293fe264..0f1df1cf 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -649,6 +649,7 @@ def test_find_object_pid_empty(store): def test_store_metadata(pids, store): """Test store metadata.""" test_dir = "tests/testdata/" + entity = "metadata" format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" @@ -662,6 +663,7 @@ def test_store_metadata(pids, store): store._get_store_path("metadata") / rel_path / metadata_document_name ) assert metadata_cid == full_path + assert store._count(entity) == 3 def test_store_metadata_default_format_id(pids, store): @@ -682,28 +684,6 @@ def test_store_metadata_default_format_id(pids, store): assert metadata_cid == full_path -def test_store_metadata_files_path(pids, store): - """Test store metadata with path.""" - test_dir = "tests/testdata/" - entity = "metadata" - format_id = "http://ns.dataone.org/service/types/v2.0" - for pid in pids.keys(): - filename = pid.replace("/", "_") + ".xml" - syspath = Path(test_dir) / filename - metadata_cid = store.store_metadata(pid, syspath, format_id) - # TODO: Ensure exists works as expected for metadata - assert store._exists(entity, metadata_cid) - # Manually calculate expected path - metadata_directory = store._computehash(pid) - metadata_document_name = store._computehash(format_id) - rel_path = "/".join(store._shard(metadata_directory)) - full_path = ( - store._get_store_path("metadata") / rel_path / metadata_document_name - ) - assert metadata_cid == full_path - assert store._count(entity) == 3 - - def test_store_metadata_files_string(pids, store): """Test store metadata with string.""" test_dir = "tests/testdata/" @@ -782,7 +762,7 @@ def test_store_metadata_metadata_none(store): store.store_metadata(pid, syspath_string, format_id) -def test_store_metadata_metadata_cid(pids, store): +def test_store_metadata_metadata_path(pids, store): """Test store metadata returns expected metadata_cid.""" test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" @@ -792,7 +772,8 @@ def test_store_metadata_metadata_cid(pids, store): syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path) metadata_cid = store.store_metadata(pid, syspath, format_id) - assert metadata_cid == pids[pid]["metadata_cid"] + metadata_path = store._resolve_path("metadata", metadata_cid) + assert metadata_cid == metadata_path def test_store_metadata_thread_lock(store): diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index 3511bbe8..be4e5f7a 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -4,6 +4,8 @@ from pathlib import Path from hashstore import hashstoreclient +# pylint: disable=W0212 + def test_create_hashstore(tmp_path): """Test creating a HashStore through the client.""" @@ -135,11 +137,12 @@ def 
test_store_object(store, pids): assert store._exists("objects", pids[pid][store.algorithm]) -def test_store_metadata(store, pids): +def test_store_metadata(capsys, store, pids): """Test storing metadata to HashStore through client.""" client_directory = os.getcwd() + "/src/hashstore" test_dir = "tests/testdata/" namespace = "http://ns.dataone.org/service/types/v2.0" + entity = "metadata" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename @@ -164,7 +167,17 @@ def test_store_metadata(store, pids): sys.argv = chs_args hashstoreclient.main() - assert store._exists("metadata", pids[pid]["metadata_cid"]) + metadata_directory = store._computehash(pid) + metadata_document_name = store._computehash(namespace) + rel_path = "/".join(store._shard(metadata_directory)) + full_path = ( + store._get_store_path("metadata") / rel_path / metadata_document_name + ) + capsystext = capsys.readouterr().out + expected_output = f"Metadata Path: {full_path}\n" + assert capsystext == expected_output + + assert store._count(entity) == 3 def test_retrieve_objects(capsys, pids, store): From 89fd718482b08fd4f5539c4705e6acaa89344ab2 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 6 Feb 2024 10:12:10 -0800 Subject: [PATCH 08/45] Refactor 'retrieve_metadata' & 'delete_metadata' and add TODO item --- src/hashstore/filehashstore.py | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index acab3bb1..1cfebe39 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -755,10 +755,17 @@ def retrieve_metadata(self, pid, format_id=None): checked_format_id = self._check_arg_format_id(format_id, "retrieve_metadata") entity = "metadata" - metadata_cid = self._computehash(pid + checked_format_id) - metadata_exists = self._exists(entity, metadata_cid) + metadata_directory = self._computehash(pid) + if format_id is None: + metadata_document_name = self._computehash(self.sysmeta_ns) + else: + metadata_document_name = self._computehash(checked_format_id) + rel_path = "/".join(self._shard(metadata_directory)) + full_path_without_directory = rel_path + "/" + metadata_document_name + metadata_exists = self._exists(entity, full_path_without_directory) + if metadata_exists: - metadata_stream = self._open(entity, metadata_cid) + metadata_stream = self._open(entity, full_path_without_directory) else: exception_string = ( f"FileHashStore - retrieve_metadata: No metadata found for pid: {pid}" @@ -862,10 +869,19 @@ def delete_metadata(self, pid, format_id=None): ) self._check_string(pid, "pid", "delete_metadata") checked_format_id = self._check_arg_format_id(format_id, "delete_metadata") - + # TODO: Delete all metadata related to the given pid when format_id is None entity = "metadata" - metadata_cid = self._computehash(pid + checked_format_id) - self._delete(entity, metadata_cid) + metadata_directory = self._computehash(pid) + if format_id is None: + metadata_document_name = self._computehash(self.sysmeta_ns) + else: + metadata_document_name = self._computehash(checked_format_id) + rel_path = "/".join(self._shard(metadata_directory)) + full_path_without_directory = rel_path + "/" + metadata_document_name + metadata_exists = self._exists(entity, full_path_without_directory) + + if metadata_exists: + self._delete(entity, full_path_without_directory) logging.info( "FileHashStore - delete_metadata: Successfully deleted metadata for pid: %s", @@ -1982,7 +1998,8 @@ 
def _build_path(self, entity, hash_id, extension=""): def _resolve_path(self, entity, file): """Attempt to determine the absolute path of a file ID or path through - successive checking of candidate paths. + successive checking of candidate paths - first by checking whether the 'file' + exists, followed by checking the entity type with respect to the file. :param str entity: Desired entity type ("objects", "metadata", "cid", "pid"), where "cid" & "pid" represents resolving the path to the refs files. From bc0560ca1c558f65f589edb2cb3990f2c2b5fea2 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 6 Feb 2024 11:05:10 -0800 Subject: [PATCH 09/45] Refactor 'delete_metadata' to delete all metadata documents for a given pid if no 'format_id' is supplied, and add new pytests --- src/hashstore/filehashstore.py | 45 ++++++++++++++++++--------- tests/test_filehashstore_interface.py | 45 +++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 14 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 1cfebe39..14a45f18 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -869,25 +869,42 @@ def delete_metadata(self, pid, format_id=None): ) self._check_string(pid, "pid", "delete_metadata") checked_format_id = self._check_arg_format_id(format_id, "delete_metadata") - # TODO: Delete all metadata related to the given pid when format_id is None + # Get the metadata directory path for the given pid entity = "metadata" metadata_directory = self._computehash(pid) + rel_path = "/".join(self._shard(metadata_directory)) + metadata_rel_path = self._get_store_path("metadata") / rel_path if format_id is None: - metadata_document_name = self._computehash(self.sysmeta_ns) + # Delete all metadata files + metadata_files = os.listdir(metadata_rel_path) + metadata_file_paths = [ + metadata_rel_path / file + for file in metadata_files + if os.path.isfile(metadata_rel_path / file) + ] + for file_path in metadata_file_paths: + os.remove(file_path) + + info_string = ( + "FileHashStore - delete_metadata: Successfully deleted all metadata for pid: %s", + pid, + ) + logging.info(info_string) + return True else: + # Delete a specific metadata file metadata_document_name = self._computehash(checked_format_id) - rel_path = "/".join(self._shard(metadata_directory)) - full_path_without_directory = rel_path + "/" + metadata_document_name - metadata_exists = self._exists(entity, full_path_without_directory) - - if metadata_exists: - self._delete(entity, full_path_without_directory) - - logging.info( - "FileHashStore - delete_metadata: Successfully deleted metadata for pid: %s", - pid, - ) - return True + full_path_without_directory = rel_path + "/" + metadata_document_name + metadata_exists = self._exists(entity, full_path_without_directory) + if metadata_exists: + self._delete(entity, full_path_without_directory) + + info_string = ( + "FileHashStore - delete_metadata: Successfully deleted metadata for pid:" + + f" {pid} for format_id: {format_id}" + ) + logging.info(info_string) + return True def get_hex_digest(self, pid, algorithm): logging.debug( diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 0f1df1cf..a71a8289 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -666,6 +666,33 @@ def test_store_metadata(pids, store): assert store._count(entity) == 3 +def test_store_metadata_one_pid_multiple_metadata_documents(store): + """Test store metadata for a pid with 
multiple metadata documents.""" + test_dir = "tests/testdata/" + entity = "metadata" + pid = "jtao.1700.1" + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + metadata_directory = store._computehash(pid) + rel_path = "/".join(store._shard(metadata_directory)) + format_id = "http://ns.dataone.org/service/types/v2.0" + format_id3 = "http://ns.dataone.org/service/types/v3.0" + format_id4 = "http://ns.dataone.org/service/types/v4.0" + metadata_cid = store.store_metadata(pid, syspath, format_id) + metadata_cid3 = store.store_metadata(pid, syspath, format_id3) + metadata_cid4 = store.store_metadata(pid, syspath, format_id4) + metadata_document_name = store._computehash(format_id) + metadata_document_name3 = store._computehash(format_id3) + metadata_document_name4 = store._computehash(format_id4) + full_path = store._get_store_path("metadata") / rel_path / metadata_document_name + full_path3 = store._get_store_path("metadata") / rel_path / metadata_document_name3 + full_path4 = store._get_store_path("metadata") / rel_path / metadata_document_name4 + assert metadata_cid == full_path + assert metadata_cid3 == full_path3 + assert metadata_cid4 == full_path4 + assert store._count(entity) == 3 + + def test_store_metadata_default_format_id(pids, store): """Test store metadata returns expected id when storing with default format_id.""" test_dir = "tests/testdata/" @@ -1004,6 +1031,24 @@ def test_delete_metadata(pids, store): assert store._count(entity) == 0 +def test_delete_metadata_one_pid_multiple_metadata_documents(store): + """Test delete_metadata for a pid with multiple metadata documents deletes + all metadata files as expected.""" + test_dir = "tests/testdata/" + entity = "metadata" + pid = "jtao.1700.1" + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + format_id = "http://ns.dataone.org/service/types/v2.0" + format_id3 = "http://ns.dataone.org/service/types/v3.0" + format_id4 = "http://ns.dataone.org/service/types/v4.0" + _metadata_cid = store.store_metadata(pid, syspath, format_id) + _metadata_cid3 = store.store_metadata(pid, syspath, format_id3) + _metadata_cid4 = store.store_metadata(pid, syspath, format_id4) + store.delete_metadata(pid) + assert store._count(entity) == 0 + + def test_delete_metadata_does_not_exist(pids, store): """Test delete_metadata does not throw exception when called to delete metadata that does not exist.""" From 278314da8bb4b1039227571572741fa8d8afb698 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 6 Feb 2024 12:22:52 -0800 Subject: [PATCH 10/45] Update HashStore interface 'delete_object' and 'delete_metadata' docstrings, remove boolean return values and update pytests --- src/hashstore/filehashstore.py | 16 +++++++--------- src/hashstore/hashstore.py | 4 ---- tests/test_filehashstore_interface.py | 25 ++++++++++++++++++++----- 3 files changed, 27 insertions(+), 18 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 14a45f18..a50655aa 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -784,12 +784,11 @@ def delete_object(self, ab_id, id_type=None): ) self._check_string(ab_id, "ab_id", "delete_object") if id_type is "cid": - cid_refs_abs_path = self._resolve_path("objects", ab_id) - if os.path.exists(cid_refs_abs_path): + cid_refs_abs_path = self._resolve_path("cid", ab_id) + # If the refs file still exists, do not delete the object + if not os.path.exists(cid_refs_abs_path): self._delete("objects", ab_id) - return True - 
else: - return False + return else: # id_type is "pid" pid = ab_id @@ -849,8 +848,7 @@ def delete_object(self, ab_id, id_type=None): + f" not object with cid ({cid}), cid refs file not empty." ) logging.info(info_string) - # TODO: Check 'id_type' for 'clear' & attempt to remove all metadata docs if so - return True + return finally: # Release cid @@ -890,7 +888,7 @@ def delete_metadata(self, pid, format_id=None): pid, ) logging.info(info_string) - return True + return else: # Delete a specific metadata file metadata_document_name = self._computehash(checked_format_id) @@ -904,7 +902,7 @@ def delete_metadata(self, pid, format_id=None): + f" {pid} for format_id: {format_id}" ) logging.info(info_string) - return True + return def get_hex_digest(self, pid, algorithm): logging.debug( diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 068f9039..1e6e50c5 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -162,8 +162,6 @@ def delete_object(self, ab_id, id_type): :param str ab_id: Authority-based identifier. :param str id_type: "pid" or "Cid - - :return: bool - `True` upon successful deletion. """ raise NotImplementedError() @@ -175,8 +173,6 @@ def delete_metadata(self, pid, format_id): :param str pid: Authority-based identifier. :param str format_id: Metadata format. - - :return: bool - `True` upon successful deletion. """ raise NotImplementedError() diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index a71a8289..ef2944bf 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -990,7 +990,7 @@ def test_delete_object_cid_refs_file_with_pid_refs_remaining(pids, store): assert os.path.exists(cid_refs_file_path) -def test_delete_object_id_type_cid(pids, store): +def test_delete_object_idtype_cid(pids, store): """Test delete_object successfully deletes only object.""" test_dir = "tests/testdata/" entity = "objects" @@ -1001,6 +1001,23 @@ def test_delete_object_id_type_cid(pids, store): assert store._count(entity) == 0 +def test_delete_object_idtype_cid_refs_file_exists(pids, store): + """Test delete_object does not delete object if a cid refs file still exists.""" + test_dir = "tests/testdata/" + entity = "objects" + format_id = "http://ns.dataone.org/service/types/v2.0" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + object_metadata = store.store_object(pid, path) + _metadata_cid = store.store_metadata(pid, syspath, format_id) + store.delete_object(object_metadata.cid, "cid") + assert store._count(entity) == 3 + assert store._count("pid") == 3 + assert store._count("cid") == 3 + + def test_delete_object_pid_empty(store): """Test delete_object raises error when empty pid supplied.""" pid = " " @@ -1026,8 +1043,7 @@ def test_delete_metadata(pids, store): syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath, format_id) - is_deleted = store.delete_metadata(pid, format_id) - assert is_deleted + store.delete_metadata(pid, format_id) assert store._count(entity) == 0 @@ -1054,8 +1070,7 @@ def test_delete_metadata_does_not_exist(pids, store): metadata that does not exist.""" format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): - is_deleted = store.delete_metadata(pid, format_id) - assert is_deleted + store.delete_metadata(pid, format_id) def 
test_delete_metadata_default_format_id(store, pids): From 7755a62cb2f829378a5e61a4f2cc0ccd7e8c3227 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 6 Feb 2024 12:37:09 -0800 Subject: [PATCH 11/45] Add new static method '_get_file_paths' and refactor 'delete_metadata'; and clean up code --- src/hashstore/filehashstore.py | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index a50655aa..cfd53235 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -701,7 +701,6 @@ def store_metadata(self, pid, metadata, format_id=None): "FileHashStore - store_metadata: Attempting to store metadata for pid: %s", pid, ) - # TODO: Refactor the way we store metadata and add pytests to confirm metadata_cid = self._put_metadata(metadata, pid, checked_format_id) logging.info( @@ -874,14 +873,9 @@ def delete_metadata(self, pid, format_id=None): metadata_rel_path = self._get_store_path("metadata") / rel_path if format_id is None: # Delete all metadata files - metadata_files = os.listdir(metadata_rel_path) - metadata_file_paths = [ - metadata_rel_path / file - for file in metadata_files - if os.path.isfile(metadata_rel_path / file) - ] + metadata_file_paths = self._get_file_paths(metadata_rel_path) for file_path in metadata_file_paths: - os.remove(file_path) + self._delete(entity, file_path) info_string = ( "FileHashStore - delete_metadata: Successfully deleted all metadata for pid: %s", @@ -2075,6 +2069,30 @@ def _get_store_path(self, entity): f"entity: {entity} does not exist. Do you mean 'objects', 'metadata' or 'refs'?" ) + @staticmethod + def _get_file_paths(directory): + """Get the file paths of a given directory + + :param mixed directory: String or path to directory. + + :raises FileNotFoundError: If the directory doesn't exist + + :return: file_paths - File paths of the given directory + :rtype: List + """ + if os.path.exists(directory): + files = os.listdir(directory) + file_paths = [ + directory / file for file in files if os.path.isfile(directory / file) + ] + return file_paths + else: + err_msg = ( + "FileHashStore - _get_file_paths: Directory does not exist: %s", + directory, + ) + raise FileNotFoundError(err_msg) + def _count(self, entity): """Return the count of the number of files in the `root` directory. 
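From PATCH 06 onward, metadata documents are no longer addressed by hashing `pid + format_id`; every document for a pid lives under a directory derived from the hash of the pid, and the file name is the hash of the `format_id`. That layout is what lets the `delete_metadata` refactor above remove all of a pid's documents by listing a single directory. The following is a standalone sketch of that path computation, not HashStore code: the hash algorithm and the shard depth/width (3 tokens of 2 characters) are illustrative assumptions, since the real values come from the store configuration.

```python
# Standalone sketch of the metadata path layout used from PATCH 06 onward.
# Assumed values: sha256 hashing and a 3/2 shard depth/width.
import hashlib
from pathlib import Path


def computehash(value, algorithm="sha256"):
    """Hex digest of a string, playing the role of FileHashStore._computehash."""
    return hashlib.new(algorithm, value.encode("utf8")).hexdigest()


def shard(digest, depth=3, width=2):
    """Split a hex digest into 'depth' leading tokens of 'width' chars plus the remainder."""
    tokens = [digest[i * width : (i + 1) * width] for i in range(depth)]
    tokens.append(digest[depth * width :])
    return tokens


def metadata_document_path(store_metadata_root, pid, format_id):
    """Directory = sharded hash of the pid; document name = hash of the format_id."""
    metadata_directory = computehash(pid)
    metadata_document_name = computehash(format_id)
    rel_path = Path(*shard(metadata_directory))
    return Path(store_metadata_root) / rel_path / metadata_document_name
```

With this layout, deleting every metadata document for a pid reduces to listing the files under the pid's sharded directory, which is exactly what the new `_get_file_paths` helper supports.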
From cec0c5e8579705bdd4fac018a12a41204c8bfb11 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 6 Feb 2024 13:55:29 -0800 Subject: [PATCH 12/45] Refactor 'delete_object' to coordinate deletion of files to improve atomicity of the process, and to also delete all metadata documents when 'id_type' is pid --- src/hashstore/filehashstore.py | 101 +++++++++++++++++++++++++-------- src/hashstore/hashstore.py | 4 +- 2 files changed, 79 insertions(+), 26 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index cfd53235..94b10820 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -791,6 +791,15 @@ def delete_object(self, ab_id, id_type=None): else: # id_type is "pid" pid = ab_id + objects_to_delete = [] + rel_path = "/".join(self._shard(self._computehash(pid))) + metadata_rel_path = self._get_store_path("metadata") / rel_path + metadata_file_paths = self._get_file_paths(metadata_rel_path) + # Rename paths by appending _delete to the file name + if metadata_file_paths is not None: + for path in metadata_file_paths: + objects_to_delete.append(self._rename_path_for_deletion(path)) + try: cid = self.find_object(pid) except FileNotFoundError as fnfe: @@ -799,19 +808,46 @@ def delete_object(self, ab_id, id_type=None): return if "cid refs file not found" in fnfe: # Delete pid refs file - self._delete("pid", pid) + objects_to_delete.append( + self._rename_path_for_deletion(self._resolve_path("pid", pid)) + ) + # Remove all files confirmed for deletion + for obj in objects_to_delete: + os.remove(obj) return if "object referenced does not exist" in fnfe: # Delete pid refs file + pid_ref_abs_path = self._resolve_path("pid", pid) + # Add pid refs file to delete + objects_to_delete.append( + self._rename_path_for_deletion(pid_ref_abs_path) + ) + # Remove pid from cid refs file + # Retrieve the cid from the pid refs file + with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: + pid_refs_cid = pid_ref_file.read() + cid_ref_abs_path = self._resolve_path("cid", pid_refs_cid) + # Remove if the pid refs is found + if self._is_pid_in_cid_refs_file(pid, cid_ref_abs_path): + self._delete_cid_refs_pid(cid_ref_abs_path, pid) + # Remove all files confirmed for deletion + for obj in objects_to_delete: + os.remove(obj) self._delete("pid", pid) return except ValueError as ve: if "is missing from cid refs file" in ve: # Delete pid refs file - self._delete("pid", pid) + pid_ref_abs_path = self._resolve_path("pid", pid) + objects_to_delete.append( + self._rename_path_for_deletion(pid_ref_abs_path) + ) + # Remove all files confirmed for deletion + for obj in objects_to_delete: + os.remove(obj) return - # Proceed with next steps - cid has been retrieved without any errors + # Proceed with next steps - cid has been retrieved without any issues while cid in self.reference_locked_cids: logging.debug( "FileHashStore - delete_object: (cid) %s is currently locked. 
Waiting", @@ -829,24 +865,29 @@ def delete_object(self, ab_id, id_type=None): cid_ref_abs_path = self._resolve_path("cid", cid) pid_ref_abs_path = self._resolve_path("pid", pid) # First delete the pid refs file immediately - self._delete_pid_refs_file(pid_ref_abs_path) + objects_to_delete.append( + self._rename_path_for_deletion(pid_ref_abs_path) + ) # Remove pid from cid reference file self._delete_cid_refs_pid(cid_ref_abs_path, pid) # Delete cid reference file and object only if the cid refs file is empty if os.path.getsize(cid_ref_abs_path) == 0: - self._delete("cid", cid_ref_abs_path) - self._delete("objects", cid) - info_string = ( - "FileHashStore - delete_object: Successfully deleted references and" - + f" object associated with pid: {pid}" + objects_to_delete.append( + self._rename_path_for_deletion(cid_ref_abs_path) ) - logging.info(info_string) - else: - info_string = ( - "FileHashStore - delete_object: Successfully deleted pid refs file but" - + f" not object with cid ({cid}), cid refs file not empty." + obj_real_path = self._resolve_path("objects", cid) + objects_to_delete.append( + self._rename_path_for_deletion(obj_real_path) ) - logging.info(info_string) + # Remove all files confirmed for deletion + for obj in objects_to_delete: + os.remove(obj) + + info_string = ( + "FileHashStore - delete_object: Successfully deleted references, metadata and" + + f" object associated with pid: {pid}" + ) + logging.info(info_string) return finally: @@ -874,8 +915,9 @@ def delete_metadata(self, pid, format_id=None): if format_id is None: # Delete all metadata files metadata_file_paths = self._get_file_paths(metadata_rel_path) - for file_path in metadata_file_paths: - self._delete(entity, file_path) + if metadata_file_paths is not None: + for file_path in metadata_file_paths: + self._delete(entity, file_path) info_string = ( "FileHashStore - delete_metadata: Successfully deleted all metadata for pid: %s", @@ -1941,6 +1983,21 @@ def _delete(self, entity, file): logging.error(exception_string) raise err + @staticmethod + def _rename_path_for_deletion(path): + """Move and rename a given path by appending '_delete' to the file name + + :param Path path: Path to file to rename + + :return: Path to the renamed file + :rtype: str + """ + if isinstance(path, str): + path = Path(path) + delete_path = path.with_name(path.stem + "_delete" + path.suffix) + shutil.move(path, delete_path) + return delete_path + def _remove_empty(self, subpath): """Successively remove all empty folders starting with `subpath` and proceeding "up" through directory tree until reaching the `root` @@ -2071,13 +2128,13 @@ def _get_store_path(self, entity): @staticmethod def _get_file_paths(directory): - """Get the file paths of a given directory + """Get the file paths of a given directory if it exists :param mixed directory: String or path to directory. :raises FileNotFoundError: If the directory doesn't exist - :return: file_paths - File paths of the given directory + :return: file_paths - File paths of the given directory or None if directory doesn't exist :rtype: List """ if os.path.exists(directory): @@ -2087,11 +2144,7 @@ def _get_file_paths(directory): ] return file_paths else: - err_msg = ( - "FileHashStore - _get_file_paths: Directory does not exist: %s", - directory, - ) - raise FileNotFoundError(err_msg) + return None def _count(self, entity): """Return the count of the number of files in the `root` directory. 
diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 1e6e50c5..b94762a3 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -168,8 +168,8 @@ def delete_object(self, ab_id, id_type): @abstractmethod def delete_metadata(self, pid, format_id): """Deletes a metadata document (ex. `sysmeta`) permanently from HashStore using a given - persistent identifier and its respective metadata namespace. If a `format_id` is supplied, - only the metadata document associated with the `format_id` will be deleted. + persistent identifier (`pid`) and format_id (metadata namespace). If a `format_id` is + not supplied, all metadata documents associated with the given `pid` will be deleted. :param str pid: Authority-based identifier. :param str format_id: Metadata format. From caa522bba3189aa22e61634cb20317800892b868 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 6 Feb 2024 14:05:00 -0800 Subject: [PATCH 13/45] Refactor 'delete_metadata' to be more atomic --- src/hashstore/filehashstore.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 94b10820..aa6ece13 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -914,10 +914,13 @@ def delete_metadata(self, pid, format_id=None): metadata_rel_path = self._get_store_path("metadata") / rel_path if format_id is None: # Delete all metadata files + objects_to_delete = [] metadata_file_paths = self._get_file_paths(metadata_rel_path) if metadata_file_paths is not None: - for file_path in metadata_file_paths: - self._delete(entity, file_path) + for path in metadata_file_paths: + objects_to_delete.append(self._rename_path_for_deletion(path)) + for obj in objects_to_delete: + os.remove(obj) info_string = ( "FileHashStore - delete_metadata: Successfully deleted all metadata for pid: %s", From 4707283669871fcf35d9dda49552abffd14b8bbf Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 6 Feb 2024 14:22:11 -0800 Subject: [PATCH 14/45] Merge methods '_write_cid_refs_file' and '_write_pid_refs_file' into one method '_write_refs_file' and update pytests --- src/hashstore/filehashstore.py | 74 ++++++++------------------ tests/test_filehashstore_references.py | 42 +++++++-------- 2 files changed, 44 insertions(+), 72 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index aa6ece13..ec0767d2 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -578,7 +578,7 @@ def tag_object(self, pid, cid): raise FileExistsError(exception_string) elif os.path.exists(cid_ref_abs_path): # Create the pid refs file - pid_tmp_file_path = self._write_pid_refs_file(tmp_root_path, cid) + pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") self._create_path(os.path.dirname(pid_ref_abs_path)) shutil.move(pid_tmp_file_path, pid_ref_abs_path) # Update cid ref files as it already exists @@ -594,8 +594,8 @@ def tag_object(self, pid, cid): else: # All ref files begin as tmp files and get moved sequentially at once # Get tmp files with the expected cid and pid refs content - pid_tmp_file_path = self._write_pid_refs_file(tmp_root_path, cid) - cid_tmp_file_path = self._write_cid_refs_file(tmp_root_path, pid) + pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") + cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid") # Create paths for pid ref file in '.../refs/pid' and cid ref file in '.../refs/cid' 
self._create_path(os.path.dirname(pid_ref_abs_path)) self._create_path(os.path.dirname(cid_ref_abs_path)) @@ -1340,35 +1340,37 @@ def delete_tmp_file(): # TODO: Clean up refs file methods, a lot of redundant code - def _write_cid_refs_file(self, path, pid): - """Write the CID reference file in the supplied path to a file. A reference file - contains every PID that references a CID, each on its own line. This method will - only write into an empty file and will not overwrite an existing one. + def _write_refs_file(self, path, ref_id, ref_type): + """Write a reference file in the supplied path into a temporary file. + All `pid` or `cid` reference files begin with a single identifier, with the + primary difference being that a cid reference file can contain multiple lines + of `pid`s that reference the `cid`. - :param str path: Path of the file to be written into. - :param str pid: Authority-based or persistent identifier of the object. + :param str path: Directory to write the temporary file + :param str ref_id: Authority-based, persistent or content identifier - :return: cid_tmp_file_path - Path to the cid tmp file + :return: tmp_file_path - Path to the tmp refs file :rtype: string """ logging.debug( - "FileHashStore - write_cid_refs_file: Writing pid (%s) into file: %s", - pid, + "FileHashStore - write_cid_refs_file: Writing id (%s) into file: %s", + ref_id, path, ) try: - with self._mktmpfile(path) as cid_tmp_file: - cid_tmp_file_path = cid_tmp_file.name - with open(cid_tmp_file_path, "w", encoding="utf8") as tmp_cid_ref_file: - tmp_cid_ref_file.write(pid + "\n") - # Ensure that file is immediately written to and not held in memory - # tmp_cid_ref_file.flush() - return cid_tmp_file_path + with self._mktmpfile(path) as tmp_file: + tmp_file_path = tmp_file.name + with open(tmp_file_path, "w", encoding="utf8") as tmp_cid_ref_file: + if ref_type is "cid": + tmp_cid_ref_file.write(ref_id + "\n") + if ref_type is "pid": + tmp_cid_ref_file.write(ref_id) + return tmp_file_path except Exception as err: exception_string = ( - "FileHashStore - write_cid_refs_file: failed to write cid refs file for pid:" - + f" {pid} into path: {path}. Unexpected {err=}, {type(err)=}" + "FileHashStore - _write_refs_file: failed to write cid refs file for pid:" + + f" {ref_id} into path: {path}. Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) raise err @@ -1464,36 +1466,6 @@ def _delete_cid_refs_pid(cid_ref_abs_path, pid): logging.error(exception_string) raise err - def _write_pid_refs_file(self, path, cid): - """Generate a tmp pid refs file into the given path for the given CID (content - identifier). A reference file for a PID contains the CID that it references. - - :param str path: Path of the file to be written into. - :param str cid: Content identifier. - - :return: pid_tmp_file_path - :rtype: string - """ - logging.debug( - "FileHashStore - _write_pid_refs_file: Writing cid (%s) into file: %s", - cid, - path, - ) - try: - with self._mktmpfile(path) as pid_tmp_file: - pid_tmp_file_path = pid_tmp_file.name - with open(pid_tmp_file_path, "w", encoding="utf8") as pid_ref_file: - pid_ref_file.write(cid) - return pid_tmp_file_path - - except Exception as err: - exception_string = ( - f"FileHashStore - _write_pid_refs_file: failed to write cid ({cid})" - + f" into pid refs file: {path}. Unexpected {err=}, {type(err)=}" - ) - logging.error(exception_string) - raise err - def _delete_pid_refs_file(self, pid_ref_abs_path): """Delete a PID reference file. 
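The merged writer above differs between the two reference types only in whether a trailing newline is written, since a cid refs file accumulates one pid per line while a pid refs file holds a single cid. A rough standalone sketch of that behaviour (illustrative function name, not the class method):

    import tempfile

    def write_refs_file(directory, ref_id, ref_type):
        # Write ref_id into a new temporary file under 'directory' and return its path.
        # A "cid" refs file gets a trailing newline so further pids can be appended;
        # a "pid" refs file holds the bare cid.
        with tempfile.NamedTemporaryFile(
            "w", dir=directory, delete=False, encoding="utf8"
        ) as tmp_file:
            tmp_file.write(ref_id + "\n" if ref_type == "cid" else ref_id)
            return tmp_file.name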
diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 92f5ae59..08d0c79a 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -247,18 +247,18 @@ def test_verify_object_exception_incorrect_checksum_algo(pids, store): store.verify_object(object_metadata, checksum, "md2", expected_file_size) -def test_write_cid_refs_file(store): +def test_write_refs_file_cid(store): """Test that write_cid_reference writes a reference file.""" tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, "test_pid") + tmp_cid_refs_file = store._write_refs_file(tmp_root_path, "test_pid", "cid") assert os.path.exists(tmp_cid_refs_file) -def test_write_cid_refs_file_content(pids, store): +def test_write_refs_file_cid_content(pids, store): """Test that write_cid_ref_file writes the expected content.""" for pid in pids.keys(): tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) + tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") with open(tmp_cid_refs_file, "r", encoding="utf8") as f: cid_ref_file_pid = f.read() @@ -269,7 +269,7 @@ def test_update_cid_refs_content(pids, store): """Test that update_cid_ref updates the ref file as expected.""" for pid in pids.keys(): tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) + tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") pid_other = "dou.test.1" store._update_cid_refs(tmp_cid_refs_file, pid_other) @@ -283,7 +283,7 @@ def test_update_cid_refs_content_multiple(pids, store): """Test that update_cid_refs adds multiple references successfully.""" for pid in pids.keys(): tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) + tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") cid_reference_list = [pid] for i in range(0, 5): @@ -305,7 +305,7 @@ def test_update_cid_refs_content_pid_exists(pids, store): and proceeds to complete the tagging process (verify_object)""" for pid in pids.keys(): tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) + tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") # Exception should not be thrown store._update_cid_refs(tmp_cid_refs_file, pid) @@ -323,7 +323,7 @@ def test_delete_cid_refs_pid(pids, store): """Test that delete_cid_refs_pid deletes the given pid from the ref file.""" for pid in pids.keys(): tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) + tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") pid_other = "dou.test.1" store._update_cid_refs(tmp_cid_refs_file, pid_other) @@ -340,7 +340,7 @@ def test_delete_cid_refs_pid_file(pids, store): """Test that delete_cid_refs_pid leaves a file empty when removing the last pid.""" for pid in pids.keys(): tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) + tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") # First remove the pid store._delete_cid_refs_pid(tmp_cid_refs_file, pid) @@ -348,21 +348,21 @@ def test_delete_cid_refs_pid_file(pids, store): assert os.path.getsize(tmp_cid_refs_file) == 0 -def 
test_write_pid_refs_file(pids, store): +def test_write_refs_file_pid(pids, store): """Test that write_pid_refs_file writes a reference file.""" for pid in pids.keys(): cid = pids[pid]["sha256"] tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, cid) + tmp_pid_refs_file = store._write_refs_file(tmp_root_path, cid, "pid") assert os.path.exists(tmp_pid_refs_file) -def test_write_pid_refs_file_content(pids, store): +def test_write_refs_file_content_pid(pids, store): """Test that write_pid_refs_file writes the expected content.""" for pid in pids.keys(): cid = pids[pid]["sha256"] tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, cid) + tmp_pid_refs_file = store._write_refs_file(tmp_root_path, cid, "pid") with open(tmp_pid_refs_file, "r", encoding="utf8") as f: pid_refs_cid = f.read() @@ -374,7 +374,7 @@ def test_delete_pid_refs_file(pids, store): for pid in pids.keys(): cid = pids[pid]["sha256"] tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, cid) + tmp_pid_refs_file = store._write_refs_file(tmp_root_path, cid, "pid") store._delete_pid_refs_file(tmp_pid_refs_file) assert not os.path.exists(tmp_pid_refs_file) @@ -402,7 +402,7 @@ def test_verify_hashstore_references_pid_refs_incorrect_cid(pids, store): cid = pids[pid]["sha256"] # Write the cid refs file and move it where it needs to be tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) + tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") cid_ref_abs_path = store._resolve_path("cid", cid) store._create_path(os.path.dirname(cid_ref_abs_path)) shutil.move(tmp_cid_refs_file, cid_ref_abs_path) @@ -410,7 +410,7 @@ def test_verify_hashstore_references_pid_refs_incorrect_cid(pids, store): pid_ref_abs_path = store._resolve_path("pid", pid) store._create_path(os.path.dirname(pid_ref_abs_path)) tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, "bad_cid") + tmp_pid_refs_file = store._write_refs_file(tmp_root_path, "bad_cid", "pid") shutil.move(tmp_pid_refs_file, pid_ref_abs_path) with pytest.raises(ValueError): @@ -424,7 +424,7 @@ def test_verify_hashstore_references_cid_refs_file_missing(pids, store): pid_ref_abs_path = store._resolve_path("pid", pid) store._create_path(os.path.dirname(pid_ref_abs_path)) tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, "bad_cid") + tmp_pid_refs_file = store._write_refs_file(tmp_root_path, "bad_cid", "pid") shutil.move(tmp_pid_refs_file, pid_ref_abs_path) with pytest.raises(FileNotFoundError): @@ -438,7 +438,7 @@ def test_verify_hashstore_references_cid_refs_file_missing_pid(pids, store): cid = pids[pid]["sha256"] # Get a tmp cid refs file and write the wrong pid into it tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, "bad pid") + tmp_cid_refs_file = store._write_refs_file(tmp_root_path, "bad pid", "cid") cid_ref_abs_path = store._resolve_path("cid", cid) store._create_path(os.path.dirname(cid_ref_abs_path)) shutil.move(tmp_cid_refs_file, cid_ref_abs_path) @@ -446,7 +446,7 @@ def test_verify_hashstore_references_cid_refs_file_missing_pid(pids, store): pid_ref_abs_path = store._resolve_path("pid", pid) 
store._create_path(os.path.dirname(pid_ref_abs_path)) tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, cid) + tmp_pid_refs_file = store._write_refs_file(tmp_root_path, cid, "pid") shutil.move(tmp_pid_refs_file, pid_ref_abs_path) with pytest.raises(ValueError): @@ -462,7 +462,7 @@ def test_verify_hashstore_references_cid_refs_file_with_multiple_refs_missing_pi cid = pids[pid]["sha256"] # Write the wrong pid into a cid refs file and move it where it needs to be tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, "bad pid") + tmp_cid_refs_file = store._write_refs_file(tmp_root_path, "bad pid", "cid") cid_ref_abs_path = store._resolve_path("cid", cid) store._create_path(os.path.dirname(cid_ref_abs_path)) shutil.move(tmp_cid_refs_file, cid_ref_abs_path) @@ -470,7 +470,7 @@ def test_verify_hashstore_references_cid_refs_file_with_multiple_refs_missing_pi pid_ref_abs_path = store._resolve_path("pid", pid) store._create_path(os.path.dirname(pid_ref_abs_path)) tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, cid) + tmp_pid_refs_file = store._write_refs_file(tmp_root_path, cid, "pid") shutil.move(tmp_pid_refs_file, pid_ref_abs_path) cid_reference_list = [pid] From 1bb5dc5fc2a77275c2819124e0e01ffbe69660f4 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 6 Feb 2024 14:23:49 -0800 Subject: [PATCH 15/45] Remove redundant '_delete_pid_refs_file' method and update pytests --- src/hashstore/filehashstore.py | 28 -------------------------- tests/test_filehashstore_references.py | 19 ----------------- 2 files changed, 47 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index ec0767d2..b5d96bb6 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1466,34 +1466,6 @@ def _delete_cid_refs_pid(cid_ref_abs_path, pid): logging.error(exception_string) raise err - def _delete_pid_refs_file(self, pid_ref_abs_path): - """Delete a PID reference file. - - :param str pid_ref_abs_path: Absolute path to the PID reference file. - """ - logging.debug( - "FileHashStore - _delete_pid_refs_file: Deleting reference file: %s", - pid_ref_abs_path, - ) - - try: - if not os.path.exists(pid_ref_abs_path): - err_msg = ( - "FileHashStore - _delete_pid_refs_file: pid reference file not found: %s", - pid_ref_abs_path, - ) - raise FileNotFoundError(err_msg) - else: - self._delete("pid", pid_ref_abs_path) - - except Exception as err: - exception_string = ( - "FileHashStore - _delete_pid_refs_file: failed to delete pid refs file:" - + f" {pid_ref_abs_path}. Unexpected {err=}, {type(err)=}" - ) - logging.error(exception_string) - raise err - def _put_metadata(self, metadata, pid, format_id): """Store contents of metadata to `[self.root]/metadata` using the hash of the given PID and format ID as the permanent address. 
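The permanent address mentioned here comes from hashing an identifier and sharding the hex digest into nested directories. A small sketch of that mapping, assuming SHA-256 and a shard depth of 3 with width 2 (both values are store configuration, so this is an example layout rather than the only one):

    import hashlib

    def shard(digest, depth=3, width=2):
        # Split a hex digest into 'depth' leading tokens of 'width' characters
        # plus the remainder, e.g. "abcdef123..." -> ["ab", "cd", "ef", "123..."].
        tokens = [digest[i * width:(i + 1) * width] for i in range(depth)]
        tokens.append(digest[depth * width:])
        return tokens

    pid = "jtao.1700.1"
    digest = hashlib.sha256(pid.encode("utf8")).hexdigest()
    metadata_rel_path = "/".join(shard(digest))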
diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 08d0c79a..ef2cecb4 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -369,25 +369,6 @@ def test_write_refs_file_content_pid(pids, store): assert cid == pid_refs_cid -def test_delete_pid_refs_file(pids, store): - """Test that delete_pid_refs_file deletes a reference file.""" - for pid in pids.keys(): - cid = pids[pid]["sha256"] - tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_pid_refs_file = store._write_refs_file(tmp_root_path, cid, "pid") - store._delete_pid_refs_file(tmp_pid_refs_file) - - assert not os.path.exists(tmp_pid_refs_file) - - -def test_delete_pid_refs_file_file_not_found(pids, store): - """Test that delete_pid_refs_file raises an exception when refs file not found.""" - for pid in pids.keys(): - pid_ref_abs_path = store._resolve_path("pid", pid) - with pytest.raises(FileNotFoundError): - store._delete_pid_refs_file(pid_ref_abs_path) - - def test_verify_hashstore_references_pid_refs_file_missing(pids, store): """Test _verify_hashstore_references throws exception when pid refs file is missing.""" for pid in pids.keys(): From 0c38cc77084f3eadb0e89e71173ba588522f6a16 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 6 Feb 2024 14:55:18 -0800 Subject: [PATCH 16/45] Merge methods '_delete_cid_refs_pid' and '_update_cid_refs' into one method '_update_refs_file' and update pytests --- src/hashstore/filehashstore.py | 98 ++++++++++---------------- tests/test_filehashstore_interface.py | 2 +- tests/test_filehashstore_references.py | 32 ++++----- 3 files changed, 54 insertions(+), 78 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index b5d96bb6..d02ab961 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -583,7 +583,7 @@ def tag_object(self, pid, cid): shutil.move(pid_tmp_file_path, pid_ref_abs_path) # Update cid ref files as it already exists if not self._is_pid_in_cid_refs_file(pid, cid_ref_abs_path): - self._update_cid_refs(cid_ref_abs_path, pid) + self._update_refs_file(cid_ref_abs_path, pid, "add") self._verify_hashstore_references(pid, cid, "update") logging.info( "FileHashStore - tag_object: Successfully updated cid: %s with pid: %s", @@ -829,7 +829,7 @@ def delete_object(self, ab_id, id_type=None): cid_ref_abs_path = self._resolve_path("cid", pid_refs_cid) # Remove if the pid refs is found if self._is_pid_in_cid_refs_file(pid, cid_ref_abs_path): - self._delete_cid_refs_pid(cid_ref_abs_path, pid) + self._update_refs_file(cid_ref_abs_path, pid, "remove") # Remove all files confirmed for deletion for obj in objects_to_delete: os.remove(obj) @@ -869,7 +869,7 @@ def delete_object(self, ab_id, id_type=None): self._rename_path_for_deletion(pid_ref_abs_path) ) # Remove pid from cid reference file - self._delete_cid_refs_pid(cid_ref_abs_path, pid) + self._update_refs_file(cid_ref_abs_path, pid, "remove") # Delete cid reference file and object only if the cid refs file is empty if os.path.getsize(cid_ref_abs_path) == 0: objects_to_delete.append( @@ -1393,75 +1393,51 @@ def _is_pid_in_cid_refs_file(self, pid, cid_ref_abs_path): return False @staticmethod - def _update_cid_refs(cid_ref_abs_path, pid): - """Update an existing CID reference file with the given PID. + def _update_refs_file(refs_file_path, ref_id, update_type): + """Add or remove an existing ref from a refs file. - :param str cid_ref_abs_path: Absolute path to the CID reference file. 
- :param str pid: Authority-based or persistent identifier of the object. + :param str refs_file_path: Absolute path to the refs file. + :param str ref_id: Authority-based or persistent identifier of the object. + :param str update_type: 'add' or 'remove' """ - logging.debug( - "FileHashStore - update_cid_refs: Adding pid (%s) into cid reference file: %s", - pid, - cid_ref_abs_path, + debug_msg = ( + f"FileHashStore - _update_refs_file: Updating ({update_type}) for ref_id: {ref_id}" + + f" at refs file: {refs_file_path}." ) - if not os.path.exists(cid_ref_abs_path): + logging.debug(debug_msg) + if not os.path.exists(refs_file_path): exception_string = ( - f"FileHashStore - update_cid_refs: {cid_ref_abs_path} does not exist." - + f" Cannot write pid: {[pid]}" + f"FileHashStore - _update_refs_file: {refs_file_path} does not exist." + + f" Cannot {update_type} ref_id: {ref_id}" ) logging.error(exception_string) raise FileNotFoundError(exception_string) try: - with open(cid_ref_abs_path, "a", encoding="utf8") as cid_ref_file: - # Lock file for the shortest amount of time possible - file_descriptor = cid_ref_file.fileno() - fcntl.flock(file_descriptor, fcntl.LOCK_EX) - cid_ref_file.write(pid + "\n") - # The context manager will take care of releasing the lock - # But the code to explicitly release the lock if desired is below - # fcntl.flock(f, fcntl.LOCK_UN) - except Exception as err: - exception_string = ( - "FileHashStore - update_cid_refs: failed to update reference for cid:" - + f" {cid_ref_abs_path} for pid: {pid}. Unexpected {err=}, {type(err)=}" - ) - logging.error(exception_string) - raise err - - @staticmethod - def _delete_cid_refs_pid(cid_ref_abs_path, pid): - """Delete a PID from a CID reference file. - - :param str cid_ref_abs_path: Absolute path to the CID reference file. - :param str pid: Authority-based or persistent identifier of the object. 
- """ - logging.debug( - "FileHashStore - _delete_cid_refs_pid: Deleting pid (%s) from cid reference file: %s", - pid, - cid_ref_abs_path, - ) - try: - with open(cid_ref_abs_path, "r+", encoding="utf8") as cid_ref_file: - # Lock file immediately, this process needs to complete - # before any others read/modify the content of cid_ref_file - file_descriptor = cid_ref_file.fileno() - fcntl.flock(file_descriptor, fcntl.LOCK_EX) - new_pid_lines = [ - cid_pid_line - for cid_pid_line in cid_ref_file.readlines() - if cid_pid_line.strip() != pid - ] - cid_ref_file.seek(0) - cid_ref_file.writelines(new_pid_lines) - cid_ref_file.truncate() - # The context manager will take care of releasing the lock - # But the code to explicitly release the lock if desired is below - # fcntl.flock(f, fcntl.LOCK_UN) + if update_type is "add": + with open(refs_file_path, "a", encoding="utf8") as ref_file: + # Lock file for the shortest amount of time possible + file_descriptor = ref_file.fileno() + fcntl.flock(file_descriptor, fcntl.LOCK_EX) + ref_file.write(ref_id + "\n") + if update_type is "remove": + with open(refs_file_path, "r+", encoding="utf8") as ref_file: + # Lock file immediately, this process needs to complete + # before any others read/modify the content of resf file + file_descriptor = ref_file.fileno() + fcntl.flock(file_descriptor, fcntl.LOCK_EX) + new_pid_lines = [ + cid_pid_line + for cid_pid_line in ref_file.readlines() + if cid_pid_line.strip() != ref_id + ] + ref_file.seek(0) + ref_file.writelines(new_pid_lines) + ref_file.truncate() except Exception as err: exception_string = ( - "FileHashStore - _delete_cid_refs_pid: failed to remove pid from cid refs file:" - + f" {cid_ref_abs_path} for pid: {pid}. Unexpected {err=}, {type(err)=}" + f"FileHashStore - _update_refs_file: failed to {update_type} for ref_id: {ref_id}" + + f" at refs file: {refs_file_path}. 
Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) raise err diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index ef2944bf..c165cec8 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -984,7 +984,7 @@ def test_delete_object_cid_refs_file_with_pid_refs_remaining(pids, store): cid = object_metadata.cid cid_refs_abs_path = store._resolve_path("cid", cid) # pylint: disable=W0212 - store._update_cid_refs(cid_refs_abs_path, "dou.test.1") + store._update_refs_file(cid_refs_abs_path, "dou.test.1", "add") store.delete_object(pid) cid_refs_file_path = store._resolve_path("cid", cid) assert os.path.exists(cid_refs_file_path) diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index ef2cecb4..28c9a08d 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -97,7 +97,7 @@ def test_tag_object_cid_refs_file_exists(pids, store): assert not os.path.exists(second_cid_hash) -def test_tag_object_cid_refs_update_cid_refs_updated(store): +def test_tag_object_cid_refs_update_refs_file_updated(store): """Test tag object updates a cid reference file that already exists.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" @@ -150,7 +150,7 @@ def test_tag_object_cid_refs_update_pid_found_but_file_missing(store): # Manually update the cid refs, pid refs file missing at this point additional_pid = "dou.test.1" cid_ref_abs_path = store._resolve_path("cid", cid) - store._update_cid_refs(cid_ref_abs_path, additional_pid) + store._update_refs_file(cid_ref_abs_path, additional_pid, "add") # Confirm the pid refs file is missing pid_refs_file_path = store._resolve_path("pid", additional_pid) @@ -265,13 +265,13 @@ def test_write_refs_file_cid_content(pids, store): assert pid == cid_ref_file_pid.strip() -def test_update_cid_refs_content(pids, store): +def test_update_refs_file_content(pids, store): """Test that update_cid_ref updates the ref file as expected.""" for pid in pids.keys(): tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") pid_other = "dou.test.1" - store._update_cid_refs(tmp_cid_refs_file, pid_other) + store._update_refs_file(tmp_cid_refs_file, pid_other, "add") with open(tmp_cid_refs_file, "r", encoding="utf8") as f: for _, line in enumerate(f, start=1): @@ -279,7 +279,7 @@ def test_update_cid_refs_content(pids, store): assert value == pid or value == pid_other -def test_update_cid_refs_content_multiple(pids, store): +def test_update_refs_file_content_multiple(pids, store): """Test that update_cid_refs adds multiple references successfully.""" for pid in pids.keys(): tmp_root_path = store._get_store_path("refs") / "tmp" @@ -287,7 +287,7 @@ def test_update_cid_refs_content_multiple(pids, store): cid_reference_list = [pid] for i in range(0, 5): - store._update_cid_refs(tmp_cid_refs_file, f"dou.test.{i}") + store._update_refs_file(tmp_cid_refs_file, f"dou.test.{i}", "add") cid_reference_list.append(f"dou.test.{i}") line_count = 0 @@ -300,34 +300,34 @@ def test_update_cid_refs_content_multiple(pids, store): assert line_count == 6 -def test_update_cid_refs_content_pid_exists(pids, store): +def test_update_refs_file_content_pid_exists(pids, store): """Test that update_cid_ref does not throw exception if pid already exists and proceeds to complete the tagging process (verify_object)""" for pid in pids.keys(): tmp_root_path = store._get_store_path("refs") / 
"tmp" tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") # Exception should not be thrown - store._update_cid_refs(tmp_cid_refs_file, pid) + store._update_refs_file(tmp_cid_refs_file, pid, "add") -def test_update_cid_refs_content_cid_refs_does_not_exist(pids, store): +def test_update_refs_file_content_cid_refs_does_not_exist(pids, store): """Test that update_cid_ref throws exception if cid refs file doesn't exist.""" for pid in pids.keys(): cid = pids[pid]["sha256"] cid_ref_abs_path = store._resolve_path("cid", cid) with pytest.raises(FileNotFoundError): - store._update_cid_refs(cid_ref_abs_path, pid) + store._update_refs_file(cid_ref_abs_path, pid, "add") -def test_delete_cid_refs_pid(pids, store): +def test_update_refs_file_remove(pids, store): """Test that delete_cid_refs_pid deletes the given pid from the ref file.""" for pid in pids.keys(): tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") pid_other = "dou.test.1" - store._update_cid_refs(tmp_cid_refs_file, pid_other) - store._delete_cid_refs_pid(tmp_cid_refs_file, pid) + store._update_refs_file(tmp_cid_refs_file, pid_other, "add") + store._update_refs_file(tmp_cid_refs_file, pid, "remove") with open(tmp_cid_refs_file, "r", encoding="utf8") as f: for _, line in enumerate(f, start=1): @@ -336,13 +336,13 @@ def test_delete_cid_refs_pid(pids, store): assert value == pid_other -def test_delete_cid_refs_pid_file(pids, store): +def test_update_refs_file_remove_file(pids, store): """Test that delete_cid_refs_pid leaves a file empty when removing the last pid.""" for pid in pids.keys(): tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") # First remove the pid - store._delete_cid_refs_pid(tmp_cid_refs_file, pid) + store._update_refs_file(tmp_cid_refs_file, pid, "remove") assert os.path.exists(tmp_cid_refs_file) assert os.path.getsize(tmp_cid_refs_file) == 0 @@ -456,7 +456,7 @@ def test_verify_hashstore_references_cid_refs_file_with_multiple_refs_missing_pi cid_reference_list = [pid] for i in range(0, 5): - store._update_cid_refs(cid_ref_abs_path, f"dou.test.{i}") + store._update_refs_file(cid_ref_abs_path, f"dou.test.{i}", "add") cid_reference_list.append(f"dou.test.{i}") with pytest.raises(ValueError): From 5cd09082eab9c9ecd3a07a9f2f0d9117b412cd49 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 6 Feb 2024 14:59:00 -0800 Subject: [PATCH 17/45] Rename '_is_pid_in_cid_refs_file' to '_is_string_in_refs_file' --- src/hashstore/filehashstore.py | 43 +++++++++++++++++----------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index d02ab961..7584e873 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -582,7 +582,7 @@ def tag_object(self, pid, cid): self._create_path(os.path.dirname(pid_ref_abs_path)) shutil.move(pid_tmp_file_path, pid_ref_abs_path) # Update cid ref files as it already exists - if not self._is_pid_in_cid_refs_file(pid, cid_ref_abs_path): + if not self._is_string_in_refs_file(pid, cid_ref_abs_path): self._update_refs_file(cid_ref_abs_path, pid, "add") self._verify_hashstore_references(pid, cid, "update") logging.info( @@ -637,7 +637,7 @@ def find_object(self, pid): cid_ref_abs_path = self._resolve_path("cid", pid_refs_cid) if os.path.exists(cid_ref_abs_path): # Check that the pid is actually found in the cid reference file - if 
self._is_pid_in_cid_refs_file(pid, cid_ref_abs_path): + if self._is_string_in_refs_file(pid, cid_ref_abs_path): # Object must also exist in order to return the cid retrieved if not self._exists("objects", pid_refs_cid): err_msg = ( @@ -828,7 +828,7 @@ def delete_object(self, ab_id, id_type=None): pid_refs_cid = pid_ref_file.read() cid_ref_abs_path = self._resolve_path("cid", pid_refs_cid) # Remove if the pid refs is found - if self._is_pid_in_cid_refs_file(pid, cid_ref_abs_path): + if self._is_string_in_refs_file(pid, cid_ref_abs_path): self._update_refs_file(cid_ref_abs_path, pid, "remove") # Remove all files confirmed for deletion for obj in objects_to_delete: @@ -1375,23 +1375,6 @@ def _write_refs_file(self, path, ref_id, ref_type): logging.error(exception_string) raise err - def _is_pid_in_cid_refs_file(self, pid, cid_ref_abs_path): - """Check a cid reference file for a pid. - - :param str pid: Authority-based or persistent identifier of the object. - :param str cid_ref_abs_path: Path to the cid refs file - - :return: pid_found - :rtype: boolean - """ - with open(cid_ref_abs_path, "r", encoding="utf8") as cid_ref_file: - # Confirm that pid is not currently already tagged - for line in cid_ref_file: - value = line.strip() - if pid == value: - return True - return False - @staticmethod def _update_refs_file(refs_file_path, ref_id, update_type): """Add or remove an existing ref from a refs file. @@ -1442,6 +1425,24 @@ def _update_refs_file(refs_file_path, ref_id, update_type): logging.error(exception_string) raise err + @staticmethod + def _is_string_in_refs_file(ref_id, refs_file_path): + """Check a reference file for a ref_id (`cid` or `pid`). + + :param str pid: Authority-based, persistent identifier or content identifier + :param str refs_file_path: Path to the refs file + + :return: pid_found + :rtype: boolean + """ + with open(refs_file_path, "r", encoding="utf8") as ref_file: + # Confirm that pid is not currently already tagged + for line in ref_file: + value = line.strip() + if ref_id == value: + return True + return False + def _put_metadata(self, metadata, pid, format_id): """Store contents of metadata to `[self.root]/metadata` using the hash of the given PID and format ID as the permanent address. 
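The membership check consolidated in this patch reduces to a line-by-line equality test against the refs file. A minimal standalone version (illustrative name, not the class method):

    def is_string_in_refs_file(ref_id, refs_file_path):
        # Return True if any line of the refs file equals ref_id once stripped.
        with open(refs_file_path, "r", encoding="utf8") as ref_file:
            return any(line.strip() == ref_id for line in ref_file)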
@@ -1643,7 +1644,7 @@ def _verify_hashstore_references(self, pid, cid, verify_type): logging.error(exception_string) raise ValueError(exception_string) # Then the pid - pid_found = self._is_pid_in_cid_refs_file(pid, cid_ref_abs_path) + pid_found = self._is_string_in_refs_file(pid, cid_ref_abs_path) if not pid_found: exception_string = ( "FileHashStore - _verify_hashstore_references: Cid refs file exists" From 10a51445928c596728d811ddd89b579e7bc8570b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 6 Feb 2024 15:11:32 -0800 Subject: [PATCH 18/45] Cleanup 'FileHashStore' module --- src/hashstore/filehashstore.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 7584e873..fb227eb7 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -787,17 +787,19 @@ def delete_object(self, ab_id, id_type=None): # If the refs file still exists, do not delete the object if not os.path.exists(cid_refs_abs_path): self._delete("objects", ab_id) - return else: # id_type is "pid" pid = ab_id + # Create a list of objects to delete to minimize delay objects_to_delete = [] + # Get the metadata documents to delete rel_path = "/".join(self._shard(self._computehash(pid))) metadata_rel_path = self._get_store_path("metadata") / rel_path metadata_file_paths = self._get_file_paths(metadata_rel_path) - # Rename paths by appending _delete to the file name + # Add these files to be permanently deleted if metadata_file_paths is not None: for path in metadata_file_paths: + # Rename files by appending _delete to the file name objects_to_delete.append(self._rename_path_for_deletion(path)) try: @@ -816,15 +818,14 @@ def delete_object(self, ab_id, id_type=None): os.remove(obj) return if "object referenced does not exist" in fnfe: - # Delete pid refs file + # Add pid refs file to be permanently deleted pid_ref_abs_path = self._resolve_path("pid", pid) - # Add pid refs file to delete objects_to_delete.append( self._rename_path_for_deletion(pid_ref_abs_path) ) # Remove pid from cid refs file - # Retrieve the cid from the pid refs file with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: + # Retrieve the cid pid_refs_cid = pid_ref_file.read() cid_ref_abs_path = self._resolve_path("cid", pid_refs_cid) # Remove if the pid refs is found @@ -833,11 +834,10 @@ def delete_object(self, ab_id, id_type=None): # Remove all files confirmed for deletion for obj in objects_to_delete: os.remove(obj) - self._delete("pid", pid) return except ValueError as ve: if "is missing from cid refs file" in ve: - # Delete pid refs file + # Add pid refs file to be permanently deleted pid_ref_abs_path = self._resolve_path("pid", pid) objects_to_delete.append( self._rename_path_for_deletion(pid_ref_abs_path) @@ -864,7 +864,7 @@ def delete_object(self, ab_id, id_type=None): try: cid_ref_abs_path = self._resolve_path("cid", cid) pid_ref_abs_path = self._resolve_path("pid", pid) - # First delete the pid refs file immediately + # Add pid refs file to be permanently deleted objects_to_delete.append( self._rename_path_for_deletion(pid_ref_abs_path) ) @@ -1343,11 +1343,12 @@ def delete_tmp_file(): def _write_refs_file(self, path, ref_id, ref_type): """Write a reference file in the supplied path into a temporary file. All `pid` or `cid` reference files begin with a single identifier, with the - primary difference being that a cid reference file can contain multiple lines - of `pid`s that reference the `cid`. 
+ difference being that a cid reference file can potentially contain multiple + lines of `pid`s that reference the `cid`. :param str path: Directory to write the temporary file :param str ref_id: Authority-based, persistent or content identifier + :param str ref_type: 'cid' or 'pid' :return: tmp_file_path - Path to the tmp refs file :rtype: string @@ -1429,7 +1430,7 @@ def _update_refs_file(refs_file_path, ref_id, update_type): def _is_string_in_refs_file(ref_id, refs_file_path): """Check a reference file for a ref_id (`cid` or `pid`). - :param str pid: Authority-based, persistent identifier or content identifier + :param str ref_id: Authority-based, persistent identifier or content identifier :param str refs_file_path: Path to the refs file :return: pid_found @@ -1909,7 +1910,7 @@ def _delete(self, entity, file): @staticmethod def _rename_path_for_deletion(path): - """Move and rename a given path by appending '_delete' to the file name + """Rename a given path by appending '_delete' and move it to the renamed path. :param Path path: Path to file to rename From 77150db8ac3cd2b42ea680a158269ce6de87d2ca Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 6 Feb 2024 16:23:12 -0800 Subject: [PATCH 19/45] Review & clean up 'test_filehashstore_interface' module and revise exception class thrown in FileHashStore for clarity --- src/hashstore/filehashstore.py | 8 +- tests/test_filehashstore.py | 2 +- tests/test_filehashstore_interface.py | 162 ++++++++++++++------------ 3 files changed, 93 insertions(+), 79 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index fb227eb7..194fdb00 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1204,14 +1204,14 @@ def _move_and_get_checksums( tmp_file_size, file_size_to_validate, ) - except Exception as ge: + except ValueError as ve: # If any exception is thrown during validation, exception_string = ( "FileHashStore - _move_and_get_checksums: Object exists but cannot be verified" - + f" (validation error): {abs_file_path}, deleting temporary file. Error: {ge}" + + f" (validation error): {abs_file_path}, deleting temporary file. Error: {ve}" ) logging.error(exception_string) - raise FileExistsError from ge + raise ValueError from ve finally: # Delete the temporary file, it already exists so it is redundant self._delete(entity, tmp_file_name) @@ -1338,8 +1338,6 @@ def delete_tmp_file(): os.umask(oldmask) return tmp - # TODO: Clean up refs file methods, a lot of redundant code - def _write_refs_file(self, path, ref_id, ref_type): """Write a reference file in the supplied path into a temporary file. 
All `pid` or `cid` reference files begin with a single identifier, with the diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 05afb37d..05888251 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -431,7 +431,7 @@ def test_move_and_get_checksums_duplicates_raises_error(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") - with pytest.raises(FileExistsError): + with pytest.raises(ValueError): # pylint: disable=W0212 store._move_and_get_checksums( pid, diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index c165cec8..d0c21713 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -1,4 +1,5 @@ """Test module for FileHashStore HashStore interface methods.""" + import io import os from pathlib import Path @@ -18,23 +19,8 @@ ) -def test_pids_length(pids): - """Ensure test harness pids are present.""" - assert len(pids) == 3 - - -def test_store_address_length(pids, store): - """Test store object object_cid length is 64 characters.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - object_cid = object_metadata.cid - assert len(object_cid) == 64 - - -def test_store_object(pids, store): - """Test store object.""" +def test_store_object_refs_files_and_object(pids, store): + """Test store object stores objects and creates reference files.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -60,7 +46,7 @@ def test_store_object_only_object(pids, store): def test_store_object_files_path(pids, store): - """Test store object when given a path.""" + """Test store object when given a path object.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -71,7 +57,7 @@ def test_store_object_files_path(pids, store): def test_store_object_files_string(pids, store): - """Test store object when given a string.""" + """Test store object when given a string object.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -82,7 +68,7 @@ def test_store_object_files_string(pids, store): def test_store_object_files_input_stream(pids, store): - """Test store object given an input stream.""" + """Test store object when given a stream object.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -94,8 +80,8 @@ def test_store_object_files_input_stream(pids, store): assert store._count(entity) == 3 -def test_store_object_id(pids, store): - """Test store object returns expected id.""" +def test_store_object_cid(pids, store): + """Test store object returns expected content identifier.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -103,6 +89,15 @@ def test_store_object_id(pids, store): assert object_metadata.cid == pids[pid][store.algorithm] +def test_store_object_pid(pids, store): + """Test store object returns expected persistent identifier.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + assert object_metadata.pid == pid + + def test_store_object_obj_size(pids, store): """Test store object returns expected file size.""" test_dir = "tests/testdata/" @@ -149,7 +144,7 @@ def test_store_object_data_incorrect_type_none(store): pid = "jtao.1700.1" path = None with pytest.raises(TypeError): 
- store.store_object(pid, path) + store.store_object(pid, data=path) def test_store_object_data_incorrect_type_empty(store): @@ -157,7 +152,7 @@ def test_store_object_data_incorrect_type_empty(store): pid = "jtao.1700.1" path = "" with pytest.raises(TypeError): - store.store_object(pid, path) + store.store_object(pid, data=path) def test_store_object_data_incorrect_type_empty_spaces(store): @@ -165,7 +160,7 @@ def test_store_object_data_incorrect_type_empty_spaces(store): pid = "jtao.1700.1" path = " " with pytest.raises(TypeError): - store.store_object(pid, path) + store.store_object(pid, data=path) def test_store_object_additional_algorithm_invalid(store): @@ -179,7 +174,7 @@ def test_store_object_additional_algorithm_invalid(store): def test_store_object_additional_algorithm_hyphen_uppercase(pids, store): - """Test store object formats a given algorithm that's in uppercase.""" + """Test store object formats an additional algorithm in uppercase.""" test_dir = "tests/testdata/" entity = "objects" pid = "jtao.1700.1" @@ -192,7 +187,7 @@ def test_store_object_additional_algorithm_hyphen_uppercase(pids, store): def test_store_object_additional_algorithm_hyphen_lowercase(pids, store): - """Test store object with additional algorithm in lowercase.""" + """Test store object formats an with additional algorithm in lowercase.""" test_dir = "tests/testdata/" entity = "objects" pid = "jtao.1700.1" @@ -208,7 +203,7 @@ def test_store_object_additional_algorithm_hyphen_lowercase(pids, store): def test_store_object_additional_algorithm_underscore(pids, store): - """Test store object with additional algorithm with underscore.""" + """Test store object with formats an additional algorithm with underscore.""" test_dir = "tests/testdata/" entity = "objects" pid = "jtao.1700.1" @@ -224,7 +219,7 @@ def test_store_object_additional_algorithm_underscore(pids, store): def test_store_object_checksum_correct(store): - """Test store object successfully stores with good checksum.""" + """Test store object does not throw exception with good checksum.""" test_dir = "tests/testdata/" entity = "objects" pid = "jtao.1700.1" @@ -240,7 +235,7 @@ def test_store_object_checksum_correct(store): def test_store_object_checksum_correct_and_additional_algo(store): - """Test store object successfully stores with good checksum and same additional algorithm.""" + """Test store object with good checksum and an additional algorithm.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -264,7 +259,7 @@ def test_store_object_checksum_correct_and_additional_algo(store): def test_store_object_checksum_correct_and_additional_algo_duplicate(store): - """Test store object successfully stores with good checksum and same additional algorithm.""" + """Test store object does not throw exception with duplicate algorithms.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -296,7 +291,8 @@ def test_store_object_checksum_algorithm_empty(store): def test_store_object_checksum_empty(store): - """Test store object raises error when checksum_algorithm supplied with empty checksum.""" + """Test store object raises error when checksum_algorithm supplied with + an empty checksum.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -308,8 +304,8 @@ def test_store_object_checksum_empty(store): def test_store_object_checksum_empty_spaces(store): - """Test store object raises error when checksum_algorithm supplied and checksum is empty - with spaces.""" + """Test store object raises 
error when checksum_algorithm supplied and + checksum is empty with spaces.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -321,7 +317,8 @@ def test_store_object_checksum_empty_spaces(store): def test_store_object_checksum_algorithm_empty_spaces(store): - """Test store object raises error when checksum supplied with no checksum_algorithm.""" + """Test store object raises error when checksum is supplied and with empty + spaces as the checksum_algorithm.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -335,7 +332,7 @@ def test_store_object_checksum_algorithm_empty_spaces(store): def test_store_object_checksum_incorrect_checksum(store): - """Test store object raises error when supplied with bad checksum.""" + """Test store object raises error when supplied with incorrect checksum.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -364,8 +361,8 @@ def test_store_object_duplicate_does_not_store_duplicate(store): assert store._count(entity) == 1 -def test_store_object_duplicate_references_files(pids, store): - """Test that storing duplicate object but different pid creates the expected +def test_store_object_duplicate_object_references_file_count(store): + """Test that storing a duplicate object but with different pids creates the expected amount of reference files.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" @@ -382,15 +379,9 @@ def test_store_object_duplicate_references_files(pids, store): assert store._count("pid") == 3 # Confirm that there are 1 cid reference files assert store._count("cid") == 1 - # Confirm the content of the cid refence files - cid_ref_abs_path = store._resolve_path("cid", pids[pid][store.algorithm]) - with open(cid_ref_abs_path, "r", encoding="utf8") as f: - for _, line in enumerate(f, start=1): - value = line.strip() - assert value == pid or value == pid_two or value == pid_three -def test_store_object_duplicate_references_content(pids, store): +def test_store_object_duplicate_object_references_file_content(pids, store): """Test that storing duplicate object but different pid creates the expected amount of reference files.""" test_dir = "tests/testdata/" @@ -410,13 +401,10 @@ def test_store_object_duplicate_references_content(pids, store): for _, line in enumerate(f, start=1): value = line.strip() assert value == pid or value == pid_two or value == pid_three - print(os.listdir(store.root + "/refs/pid/")) - assert len(os.listdir(store.root + "/refs/pid")) == 3 - assert len(os.listdir(store.root + "/refs/cid")) == 1 def test_store_object_duplicate_raises_error_with_bad_validation_data(pids, store): - """Test store duplicate object throws FileExistsError when object exists + """Test store duplicate object throws ValueError when object exists but the data to validate against is incorrect.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" @@ -425,7 +413,7 @@ def test_store_object_duplicate_raises_error_with_bad_validation_data(pids, stor # Store first blob _object_metadata_one = store.store_object(pid, path) # Store second blob - with pytest.raises(FileExistsError): + with pytest.raises(ValueError): _object_metadata_two = store.store_object( pid, path, checksum="nonmatchingchecksum", checksum_algorithm="sha256" ) @@ -434,7 +422,7 @@ def test_store_object_duplicate_raises_error_with_bad_validation_data(pids, stor def test_store_object_with_obj_file_size(store, pids): - """Test store object with correct file sizes.""" + """Test store object stores object with correct file sizes.""" 
test_dir = "tests/testdata/" for pid in pids.keys(): obj_file_size = pids[pid]["file_size_bytes"] @@ -447,7 +435,7 @@ def test_store_object_with_obj_file_size(store, pids): def test_store_object_with_obj_file_size_incorrect(store, pids): - """Test store object throws exception with incorrect file size.""" + """Test store object throws exception with incorrect file sizes.""" test_dir = "tests/testdata/" for pid in pids.keys(): obj_file_size = 1234 @@ -457,7 +445,8 @@ def test_store_object_with_obj_file_size_incorrect(store, pids): def test_store_object_with_obj_file_size_non_integer(store, pids): - """Test store object throws exception with a non integer value as the file size.""" + """Test store object throws exception with a non integer value (ex. a stirng) + as the file size.""" test_dir = "tests/testdata/" for pid in pids.keys(): obj_file_size = "Bob" @@ -666,7 +655,7 @@ def test_store_metadata(pids, store): assert store._count(entity) == 3 -def test_store_metadata_one_pid_multiple_metadata_documents(store): +def test_store_metadata_one_pid_multiple_docs_correct_location(store): """Test store metadata for a pid with multiple metadata documents.""" test_dir = "tests/testdata/" entity = "metadata" @@ -712,7 +701,7 @@ def test_store_metadata_default_format_id(pids, store): def test_store_metadata_files_string(pids, store): - """Test store metadata with string.""" + """Test store metadata with a string object to the metadata.""" test_dir = "tests/testdata/" entity = "metadata" format_id = "http://ns.dataone.org/service/types/v2.0" @@ -739,7 +728,7 @@ def test_store_metadata_files_input_stream(pids, store): def test_store_metadata_pid_empty(store): - """Test store metadata raises error with empty string.""" + """Test store metadata raises error with an empty string as the pid.""" test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" pid = "" @@ -750,7 +739,7 @@ def test_store_metadata_pid_empty(store): def test_store_metadata_pid_empty_spaces(store): - """Test store metadata raises error with empty spaces.""" + """Test store metadata raises error with empty spaces as the pid.""" test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" pid = " " @@ -761,7 +750,7 @@ def test_store_metadata_pid_empty_spaces(store): def test_store_metadata_pid_format_id_spaces(store): - """Test store metadata raises error with empty spaces.""" + """Test store metadata raises error with empty spaces as the format_id.""" test_dir = "tests/testdata/" format_id = " " pid = "jtao.1700.1" @@ -772,7 +761,7 @@ def test_store_metadata_pid_format_id_spaces(store): def test_store_metadata_metadata_empty(store): - """Test store metadata raises error with empty metadata string.""" + """Test store metadata raises error with empty spaces as the metadata path.""" pid = "jtao.1700.1" format_id = "http://ns.dataone.org/service/types/v2.0" syspath_string = " " @@ -781,7 +770,7 @@ def test_store_metadata_metadata_empty(store): def test_store_metadata_metadata_none(store): - """Test store metadata raises error with empty None metadata.""" + """Test store metadata raises error with empty None metadata path.""" pid = "jtao.1700.1" format_id = "http://ns.dataone.org/service/types/v2.0" syspath_string = None @@ -790,7 +779,7 @@ def test_store_metadata_metadata_none(store): def test_store_metadata_metadata_path(pids, store): - """Test store metadata returns expected metadata_cid.""" + """Test store metadata returns expected path to metadata document.""" test_dir = "tests/testdata/" 
format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): @@ -831,7 +820,7 @@ def test_store_metadata_thread_lock(store): def test_retrieve_object(pids, store): - """Test retrieve_object returns correct object data.""" + """Test retrieve_object returns a stream to the correct object data.""" test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): @@ -862,7 +851,7 @@ def test_retrieve_object_pid_invalid(store): def test_retrieve_metadata(store): - """Test retrieve_metadata returns correct metadata.""" + """Test retrieve_metadata returns a stream to the correct metadata.""" test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" pid = "jtao.1700.1" @@ -879,7 +868,7 @@ def test_retrieve_metadata(store): def test_retrieve_metadata_default_format_id(store): - """Test retrieve_metadata retrieves expected metadata with default format_id.""" + """Test retrieve_metadata retrieves expected metadata without a format_id.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -912,7 +901,7 @@ def test_retrieve_metadata_bytes_pid_empty(store): def test_retrieve_metadata_format_id_empty(store): - """Test retrieve_metadata raises error when supplied with empty format_id.""" + """Test retrieve_metadata raises error when supplied with an empty format_id.""" format_id = "" pid = "jtao.1700.1" with pytest.raises(ValueError): @@ -920,17 +909,45 @@ def test_retrieve_metadata_format_id_empty(store): def test_retrieve_metadata_format_id_empty_spaces(store): - """Test retrieve_metadata raises error when supplied with empty spaces format_id.""" + """Test retrieve_metadata raises error when supplied with empty spaces asthe format_id.""" format_id = " " pid = "jtao.1700.1" with pytest.raises(ValueError): store.retrieve_metadata(pid, format_id) -def test_delete_object(pids, store): - """Test delete_object successfully deletes objects from /objects and all refs files.""" +def test_delete_object_object_deleted(pids, store): + """Test delete_object successfully deletes object.""" + test_dir = "tests/testdata/" + format_id = "http://ns.dataone.org/service/types/v2.0" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + _object_metadata = store.store_object(pid, path) + _metadata_cid = store.store_metadata(pid, syspath, format_id) + store.delete_object(pid) + assert store._count("objects") == 0 + + +def test_delete_object_metadata_deleted(pids, store): + """Test delete_object successfully deletes relevant metadata + files and refs files.""" + test_dir = "tests/testdata/" + format_id = "http://ns.dataone.org/service/types/v2.0" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + _object_metadata = store.store_object(pid, path) + _metadata_cid = store.store_metadata(pid, syspath, format_id) + store.delete_object(pid) + assert store._count("metadata") == 0 + + +def test_delete_object_refs_files_deleted(pids, store): + """Test delete_object successfully deletes refs files.""" test_dir = "tests/testdata/" - entity = "objects" format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -939,12 +956,11 @@ def test_delete_object(pids, store): _object_metadata = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath, format_id) 
store.delete_object(pid) - assert store._count(entity) == 0 assert store._count("pid") == 0 assert store._count("cid") == 0 -def test_delete_object_pid_refs_file(pids, store): +def test_delete_object_pid_refs_file_deleted(pids, store): """Test delete_object deletes the associated pid refs file for the object.""" test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" @@ -959,7 +975,7 @@ def test_delete_object_pid_refs_file(pids, store): assert not os.path.exists(pid_refs_file_path) -def test_delete_object_cid_refs_file(pids, store): +def test_delete_object_cid_refs_file_deleted(pids, store): """Test delete_object deletes the associated cid refs file for the object.""" test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" @@ -976,7 +992,7 @@ def test_delete_object_cid_refs_file(pids, store): def test_delete_object_cid_refs_file_with_pid_refs_remaining(pids, store): - """Test delete_object does not delete the cid refs file that still contains ref.""" + """Test delete_object does not delete the cid refs file that still contains refs.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") From 83f8b60972a40a4eab1e58f8cd59b51747951eb0 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 6 Feb 2024 17:06:54 -0800 Subject: [PATCH 20/45] Review & clean up 'test_filehashstore_references' module and add guard rail to '_update_refs_file' to ensure duplicate pids are not written when 'update_type' is 'add' --- src/hashstore/filehashstore.py | 17 ++++---- tests/test_filehashstore_references.py | 57 +++++++++++++++----------- 2 files changed, 41 insertions(+), 33 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 194fdb00..79576ac1 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1,4 +1,5 @@ """Core module for FileHashStore""" + import atexit import io import shutil @@ -1374,8 +1375,7 @@ def _write_refs_file(self, path, ref_id, ref_type): logging.error(exception_string) raise err - @staticmethod - def _update_refs_file(refs_file_path, ref_id, update_type): + def _update_refs_file(self, refs_file_path, ref_id, update_type): """Add or remove an existing ref from a refs file. :param str refs_file_path: Absolute path to the refs file. 
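# Illustrative sketch (not part of the patch series) of the guard rail the next hunk
# adds to _update_refs_file: on an "add", the ref is appended only when it is not
# already present, so repeated tagging cannot write duplicate pids into a cid refs
# file. Simplified stand-alone version assuming a newline-delimited refs file; the
# real method reuses _is_string_in_refs_file and opens the file in append mode.
import fcntl

def add_ref_once(refs_file_path, ref_id):
    with open(refs_file_path, "r+", encoding="utf8") as ref_file:
        fcntl.flock(ref_file.fileno(), fcntl.LOCK_EX)  # one writer at a time
        already_present = ref_id in ref_file.read().splitlines()
        if not already_present:
            ref_file.write(ref_id + "\n")  # read() left the position at EOF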
@@ -1394,14 +1394,15 @@ def _update_refs_file(refs_file_path, ref_id, update_type): ) logging.error(exception_string) raise FileNotFoundError(exception_string) - try: if update_type is "add": - with open(refs_file_path, "a", encoding="utf8") as ref_file: - # Lock file for the shortest amount of time possible - file_descriptor = ref_file.fileno() - fcntl.flock(file_descriptor, fcntl.LOCK_EX) - ref_file.write(ref_id + "\n") + pid_found = self._is_string_in_refs_file(ref_id, refs_file_path) + if not pid_found: + with open(refs_file_path, "a", encoding="utf8") as ref_file: + # Lock file for the shortest amount of time possible + file_descriptor = ref_file.fileno() + fcntl.flock(file_descriptor, fcntl.LOCK_EX) + ref_file.write(ref_id + "\n") if update_type is "remove": with open(refs_file_path, "r+", encoding="utf8") as ref_file: # Lock file immediately, this process needs to complete diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 28c9a08d..be8bfa30 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -1,4 +1,5 @@ """Test module for FileHashStore's reference system to tag stored objects.""" + import os import shutil import pytest @@ -7,7 +8,7 @@ def test_tag_object(pids, store): - """Test tag object returns boolean.""" + """Test tag object returns true boolean when successful.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -17,7 +18,7 @@ def test_tag_object(pids, store): def test_tag_object_pid_refs_file(pids, store): - """Test tag object creates the pid reference file.""" + """Test tag object creates the expected pid reference file.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -44,7 +45,7 @@ def test_tag_object_pid_refs_file_exists(pids, store): def test_tag_object_pid_refs_file_content(pids, store): - """Test tag object creates the pid reference file contains the correct cid.""" + """Test tag object created the pid reference file with the expected cid.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -69,7 +70,7 @@ def test_tag_object_cid_refs_file(pids, store): def test_tag_object_cid_refs_file_content(pids, store): - """Test tag object tags cid reference file successfully with pid.""" + """Test tag object creates the cid reference file successfully with pid.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -82,8 +83,8 @@ def test_tag_object_cid_refs_file_content(pids, store): def test_tag_object_cid_refs_file_exists(pids, store): - """Test tag object raises exception when trying to add another cid to an - existing pid reference file and that a cid reference file is not created.""" + """Test tag object raises exception when trying to tag a pid that already + has a pid refs file, and that a cid reference file is not created.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -164,7 +165,7 @@ def test_tag_object_cid_refs_update_pid_found_but_file_missing(store): def test_verify_object(pids, store): - """Test verify object succeeds given good arguments.""" + """Test verify_object succeeds given good arguments.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -178,7 +179,7 @@ def test_verify_object(pids, store): def test_verify_object_exception_incorrect_object_metadata_type(pids, store): - """Test 
verify object raises exception when incorrect object is given to
+    """Test verify_object returns false when incorrect object is given to
     object_metadata arg."""
     test_dir = "tests/testdata/"
     for pid in pids.keys():
@@ -194,7 +195,7 @@ def test_verify_object_exception_incorrect_object_metadata_type(pids, store):
 
 
 def test_verify_object_exception_incorrect_size(pids, store):
-    """Test verify object raises exception when incorrect size is supplied."""
+    """Test verify_object returns false when incorrect size is supplied."""
     test_dir = "tests/testdata/"
     for pid in pids.keys():
         path = test_dir + pid.replace("/", "_")
@@ -214,7 +215,7 @@ def test_verify_object_exception_incorrect_size(pids, store):
 
 
 def test_verify_object_exception_incorrect_checksum(pids, store):
-    """Test verify object raises exception when incorrect checksum is supplied."""
+    """Test verify_object returns false when incorrect checksum is supplied."""
     test_dir = "tests/testdata/"
     for pid in pids.keys():
         path = test_dir + pid.replace("/", "_")
@@ -236,7 +237,7 @@ def test_verify_object_exception_incorrect_checksum(pids, store):
 
 
 def test_verify_object_exception_incorrect_checksum_algo(pids, store):
-    """Test verify object raises exception when incorrect algorithm is supplied."""
+    """Test verify_object returns false when incorrect algorithm is supplied."""
     test_dir = "tests/testdata/"
     for pid in pids.keys():
         path = test_dir + pid.replace("/", "_")
@@ -247,15 +248,15 @@ def test_verify_object_exception_incorrect_checksum_algo(pids, store):
         store.verify_object(object_metadata, checksum, "md2", expected_file_size)
 
 
-def test_write_refs_file_cid(store):
-    """Test that write_cid_reference writes a reference file."""
+def test_write_refs_file_ref_type_cid(store):
+    """Test that write_refs_file writes a reference file."""
     tmp_root_path = store._get_store_path("refs") / "tmp"
     tmp_cid_refs_file = store._write_refs_file(tmp_root_path, "test_pid", "cid")
     assert os.path.exists(tmp_cid_refs_file)
 
 
-def test_write_refs_file_cid_content(pids, store):
-    """Test that write_cid_ref_file writes the expected content."""
+def test_write_refs_file_ref_type_cid_content(pids, store):
+    """Test that write_refs_file writes the expected content."""
     for pid in pids.keys():
         tmp_root_path = store._get_store_path("refs") / "tmp"
         tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid")
@@ -266,7 +267,7 @@ def test_write_refs_file_cid_content(pids, store):
 
 
 def test_update_refs_file_content(pids, store):
-    """Test that update_cid_ref updates the ref file as expected."""
+    """Test that update_refs_file updates the ref file as expected."""
     for pid in pids.keys():
         tmp_root_path = store._get_store_path("refs") / "tmp"
         tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid")
@@ -280,7 +281,7 @@ def test_update_refs_file_content(pids, store):
 
 
 def test_update_refs_file_content_multiple(pids, store):
-    """Test that update_cid_refs adds multiple references successfully."""
+    """Test that _update_refs_file adds multiple references successfully."""
     for pid in pids.keys():
         tmp_root_path = store._get_store_path("refs") / "tmp"
         tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid")
@@ -301,17 +302,23 @@ def test_update_refs_file_content_multiple(pids, store):
 
 
 def test_update_refs_file_content_pid_exists(pids, store):
-    """Test that update_cid_ref does not throw exception if pid already exists
-    and proceeds to complete the tagging process (verify_object)"""
+    """Test that _update_refs_file does not add a pid to a refs file that already
+
contains the pid.""" for pid in pids.keys(): tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") # Exception should not be thrown store._update_refs_file(tmp_cid_refs_file, pid, "add") + line_count = 0 + with open(tmp_cid_refs_file, "r", encoding="utf8") as ref_file: + for _line in ref_file: + line_count += 1 + assert line_count == 1 + def test_update_refs_file_content_cid_refs_does_not_exist(pids, store): - """Test that update_cid_ref throws exception if cid refs file doesn't exist.""" + """Test that _update_refs_file throws exception if refs file doesn't exist.""" for pid in pids.keys(): cid = pids[pid]["sha256"] cid_ref_abs_path = store._resolve_path("cid", cid) @@ -320,7 +327,7 @@ def test_update_refs_file_content_cid_refs_does_not_exist(pids, store): def test_update_refs_file_remove(pids, store): - """Test that delete_cid_refs_pid deletes the given pid from the ref file.""" + """Test that _update_refs_file deletes the given pid from the ref file.""" for pid in pids.keys(): tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") @@ -336,8 +343,8 @@ def test_update_refs_file_remove(pids, store): assert value == pid_other -def test_update_refs_file_remove_file(pids, store): - """Test that delete_cid_refs_pid leaves a file empty when removing the last pid.""" +def test_update_refs_file_empty_file(pids, store): + """Test that _update_refs_file leaves a file empty when removing the last pid.""" for pid in pids.keys(): tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_refs_file(tmp_root_path, pid, "cid") @@ -348,7 +355,7 @@ def test_update_refs_file_remove_file(pids, store): assert os.path.getsize(tmp_cid_refs_file) == 0 -def test_write_refs_file_pid(pids, store): +def test_write_refs_file_ref_type_pid(pids, store): """Test that write_pid_refs_file writes a reference file.""" for pid in pids.keys(): cid = pids[pid]["sha256"] @@ -357,7 +364,7 @@ def test_write_refs_file_pid(pids, store): assert os.path.exists(tmp_pid_refs_file) -def test_write_refs_file_content_pid(pids, store): +def test_write_refs_file_ref_type_content_pid(pids, store): """Test that write_pid_refs_file writes the expected content.""" for pid in pids.keys(): cid = pids[pid]["sha256"] From 0baaafb7b35096aaa43209967c12fbef2bd1493e Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 7 Feb 2024 09:19:19 -0800 Subject: [PATCH 21/45] Fix incorrect usage of python 'is' and replace with '==' --- src/hashstore/filehashstore.py | 10 +++++----- tests/test_filehashstore.py | 7 ++----- tests/test_hashstore_client.py | 2 ++ 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 79576ac1..94d835c3 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -783,7 +783,7 @@ def delete_object(self, ab_id, id_type=None): "FileHashStore - delete_object: Request to delete object for id: %s", ab_id ) self._check_string(ab_id, "ab_id", "delete_object") - if id_type is "cid": + if id_type == "cid": cid_refs_abs_path = self._resolve_path("cid", ab_id) # If the refs file still exists, do not delete the object if not os.path.exists(cid_refs_abs_path): @@ -1361,9 +1361,9 @@ def _write_refs_file(self, path, ref_id, ref_type): with self._mktmpfile(path) as tmp_file: tmp_file_path = tmp_file.name with open(tmp_file_path, "w", encoding="utf8") as tmp_cid_ref_file: - if ref_type is 
"cid": + if ref_type == "cid": tmp_cid_ref_file.write(ref_id + "\n") - if ref_type is "pid": + if ref_type == "pid": tmp_cid_ref_file.write(ref_id) return tmp_file_path @@ -1395,7 +1395,7 @@ def _update_refs_file(self, refs_file_path, ref_id, update_type): logging.error(exception_string) raise FileNotFoundError(exception_string) try: - if update_type is "add": + if update_type == "add": pid_found = self._is_string_in_refs_file(ref_id, refs_file_path) if not pid_found: with open(refs_file_path, "a", encoding="utf8") as ref_file: @@ -1403,7 +1403,7 @@ def _update_refs_file(self, refs_file_path, ref_id, update_type): file_descriptor = ref_file.fileno() fcntl.flock(file_descriptor, fcntl.LOCK_EX) ref_file.write(ref_id + "\n") - if update_type is "remove": + if update_type == "remove": with open(refs_file_path, "r+", encoding="utf8") as ref_file: # Lock file immediately, this process needs to complete # before any others read/modify the content of resf file diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 05888251..1f6a2b01 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -1,4 +1,5 @@ """Test module for FileHashStore init, core, utility and supporting methods.""" + import io import os from pathlib import Path @@ -6,11 +7,7 @@ from hashstore.filehashstore import FileHashStore # pylint: disable=W0212 - - -def test_pids_length(pids): - """Ensure test harness pids are present.""" - assert len(pids) == 3 +# TODO: To Review def test_init_directories_created(store): diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index be4e5f7a..1b270c7e 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -1,10 +1,12 @@ """Test module for the Python client (Public API calls only).""" + import sys import os from pathlib import Path from hashstore import hashstoreclient # pylint: disable=W0212 +# TODO: To Review def test_create_hashstore(tmp_path): From e4ca96741edb03bd436c884e197d5b339e4c2547 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 7 Feb 2024 14:07:12 -0800 Subject: [PATCH 22/45] Refactor 'tag_object' to handle orphaned pid refs file, add new method '_tag_pid_cid_and_verify_refs_files' and update/add new pytests --- src/hashstore/filehashstore.py | 110 +++++++++++++--- tests/test_filehashstore.py | 2 +- tests/test_filehashstore_interface.py | 9 +- tests/test_filehashstore_references.py | 166 +++++++++++++++++++------ tests/test_hashstore_client.py | 1 - 5 files changed, 217 insertions(+), 71 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 94d835c3..1168464f 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -570,13 +570,68 @@ def tag_object(self, pid, cid): # Proceed to tagging process if os.path.exists(pid_ref_abs_path): - # A pid reference file can only contain one cid - exception_string = ( - "FileHashStore - write_pid_refs_file: pid ref file already exists for" - + pid_ref_abs_path + # A pid reference file can only contain and reference one cid + # First, confirm that the expected cid refs file exists by getting the cid + with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: + pid_refs_cid = pid_ref_file.read() + + if pid_refs_cid != cid: + # If it's not equal to the given cid, determine if it's an orphan file + expected_cid_refs_path = self._resolve_path("cid", pid_refs_cid) + if os.path.exists( + expected_cid_refs_path + ) and self._is_string_in_refs_file(pid, expected_cid_refs_path): + # Throw 
exception, this pid is accounted for + exception_string = ( + "FileHashStore - tag_object: Pid refs file exists with valid pid" + + f" and cid reference files for pid: {pid} with cid: {cid}." + ) + logging.error(exception_string) + raise FileExistsError(exception_string) + # Now check the expected cid refs file + cid_ref_exists = os.path.exists(cid_ref_abs_path) + if cid_ref_exists and self._is_string_in_refs_file( + pid, cid_ref_abs_path + ): + self._verify_hashstore_references(pid, cid, "update") + else: + # Pid is not found in the cid reference file + if cid_ref_exists: + self._update_refs_file(cid_ref_abs_path, pid, "add") + else: + # Overwrite existing pid refs file, it is an orphaned file + print("****OVERWRITING EXISTING FILES****") + self._tag_pid_cid_and_verify_refs_files( + pid, + cid, + pid_ref_abs_path, + cid_ref_abs_path, + tmp_root_path, + ) + logging.info( + "FileHashStore - tag_object: Successfully tagged cid: %s with pid %s", + cid, + pid, + ) + return True + + # Check to see if the given cid's respective refs file exists + if os.path.exists(cid_ref_abs_path): + if not self._is_string_in_refs_file(pid, cid_ref_abs_path): + self._update_refs_file(cid_ref_abs_path, pid, "add") + else: + # Create cid refs file + cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid") + self._create_path(os.path.dirname(cid_ref_abs_path)) + shutil.move(cid_tmp_file_path, cid_ref_abs_path) + # Ensure everything is where it needs to be + self._verify_hashstore_references(pid, cid, "update") + logging.info( + "FileHashStore - tag_object: Successfully updated cid: %s with pid: %s", + cid, + pid, ) - logging.error(exception_string) - raise FileExistsError(exception_string) + return True elif os.path.exists(cid_ref_abs_path): # Create the pid refs file pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") @@ -593,20 +648,9 @@ def tag_object(self, pid, cid): ) return True else: - # All ref files begin as tmp files and get moved sequentially at once - # Get tmp files with the expected cid and pid refs content - pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") - cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid") - # Create paths for pid ref file in '.../refs/pid' and cid ref file in '.../refs/cid' - self._create_path(os.path.dirname(pid_ref_abs_path)) - self._create_path(os.path.dirname(cid_ref_abs_path)) - # Move both files - shutil.move(pid_tmp_file_path, pid_ref_abs_path) - shutil.move(cid_tmp_file_path, cid_ref_abs_path) - # Ensure that the reference files have been written as expected - # If there is an issue, client or user will have to manually review - self._verify_hashstore_references(pid, cid, "create") - + self._tag_pid_cid_and_verify_refs_files( + pid, cid, pid_ref_abs_path, cid_ref_abs_path, tmp_root_path + ) logging.info( "FileHashStore - tag_object: Successfully tagged cid: %s with pid %s", cid, @@ -1339,6 +1383,32 @@ def delete_tmp_file(): os.umask(oldmask) return tmp + def _tag_pid_cid_and_verify_refs_files( + self, pid, cid, pid_ref_abs_path, cid_ref_abs_path, tmp_root_path + ): + """Create temporary pid and cid reference files, move them into their expected + locations and verify the content. 
+ + :param str pid: Authority-based or persistent identifier + :param str cid: Content identifier + :param str pid_ref_abs_path: Permanent address to pid refs file + :param str pid_ref_abs_path: Permanent address to pid refs file + :param str tmp_root_path: Path to folder to create temporary ref files + """ + # All ref files begin as tmp files and get moved sequentially at once + # Get tmp files with the expected cid and pid refs content + pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") + cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid") + # Create paths for pid ref file in '.../refs/pid' and cid ref file in '.../refs/cid' + self._create_path(os.path.dirname(pid_ref_abs_path)) + self._create_path(os.path.dirname(cid_ref_abs_path)) + # Move both files + shutil.move(pid_tmp_file_path, pid_ref_abs_path) + shutil.move(cid_tmp_file_path, cid_ref_abs_path) + # Ensure that the reference files have been written as expected + # If there is an issue, client or user will have to manually review + self._verify_hashstore_references(pid, cid, "create") + def _write_refs_file(self, path, ref_id, ref_type): """Write a reference file in the supplied path into a temporary file. All `pid` or `cid` reference files begin with a single identifier, with the diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 1f6a2b01..32b4d029 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -890,7 +890,7 @@ def test_open_objects(pids, store): io_buffer.close() -def test_delete_by_object_metadata_id(pids, store): +def test_delete_with_object_metadata_id(pids, store): """Check objects are deleted after calling delete with object id.""" test_dir = "tests/testdata/" entity = "objects" diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index d0c21713..7dd20324 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -472,14 +472,8 @@ def test_store_object_duplicates_threads(pids, store): path = test_dir + pid entity = "objects" - file_exists_error_flag = False - def store_object_wrapper(obj_pid, obj_path): - nonlocal file_exists_error_flag - try: - store.store_object(obj_pid, obj_path) # Call store_object inside the thread - except FileExistsError: - file_exists_error_flag = True + store.store_object(obj_pid, obj_path) # Call store_object inside the thread thread1 = Thread(target=store_object_wrapper, args=(pid, path)) thread2 = Thread(target=store_object_wrapper, args=(pid, path)) @@ -493,7 +487,6 @@ def store_object_wrapper(obj_pid, obj_path): # One thread will succeed, file count must still be 1 assert store._count(entity) == 1 assert store._exists(entity, pids[pid][store.algorithm]) - assert file_exists_error_flag @slow_test diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index be8bfa30..a39a6ee1 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -7,8 +7,20 @@ # pylint: disable=W0212 +def test_tag_pid_cid_and_verify_refs_files(store): + """Check that refs files are moved to where they are expected to be.""" + pid = "dou.test.pid" + cid = "dou.test.cid" + pid_refs_file_path = store._resolve_path("pid", pid) + cid_refs_file_path = store._resolve_path("cid", cid) + tmp_root_path = store._get_store_path("refs") / "tmp" + store._tag_pid_cid_and_verify_refs_files( + pid, cid, pid_refs_file_path, cid_refs_file_path, tmp_root_path + ) + + def test_tag_object(pids, store): 
- """Test tag object returns true boolean when successful.""" + """Test tag_object returns true boolean when successful.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -17,8 +29,8 @@ def test_tag_object(pids, store): assert object_tagged -def test_tag_object_pid_refs_file(pids, store): - """Test tag object creates the expected pid reference file.""" +def test_tag_object_pid_refs_file_exists(pids, store): + """Test tag_object creates the expected pid reference file.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -28,8 +40,21 @@ def test_tag_object_pid_refs_file(pids, store): assert os.path.exists(pid_refs_file_path) -def test_tag_object_pid_refs_file_exists(pids, store): - """Test tag object throws exception when pid refs file already exists.""" +def test_tag_object_cid_refs_file_exists(pids, store): + """Test tag_object creates the cid reference file.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(None, path) + cid = object_metadata.cid + store.tag_object(pid, object_metadata.cid) + cid_refs_file_path = store._resolve_path("cid", cid) + assert os.path.exists(cid_refs_file_path) + + +def test_tag_object_refs_file_exists(pids, store): + """Test tag_object does not throws exception when pid refs file already exists + and verifies the content.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -40,37 +65,42 @@ def test_tag_object_pid_refs_file_exists(pids, store): assert os.path.exists(pid_refs_file_path) cid_refs_file_path = store._resolve_path("cid", cid) assert os.path.exists(cid_refs_file_path) - with pytest.raises(FileExistsError): - store.tag_object(pid, cid) + store.tag_object(pid, cid) -def test_tag_object_pid_refs_file_content(pids, store): - """Test tag object created the pid reference file with the expected cid.""" +def test_tag_object_refs_file_exists_cid_is_not_double_tagged(pids, store): + """Test tag_object succeeds when trying to tag a pid that already has a pid + refs file, and that a cid reference file that already contains cid.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) store.tag_object(pid, object_metadata.cid) - pid_refs_file_path = store._resolve_path("pid", pid) - with open(pid_refs_file_path, "r", encoding="utf8") as f: - pid_refs_cid = f.read() - assert pid_refs_cid == object_metadata.cid + store.tag_object(pid, object_metadata.cid) + + cid_refs_file_path = store._resolve_path("cid", object_metadata.cid) + line_count = 0 + with open(cid_refs_file_path, "r", encoding="utf8") as ref_file: + for _line in ref_file: + line_count += 1 + assert line_count == 1 -def test_tag_object_cid_refs_file(pids, store): - """Test tag object creates the cid reference file.""" +def test_tag_object_pid_refs_file_content(pids, store): + """Test tag_object created the pid reference file with the expected cid.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) - cid = object_metadata.cid store.tag_object(pid, object_metadata.cid) - cid_refs_file_path = store._resolve_path("cid", cid) - assert os.path.exists(cid_refs_file_path) + pid_refs_file_path = store._resolve_path("pid", pid) + with open(pid_refs_file_path, "r", encoding="utf8") as f: + pid_refs_cid = 
f.read() + assert pid_refs_cid == object_metadata.cid def test_tag_object_cid_refs_file_content(pids, store): - """Test tag object creates the cid reference file successfully with pid.""" + """Test tag_object creates the cid reference file successfully with pid.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -82,24 +112,8 @@ def test_tag_object_cid_refs_file_content(pids, store): assert pid_refs_cid == pid -def test_tag_object_cid_refs_file_exists(pids, store): - """Test tag object raises exception when trying to tag a pid that already - has a pid refs file, and that a cid reference file is not created.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(None, path) - store.tag_object(pid, object_metadata.cid) - another_cid = "dou.test.1" - with pytest.raises(FileExistsError): - store.tag_object(pid, another_cid) - - second_cid_hash = store._resolve_path("cid", another_cid) - assert not os.path.exists(second_cid_hash) - - def test_tag_object_cid_refs_update_refs_file_updated(store): - """Test tag object updates a cid reference file that already exists.""" + """Test tag_object updates a cid reference file that already exists.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid.replace("/", "_") @@ -113,15 +127,18 @@ def test_tag_object_cid_refs_update_refs_file_updated(store): store.tag_object(additional_pid, cid) # Read cid file to confirm cid refs file contains the additional pid + line_count = 0 cid_ref_abs_path = store._resolve_path("cid", cid) with open(cid_ref_abs_path, "r", encoding="utf8") as f: for _, line in enumerate(f, start=1): value = line.strip() + line_count += 1 assert value == pid or value == additional_pid + assert line_count == 2 def test_tag_object_cid_refs_update_pid_refs_created(store): - """Test tag object creates a pid reference file when called to tag an object + """Test tag_object creates a pid reference file when called to tag an object that already exists.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" @@ -140,15 +157,15 @@ def test_tag_object_cid_refs_update_pid_refs_created(store): def test_tag_object_cid_refs_update_pid_found_but_file_missing(store): - """Test that tag_object creates a missing pid refs file that somehow disappeared + """Test tag_object creates a missing pid refs file that somehow disappeared when called to tag a cid that already contains the pid.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(None, path) - store.tag_object(pid, object_metadata.cid) + object_metadata = store.store_object(pid, path) cid = object_metadata.cid - # Manually update the cid refs, pid refs file missing at this point + # Manually update the cid refs + # This means that the pid refs file missing at this point additional_pid = "dou.test.1" cid_ref_abs_path = store._resolve_path("cid", cid) store._update_refs_file(cid_ref_abs_path, additional_pid, "add") @@ -164,6 +181,73 @@ def test_tag_object_cid_refs_update_pid_found_but_file_missing(store): assert os.path.exists(pid_refs_file_path) +def test_tag_object_pid_refs_found_but_cid_arg_is_different(store): + """Test that tag_object throws an exception when pid refs file exists, contains a + different cid, and is correctly referenced in the associated cid refs file""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid.replace("/", "_") + 
object_metadata = store.store_object(pid, path) + cid = object_metadata.cid + + pid_ref_abs_path = store._resolve_path("pid", pid) + tmp_root_path = store._get_store_path("refs") / "tmp" + tmp_pid_refs_file = store._write_refs_file(tmp_root_path, cid, "pid") + shutil.move(tmp_pid_refs_file, pid_ref_abs_path) + + # Attempt to tag the existing pid with valid refs file + with pytest.raises(FileExistsError): + store.tag_object(pid, "bad_cid_value") + + +def test_tag_object_pid_refs_found_but_missing_pid_in_cid_refs_file(store): + """Test tag_object completes as expected when pid refs file exists but is missing + (not tagged) from expected cid refs file.""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + cid = object_metadata.cid + + # Remove the pid from the cid refs file + cid_ref_abs_path = store._resolve_path("cid", cid) + store._update_refs_file(cid_ref_abs_path, pid, "remove") + assert not store._is_string_in_refs_file(pid, cid_ref_abs_path) + + # Tag object, this should add the missing pid to the cid refs file + store.tag_object(pid, cid) + assert store._is_string_in_refs_file(pid, cid_ref_abs_path) + + # Confirm that there is only 1 of each expected file + assert store._count("objects") == 1 + assert store._count("pid") == 1 + assert store._count("cid") == 1 + + +def test_tag_object_pid_refs_found_but_cid_refs_file_not_found(store): + """Test tag_object completes when a pid refs file exists but the expected + cid refs file somehow disappeared.""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + cid = object_metadata.cid + + # Delete the cid refs file + cid_ref_abs_path = store._resolve_path("cid", cid) + os.remove(cid_ref_abs_path) + assert not os.path.exists(cid_ref_abs_path) + + # Tag object, this should create the missing pid refs file + store.tag_object(pid, cid) + assert os.path.exists(cid_ref_abs_path) + + # Confirm that there is only 1 of each expected file + assert store._count("objects") == 1 + assert store._count("pid") == 1 + assert store._count("cid") == 1 + + def test_verify_object(pids, store): """Test verify_object succeeds given good arguments.""" test_dir = "tests/testdata/" diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index 1b270c7e..af9997ea 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -6,7 +6,6 @@ from hashstore import hashstoreclient # pylint: disable=W0212 -# TODO: To Review def test_create_hashstore(tmp_path): From 67e8c08643dede2208b244f7c1c4cb8a682334c5 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 7 Feb 2024 16:22:50 -0800 Subject: [PATCH 23/45] Fix bug in 'delete_object' where exception cannot be parsed (cast str() on fnfe), revise '_verify_hashstor_references' to improve clarity and update affected code --- src/hashstore/filehashstore.py | 51 +++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 16 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 1168464f..1cb4ac6d 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -593,14 +593,20 @@ def tag_object(self, pid, cid): if cid_ref_exists and self._is_string_in_refs_file( pid, cid_ref_abs_path ): - self._verify_hashstore_references(pid, cid, "update") + self._verify_hashstore_references( + pid, cid, "Pid and cid refs found, verify refs files." 
+ ) else: # Pid is not found in the cid reference file if cid_ref_exists: self._update_refs_file(cid_ref_abs_path, pid, "add") + self._verify_hashstore_references( + pid, + cid, + "Orphan pid refs file found, updating cid refs file.", + ) else: # Overwrite existing pid refs file, it is an orphaned file - print("****OVERWRITING EXISTING FILES****") self._tag_pid_cid_and_verify_refs_files( pid, cid, @@ -619,13 +625,20 @@ def tag_object(self, pid, cid): if os.path.exists(cid_ref_abs_path): if not self._is_string_in_refs_file(pid, cid_ref_abs_path): self._update_refs_file(cid_ref_abs_path, pid, "add") + self._verify_hashstore_references( + pid, + cid, + "Pid and cid refs file exists, update cid refs file.", + ) else: # Create cid refs file cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid") self._create_path(os.path.dirname(cid_ref_abs_path)) shutil.move(cid_tmp_file_path, cid_ref_abs_path) - # Ensure everything is where it needs to be - self._verify_hashstore_references(pid, cid, "update") + # Ensure everything is where it needs to be + self._verify_hashstore_references( + pid, cid, "Pid refs file exists, create cid refs file." + ) logging.info( "FileHashStore - tag_object: Successfully updated cid: %s with pid: %s", cid, @@ -640,7 +653,11 @@ def tag_object(self, pid, cid): # Update cid ref files as it already exists if not self._is_string_in_refs_file(pid, cid_ref_abs_path): self._update_refs_file(cid_ref_abs_path, pid, "add") - self._verify_hashstore_references(pid, cid, "update") + self._verify_hashstore_references( + pid, + cid, + "Pid refs file doesn't exist, but cid refs exists.", + ) logging.info( "FileHashStore - tag_object: Successfully updated cid: %s with pid: %s", cid, @@ -850,10 +867,11 @@ def delete_object(self, ab_id, id_type=None): try: cid = self.find_object(pid) except FileNotFoundError as fnfe: - if "pid refs file not found" in fnfe: + fnfe_string = str(fnfe) + if "pid refs file not found" in fnfe_string: # Nothing to delete return - if "cid refs file not found" in fnfe: + if "cid refs file not found" in fnfe_string: # Delete pid refs file objects_to_delete.append( self._rename_path_for_deletion(self._resolve_path("pid", pid)) @@ -862,7 +880,7 @@ def delete_object(self, ab_id, id_type=None): for obj in objects_to_delete: os.remove(obj) return - if "object referenced does not exist" in fnfe: + if "object referenced does not exist" in fnfe_string: # Add pid refs file to be permanently deleted pid_ref_abs_path = self._resolve_path("pid", pid) objects_to_delete.append( @@ -881,7 +899,8 @@ def delete_object(self, ab_id, id_type=None): os.remove(obj) return except ValueError as ve: - if "is missing from cid refs file" in ve: + ve_string = str(ve) + if "is missing from cid refs file" in ve_string: # Add pid refs file to be permanently deleted pid_ref_abs_path = self._resolve_path("pid", pid) objects_to_delete.append( @@ -1407,7 +1426,7 @@ def _tag_pid_cid_and_verify_refs_files( shutil.move(cid_tmp_file_path, cid_ref_abs_path) # Ensure that the reference files have been written as expected # If there is an issue, client or user will have to manually review - self._verify_hashstore_references(pid, cid, "create") + self._verify_hashstore_references(pid, cid, "Created all refs files") def _write_refs_file(self, path, ref_id, ref_type): """Write a reference file in the supplied path into a temporary file. 
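# Illustrative sketch (not part of the patch series) of the bug this commit fixes in
# delete_object's exception handling: the substring match has to run against str(fnfe),
# because an 'in' membership test against the exception object itself raises TypeError
# instead of evaluating the intended condition.
try:
    raise FileNotFoundError("pid refs file not found")
except FileNotFoundError as fnfe:
    # "pid refs file not found" in fnfe  ->  TypeError: argument of type
    #                                        'FileNotFoundError' is not iterable
    assert "pid refs file not found" in str(fnfe)  # matching on the message works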
@@ -1674,13 +1693,13 @@ def _verify_object_information( logging.error(exception_string) raise ValueError(exception_string) - def _verify_hashstore_references(self, pid, cid, verify_type): + def _verify_hashstore_references(self, pid, cid, additional_log_string): """Verifies that the supplied pid and pid reference file and content have been written successfully. :param str pid: Authority-based or persistent identifier. :param str cid: Content identifier. - :param str verify_type: "update" or "create" + :param str additional_log_string: String to append to exception statement """ # Check that reference files were created pid_ref_abs_path = self._resolve_path("pid", pid) @@ -1689,7 +1708,7 @@ def _verify_hashstore_references(self, pid, cid, verify_type): exception_string = ( "FileHashStore - _verify_hashstore_references: Pid refs file missing: " + pid_ref_abs_path - + f" . Verify type {verify_type}" + + f" . Additional Context: {additional_log_string}" ) logging.error(exception_string) raise FileNotFoundError(exception_string) @@ -1697,7 +1716,7 @@ def _verify_hashstore_references(self, pid, cid, verify_type): exception_string = ( "FileHashStore - _verify_hashstore_references: Cid refs file missing: " + cid_ref_abs_path - + f" . Verify type {verify_type}" + + f" . Additional Context: {additional_log_string}" ) logging.error(exception_string) raise FileNotFoundError(exception_string) @@ -1709,7 +1728,7 @@ def _verify_hashstore_references(self, pid, cid, verify_type): exception_string = ( "FileHashStore - _verify_hashstore_references: Pid refs file exists" + f" ({pid_ref_abs_path}) but cid ({cid}) does not match." - + f"Verify type {verify_type}" + + f" Additional Context: {additional_log_string}" ) logging.error(exception_string) raise ValueError(exception_string) @@ -1719,7 +1738,7 @@ def _verify_hashstore_references(self, pid, cid, verify_type): exception_string = ( "FileHashStore - _verify_hashstore_references: Cid refs file exists" + f" ({cid_ref_abs_path}) but pid ({pid}) not found." 
- + f" Verify type {verify_type}" + + f" Additional Context: {additional_log_string}" ) logging.error(exception_string) raise ValueError(exception_string) From b0afa840ccd32673ddb07e48c69ca47bf0a32cc0 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 8 Feb 2024 15:47:15 -0800 Subject: [PATCH 24/45] Refactor 'tag_object' to improve clarity, remove redundant method and update pytests --- src/hashstore/filehashstore.py | 156 +++++++------------- tests/test_filehashstore_references.py | 195 ++++++------------------- 2 files changed, 98 insertions(+), 253 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 1cb4ac6d..960b4583 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -564,23 +564,43 @@ def tag_object(self, pid, cid): ) self.reference_locked_cids.append(cid) try: + tmp_root_path = self._get_store_path("refs") / "tmp" pid_ref_abs_path = self._resolve_path("pid", pid) cid_ref_abs_path = self._resolve_path("cid", cid) - tmp_root_path = self._get_store_path("refs") / "tmp" + pid_ref_abs_path_exists = os.path.exists(pid_ref_abs_path) + cid_ref_abs_path_exists = os.path.exists(cid_ref_abs_path) - # Proceed to tagging process - if os.path.exists(pid_ref_abs_path): + if pid_ref_abs_path_exists and cid_ref_abs_path_exists: + self._verify_hashstore_references( + pid, + cid, + "Refs file already exists, verifying.", + ) + return True + elif pid_ref_abs_path_exists and not cid_ref_abs_path_exists: # A pid reference file can only contain and reference one cid # First, confirm that the expected cid refs file exists by getting the cid with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: pid_refs_cid = pid_ref_file.read() - if pid_refs_cid != cid: - # If it's not equal to the given cid, determine if it's an orphan file - expected_cid_refs_path = self._resolve_path("cid", pid_refs_cid) - if os.path.exists( - expected_cid_refs_path - ) and self._is_string_in_refs_file(pid, expected_cid_refs_path): + if pid_refs_cid == cid: + # The pid correctly references the given cid, but the cid refs file is missing + cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid") + self._create_path(os.path.dirname(cid_ref_abs_path)) + shutil.move(cid_tmp_file_path, cid_ref_abs_path) + self._verify_hashstore_references( + pid, cid, "Created missing cid refs file" + ) + return True + else: + # Check if the retrieved cid refs file exists and pid is referenced + retrieved_cid_refs_path = self._resolve_path("cid", pid_refs_cid) + retrieved_cid_refs_path_exists = os.path.exists( + retrieved_cid_refs_path + ) + if retrieved_cid_refs_path_exists and self._is_string_in_refs_file( + pid, retrieved_cid_refs_path + ): # Throw exception, this pid is accounted for exception_string = ( "FileHashStore - tag_object: Pid refs file exists with valid pid" @@ -588,64 +608,10 @@ def tag_object(self, pid, cid): ) logging.error(exception_string) raise FileExistsError(exception_string) - # Now check the expected cid refs file - cid_ref_exists = os.path.exists(cid_ref_abs_path) - if cid_ref_exists and self._is_string_in_refs_file( - pid, cid_ref_abs_path - ): - self._verify_hashstore_references( - pid, cid, "Pid and cid refs found, verify refs files." 
- ) - else: - # Pid is not found in the cid reference file - if cid_ref_exists: - self._update_refs_file(cid_ref_abs_path, pid, "add") - self._verify_hashstore_references( - pid, - cid, - "Orphan pid refs file found, updating cid refs file.", - ) - else: - # Overwrite existing pid refs file, it is an orphaned file - self._tag_pid_cid_and_verify_refs_files( - pid, - cid, - pid_ref_abs_path, - cid_ref_abs_path, - tmp_root_path, - ) - logging.info( - "FileHashStore - tag_object: Successfully tagged cid: %s with pid %s", - cid, - pid, - ) - return True - - # Check to see if the given cid's respective refs file exists - if os.path.exists(cid_ref_abs_path): - if not self._is_string_in_refs_file(pid, cid_ref_abs_path): - self._update_refs_file(cid_ref_abs_path, pid, "add") - self._verify_hashstore_references( - pid, - cid, - "Pid and cid refs file exists, update cid refs file.", - ) - else: - # Create cid refs file - cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid") - self._create_path(os.path.dirname(cid_ref_abs_path)) - shutil.move(cid_tmp_file_path, cid_ref_abs_path) - # Ensure everything is where it needs to be - self._verify_hashstore_references( - pid, cid, "Pid refs file exists, create cid refs file." - ) - logging.info( - "FileHashStore - tag_object: Successfully updated cid: %s with pid: %s", - cid, - pid, - ) - return True - elif os.path.exists(cid_ref_abs_path): + # Orphaned pid refs file found, the retrieved cid refs file exists + # but doesn't contain the cid. Proceed to overwrite the pid refs file. + # There is no return statement, so we move out of this if block. + elif not pid_ref_abs_path_exists and cid_ref_abs_path_exists: # Create the pid refs file pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") self._create_path(os.path.dirname(pid_ref_abs_path)) @@ -664,16 +630,26 @@ def tag_object(self, pid, cid): pid, ) return True - else: - self._tag_pid_cid_and_verify_refs_files( - pid, cid, pid_ref_abs_path, cid_ref_abs_path, tmp_root_path - ) - logging.info( - "FileHashStore - tag_object: Successfully tagged cid: %s with pid %s", - cid, - pid, - ) - return True + + # All ref files begin as tmp files and get moved sequentially at once + # Get tmp files with the expected cid and pid refs content + pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") + cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid") + # Create paths for pid ref file in '.../refs/pid' and cid ref file in '.../refs/cid' + self._create_path(os.path.dirname(pid_ref_abs_path)) + self._create_path(os.path.dirname(cid_ref_abs_path)) + # Move both files + shutil.move(pid_tmp_file_path, pid_ref_abs_path) + shutil.move(cid_tmp_file_path, cid_ref_abs_path) + # Ensure that the reference files have been written as expected + # If there is an issue, client or user will have to manually review + self._verify_hashstore_references(pid, cid, "Created all refs files") + logging.info( + "FileHashStore - tag_object: Successfully tagged cid: %s with pid %s", + cid, + pid, + ) + return True finally: # Release cid with self.reference_lock: @@ -1402,32 +1378,6 @@ def delete_tmp_file(): os.umask(oldmask) return tmp - def _tag_pid_cid_and_verify_refs_files( - self, pid, cid, pid_ref_abs_path, cid_ref_abs_path, tmp_root_path - ): - """Create temporary pid and cid reference files, move them into their expected - locations and verify the content. 
- - :param str pid: Authority-based or persistent identifier - :param str cid: Content identifier - :param str pid_ref_abs_path: Permanent address to pid refs file - :param str pid_ref_abs_path: Permanent address to pid refs file - :param str tmp_root_path: Path to folder to create temporary ref files - """ - # All ref files begin as tmp files and get moved sequentially at once - # Get tmp files with the expected cid and pid refs content - pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid") - cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid") - # Create paths for pid ref file in '.../refs/pid' and cid ref file in '.../refs/cid' - self._create_path(os.path.dirname(pid_ref_abs_path)) - self._create_path(os.path.dirname(cid_ref_abs_path)) - # Move both files - shutil.move(pid_tmp_file_path, pid_ref_abs_path) - shutil.move(cid_tmp_file_path, cid_ref_abs_path) - # Ensure that the reference files have been written as expected - # If there is an issue, client or user will have to manually review - self._verify_hashstore_references(pid, cid, "Created all refs files") - def _write_refs_file(self, path, ref_id, ref_type): """Write a reference file in the supplied path into a temporary file. All `pid` or `cid` reference files begin with a single identifier, with the diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index a39a6ee1..87a30f26 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -7,18 +7,6 @@ # pylint: disable=W0212 -def test_tag_pid_cid_and_verify_refs_files(store): - """Check that refs files are moved to where they are expected to be.""" - pid = "dou.test.pid" - cid = "dou.test.cid" - pid_refs_file_path = store._resolve_path("pid", pid) - cid_refs_file_path = store._resolve_path("cid", cid) - tmp_root_path = store._get_store_path("refs") / "tmp" - store._tag_pid_cid_and_verify_refs_files( - pid, cid, pid_refs_file_path, cid_refs_file_path, tmp_root_path - ) - - def test_tag_object(pids, store): """Test tag_object returns true boolean when successful.""" test_dir = "tests/testdata/" @@ -52,40 +40,6 @@ def test_tag_object_cid_refs_file_exists(pids, store): assert os.path.exists(cid_refs_file_path) -def test_tag_object_refs_file_exists(pids, store): - """Test tag_object does not throws exception when pid refs file already exists - and verifies the content.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(None, path) - cid = object_metadata.cid - store.tag_object(pid, cid) - pid_refs_file_path = store._resolve_path("pid", pid) - assert os.path.exists(pid_refs_file_path) - cid_refs_file_path = store._resolve_path("cid", cid) - assert os.path.exists(cid_refs_file_path) - store.tag_object(pid, cid) - - -def test_tag_object_refs_file_exists_cid_is_not_double_tagged(pids, store): - """Test tag_object succeeds when trying to tag a pid that already has a pid - refs file, and that a cid reference file that already contains cid.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(None, path) - store.tag_object(pid, object_metadata.cid) - store.tag_object(pid, object_metadata.cid) - - cid_refs_file_path = store._resolve_path("cid", object_metadata.cid) - line_count = 0 - with open(cid_refs_file_path, "r", encoding="utf8") as ref_file: - for _line in ref_file: - line_count += 1 - assert line_count == 
1 - - def test_tag_object_pid_refs_file_content(pids, store): """Test tag_object created the pid reference file with the expected cid.""" test_dir = "tests/testdata/" @@ -112,139 +66,80 @@ def test_tag_object_cid_refs_file_content(pids, store): assert pid_refs_cid == pid -def test_tag_object_cid_refs_update_refs_file_updated(store): - """Test tag_object updates a cid reference file that already exists.""" +def test_tag_object_pid_refs_found_cid_refs_found(pids, store): + """Test tag_object does not throws exception when the refs files already exist + and verifies the content, and does not double tag the cid refs file.""" test_dir = "tests/testdata/" - pid = "jtao.1700.1" - path = test_dir + pid.replace("/", "_") - # Store data only - object_metadata = store.store_object(None, path) - cid = object_metadata.cid - # Tag object - store.tag_object(pid, cid) - # Tag the cid with another pid - additional_pid = "dou.test.1" - store.tag_object(additional_pid, cid) - - # Read cid file to confirm cid refs file contains the additional pid - line_count = 0 - cid_ref_abs_path = store._resolve_path("cid", cid) - with open(cid_ref_abs_path, "r", encoding="utf8") as f: - for _, line in enumerate(f, start=1): - value = line.strip() - line_count += 1 - assert value == pid or value == additional_pid - assert line_count == 2 - - -def test_tag_object_cid_refs_update_pid_refs_created(store): - """Test tag_object creates a pid reference file when called to tag an object - that already exists.""" - test_dir = "tests/testdata/" - pid = "jtao.1700.1" - path = test_dir + pid.replace("/", "_") - # Store data only - object_metadata = store.store_object(None, path) - cid = object_metadata.cid - # Tag object - store.tag_object(pid, cid) - # Tag the cid with another pid - additional_pid = "dou.test.1" - store.tag_object(additional_pid, cid) + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(None, path) + cid = object_metadata.cid + store.tag_object(pid, cid) + store.tag_object(pid, cid) - pid_refs_file_path = store._resolve_path("pid", additional_pid) - assert os.path.exists(pid_refs_file_path) + cid_refs_file_path = store._resolve_path("cid", object_metadata.cid) + line_count = 0 + with open(cid_refs_file_path, "r", encoding="utf8") as ref_file: + for _line in ref_file: + line_count += 1 + assert line_count == 1 -def test_tag_object_cid_refs_update_pid_found_but_file_missing(store): - """Test tag_object creates a missing pid refs file that somehow disappeared - when called to tag a cid that already contains the pid.""" +def test_tag_object_pid_refs_found_cid_refs_not_found(store): + """Test that tag_object creates a missing cid refs file when called to tag a cid + with a pid whose associated pid refs file contains the given cid.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(pid, path) cid = object_metadata.cid - # Manually update the cid refs - # This means that the pid refs file missing at this point - additional_pid = "dou.test.1" - cid_ref_abs_path = store._resolve_path("cid", cid) - store._update_refs_file(cid_ref_abs_path, additional_pid, "add") - - # Confirm the pid refs file is missing - pid_refs_file_path = store._resolve_path("pid", additional_pid) - assert not os.path.exists(pid_refs_file_path) - # Call tag_object, this should create the missing pid refs file - store.tag_object(additional_pid, cid) + # Manually delete the cid refs file, creating an orphaned pid + 
cid_ref_abs_path = store._resolve_path("cid", cid) + os.remove(cid_ref_abs_path) + assert store._count("cid") == 0 - # Confirm it has been created - assert os.path.exists(pid_refs_file_path) + store.tag_object(pid, cid) + assert store._count("pid") == 1 + assert store._count("cid") == 1 -def test_tag_object_pid_refs_found_but_cid_arg_is_different(store): +def test_tag_object_pid_refs_found_cid_refs_not_found_different_cid_retrieved(store): """Test that tag_object throws an exception when pid refs file exists, contains a different cid, and is correctly referenced in the associated cid refs file""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - cid = object_metadata.cid - - pid_ref_abs_path = store._resolve_path("pid", pid) - tmp_root_path = store._get_store_path("refs") / "tmp" - tmp_pid_refs_file = store._write_refs_file(tmp_root_path, cid, "pid") - shutil.move(tmp_pid_refs_file, pid_ref_abs_path) + _object_metadata = store.store_object(pid, path) - # Attempt to tag the existing pid with valid refs file with pytest.raises(FileExistsError): - store.tag_object(pid, "bad_cid_value") + store.tag_object(pid, "another_cid_value_that_is_not_found") -def test_tag_object_pid_refs_found_but_missing_pid_in_cid_refs_file(store): - """Test tag_object completes as expected when pid refs file exists but is missing - (not tagged) from expected cid refs file.""" +def test_tag_object_pid_refs_not_found_cid_refs_found(store): + """Test tag_object updates a cid reference file that already exists.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) + # Store data only + object_metadata = store.store_object(None, path) cid = object_metadata.cid - - # Remove the pid from the cid refs file - cid_ref_abs_path = store._resolve_path("cid", cid) - store._update_refs_file(cid_ref_abs_path, pid, "remove") - assert not store._is_string_in_refs_file(pid, cid_ref_abs_path) - - # Tag object, this should add the missing pid to the cid refs file + # Tag object store.tag_object(pid, cid) - assert store._is_string_in_refs_file(pid, cid_ref_abs_path) - - # Confirm that there is only 1 of each expected file - assert store._count("objects") == 1 - assert store._count("pid") == 1 - assert store._count("cid") == 1 - - -def test_tag_object_pid_refs_found_but_cid_refs_file_not_found(store): - """Test tag_object completes when a pid refs file exists but the expected - cid refs file somehow disappeared.""" - test_dir = "tests/testdata/" - pid = "jtao.1700.1" - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - cid = object_metadata.cid + # Tag the cid with another pid + additional_pid = "dou.test.1" + store.tag_object(additional_pid, cid) - # Delete the cid refs file + # Read cid file to confirm cid refs file contains the additional pid + line_count = 0 cid_ref_abs_path = store._resolve_path("cid", cid) - os.remove(cid_ref_abs_path) - assert not os.path.exists(cid_ref_abs_path) - - # Tag object, this should create the missing pid refs file - store.tag_object(pid, cid) - assert os.path.exists(cid_ref_abs_path) - - # Confirm that there is only 1 of each expected file - assert store._count("objects") == 1 - assert store._count("pid") == 1 + with open(cid_ref_abs_path, "r", encoding="utf8") as f: + for _, line in enumerate(f, start=1): + value = line.strip() + line_count += 1 + assert value == pid or value == 
additional_pid + assert line_count == 2 + assert store._count("pid") == 2 assert store._count("cid") == 1 From 3eda728760b3484bd03db7832d4586697d43a7b7 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 14 May 2024 15:06:53 -0700 Subject: [PATCH 25/45] Update docstrings in hashstore.py --- src/hashstore/hashstore.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index b94762a3..cdb0b2bf 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -1,4 +1,5 @@ """Hashstore Interface""" + from abc import ABC, abstractmethod from collections import namedtuple import importlib.metadata @@ -25,12 +26,10 @@ def store_object( checksum_algorithm, expected_object_size, ): - """Atomic storage of objects to disk using a given stream. The `store_object` method - ensures atomic storage of objects to disk. Upon successful storage, it returns an - `ObjectMetadata` object containing relevant file information, such as the file's id - (used to locate the object on disk), the file's size, and a hex digest dictionary of - algorithms and checksums. The method also tags the object, creating references for - discoverability. + """Atomic storage of objects to disk using a given stream. Upon successful storage, + it returns an `ObjectMetadata` object containing relevant file information, such as + the file's id, the file's size, and a hex digest dictionary of algorithms and checksums. + The method also tags the object, creating references for discoverability. `store_object` ensures that an object is stored only once by synchronizing multiple calls and rejecting attempts to store duplicate objects. If called without a pid, it stores the @@ -67,9 +66,9 @@ def store_object( @abstractmethod def tag_object(self, pid, cid): - """Creates references that allow objects stored in HashStore to be discoverable. Retrieving, - deleting or calculating a hex digest of an object is based on a pid argument; and to - proceed, we must be able to find the object associated with the pid. + """Creates references that allow objects stored in HashStore to be discoverable. + Retrieving, deleting or calculating a hex digest of an object is based on a pid + argument, to proceed, we must be able to find the object associated with the pid. :param str pid: Authority-based or persistent identifier of the object. :param str cid: Content identifier of the object. @@ -112,8 +111,8 @@ def store_metadata(self, pid, metadata, format_id): `store_metadata` method uses a persistent identifier `pid` and a metadata `format_id` to determine the permanent address of the metadata object. All metadata documents for a given `pid` will be stored in a directory (under ../metadata) that is determined by - calculating the hash of the given pid, with the document name being the hash of the - metadata format (`format_id`). + calculating the hash of the given pid, with the document name being the hash of the pid + and metadata format (`pid` + `format_id`). Upon successful storage of metadata, the method returns a string representing the file's permanent address. Metadata objects are stored in parallel to objects in the @@ -161,7 +160,7 @@ def delete_object(self, ab_id, id_type): 'cid', only the object will be deleted if it is not referenced by other pids. :param str ab_id: Authority-based identifier. 
- :param str id_type: "pid" or "Cid + :param str id_type: "pid" or "cid" """ raise NotImplementedError() From a19d617ef074ab567dfbb99b139a3f8dd977860d Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 14 May 2024 15:41:59 -0700 Subject: [PATCH 26/45] Revise docstrings in 'test_filehashstore' module, and added new TODO items --- tests/test_filehashstore.py | 43 +++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 32b4d029..cd1e9fac 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -7,7 +7,6 @@ from hashstore.filehashstore import FileHashStore # pylint: disable=W0212 -# TODO: To Review def test_init_directories_created(store): @@ -154,7 +153,7 @@ def test_load_properties_hashstore_yaml_missing(store): def test_validate_properties(store): - """Confirm properties validated when all key/values are supplied.""" + """Confirm no exceptions are thrown when all key/values are supplied.""" properties = { "store_path": "/etc/test", "store_depth": 3, @@ -180,7 +179,7 @@ def test_validate_properties_missing_key(store): def test_validate_properties_key_value_is_none(store): - """Confirm exception raised when value from key is 'None'.""" + """Confirm exception raised when a value from a key is 'None'.""" properties = { "store_path": "/etc/test", "store_depth": 3, @@ -218,7 +217,7 @@ def test_set_default_algorithms_missing_yaml(store, pids): def test_store_and_validate_data_files_path(pids, store): - """Test _store_and_validate_data objects with path object for the path arg.""" + """Test _store_and_validate_data with path object for the path arg.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -229,7 +228,7 @@ def test_store_and_validate_data_files_path(pids, store): def test_store_and_validate_data_files_string(pids, store): - """Test _store_and_validate_data objects with string for the path arg.""" + """Test _store_and_validate_data with string for the path arg.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -240,7 +239,7 @@ def test_store_and_validate_data_files_string(pids, store): def test_store_and_validate_data_files_stream(pids, store): - """Test _store_and_validate_data objects with stream for the path arg.""" + """Test _store_and_validate_data with stream for the path arg.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -288,7 +287,8 @@ def test_store_and_validate_data_hex_digests(pids, store): def test_store_and_validate_data_additional_algorithm(pids, store): - """Check _store_and_validate_data returns additional algorithm in hex digests.""" + """Check _store_and_validate_data returns additional algorithm in hex digests + when provided an additional algo value.""" test_dir = "tests/testdata/" for pid in pids.keys(): algo = "sha224" @@ -364,7 +364,7 @@ def test_store_data_only_hex_digests(pids, store): def test_move_and_get_checksums_id(pids, store): - """Test move returns correct id.""" + """Test _move_and_get_checksums returns correct id.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -380,7 +380,7 @@ def test_move_and_get_checksums_id(pids, store): def test_move_and_get_checksums_file_size(pids, store): - """Test move returns correct file size.""" + """Test _move_and_get_checksums returns correct file size.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -396,7 
+396,7 @@ def test_move_and_get_checksums_file_size(pids, store): def test_move_and_get_checksums_hex_digests(pids, store): - """Test move returns correct hex digests.""" + """Test _move_and_get_checksums returns correct hex digests.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -415,8 +415,8 @@ def test_move_and_get_checksums_hex_digests(pids, store): assert hex_digests.get("sha512") == pids[pid]["sha512"] -def test_move_and_get_checksums_duplicates_raises_error(pids, store): - """Test move does not store duplicate objects and raises error.""" +def test_move_and_get_checksums_does_not_store_duplicate(pids, store): + """Test _move_and_get_checksums does not store duplicate objects.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -425,6 +425,19 @@ def test_move_and_get_checksums_duplicates_raises_error(pids, store): # pylint: disable=W0212 store._move_and_get_checksums(pid, input_stream) input_stream.close() + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + input_stream = io.open(path, "rb") + # pylint: disable=W0212 + store._move_and_get_checksums(pid, input_stream) + input_stream.close() + assert store._count(entity) == 3 + + +def test_move_and_get_checksums_raises_error_with_nonmatching_checksum(pids, store): + """Test _move_and_get_checksums raises error when incorrect checksum supplied.""" + test_dir = "tests/testdata/" + entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") @@ -437,7 +450,7 @@ def test_move_and_get_checksums_duplicates_raises_error(pids, store): checksum_algorithm="sha256", ) input_stream.close() - assert store._count(entity) == 3 + assert store._count(entity) == 0 def test_move_and_get_checksums_incorrect_file_size(pids, store): @@ -480,7 +493,7 @@ def test_write_to_tmp_file_and_get_hex_digests_additional_algo(store): def test_write_to_tmp_file_and_get_hex_digests_checksum_algo(store): """Test _write...hex_digests returns correct hex digests when given a checksum_algorithm - is provided.""" + and checksum.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -641,6 +654,7 @@ def test_put_metadata_with_string(pids, store): def test_put_metadata_cid(pids, store): """Test put metadata returns correct id.""" + # TODO: Review after fixing put_metadata's permanent address (pid+format_id) test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): @@ -1031,6 +1045,7 @@ def test_get_real_path_with_object_id_sharded(pids, store): def test_get_real_path_with_metadata_id(store, pids): """Test get_real_path returns absolute path given a metadata id.""" + # TODO: Review after fixing put_metadata's permanent address (pid+format_id) entity = "metadata" test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" From 6f860d98da31469f44926412378b3a327f094c0f Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 14 May 2024 15:51:38 -0700 Subject: [PATCH 27/45] Refactor 'tag_object' to use '_is_string_in_refs_file' to confirm whether a cid is in a pid refs file for consistency --- src/hashstore/filehashstore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 960b4583..373c752b 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -583,7 +583,7 @@ def tag_object(self, pid, cid): with open(pid_ref_abs_path, "r", 
encoding="utf8") as pid_ref_file: pid_refs_cid = pid_ref_file.read() - if pid_refs_cid == cid: + if self._is_string_in_refs_file(cid, pid_ref_abs_path): # The pid correctly references the given cid, but the cid refs file is missing cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid") self._create_path(os.path.dirname(cid_ref_abs_path)) From 3537e096c43d4b98f4769553911230e1021d4c06 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 14 May 2024 15:52:26 -0700 Subject: [PATCH 28/45] Add missing docstring item in 'ObjectMetadata' class --- src/hashstore/hashstore.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index cdb0b2bf..4e249018 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -248,6 +248,7 @@ class ObjectMetadata( the size of the object in bytes (`obj_size`), and an optional list of hex digests (`hex_digests`) to assist with validating objects. + :param str pid: An authority-based or persistent identifier :param str cid: A unique identifier for the object (Hash ID, hex digest). :param bytes obj_size: The size of the object in bytes. :param list hex_digests: A list of hex digests to validate objects From 66771d2376c27b1a5f636376752329e99f07a1bb Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 14 May 2024 16:15:42 -0700 Subject: [PATCH 29/45] Update README.md --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 3f5681ec..cfb4cf25 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ ## HashStore: hash-based object storage for DataONE data packages -- **Author**: Matthew B. Jones, Dou Mok, Jing Tao, Matthew Brooke +- **Author**: Dou Mok, Matthew Brooke, Jing Tao, Matthew B. Jones - **License**: [Apache 2](http://opensource.org/licenses/Apache-2.0) - [Package source code on GitHub](https://github.com/DataONEorg/hashstore) - [**Submit Bugs and feature requests**](https://github.com/DataONEorg/hashstore/issues) @@ -113,8 +113,7 @@ tag_object(pid, cid) **How do I delete an object if I have the pid?** - To delete an object and all its associated reference files, call the Public API method `delete_object` with `id_type` 'pid'. -- To delete only an object, call `delete_object` with `id_type` 'cid' which will remove the object if it it is not referenced by any pids. -- To delete an object and all its related data (reference files and system metadata), call the Public API method `delete_object` with `id_type` 'clear'. +- To delete only an object, call `delete_object` with `id_type` 'cid' which will remove the object if it is not referenced by any pids. - Note, `delete_object` and `tag_object` calls are synchronized on their content identifier values so that the shared reference files are not unintentionally modified concurrently. An object that is in the process of being deleted should not be tagged, and vice versa. These calls have been implemented to occur sequentially to improve clarity in the event of an unexpected conflict or issue. 
From 1ccab7865672b5af726d8d026ab366ac019f0f8d Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 20 May 2024 10:58:20 -0700 Subject: [PATCH 30/45] Add custom exception for when pid refs file is not found and update affected methods and tests --- src/hashstore/filehashstore.py | 22 ++++++++++++++++++---- tests/test_filehashstore_interface.py | 8 +++++--- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 373c752b..e8c97530 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -707,7 +707,7 @@ def find_object(self, pid): f"FileHashStore - find_object: pid refs file not found for pid ({pid}): " + pid_ref_abs_path ) - raise FileNotFoundError(err_msg) + raise PidRefsDoesNotExist(err_msg) def store_metadata(self, pid, metadata, format_id=None): logging.debug( @@ -842,11 +842,17 @@ def delete_object(self, ab_id, id_type=None): try: cid = self.find_object(pid) + except PidRefsDoesNotExist: + warn_msg = ( + "FileHashStore - delete_object: pid refs file does not exist for pid: " + + ab_id + + ". Skipping deletion request." + ) + logging.warning(warn_msg) + # Nothing to delete + return except FileNotFoundError as fnfe: fnfe_string = str(fnfe) - if "pid refs file not found" in fnfe_string: - # Nothing to delete - return if "cid refs file not found" in fnfe_string: # Delete pid refs file objects_to_delete.append( @@ -2248,3 +2254,11 @@ def close(self): self._obj.close() else: self._obj.seek(self._pos) + + +class PidRefsDoesNotExist(Exception): + """Custom exception thrown when a pid refs file does not exist.""" + + def __init__(self, message, errors=None): + super().__init__(message) + self.errors = errors diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 7dd20324..7bea57cf 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -9,6 +9,8 @@ import time import pytest +from hashstore.filehashstore import PidRefsDoesNotExist + # pylint: disable=W0212 @@ -612,7 +614,7 @@ def test_find_object_pid_refs_cid_not_found(pids, store): def test_find_object_pid_object_does_not_exist(store): """Test find object throws exception when object doesn't exist.""" - with pytest.raises(FileNotFoundError): + with pytest.raises(PidRefsDoesNotExist): store.find_object("dou.test.1") @@ -839,7 +841,7 @@ def test_retrieve_object_pid_invalid(store): """Test retrieve_object raises error when supplied with bad pid.""" pid = "jtao.1700.1" pid_does_not_exist = pid + "test" - with pytest.raises(FileNotFoundError): + with pytest.raises(PidRefsDoesNotExist): store.retrieve_object(pid_does_not_exist) @@ -1142,7 +1144,7 @@ def test_get_hex_digest_pid_not_found(store): pid = "jtao.1700.1" pid_does_not_exist = pid + "test" algorithm = "sha256" - with pytest.raises(FileNotFoundError): + with pytest.raises(PidRefsDoesNotExist): store.get_hex_digest(pid_does_not_exist, algorithm) From 513f86db88ebb6f98d5fb385188cb1e806e254f4 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 20 May 2024 11:01:35 -0700 Subject: [PATCH 31/45] Add custom exception for when cid refs file is not found and update affected methods and tests --- src/hashstore/filehashstore.py | 28 +++++++++++++++++---------- tests/test_filehashstore_interface.py | 4 ++-- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index e8c97530..3482c162 100644 --- a/src/hashstore/filehashstore.py +++ 
b/src/hashstore/filehashstore.py @@ -701,7 +701,7 @@ def find_object(self, pid): + f", but cid refs file not found: {cid_ref_abs_path}" ) logging.error(err_msg) - raise FileNotFoundError(err_msg) + raise CidRefsDoesNotExist(err_msg) else: err_msg = ( f"FileHashStore - find_object: pid refs file not found for pid ({pid}): " @@ -851,17 +851,17 @@ def delete_object(self, ab_id, id_type=None): logging.warning(warn_msg) # Nothing to delete return + except CidRefsDoesNotExist: + # Delete pid refs file + objects_to_delete.append( + self._rename_path_for_deletion(self._resolve_path("pid", pid)) + ) + # Remove all files confirmed for deletion + for obj in objects_to_delete: + os.remove(obj) + return except FileNotFoundError as fnfe: fnfe_string = str(fnfe) - if "cid refs file not found" in fnfe_string: - # Delete pid refs file - objects_to_delete.append( - self._rename_path_for_deletion(self._resolve_path("pid", pid)) - ) - # Remove all files confirmed for deletion - for obj in objects_to_delete: - os.remove(obj) - return if "object referenced does not exist" in fnfe_string: # Add pid refs file to be permanently deleted pid_ref_abs_path = self._resolve_path("pid", pid) @@ -2262,3 +2262,11 @@ class PidRefsDoesNotExist(Exception): def __init__(self, message, errors=None): super().__init__(message) self.errors = errors + + +class CidRefsDoesNotExist(Exception): + """Custom exception thrown when a cid refs file does not exist.""" + + def __init__(self, message, errors=None): + super().__init__(message) + self.errors = errors diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 7bea57cf..2002da56 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -9,7 +9,7 @@ import time import pytest -from hashstore.filehashstore import PidRefsDoesNotExist +from hashstore.filehashstore import CidRefsDoesNotExist, PidRefsDoesNotExist # pylint: disable=W0212 @@ -608,7 +608,7 @@ def test_find_object_pid_refs_cid_not_found(pids, store): pid_ref_file.write("intentionally.wrong.pid") pid_ref_file.truncate() - with pytest.raises(FileNotFoundError): + with pytest.raises(CidRefsDoesNotExist): store.find_object(pid) From 4a602761b2de0bbffdc3f7a14151bc9c39256ef4 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 20 May 2024 11:12:30 -0700 Subject: [PATCH 32/45] Add custom exception for when refs file exists but object is not found, update affected methods and add new pytest --- src/hashstore/filehashstore.py | 49 +++++++++++++++------------ tests/test_filehashstore_interface.py | 25 ++++++++++++-- 2 files changed, 50 insertions(+), 24 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 3482c162..d5ddb44e 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -683,7 +683,7 @@ def find_object(self, pid): + pid_ref_abs_path + f", but object referenced does not exist, cid: {pid_refs_cid}" ) - raise FileNotFoundError(err_msg) + raise RefsFileExistsButCidObjMissing(err_msg) else: return pid_refs_cid else: @@ -860,26 +860,24 @@ def delete_object(self, ab_id, id_type=None): for obj in objects_to_delete: os.remove(obj) return - except FileNotFoundError as fnfe: - fnfe_string = str(fnfe) - if "object referenced does not exist" in fnfe_string: - # Add pid refs file to be permanently deleted - pid_ref_abs_path = self._resolve_path("pid", pid) - objects_to_delete.append( - self._rename_path_for_deletion(pid_ref_abs_path) - ) - # Remove pid from cid refs file - with 
open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: - # Retrieve the cid - pid_refs_cid = pid_ref_file.read() - cid_ref_abs_path = self._resolve_path("cid", pid_refs_cid) - # Remove if the pid refs is found - if self._is_string_in_refs_file(pid, cid_ref_abs_path): - self._update_refs_file(cid_ref_abs_path, pid, "remove") - # Remove all files confirmed for deletion - for obj in objects_to_delete: - os.remove(obj) - return + except RefsFileExistsButCidObjMissing: + # Add pid refs file to be permanently deleted + pid_ref_abs_path = self._resolve_path("pid", pid) + objects_to_delete.append( + self._rename_path_for_deletion(pid_ref_abs_path) + ) + # Remove pid from cid refs file + with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: + # Retrieve the cid + pid_refs_cid = pid_ref_file.read() + cid_ref_abs_path = self._resolve_path("cid", pid_refs_cid) + # Remove if the pid refs is found + if self._is_string_in_refs_file(pid, cid_ref_abs_path): + self._update_refs_file(cid_ref_abs_path, pid, "remove") + # Remove all files confirmed for deletion + for obj in objects_to_delete: + os.remove(obj) + return except ValueError as ve: ve_string = str(ve) if "is missing from cid refs file" in ve_string: @@ -2270,3 +2268,12 @@ class CidRefsDoesNotExist(Exception): def __init__(self, message, errors=None): super().__init__(message) self.errors = errors + + +class RefsFileExistsButCidObjMissing(Exception): + """Custom exception thrown when pid and cid refs file exists, but the + cid object does not.""" + + def __init__(self, message, errors=None): + super().__init__(message) + self.errors = errors diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 2002da56..98425957 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -9,7 +9,11 @@ import time import pytest -from hashstore.filehashstore import CidRefsDoesNotExist, PidRefsDoesNotExist +from hashstore.filehashstore import ( + CidRefsDoesNotExist, + PidRefsDoesNotExist, + RefsFileExistsButCidObjMissing, +) # pylint: disable=W0212 @@ -593,7 +597,22 @@ def test_find_object(pids, store): assert cid == object_metadata.hex_digests.get("sha256") -def test_find_object_pid_refs_cid_not_found(pids, store): +def test_find_object_refs_exist_but_obj_not_found(pids, store): + """Test find_object throws exception when refs file exist but the object does not.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + store.store_object(pid, path) + + cid = store.find_object(pid) + obj_path = store._resolve_path("objects", cid) + os.remove(obj_path) + + with pytest.raises(RefsFileExistsButCidObjMissing): + store.find_object(pid) + + +def test_find_object_cid_refs_not_found(pids, store): """Test find_object throws exception when pid refs file is found with a cid but the cid does not exist.""" test_dir = "tests/testdata/" @@ -612,7 +631,7 @@ def test_find_object_pid_refs_cid_not_found(pids, store): store.find_object(pid) -def test_find_object_pid_object_does_not_exist(store): +def test_find_object_pid_refs_not_found(store): """Test find object throws exception when object doesn't exist.""" with pytest.raises(PidRefsDoesNotExist): store.find_object("dou.test.1") From 4ebe6faa24a105f4d26ba815668f70a68758c81e Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 20 May 2024 11:19:52 -0700 Subject: [PATCH 33/45] Add custom exception for when pid refs file exists but the pid is not found in its expected cid refs file, update 
affected methods and add new pytest --- src/hashstore/filehashstore.py | 33 ++++++++++++++++----------- tests/test_filehashstore_interface.py | 19 +++++++++++++++ 2 files changed, 39 insertions(+), 13 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index d5ddb44e..b63ca9b6 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -694,7 +694,7 @@ def find_object(self, pid): + f", but is missing from cid refs file: {cid_ref_abs_path}" ) logging.error(err_msg) - raise ValueError(err_msg) + raise PidNotFoundInCidRefsFile(err_msg) else: err_msg = ( f"FileHashStore - find_object: pid refs file exists with cid: {pid_refs_cid}" @@ -878,18 +878,16 @@ def delete_object(self, ab_id, id_type=None): for obj in objects_to_delete: os.remove(obj) return - except ValueError as ve: - ve_string = str(ve) - if "is missing from cid refs file" in ve_string: - # Add pid refs file to be permanently deleted - pid_ref_abs_path = self._resolve_path("pid", pid) - objects_to_delete.append( - self._rename_path_for_deletion(pid_ref_abs_path) - ) - # Remove all files confirmed for deletion - for obj in objects_to_delete: - os.remove(obj) - return + except PidNotFoundInCidRefsFile: + # Add pid refs file to be permanently deleted + pid_ref_abs_path = self._resolve_path("pid", pid) + objects_to_delete.append( + self._rename_path_for_deletion(pid_ref_abs_path) + ) + # Remove all files confirmed for deletion + for obj in objects_to_delete: + os.remove(obj) + return # Proceed with next steps - cid has been retrieved without any issues while cid in self.reference_locked_cids: @@ -2277,3 +2275,12 @@ class RefsFileExistsButCidObjMissing(Exception): def __init__(self, message, errors=None): super().__init__(message) self.errors = errors + + +class PidNotFoundInCidRefsFile(Exception): + """Custom exception thrown when pid reference file exists with a cid, but + the respective cid reference file does not contain the pid.""" + + def __init__(self, message, errors=None): + super().__init__(message) + self.errors = errors diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 98425957..53e40e55 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -11,6 +11,7 @@ from hashstore.filehashstore import ( CidRefsDoesNotExist, + PidNotFoundInCidRefsFile, PidRefsDoesNotExist, RefsFileExistsButCidObjMissing, ) @@ -631,6 +632,24 @@ def test_find_object_cid_refs_not_found(pids, store): store.find_object(pid) +def test_find_object_cid_refs_does_not_contain_pid(pids, store): + """Test find_object throws exception when pid refs file is found with a cid + but the cid refs file does not contain the pid.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + + # Remove the pid from the cid refs file + cid_ref_abs_path = store._resolve_path( + "cid", object_metadata.hex_digests.get("sha256") + ) + store._update_refs_file(cid_ref_abs_path, pid, "remove") + + with pytest.raises(PidNotFoundInCidRefsFile): + store.find_object(pid) + + def test_find_object_pid_refs_not_found(store): """Test find object throws exception when object doesn't exist.""" with pytest.raises(PidRefsDoesNotExist): From 854c99fbb99f19bda68216f1199ebfc3a0e55e0f Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 20 May 2024 12:47:17 -0700 Subject: [PATCH 34/45] Refactor 'delete_object' to improve clarity and include missing file 
locks/synchronization --- src/hashstore/filehashstore.py | 287 ++++++++++++++++++++++----------- 1 file changed, 190 insertions(+), 97 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index b63ca9b6..c4ea3385 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -824,123 +824,216 @@ def delete_object(self, ab_id, id_type=None): cid_refs_abs_path = self._resolve_path("cid", ab_id) # If the refs file still exists, do not delete the object if not os.path.exists(cid_refs_abs_path): - self._delete("objects", ab_id) + cid = ab_id + # Synchronize the cid + while cid in self.reference_locked_cids: + logging.debug( + "FileHashStore - delete_object: (cid) %s is currently locked. Waiting", + cid, + ) + time.sleep(self.time_out_sec) + # Modify reference_locked_cids consecutively + with self.reference_lock: + logging.debug( + "FileHashStore - delete_object: Add cid: %s to reference_locked_cids.", + cid, + ) + self.reference_locked_cids.append(ab_id) + + try: + self._delete("objects", ab_id) + finally: + # Release cid + with self.reference_lock: + logging.debug( + "FileHashStore - delete_object: Removing cid: %s from" + + "reference_locked_cids.", + cid, + ) + self.reference_locked_cids.remove(cid) else: # id_type is "pid" pid = ab_id - # Create a list of objects to delete to minimize delay objects_to_delete = [] - # Get the metadata documents to delete + # Get the metadata documents to minimize time spent in synchronization rel_path = "/".join(self._shard(self._computehash(pid))) metadata_rel_path = self._get_store_path("metadata") / rel_path metadata_file_paths = self._get_file_paths(metadata_rel_path) - # Add these files to be permanently deleted - if metadata_file_paths is not None: - for path in metadata_file_paths: - # Rename files by appending _delete to the file name - objects_to_delete.append(self._rename_path_for_deletion(path)) - - try: - cid = self.find_object(pid) - except PidRefsDoesNotExist: - warn_msg = ( - "FileHashStore - delete_object: pid refs file does not exist for pid: " - + ab_id - + ". Skipping deletion request." 
- ) - logging.warning(warn_msg) - # Nothing to delete - return - except CidRefsDoesNotExist: - # Delete pid refs file - objects_to_delete.append( - self._rename_path_for_deletion(self._resolve_path("pid", pid)) - ) - # Remove all files confirmed for deletion - for obj in objects_to_delete: - os.remove(obj) - return - except RefsFileExistsButCidObjMissing: - # Add pid refs file to be permanently deleted - pid_ref_abs_path = self._resolve_path("pid", pid) - objects_to_delete.append( - self._rename_path_for_deletion(pid_ref_abs_path) - ) - # Remove pid from cid refs file - with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: - # Retrieve the cid - pid_refs_cid = pid_ref_file.read() - cid_ref_abs_path = self._resolve_path("cid", pid_refs_cid) - # Remove if the pid refs is found - if self._is_string_in_refs_file(pid, cid_ref_abs_path): - self._update_refs_file(cid_ref_abs_path, pid, "remove") - # Remove all files confirmed for deletion - for obj in objects_to_delete: - os.remove(obj) - return - except PidNotFoundInCidRefsFile: - # Add pid refs file to be permanently deleted - pid_ref_abs_path = self._resolve_path("pid", pid) - objects_to_delete.append( - self._rename_path_for_deletion(pid_ref_abs_path) - ) - # Remove all files confirmed for deletion - for obj in objects_to_delete: - os.remove(obj) - return - # Proceed with next steps - cid has been retrieved without any issues - while cid in self.reference_locked_cids: + # Storing and deleting objects are synchronized together + # Duplicate store object requests for a pid are rejected, but deleting an object + # will wait for a pid to be released if it's found to be in use before proceeding. + while pid in self.object_locked_pids: logging.debug( - "FileHashStore - delete_object: (cid) %s is currently locked. Waiting", - cid, + "FileHashStore - delete_object: pid (%s) is currently locked. Waiting.", + pid, ) time.sleep(self.time_out_sec) - # Modify reference_locked_cids consecutively - with self.reference_lock: + # Modify object_locked_pids consecutively + with self.object_lock: logging.debug( - "FileHashStore - delete_object: Adding cid: %s to reference_locked_cids.", - cid, + "FileHashStore - store_object: Adding pid: %s to object_locked_pids.", + pid, ) - self.reference_locked_cids.append(cid) + self.object_locked_pids.append(pid) + try: - cid_ref_abs_path = self._resolve_path("cid", cid) - pid_ref_abs_path = self._resolve_path("pid", pid) - # Add pid refs file to be permanently deleted - objects_to_delete.append( - self._rename_path_for_deletion(pid_ref_abs_path) - ) - # Remove pid from cid reference file - self._update_refs_file(cid_ref_abs_path, pid, "remove") - # Delete cid reference file and object only if the cid refs file is empty - if os.path.getsize(cid_ref_abs_path) == 0: + # Before we begin deletion process, we look for the `cid` by calling + # `find_object` which will throw custom exceptions if there is an issue with + # the reference files, which help us determine the path to proceed with. + try: + cid = self.find_object(pid) + + # Proceed with next steps - cid has been retrieved without any issues + # We must synchronized here based on the `cid` because multiple threads may + # try to access the `cid_reference_file` + while cid in self.reference_locked_cids: + logging.debug( + "FileHashStore - delete_object: (cid) %s is currently locked. 
Waiting", + cid, + ) + time.sleep(self.time_out_sec) + # Modify reference_locked_cids consecutively + with self.reference_lock: + logging.debug( + "FileHashStore - delete_object: Add cid: %s to reference_locked_cids.", + cid, + ) + self.reference_locked_cids.append(cid) + + try: + cid_ref_abs_path = self._resolve_path("cid", cid) + pid_ref_abs_path = self._resolve_path("pid", pid) + # Add pid refs file to be permanently deleted + objects_to_delete.append( + self._rename_path_for_deletion(pid_ref_abs_path) + ) + # Remove pid from cid reference file + self._update_refs_file(cid_ref_abs_path, pid, "remove") + # Delete cid reference file and object only if the cid refs file is empty + if os.path.getsize(cid_ref_abs_path) == 0: + objects_to_delete.append( + self._rename_path_for_deletion(cid_ref_abs_path) + ) + obj_real_path = self._resolve_path("objects", cid) + objects_to_delete.append( + self._rename_path_for_deletion(obj_real_path) + ) + # Remove metadata files if they exist + if metadata_file_paths is not None: + for path in metadata_file_paths: + # Rename files by appending _delete to the file name + objects_to_delete.append( + self._rename_path_for_deletion(path) + ) + # Remove all files confirmed for deletion + for obj in objects_to_delete: + os.remove(obj) + + info_string = ( + "FileHashStore - delete_object: Successfully deleted references," + + f" metadata and object associated with pid: {pid}" + ) + logging.info(info_string) + return + + finally: + # Release cid + with self.reference_lock: + debug_msg = ( + "FileHashStore - delete_object:" + + f" Removing cid: {cid} from reference_locked_cids." + ) + logging.debug(debug_msg) + self.reference_locked_cids.remove(cid) + + except PidRefsDoesNotExist: + warn_msg = ( + "FileHashStore - delete_object: pid refs file does not exist for pid: " + + ab_id + + ". Skipping deletion request." 
+ ) + logging.warning(warn_msg) + + # Remove metadata files if they exist + if metadata_file_paths is not None: + for path in metadata_file_paths: + # Rename files by appending _delete to the file name + objects_to_delete.append( + self._rename_path_for_deletion(path) + ) + # Remove all files confirmed for deletion + for obj in objects_to_delete: + os.remove(obj) + return + + except CidRefsDoesNotExist: + # Delete pid refs file objects_to_delete.append( - self._rename_path_for_deletion(cid_ref_abs_path) + self._rename_path_for_deletion(self._resolve_path("pid", pid)) ) - obj_real_path = self._resolve_path("objects", cid) + # Remove metadata files if they exist + if metadata_file_paths is not None: + for path in metadata_file_paths: + # Rename files by appending _delete to the file name + objects_to_delete.append( + self._rename_path_for_deletion(path) + ) + # Remove all files confirmed for deletion + for obj in objects_to_delete: + os.remove(obj) + return + + except RefsFileExistsButCidObjMissing: + # Add pid refs file to be permanently deleted + pid_ref_abs_path = self._resolve_path("pid", pid) objects_to_delete.append( - self._rename_path_for_deletion(obj_real_path) + self._rename_path_for_deletion(pid_ref_abs_path) ) - # Remove all files confirmed for deletion - for obj in objects_to_delete: - os.remove(obj) - - info_string = ( - "FileHashStore - delete_object: Successfully deleted references, metadata and" - + f" object associated with pid: {pid}" - ) - logging.info(info_string) - return - + # Remove pid from cid refs file + with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: + # Retrieve the cid + pid_refs_cid = pid_ref_file.read() + cid_ref_abs_path = self._resolve_path("cid", pid_refs_cid) + # Remove if the pid refs is found + if self._is_string_in_refs_file(pid, cid_ref_abs_path): + self._update_refs_file(cid_ref_abs_path, pid, "remove") + # Remove metadata files if they exist + if metadata_file_paths is not None: + for path in metadata_file_paths: + # Rename files by appending _delete to the file name + objects_to_delete.append( + self._rename_path_for_deletion(path) + ) + # Remove all files confirmed for deletion + for obj in objects_to_delete: + os.remove(obj) + return + + except PidNotFoundInCidRefsFile: + # Add pid refs file to be permanently deleted + pid_ref_abs_path = self._resolve_path("pid", pid) + objects_to_delete.append( + self._rename_path_for_deletion(pid_ref_abs_path) + ) + if metadata_file_paths is not None: + for path in metadata_file_paths: + # Rename files by appending _delete to the file name + objects_to_delete.append( + self._rename_path_for_deletion(path) + ) + # Remove all files confirmed for deletion + for obj in objects_to_delete: + os.remove(obj) + return finally: - # Release cid - with self.reference_lock: - debug_msg = ( - "FileHashStore - delete_object:" - + f" Removing cid: {cid} from reference_locked_cids." 
+ # Release pid + with self.object_lock: + logging.debug( + "FileHashStore - delete_object: Removing pid: %s from object_locked_pids.", + pid, ) - logging.debug(debug_msg) - self.reference_locked_cids.remove(cid) + self.object_locked_pids.remove(pid) def delete_metadata(self, pid, format_id=None): logging.debug( From 781e6ce05bc59deec825dd6e7e50370362863165 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 20 May 2024 12:57:16 -0700 Subject: [PATCH 35/45] Remove redundant methods 'has_subdir' and 'remove_empty' and update pytests --- src/hashstore/filehashstore.py | 32 ------------- tests/test_filehashstore.py | 85 ---------------------------------- 2 files changed, 117 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index c4ea3385..1801b2f3 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -2056,38 +2056,6 @@ def _rename_path_for_deletion(path): shutil.move(path, delete_path) return delete_path - def _remove_empty(self, subpath): - """Successively remove all empty folders starting with `subpath` and - proceeding "up" through directory tree until reaching the `root` - folder. - - :param str subpath: Name of directory. - """ - # Don't attempt to remove any folders if subpath is not a - # subdirectory of the root directory. - if not self._has_subdir(subpath): - return - - while subpath != self.root: - if len(os.listdir(subpath)) > 0 or os.path.islink(subpath): - break - os.rmdir(subpath) - subpath = os.path.dirname(subpath) - - def _has_subdir(self, path): - """Return whether `path` is a subdirectory of the `root` directory. - - :param str path: Name of path. - - :return: `True` if subdirectory. - :rtype: bool - """ - # Append os.sep so that paths like /usr/var2/log doesn't match /usr/var. - root_path = os.path.realpath(self.root) + os.sep - subpath = os.path.realpath(path) - is_subdir = subpath.startswith(root_path) - return is_subdir - def _create_path(self, path): """Physically create the folder path (and all intermediate ones) on disk. 
diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index cd1e9fac..50ae0ee7 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -916,91 +916,6 @@ def test_delete_with_object_metadata_id(pids, store): assert store._count(entity) == 0 -def test_remove_empty_removes_empty_folders_string(store): - """Test empty folders (via string) are removed.""" - three_dirs = "dir1/dir2/dir3" - two_dirs = "dir1/dir4" - one_dir = "dir5" - os.makedirs(os.path.join(store.root, three_dirs)) - os.makedirs(os.path.join(store.root, two_dirs)) - os.makedirs(os.path.join(store.root, one_dir)) - assert os.path.exists(os.path.join(store.root, three_dirs)) - assert os.path.exists(os.path.join(store.root, two_dirs)) - assert os.path.exists(os.path.join(store.root, one_dir)) - # pylint: disable=W0212 - store._remove_empty(os.path.join(store.root, three_dirs)) - store._remove_empty(os.path.join(store.root, two_dirs)) - store._remove_empty(os.path.join(store.root, one_dir)) - assert not os.path.exists(os.path.join(store.root, three_dirs)) - assert not os.path.exists(os.path.join(store.root, two_dirs)) - assert not os.path.exists(os.path.join(store.root, one_dir)) - - -def test_remove_empty_removes_empty_folders_path(store): - """Test empty folders (via Path object) are removed.""" - three_dirs = Path("dir1/dir2/dir3") - two_dirs = Path("dir1/dir4") - one_dir = Path("dir5") - (store.root / three_dirs).mkdir(parents=True) - (store.root / two_dirs).mkdir(parents=True) - (store.root / one_dir).mkdir(parents=True) - assert (store.root / three_dirs).exists() - assert (store.root / two_dirs).exists() - assert (store.root / one_dir).exists() - # pylint: disable=W0212 - store._remove_empty(store.root / three_dirs) - store._remove_empty(store.root / two_dirs) - store._remove_empty(store.root / one_dir) - assert not (store.root / three_dirs).exists() - assert not (store.root / two_dirs).exists() - assert not (store.root / one_dir).exists() - - -def test_remove_empty_does_not_remove_nonempty_folders(pids, store): - """Test non-empty folders are not removed.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store._store_and_validate_data(pid, path) - object_metadata_shard = store._shard(object_metadata.cid) - object_metadata_shard_path = "/".join(object_metadata_shard) - # Get parent directory of the relative path - parent_dir = os.path.dirname(object_metadata_shard_path) - # Attempt to remove the parent directory - # pylint: disable=W0212 - store._remove_empty(parent_dir) - abs_parent_dir = store.objects + "/" + parent_dir - assert os.path.exists(abs_parent_dir) - - -def test_has_subdir_subdirectory_string(store): - """Test that subdirectory is recognized.""" - sub_dir = store.root + "/filehashstore/test" - os.makedirs(sub_dir) - # pylint: disable=W0212 - is_sub_dir = store._has_subdir(sub_dir) - assert is_sub_dir - - -def test_has_subdir_subdirectory_path(store): - """Test that subdirectory is recognized.""" - sub_dir = Path(store.root) / "filehashstore" / "test" - sub_dir.mkdir(parents=True) - # pylint: disable=W0212 - is_sub_dir = store._has_subdir(sub_dir) - assert is_sub_dir - - -def test_has_subdir_non_subdirectory(store): - """Test that non-subdirectory is not recognized.""" - parent_dir = os.path.dirname(store.root) - non_sub_dir = parent_dir + "/filehashstore/test" - os.makedirs(non_sub_dir) - # pylint: disable=W0212 - is_sub_dir = store._has_subdir(non_sub_dir) - assert not is_sub_dir - - def 
test_create_path(pids, store): """Test makepath creates folder successfully.""" for pid in pids: From be1e27b7e5632d7a200aff978c3c63fdc3ea2846 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 21 May 2024 11:58:33 -0700 Subject: [PATCH 36/45] Fix bug with incorrect stored metadata document name and update all pytests --- src/hashstore/filehashstore.py | 8 ++++---- tests/test_filehashstore.py | 2 +- tests/test_filehashstore_interface.py | 10 +++++----- tests/test_hashstore_client.py | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 1801b2f3..baec42cc 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -794,9 +794,9 @@ def retrieve_metadata(self, pid, format_id=None): entity = "metadata" metadata_directory = self._computehash(pid) if format_id is None: - metadata_document_name = self._computehash(self.sysmeta_ns) + metadata_document_name = self._computehash(pid + self.sysmeta_ns) else: - metadata_document_name = self._computehash(checked_format_id) + metadata_document_name = self._computehash(pid + checked_format_id) rel_path = "/".join(self._shard(metadata_directory)) full_path_without_directory = rel_path + "/" + metadata_document_name metadata_exists = self._exists(entity, full_path_without_directory) @@ -1065,7 +1065,7 @@ def delete_metadata(self, pid, format_id=None): return else: # Delete a specific metadata file - metadata_document_name = self._computehash(checked_format_id) + metadata_document_name = self._computehash(pid + checked_format_id) full_path_without_directory = rel_path + "/" + metadata_document_name metadata_exists = self._exists(entity, full_path_without_directory) if metadata_exists: @@ -1598,7 +1598,7 @@ def _put_metadata(self, metadata, pid, format_id): # Get target and related paths (permanent location) metadata_directory = self._computehash(pid) - metadata_document_name = self._computehash(format_id) + metadata_document_name = self._computehash(pid + format_id) rel_path = "/".join(self._shard(metadata_directory)) full_path = self._get_store_path("metadata") / rel_path / metadata_document_name diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 50ae0ee7..5c7acdfe 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -664,7 +664,7 @@ def test_put_metadata_cid(pids, store): # Manually calculate expected path metadata_directory = store._computehash(pid) - metadata_document_name = store._computehash(format_id) + metadata_document_name = store._computehash(pid + format_id) rel_path = "/".join(store._shard(metadata_directory)) full_path = ( store._get_store_path("metadata") / rel_path / metadata_document_name diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 53e40e55..af76998a 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -679,7 +679,7 @@ def test_store_metadata(pids, store): metadata_cid = store.store_metadata(pid, syspath, format_id) # Manually calculate expected path metadata_directory = store._computehash(pid) - metadata_document_name = store._computehash(format_id) + metadata_document_name = store._computehash(pid + format_id) rel_path = "/".join(store._shard(metadata_directory)) full_path = ( store._get_store_path("metadata") / rel_path / metadata_document_name @@ -703,9 +703,9 @@ def test_store_metadata_one_pid_multiple_docs_correct_location(store): metadata_cid = store.store_metadata(pid, syspath, 
format_id) metadata_cid3 = store.store_metadata(pid, syspath, format_id3) metadata_cid4 = store.store_metadata(pid, syspath, format_id4) - metadata_document_name = store._computehash(format_id) - metadata_document_name3 = store._computehash(format_id3) - metadata_document_name4 = store._computehash(format_id4) + metadata_document_name = store._computehash(pid + format_id) + metadata_document_name3 = store._computehash(pid + format_id3) + metadata_document_name4 = store._computehash(pid + format_id4) full_path = store._get_store_path("metadata") / rel_path / metadata_document_name full_path3 = store._get_store_path("metadata") / rel_path / metadata_document_name3 full_path4 = store._get_store_path("metadata") / rel_path / metadata_document_name4 @@ -725,7 +725,7 @@ def test_store_metadata_default_format_id(pids, store): metadata_cid = store.store_metadata(pid, syspath) # Manually calculate expected path metadata_directory = store._computehash(pid) - metadata_document_name = store._computehash(format_id) + metadata_document_name = store._computehash(pid + format_id) rel_path = "/".join(store._shard(metadata_directory)) full_path = ( store._get_store_path("metadata") / rel_path / metadata_document_name diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index af9997ea..f6db7ed5 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -169,7 +169,7 @@ def test_store_metadata(capsys, store, pids): hashstoreclient.main() metadata_directory = store._computehash(pid) - metadata_document_name = store._computehash(namespace) + metadata_document_name = store._computehash(pid + namespace) rel_path = "/".join(store._shard(metadata_directory)) full_path = ( store._get_store_path("metadata") / rel_path / metadata_document_name From 57af22fb1882c5bfae7615ea92d651b1e33ea5b0 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 21 May 2024 12:05:39 -0700 Subject: [PATCH 37/45] Remove resolved todo items from 'test_filehashstore' --- tests/test_filehashstore.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 5c7acdfe..4150d800 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -654,7 +654,6 @@ def test_put_metadata_with_string(pids, store): def test_put_metadata_cid(pids, store): """Test put metadata returns correct id.""" - # TODO: Review after fixing put_metadata's permanent address (pid+format_id) test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): @@ -960,7 +959,6 @@ def test_get_real_path_with_object_id_sharded(pids, store): def test_get_real_path_with_metadata_id(store, pids): """Test get_real_path returns absolute path given a metadata id.""" - # TODO: Review after fixing put_metadata's permanent address (pid+format_id) entity = "metadata" test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" From 58d5e0f34637dcfdba2e8d0e8acb33f87a8ee058 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 21 May 2024 12:09:12 -0700 Subject: [PATCH 38/45] Update README hashstore layout example --- README.md | 76 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 64 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index cfb4cf25..ae14f610 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,7 @@ properties = { # Get HashStore from factory module_name = "hashstore.filehashstore.filehashstore" class_name = "FileHashStore" -my_store = 
factory.get_hashstore(module_name, class_name, properties) +my_store = hashstore_factory.get_hashstore(module_name, class_name, properties) # Store objects (.../[hashstore_path]/objects/) pid = "j.tao.1700.1" @@ -140,21 +140,21 @@ These reference files are implemented in HashStore underneath the hood with no e **'pid' Reference Files** - Pid (persistent identifier) reference files are created when storing an object with an identifier. -- Pid reference files are located in HashStores '/refs/pid' directory +- Pid reference files are located in HashStores '/refs/pids' directory - If an identifier is not available at the time of storing an object, the calling app/client must create this association between a pid and the object it represents by calling `tag_object` separately. - Each pid reference file contains a string that represents the content identifier of the object it references - Like how objects are stored once and only once, there is also only one pid reference file for each object. **'cid' Reference Files** - Cid (content identifier) reference files are created at the same time as pid reference files when storing an object with an identifier. -- Cid reference files are located in HashStore's '/refs/cid' directory +- Cid reference files are located in HashStore's '/refs/cids' directory - A cid reference file is a list of all the pids that reference a cid, delimited by a new line ("\n") character ###### What does HashStore look like? ```shell -# Example layout in HashStore with a single file stored along with its metadata and reference files. +# Example layout in HashStore with three files stored along with its metadata and reference files. # This uses a store depth of 3, with a width of 2 and "SHA-256" as its default store algorithm ## Notes: ## - Objects are stored using their content identifier as the file address @@ -162,14 +162,66 @@ These reference files are implemented in HashStore underneath the hood with no e ## - The reference file for each cid contains multiple pids each on its own line .../metacat/hashstore/ -└─ objects - └─ /d5/95/3b/d802fa74edea72eb941...00d154a727ed7c2 -└─ metadata - └─ /15/8d/7e/55c36a810d7c14479c9...b20d7df66768b04 -└─ refs - └─ pid/0d/55/5e/d77052d7e166017f779...7230bcf7abcef65e - └─ cid/d5/95/3b/d802fa74edea72eb941...00d154a727ed7c2 -hashstore.yaml + ├── hashstore.yaml + ├── objects + | ├── 4d + | │ └── 19 + | │ └── 81 + | | └── 71eef969d553d4c9537b1811a7b078f9a3804fc978a761bc014c05972c + | ├── 94 + | │ └── f9 + | │ └── b6 + | | └── c88f1f458e410c30c351c6384ea42ac1b5ee1f8430d3e365e43b78a38a + | └── 44 + | └── 73 + | └── 51 + | └── 6a592209cbcd3a7ba4edeebbdb374ee8e4a49d19896fafb8f278dc25fa + └── metadata + | ├── 0d + | │ └── 55 + | │ └── 55 + | | └── 5ed77052d7e166017f779cbc193357c3a5006ee8b8457230bcf7abcef65e + | | └── 323e0799524cec4c7e14d31289cefd884b563b5c052f154a066de5ec1e477da7 + | | └── sha256(pid+formatId_annotations) + | ├── a8 + | │ └── 24 + | │ └── 19 + | | └── 25740d5dcd719596639e780e0a090c9d55a5d0372b0eaf55ed711d4edf + | | └── ddf07952ef28efc099d10d8b682480f7d2da60015f5d8873b6e1ea75b4baf689 + | | └── sha256(pid+formatId_annotations) + | └── 7f + | └── 5c + | └── c1 + | └── 8f0b04e812a3b4c8f686ce34e6fec558804bf61e54b176742a7f6368d6 + | └── 9a2e08c666b728e6cbd04d247b9e556df3de5b2ca49f7c5a24868eb27cddbff2 + | └── sha256(pid+formatId_annotations) + └── refs + ├── cids + | ├── 4d + | | └── 19 + | | └── 81 + | | └── 71eef969d553d4c9537b1811a7b078f9a3804fc978a761bc014c05972c + | ├── 94 + | │ └── f9 + | │ └── b6 + | | └── 
c88f1f458e410c30c351c6384ea42ac1b5ee1f8430d3e365e43b78a38a + | └── 44 + | └── 73 + | └── 51 + | └── 6a592209cbcd3a7ba4edeebbdb374ee8e4a49d19896fafb8f278dc25fa + └── pids + ├── 0d + | └── 55 + | └── 55 + | └── 5ed77052d7e166017f779cbc193357c3a5006ee8b8457230bcf7abcef65e + ├── a8 + │ └── 24 + │ └── 19 + | └── 25740d5dcd719596639e780e0a090c9d55a5d0372b0eaf55ed711d4edf + └── 7f + └── 5c + └── c1 + └── 8f0b04e812a3b4c8f686ce34e6fec558804bf61e54b176742a7f6368d6 ``` ## Development build From 523024e54a1f3c12b8cab742ecd03936481738ff Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 21 May 2024 12:51:21 -0700 Subject: [PATCH 39/45] Add new pytests for '_resolve_path' method --- src/hashstore/filehashstore.py | 2 ++ tests/test_filehashstore.py | 66 ++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index baec42cc..5342f415 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -2109,6 +2109,8 @@ def _resolve_path(self, entity, file): if entity == "objects": rel_root = self.objects if entity == "metadata": + # TODO: The resolve_path method is not consistent in its usage regarding metadata + # Review and refactor when time permitting. rel_root = self.metadata relpath = os.path.join(rel_root, file) if os.path.isfile(relpath): diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 4150d800..ffd012c1 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -1009,3 +1009,69 @@ def test_cast_to_bytes(store): # pylint: disable=W0212 string_bytes = store._cast_to_bytes(string) assert isinstance(string_bytes, bytes) + + +def test_resolve_path_objects(pids, store): + """Confirm resolve path returns correct object path""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = Path(test_dir + pid.replace("/", "_")) + object_metadata = store.store_object(pid, path) + cid = object_metadata.cid + + obj_resolved_path = store._resolve_path("objects", cid) + calculated_obj_path = store.objects + "/" + "/".join(store._shard(cid)) + + assert calculated_obj_path == obj_resolved_path + + +def test_resolve_path_metadata(pids, store): + """Confirm resolve path returns correct metadata path.""" + test_dir = "tests/testdata/" + format_id = "http://ns.dataone.org/service/types/v2.0" + for pid in pids.keys(): + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + _metadata_cid = store.store_metadata(pid, syspath, format_id) + + metadata_directory = store._computehash(pid) + metadata_document_name = store._computehash(pid + format_id) + rel_path = "/".join(store._shard(metadata_directory)) + full_path_without_dir = rel_path + "/" + metadata_document_name + + metadata_resolved_path = store._resolve_path("metadata", full_path_without_dir) + calculated_metadata_path = ( + store.metadata + "/" + rel_path + "/" + metadata_document_name + ) + + assert calculated_metadata_path == metadata_resolved_path + + +def test_resolve_path_refs_pid(pids, store): + """Confirm resolve path returns correct object pid refs path""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = Path(test_dir + pid.replace("/", "_")) + _object_metadata = store.store_object(pid, path) + + resolved_pid_ref_abs_path = store._resolve_path("pid", pid) + pid_refs_metadata_hashid = store._computehash(pid) + calculated_pid_ref_path = ( + store.refs + "/pid/" + "/".join(store._shard(pid_refs_metadata_hashid)) + ) + + assert resolved_pid_ref_abs_path == 
calculated_pid_ref_path + + +def test_resolve_path_refs_cid(pids, store): + """Confirm resolve path returns correct object pid refs path""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = Path(test_dir + pid.replace("/", "_")) + object_metadata = store.store_object(pid, path) + cid = object_metadata.cid + + resolved_cid_ref_abs_path = store._resolve_path("cid", cid) + calculated_cid_ref_path = store.refs + "/cid/" + "/".join(store._shard(cid)) + + assert resolved_cid_ref_abs_path == calculated_cid_ref_path From 84e00d4e3b3c40e9c3160d32e2507e9fc61623b5 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 21 May 2024 15:34:55 -0700 Subject: [PATCH 40/45] Revise '_resolve_path' method to improve clarity --- src/hashstore/filehashstore.py | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 5342f415..745fbca9 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -2100,24 +2100,27 @@ def _resolve_path(self, entity, file): :return: Path to file :rtype: str """ - # Check for absolute path. - if os.path.isfile(file): - return file - # Check for relative path. - rel_root = "" if entity == "objects": rel_root = self.objects - if entity == "metadata": + relpath = os.path.join(rel_root, file) + if os.path.isfile(relpath): + return relpath + else: + abspath = self._build_path(entity, file) + if os.path.isfile(abspath): + return abspath + elif entity == "metadata": # TODO: The resolve_path method is not consistent in its usage regarding metadata # Review and refactor when time permitting. + if os.path.isfile(file): + return file rel_root = self.metadata - relpath = os.path.join(rel_root, file) - if os.path.isfile(relpath): - return relpath - + relpath = os.path.join(rel_root, file) + if os.path.isfile(relpath): + return relpath # Check for sharded path. - if entity == "cid": + elif entity == "cid": # Note, we skip checking whether the file exists for refs cid_ref_file_abs_path = self._build_path(entity, file) return cid_ref_file_abs_path @@ -2127,9 +2130,11 @@ def _resolve_path(self, entity, file): pid_ref_file_abs_path = self._build_path(entity, hash_id) return pid_ref_file_abs_path else: - abspath = self._build_path(entity, file) - if os.path.isfile(abspath): - return abspath + exception_string = ( + "FileHashStore - _resolve_path: entity must be" + + " 'object', 'metadata', 'cid' or 'pid" + ) + raise ValueError(exception_string) def _get_store_path(self, entity): """Return a path object of the root directory of the store. 
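For orientation before the next patch, a rough sketch of the sharded-path convention that the new `_resolve_path` pytests above assert. This is illustrative only and assumes the README defaults (store depth 3, width 2, SHA-256 as the store algorithm); `shard` stands in for the store's internal `_shard` helper, and the `objects/` and `metadata/` prefixes are placeholders for the store's root directories.

```python
import hashlib


def shard(digest, depth=3, width=2):
    # e.g. "4d198171..." -> ["4d", "19", "81", "71..."] (remainder is the file name)
    tokens = [digest[i * width:(i + 1) * width] for i in range(depth)]
    tokens.append(digest[depth * width:])
    return tokens


# Objects are addressed by the sharded content identifier
cid = hashlib.sha256(b"object bytes").hexdigest()
object_path = "objects/" + "/".join(shard(cid))

# Metadata lives under the sharded hash of the pid, with the document name
# being the hash of pid + format_id (the fix introduced earlier in this series)
pid = "dou.test.1"
format_id = "http://ns.dataone.org/service/types/v2.0"
metadata_dir = hashlib.sha256(pid.encode("utf8")).hexdigest()
metadata_doc_name = hashlib.sha256((pid + format_id).encode("utf8")).hexdigest()
metadata_path = "metadata/" + "/".join(shard(metadata_dir)) + "/" + metadata_doc_name
```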
From 4922c247af37e0f52a26e7dd5333f371c8ef7a9e Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 21 May 2024 16:02:02 -0700 Subject: [PATCH 41/45] Fix typo in exception string for 'resolve_path' and rename variables in 'retrieve_metadata' to improve clarity --- src/hashstore/filehashstore.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 745fbca9..f2065f30 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -798,11 +798,15 @@ def retrieve_metadata(self, pid, format_id=None): else: metadata_document_name = self._computehash(pid + checked_format_id) rel_path = "/".join(self._shard(metadata_directory)) - full_path_without_directory = rel_path + "/" + metadata_document_name - metadata_exists = self._exists(entity, full_path_without_directory) + metadata_rel_path = rel_path + "/" + metadata_document_name + metadata_exists = self._exists(entity, metadata_rel_path) if metadata_exists: - metadata_stream = self._open(entity, full_path_without_directory) + metadata_stream = self._open(entity, metadata_rel_path) + logging.info( + "FileHashStore - retrieve_metadata: Retrieved metadata for pid: %s", pid + ) + return metadata_stream else: exception_string = ( f"FileHashStore - retrieve_metadata: No metadata found for pid: {pid}" @@ -810,11 +814,6 @@ def retrieve_metadata(self, pid, format_id=None): logging.error(exception_string) raise ValueError(exception_string) - logging.info( - "FileHashStore - retrieve_metadata: Retrieved metadata for pid: %s", pid - ) - return metadata_stream - def delete_object(self, ab_id, id_type=None): logging.debug( "FileHashStore - delete_object: Request to delete object for id: %s", ab_id @@ -2111,8 +2110,6 @@ def _resolve_path(self, entity, file): if os.path.isfile(abspath): return abspath elif entity == "metadata": - # TODO: The resolve_path method is not consistent in its usage regarding metadata - # Review and refactor when time permitting. if os.path.isfile(file): return file rel_root = self.metadata @@ -2132,7 +2129,7 @@ def _resolve_path(self, entity, file): else: exception_string = ( "FileHashStore - _resolve_path: entity must be" - + " 'object', 'metadata', 'cid' or 'pid" + + " 'objects', 'metadata', 'cid' or 'pid" ) raise ValueError(exception_string) From fa29b53c8f72fb36f13717f1740e13c29fa7ed0c Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 21 May 2024 16:54:44 -0700 Subject: [PATCH 42/45] Fix inaccurate exception text in '_verify_object_information' --- src/hashstore/filehashstore.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index f2065f30..fe1ea8e5 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1689,7 +1689,7 @@ def _verify_object_information( if file_size_to_validate is not None and file_size_to_validate > 0: if file_size_to_validate != tmp_file_size: exception_string = ( - "FileHashStore - _validate_arg_object: Object file size calculated: " + "FileHashStore - _verify_object_information: Object file size calculated: " + f" {tmp_file_size} does not match with expected size:" + f" {file_size_to_validate}." 
) @@ -1707,7 +1707,7 @@ def _verify_object_information( if checksum_algorithm is not None and checksum is not None: if checksum_algorithm not in hex_digests: exception_string = ( - "FileHashStore - _validate_arg_object: checksum_algorithm" + "FileHashStore - _verify_object_information: checksum_algorithm" + f" ({checksum_algorithm}) cannot be found in the hex digests dictionary." ) logging.error(exception_string) @@ -1716,7 +1716,7 @@ def _verify_object_information( hex_digest_stored = hex_digests[checksum_algorithm] if hex_digest_stored != checksum.lower(): exception_string = ( - "FileHashStore - _validate_arg_object: Hex digest and checksum" + "FileHashStore - _verify_object_information: Hex digest and checksum" + f" do not match - file not stored for pid: {pid}. Algorithm:" + f" {checksum_algorithm}. Checksum provided: {checksum} !=" + f" HexDigest: {hex_digest_stored}." From b3a612ca88063756e77401e86fdd32e13e9799f0 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 21 May 2024 16:57:40 -0700 Subject: [PATCH 43/45] Rename 'refs/pid' and 'refs/cid' directories to 'refs/pids' and 'refs/cids' --- src/hashstore/filehashstore.py | 4 ++-- tests/test_filehashstore.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index fe1ea8e5..9ec884b5 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -111,8 +111,8 @@ def __init__(self, properties=None): self._create_path(self.metadata + "/tmp") if not os.path.exists(self.refs): self._create_path(self.refs + "/tmp") - self._create_path(self.refs + "/pid") - self._create_path(self.refs + "/cid") + self._create_path(self.refs + "/pids") + self._create_path(self.refs + "/cids") logging.debug( "FileHashStore - Initialization success. 
Store root: %s", self.root ) diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index ffd012c1..1404832a 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -18,8 +18,8 @@ def test_init_directories_created(store): assert os.path.exists(store.metadata + "/tmp") assert os.path.exists(store.refs) assert os.path.exists(store.refs + "/tmp") - assert os.path.exists(store.refs + "/pid") - assert os.path.exists(store.refs + "/cid") + assert os.path.exists(store.refs + "/pids") + assert os.path.exists(store.refs + "/cids") def test_init_existing_store_incorrect_algorithm_format(store): From f238df56b5afb92bd046e92f862dad93b8c6b4e2 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 21 May 2024 17:07:31 -0700 Subject: [PATCH 44/45] Improve logging statements for 'find_object' by including pid value --- src/hashstore/filehashstore.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 9ec884b5..63ad485c 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -691,6 +691,8 @@ def find_object(self, pid): err_msg = ( "FileHashStore - find_object: pid refs file exists with cid: " + pid_refs_cid + + " for pid: " + + pid + f", but is missing from cid refs file: {cid_ref_abs_path}" ) logging.error(err_msg) @@ -698,7 +700,7 @@ def find_object(self, pid): else: err_msg = ( f"FileHashStore - find_object: pid refs file exists with cid: {pid_refs_cid}" - + f", but cid refs file not found: {cid_ref_abs_path}" + + f", but cid refs file not found: {cid_ref_abs_path} for pid: {pid}" ) logging.error(err_msg) raise CidRefsDoesNotExist(err_msg) From 8418000596d71c87b2570d73d6bb7b887dcc018b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 22 May 2024 09:42:25 -0700 Subject: [PATCH 45/45] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index ae14f610..da7aa829 100644 --- a/README.md +++ b/README.md @@ -114,7 +114,7 @@ tag_object(pid, cid) **How do I delete an object if I have the pid?** - To delete an object and all its associated reference files, call the Public API method `delete_object` with `id_type` 'pid'. - To delete only an object, call `delete_object` with `id_type` 'cid' which will remove the object if it is not referenced by any pids. -- Note, `delete_object` and `tag_object` calls are synchronized on their content identifier values so that the shared reference files are not unintentionally modified concurrently. An object that is in the process of being deleted should not be tagged, and vice versa. These calls have been implemented to occur sequentially to improve clarity in the event of an unexpected conflict or issue. +- Note, `delete_object` and `store_object` are synchronized based on a given 'pid'. An object that is in the process of being stored based on a pid should not be deleted at the same time. Additionally, `delete_object` further synchronizes with `tag_object` based on a `cid`. Every object is stored once, is unique and shares one cid reference file. The API calls to access this cid reference file must be coordinated to prevent file system locking exceptions. ###### Working with metadata (store, retrieve, delete) @@ -126,8 +126,8 @@ HashStore's '/metadata' directory holds all metadata for objects stored in HashS - If there are multiple metadata objects, a 'format_id' must be specified when calling `retrieve_metadata` (ex. 
`retrieve_metadata(pid, format_id)`)
 
 **How do I delete a metadata file?**
-- Like `retrieve_metadata`, call the Public API method `delete_metadata` which will delete the metadata object associated with the given pid.
-- If there are multiple metadata objects, a 'format_id' must be specified when calling `delete_metadata` to ensure the expected metadata object is deleted.
+- Like `retrieve_metadata`, call the Public API method `delete_metadata` to delete all metadata documents associated with the given pid.
+- If there are multiple metadata objects, and you wish to only delete one type, a 'format_id' must be specified when calling `delete_metadata(pid, format_id)` to ensure the expected metadata object is deleted.
 
 ###### What are HashStore reference files?
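HashStore keeps two kinds of reference files under `refs/pids` and `refs/cids` (visible in the directory layout shown earlier): a pid refs file, stored under a sharded hash of the pid, records which cid that identifier points to, while a cid refs file, stored under the sharded cid itself, tracks every pid currently referencing that object — which is why `delete_object` with `id_type` 'cid' only removes an object that no pids reference. The sketch below maps out where those two files live for a given pid and cid; the store root and identifiers are placeholders, and the on-disk contents of the files are assumed here rather than taken from the patches above.

```
import hashlib
import os

# Hypothetical values; the sharding mirrors the object and metadata sketch above.
store_root = "/var/metacat/hashstore"
pid = "dou.test.1"
cid = hashlib.sha256(b"some object bytes").hexdigest()


def shard(digest, depth=3, width=2):
    # Same depth-3, width-2 sharding used throughout the store.
    tokens = [digest[i * width : (i + 1) * width] for i in range(depth)]
    tokens.append(digest[depth * width :])
    return tokens


# Pid refs file: located by hashing the pid; it records the cid the pid resolves to.
pid_refs_path = os.path.join(
    store_root, "refs", "pids", *shard(hashlib.sha256(pid.encode("utf-8")).hexdigest())
)

# Cid refs file: located by the cid itself; it tracks every pid referencing the object.
# delete_object removes the object only once no pids remain in this file.
cid_refs_path = os.path.join(store_root, "refs", "cids", *shard(cid))

print(pid_refs_path)
print(cid_refs_path)
```
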