From a921a7dabf98a8e32ceb64ebd7f6a39a51e02328 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 23 Jan 2024 14:50:03 -0800 Subject: [PATCH 01/13] Tidy up logging statements in '_verify_object_information()' --- src/hashstore/filehashstore.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 2420fbc1..96efdf20 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1488,7 +1488,7 @@ def _verify_object_information( exception_string = ( "FileHashStore - _validate_arg_object: Object file size calculated: " + f" {tmp_file_size} does not match with expected size:" - + f"{file_size_to_validate}." + + f" {file_size_to_validate}." ) if pid is not None: self.delete(entity, tmp_file_name) @@ -1522,7 +1522,7 @@ def _verify_object_information( # Delete the tmp file self.delete(entity, tmp_file_name) exception_string_for_pid = ( - exception_string + f". Tmp file ({tmp_file_name}) deleted." + exception_string + f" Tmp file ({tmp_file_name}) deleted." 
) logging.error(exception_string_for_pid) raise ValueError(exception_string_for_pid) From d0cf9128e6ce57454e2c77f594c96228eb3b5da2 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 23 Jan 2024 14:56:21 -0800 Subject: [PATCH 02/13] Update 'ObjectMetadata' class attribute 'id' to 'cid' and revise all affected code, tests and documentation --- README.md | 18 +++++++------- src/hashstore/filehashstore.py | 8 +++--- src/hashstore/hashstore.py | 6 ++--- tests/test_filehashstore.py | 26 ++++++++++---------- tests/test_filehashstore_interface.py | 14 +++++------ tests/test_filehashstore_references.py | 34 +++++++++++++------------- tests/test_hashstore.py | 2 +- tests/test_hashstore_client.py | 2 +- 8 files changed, 55 insertions(+), 55 deletions(-) diff --git a/README.md b/README.md index d7f749c5..19056ee7 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,7 @@ my_store = factory.get_hashstore(module_name, class_name, properties) pid = "j.tao.1700.1" object = "/path/to/your/object.data" object_metadata = my_store.store_object(pid, object) -object_cid = object_metadata.id +object_cid = object_metadata.cid # Store metadata (.../[hashstore_path]/metadata/) # By default, storing metadata will use the given properties namespace `format_id` @@ -200,28 +200,28 @@ How to use HashStore client (command line app) $ python './src/hashstore/hashstoreclient.py' /path/to/store/ -chs -dp=3 -wp=2 -ap=SHA-256 -nsp="http://www.ns.test/v1" # Get the checksum of a data object -$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -getchecksum -pid=content_identifier -algo=SHA-256 +$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -getchecksum -pid=persistent_identifier -algo=SHA-256 # Find an object (returns the content identifier) -$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -findobject -pid=content_identifier +$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -findobject -pid=persistent_identifier # Store a data object -$ python 
'./src/hashstore/hashstoreclient.py' /path/to/store/ -storeobject -pid=content_identifier -path=/path/to/object +$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -storeobject -pid=persistent_identifier -path=/path/to/object # Store a metadata object -$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -storemetadata -pid=content_identifier -path=/path/to/metadata/object -formatid=http://ns.dataone.org/service/types/v2.0 +$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -storemetadata -pid=persistent_identifier -path=/path/to/metadata/object -formatid=http://ns.dataone.org/service/types/v2.0 # Retrieve a data object -$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -retrieveobject -pid=content_identifier +$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -retrieveobject -pid=persistent_identifier # Retrieve a metadata object -$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -retrievemetadata -pid=content_identifier -formatid=http://ns.dataone.org/service/types/v2.0 +$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -retrievemetadata -pid=persistent_identifier -formatid=http://ns.dataone.org/service/types/v2.0 # Delete a data object -$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -deleteobject -pid=content_identifier +$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -deleteobject -pid=persistent_identifier # Delete a metadata file -$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -deletemetadata -pid=content_identifier -formatid=http://ns.dataone.org/service/types/v2.0 +$ python './src/hashstore/hashstoreclient.py' /path/to/store/ -deletemetadata -pid=persistent_identifier -formatid=http://ns.dataone.org/service/types/v2.0 ``` ## License diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 96efdf20..76e09967 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -425,7 +425,7 @@ def 
store_object( object_metadata = self.store_data_only(data) logging.info( "FileHashStore - store_object: Successfully stored object for cid: %s", - object_metadata.id, + object_metadata.cid, ) else: # Else the object will be stored and tagged @@ -470,7 +470,7 @@ def store_object( checksum_algorithm=checksum_algorithm_checked, file_size_to_validate=expected_object_size, ) - self.tag_object(pid, object_metadata.id) + self.tag_object(pid, object_metadata.cid) logging.info( "FileHashStore - store_object: Successfully stored object for pid: %s", pid, @@ -509,7 +509,7 @@ def verify_object( else: logging.info( "FileHashStore - verify_object: Called to verify object with id: %s", - object_metadata.id, + object_metadata.cid, ) object_metadata_hex_digests = object_metadata.hex_digests object_metadata_file_size = object_metadata.obj_size @@ -526,7 +526,7 @@ def verify_object( ) logging.info( "FileHashStore - verify_object: object has been validated for cid: %s", - object_metadata.id, + object_metadata.cid, ) def tag_object(self, pid, cid): diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 37e228d8..4611e700 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -254,7 +254,7 @@ def get_hashstore(module_name, class_name, properties=None): ) -class ObjectMetadata(namedtuple("ObjectMetadata", ["id", "obj_size", "hex_digests"])): +class ObjectMetadata(namedtuple("ObjectMetadata", ["cid", "obj_size", "hex_digests"])): """Represents metadata associated with an object. 
The `ObjectMetadata` class represents metadata associated with an object, @@ -268,5 +268,5 @@ class ObjectMetadata(namedtuple("ObjectMetadata", ["id", "obj_size", "hex_digest """ # Default value to prevent dangerous default value - def __new__(cls, ab_id, obj_size, hex_digests=None): - return super(ObjectMetadata, cls).__new__(cls, ab_id, obj_size, hex_digests) + def __new__(cls, cid, obj_size, hex_digests=None): + return super(ObjectMetadata, cls).__new__(cls, cid, obj_size, hex_digests) diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 4688ba87..1d549a6e 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -224,7 +224,7 @@ def test_store_and_validate_data_files_path(pids, store): for pid in pids.keys(): path = Path(test_dir) / pid.replace("/", "_") object_metadata = store.store_and_validate_data(pid, path) - object_metadata_id = object_metadata.id + object_metadata_id = object_metadata.cid assert store.exists(entity, object_metadata_id) @@ -235,7 +235,7 @@ def test_store_and_validate_data_files_string(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_and_validate_data(pid, path) - object_metadata_id = object_metadata.id + object_metadata_id = object_metadata.cid assert store.exists(entity, object_metadata_id) @@ -248,7 +248,7 @@ def test_store_and_validate_data_files_stream(pids, store): input_stream = io.open(path, "rb") object_metadata = store.store_and_validate_data(pid, input_stream) input_stream.close() - object_metadata_id = object_metadata.id + object_metadata_id = object_metadata.cid assert store.exists(entity, object_metadata_id) assert store.count(entity) == 3 @@ -259,7 +259,7 @@ def test_store_and_validate_data_cid(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_and_validate_data(pid, path) - object_metadata_id = object_metadata.id + object_metadata_id = object_metadata.cid assert 
object_metadata_id == pids[pid][store.algorithm] @@ -335,7 +335,7 @@ def test_store_data_only_cid(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_data_only(path) - object_metadata_id = object_metadata.id + object_metadata_id = object_metadata.cid assert object_metadata_id == pids[pid][store.algorithm] @@ -821,7 +821,7 @@ def test_exists_with_object_metadata_id(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_and_validate_data(pid, path) - assert store.exists(entity, object_metadata.id) + assert store.exists(entity, object_metadata.cid) def test_exists_with_sharded_path(pids, store): @@ -831,7 +831,7 @@ def test_exists_with_sharded_path(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_and_validate_data(pid, path) - object_metadata_shard = store.shard(object_metadata.id) + object_metadata_shard = store.shard(object_metadata.cid) object_metadata_shard_path = "/".join(object_metadata_shard) assert store.exists(entity, object_metadata_shard_path) @@ -864,7 +864,7 @@ def test_open_objects(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_and_validate_data(pid, path) - object_metadata_id = object_metadata.id + object_metadata_id = object_metadata.cid io_buffer = store.open(entity, object_metadata_id) assert isinstance(io_buffer, io.BufferedReader) io_buffer.close() @@ -877,7 +877,7 @@ def test_delete_by_object_metadata_id(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_and_validate_data(pid, path) - object_metadata_id = object_metadata.id + object_metadata_id = object_metadata.cid store.delete(entity, object_metadata_id) assert store.count(entity) == 0 @@ -928,7 +928,7 @@ def test_remove_empty_does_not_remove_nonempty_folders(pids, store): for pid in pids.keys(): path = test_dir + 
pid.replace("/", "_") object_metadata = store.store_and_validate_data(pid, path) - object_metadata_shard = store.shard(object_metadata.id) + object_metadata_shard = store.shard(object_metadata.cid) object_metadata_shard_path = "/".join(object_metadata_shard) # Get parent directory of the relative path parent_dir = os.path.dirname(object_metadata_shard_path) @@ -992,7 +992,7 @@ def test_get_real_path_with_object_id(store, pids): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_and_validate_data(pid, path) - obj_abs_path = store.resolve_path(entity, object_metadata.id) + obj_abs_path = store.resolve_path(entity, object_metadata.cid) assert os.path.exists(obj_abs_path) @@ -1003,7 +1003,7 @@ def test_get_real_path_with_object_id_sharded(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_and_validate_data(pid, path) - object_metadata_shard = store.shard(object_metadata.id) + object_metadata_shard = store.shard(object_metadata.cid) object_metadata_shard_path = "/".join(object_metadata_shard) obj_abs_path = store.resolve_path(entity, object_metadata_shard_path) assert os.path.exists(obj_abs_path) @@ -1030,7 +1030,7 @@ def test_get_real_path_with_bad_entity(store, pids): path = test_dir + pid.replace("/", "_") object_metadata = store.store_and_validate_data(pid, path) with pytest.raises(ValueError): - store.resolve_path(entity, object_metadata.id) + store.resolve_path(entity, object_metadata.cid) def test_build_path(store, pids): diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index f38f46d9..5cd8b54e 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -26,7 +26,7 @@ def test_store_address_length(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(pid, path) - object_cid = object_metadata.id + object_cid = 
object_metadata.cid assert len(object_cid) == 64 @@ -37,7 +37,7 @@ def test_store_object(pids, store): for pid in pids.keys(): path = Path(test_dir + pid.replace("/", "_")) object_metadata = store.store_object(pid, path) - assert object_metadata.id == pids[pid][store.algorithm] + assert object_metadata.cid == pids[pid][store.algorithm] assert store.count(entity) == 3 @@ -82,7 +82,7 @@ def test_store_object_id(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(pid, path) - assert object_metadata.id == pids[pid][store.algorithm] + assert object_metadata.cid == pids[pid][store.algorithm] def test_store_object_obj_size(pids, store): @@ -558,7 +558,7 @@ def test_store_object_large_file(store): # Store object pid = "testfile_filehashstore" object_metadata = store.store_object(pid, file_path) - object_metadata_id = object_metadata.id + object_metadata_id = object_metadata.cid assert object_metadata_id == object_metadata.hex_digests.get("sha256") @@ -577,7 +577,7 @@ def test_store_object_sparse_large_file(store): # Store object pid = "testfile_filehashstore" object_metadata = store.store_object(pid, file_path) - object_metadata_id = object_metadata.id + object_metadata_id = object_metadata.cid assert object_metadata_id == object_metadata.hex_digests.get("sha256") @@ -918,7 +918,7 @@ def test_delete_object_cid_refs_file(pids, store): syspath = Path(test_dir) / filename object_metadata = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath, format_id) - cid = object_metadata.id + cid = object_metadata.cid store.delete_object(pid) cid_refs_file_path = store.resolve_path("cid", cid) assert not os.path.exists(cid_refs_file_path) @@ -930,7 +930,7 @@ def test_delete_object_cid_refs_file_with_pid_refs_remaining(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(pid, path) - cid = object_metadata.id + cid = object_metadata.cid 
cid_refs_abs_path = store.resolve_path("cid", cid) # pylint: disable=W0212 store._update_cid_refs(cid_refs_abs_path, "dou.test.1") diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index ccd147fd..c2fe81ee 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -12,7 +12,7 @@ def test_tag_object(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) - object_tagged = store.tag_object(pid, object_metadata.id) + object_tagged = store.tag_object(pid, object_metadata.cid) assert object_tagged @@ -22,7 +22,7 @@ def test_tag_object_pid_refs_file(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) - store.tag_object(pid, object_metadata.id) + store.tag_object(pid, object_metadata.cid) pid_refs_file_path = store.resolve_path("pid", pid) assert os.path.exists(pid_refs_file_path) @@ -33,7 +33,7 @@ def test_tag_object_pid_refs_file_exists(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) - cid = object_metadata.id + cid = object_metadata.cid store.tag_object(pid, cid) pid_refs_file_path = store.resolve_path("pid", pid) assert os.path.exists(pid_refs_file_path) @@ -49,11 +49,11 @@ def test_tag_object_pid_refs_file_content(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) - store.tag_object(pid, object_metadata.id) + store.tag_object(pid, object_metadata.cid) pid_refs_file_path = store.resolve_path("pid", pid) with open(pid_refs_file_path, "r", encoding="utf8") as f: pid_refs_cid = f.read() - assert pid_refs_cid == object_metadata.id + assert pid_refs_cid == object_metadata.cid def test_tag_object_cid_refs_file(pids, store): @@ -62,8 +62,8 @@ def test_tag_object_cid_refs_file(pids, store): for pid 
in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) - cid = object_metadata.id - store.tag_object(pid, object_metadata.id) + cid = object_metadata.cid + store.tag_object(pid, object_metadata.cid) cid_refs_file_path = store.resolve_path("cid", cid) assert os.path.exists(cid_refs_file_path) @@ -74,8 +74,8 @@ def test_tag_object_cid_refs_file_content(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) - store.tag_object(pid, object_metadata.id) - cid_refs_file_path = store.resolve_path("cid", object_metadata.id) + store.tag_object(pid, object_metadata.cid) + cid_refs_file_path = store.resolve_path("cid", object_metadata.cid) with open(cid_refs_file_path, "r", encoding="utf8") as f: pid_refs_cid = f.read().strip() assert pid_refs_cid == pid @@ -88,7 +88,7 @@ def test_tag_object_cid_refs_file_exists(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) - store.tag_object(pid, object_metadata.id) + store.tag_object(pid, object_metadata.cid) another_cid = "dou.test.1" with pytest.raises(FileExistsError): store.tag_object(pid, another_cid) @@ -104,7 +104,7 @@ def test_tag_object_cid_refs_update_cid_refs_updated(store): path = test_dir + pid.replace("/", "_") # Store data only object_metadata = store.store_object(None, path) - cid = object_metadata.id + cid = object_metadata.cid # Tag object store.tag_object(pid, cid) # Tag the cid with another pid @@ -127,7 +127,7 @@ def test_tag_object_cid_refs_update_pid_refs_created(store): path = test_dir + pid.replace("/", "_") # Store data only object_metadata = store.store_object(None, path) - cid = object_metadata.id + cid = object_metadata.cid # Tag object store.tag_object(pid, cid) # Tag the cid with another pid @@ -145,8 +145,8 @@ def test_tag_object_cid_refs_update_pid_found_but_file_missing(store): pid = "jtao.1700.1" path = 
test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) - store.tag_object(pid, object_metadata.id) - cid = object_metadata.id + store.tag_object(pid, object_metadata.cid) + cid = object_metadata.cid # Manually update the cid refs, pid refs file missing at this point additional_pid = "dou.test.1" cid_ref_abs_path = store.resolve_path("cid", cid) @@ -204,7 +204,7 @@ def test_verify_object_exception_incorrect_size(pids, store): with pytest.raises(ValueError): store.verify_object(object_metadata, checksum, checksum_algorithm, 1000) - cid = object_metadata.id + cid = object_metadata.cid cid = object_metadata.hex_digests[store.algorithm] cid_abs_path = store.resolve_path("cid", cid) assert not os.path.exists(cid_abs_path) @@ -216,7 +216,7 @@ def test_verify_object_exception_incorrect_checksum(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(data=path) - cid = object_metadata.id + cid = object_metadata.cid store.tag_object(pid, cid) checksum_algorithm = store.algorithm expected_file_size = object_metadata.obj_size @@ -225,7 +225,7 @@ def test_verify_object_exception_incorrect_checksum(pids, store): object_metadata, "abc123", checksum_algorithm, expected_file_size ) - cid = object_metadata.id + cid = object_metadata.cid cid = object_metadata.hex_digests[store.algorithm] cid_abs_path = store.resolve_path("cid", cid) assert not os.path.exists(cid_abs_path) diff --git a/tests/test_hashstore.py b/tests/test_hashstore.py index e161c967..a2d42398 100644 --- a/tests/test_hashstore.py +++ b/tests/test_hashstore.py @@ -87,7 +87,7 @@ def test_objectmetadata(): "sha512": "sha512value", } object_metadata = ObjectMetadata(ab_id, obj_size, hex_digest_dict) - assert object_metadata.id == ab_id + assert object_metadata.cid == ab_id assert object_metadata.obj_size == obj_size assert object_metadata.hex_digests.get("md5") == hex_digest_dict["md5"] assert object_metadata.hex_digests.get("sha1") == 
hex_digest_dict["sha1"] diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index 6eaf16d7..3aee347a 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -84,7 +84,7 @@ def test_find_object(capsys, store, pids): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(pid, path) - cid = object_metadata.id + cid = object_metadata.cid client_module_path = f"{client_directory}/client.py" test_store = store.root From c1d54037ee6da3f0643ad969b8686bce5fab0893 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 23 Jan 2024 15:03:44 -0800 Subject: [PATCH 03/13] Add new pytest for 'delete_metadata' to confirm exception is not thrown when called to delete metadata that does not exist --- tests/test_filehashstore_interface.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 5cd8b54e..238d4e80 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -964,10 +964,20 @@ def test_delete_metadata(pids, store): syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath, format_id) - store.delete_metadata(pid, format_id) + is_deleted = store.delete_metadata(pid, format_id) + assert is_deleted assert store.count(entity) == 0 +def test_delete_metadata_does_not_exist(pids, store): + """Test delete_metadata does not throw exception when called to delete + metadata that does not exist.""" + format_id = "http://ns.dataone.org/service/types/v2.0" + for pid in pids.keys(): + is_deleted = store.delete_metadata(pid, format_id) + assert is_deleted + + def test_delete_metadata_default_format_id(store, pids): """Test delete_metadata deletes successfully with default format_id.""" test_dir = "tests/testdata/" From ff8c03ff155ba11297e371751fc2a0de4576fa37 Mon Sep 17 00:00:00 
2001 From: Dou Mok Date: Tue, 23 Jan 2024 15:08:37 -0800 Subject: [PATCH 04/13] Revise logging statement for accuracy in 'store_data_only()' --- src/hashstore/filehashstore.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 76e09967..d6dbe0ca 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -903,7 +903,7 @@ def store_data_only(self, data): size, and hex digest dictionary. """ logging.debug( - "FileHashStore - store_object: Request to store data object only." + "FileHashStore - store_data_only: Request to store data object only." ) try: @@ -924,14 +924,14 @@ def store_data_only(self, data): # The permanent address of the data stored is based on the data's checksum cid = hex_digest_dict.get(self.algorithm) logging.debug( - "FileHashStore - store_object: Successfully stored object with cid: %s", + "FileHashStore - store_data_only: Successfully stored object with cid: %s", cid, ) return object_metadata # pylint: disable=W0718 except Exception as err: exception_string = ( - "FileHashStore - store_object (store_data_only): failed to store object." + "FileHashStore - store_data_only: failed to store object." 
+ f" Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) From e7b746c25a656b30fb5355fd146026b6c0965698 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 23 Jan 2024 16:21:09 -0800 Subject: [PATCH 05/13] Add new method '_is_pid_in_cid_refs_file' and refactor '_verify_hashstore_references', 'find_object' and 'tag_object' --- src/hashstore/filehashstore.py | 73 ++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 30 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index d6dbe0ca..2aac209d 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -571,7 +571,8 @@ def tag_object(self, pid, cid): self.create_path(os.path.dirname(pid_ref_abs_path)) shutil.move(pid_tmp_file_path, pid_ref_abs_path) # Update cid ref files as it already exists - self._update_cid_refs(cid_ref_abs_path, pid) + if not self._is_pid_in_cid_refs_file(pid, cid_ref_abs_path): + self._update_cid_refs(cid_ref_abs_path, pid) self._verify_hashstore_references(pid, cid, "update") logging.info( "FileHashStore - tag_object: Successfully updated cid: %s with pid: %s", @@ -616,27 +617,39 @@ def find_object(self, pid): self._check_string(pid, "pid", "find_object") pid_ref_abs_path = self.resolve_path("pid", pid) - if not os.path.exists(pid_ref_abs_path): - err_msg = ( - f"FileHashStore - find_object: pid ({pid}) reference file not found: " - + pid_ref_abs_path - ) - raise FileNotFoundError(err_msg) - else: + if os.path.exists(pid_ref_abs_path): # Read the file to get the cid from the pid reference with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: pid_refs_cid = pid_ref_file.read() + # Confirm that the cid reference file exists cid_ref_abs_path = self.resolve_path("cid", pid_refs_cid) - if not os.path.exists(cid_ref_abs_path): + if os.path.exists(cid_ref_abs_path): + # Check that the pid is actually found in the cid reference file + if self._is_pid_in_cid_refs_file(pid, cid_ref_abs_path): + return 
pid_refs_cid + else: + # If not, it is an orphan pid refs file + err_msg = ( + "FileHashStore - find_object: pid refs file exists with cid: " + + pid_refs_cid + + f", but is missing from cid refs file: {cid_ref_abs_path}" + ) + logging.error(err_msg) + raise ValueError(err_msg) + else: err_msg = ( f"FileHashStore - find_object: pid refs file exists with cid: {pid_refs_cid}" + f", but cid refs file not found: {cid_ref_abs_path}" ) logging.error(err_msg) raise FileNotFoundError(err_msg) - else: - return pid_refs_cid + else: + err_msg = ( + f"FileHashStore - find_object: pid refs file not found for pid ({pid}): " + + pid_ref_abs_path + ) + raise FileNotFoundError(err_msg) def store_metadata(self, pid, metadata, format_id=None): logging.debug( @@ -1233,6 +1246,23 @@ def _write_cid_refs_file(self, path, pid): logging.error(exception_string) raise err + def _is_pid_in_cid_refs_file(self, pid, cid_ref_abs_path): + """Check a cid reference file for a pid. + + :param str pid: Authority-based or persistent identifier of the object. + :param str cid_ref_abs_path: Path to the cid refs file + + :return: pid_found + :rtype: boolean + """ + with open(cid_ref_abs_path, "r", encoding="utf8") as cid_ref_file: + # Confirm that pid is not currently already tagged + for line in cid_ref_file: + value = line.strip() + if pid == value: + return True + return False + def _update_cid_refs(self, cid_ref_abs_path, pid): """Update an existing CID reference file with the given PID. 
@@ -1253,18 +1283,7 @@ def _update_cid_refs(self, cid_ref_abs_path, pid): raise FileNotFoundError(exception_string) try: - with open(cid_ref_abs_path, "a+", encoding="utf8") as cid_ref_file: - # Confirm that pid is not currently already tagged - for line in cid_ref_file: - value = line.strip() - if pid == value: - warning_msg = ( - f"FileHashStore - update_cid_refs: pid ({pid}) already reference in" - + f" cid reference file: {cid_ref_abs_path} " - ) - logging.warning(warning_msg) - # Exit try statement, we do not want to write the pid - return + with open(cid_ref_abs_path, "a", encoding="utf8") as cid_ref_file: # Lock file for the shortest amount of time possible file_descriptor = cid_ref_file.fileno() fcntl.flock(file_descriptor, fcntl.LOCK_EX) @@ -1574,13 +1593,7 @@ def _verify_hashstore_references(self, pid, cid, verify_type): logging.error(exception_string) raise ValueError(exception_string) # Then the pid - pid_found = False - with open(cid_ref_abs_path, "r", encoding="utf8") as cid_ref_file: - for _, line in enumerate(cid_ref_file, start=1): - value = line.strip() - if value == pid: - pid_found = True - break + pid_found = self._is_pid_in_cid_refs_file(pid, cid_ref_abs_path) if not pid_found: exception_string = ( "FileHashStore - _verify_hashstore_references: Cid refs file exists" From 8f6f879b1bc618343e6cc429ea3da06927cbb28b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 25 Jan 2024 09:16:28 -0800 Subject: [PATCH 06/13] Refactor 'verify_object' to return boolean values and update docstrings & pytests --- src/hashstore/filehashstore.py | 38 ++++++++++++++++---------- src/hashstore/hashstore.py | 8 +++--- tests/test_filehashstore_references.py | 16 +++++++---- 3 files changed, 38 insertions(+), 24 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 2aac209d..52a0286f 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -514,20 +514,30 @@ def verify_object( 
object_metadata_hex_digests = object_metadata.hex_digests object_metadata_file_size = object_metadata.obj_size checksum_algorithm_checked = self.clean_algorithm(checksum_algorithm) - self._verify_object_information( - pid=None, - checksum=checksum, - checksum_algorithm=checksum_algorithm_checked, - entity="objects", - hex_digests=object_metadata_hex_digests, - tmp_file_name=None, - tmp_file_size=object_metadata_file_size, - file_size_to_validate=expected_file_size, - ) - logging.info( - "FileHashStore - verify_object: object has been validated for cid: %s", - object_metadata.cid, - ) + + try: + self._verify_object_information( + pid=None, + checksum=checksum, + checksum_algorithm=checksum_algorithm_checked, + entity="objects", + hex_digests=object_metadata_hex_digests, + tmp_file_name=None, + tmp_file_size=object_metadata_file_size, + file_size_to_validate=expected_file_size, + ) + logging.info( + "FileHashStore - verify_object: object has been validated for cid: %s", + object_metadata.cid, + ) + return True + # pylint: disable=W0718 + except Exception as err: + exception_string = ( + f"FileHashStore - verify_object: object not valid: {err}." + ) + logging.info(exception_string) + return False def tag_object(self, pid, cid): logging.debug( diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 4611e700..c5019825 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -95,7 +95,7 @@ def verify_object( :param str checksum_algorithm: Algorithm of the checksum. :param int expected_file_size: Size of the temporary file. - :return: None + :return: bool - `True` if valid """ raise NotImplementedError() @@ -258,10 +258,10 @@ class ObjectMetadata(namedtuple("ObjectMetadata", ["cid", "obj_size", "hex_diges """Represents metadata associated with an object. 
The `ObjectMetadata` class represents metadata associated with an object, - including a unique identifier (`id`), the size of the object in bytes (`obj_size`), - and an optional list of hex digests (`hex_digests`) to validate objects. + including a content identifier (`cid`), the size of the object in bytes (`obj_size`), + and an optional list of hex digests (`hex_digests`) to assist with validating objects. - :param str id: A unique identifier for the object (Hash ID, hex digest). + :param str cid: A unique identifier for the object (Hash ID, hex digest). :param bytes obj_size: The size of the object in bytes. :param list hex_digests: A list of hex digests to validate objects (md5, sha1, sha256, sha384, sha512) (optional). diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index c2fe81ee..5ec56e59 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -201,8 +201,11 @@ def test_verify_object_exception_incorrect_size(pids, store): object_metadata = store.store_object(data=path) checksum = object_metadata.hex_digests.get(store.algorithm) checksum_algorithm = store.algorithm - with pytest.raises(ValueError): - store.verify_object(object_metadata, checksum, checksum_algorithm, 1000) + + is_valid = store.verify_object( + object_metadata, checksum, checksum_algorithm, 1000 + ) + assert not is_valid cid = object_metadata.cid cid = object_metadata.hex_digests[store.algorithm] @@ -220,10 +223,11 @@ def test_verify_object_exception_incorrect_checksum(pids, store): store.tag_object(pid, cid) checksum_algorithm = store.algorithm expected_file_size = object_metadata.obj_size - with pytest.raises(ValueError): - store.verify_object( - object_metadata, "abc123", checksum_algorithm, expected_file_size - ) + + is_valid = store.verify_object( + object_metadata, "abc123", checksum_algorithm, expected_file_size + ) + assert not is_valid cid = object_metadata.cid cid = 
object_metadata.hex_digests[store.algorithm] From 4830d0df1227643e1f3efe031a7eac7de5fe354d Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 25 Jan 2024 09:35:31 -0800 Subject: [PATCH 07/13] Refactor 'delete_object' to handle exceptions raised from calling 'find_object' --- src/hashstore/filehashstore.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 52a0286f..597ebf05 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -765,8 +765,25 @@ def delete_object(self, pid): "FileHashStore - delete_object: Request to delete object for pid: %s", pid ) self._check_string(pid, "pid", "delete_object") - cid = self.find_object(pid) + try: + cid = self.find_object(pid) + except FileNotFoundError as fnfe: + if "pid refs file not found" in fnfe: + # Nothing to delete + return + if "cid refs file not found" in fnfe: + # Delete pid refs file + pid_ref_abs_path = self.resolve_path("pid", pid) + self.delete("pid", pid_ref_abs_path) + return + except ValueError as ve: + if "is missing from cid refs file" in ve: + # Delete pid refs file + pid_ref_abs_path = self.resolve_path("pid", pid) + self.delete("pid", pid_ref_abs_path) + return + # Proceed with next steps - cid has been retrieved without any errors while cid in self.reference_locked_cids: logging.debug( "FileHashStore - delete_object: (cid) %s is currently locked. 
Waiting", @@ -783,9 +800,10 @@ def delete_object(self, pid): try: cid_ref_abs_path = self.resolve_path("cid", cid) pid_ref_abs_path = self.resolve_path("pid", pid) + # First delete the pid refs file immediately + self._delete_pid_refs_file(pid_ref_abs_path) # Remove pid from cid reference file self._delete_cid_refs_pid(cid_ref_abs_path, pid) - self._delete_pid_refs_file(pid_ref_abs_path) # Delete cid reference file and object only if the cid refs file is empty if os.path.getsize(cid_ref_abs_path) == 0: self.delete("cid", cid_ref_abs_path) From f7cffb88ab6f60cc7adfc090f23e95c8438a1e0e Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 25 Jan 2024 10:03:32 -0800 Subject: [PATCH 08/13] Revise 'find_object' to also check that cid retrieved exists before returning the cid found, and update 'delete_object' --- README.md | 8 ++++---- src/hashstore/filehashstore.py | 18 +++++++++++++++++- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 19056ee7..f7ad6fb4 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ Documentation is a work in progress, and can be found on the [Metacat repository ## HashStore Overview -HashStore is a content-addressable file management system that utilizes the content identifier of an object to address files. The system stores both objects, references (refs) and metadata in its respective directories and provides an API for interacting with the store. HashStore storage classes (like `FileHashStore`) must implement the HashStore interface to ensure the expected usage of HashStore. +HashStore is a content-addressable file management system that utilizes the content identifier of an object to address files. The system stores objects, references (refs) and metadata in its respective directories and provides an API for interacting with the store. HashStore storage classes (like `FileHashStore`) must implement the HashStore interface to ensure the expected usage of HashStore. 
###### Public API Methods - store_object @@ -89,7 +89,7 @@ metadata_cid = my_store.store_metadata(pid, metadata, format_id) In HashStore, objects are first saved as temporary files while their content identifiers are calculated. Once the default hash algorithm list and their hashes are generated, objects are stored in their permanent location using the store's algorithm's corresponding hash value, the store depth and the store width. Lastly, reference files are created for the object so that they can be found and retrieved given an identifier (ex. persistent identifier (pid)). Note: Objects are also stored once and only once. -By calling the various interface methods for `store_object`, the calling app/client can validate, store and tag an object simultaneously if the relevant data is available. In the absence of an identfiier (ex. persistent identifier (pid)), `store_object` can be called to solely store an object. The client is then expected to call `verify_object` when the relevant metadata is available to confirm that the object has been stored as expected. If the object is determined to be invalid (via `verify_object`), it will be deleted. Lastly, to finalize this process of storing an object (to make the object discoverable), the client calls `tag_object`. In summary, there are two expected paths to store an object: +By calling the various interface methods for `store_object`, the calling app/client can validate, store and tag an object simultaneously if the relevant data is available. In the absence of an identifier (ex. persistent identifier (pid)), `store_object` can be called to solely store an object. The client is then expected to call `verify_object` when the relevant metadata is available to confirm that the object has been stored as expected. If the object is determined to be invalid (via `verify_object`), it will be deleted. Lastly, to finalize this process of storing an object (to make the object discoverable), the client calls `tag_object`. 
In summary, there are two expected paths to store an object: ```py # All-in-one process which stores, validates and tags an object @@ -108,8 +108,8 @@ tag_object(pid, cid) - To retrieve an object, call the Public API method `retrieve_object` which opens a stream to the object if it exists. **How do I find an object or check that it exists if I have the pid?** -- To find the location of the object, call the Public API method `find_object` which will return the content identifier (cid) of the object. -- This cid can then be used to locate the object on disk by following HashStore's store configuration. +- To check if an object exists, call the Public API method `find_object` which will return the content identifier (cid) of the object if it exists. +- If desired, this cid can then be used to locate the object on disk by following HashStore's store configuration. **How do I delete an object if I have the pid?** - To delete an object, call the Public API method `delete_object` which will delete the object and its associated references and reference files where relevant. 
diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 597ebf05..65a0d0b9 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -637,7 +637,16 @@ def find_object(self, pid): if os.path.exists(cid_ref_abs_path): # Check that the pid is actually found in the cid reference file if self._is_pid_in_cid_refs_file(pid, cid_ref_abs_path): - return pid_refs_cid + # Object must also exist in order to return the cid retrieved + if not self.exists("objects", pid_refs_cid): + err_msg = ( + f"FileHashStore - find_object: Refs file found for pid ({pid}) at" + + pid_ref_abs_path + + f", but object referenced does not exist, cid: {pid_refs_cid}" + ) + raise FileNotFoundError(err_msg) + else: + return pid_refs_cid else: # If not, it is an orphan pid refs file err_msg = ( @@ -776,6 +785,11 @@ def delete_object(self, pid): pid_ref_abs_path = self.resolve_path("pid", pid) self.delete("pid", pid_ref_abs_path) return + if "object referenced does not exist" in fnfe: + # Delete pid refs file + pid_ref_abs_path = self.resolve_path("pid", pid) + self.delete("pid", pid_ref_abs_path) + return except ValueError as ve: if "is missing from cid refs file" in ve: # Delete pid refs file @@ -1974,9 +1988,11 @@ def resolve_path(self, entity, file): # Check for sharded path. 
if entity == "cid": + # Note, we skip checking whether the file exists for refs ref_file_abs_path = self.build_path(entity, file) return ref_file_abs_path elif entity == "pid": + # Note, we skip checking whether the file exists for refs hash_id = self.computehash(file, self.algorithm) ref_file_abs_path = self.build_path(entity, hash_id) return ref_file_abs_path From e534465f0ff127c426fbf3d348dba082aaa88fd6 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 25 Jan 2024 10:16:37 -0800 Subject: [PATCH 09/13] Refactor '.replace' calls to use '.strip' instead where relevant --- src/hashstore/filehashstore.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 65a0d0b9..32cb2e75 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1667,7 +1667,7 @@ def _check_arg_data(self, data): logging.error(exception_string) raise TypeError(exception_string) if isinstance(data, str): - if data.replace(" ", "") == "": + if data.strip() == "": exception_string = ( "FileHashStore - _validate_arg_data: Data string cannot be empty." ) @@ -1724,7 +1724,7 @@ def _check_arg_format_id(self, format_id, method): :rtype: str """ checked_format_id = None - if format_id is not None and format_id.replace(" ", "") == "": + if format_id and not format_id.strip(): exception_string = f"FileHashStore - {method}: Format_id cannot be empty." 
logging.error(exception_string) raise ValueError(exception_string) From 2025fc9ef63fa32d28032a066e77e8a04b00fee6 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 25 Jan 2024 10:23:24 -0800 Subject: [PATCH 10/13] Update README.md --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index f7ad6fb4..b864c157 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ HashStore is a content-addressable file management system that utilizes the cont - delete_metadata - get_hex_digest -For details, please see the HashStore interface (HashStore.java) +For details, please see the HashStore interface (hashstore.py) ###### How do I create a HashStore? @@ -89,17 +89,17 @@ metadata_cid = my_store.store_metadata(pid, metadata, format_id) In HashStore, objects are first saved as temporary files while their content identifiers are calculated. Once the default hash algorithm list and their hashes are generated, objects are stored in their permanent location using the store's algorithm's corresponding hash value, the store depth and the store width. Lastly, reference files are created for the object so that they can be found and retrieved given an identifier (ex. persistent identifier (pid)). Note: Objects are also stored once and only once. -By calling the various interface methods for `store_object`, the calling app/client can validate, store and tag an object simultaneously if the relevant data is available. In the absence of an identifier (ex. persistent identifier (pid)), `store_object` can be called to solely store an object. The client is then expected to call `verify_object` when the relevant metadata is available to confirm that the object has been stored as expected. If the object is determined to be invalid (via `verify_object`), it will be deleted. Lastly, to finalize this process of storing an object (to make the object discoverable), the client calls `tag_object`. 
In summary, there are two expected paths to store an object: +By calling the various interface methods for `store_object`, the calling app/client can validate, store and tag an object simultaneously if the relevant data is available. In the absence of an identifier (ex. persistent identifier (pid)), `store_object` can be called to solely store an object. The client is then expected to call `verify_object` when the relevant metadata is available to confirm that the object has been stored as expected. If the object is determined to be invalid (via `verify_object`), the client is expected to delete the object directly. Lastly, to finalize this process of storing an object (to make the object discoverable), the client calls `tag_object`. In summary, there are two expected paths to store an object: ```py # All-in-one process which stores, validates and tags an object -objectMetadata objInfo = store_object(InputStream, pid, additionalAlgorithm, checksum, checksumAlgorithm, objSize) +objectMetadata objInfo = store_object(stream, pid, additional_algo, checksum, checksum_algo, objSize) # Manual Process # Store object -obj_metadata = store_object(InputStream) +obj_metadata = store_object(stream) # Validate object, throws exceptions if there is a mismatch and deletes the associated file -verify_object(objInfo, checksum, checksumAlgorithn, objSize) +verify_object(obj_metadata, checksum, checksum_algo, objSize) # Tag object, makes the object discoverable (find, retrieve, delete) tag_object(pid, cid) ``` @@ -152,7 +152,7 @@ These reference files are implemented in HashStore underneath the hood with no e 

 ###### What does HashStore look like?
 
-```
+```shell
 # Example layout in HashStore with a single file stored along with its metadata and reference files.
# This uses a store depth of 3, with a width of 2 and "SHA-256" as its default store algorithm ## Notes: From 63ed8d44635f5cd00918484d5719805c9e4d407d Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 25 Jan 2024 10:43:09 -0800 Subject: [PATCH 11/13] Add underscore to all non-Public API methods to reduce confusion and update pytests --- src/hashstore/filehashstore.py | 196 ++++++++++++------------- tests/test_filehashstore.py | 163 ++++++++++---------- tests/test_filehashstore_interface.py | 79 +++++----- tests/test_filehashstore_references.py | 92 ++++++------ tests/test_hashstore_client.py | 8 +- 5 files changed, 270 insertions(+), 268 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 32cb2e75..1a582f3f 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -97,7 +97,7 @@ def __init__(self, properties=None): "FileHashStore - HashStore does not exist & configuration file not found." + " Writing configuration file." ) - self.write_properties(properties) + self._write_properties(properties) # Default algorithm list for FileHashStore based on config file written self._set_default_algorithms() # Complete initialization/instantiation by setting and creating store directories @@ -105,13 +105,13 @@ def __init__(self, properties=None): self.metadata = self.root + "/metadata" self.refs = self.root + "/refs" if not os.path.exists(self.objects): - self.create_path(self.objects + "/tmp") + self._create_path(self.objects + "/tmp") if not os.path.exists(self.metadata): - self.create_path(self.metadata + "/tmp") + self._create_path(self.metadata + "/tmp") if not os.path.exists(self.refs): - self.create_path(self.refs + "/tmp") - self.create_path(self.refs + "/pid") - self.create_path(self.refs + "/cid") + self._create_path(self.refs + "/tmp") + self._create_path(self.refs + "/pid") + self._create_path(self.refs + "/cid") logging.debug( "FileHashStore - Initialization success. 
Store root: %s", self.root ) @@ -126,7 +126,7 @@ def __init__(self, properties=None): # Configuration and Related Methods - def load_properties(self): + def _load_properties(self): """Get and return the contents of the current HashStore configuration. :return: HashStore properties with the following keys (and values): @@ -160,7 +160,7 @@ def load_properties(self): ) return hashstore_yaml_dict - def write_properties(self, properties): + def _write_properties(self, properties): """Writes 'hashstore.yaml' to FileHashStore's root directory with the respective properties object supplied. @@ -212,7 +212,7 @@ def write_properties(self, properties): # If given store path doesn't exist yet, create it. if not os.path.exists(self.root): - self.create_path(self.root) + self._create_path(self.root) # .yaml file to write hashstore_configuration_yaml = self._build_hashstore_yaml_string( @@ -303,7 +303,7 @@ def _verify_hashstore_properties(self, properties, prop_store_path): self.hashstore_configuration_yaml, ) # If 'hashstore.yaml' is found, verify given properties before init - hashstore_yaml_dict = self.load_properties() + hashstore_yaml_dict = self._load_properties() for key in self.property_required_keys: # 'store_path' is required to init HashStore but not saved in `hashstore.yaml` if key != "store_path": @@ -422,7 +422,7 @@ def store_object( if pid is None and self._check_arg_data(data): # If no pid is supplied, store the object only without tagging logging.debug("FileHashStore - store_object: Request to store data only.") - object_metadata = self.store_data_only(data) + object_metadata = self._store_data_only(data) logging.info( "FileHashStore - store_object: Successfully stored object for cid: %s", object_metadata.cid, @@ -462,7 +462,7 @@ def store_object( "FileHashStore - store_object: Attempting to store object for pid: %s", pid, ) - object_metadata = self.store_and_validate_data( + object_metadata = self._store_and_validate_data( pid, data, 
additional_algorithm=additional_algorithm_checked, @@ -513,7 +513,7 @@ def verify_object( ) object_metadata_hex_digests = object_metadata.hex_digests object_metadata_file_size = object_metadata.obj_size - checksum_algorithm_checked = self.clean_algorithm(checksum_algorithm) + checksum_algorithm_checked = self._clean_algorithm(checksum_algorithm) try: self._verify_object_information( @@ -562,9 +562,9 @@ def tag_object(self, pid, cid): ) self.reference_locked_cids.append(cid) try: - pid_ref_abs_path = self.resolve_path("pid", pid) - cid_ref_abs_path = self.resolve_path("cid", cid) - tmp_root_path = self.get_store_path("refs") / "tmp" + pid_ref_abs_path = self._resolve_path("pid", pid) + cid_ref_abs_path = self._resolve_path("cid", cid) + tmp_root_path = self._get_store_path("refs") / "tmp" # Proceed to tagging process if os.path.exists(pid_ref_abs_path): @@ -578,7 +578,7 @@ def tag_object(self, pid, cid): elif os.path.exists(cid_ref_abs_path): # Create the pid refs file pid_tmp_file_path = self._write_pid_refs_file(tmp_root_path, cid) - self.create_path(os.path.dirname(pid_ref_abs_path)) + self._create_path(os.path.dirname(pid_ref_abs_path)) shutil.move(pid_tmp_file_path, pid_ref_abs_path) # Update cid ref files as it already exists if not self._is_pid_in_cid_refs_file(pid, cid_ref_abs_path): @@ -596,8 +596,8 @@ def tag_object(self, pid, cid): pid_tmp_file_path = self._write_pid_refs_file(tmp_root_path, cid) cid_tmp_file_path = self._write_cid_refs_file(tmp_root_path, pid) # Create paths for pid ref file in '.../refs/pid' and cid ref file in '.../refs/cid' - self.create_path(os.path.dirname(pid_ref_abs_path)) - self.create_path(os.path.dirname(cid_ref_abs_path)) + self._create_path(os.path.dirname(pid_ref_abs_path)) + self._create_path(os.path.dirname(cid_ref_abs_path)) # Move both files shutil.move(pid_tmp_file_path, pid_ref_abs_path) shutil.move(cid_tmp_file_path, cid_ref_abs_path) @@ -626,19 +626,19 @@ def find_object(self, pid): ) self._check_string(pid, "pid", 
"find_object") - pid_ref_abs_path = self.resolve_path("pid", pid) + pid_ref_abs_path = self._resolve_path("pid", pid) if os.path.exists(pid_ref_abs_path): # Read the file to get the cid from the pid reference with open(pid_ref_abs_path, "r", encoding="utf8") as pid_ref_file: pid_refs_cid = pid_ref_file.read() # Confirm that the cid reference file exists - cid_ref_abs_path = self.resolve_path("cid", pid_refs_cid) + cid_ref_abs_path = self._resolve_path("cid", pid_refs_cid) if os.path.exists(cid_ref_abs_path): # Check that the pid is actually found in the cid reference file if self._is_pid_in_cid_refs_file(pid, cid_ref_abs_path): # Object must also exist in order to return the cid retrieved - if not self.exists("objects", pid_refs_cid): + if not self._exists("objects", pid_refs_cid): err_msg = ( f"FileHashStore - find_object: Refs file found for pid ({pid}) at" + pid_ref_abs_path @@ -700,7 +700,7 @@ def store_metadata(self, pid, metadata, format_id=None): "FileHashStore - store_metadata: Attempting to store metadata for pid: %s", pid, ) - metadata_cid = self.put_metadata(metadata, pid, checked_format_id) + metadata_cid = self._put_metadata(metadata, pid, checked_format_id) logging.info( "FileHashStore - store_metadata: Successfully stored metadata for pid: %s", @@ -731,7 +731,7 @@ def retrieve_object(self, pid): "FileHashStore - retrieve_object: Metadata exists for pid: %s, retrieving object.", pid, ) - obj_stream = self.open(entity, object_cid) + obj_stream = self._open(entity, object_cid) else: exception_string = ( f"FileHashStore - retrieve_object: No object found for pid: {pid}" @@ -753,10 +753,10 @@ def retrieve_metadata(self, pid, format_id=None): checked_format_id = self._check_arg_format_id(format_id, "retrieve_metadata") entity = "metadata" - metadata_cid = self.computehash(pid + checked_format_id) - metadata_exists = self.exists(entity, metadata_cid) + metadata_cid = self._computehash(pid + checked_format_id) + metadata_exists = self._exists(entity, 
metadata_cid) if metadata_exists: - metadata_stream = self.open(entity, metadata_cid) + metadata_stream = self._open(entity, metadata_cid) else: exception_string = ( f"FileHashStore - retrieve_metadata: No metadata found for pid: {pid}" @@ -782,19 +782,19 @@ def delete_object(self, pid): return if "cid refs file not found" in fnfe: # Delete pid refs file - pid_ref_abs_path = self.resolve_path("pid", pid) - self.delete("pid", pid_ref_abs_path) + pid_ref_abs_path = self._resolve_path("pid", pid) + self._delete("pid", pid_ref_abs_path) return if "object referenced does not exist" in fnfe: # Delete pid refs file - pid_ref_abs_path = self.resolve_path("pid", pid) - self.delete("pid", pid_ref_abs_path) + pid_ref_abs_path = self._resolve_path("pid", pid) + self._delete("pid", pid_ref_abs_path) return except ValueError as ve: if "is missing from cid refs file" in ve: # Delete pid refs file - pid_ref_abs_path = self.resolve_path("pid", pid) - self.delete("pid", pid_ref_abs_path) + pid_ref_abs_path = self._resolve_path("pid", pid) + self._delete("pid", pid_ref_abs_path) return # Proceed with next steps - cid has been retrieved without any errors @@ -812,16 +812,16 @@ def delete_object(self, pid): ) self.reference_locked_cids.append(cid) try: - cid_ref_abs_path = self.resolve_path("cid", cid) - pid_ref_abs_path = self.resolve_path("pid", pid) + cid_ref_abs_path = self._resolve_path("cid", cid) + pid_ref_abs_path = self._resolve_path("pid", pid) # First delete the pid refs file immediately self._delete_pid_refs_file(pid_ref_abs_path) # Remove pid from cid reference file self._delete_cid_refs_pid(cid_ref_abs_path, pid) # Delete cid reference file and object only if the cid refs file is empty if os.path.getsize(cid_ref_abs_path) == 0: - self.delete("cid", cid_ref_abs_path) - self.delete("objects", cid) + self._delete("cid", cid_ref_abs_path) + self._delete("objects", cid) info_string = ( "FileHashStore - delete_object: Successfully deleted references and" + f" object associated 
with pid: {pid}" @@ -853,8 +853,8 @@ def delete_metadata(self, pid, format_id=None): checked_format_id = self._check_arg_format_id(format_id, "delete_metadata") entity = "metadata" - metadata_cid = self.computehash(pid + checked_format_id) - self.delete(entity, metadata_cid) + metadata_cid = self._computehash(pid + checked_format_id) + self._delete(entity, metadata_cid) logging.info( "FileHashStore - delete_metadata: Successfully deleted metadata for pid: %s", @@ -871,16 +871,16 @@ def get_hex_digest(self, pid, algorithm): self._check_string(algorithm, "algorithm", "get_hex_digest") entity = "objects" - algorithm = self.clean_algorithm(algorithm) + algorithm = self._clean_algorithm(algorithm) object_cid = self.find_object(pid) - if not self.exists(entity, object_cid): + if not self._exists(entity, object_cid): exception_string = ( f"FileHashStore - get_hex_digest: No object found for pid: {pid}" ) logging.error(exception_string) raise ValueError(exception_string) - cid_stream = self.open(entity, object_cid) - hex_digest = self.computehash(cid_stream, algorithm=algorithm) + cid_stream = self._open(entity, object_cid) + hex_digest = self._computehash(cid_stream, algorithm=algorithm) info_string = ( f"FileHashStore - get_hex_digest: Successfully calculated hex digest for pid: {pid}." @@ -891,7 +891,7 @@ def get_hex_digest(self, pid, algorithm): # FileHashStore Core Methods - def store_and_validate_data( + def _store_and_validate_data( self, pid, file, @@ -944,7 +944,7 @@ def store_and_validate_data( ) return object_metadata - def store_data_only(self, data): + def _store_data_only(self, data): """Store an object to HashStore and return the ID and a hex digest dictionary of the default algorithms. This method does not validate the object and writes directly to `/objects` after the hex digests are calculated. @@ -958,7 +958,7 @@ def store_data_only(self, data): size, and hex digest dictionary. 
""" logging.debug( - "FileHashStore - store_data_only: Request to store data object only." + "FileHashStore - _store_data_only: Request to store data object only." ) try: @@ -979,14 +979,14 @@ def store_data_only(self, data): # The permanent address of the data stored is based on the data's checksum cid = hex_digest_dict.get(self.algorithm) logging.debug( - "FileHashStore - store_data_only: Successfully stored object with cid: %s", + "FileHashStore - _store_data_only: Successfully stored object with cid: %s", cid, ) return object_metadata # pylint: disable=W0718 except Exception as err: exception_string = ( - "FileHashStore - store_data_only: failed to store object." + "FileHashStore - _store_data_only: failed to store object." + f" Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) @@ -1043,7 +1043,7 @@ def _move_and_get_checksums( # Objects are stored with their content identifier based on the store algorithm entity = "objects" object_cid = hex_digests.get(self.algorithm) - abs_file_path = self.build_path(entity, object_cid, extension) + abs_file_path = self._build_path(entity, object_cid, extension) # Only move file if it doesn't exist. We do not check before we create the tmp # file and calculate the hex digests because the given checksum could be incorrect. 
@@ -1059,7 +1059,7 @@ def _move_and_get_checksums( tmp_file_size, file_size_to_validate, ) - self.create_path(os.path.dirname(abs_file_path)) + self._create_path(os.path.dirname(abs_file_path)) try: debug_msg = ( "FileHashStore - _move_and_get_checksums: Moving temp file to permanent" @@ -1096,12 +1096,12 @@ def _move_and_get_checksums( + f" found but with incomplete state, deleting file: {abs_file_path}", ) logging.debug(debug_msg) - self.delete(entity, abs_file_path) + self._delete(entity, abs_file_path) logging.debug( "FileHashStore - _move_and_get_checksums: Deleting temporary file: %s", tmp_file_name, ) - self.delete(entity, tmp_file_name) + self._delete(entity, tmp_file_name) err_msg = ( "Aborting store_object upload - an unexpected error has occurred when moving" + f" file to: {object_cid} - Error: {err}" @@ -1131,7 +1131,7 @@ def _move_and_get_checksums( raise FileExistsError from ge finally: # Delete the temporary file, it already exists so it is redundant - self.delete(entity, tmp_file_name) + self._delete(entity, tmp_file_name) return object_cid, tmp_file_size, hex_digests @@ -1155,7 +1155,7 @@ def _write_to_tmp_file_and_get_hex_digests( algorithm_list_to_calculate = self._refine_algorithm_list( additional_algorithm, checksum_algorithm ) - tmp_root_path = self.get_store_path("objects") / "tmp" + tmp_root_path = self._get_store_path("objects") / "tmp" tmp = self._mktmpfile(tmp_root_path) logging.debug( @@ -1235,7 +1235,7 @@ def _mktmpfile(self, path): """ # Physically create directory if it doesn't exist if os.path.exists(path) is False: - self.create_path(path) + self._create_path(path) tmp = NamedTemporaryFile(dir=path, delete=False) @@ -1425,7 +1425,7 @@ def _delete_pid_refs_file(self, pid_ref_abs_path): ) raise FileNotFoundError(err_msg) else: - self.delete("pid", pid_ref_abs_path) + self._delete("pid", pid_ref_abs_path) except Exception as err: exception_string = ( @@ -1435,7 +1435,7 @@ def _delete_pid_refs_file(self, pid_ref_abs_path): 
logging.error(exception_string) raise err - def put_metadata(self, metadata, pid, format_id): + def _put_metadata(self, metadata, pid, format_id): """Store contents of metadata to `[self.root]/metadata` using the hash of the given PID and format ID as the permanent address. @@ -1447,7 +1447,7 @@ def put_metadata(self, metadata, pid, format_id): :rtype: str """ logging.debug( - "FileHashStore - put_metadata: Request to put metadata for pid: %s", pid + "FileHashStore - _put_metadata: Request to put metadata for pid: %s", pid ) # Create metadata tmp file and write to it metadata_stream = Stream(metadata) @@ -1455,9 +1455,9 @@ def put_metadata(self, metadata, pid, format_id): metadata_tmp = self._mktmpmetadata(metadata_stream) # Get target and related paths (permanent location) - metadata_cid = self.computehash(pid + format_id) - rel_path = "/".join(self.shard(metadata_cid)) - full_path = self.get_store_path("metadata") / rel_path + metadata_cid = self._computehash(pid + format_id) + rel_path = "/".join(self._shard(metadata_cid)) + full_path = self._get_store_path("metadata") / rel_path # Move metadata to target path if os.path.exists(metadata_tmp): @@ -1467,26 +1467,26 @@ def put_metadata(self, metadata, pid, format_id): # Metadata will be replaced if it exists shutil.move(metadata_tmp, full_path) logging.debug( - "FileHashStore - put_metadata: Successfully put metadata for pid: %s", + "FileHashStore - _put_metadata: Successfully put metadata for pid: %s", pid, ) return metadata_cid except Exception as err: exception_string = ( - f"FileHashStore - put_metadata: Unexpected {err=}, {type(err)=}" + f"FileHashStore - _put_metadata: Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) if os.path.exists(metadata_tmp): # Remove tmp metadata, calling app must re-upload logging.debug( - "FileHashStore - put_metadata: Deleting metadata for pid: %s", + "FileHashStore - _put_metadata: Deleting metadata for pid: %s", pid, ) self.metadata.delete(metadata_tmp) raise 
else: exception_string = ( - f"FileHashStore - put_metadata: Attempt to move metadata for pid: {pid}" + f"FileHashStore - _put_metadata: Attempt to move metadata for pid: {pid}" + f", but metadata temp file not found: {metadata_tmp}" ) logging.error(exception_string) @@ -1501,7 +1501,7 @@ def _mktmpmetadata(self, stream): :rtype: str """ # Create temporary file in .../{store_path}/tmp - tmp_root_path = self.get_store_path("metadata") / "tmp" + tmp_root_path = self._get_store_path("metadata") / "tmp" tmp = self._mktmpfile(tmp_root_path) # tmp is a file-like object that is already opened for writing by default @@ -1552,7 +1552,7 @@ def _verify_object_information( + f" {file_size_to_validate}." ) if pid is not None: - self.delete(entity, tmp_file_name) + self._delete(entity, tmp_file_name) exception_string_for_pid = ( exception_string + f" Tmp file deleted and file not stored for pid: {pid}" @@ -1581,7 +1581,7 @@ def _verify_object_information( ) if pid is not None: # Delete the tmp file - self.delete(entity, tmp_file_name) + self._delete(entity, tmp_file_name) exception_string_for_pid = ( exception_string + f" Tmp file ({tmp_file_name}) deleted." 
) @@ -1590,8 +1590,8 @@ def _verify_object_information( else: # Delete the object cid = hex_digests[self.algorithm] - cid_abs_path = self.resolve_path("cid", cid) - self.delete(entity, cid_abs_path) + cid_abs_path = self._resolve_path("cid", cid) + self._delete(entity, cid_abs_path) logging.error(exception_string) raise ValueError(exception_string) @@ -1604,8 +1604,8 @@ def _verify_hashstore_references(self, pid, cid, verify_type): :param str verify_type: "update" or "create" """ # Check that reference files were created - pid_ref_abs_path = self.resolve_path("pid", pid) - cid_ref_abs_path = self.resolve_path("cid", cid) + pid_ref_abs_path = self._resolve_path("pid", pid) + cid_ref_abs_path = self._resolve_path("cid", cid) if not os.path.exists(pid_ref_abs_path): exception_string = ( "FileHashStore - _verify_hashstore_references: Pid refs file missing: " @@ -1695,7 +1695,7 @@ def _check_arg_algorithms_and_checksum( additional_algorithm_checked = None if additional_algorithm != self.algorithm and additional_algorithm is not None: # Set additional_algorithm - additional_algorithm_checked = self.clean_algorithm(additional_algorithm) + additional_algorithm_checked = self._clean_algorithm(additional_algorithm) checksum_algorithm_checked = None if checksum is not None: self._check_string( @@ -1710,7 +1710,7 @@ def _check_arg_algorithms_and_checksum( "_check_arg_algorithms_and_checksum (store_object)", ) # Set checksum_algorithm - checksum_algorithm_checked = self.clean_algorithm(checksum_algorithm) + checksum_algorithm_checked = self._clean_algorithm(checksum_algorithm) return additional_algorithm_checked, checksum_algorithm_checked def _check_arg_format_id(self, format_id, method): @@ -1746,7 +1746,7 @@ def _refine_algorithm_list(self, additional_algorithm, checksum_algorithm): """ algorithm_list_to_calculate = self.default_algo_list if checksum_algorithm is not None: - self.clean_algorithm(checksum_algorithm) + self._clean_algorithm(checksum_algorithm) if 
checksum_algorithm in self.other_algo_list: debug_additional_other_algo_str = ( f"FileHashStore - _refine_algorithm_list: checksum algo: {checksum_algorithm}" @@ -1755,7 +1755,7 @@ def _refine_algorithm_list(self, additional_algorithm, checksum_algorithm): logging.debug(debug_additional_other_algo_str) algorithm_list_to_calculate.append(checksum_algorithm) if additional_algorithm is not None: - self.clean_algorithm(additional_algorithm) + self._clean_algorithm(additional_algorithm) if additional_algorithm in self.other_algo_list: debug_additional_other_algo_str = ( f"FileHashStore - _refine_algorithm_list: addit algo: {additional_algorithm}" @@ -1768,7 +1768,7 @@ def _refine_algorithm_list(self, additional_algorithm, checksum_algorithm): algorithm_list_to_calculate = set(algorithm_list_to_calculate) return algorithm_list_to_calculate - def clean_algorithm(self, algorithm_string): + def _clean_algorithm(self, algorithm_string): """Format a string and ensure that it is supported and compatible with the Python `hashlib` library. @@ -1791,14 +1791,14 @@ def clean_algorithm(self, algorithm_string): and cleaned_string not in self.other_algo_list ): exception_string = ( - "FileHashStore: clean_algorithm: Algorithm not supported:" + "FileHashStore: _clean_algorithm: Algorithm not supported:" + cleaned_string ) logging.error(exception_string) raise ValueError(exception_string) return cleaned_string - def computehash(self, stream, algorithm=None): + def _computehash(self, stream, algorithm=None): """Compute the hash of a file-like object (or string) using the store algorithm by default or with an optional supported algorithm. 
@@ -1812,14 +1812,14 @@ def computehash(self, stream, algorithm=None): if algorithm is None: hashobj = hashlib.new(self.algorithm) else: - check_algorithm = self.clean_algorithm(algorithm) + check_algorithm = self._clean_algorithm(algorithm) hashobj = hashlib.new(check_algorithm) for data in stream: hashobj.update(self._cast_to_bytes(data)) hex_digest = hashobj.hexdigest() return hex_digest - def exists(self, entity, file): + def _exists(self, entity, file): """Check whether a given file id or path exists on disk. :param str entity: Desired entity type (e.g., "objects", "metadata"). @@ -1828,10 +1828,10 @@ def exists(self, entity, file): :return: True if the file exists. :rtype: bool """ - file_exists = bool(self.resolve_path(entity, file)) + file_exists = bool(self._resolve_path(entity, file)) return file_exists - def shard(self, digest): + def _shard(self, digest): """Generates a list given a digest of `self.depth` number of tokens with width `self.width` from the first part of the digest plus the remainder. @@ -1857,7 +1857,7 @@ def compact(items): return hierarchical_list - def open(self, entity, file, mode="rb"): + def _open(self, entity, file, mode="rb"): """Return open buffer object from given id or path. Caller is responsible for closing the stream. @@ -1868,7 +1868,7 @@ def open(self, entity, file, mode="rb"): :return: An `io` stream dependent on the `mode`. :rtype: io.BufferedReader """ - realpath = self.resolve_path(entity, file) + realpath = self._resolve_path(entity, file) if realpath is None: raise IOError(f"Could not locate file: {file}") @@ -1877,14 +1877,14 @@ def open(self, entity, file, mode="rb"): buffer = io.open(realpath, mode) return buffer - def delete(self, entity, file): + def _delete(self, entity, file): """Delete file using id or path. Remove any empty directories after deleting. No exception is raised if file doesn't exist. :param str entity: Desired entity type (ex. "objects", "metadata"). :param str file: Address ID or path of file. 
""" - realpath = self.resolve_path(entity, file) + realpath = self._resolve_path(entity, file) if realpath is None: return None @@ -1929,7 +1929,7 @@ def _has_subdir(self, path): is_subdir = subpath.startswith(root_path) return is_subdir - def create_path(self, path): + def _create_path(self, path): """Physically create the folder path (and all intermediate ones) on disk. :param str path: The path to create. @@ -1940,7 +1940,7 @@ def create_path(self, path): except FileExistsError: assert os.path.isdir(path), f"expected {path} to be a directory" - def build_path(self, entity, hash_id, extension=""): + def _build_path(self, entity, hash_id, extension=""): """Build the absolute file path for a given hash ID with an optional file extension. :param str entity: Desired entity type (ex. "objects", "metadata"). @@ -1950,8 +1950,8 @@ def build_path(self, entity, hash_id, extension=""): :return: An absolute file path for the specified hash ID. :rtype: str """ - paths = self.shard(hash_id) - root_dir = self.get_store_path(entity) + paths = self._shard(hash_id) + root_dir = self._get_store_path(entity) if extension and not extension.startswith(os.extsep): extension = os.extsep + extension @@ -1961,7 +1961,7 @@ def build_path(self, entity, hash_id, extension=""): absolute_path = os.path.join(root_dir, *paths) + extension return absolute_path - def resolve_path(self, entity, file): + def _resolve_path(self, entity, file): """Attempt to determine the absolute path of a file ID or path through successive checking of candidate paths. @@ -1989,19 +1989,19 @@ def resolve_path(self, entity, file): # Check for sharded path. 
if entity == "cid": # Note, we skip checking whether the file exists for refs - ref_file_abs_path = self.build_path(entity, file) + ref_file_abs_path = self._build_path(entity, file) return ref_file_abs_path elif entity == "pid": # Note, we skip checking whether the file exists for refs - hash_id = self.computehash(file, self.algorithm) - ref_file_abs_path = self.build_path(entity, hash_id) + hash_id = self._computehash(file, self.algorithm) + ref_file_abs_path = self._build_path(entity, hash_id) return ref_file_abs_path else: - abspath = self.build_path(entity, file) + abspath = self._build_path(entity, file) if os.path.isfile(abspath): return abspath - def get_store_path(self, entity): + def _get_store_path(self, entity): """Return a path object of the root directory of the store. :param str entity: Desired entity type: "objects" or "metadata" @@ -2024,7 +2024,7 @@ def get_store_path(self, entity): f"entity: {entity} does not exist. Do you mean 'objects', 'metadata' or 'refs'?" ) - def count(self, entity): + def _count(self, entity): """Return the count of the number of files in the `root` directory. :param str entity: Desired entity type (ex. "objects", "metadata"). 
diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 1d549a6e..2aafec81 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -5,8 +5,7 @@ import pytest from hashstore.filehashstore import FileHashStore - -# Tests for HashStore Configuration and Related Methods +# pylint: disable=W0212 def test_pids_length(pids): @@ -121,7 +120,7 @@ def test_init_with_existing_hashstore_missing_yaml(store, pids): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - store.store_and_validate_data(pid, path) + store._store_and_validate_data(pid, path) os.remove(store.hashstore_configuration_yaml) properties = { "store_path": store.root, @@ -135,8 +134,8 @@ def test_init_with_existing_hashstore_missing_yaml(store, pids): def test_load_properties(store): - """Verify dictionary returned from load_properties matches initialization.""" - hashstore_yaml_dict = store.load_properties() + """Verify dictionary returned from _load_properties matches initialization.""" + hashstore_yaml_dict = store._load_properties() assert hashstore_yaml_dict.get("store_depth") == 3 assert hashstore_yaml_dict.get("store_width") == 2 assert hashstore_yaml_dict.get("store_algorithm") == "SHA-256" @@ -150,7 +149,7 @@ def test_load_properties_hashstore_yaml_missing(store): """Confirm FileNotFoundError is raised when hashstore.yaml does not exist.""" os.remove(store.hashstore_configuration_yaml) with pytest.raises(FileNotFoundError): - store.load_properties() + store._load_properties() def test_validate_properties(store): @@ -207,7 +206,7 @@ def test_set_default_algorithms_missing_yaml(store, pids): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - store.store_and_validate_data(pid, path) + store._store_and_validate_data(pid, path) os.remove(store.hashstore_configuration_yaml) with pytest.raises(FileNotFoundError): # pylint: disable=W0212 @@ -218,67 +217,67 @@ def 
test_set_default_algorithms_missing_yaml(store, pids): def test_store_and_validate_data_files_path(pids, store): - """Test store_and_validate_data objects with path object for the path arg.""" + """Test _store_and_validate_data objects with path object for the path arg.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): path = Path(test_dir) / pid.replace("/", "_") - object_metadata = store.store_and_validate_data(pid, path) + object_metadata = store._store_and_validate_data(pid, path) object_metadata_id = object_metadata.cid - assert store.exists(entity, object_metadata_id) + assert store._exists(entity, object_metadata_id) def test_store_and_validate_data_files_string(pids, store): - """Test store_and_validate_data objects with string for the path arg.""" + """Test _store_and_validate_data objects with string for the path arg.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_and_validate_data(pid, path) + object_metadata = store._store_and_validate_data(pid, path) object_metadata_id = object_metadata.cid - assert store.exists(entity, object_metadata_id) + assert store._exists(entity, object_metadata_id) def test_store_and_validate_data_files_stream(pids, store): - """Test store_and_validate_data objects with stream for the path arg.""" + """Test _store_and_validate_data objects with stream for the path arg.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") - object_metadata = store.store_and_validate_data(pid, input_stream) + object_metadata = store._store_and_validate_data(pid, input_stream) input_stream.close() object_metadata_id = object_metadata.cid - assert store.exists(entity, object_metadata_id) - assert store.count(entity) == 3 + assert store._exists(entity, object_metadata_id) + assert store._count(entity) == 3 def 
test_store_and_validate_data_cid(pids, store): - """Check store_and_validate_data returns correct id.""" + """Check _store_and_validate_data returns correct id.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_and_validate_data(pid, path) + object_metadata = store._store_and_validate_data(pid, path) object_metadata_id = object_metadata.cid assert object_metadata_id == pids[pid][store.algorithm] def test_store_and_validate_data_file_size(pids, store): - """Check store_and_validate_data returns correct file size.""" + """Check _store_and_validate_data returns correct file size.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_and_validate_data(pid, path) + object_metadata = store._store_and_validate_data(pid, path) object_size = object_metadata.obj_size assert object_size == pids[pid]["file_size_bytes"] def test_store_and_validate_data_hex_digests(pids, store): - """Check store_and_validate_data successfully generates hex digests dictionary.""" + """Check _store_and_validate_data successfully generates hex digests dictionary.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_and_validate_data(pid, path) + object_metadata = store._store_and_validate_data(pid, path) object_metadata_hex_digests = object_metadata.hex_digests assert object_metadata_hex_digests.get("md5") == pids[pid]["md5"] assert object_metadata_hex_digests.get("sha1") == pids[pid]["sha1"] @@ -288,12 +287,12 @@ def test_store_and_validate_data_hex_digests(pids, store): def test_store_and_validate_data_additional_algorithm(pids, store): - """Check store_and_validate_data returns additional algorithm in hex digests.""" + """Check _store_and_validate_data returns additional algorithm in hex digests.""" test_dir = "tests/testdata/" for pid in pids.keys(): algo = "sha224" path 
= test_dir + pid.replace("/", "_") - object_metadata = store.store_and_validate_data( + object_metadata = store._store_and_validate_data( pid, path, additional_algorithm=algo ) hex_digests = object_metadata.hex_digests @@ -302,20 +301,20 @@ def test_store_and_validate_data_additional_algorithm(pids, store): def test_store_and_validate_data_with_correct_checksums(pids, store): - """Check store_and_validate_data with valid checksum and checksum algorithm supplied.""" + """Check _store_and_validate_data with valid checksum and checksum algorithm supplied.""" test_dir = "tests/testdata/" for pid in pids.keys(): algo = "sha224" algo_checksum = pids[pid][algo] path = test_dir + pid.replace("/", "_") - store.store_and_validate_data( + store._store_and_validate_data( pid, path, checksum=algo_checksum, checksum_algorithm=algo ) - assert store.count("objects") == 3 + assert store._count("objects") == 3 def test_store_and_validate_data_with_incorrect_checksum(pids, store): - """Check store_and_validate_data fails when a bad checksum supplied.""" + """Check _store_and_validate_data fails when a bad checksum supplied.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -323,38 +322,38 @@ def test_store_and_validate_data_with_incorrect_checksum(pids, store): algo_checksum = "badChecksumValue" path = test_dir + pid.replace("/", "_") with pytest.raises(ValueError): - store.store_and_validate_data( + store._store_and_validate_data( pid, path, checksum=algo_checksum, checksum_algorithm=algo ) - assert store.count(entity) == 0 + assert store._count(entity) == 0 def test_store_data_only_cid(pids, store): - """Check store_data_only returns correct id.""" + """Check _store_data_only returns correct id.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_data_only(path) + object_metadata = store._store_data_only(path) object_metadata_id = object_metadata.cid assert object_metadata_id == 
pids[pid][store.algorithm] def test_store_data_only_file_size(pids, store): - """Check store_data_only returns correct file size.""" + """Check _store_data_only returns correct file size.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_data_only(path) + object_metadata = store._store_data_only(path) object_size = object_metadata.obj_size assert object_size == pids[pid]["file_size_bytes"] def test_store_data_only_hex_digests(pids, store): - """Check store_data_only generates hex digests dictionary.""" + """Check _store_data_only generates hex digests dictionary.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_data_only(path) + object_metadata = store._store_data_only(path) object_metadata_hex_digests = object_metadata.hex_digests assert object_metadata_hex_digests.get("md5") == pids[pid]["md5"] assert object_metadata_hex_digests.get("sha1") == pids[pid]["sha1"] @@ -437,7 +436,7 @@ def test_move_and_get_checksums_duplicates_raises_error(pids, store): checksum_algorithm="sha256", ) input_stream.close() - assert store.count(entity) == 3 + assert store._count(entity) == 3 def test_move_and_get_checksums_incorrect_file_size(pids, store): @@ -607,23 +606,23 @@ def test_write_to_tmp_file_and_get_hex_digests_with_unsupported_algorithm(pids, def test_mktmpfile(store): """Test that _mktmpfile creates and returns a tmp file.""" path = store.root + "/doutest/tmp/" - store.create_path(path) + store._create_path(path) # pylint: disable=W0212 tmp = store._mktmpfile(path) assert os.path.exists(tmp.name) def test_put_metadata_with_path(pids, store): - """Test put_metadata with path object for the path arg.""" + """Test _put_metadata with path object for the path arg.""" entity = "metadata" test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): filename = pid.replace("/", 
"_") + ".xml" syspath = Path(test_dir) / filename - metadata_cid = store.put_metadata(syspath, pid, format_id) - assert store.exists(entity, metadata_cid) - assert store.count(entity) == 3 + metadata_cid = store._put_metadata(syspath, pid, format_id) + assert store._exists(entity, metadata_cid) + assert store._count(entity) == 3 def test_put_metadata_with_string(pids, store): @@ -634,9 +633,9 @@ def test_put_metadata_with_string(pids, store): for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = str(Path(test_dir) / filename) - metadata_cid = store.put_metadata(syspath, pid, format_id) - assert store.exists(entity, metadata_cid) - assert store.count(entity) == 3 + metadata_cid = store._put_metadata(syspath, pid, format_id) + assert store._exists(entity, metadata_cid) + assert store._count(entity) == 3 def test_put_metadata_cid(pids, store): @@ -646,7 +645,7 @@ def test_put_metadata_cid(pids, store): for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - metadata_cid = store.put_metadata(syspath, pid, format_id) + metadata_cid = store._put_metadata(syspath, pid, format_id) assert metadata_cid == pids[pid]["metadata_cid"] @@ -771,9 +770,9 @@ def test_clean_algorithm(store): algorithm_underscore = "sha_256" algorithm_hyphen = "sha-256" algorithm_other_hyphen = "sha3-256" - cleaned_algo_underscore = store.clean_algorithm(algorithm_underscore) - cleaned_algo_hyphen = store.clean_algorithm(algorithm_hyphen) - cleaned_algo_other_hyphen = store.clean_algorithm(algorithm_other_hyphen) + cleaned_algo_underscore = store._clean_algorithm(algorithm_underscore) + cleaned_algo_hyphen = store._clean_algorithm(algorithm_hyphen) + cleaned_algo_other_hyphen = store._clean_algorithm(algorithm_other_hyphen) assert cleaned_algo_underscore == "sha256" assert cleaned_algo_hyphen == "sha256" assert cleaned_algo_other_hyphen == "sha3_256" @@ -785,7 +784,7 @@ def test_computehash(pids, store): for pid in pids.keys(): path 
= test_dir + pid.replace("/", "_") obj_stream = io.open(path, "rb") - obj_sha256_hash = store.computehash(obj_stream, "sha256") + obj_sha256_hash = store._computehash(obj_stream, "sha256") obj_stream.close() assert pids[pid]["sha256"] == obj_sha256_hash @@ -793,7 +792,7 @@ def test_computehash(pids, store): def test_get_store_path_object(store): """Check get_store_path for object path.""" # pylint: disable=W0212 - path_objects = store.get_store_path("objects") + path_objects = store._get_store_path("objects") path_objects_string = str(path_objects) assert path_objects_string.endswith("/metacat/objects") @@ -801,7 +800,7 @@ def test_get_store_path_object(store): def test_get_store_path_metadata(store): """Check get_store_path for metadata path.""" # pylint: disable=W0212 - path_metadata = store.get_store_path("metadata") + path_metadata = store._get_store_path("metadata") path_metadata_string = str(path_metadata) assert path_metadata_string.endswith("/metacat/metadata") @@ -809,7 +808,7 @@ def test_get_store_path_metadata(store): def test_get_store_path_refs(store): """Check get_store_path for refs path.""" # pylint: disable=W0212 - path_metadata = store.get_store_path("refs") + path_metadata = store._get_store_path("refs") path_metadata_string = str(path_metadata) assert path_metadata_string.endswith("/metacat/refs") @@ -820,8 +819,8 @@ def test_exists_with_object_metadata_id(pids, store): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_and_validate_data(pid, path) - assert store.exists(entity, object_metadata.cid) + object_metadata = store._store_and_validate_data(pid, path) + assert store._exists(entity, object_metadata.cid) def test_exists_with_sharded_path(pids, store): @@ -830,17 +829,17 @@ def test_exists_with_sharded_path(pids, store): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_and_validate_data(pid, path) - 
object_metadata_shard = store.shard(object_metadata.cid) + object_metadata = store._store_and_validate_data(pid, path) + object_metadata_shard = store._shard(object_metadata.cid) object_metadata_shard_path = "/".join(object_metadata_shard) - assert store.exists(entity, object_metadata_shard_path) + assert store._exists(entity, object_metadata_shard_path) def test_exists_with_nonexistent_file(store): """Test exists method with a nonexistent file.""" entity = "objects" non_existent_file = "tests/testdata/filedoesnotexist" - does_not_exist = store.exists(entity, non_existent_file) + does_not_exist = store._exists(entity, non_existent_file) assert does_not_exist is False @@ -853,7 +852,7 @@ def test_shard(store): "5e", "d77052d7e166017f779cbc193357c3a5006ee8b8457230bcf7abcef65e", ] - sharded_list = store.shard(hash_id) + sharded_list = store._shard(hash_id) assert predefined_list == sharded_list @@ -863,9 +862,9 @@ def test_open_objects(pids, store): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_and_validate_data(pid, path) + object_metadata = store._store_and_validate_data(pid, path) object_metadata_id = object_metadata.cid - io_buffer = store.open(entity, object_metadata_id) + io_buffer = store._open(entity, object_metadata_id) assert isinstance(io_buffer, io.BufferedReader) io_buffer.close() @@ -876,10 +875,10 @@ def test_delete_by_object_metadata_id(pids, store): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_and_validate_data(pid, path) + object_metadata = store._store_and_validate_data(pid, path) object_metadata_id = object_metadata.cid - store.delete(entity, object_metadata_id) - assert store.count(entity) == 0 + store._delete(entity, object_metadata_id) + assert store._count(entity) == 0 def test_remove_empty_removes_empty_folders_string(store): @@ -927,8 +926,8 @@ def test_remove_empty_does_not_remove_nonempty_folders(pids, 
store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_and_validate_data(pid, path) - object_metadata_shard = store.shard(object_metadata.cid) + object_metadata = store._store_and_validate_data(pid, path) + object_metadata_shard = store._shard(object_metadata.cid) object_metadata_shard_path = "/".join(object_metadata_shard) # Get parent directory of the relative path parent_dir = os.path.dirname(object_metadata_shard_path) @@ -973,7 +972,7 @@ def test_create_path(pids, store): root_directory = store.root pid_hex_digest_directory = pids[pid]["metadata_cid"][:2] pid_directory = root_directory + pid_hex_digest_directory - store.create_path(pid_directory) + store._create_path(pid_directory) assert os.path.isdir(pid_directory) @@ -981,7 +980,7 @@ def test_get_real_path_file_does_not_exist(store): """Test get_real_path returns None when object does not exist.""" entity = "objects" test_path = "tests/testdata/helloworld.txt" - real_path_exists = store.resolve_path(entity, test_path) + real_path_exists = store._resolve_path(entity, test_path) assert real_path_exists is None @@ -991,8 +990,8 @@ def test_get_real_path_with_object_id(store, pids): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_and_validate_data(pid, path) - obj_abs_path = store.resolve_path(entity, object_metadata.cid) + object_metadata = store._store_and_validate_data(pid, path) + obj_abs_path = store._resolve_path(entity, object_metadata.cid) assert os.path.exists(obj_abs_path) @@ -1002,10 +1001,10 @@ def test_get_real_path_with_object_id_sharded(pids, store): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_and_validate_data(pid, path) - object_metadata_shard = store.shard(object_metadata.cid) + object_metadata = store._store_and_validate_data(pid, path) + object_metadata_shard = 
store._shard(object_metadata.cid) object_metadata_shard_path = "/".join(object_metadata_shard) - obj_abs_path = store.resolve_path(entity, object_metadata_shard_path) + obj_abs_path = store._resolve_path(entity, object_metadata_shard_path) assert os.path.exists(obj_abs_path) @@ -1018,7 +1017,7 @@ def test_get_real_path_with_metadata_id(store, pids): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename metadata_cid = store.store_metadata(pid, syspath, format_id) - metadata_abs_path = store.resolve_path(entity, metadata_cid) + metadata_abs_path = store._resolve_path(entity, metadata_cid) assert os.path.exists(metadata_abs_path) @@ -1028,9 +1027,9 @@ def test_get_real_path_with_bad_entity(store, pids): entity = "bad_entity" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.store_and_validate_data(pid, path) + object_metadata = store._store_and_validate_data(pid, path) with pytest.raises(ValueError): - store.resolve_path(entity, object_metadata.cid) + store._resolve_path(entity, object_metadata.cid) def test_build_path(store, pids): @@ -1039,9 +1038,9 @@ def test_build_path(store, pids): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - _ = store.store_and_validate_data(pid, path) + _ = store._store_and_validate_data(pid, path) # pylint: disable=W0212 - abs_path = store.build_path(entity, pids[pid][store.algorithm]) + abs_path = store._build_path(entity, pids[pid][store.algorithm]) assert os.path.exists(abs_path) @@ -1051,8 +1050,8 @@ def test_count(pids, store): entity = "objects" for pid in pids.keys(): path_string = test_dir + pid.replace("/", "_") - store.store_and_validate_data(pid, path_string) - assert store.count(entity) == 3 + store._store_and_validate_data(pid, path_string) + assert store._count(entity) == 3 def test_cast_to_bytes(store): diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 238d4e80..40dc99f0 100644 
--- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -8,6 +8,9 @@ import time import pytest +# pylint: disable=W0212 + + # Define a mark to be used to label slow tests slow_test = pytest.mark.skipif( "not config.getoption('--run-slow')", @@ -38,7 +41,7 @@ def test_store_object(pids, store): path = Path(test_dir + pid.replace("/", "_")) object_metadata = store.store_object(pid, path) assert object_metadata.cid == pids[pid][store.algorithm] - assert store.count(entity) == 3 + assert store._count(entity) == 3 def test_store_object_files_path(pids, store): @@ -48,8 +51,8 @@ def test_store_object_files_path(pids, store): for pid in pids.keys(): path = Path(test_dir + pid.replace("/", "_")) _object_metadata = store.store_object(pid, path) - assert store.exists(entity, pids[pid][store.algorithm]) - assert store.count(entity) == 3 + assert store._exists(entity, pids[pid][store.algorithm]) + assert store._count(entity) == 3 def test_store_object_files_string(pids, store): @@ -59,8 +62,8 @@ def test_store_object_files_string(pids, store): for pid in pids.keys(): path_string = test_dir + pid.replace("/", "_") _object_metadata = store.store_object(pid, path_string) - assert store.exists(entity, pids[pid][store.algorithm]) - assert store.count(entity) == 3 + assert store._exists(entity, pids[pid][store.algorithm]) + assert store._count(entity) == 3 def test_store_object_files_input_stream(pids, store): @@ -72,8 +75,8 @@ def test_store_object_files_input_stream(pids, store): input_stream = io.open(path, "rb") _object_metadata = store.store_object(pid, input_stream) input_stream.close() - assert store.exists(entity, pids[pid][store.algorithm]) - assert store.count(entity) == 3 + assert store._exists(entity, pids[pid][store.algorithm]) + assert store._count(entity) == 3 def test_store_object_id(pids, store): @@ -170,7 +173,7 @@ def test_store_object_additional_algorithm_hyphen_uppercase(pids, store): object_metadata = store.store_object(pid, 
path, algorithm_with_hyphen_and_upper) sha256_cid = object_metadata.hex_digests.get("sha384") assert sha256_cid == pids[pid]["sha384"] - assert store.exists(entity, pids[pid][store.algorithm]) + assert store._exists(entity, pids[pid][store.algorithm]) def test_store_object_additional_algorithm_hyphen_lowercase(pids, store): @@ -186,7 +189,7 @@ def test_store_object_additional_algorithm_hyphen_lowercase(pids, store): "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) assert additional_sha3_256_hex_digest == sha3_256_checksum - assert store.exists(entity, pids[pid][store.algorithm]) + assert store._exists(entity, pids[pid][store.algorithm]) def test_store_object_additional_algorithm_underscore(pids, store): @@ -202,7 +205,7 @@ def test_store_object_additional_algorithm_underscore(pids, store): "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) assert additional_sha3_256_hex_digest == sha3_256_checksum - assert store.exists(entity, pids[pid][store.algorithm]) + assert store._exists(entity, pids[pid][store.algorithm]) def test_store_object_checksum_correct(store): @@ -218,7 +221,7 @@ def test_store_object_checksum_correct(store): _object_metadata = store.store_object( pid, path, checksum=checksum_correct, checksum_algorithm=checksum_algo ) - assert store.count(entity) == 1 + assert store._count(entity) == 1 def test_store_object_checksum_correct_and_additional_algo(store): @@ -343,7 +346,7 @@ def test_store_object_duplicate_does_not_store_duplicate(store): pid_that_refs_existing_cid = "dou.test.1" _object_metadata_two = store.store_object(pid_that_refs_existing_cid, path) # Confirm only one object exists and the tmp file created is deleted - assert store.count(entity) == 1 + assert store._count(entity) == 1 def test_store_object_duplicate_references_files(pids, store): @@ -361,11 +364,11 @@ def test_store_object_duplicate_references_files(pids, store): pid_three = "dou.test.2" _object_metadata_three = store.store_object(pid_three, 
path) # Confirm that there are 3 pid reference files - assert store.count("pid") == 3 + assert store._count("pid") == 3 # Confirm that there are 1 cid reference files - assert store.count("cid") == 1 + assert store._count("cid") == 1 # Confirm the content of the cid refence files - cid_ref_abs_path = store.resolve_path("cid", pids[pid][store.algorithm]) + cid_ref_abs_path = store._resolve_path("cid", pids[pid][store.algorithm]) with open(cid_ref_abs_path, "r", encoding="utf8") as f: for _, line in enumerate(f, start=1): value = line.strip() @@ -387,7 +390,7 @@ def test_store_object_duplicate_references_content(pids, store): pid_three = "dou.test.2" store.store_object(pid_three, path) # Confirm the content of the cid refence files - cid_ref_abs_path = store.resolve_path("cid", pids[pid][store.algorithm]) + cid_ref_abs_path = store._resolve_path("cid", pids[pid][store.algorithm]) with open(cid_ref_abs_path, "r", encoding="utf8") as f: for _, line in enumerate(f, start=1): value = line.strip() @@ -411,8 +414,8 @@ def test_store_object_duplicate_raises_error_with_bad_validation_data(pids, stor _object_metadata_two = store.store_object( pid, path, checksum="nonmatchingchecksum", checksum_algorithm="sha256" ) - assert store.count(entity) == 1 - assert store.exists(entity, pids[pid][store.algorithm]) + assert store._count(entity) == 1 + assert store._exists(entity, pids[pid][store.algorithm]) def test_store_object_with_obj_file_size(store, pids): @@ -484,8 +487,8 @@ def store_object_wrapper(obj_pid, obj_path): thread2.join() thread3.join() # One thread will succeed, file count must still be 1 - assert store.count(entity) == 1 - assert store.exists(entity, pids[pid][store.algorithm]) + assert store._count(entity) == 1 + assert store._exists(entity, pids[pid][store.algorithm]) assert file_exists_error_flag @@ -600,7 +603,7 @@ def test_find_object_pid_refs_cid_not_found(pids, store): _object_metadata = store.store_object(pid, path) # Place the wrong cid into the pid refs 
file that has already been created - pid_ref_abs_path = store.resolve_path("pid", pid) + pid_ref_abs_path = store._resolve_path("pid", pid) with open(pid_ref_abs_path, "w", encoding="utf8") as pid_ref_file: pid_ref_file.seek(0) pid_ref_file.write("intentionally.wrong.pid") @@ -658,9 +661,9 @@ def test_store_metadata_files_path(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename metadata_cid = store.store_metadata(pid, syspath, format_id) - assert store.exists(entity, metadata_cid) + assert store._exists(entity, metadata_cid) assert metadata_cid == pids[pid]["metadata_cid"] - assert store.count(entity) == 3 + assert store._count(entity) == 3 def test_store_metadata_files_string(pids, store): @@ -672,8 +675,8 @@ def test_store_metadata_files_string(pids, store): filename = pid.replace("/", "_") + ".xml" syspath_string = str(Path(test_dir) / filename) metadata_cid = store.store_metadata(pid, syspath_string, format_id) - assert store.exists(entity, metadata_cid) - assert store.count(entity) == 3 + assert store._exists(entity, metadata_cid) + assert store._count(entity) == 3 def test_store_metadata_files_input_stream(pids, store): @@ -687,7 +690,7 @@ def test_store_metadata_files_input_stream(pids, store): syspath_stream = io.open(syspath_string, "rb") _metadata_cid = store.store_metadata(pid, syspath_stream, format_id) syspath_stream.close() - assert store.count(entity) == 3 + assert store._count(entity) == 3 def test_store_metadata_pid_empty(store): @@ -778,7 +781,7 @@ def test_store_metadata_thread_lock(store): thread2.join() thread3.join() thread4.join() - assert store.count(entity) == 1 + assert store._count(entity) == 1 def test_retrieve_object(pids, store): @@ -792,7 +795,7 @@ def test_retrieve_object(pids, store): object_metadata = store.store_object(pid, path) store.store_metadata(pid, syspath, format_id) obj_stream = store.retrieve_object(pid) - sha256_hex = store.computehash(obj_stream) + sha256_hex = 
store._computehash(obj_stream) obj_stream.close() assert sha256_hex == object_metadata.hex_digests.get("sha256") @@ -890,7 +893,7 @@ def test_delete_object(pids, store): _object_metadata = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath, format_id) store.delete_object(pid) - assert store.count(entity) == 0 + assert store._count(entity) == 0 def test_delete_object_pid_refs_file(pids, store): @@ -904,7 +907,7 @@ def test_delete_object_pid_refs_file(pids, store): _object_metadata = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath, format_id) store.delete_object(pid) - pid_refs_file_path = store.resolve_path("pid", pid) + pid_refs_file_path = store._resolve_path("pid", pid) assert not os.path.exists(pid_refs_file_path) @@ -920,7 +923,7 @@ def test_delete_object_cid_refs_file(pids, store): _metadata_cid = store.store_metadata(pid, syspath, format_id) cid = object_metadata.cid store.delete_object(pid) - cid_refs_file_path = store.resolve_path("cid", cid) + cid_refs_file_path = store._resolve_path("cid", cid) assert not os.path.exists(cid_refs_file_path) @@ -931,11 +934,11 @@ def test_delete_object_cid_refs_file_with_pid_refs_remaining(pids, store): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(pid, path) cid = object_metadata.cid - cid_refs_abs_path = store.resolve_path("cid", cid) + cid_refs_abs_path = store._resolve_path("cid", cid) # pylint: disable=W0212 store._update_cid_refs(cid_refs_abs_path, "dou.test.1") store.delete_object(pid) - cid_refs_file_path = store.resolve_path("cid", cid) + cid_refs_file_path = store._resolve_path("cid", cid) assert os.path.exists(cid_refs_file_path) @@ -966,7 +969,7 @@ def test_delete_metadata(pids, store): _metadata_cid = store.store_metadata(pid, syspath, format_id) is_deleted = store.delete_metadata(pid, format_id) assert is_deleted - assert store.count(entity) == 0 + assert store._count(entity) == 0 def 
test_delete_metadata_does_not_exist(pids, store): @@ -989,7 +992,7 @@ def test_delete_metadata_default_format_id(store, pids): _object_metadata = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath) store.delete_metadata(pid) - assert store.count(entity) == 0 + assert store._count(entity) == 0 def test_delete_metadata_pid_empty(store): @@ -1100,14 +1103,14 @@ def test_store_and_delete_objects_100_pids_1_cid(store): store.store_object(pid_modified, path) assert sum([len(files) for _, _, files in os.walk(store.root + "/refs/pid")]) == 100 assert sum([len(files) for _, _, files in os.walk(store.root + "/refs/cid")]) == 1 - assert store.count("objects") == 1 + assert store._count("objects") == 1 # Delete for i in range(1, upper_limit): pid_modified = f"dou.test.{str(i)}" store.delete_object(pid_modified) assert sum([len(files) for _, _, files in os.walk(store.root + "/refs/pid")]) == 0 assert sum([len(files) for _, _, files in os.walk(store.root + "/refs/cid")]) == 0 - assert store.count("objects") == 0 + assert store._count("objects") == 0 def test_store_and_delete_object_300_pids_1_cid_threads(store): @@ -1159,4 +1162,4 @@ def delete_object_wrapper(pid_var): assert sum([len(files) for _, _, files in os.walk(store.root + "/refs/pid")]) == 0 assert sum([len(files) for _, _, files in os.walk(store.root + "/refs/cid")]) == 0 - assert store.count("objects") == 0 + assert store._count("objects") == 0 diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py index 5ec56e59..92f5ae59 100644 --- a/tests/test_filehashstore_references.py +++ b/tests/test_filehashstore_references.py @@ -23,7 +23,7 @@ def test_tag_object_pid_refs_file(pids, store): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) store.tag_object(pid, object_metadata.cid) - pid_refs_file_path = store.resolve_path("pid", pid) + pid_refs_file_path = store._resolve_path("pid", pid) assert 
os.path.exists(pid_refs_file_path) @@ -35,9 +35,9 @@ def test_tag_object_pid_refs_file_exists(pids, store): object_metadata = store.store_object(None, path) cid = object_metadata.cid store.tag_object(pid, cid) - pid_refs_file_path = store.resolve_path("pid", pid) + pid_refs_file_path = store._resolve_path("pid", pid) assert os.path.exists(pid_refs_file_path) - cid_refs_file_path = store.resolve_path("cid", cid) + cid_refs_file_path = store._resolve_path("cid", cid) assert os.path.exists(cid_refs_file_path) with pytest.raises(FileExistsError): store.tag_object(pid, cid) @@ -50,7 +50,7 @@ def test_tag_object_pid_refs_file_content(pids, store): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) store.tag_object(pid, object_metadata.cid) - pid_refs_file_path = store.resolve_path("pid", pid) + pid_refs_file_path = store._resolve_path("pid", pid) with open(pid_refs_file_path, "r", encoding="utf8") as f: pid_refs_cid = f.read() assert pid_refs_cid == object_metadata.cid @@ -64,7 +64,7 @@ def test_tag_object_cid_refs_file(pids, store): object_metadata = store.store_object(None, path) cid = object_metadata.cid store.tag_object(pid, object_metadata.cid) - cid_refs_file_path = store.resolve_path("cid", cid) + cid_refs_file_path = store._resolve_path("cid", cid) assert os.path.exists(cid_refs_file_path) @@ -75,7 +75,7 @@ def test_tag_object_cid_refs_file_content(pids, store): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(None, path) store.tag_object(pid, object_metadata.cid) - cid_refs_file_path = store.resolve_path("cid", object_metadata.cid) + cid_refs_file_path = store._resolve_path("cid", object_metadata.cid) with open(cid_refs_file_path, "r", encoding="utf8") as f: pid_refs_cid = f.read().strip() assert pid_refs_cid == pid @@ -93,7 +93,7 @@ def test_tag_object_cid_refs_file_exists(pids, store): with pytest.raises(FileExistsError): store.tag_object(pid, another_cid) - second_cid_hash = 
store.resolve_path("cid", another_cid) + second_cid_hash = store._resolve_path("cid", another_cid) assert not os.path.exists(second_cid_hash) @@ -112,7 +112,7 @@ def test_tag_object_cid_refs_update_cid_refs_updated(store): store.tag_object(additional_pid, cid) # Read cid file to confirm cid refs file contains the additional pid - cid_ref_abs_path = store.resolve_path("cid", cid) + cid_ref_abs_path = store._resolve_path("cid", cid) with open(cid_ref_abs_path, "r", encoding="utf8") as f: for _, line in enumerate(f, start=1): value = line.strip() @@ -134,7 +134,7 @@ def test_tag_object_cid_refs_update_pid_refs_created(store): additional_pid = "dou.test.1" store.tag_object(additional_pid, cid) - pid_refs_file_path = store.resolve_path("pid", additional_pid) + pid_refs_file_path = store._resolve_path("pid", additional_pid) assert os.path.exists(pid_refs_file_path) @@ -149,11 +149,11 @@ def test_tag_object_cid_refs_update_pid_found_but_file_missing(store): cid = object_metadata.cid # Manually update the cid refs, pid refs file missing at this point additional_pid = "dou.test.1" - cid_ref_abs_path = store.resolve_path("cid", cid) + cid_ref_abs_path = store._resolve_path("cid", cid) store._update_cid_refs(cid_ref_abs_path, additional_pid) # Confirm the pid refs file is missing - pid_refs_file_path = store.resolve_path("pid", additional_pid) + pid_refs_file_path = store._resolve_path("pid", additional_pid) assert not os.path.exists(pid_refs_file_path) # Call tag_object, this should create the missing pid refs file @@ -209,7 +209,7 @@ def test_verify_object_exception_incorrect_size(pids, store): cid = object_metadata.cid cid = object_metadata.hex_digests[store.algorithm] - cid_abs_path = store.resolve_path("cid", cid) + cid_abs_path = store._resolve_path("cid", cid) assert not os.path.exists(cid_abs_path) @@ -231,7 +231,7 @@ def test_verify_object_exception_incorrect_checksum(pids, store): cid = object_metadata.cid cid = object_metadata.hex_digests[store.algorithm] - 
cid_abs_path = store.resolve_path("cid", cid) + cid_abs_path = store._resolve_path("cid", cid) assert not os.path.exists(cid_abs_path) @@ -249,7 +249,7 @@ def test_verify_object_exception_incorrect_checksum_algo(pids, store): def test_write_cid_refs_file(store): """Test that write_cid_reference writes a reference file.""" - tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, "test_pid") assert os.path.exists(tmp_cid_refs_file) @@ -257,7 +257,7 @@ def test_write_cid_refs_file(store): def test_write_cid_refs_file_content(pids, store): """Test that write_cid_ref_file writes the expected content.""" for pid in pids.keys(): - tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) with open(tmp_cid_refs_file, "r", encoding="utf8") as f: cid_ref_file_pid = f.read() @@ -268,7 +268,7 @@ def test_write_cid_refs_file_content(pids, store): def test_update_cid_refs_content(pids, store): """Test that update_cid_ref updates the ref file as expected.""" for pid in pids.keys(): - tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) pid_other = "dou.test.1" store._update_cid_refs(tmp_cid_refs_file, pid_other) @@ -282,7 +282,7 @@ def test_update_cid_refs_content(pids, store): def test_update_cid_refs_content_multiple(pids, store): """Test that update_cid_refs adds multiple references successfully.""" for pid in pids.keys(): - tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) cid_reference_list = [pid] @@ -304,7 +304,7 @@ def test_update_cid_refs_content_pid_exists(pids, store): """Test that 
update_cid_ref does not throw exception if pid already exists and proceeds to complete the tagging process (verify_object)""" for pid in pids.keys(): - tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) # Exception should not be thrown store._update_cid_refs(tmp_cid_refs_file, pid) @@ -314,7 +314,7 @@ def test_update_cid_refs_content_cid_refs_does_not_exist(pids, store): """Test that update_cid_ref throws exception if cid refs file doesn't exist.""" for pid in pids.keys(): cid = pids[pid]["sha256"] - cid_ref_abs_path = store.resolve_path("cid", cid) + cid_ref_abs_path = store._resolve_path("cid", cid) with pytest.raises(FileNotFoundError): store._update_cid_refs(cid_ref_abs_path, pid) @@ -322,7 +322,7 @@ def test_update_cid_refs_content_cid_refs_does_not_exist(pids, store): def test_delete_cid_refs_pid(pids, store): """Test that delete_cid_refs_pid deletes the given pid from the ref file.""" for pid in pids.keys(): - tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) pid_other = "dou.test.1" @@ -339,7 +339,7 @@ def test_delete_cid_refs_pid(pids, store): def test_delete_cid_refs_pid_file(pids, store): """Test that delete_cid_refs_pid leaves a file empty when removing the last pid.""" for pid in pids.keys(): - tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) # First remove the pid store._delete_cid_refs_pid(tmp_cid_refs_file, pid) @@ -352,7 +352,7 @@ def test_write_pid_refs_file(pids, store): """Test that write_pid_refs_file writes a reference file.""" for pid in pids.keys(): cid = pids[pid]["sha256"] - tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_root_path = 
store._get_store_path("refs") / "tmp" tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, cid) assert os.path.exists(tmp_pid_refs_file) @@ -361,7 +361,7 @@ def test_write_pid_refs_file_content(pids, store): """Test that write_pid_refs_file writes the expected content.""" for pid in pids.keys(): cid = pids[pid]["sha256"] - tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_root_path = store._get_store_path("refs") / "tmp" tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, cid) with open(tmp_pid_refs_file, "r", encoding="utf8") as f: pid_refs_cid = f.read() @@ -373,7 +373,7 @@ def test_delete_pid_refs_file(pids, store): """Test that delete_pid_refs_file deletes a reference file.""" for pid in pids.keys(): cid = pids[pid]["sha256"] - tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_root_path = store._get_store_path("refs") / "tmp" tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, cid) store._delete_pid_refs_file(tmp_pid_refs_file) @@ -383,7 +383,7 @@ def test_delete_pid_refs_file(pids, store): def test_delete_pid_refs_file_file_not_found(pids, store): """Test that delete_pid_refs_file raises an exception when refs file not found.""" for pid in pids.keys(): - pid_ref_abs_path = store.resolve_path("pid", pid) + pid_ref_abs_path = store._resolve_path("pid", pid) with pytest.raises(FileNotFoundError): store._delete_pid_refs_file(pid_ref_abs_path) @@ -401,15 +401,15 @@ def test_verify_hashstore_references_pid_refs_incorrect_cid(pids, store): for pid in pids.keys(): cid = pids[pid]["sha256"] # Write the cid refs file and move it where it needs to be - tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, pid) - cid_ref_abs_path = store.resolve_path("cid", cid) - store.create_path(os.path.dirname(cid_ref_abs_path)) + cid_ref_abs_path = store._resolve_path("cid", cid) + 
store._create_path(os.path.dirname(cid_ref_abs_path)) shutil.move(tmp_cid_refs_file, cid_ref_abs_path) # Write the pid refs file and move it where it needs to be with a bad cid - pid_ref_abs_path = store.resolve_path("pid", pid) - store.create_path(os.path.dirname(pid_ref_abs_path)) - tmp_root_path = store.get_store_path("refs") / "tmp" + pid_ref_abs_path = store._resolve_path("pid", pid) + store._create_path(os.path.dirname(pid_ref_abs_path)) + tmp_root_path = store._get_store_path("refs") / "tmp" tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, "bad_cid") shutil.move(tmp_pid_refs_file, pid_ref_abs_path) @@ -421,9 +421,9 @@ def test_verify_hashstore_references_cid_refs_file_missing(pids, store): """Test _verify_hashstore_references throws exception when cid refs file is missing.""" for pid in pids.keys(): cid = pids[pid]["sha256"] - pid_ref_abs_path = store.resolve_path("pid", pid) - store.create_path(os.path.dirname(pid_ref_abs_path)) - tmp_root_path = store.get_store_path("refs") / "tmp" + pid_ref_abs_path = store._resolve_path("pid", pid) + store._create_path(os.path.dirname(pid_ref_abs_path)) + tmp_root_path = store._get_store_path("refs") / "tmp" tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, "bad_cid") shutil.move(tmp_pid_refs_file, pid_ref_abs_path) @@ -437,15 +437,15 @@ def test_verify_hashstore_references_cid_refs_file_missing_pid(pids, store): for pid in pids.keys(): cid = pids[pid]["sha256"] # Get a tmp cid refs file and write the wrong pid into it - tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, "bad pid") - cid_ref_abs_path = store.resolve_path("cid", cid) - store.create_path(os.path.dirname(cid_ref_abs_path)) + cid_ref_abs_path = store._resolve_path("cid", cid) + store._create_path(os.path.dirname(cid_ref_abs_path)) shutil.move(tmp_cid_refs_file, cid_ref_abs_path) # Now write the pid refs file, both 
cid and pid refs must be present - pid_ref_abs_path = store.resolve_path("pid", pid) - store.create_path(os.path.dirname(pid_ref_abs_path)) - tmp_root_path = store.get_store_path("refs") / "tmp" + pid_ref_abs_path = store._resolve_path("pid", pid) + store._create_path(os.path.dirname(pid_ref_abs_path)) + tmp_root_path = store._get_store_path("refs") / "tmp" tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, cid) shutil.move(tmp_pid_refs_file, pid_ref_abs_path) @@ -461,15 +461,15 @@ def test_verify_hashstore_references_cid_refs_file_with_multiple_refs_missing_pi for pid in pids.keys(): cid = pids[pid]["sha256"] # Write the wrong pid into a cid refs file and move it where it needs to be - tmp_root_path = store.get_store_path("refs") / "tmp" + tmp_root_path = store._get_store_path("refs") / "tmp" tmp_cid_refs_file = store._write_cid_refs_file(tmp_root_path, "bad pid") - cid_ref_abs_path = store.resolve_path("cid", cid) - store.create_path(os.path.dirname(cid_ref_abs_path)) + cid_ref_abs_path = store._resolve_path("cid", cid) + store._create_path(os.path.dirname(cid_ref_abs_path)) shutil.move(tmp_cid_refs_file, cid_ref_abs_path) # Now write the pid refs with expected values - pid_ref_abs_path = store.resolve_path("pid", pid) - store.create_path(os.path.dirname(pid_ref_abs_path)) - tmp_root_path = store.get_store_path("refs") / "tmp" + pid_ref_abs_path = store._resolve_path("pid", pid) + store._create_path(os.path.dirname(pid_ref_abs_path)) + tmp_root_path = store._get_store_path("refs") / "tmp" tmp_pid_refs_file = store._write_pid_refs_file(tmp_root_path, cid) shutil.move(tmp_pid_refs_file, pid_ref_abs_path) diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index 3aee347a..3511bbe8 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -132,7 +132,7 @@ def test_store_object(store, pids): sys.argv = chs_args hashstoreclient.main() - assert store.exists("objects", pids[pid][store.algorithm]) + assert 
store._exists("objects", pids[pid][store.algorithm]) def test_store_metadata(store, pids): @@ -164,7 +164,7 @@ def test_store_metadata(store, pids): sys.argv = chs_args hashstoreclient.main() - assert store.exists("metadata", pids[pid]["metadata_cid"]) + assert store._exists("metadata", pids[pid]["metadata_cid"]) def test_retrieve_objects(capsys, pids, store): @@ -272,7 +272,7 @@ def test_delete_objects(pids, store): sys.argv = chs_args hashstoreclient.main() - assert not store.exists("objects", pids[pid][store.algorithm]) + assert not store._exists("objects", pids[pid][store.algorithm]) def test_delete_metadata(pids, store): @@ -304,4 +304,4 @@ def test_delete_metadata(pids, store): sys.argv = chs_args hashstoreclient.main() - assert not store.exists("metadata", pids[pid]["metadata_cid"]) + assert not store._exists("metadata", pids[pid]["metadata_cid"]) From f040cda30b90bb37dd44274e80e820f671b2a375 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 25 Jan 2024 10:53:18 -0800 Subject: [PATCH 12/13] Update inaccurate docstring in 'HashStoreClient' --- src/hashstore/hashstoreclient.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/hashstore/hashstoreclient.py b/src/hashstore/hashstoreclient.py index a1457e46..d786682f 100644 --- a/src/hashstore/hashstoreclient.py +++ b/src/hashstore/hashstoreclient.py @@ -245,11 +245,11 @@ class HashStoreClient: MET_TYPE = "metadata" def __init__(self, properties, testflag=None): - """Store objects in a given directory into HashStore. + """Initialize the HashStoreClient with optional flag to test with the + test server at 'test.arcticdata.io' - :param str origin_dir: Directory to convert. - :param str obj_type: Type of objects ('object' or 'metadata'). - :param int num: Number of files to store. 
+ :param dict properties: HashStore properties to initialize with + :param str testflag: 'knbvm' to denote testing on 'test.arcticdata.io' """ factory = HashStoreFactory() From 031d46831fc5d4e469bde231bd382577d11008f4 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 25 Jan 2024 11:10:08 -0800 Subject: [PATCH 13/13] Refactor '_load_properties' to be static and update pytests and affected code --- src/hashstore/filehashstore.py | 24 ++++++++++++++---------- tests/test_filehashstore.py | 8 ++++++-- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 1a582f3f..cf6a77ef 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -126,7 +126,8 @@ def __init__(self, properties=None): # Configuration and Related Methods - def _load_properties(self): + @staticmethod + def _load_properties(hahstore_yaml_path, hashstore_required_prop_keys): """Get and return the contents of the current HashStore configuration. :return: HashStore properties with the following keys (and values): @@ -136,7 +137,7 @@ def _load_properties(self): - ``store_metadata_namespace`` (str): Namespace for the HashStore's system metadata. :rtype: dict """ - if not os.path.exists(self.hashstore_configuration_yaml): + if not os.path.exists(hahstore_yaml_path): exception_string = ( "FileHashStore - load_properties: hashstore.yaml not found" + " in store root path." 
@@ -145,14 +146,12 @@ def _load_properties(self): raise FileNotFoundError(exception_string) # Open file - with open( - self.hashstore_configuration_yaml, "r", encoding="utf-8" - ) as hs_yaml_file: + with open(hahstore_yaml_path, "r", encoding="utf-8") as hs_yaml_file: yaml_data = yaml.safe_load(hs_yaml_file) # Get hashstore properties hashstore_yaml_dict = {} - for key in self.property_required_keys: + for key in hashstore_required_prop_keys: if key != "store_path": hashstore_yaml_dict[key] = yaml_data[key] logging.debug( @@ -303,7 +302,9 @@ def _verify_hashstore_properties(self, properties, prop_store_path): self.hashstore_configuration_yaml, ) # If 'hashstore.yaml' is found, verify given properties before init - hashstore_yaml_dict = self._load_properties() + hashstore_yaml_dict = self._load_properties( + self.hashstore_configuration_yaml, self.property_required_keys + ) for key in self.property_required_keys: # 'store_path' is required to init HashStore but not saved in `hashstore.yaml` if key != "store_path": @@ -1305,7 +1306,8 @@ def _is_pid_in_cid_refs_file(self, pid, cid_ref_abs_path): return True return False - def _update_cid_refs(self, cid_ref_abs_path, pid): + @staticmethod + def _update_cid_refs(cid_ref_abs_path, pid): """Update an existing CID reference file with the given PID. :param str cid_ref_abs_path: Absolute path to the CID reference file. @@ -1341,7 +1343,8 @@ def _update_cid_refs(self, cid_ref_abs_path, pid): logging.error(exception_string) raise err - def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): + @staticmethod + def _delete_cid_refs_pid(cid_ref_abs_path, pid): """Delete a PID from a CID reference file. :param str cid_ref_abs_path: Absolute path to the CID reference file. 
@@ -1645,7 +1648,8 @@ def _verify_hashstore_references(self, pid, cid, verify_type): logging.error(exception_string) raise ValueError(exception_string) - def _check_arg_data(self, data): + @staticmethod + def _check_arg_data(data): """Checks a data argument to ensure that it is either a string, path, or stream object. diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 2aafec81..c11726ef 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -135,7 +135,9 @@ def test_init_with_existing_hashstore_missing_yaml(store, pids): def test_load_properties(store): """Verify dictionary returned from _load_properties matches initialization.""" - hashstore_yaml_dict = store._load_properties() + hashstore_yaml_dict = store._load_properties( + store.hashstore_configuration_yaml, store.property_required_keys + ) assert hashstore_yaml_dict.get("store_depth") == 3 assert hashstore_yaml_dict.get("store_width") == 2 assert hashstore_yaml_dict.get("store_algorithm") == "SHA-256" @@ -149,7 +151,9 @@ def test_load_properties_hashstore_yaml_missing(store): """Confirm FileNotFoundError is raised when hashstore.yaml does not exist.""" os.remove(store.hashstore_configuration_yaml) with pytest.raises(FileNotFoundError): - store._load_properties() + store._load_properties( + store.hashstore_configuration_yaml, store.property_required_keys + ) def test_validate_properties(store):