From 608fd2aefdd50af568d868adb8aca0e368d6920d Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 19 Jun 2023 10:45:42 -0700 Subject: [PATCH 001/165] Rename 'sysmeta' directory to 'metadata' and update affected values --- src/hashstore/filehashstore/filehashstore.py | 30 ++++++++++---------- tests/filehashstore/test_filehashstore.py | 2 +- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/hashstore/filehashstore/filehashstore.py b/src/hashstore/filehashstore/filehashstore.py index e5a72f3f..21c182ce 100644 --- a/src/hashstore/filehashstore/filehashstore.py +++ b/src/hashstore/filehashstore/filehashstore.py @@ -64,9 +64,9 @@ class FileHashStore(HashStore): # Variables to orchestrate thread locking and object store synchronization time_out_sec = 1 object_lock = threading.Lock() - sysmeta_lock = threading.Lock() + metadata_lock = threading.Lock() object_locked_pids = [] - sysmeta_locked_pids = [] + metadata_locked_pids = [] def __init__(self, properties=None): if properties: @@ -127,7 +127,7 @@ def __init__(self, properties=None): self.put_properties(properties) # Complete initialization/instantiation by setting store directories self.objects = self.root + "/objects" - self.sysmeta = self.root + "/sysmeta" + self.metadata = self.root + "/metadata" logging.debug( "FileHashStore - Initialization success. Store root: %s", self.root ) @@ -446,19 +446,19 @@ def store_sysmeta(self, pid, sysmeta): raise TypeError(exception_string) # Wait for the pid to release if it's in use - while pid in self.sysmeta_locked_pids: + while pid in self.metadata_locked_pids: logging.debug( "FileHashStore - store_sysmeta: %s is currently being stored. Waiting.", pid, ) time.sleep(self.time_out_sec) - # Modify sysmeta_locked_pids consecutively - with self.sysmeta_lock: + # Modify metadata_locked_pids consecutively + with self.metadata_lock: logging.debug( - "FileHashStore - store_sysmeta: Adding pid: %s to sysmeta_locked_pids.", + "FileHashStore - store_sysmeta: Adding pid: %s to metadata_locked_pids.", pid, ) - self.sysmeta_locked_pids.append(pid) + self.metadata_locked_pids.append(pid) try: logging.debug( "FileHashStore - store_sysmeta: Attempting to store sysmeta for pid: %s", @@ -467,12 +467,12 @@ def store_sysmeta(self, pid, sysmeta): sysmeta_cid = self.put_sysmeta(pid, sysmeta) finally: # Release pid - with self.sysmeta_lock: + with self.metadata_lock: logging.debug( - "FileHashStore - store_sysmeta: Removing pid: %s from sysmeta_locked_pids.", + "FileHashStore - store_sysmeta: Removing pid: %s from metadata_locked_pids.", pid, ) - self.sysmeta_locked_pids.remove(pid) + self.metadata_locked_pids.remove(pid) logging.info( "FileHashStore - store_sysmeta: Successfully stored sysmeta for pid: %s", pid, @@ -916,7 +916,7 @@ def put_sysmeta(self, pid, sysmeta): logging.debug( "FileHashStore - put_sysmeta: Deleting sysmeta for pid: %s", pid ) - self.sysmeta.delete(sysmeta_tmp) + self.metadata.delete(sysmeta_tmp) err_msg = f"Aborting store_sysmeta upload - an unexpected error has occurred: {err}" logging.error("FileHashStore - put_sysmeta: %s", err_msg) raise @@ -1030,7 +1030,7 @@ def get_store_path(self, entity): if entity == "objects": return Path(self.objects) elif entity == "sysmeta": - return Path(self.sysmeta) + return Path(self.metadata) else: raise ValueError( f"entity: {entity} does not exist. Do you mean 'objects' or 'sysmeta'?" 
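The lock rename in this patch touches FileHashStore's pid-level synchronization: a class-wide threading.Lock guards a list of in-flight pids, and writers poll that list before entering. A minimal, self-contained sketch of the same idiom (the `work` callable and the module-level names here are illustrative, not the library's API):

    import threading
    import time

    metadata_lock = threading.Lock()
    metadata_locked_pids = []  # pids with a metadata write in flight
    time_out_sec = 1

    def store_metadata_synchronized(pid, work):
        # Poll until no other thread is writing metadata for this pid
        while pid in metadata_locked_pids:
            time.sleep(time_out_sec)
        # Mutate the shared list only while holding the lock
        with metadata_lock:
            metadata_locked_pids.append(pid)
        try:
            return work(pid)  # e.g. put_metadata(pid, ...)
        finally:
            with metadata_lock:
                metadata_locked_pids.remove(pid)

The pattern serializes writes per pid while leaving writes for different pids free to run concurrently.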
@@ -1187,7 +1187,7 @@ def get_real_path(self, entity, file): if entity == "objects": rel_root = self.objects elif entity == "sysmeta": - rel_root = self.sysmeta + rel_root = self.metadata else: raise ValueError( f"entity: {entity} does not exist. Do you mean 'objects' or 'sysmeta'?" @@ -1240,7 +1240,7 @@ def count(self, entity): if entity == "objects": directory_to_count = self.objects elif entity == "sysmeta": - directory_to_count = self.sysmeta + directory_to_count = self.metadata else: raise ValueError( f"entity: {entity} does not exist. Do you mean 'objects' or 'sysmeta'?" diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index f36bb2ce..5392a261 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -467,7 +467,7 @@ def test_get_store_path_sysmeta(store): # pylint: disable=W0212 path_sysmeta = store.get_store_path("sysmeta") path_sysmeta_string = str(path_sysmeta) - assert path_sysmeta_string.endswith("/metacat/sysmeta") + assert path_sysmeta_string.endswith("/metacat/metadata") def test_exists_with_absolute_path(pids, store): From 977550f0ecba54a39d4b6fdb5d3eb1a4e2958065 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 20 Jun 2023 10:00:00 -0700 Subject: [PATCH 002/165] Rename 'hashstore' interface 'store_sysmeta' to 'store_metadata' and update signature --- src/hashstore/hashstore.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 25a5979e..2722e8da 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -63,21 +63,23 @@ def store_object( raise NotImplementedError() @abstractmethod - def store_sysmeta(self, pid, sysmeta): + def store_metadata(self, pid, format_id, sysmeta): """The `store_sysmeta` method is responsible for adding and/or updating metadata - (`sysmeta`) to disk using a given InputStream and a persistent identifier - (pid). The metadata object consists of a header and body portion. The header - is formed by writing the namespace/format (utf-8) of the metadata document - followed by a null character `\x00` and the body follows immediately after. + (ex. `sysmeta`) to disk using a given path/stream, a persistent identifier `pid` + and a metadata `format_id`. The metadata object consists of a header and + body section, split by a null character `\x00`. - Upon successful storage of sysmeta, the method returns a String that - represents the file's permanent address, and similarly to 'store_object', this - permanent address is determined by calculating the SHA-256 hex digest of the - provided pid. Finally, sysmeta are stored in parallel to objects in the - `/store_directory/sysmeta/` directory. + The header contains the metadata object's permanent address, which is determined + by calculating the SHA-256 hex digest of the provided `pid` + `format_id`; and the + body contains the metadata content (ex. `sysmeta`). + + Upon successful storage of sysmeta, `store_sysmeta` returns a string that + represents the file's permanent address. Lastly, the metadata objects are stored + in parallel to objects in the `/store_directory/metadata/` directory. Args: pid (string): Authority-based identifier. + format_id (string): Metadata format sysmeta (mixed): String or path to sysmeta document. 
        Returns:
            sysmeta_cid (string): Address of the sysmeta document.
        """
        raise NotImplementedError()

From 51f9a199d9110dc23bc1d6234bc830873fa1c6a1 Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Tue, 20 Jun 2023 10:39:45 -0700
Subject: [PATCH 003/165] Refactor 'store_sysmeta' to 'store_metadata' and update related pytests

---
 src/hashstore/filehashstore/filehashstore.py | 159 ++++++++++--------
 src/hashstore/hashstore.py | 14 +-
 tests/conftest.py | 10 +-
 tests/filehashstore/test_filehashstore.py | 57 ++++---
 .../test_filehashstore_interface.py | 110 ++++++------
 5 files changed, 194 insertions(+), 156 deletions(-)

diff --git a/src/hashstore/filehashstore/filehashstore.py b/src/hashstore/filehashstore/filehashstore.py
index 21c182ce..ef04020c 100644
--- a/src/hashstore/filehashstore/filehashstore.py
+++ b/src/hashstore/filehashstore/filehashstore.py
@@ -32,7 +32,7 @@ class FileHashStore(HashStore):
         store_depth (int): Depth when sharding an object's hex digest.
         store_width (int): Width of directories when sharding an object's hex digest.
         store_algorithm (str): Hash algorithm used for calculating the object's hex digest.
-        store_sysmeta_namespace (str): Namespace for the HashStore's system metadata.
+        store_metadata_namespace (str): Namespace for the HashStore's system metadata.
     """

     # Property (hashstore configuration) requirements
@@ -41,7 +41,7 @@ class FileHashStore(HashStore):
         "store_depth",
         "store_width",
         "store_algorithm",
-        "store_sysmeta_namespace",
+        "store_metadata_namespace",
     ]
     # Permissions settings for writing files and creating directories
     fmode = 0o664
@@ -77,7 +77,7 @@ def __init__(self, properties=None):
                 prop_store_depth,
                 prop_store_width,
                 prop_store_algorithm,
-                prop_store_sysmeta_namespace,
+                prop_store_metadata_namespace,
             ) = [
                 checked_properties[property_name]
                 for property_name in self.property_required_keys
@@ -116,7 +116,7 @@ def __init__(self, properties=None):
             self.depth = prop_store_depth
             self.width = prop_store_width
             self.algorithm = prop_store_algorithm
-            self.sysmeta_ns = prop_store_sysmeta_namespace
+            self.sysmeta_ns = prop_store_metadata_namespace
             # Write 'hashstore.yaml' to store path
             if not os.path.exists(self.hashstore_configuration_yaml):
                 # pylint: disable=W1201
@@ -146,7 +146,7 @@ def get_properties(self):

         Returns:
             hashstore_yaml_dict (dict): HashStore properties with the following keys/values:
-            "store_path", "store_depth", "store_width", "store_algorithm","store_sysmeta_namespace".
+            "store_path", "store_depth", "store_width", "store_algorithm","store_metadata_namespace".
         """
         if not os.path.exists(self.hashstore_configuration_yaml):
             exception_string = "hashstore.yaml not found in store root path."
@@ -174,7 +174,7 @@ def put_properties(self, properties):
             store_depth (int): Depth when sharding an object's hex digest.
             store_width (int): Width of directories when sharding an object's hex digest.
             store_algorithm (str): Hash algorithm used for calculating the object's hex digest.
-            store_sysmeta_namespace (str): Namespace for the HashStore's system metadata.
+            store_metadata_namespace (str): Namespace for the HashStore's system metadata.
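With the property rename in place, the five required keys match the fixture later defined in tests/conftest.py. A short sketch of initializing a store with them (the module path is assumed from the file layout above, and the store path is illustrative):

    import tempfile

    from hashstore.filehashstore.filehashstore import FileHashStore

    properties = {
        "store_path": tempfile.mkdtemp() + "/metacat",
        "store_depth": 3,
        "store_width": 2,
        "store_algorithm": "sha256",
        "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0",
    }
    store = FileHashStore(properties)
    # Initialization writes hashstore.yaml to the store root and points the
    # store at .../metacat/objects and .../metacat/metadata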
""" # If hashstore.yaml already exists, must throw exception and proceed with caution if os.path.exists(self.hashstore_configuration_yaml): @@ -192,7 +192,7 @@ def put_properties(self, properties): store_depth, store_width, store_algorithm, - store_sysmeta_namespace, + store_metadata_namespace, ) = [ checked_properties[property_name] for property_name in self.property_required_keys @@ -204,7 +204,7 @@ def put_properties(self, properties): store_depth, store_width, store_algorithm, - store_sysmeta_namespace, + store_metadata_namespace, ) # Write 'hashstore.yaml' with open( @@ -219,7 +219,7 @@ def put_properties(self, properties): @staticmethod def _build_hashstore_yaml_string( - store_path, store_depth, store_width, store_algorithm, store_sysmeta_namespace + store_path, store_depth, store_width, store_algorithm, store_metadata_namespace ): """Build a YAML string representing the configuration for a HashStore. @@ -228,7 +228,7 @@ def _build_hashstore_yaml_string( store_depth (int): Depth when sharding an object's hex digest. store_width (int): Width of directories when sharding an object's hex digest. store_algorithm (str): Hash algorithm used for calculating the object's hex digest. - store_sysmeta_namespace (str): Namespace for the HashStore's system metadata. + store_metadata_namespace (str): Namespace for the HashStore's system metadata. Returns: hashstore_configuration_yaml (str): A YAML string representing the configuration for @@ -256,7 +256,7 @@ def _build_hashstore_yaml_string( # │ └── 8f0b04e812a3b4c8f686ce34e6fec558804bf61e54b176742a7f6368d6 ############### Format of the Metadata ############### - store_sysmeta_namespace: "{store_sysmeta_namespace}" + store_metadata_namespace: "{store_metadata_namespace}" ############### Hash Algorithms ############### # Hash algorithm to use when calculating object's hex digest for the permanent address @@ -418,66 +418,75 @@ def store_object( ) return hash_address - def store_sysmeta(self, pid, sysmeta): + def store_metadata(self, pid, format_id, metadata): logging.debug( - "FileHashStore - store_sysmeta: Request to store sysmeta for pid: %s", pid + "FileHashStore - store_metadata: Request to store metadata for pid: %s", pid ) - # Validate input parameters - logging.debug("FileHashStore - store_sysmeta: Validating arguments.") + # Validate input parameters, begin with persistent identifier + logging.debug("FileHashStore - store_metadata: Validating arguments.") if pid is None or pid.replace(" ", "") == "": exception_string = f"Pid cannot be None or empty, pid: {pid}" - logging.error("FileHashStore - store_sysmeta: %s", exception_string) + logging.error("FileHashStore - store_metadata: %s", exception_string) + raise ValueError(exception_string) + # Then format_id of the metadata + if format_id is None or format_id.replace(" ", "") == "": + exception_string = ( + f"format_id cannot be None or empty, format_id: {format_id}" + ) + logging.error("FileHashStore - store_metadata: %s", exception_string) raise ValueError(exception_string) + # Metadata content must be a str, path or stream and cannot be empty if ( - not isinstance(sysmeta, str) - and not isinstance(sysmeta, Path) - and not isinstance(sysmeta, io.BufferedIOBase) + not isinstance(metadata, str) + and not isinstance(metadata, Path) + and not isinstance(metadata, io.BufferedIOBase) ): exception_string = ( - "Sysmeta must be a path or string type, data type supplied: " - + {type(sysmeta)} + "Metadata must be a path or string type, data type supplied: " + + {type(metadata)} ) - 
logging.error("FileHashStore - store_sysmeta: %s", exception_string) + logging.error("FileHashStore - store_metadata: %s", exception_string) raise TypeError(exception_string) - if isinstance(sysmeta, str): - if sysmeta.replace(" ", "") == "": - exception_string = "Given string path to sysmeta cannot be empty." - logging.error("FileHashStore - store_sysmeta: %s", exception_string) + if isinstance(metadata, str): + if metadata.replace(" ", "") == "": + exception_string = "Given string path to metadata cannot be empty." + logging.error("FileHashStore - store_metadata: %s", exception_string) raise TypeError(exception_string) # Wait for the pid to release if it's in use while pid in self.metadata_locked_pids: logging.debug( - "FileHashStore - store_sysmeta: %s is currently being stored. Waiting.", + "FileHashStore - store_metadata: %s is currently being stored. Waiting.", pid, ) time.sleep(self.time_out_sec) # Modify metadata_locked_pids consecutively with self.metadata_lock: logging.debug( - "FileHashStore - store_sysmeta: Adding pid: %s to metadata_locked_pids.", + "FileHashStore - store_metadata: Adding pid: %s to metadata_locked_pids.", pid, ) self.metadata_locked_pids.append(pid) try: logging.debug( - "FileHashStore - store_sysmeta: Attempting to store sysmeta for pid: %s", + "FileHashStore - store_metadata: Attempting to store metadata for pid: %s", pid, ) - sysmeta_cid = self.put_sysmeta(pid, sysmeta) + # TODO: Determine if format_id should be part of config file, then update + metadata_cid = self.put_metadata(pid, format_id, metadata) finally: # Release pid with self.metadata_lock: logging.debug( - "FileHashStore - store_sysmeta: Removing pid: %s from metadata_locked_pids.", + "FileHashStore - store_metadata: Removing pid: %s from metadata_locked_pids.", pid, ) self.metadata_locked_pids.remove(pid) logging.info( - "FileHashStore - store_sysmeta: Successfully stored sysmeta for pid: %s", + "FileHashStore - store_metadata: Successfully stored metadata for pid: %s", pid, ) - return sysmeta_cid + return metadata_cid def retrieve_object(self, pid): logging.debug( @@ -872,74 +881,78 @@ def _mktempfile(self, stream, algorithm=None): logging.debug("FileHashStore - _mktempfile: Hex digests calculated.") return hex_digest_dict, tmp.name - def put_sysmeta(self, pid, sysmeta): - """Store contents of `sysmeta` on disk using the hash of the given pid + def put_metadata(self, pid, format_id, metadata): + """Store contents of metadata on disk using the hash of the given pid + and format_id as the permanent address. Args: pid (string): Authority-based identifier. - sysmeta (mixed): String or path to sysmeta document. + format_id (string): Metadata format + metadata (mixed): String or path to metadata document. Returns: - ab_id (string): Address of the sysmeta document. + ab_id (string): Address of the metadata document. 
""" logging.debug( - "FileHashStore - put_sysmeta: Request to put sysmeta for pid: %s", pid + "FileHashStore - put_metadata: Request to put metadata for pid: %s", pid ) # Create tmp file and write to it - sysmeta_stream = Stream(sysmeta) - with closing(sysmeta_stream): - sysmeta_tmp = self._mktmpsysmeta(sysmeta_stream, self.sysmeta_ns) + metadata_stream = Stream(metadata) + with closing(metadata_stream): + # TODO: Determine if format_id should be part of config file, then update + metadata_tmp = self._mktmpmetadata(metadata_stream, self.sysmeta_ns) # Target path (permanent location) - ab_id = self.get_sha256_hex_digest(pid) + ab_id = self.get_sha256_hex_digest(pid + format_id) rel_path = "/".join(self.shard(ab_id)) - full_path = self.get_store_path("sysmeta") / rel_path + full_path = self.get_store_path("metadata") / rel_path - # Move sysmeta to target path - if os.path.exists(sysmeta_tmp): + # Move metadata to target path + if os.path.exists(metadata_tmp): try: parent = full_path.parent parent.mkdir(parents=True, exist_ok=True) - # Sysmeta will be replaced if it exists - shutil.move(sysmeta_tmp, full_path) + # Metadata will be replaced if it exists + shutil.move(metadata_tmp, full_path) logging.debug( - "FileHashStore - put_sysmeta: Successfully put sysmeta for pid: %s", + "FileHashStore - put_metadata: Successfully put metadata for pid: %s", pid, ) return ab_id except Exception as err: exception_string = f"Unexpected {err=}, {type(err)=}" - logging.error("FileHashStore - put_sysmeta: %s", exception_string) - if os.path.exists(sysmeta_tmp): - # Remove tmp sysmeta, calling app must re-upload + logging.error("FileHashStore - put_metadata: %s", exception_string) + if os.path.exists(metadata_tmp): + # Remove tmp metadata, calling app must re-upload logging.debug( - "FileHashStore - put_sysmeta: Deleting sysmeta for pid: %s", pid + "FileHashStore - put_metadata: Deleting metadata for pid: %s", + pid, ) - self.metadata.delete(sysmeta_tmp) - err_msg = f"Aborting store_sysmeta upload - an unexpected error has occurred: {err}" - logging.error("FileHashStore - put_sysmeta: %s", err_msg) + self.metadata.delete(metadata_tmp) + err_msg = f"Aborting store_metadata upload - unexpected error: {err}" + logging.error("FileHashStore - put_metadata: %s", err_msg) raise else: exception_string = ( - f"Attempt to move sysmeta for pid: {pid}" - + f", but sysmeta temp file not found: {sysmeta_tmp}" + f"Attempt to move metadata for pid: {pid}" + + f", but metadata temp file not found: {metadata_tmp}" ) - logging.error("FileHashStore - put_sysmeta: %s", exception_string) + logging.error("FileHashStore - put_metadata: %s", exception_string) raise FileNotFoundError() - def _mktmpsysmeta(self, stream, namespace): - """Create a named temporary file with `sysmeta` bytes and `namespace`. + def _mktmpmetadata(self, stream, format_id): + """Create a named temporary file with `metadata` bytes and `namespace`. Args: - stream (io.BufferedReader): Sysmeta stream. - namespace (string): Format of sysmeta. + stream (io.BufferedReader): Metadata stream. + format_id (string): Format of metadata. Returns: tmp.name (string): Name of temporary file created and written into. 
""" # Create temporary file in .../{store_path}/tmp - tmp_root_path = self.get_store_path("sysmeta") / "tmp" + tmp_root_path = self.get_store_path("metadata") / "tmp" # Physically create directory if it doesn't exist if os.path.exists(tmp_root_path) is False: self.create_path(tmp_root_path) @@ -955,17 +968,17 @@ def _mktmpsysmeta(self, stream, namespace): # tmp is a file-like object that is already opened for writing by default logging.debug( - "FileHashStore - _mktmpsysmeta: Writing stream to tmp sysmeta file: %s", + "FileHashStore - _mktmpmetadata: Writing stream to tmp metadata file: %s", tmp.name, ) with tmp as tmp_file: - tmp_file.write(namespace.encode("utf-8")) + tmp_file.write(format_id.encode("utf-8")) tmp_file.write(b"\x00") for data in stream: tmp_file.write(self._to_bytes(data)) logging.debug( - "FileHashStore - _mktmpsysmeta: Successfully written to tmp sysmeta file: %s", + "FileHashStore - _mktmpmetadata: Successfully written to tmp metadata file: %s", tmp.name, ) return tmp.name @@ -1025,15 +1038,15 @@ def get_store_path(self, entity): """Return a path object of the root directory of the store. Args: - entity (str): Desired entity type (ex. "objects", "sysmeta"). + entity (str): Desired entity type: "objects" or "metadata" """ if entity == "objects": return Path(self.objects) - elif entity == "sysmeta": + elif entity == "metadata": return Path(self.metadata) else: raise ValueError( - f"entity: {entity} does not exist. Do you mean 'objects' or 'sysmeta'?" + f"entity: {entity} does not exist. Do you mean 'objects' or 'metadata'?" ) def exists(self, entity, file): @@ -1172,7 +1185,7 @@ def get_real_path(self, entity, file): the expected file path of the id. Args: - entity (str): desired entity type (ex. "objects", "sysmeta"). \n + entity (str): desired entity type (ex. "objects", "metadata"). \n file (string): Name of file. Returns: @@ -1186,7 +1199,7 @@ def get_real_path(self, entity, file): rel_root = "" if entity == "objects": rel_root = self.objects - elif entity == "sysmeta": + elif entity == "metadata": rel_root = self.metadata else: raise ValueError( @@ -1239,11 +1252,11 @@ def count(self, entity): directory_to_count = "" if entity == "objects": directory_to_count = self.objects - elif entity == "sysmeta": + elif entity == "metadata": directory_to_count = self.metadata else: raise ValueError( - f"entity: {entity} does not exist. Do you mean 'objects' or 'sysmeta'?" + f"entity: {entity} does not exist. Do you mean 'objects' or 'metadata'?" ) for _, _, files in os.walk(directory_to_count): diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 2722e8da..4afe379f 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -63,27 +63,27 @@ def store_object( raise NotImplementedError() @abstractmethod - def store_metadata(self, pid, format_id, sysmeta): - """The `store_sysmeta` method is responsible for adding and/or updating metadata + def store_metadata(self, pid, format_id, metadata): + """The `store_metadata` method is responsible for adding and/or updating metadata (ex. `sysmeta`) to disk using a given path/stream, a persistent identifier `pid` - and a metadata `format_id`. The metadata object consists of a header and - body section, split by a null character `\x00`. + and a metadata `format_id`. The metadata object consists of a header and a body + section, which is split by a null character `\x00`. 
The header contains the metadata object's permanent address, which is determined by calculating the SHA-256 hex digest of the provided `pid` + `format_id`; and the body contains the metadata content (ex. `sysmeta`). - Upon successful storage of sysmeta, `store_sysmeta` returns a string that + Upon successful storage of metadata, `store_metadata` returns a string that represents the file's permanent address. Lastly, the metadata objects are stored in parallel to objects in the `/store_directory/metadata/` directory. Args: pid (string): Authority-based identifier. format_id (string): Metadata format - sysmeta (mixed): String or path to sysmeta document. + metadata (mixed): String or path to metadata document. Returns: - sysmeta_cid (string): Address of the sysmeta document. + metadata_cid (string): Address of the metadata document. """ raise NotImplementedError() diff --git a/tests/conftest.py b/tests/conftest.py index e8c1bcf4..9f9eca47 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -26,7 +26,7 @@ def init_props(tmp_path): "store_depth": 3, "store_width": 2, "store_algorithm": "sha256", - "store_sysmeta_namespace": "http://ns.dataone.org/service/types/v2.0", + "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", } return properties @@ -40,10 +40,14 @@ def init_store(props): @pytest.fixture(name="pids") def init_pids(): - """Shared test harness data.""" + """Shared test harness data. + - ab_id: the hex digest of the pid + - ab_format_id: the hex digest of the pid + format_id + """ test_pids = { "doi:10.18739/A2901ZH2M": { "ab_id": "0d555ed77052d7e166017f779cbc193357c3a5006ee8b8457230bcf7abcef65e", + "ab_format_id": "323e0799524cec4c7e14d31289cefd884b563b5c052f154a066de5ec1e477da7", "md5": "db91c910a3202478c8def1071c54aae5", "sha1": "1fe86e3c8043afa4c70857ca983d740ad8501ccd", "sha224": "922b1e86f83d3ea3060fd0f7b2cf04476e8b3ddeaa3cf48c2c3cf502", @@ -53,6 +57,7 @@ def init_pids(): }, "jtao.1700.1": { "ab_id": "a8241925740d5dcd719596639e780e0a090c9d55a5d0372b0eaf55ed711d4edf", + "ab_format_id": "ddf07952ef28efc099d10d8b682480f7d2da60015f5d8873b6e1ea75b4baf689", "md5": "f4ea2d07db950873462a064937197b0f", "sha1": "3d25436c4490b08a2646e283dada5c60e5c0539d", "sha224": "9b3a96f434f3c894359193a63437ef86fbd5a1a1a6cc37f1d5013ac1", @@ -62,6 +67,7 @@ def init_pids(): }, "urn:uuid:1b35d0a5-b17a-423b-a2ed-de2b18dc367a": { "ab_id": "7f5cc18f0b04e812a3b4c8f686ce34e6fec558804bf61e54b176742a7f6368d6", + "ab_format_id": "9a2e08c666b728e6cbd04d247b9e556df3de5b2ca49f7c5a24868eb27cddbff2", "md5": "e1932fc75ca94de8b64f1d73dc898079", "sha1": "c6d2a69a3f5adaf478ba796c114f57b990cf7ad1", "sha224": "f86491d23d25dbaf7620542f056aba8a092a70be625502a6afd1fde0", diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index 5392a261..822a1cfb 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -18,7 +18,7 @@ def test_init_with_existing_hashstore_mismatched_config(store): "store_depth": 1, "store_width": 2, "store_algorithm": "sha256", - "store_sysmeta_namespace": "http://ns.dataone.org/service/types/v2.0", + "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", } with pytest.raises(ValueError): FileHashStore(properties) @@ -37,7 +37,7 @@ def test_init_with_existing_hashstore_missing_yaml(store, pids): "store_depth": 3, "store_width": 2, "store_algorithm": "sha256", - "store_sysmeta_namespace": "http://ns.dataone.org/service/types/v2.0", + "store_metadata_namespace": 
"http://ns.dataone.org/service/types/v2.0", } with pytest.raises(FileNotFoundError): FileHashStore(properties) @@ -51,7 +51,7 @@ def test_get_properties(store): assert hashstore_yaml_dict.get("store_width") == 2 assert hashstore_yaml_dict.get("store_algorithm") == "sha256" assert ( - hashstore_yaml_dict.get("store_sysmeta_namespace") + hashstore_yaml_dict.get("store_metadata_namespace") == "http://ns.dataone.org/service/types/v2.0" ) @@ -70,7 +70,7 @@ def test_validate_properties(store): "store_depth": 3, "store_width": 2, "store_algorithm": "sha256", - "store_sysmeta_namespace": "http://ns.dataone.org/service/types/v2.0", + "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", } # pylint: disable=W0212 assert store._validate_properties(properties) @@ -96,7 +96,7 @@ def test_validate_properties_key_value_is_none(store): "store_depth": 3, "store_width": 2, "store_algorithm": "sha256", - "store_sysmeta_namespace": None, + "store_metadata_namespace": None, } with pytest.raises(ValueError): # pylint: disable=W0212 @@ -381,51 +381,54 @@ def test_mktempfile_with_unsupported_algorithm(pids, store): input_stream.close() -def test_put_sysmeta_with_path(pids, store): - """Test put sysmeta with path object.""" - entity = "sysmeta" +def test_put_metadata_with_path(pids, store): + """Test put metadata with path object.""" + entity = "metadata" test_dir = "tests/testdata/" + format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - ab_id = store.store_sysmeta(pid, syspath) + ab_id = store.store_metadata(pid, format_id, syspath) assert store.exists(entity, ab_id) assert store.count(entity) == 3 -def test_put_sysmeta_with_string(pids, store): - """Test put sysmeta with string.""" - entity = "sysmeta" +def test_put_metadata_with_string(pids, store): + """Test put metadata with string.""" + entity = "metadata" test_dir = "tests/testdata/" + format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = str(Path(test_dir) / filename) - ab_id = store.store_sysmeta(pid, syspath) + ab_id = store.store_metadata(pid, format_id, syspath) assert store.exists(entity, ab_id) assert store.count(entity) == 3 -def test_put_sysmeta_ab_id(pids, store): - """Test put sysmeta returns correct id.""" +def test_put_metadata_ab_id(pids, store): + """Test put metadata returns correct id.""" test_dir = "tests/testdata/" + format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - ab_id = store.store_sysmeta(pid, syspath) - assert ab_id == pids[pid]["ab_id"] + ab_id = store.store_metadata(pid, format_id, syspath) + assert ab_id == pids[pid]["ab_format_id"] -def test_mktmpsysmeta(pids, store): - """Test mktmpsysmeta creates tmpFile.""" +def test_mktmpmetadata(pids, store): + """Test mktmpmetadata creates tmpFile.""" test_dir = "tests/testdata/" - entity = "sysmeta" + entity = "metadata" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename sys_stream = io.open(syspath, "rb") - namespace = "http://ns.dataone.org/service/types/v2.0" + format_id = "http://ns.dataone.org/service/types/v2.0" # pylint: disable=W0212 - tmp_name = store._mktmpsysmeta(sys_stream, namespace) + tmp_name = store._mktmpmetadata(sys_stream, format_id) sys_stream.close() assert store.exists(entity, tmp_name) @@ -462,12 +465,12 @@ def 
test_get_store_path_object(store): assert path_objects_string.endswith("/metacat/objects") -def test_get_store_path_sysmeta(store): - """Check get_store_path for sysmeta path.""" +def test_get_store_path_metadata(store): + """Check get_store_path for metadata path.""" # pylint: disable=W0212 - path_sysmeta = store.get_store_path("sysmeta") - path_sysmeta_string = str(path_sysmeta) - assert path_sysmeta_string.endswith("/metacat/metadata") + path_metadata = store.get_store_path("metadata") + path_metadata_string = str(path_metadata) + assert path_metadata_string.endswith("/metacat/metadata") def test_exists_with_absolute_path(pids, store): diff --git a/tests/filehashstore/test_filehashstore_interface.py b/tests/filehashstore/test_filehashstore_interface.py index a1bca2e7..8108560c 100644 --- a/tests/filehashstore/test_filehashstore_interface.py +++ b/tests/filehashstore/test_filehashstore_interface.py @@ -31,13 +31,14 @@ def test_store_object_files_path(pids, store): """Test store object when given a path.""" test_dir = "tests/testdata/" entity = "objects" + format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): path = Path(test_dir + pid.replace("/", "_")) filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename _hash_address = store.store_object(pid, path) - ab_id = store.store_sysmeta(pid, syspath) - assert store.exists(entity, ab_id) + _metadata_id = store.store_metadata(pid, format_id, syspath) + assert store.exists(entity, pids[pid]["ab_id"]) assert store.count(entity) == 3 @@ -45,13 +46,14 @@ def test_store_object_files_string(pids, store): """Test store object when given a string.""" test_dir = "tests/testdata/" entity = "objects" + format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): path_string = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename _hash_address = store.store_object(pid, path_string) - ab_id = store.store_sysmeta(pid, syspath) - assert store.exists(entity, ab_id) + _metadata_id = store.store_metadata(pid, format_id, syspath) + assert store.exists(entity, pids[pid]["ab_id"]) assert store.count(entity) == 3 @@ -409,112 +411,121 @@ def test_store_object_sparse_large_file(store): assert hash_address_id == pid_sha256_hex_digest -def test_store_sysmeta_files_path(pids, store): - """Test store sysmeta with path.""" +def test_store_metadata_files_path(pids, store): + """Test store metadata with path.""" test_dir = "tests/testdata/" - entity = "sysmeta" + entity = "metadata" + format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename _hash_address = store.store_object(pid, path) - ab_id = store.store_sysmeta(pid, syspath) + ab_id = store.store_metadata(pid, format_id, syspath) assert store.exists(entity, ab_id) assert store.count(entity) == 3 -def test_store_sysmeta_files_string(pids, store): - """Test store sysmeta with string.""" +def test_store_metadata_files_string(pids, store): + """Test store metadata with string.""" test_dir = "tests/testdata/" - entity = "sysmeta" + entity = "metadata" + format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): path_string = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath_string = str(Path(test_dir) / filename) _hash_address = store.store_object(pid, path_string) - ab_id = store.store_sysmeta(pid, syspath_string) 
+ ab_id = store.store_metadata(pid, format_id, syspath_string) assert store.exists(entity, ab_id) assert store.count(entity) == 3 -def test_store_sysmeta_files_input_stream(pids, store): - """Test store sysmeta with an input stream to sysmeta.""" +def test_store_metadata_files_input_stream(pids, store): + """Test store metadata with an input stream to metadata.""" test_dir = "tests/testdata/" - entity = "sysmeta" + entity = "metadata" + format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") _hash_address = store.store_object(pid, path) filename = pid.replace("/", "_") + ".xml" syspath_string = str(Path(test_dir) / filename) syspath_stream = io.open(syspath_string, "rb") - _ab_id = store.store_sysmeta(pid, syspath_stream) + _ab_id = store.store_metadata(pid, format_id, syspath_stream) syspath_stream.close() assert store.count(entity) == 3 -def test_store_sysmeta_pid_empty(store): - """Test store sysmeta raises error with empty string.""" +def test_store_metadata_pid_empty(store): + """Test store metadata raises error with empty string.""" test_dir = "tests/testdata/" + format_id = "http://ns.dataone.org/service/types/v2.0" pid = "" filename = pid.replace("/", "_") + ".xml" syspath_string = str(Path(test_dir) / filename) with pytest.raises(ValueError): - store.store_sysmeta(pid, syspath_string) + store.store_metadata(pid, format_id, syspath_string) -def test_store_sysmeta_pid_empty_spaces(store): - """Test store sysmeta raises error with empty string.""" +def test_store_metadata_pid_empty_spaces(store): + """Test store metadata raises error with empty string.""" test_dir = "tests/testdata/" + format_id = "http://ns.dataone.org/service/types/v2.0" pid = " " filename = pid.replace("/", "_") + ".xml" syspath_string = str(Path(test_dir) / filename) with pytest.raises(ValueError): - store.store_sysmeta(pid, syspath_string) + store.store_metadata(pid, format_id, syspath_string) -def test_store_sysmeta_sysmeta_empty(store): - """Test store sysmeta raises error with empty sysmeta string.""" +def test_store_metadata_metadata_empty(store): + """Test store metadata raises error with empty metadata string.""" pid = "jtao.1700.1" + format_id = "http://ns.dataone.org/service/types/v2.0" syspath_string = " " with pytest.raises(TypeError): - store.store_sysmeta(pid, syspath_string) + store.store_metadata(pid, format_id, syspath_string) -def test_store_sysmeta_sysmeta_none(store): - """Test store sysmeta raises error with empty sysmeta string.""" +def test_store_metadata_metadata_none(store): + """Test store metadata raises error with empty metadata string.""" pid = "jtao.1700.1" + format_id = "http://ns.dataone.org/service/types/v2.0" syspath_string = None with pytest.raises(TypeError): - store.store_sysmeta(pid, syspath_string) + store.store_metadata(pid, format_id, syspath_string) -def test_store_sysmeta_ab_id(pids, store): - """Test store sysmeta returns expected ab_id.""" +def test_store_metadata_ab_id(pids, store): + """Test store metadata returns expected ab_id.""" test_dir = "tests/testdata/" + format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename _hash_address = store.store_object(pid, path) - ab_id = store.store_sysmeta(pid, syspath) - assert ab_id == pids[pid]["ab_id"] + ab_id = store.store_metadata(pid, format_id, syspath) + assert ab_id == pids[pid]["ab_format_id"] -def 
test_store_sysmeta_thread_lock(store): - """Test store sysmeta thread lock.""" +def test_store_metadata_thread_lock(store): + """Test store metadata thread lock.""" test_dir = "tests/testdata/" - entity = "sysmeta" + entity = "metadata" + format_id = "http://ns.dataone.org/service/types/v2.0" pid = "jtao.1700.1" path = test_dir + pid filename = pid + ".xml" syspath = Path(test_dir) / filename _hash_address = store.store_object(pid, path) - store.store_sysmeta(pid, syspath) + store.store_metadata(pid, format_id, syspath) # Start threads - thread1 = Thread(target=store.store_sysmeta, args=(pid, syspath)) - thread2 = Thread(target=store.store_sysmeta, args=(pid, syspath)) - thread3 = Thread(target=store.store_sysmeta, args=(pid, syspath)) - thread4 = Thread(target=store.store_sysmeta, args=(pid, syspath)) + thread1 = Thread(target=store.store_metadata, args=(pid, format_id, syspath)) + thread2 = Thread(target=store.store_metadata, args=(pid, format_id, syspath)) + thread3 = Thread(target=store.store_metadata, args=(pid, format_id, syspath)) + thread4 = Thread(target=store.store_metadata, args=(pid, format_id, syspath)) thread1.start() thread2.start() thread3.start() @@ -529,12 +540,13 @@ def test_store_sysmeta_thread_lock(store): def test_retrieve_object(pids, store): """Test retrieve_object returns correct object data.""" test_dir = "tests/testdata/" + format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename hash_address = store.store_object(pid, path) - store.store_sysmeta(pid, syspath) + store.store_metadata(pid, format_id, syspath) obj_stream = store.retrieve_object(pid) sha256_hex = store.computehash(obj_stream) obj_stream.close() @@ -557,14 +569,15 @@ def test_retrieve_object_pid_invalid(store): def test_retrieve_sysmeta(store): - """Test retrieve_sysmeta returns correct sysmeta data.""" + """Test retrieve_sysmeta returns correct metadata data.""" test_dir = "tests/testdata/" + format_id = "http://ns.dataone.org/service/types/v2.0" pid = "jtao.1700.1" path = test_dir + pid filename = pid + ".xml" syspath = Path(test_dir) / filename _hash_address = store.store_object(pid, path) - _ab_id = store.store_sysmeta(pid, syspath) + _ab_id = store.store_metadata(pid, format_id, syspath) sysmeta_ret = store.retrieve_sysmeta(pid) sysmeta = syspath.read_bytes() assert sysmeta.decode("utf-8") == sysmeta_ret @@ -589,12 +602,13 @@ def test_delete_objects(pids, store): """Test delete_object successfully deletes objects.""" test_dir = "tests/testdata/" entity = "objects" + format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename _hash_address = store.store_object(pid, path) - _ab_id = store.store_sysmeta(pid, syspath) + _ab_id = store.store_metadata(pid, format_id, syspath) store.delete_object(pid) assert store.count(entity) == 0 @@ -616,13 +630,14 @@ def test_delete_object_pid_none(store): def test_delete_sysmeta(pids, store): """Test delete_sysmeta successfully deletes sysmeta.""" test_dir = "tests/testdata/" - entity = "sysmeta" + entity = "metadata" + format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename _hash_address = store.store_object(pid, path) - _ab_id = store.store_sysmeta(pid, 
syspath) + _ab_id = store.store_metadata(pid, format_id, syspath) store.delete_sysmeta(pid) assert store.count(entity) == 0 @@ -644,12 +659,13 @@ def test_delete_sysmeta_pid_none(store): def test_get_hex_digest(store): """Test get_hex_digest for expected value.""" test_dir = "tests/testdata/" + format_id = "http://ns.dataone.org/service/types/v2.0" pid = "jtao.1700.1" path = test_dir + pid filename = pid + ".xml" syspath = Path(test_dir) / filename _hash_address = store.store_object(pid, path) - _ab_id = store.store_sysmeta(pid, syspath) + _ab_id = store.store_metadata(pid, format_id, syspath) sha3_256_hex_digest = ( "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) From f9763ead9f7205ddba3969921cdaa1b21b5c02be Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 20 Jun 2023 10:45:53 -0700 Subject: [PATCH 004/165] Refactor 'retrieve_sysmeta' to 'retrieve_metadata' --- src/hashstore/filehashstore/filehashstore.py | 18 +++++++++--------- src/hashstore/hashstore.py | 9 +++++---- .../test_filehashstore_interface.py | 12 +++++++----- 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/src/hashstore/filehashstore/filehashstore.py b/src/hashstore/filehashstore/filehashstore.py index ef04020c..ee7fb130 100644 --- a/src/hashstore/filehashstore/filehashstore.py +++ b/src/hashstore/filehashstore/filehashstore.py @@ -516,35 +516,35 @@ def retrieve_object(self, pid): ) return obj_stream - def retrieve_sysmeta(self, pid): + def retrieve_metadata(self, pid, format_id): logging.debug( - "FileHashStore - retrieve_sysmeta: Request to retrieve sysmeta for pid: %s", + "FileHashStore - retrieve_metadata: Request to retrieve sysmeta for pid: %s", pid, ) if pid is None or pid.replace(" ", "") == "": exception_string = f"Pid cannot be None or empty, pid: {pid}" - logging.error("FileHashStore - retrieve_sysmeta: %s", exception_string) + logging.error("FileHashStore - retrieve_metadata: %s", exception_string) raise ValueError(exception_string) - entity = "sysmeta" - ab_id = self.get_sha256_hex_digest(pid) + entity = "metadata" + ab_id = self.get_sha256_hex_digest(pid + format_id) sysmeta_exists = self.exists(entity, ab_id) if sysmeta_exists: logging.debug( - "FileHashStore - retrieve_sysmeta: Sysmeta exists for pid: %s, retrieving sysmeta.", + "FileHashStore - retrieve_metadata: Metadata exists for pid: %s, retrieving sysmeta.", pid, ) - ab_id = self.get_sha256_hex_digest(pid) + ab_id = self.get_sha256_hex_digest(pid + format_id) s_path = self.open(entity, ab_id) s_content = s_path.read().decode("utf-8").split("\x00", 1) s_path.close() sysmeta = s_content[1] else: exception_string = f"No sysmeta found for pid: {pid}" - logging.error("FileHashStore - retrieve_sysmeta: %s", exception_string) + logging.error("FileHashStore - retrieve_metadata: %s", exception_string) raise ValueError(exception_string) logging.info( - "FileHashStore - retrieve_sysmeta: Retrieved sysmeta for pid: %s", pid + "FileHashStore - retrieve_metadata: Retrieved metadata for pid: %s", pid ) return sysmeta diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 4afe379f..eeb197d6 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -103,15 +103,16 @@ def retrieve_object(self, pid): raise NotImplementedError() @abstractmethod - def retrieve_sysmeta(self, pid): - """The 'retrieve_sysmeta' method retrieves the metadata content from disk and + def retrieve_metadata(self, pid, format_id): + """The 'retrieve_metadata' method retrieves the metadata content from disk and returns it in 
the form of a String using a given persistent identifier. Args: - pid (string): Authority-based identifier. + pid (string): Authority-based identifier + format_id (string): Metadata format Returns: - sysmeta (string): Sysmeta content. + metadata (string): Sysmeta content. """ raise NotImplementedError() diff --git a/tests/filehashstore/test_filehashstore_interface.py b/tests/filehashstore/test_filehashstore_interface.py index 8108560c..a19f17ef 100644 --- a/tests/filehashstore/test_filehashstore_interface.py +++ b/tests/filehashstore/test_filehashstore_interface.py @@ -568,8 +568,8 @@ def test_retrieve_object_pid_invalid(store): store.retrieve_object(pid_does_not_exist) -def test_retrieve_sysmeta(store): - """Test retrieve_sysmeta returns correct metadata data.""" +def test_retrieve_metadata(store): + """Test retrieve_metadata returns correct metadata data.""" test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" pid = "jtao.1700.1" @@ -578,24 +578,26 @@ def test_retrieve_sysmeta(store): syspath = Path(test_dir) / filename _hash_address = store.store_object(pid, path) _ab_id = store.store_metadata(pid, format_id, syspath) - sysmeta_ret = store.retrieve_sysmeta(pid) + sysmeta_ret = store.retrieve_metadata(pid, format_id) sysmeta = syspath.read_bytes() assert sysmeta.decode("utf-8") == sysmeta_ret def test_retrieve_sysmeta_pid_invalid(store): """Test retrieve_sysmeta raises error when supplied with bad pid.""" + format_id = "http://ns.dataone.org/service/types/v2.0" pid = "jtao.1700.1" pid_does_not_exist = pid + "test" with pytest.raises(ValueError): - store.retrieve_sysmeta(pid_does_not_exist) + store.retrieve_metadata(pid_does_not_exist, format_id) def test_retrieve_sysmeta_pid_empty(store): """Test retrieve_sysmeta raises error when supplied with empty pid.""" + format_id = "http://ns.dataone.org/service/types/v2.0" pid = " " with pytest.raises(ValueError): - store.retrieve_sysmeta(pid) + store.retrieve_metadata(pid, format_id) def test_delete_objects(pids, store): From 18a91facec14e77a0ee1f5bb73ff5df60976b1a4 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 20 Jun 2023 10:48:50 -0700 Subject: [PATCH 005/165] Refactor 'delete_sysmeta' to 'delete_metadata' --- src/hashstore/filehashstore/filehashstore.py | 6 +++--- src/hashstore/hashstore.py | 11 ++++++----- .../test_filehashstore_interface.py | 16 +++++++++------- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/src/hashstore/filehashstore/filehashstore.py b/src/hashstore/filehashstore/filehashstore.py index ee7fb130..62029be4 100644 --- a/src/hashstore/filehashstore/filehashstore.py +++ b/src/hashstore/filehashstore/filehashstore.py @@ -566,7 +566,7 @@ def delete_object(self, pid): ) return True - def delete_sysmeta(self, pid): + def delete_metadata(self, pid, format_id): logging.debug( "FileHashStore - delete_sysmeta: Request to delete sysmeta for pid: %s", pid, @@ -576,8 +576,8 @@ def delete_sysmeta(self, pid): logging.error("FileHashStore - delete_sysmeta: %s", exception_string) raise ValueError(exception_string) - entity = "sysmeta" - ab_id = self.get_sha256_hex_digest(pid) + entity = "metadata" + ab_id = self.get_sha256_hex_digest(pid + format_id) self.delete(entity, ab_id) logging.info( "FileHashStore - delete_sysmeta: Successfully deleted sysmeta for pid: %s", diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index eeb197d6..d5ebfa9e 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -105,7 +105,7 @@ def retrieve_object(self, pid): 
@abstractmethod def retrieve_metadata(self, pid, format_id): """The 'retrieve_metadata' method retrieves the metadata content from disk and - returns it in the form of a String using a given persistent identifier. + returns it in the form of a String using a given persistent identifier and format_id. Args: pid (string): Authority-based identifier @@ -130,12 +130,13 @@ def delete_object(self, pid): raise NotImplementedError() @abstractmethod - def delete_sysmeta(self, pid): - """The 'delete_sysmeta' method deletes a metadata document (sysmeta) permanently - from disk using a given persistent identifier. + def delete_metadata(self, pid, format_id): + """The 'delete_metadata' method deletes a metadata document permanently + from disk using a given persistent identifier and format_id. Args: - pid (string): Authority-based identifier. + pid (string): Authority-based identifier + format_id (string): Metadata format Returns: boolean: `True` upon successful deletion. diff --git a/tests/filehashstore/test_filehashstore_interface.py b/tests/filehashstore/test_filehashstore_interface.py index a19f17ef..35049a29 100644 --- a/tests/filehashstore/test_filehashstore_interface.py +++ b/tests/filehashstore/test_filehashstore_interface.py @@ -629,8 +629,8 @@ def test_delete_object_pid_none(store): store.delete_object(pid) -def test_delete_sysmeta(pids, store): - """Test delete_sysmeta successfully deletes sysmeta.""" +def test_delete_metadata(pids, store): + """Test delete_metadata successfully deletes sysmeta.""" test_dir = "tests/testdata/" entity = "metadata" format_id = "http://ns.dataone.org/service/types/v2.0" @@ -640,22 +640,24 @@ def test_delete_sysmeta(pids, store): syspath = Path(test_dir) / filename _hash_address = store.store_object(pid, path) _ab_id = store.store_metadata(pid, format_id, syspath) - store.delete_sysmeta(pid) + store.delete_metadata(pid, format_id) assert store.count(entity) == 0 -def test_delete_sysmeta_pid_empty(store): +def test_delete_metadata_pid_empty(store): """Test delete_object raises error when empty pid supplied.""" + format_id = "http://ns.dataone.org/service/types/v2.0" pid = " " with pytest.raises(ValueError): - store.delete_sysmeta(pid) + store.delete_metadata(pid, format_id) -def test_delete_sysmeta_pid_none(store): +def test_delete_metadata_pid_none(store): """Test delete_object raises error when pid is 'None'.""" + format_id = "http://ns.dataone.org/service/types/v2.0" pid = None with pytest.raises(ValueError): - store.delete_sysmeta(pid) + store.delete_metadata(pid, format_id) def test_get_hex_digest(store): From 379e017e9ad922b027642ac5161ccbaa18b843b1 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 20 Jun 2023 12:02:37 -0700 Subject: [PATCH 006/165] Clean up 'store_metadata' refactor, add new pytests and update doc strings --- src/hashstore/filehashstore/filehashstore.py | 46 +++++---- tests/conftest.py | 16 ++-- tests/filehashstore/test_filehashstore.py | 26 ++--- .../test_filehashstore_interface.py | 96 ++++++++++++------- 4 files changed, 105 insertions(+), 79 deletions(-) diff --git a/src/hashstore/filehashstore/filehashstore.py b/src/hashstore/filehashstore/filehashstore.py index 62029be4..754660ff 100644 --- a/src/hashstore/filehashstore/filehashstore.py +++ b/src/hashstore/filehashstore/filehashstore.py @@ -145,8 +145,12 @@ def get_properties(self): """Get and return the contents of the current HashStore configuration. 
Returns: - hashstore_yaml_dict (dict): HashStore properties with the following keys/values: - "store_path", "store_depth", "store_width", "store_algorithm","store_metadata_namespace". + hashstore_yaml_dict (dict): HashStore properties with the following keys (and values): + store_path (str): Path to the HashStore directory. + store_depth (int): Depth when sharding an object's hex digest. + store_width (int): Width of directories when sharding an object's hex digest. + store_algorithm (str): Hash algorithm used for calculating the object's hex digest. + store_metadata_namespace (str): Namespace for the HashStore's system metadata. """ if not os.path.exists(self.hashstore_configuration_yaml): exception_string = "hashstore.yaml not found in store root path." @@ -256,6 +260,7 @@ def _build_hashstore_yaml_string( # │ └── 8f0b04e812a3b4c8f686ce34e6fec558804bf61e54b176742a7f6368d6 ############### Format of the Metadata ############### + # The default metadata format store_metadata_namespace: "{store_metadata_namespace}" ############### Hash Algorithms ############### @@ -422,7 +427,7 @@ def store_metadata(self, pid, format_id, metadata): logging.debug( "FileHashStore - store_metadata: Request to store metadata for pid: %s", pid ) - # Validate input parameters, begin with persistent identifier + # Validate input parameters, begin with persistent identifier (pid) logging.debug("FileHashStore - store_metadata: Validating arguments.") if pid is None or pid.replace(" ", "") == "": exception_string = f"Pid cannot be None or empty, pid: {pid}" @@ -431,11 +436,16 @@ def store_metadata(self, pid, format_id, metadata): # Then format_id of the metadata if format_id is None or format_id.replace(" ", "") == "": exception_string = ( - f"format_id cannot be None or empty, format_id: {format_id}" + f"Format_id cannot be None or empty, format_id: {format_id}" ) logging.error("FileHashStore - store_metadata: %s", exception_string) raise ValueError(exception_string) # Metadata content must be a str, path or stream and cannot be empty + if isinstance(metadata, str): + if metadata.replace(" ", "") == "": + exception_string = "Given string path to metadata cannot be empty." + logging.error("FileHashStore - store_metadata: %s", exception_string) + raise TypeError(exception_string) if ( not isinstance(metadata, str) and not isinstance(metadata, Path) @@ -447,11 +457,6 @@ def store_metadata(self, pid, format_id, metadata): ) logging.error("FileHashStore - store_metadata: %s", exception_string) raise TypeError(exception_string) - if isinstance(metadata, str): - if metadata.replace(" ", "") == "": - exception_string = "Given string path to metadata cannot be empty." 
- logging.error("FileHashStore - store_metadata: %s", exception_string) - raise TypeError(exception_string) # Wait for the pid to release if it's in use while pid in self.metadata_locked_pids: @@ -460,19 +465,20 @@ def store_metadata(self, pid, format_id, metadata): pid, ) time.sleep(self.time_out_sec) - # Modify metadata_locked_pids consecutively + with self.metadata_lock: logging.debug( "FileHashStore - store_metadata: Adding pid: %s to metadata_locked_pids.", pid, ) + # Modify metadata_locked_pids consecutively self.metadata_locked_pids.append(pid) + try: logging.debug( "FileHashStore - store_metadata: Attempting to store metadata for pid: %s", pid, ) - # TODO: Determine if format_id should be part of config file, then update metadata_cid = self.put_metadata(pid, format_id, metadata) finally: # Release pid @@ -887,7 +893,7 @@ def put_metadata(self, pid, format_id, metadata): Args: pid (string): Authority-based identifier. - format_id (string): Metadata format + format_id (string): Metadata format. metadata (mixed): String or path to metadata document. Returns: @@ -896,16 +902,14 @@ def put_metadata(self, pid, format_id, metadata): logging.debug( "FileHashStore - put_metadata: Request to put metadata for pid: %s", pid ) - - # Create tmp file and write to it + # Create metadata tmp file and write to it metadata_stream = Stream(metadata) with closing(metadata_stream): - # TODO: Determine if format_id should be part of config file, then update - metadata_tmp = self._mktmpmetadata(metadata_stream, self.sysmeta_ns) + metadata_tmp = self._mktmpmetadata(metadata_stream, format_id) - # Target path (permanent location) - ab_id = self.get_sha256_hex_digest(pid + format_id) - rel_path = "/".join(self.shard(ab_id)) + # Get target and related paths (permanent location) + ab_format_id = self.get_sha256_hex_digest(pid + format_id) + rel_path = "/".join(self.shard(ab_format_id)) full_path = self.get_store_path("metadata") / rel_path # Move metadata to target path @@ -919,7 +923,7 @@ def put_metadata(self, pid, format_id, metadata): "FileHashStore - put_metadata: Successfully put metadata for pid: %s", pid, ) - return ab_id + return ab_format_id except Exception as err: exception_string = f"Unexpected {err=}, {type(err)=}" logging.error("FileHashStore - put_metadata: %s", exception_string) @@ -949,7 +953,7 @@ def _mktmpmetadata(self, stream, format_id): format_id (string): Format of metadata. Returns: - tmp.name (string): Name of temporary file created and written into. + tmp.name (string): Path/name of temporary file created and written into. """ # Create temporary file in .../{store_path}/tmp tmp_root_path = self.get_store_path("metadata") / "tmp" diff --git a/tests/conftest.py b/tests/conftest.py index 9f9eca47..49fb8468 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -41,13 +41,13 @@ def init_store(props): @pytest.fixture(name="pids") def init_pids(): """Shared test harness data. 
- - ab_id: the hex digest of the pid - - ab_format_id: the hex digest of the pid + format_id + object_cid: hex digest of the pid + metadata_cid: hex digest of the pid + store_metadata_namespace """ test_pids = { "doi:10.18739/A2901ZH2M": { - "ab_id": "0d555ed77052d7e166017f779cbc193357c3a5006ee8b8457230bcf7abcef65e", - "ab_format_id": "323e0799524cec4c7e14d31289cefd884b563b5c052f154a066de5ec1e477da7", + "object_cid": "0d555ed77052d7e166017f779cbc193357c3a5006ee8b8457230bcf7abcef65e", + "metadata_cid": "323e0799524cec4c7e14d31289cefd884b563b5c052f154a066de5ec1e477da7", "md5": "db91c910a3202478c8def1071c54aae5", "sha1": "1fe86e3c8043afa4c70857ca983d740ad8501ccd", "sha224": "922b1e86f83d3ea3060fd0f7b2cf04476e8b3ddeaa3cf48c2c3cf502", @@ -56,8 +56,8 @@ def init_pids(): "sha512": "e9bcd6b91b102ef5803d1bd60c7a5d2dbec1a2baf5f62f7da60de07607ad6797d6a9b740d97a257fd2774f2c26503d455d8f2a03a128773477dfa96ab96a2e54", }, "jtao.1700.1": { - "ab_id": "a8241925740d5dcd719596639e780e0a090c9d55a5d0372b0eaf55ed711d4edf", - "ab_format_id": "ddf07952ef28efc099d10d8b682480f7d2da60015f5d8873b6e1ea75b4baf689", + "object_cid": "a8241925740d5dcd719596639e780e0a090c9d55a5d0372b0eaf55ed711d4edf", + "metadata_cid": "ddf07952ef28efc099d10d8b682480f7d2da60015f5d8873b6e1ea75b4baf689", "md5": "f4ea2d07db950873462a064937197b0f", "sha1": "3d25436c4490b08a2646e283dada5c60e5c0539d", "sha224": "9b3a96f434f3c894359193a63437ef86fbd5a1a1a6cc37f1d5013ac1", @@ -66,8 +66,8 @@ def init_pids(): "sha512": "bf9e7f4d4e66bd082817d87659d1d57c2220c376cd032ed97cadd481cf40d78dd479cbed14d34d98bae8cebc603b40c633d088751f07155a94468aa59e2ad109", }, "urn:uuid:1b35d0a5-b17a-423b-a2ed-de2b18dc367a": { - "ab_id": "7f5cc18f0b04e812a3b4c8f686ce34e6fec558804bf61e54b176742a7f6368d6", - "ab_format_id": "9a2e08c666b728e6cbd04d247b9e556df3de5b2ca49f7c5a24868eb27cddbff2", + "object_cid": "7f5cc18f0b04e812a3b4c8f686ce34e6fec558804bf61e54b176742a7f6368d6", + "metadata_cid": "9a2e08c666b728e6cbd04d247b9e556df3de5b2ca49f7c5a24868eb27cddbff2", "md5": "e1932fc75ca94de8b64f1d73dc898079", "sha1": "c6d2a69a3f5adaf478ba796c114f57b990cf7ad1", "sha224": "f86491d23d25dbaf7620542f056aba8a092a70be625502a6afd1fde0", diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index 822a1cfb..761f338d 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -152,14 +152,14 @@ def test_put_object_files_stream(pids, store): assert store.count(entity) == 3 -def test_put_object_id(pids, store): +def test_put_object_cid(pids, store): """Check put returns correct id.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") hashaddress = store.put_object(pid, path) hashaddress_id = hashaddress.id - assert hashaddress_id == pids[pid]["ab_id"] + assert hashaddress_id == pids[pid]["object_cid"] def test_put_object_relpath(pids, store): @@ -262,8 +262,8 @@ def test_move_and_get_checksums_id(pids, store): _, ) = store._move_and_get_checksums(pid, input_stream) input_stream.close() - ab_id = store.get_sha256_hex_digest(pid) - assert move_id == ab_id + metadata_cid = store.get_sha256_hex_digest(pid) + assert move_id == metadata_cid def test_move_and_get_checksums_hex_digests(pids, store): @@ -389,8 +389,8 @@ def test_put_metadata_with_path(pids, store): for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - ab_id = store.store_metadata(pid, format_id, syspath) - assert store.exists(entity, ab_id) + metadata_cid = 
store.store_metadata(pid, format_id, syspath) + assert store.exists(entity, metadata_cid) assert store.count(entity) == 3 @@ -402,12 +402,12 @@ def test_put_metadata_with_string(pids, store): for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = str(Path(test_dir) / filename) - ab_id = store.store_metadata(pid, format_id, syspath) - assert store.exists(entity, ab_id) + metadata_cid = store.store_metadata(pid, format_id, syspath) + assert store.exists(entity, metadata_cid) assert store.count(entity) == 3 -def test_put_metadata_ab_id(pids, store): +def test_put_metadata_cid(pids, store): """Test put metadata returns correct id.""" test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" @@ -415,7 +415,7 @@ def test_put_metadata_ab_id(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename ab_id = store.store_metadata(pid, format_id, syspath) - assert ab_id == pids[pid]["ab_format_id"] + assert ab_id == pids[pid]["metadata_cid"] def test_mktmpmetadata(pids, store): @@ -650,7 +650,7 @@ def test_create_path(pids, store): """Test makepath creates folder successfully.""" for pid in pids: root_directory = store.root - pid_hex_digest_directory = pids[pid]["ab_id"][:2] + pid_hex_digest_directory = pids[pid]["metadata_cid"][:2] pid_directory = root_directory + pid_hex_digest_directory store.create_path(pid_directory) assert os.path.isdir(pid_directory) @@ -708,7 +708,7 @@ def test_build_abs_path(store, pids): path = test_dir + pid.replace("/", "_") _ = store.put_object(pid, path) # pylint: disable=W0212 - abs_path = store.build_abs_path(entity, pids[pid]["ab_id"]) + abs_path = store.build_abs_path(entity, pids[pid]["object_cid"]) assert abs_path @@ -734,4 +734,4 @@ def test_get_sha256_hex_digest(pids, store): """Test for correct sha256 return value.""" for pid in pids: hash_val = store.get_sha256_hex_digest(pid) - assert hash_val == pids[pid]["ab_id"] + assert hash_val == pids[pid]["object_cid"] diff --git a/tests/filehashstore/test_filehashstore_interface.py b/tests/filehashstore/test_filehashstore_interface.py index 35049a29..60af8e59 100644 --- a/tests/filehashstore/test_filehashstore_interface.py +++ b/tests/filehashstore/test_filehashstore_interface.py @@ -18,13 +18,13 @@ def test_pids_length(pids): def test_store_address_length(pids, store): - """Test store object ab_id length is 64 characters.""" + """Test store object object_cid length is 64 characters.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") hash_address = store.store_object(pid, path) - ab_id = hash_address.id - assert len(ab_id) == 64 + object_cid = hash_address.id + assert len(object_cid) == 64 def test_store_object_files_path(pids, store): @@ -38,7 +38,7 @@ def test_store_object_files_path(pids, store): syspath = Path(test_dir) / filename _hash_address = store.store_object(pid, path) _metadata_id = store.store_metadata(pid, format_id, syspath) - assert store.exists(entity, pids[pid]["ab_id"]) + assert store.exists(entity, pids[pid]["object_cid"]) assert store.count(entity) == 3 @@ -53,7 +53,7 @@ def test_store_object_files_string(pids, store): syspath = Path(test_dir) / filename _hash_address = store.store_object(pid, path_string) _metadata_id = store.store_metadata(pid, format_id, syspath) - assert store.exists(entity, pids[pid]["ab_id"]) + assert store.exists(entity, pids[pid]["object_cid"]) assert store.count(entity) == 3 @@ -66,18 +66,18 @@ def test_store_object_files_input_stream(pids, store): 
input_stream = io.open(path, "rb") _hash_address = store.store_object(pid, input_stream) input_stream.close() - ab_id = store.get_sha256_hex_digest(pid) - assert store.exists(entity, ab_id) + object_cid = store.get_sha256_hex_digest(pid) + assert store.exists(entity, object_cid) assert store.count(entity) == 3 def test_store_object_id(pids, store): - """Test store object returns expected id (ab_id).""" + """Test store object returns expected id (object_cid).""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") hash_address = store.store_object(pid, path) - assert hash_address.id == pids[pid]["ab_id"] + assert hash_address.id == pids[pid]["object_cid"] def test_store_object_rel_path(pids, store): @@ -86,9 +86,9 @@ def test_store_object_rel_path(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") hash_address = store.store_object(pid, path) - ab_id = pids[pid]["ab_id"] - ab_id_rel_path = "/".join(store.shard(ab_id)) - assert hash_address.relpath == ab_id_rel_path + object_cid = pids[pid]["object_cid"] + object_cid_rel_path = "/".join(store.shard(object_cid)) + assert hash_address.relpath == object_cid_rel_path def test_store_object_abs_path(pids, store): @@ -97,10 +97,10 @@ def test_store_object_abs_path(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") hash_address = store.store_object(pid, path) - ab_id = pids[pid]["ab_id"] - ab_id_rel_path = "/".join(store.shard(ab_id)) - ab_id_abs_path = store.objects + "/" + ab_id_rel_path - assert hash_address.abspath == ab_id_abs_path + object_cid = pids[pid]["object_cid"] + object_cid_rel_path = "/".join(store.shard(object_cid)) + object_cid_abs_path = store.objects + "/" + object_cid_rel_path + assert hash_address.abspath == object_cid_abs_path def test_store_object_is_duplicate(pids, store): @@ -196,8 +196,8 @@ def test_store_object_additional_algorithm_hyphen_uppercase(pids, store): hash_address = store.store_object(pid, path, algorithm_with_hyphen_and_upper) sha256_cid = hash_address.hex_digests.get("sha384") assert sha256_cid == pids[pid]["sha384"] - ab_id = store.get_sha256_hex_digest(pid) - assert store.exists(entity, ab_id) + object_cid = store.get_sha256_hex_digest(pid) + assert store.exists(entity, object_cid) def test_store_object_additional_algorithm_hyphen_lowercase(store): @@ -213,8 +213,8 @@ def test_store_object_additional_algorithm_hyphen_lowercase(store): "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) assert additional_sha3_256_hex_digest == sha3_256_checksum - ab_id = store.get_sha256_hex_digest(pid) - assert store.exists(entity, ab_id) + object_cid = store.get_sha256_hex_digest(pid) + assert store.exists(entity, object_cid) def test_store_object_additional_algorithm_underscore(store): @@ -328,8 +328,8 @@ def test_store_object_duplicate_raises_error(store): with pytest.raises(FileExistsError): _hash_address_two = store.store_object(pid, path) assert store.count(entity) == 1 - ab_id = store.get_sha256_hex_digest(pid) - assert store.exists(entity, ab_id) + object_cid = store.get_sha256_hex_digest(pid) + assert store.exists(entity, object_cid) def test_store_object_duplicates_threads(store): @@ -359,8 +359,8 @@ def store_object_wrapper(pid, path): thread3.join() # One thread will succeed, file count must still be 1 assert store.count(entity) == 1 - ab_id = store.get_sha256_hex_digest(pid) - assert store.exists(entity, ab_id) + object_cid = store.get_sha256_hex_digest(pid) + assert store.exists(entity, object_cid) 
assert file_exists_error_flag


@@ -421,8 +421,8 @@ def test_store_metadata_files_path(pids, store):
         filename = pid.replace("/", "_") + ".xml"
         syspath = Path(test_dir) / filename
         _hash_address = store.store_object(pid, path)
-        ab_id = store.store_metadata(pid, format_id, syspath)
-        assert store.exists(entity, ab_id)
+        metadata_cid = store.store_metadata(pid, format_id, syspath)
+        assert store.exists(entity, metadata_cid)
         assert store.count(entity) == 3


@@ -436,8 +436,8 @@ def test_store_metadata_files_string(pids, store):
         filename = pid.replace("/", "_") + ".xml"
         syspath_string = str(Path(test_dir) / filename)
         _hash_address = store.store_object(pid, path_string)
-        ab_id = store.store_metadata(pid, format_id, syspath_string)
-        assert store.exists(entity, ab_id)
+        metadata_cid = store.store_metadata(pid, format_id, syspath_string)
+        assert store.exists(entity, metadata_cid)
         assert store.count(entity) == 3


@@ -452,7 +452,7 @@ def test_store_metadata_files_input_stream(pids, store):
         filename = pid.replace("/", "_") + ".xml"
         syspath_string = str(Path(test_dir) / filename)
         syspath_stream = io.open(syspath_string, "rb")
-        _ab_id = store.store_metadata(pid, format_id, syspath_stream)
+        _metadata_cid = store.store_metadata(pid, format_id, syspath_stream)
         syspath_stream.close()
         assert store.count(entity) == 3

@@ -479,6 +479,28 @@ def test_store_metadata_pid_empty_spaces(store):
         store.store_metadata(pid, format_id, syspath_string)


+def test_store_metadata_format_id_empty(store):
+    """Test store metadata raises error with an empty format_id string."""
+    test_dir = "tests/testdata/"
+    format_id = ""
+    pid = "jtao.1700.1"
+    filename = pid.replace("/", "_") + ".xml"
+    syspath_string = str(Path(test_dir) / filename)
+    with pytest.raises(ValueError):
+        store.store_metadata(pid, format_id, syspath_string)
+
+
+def test_store_metadata_pid_format_id_spaces(store):
+    """Test store metadata raises error with a format_id of empty spaces."""
+    test_dir = "tests/testdata/"
+    format_id = " "
+    pid = "jtao.1700.1"
+    filename = pid.replace("/", "_") + ".xml"
+    syspath_string = str(Path(test_dir) / filename)
+    with pytest.raises(ValueError):
+        store.store_metadata(pid, format_id, syspath_string)
+
+
 def test_store_metadata_metadata_empty(store):
     """Test store metadata raises error with empty metadata string."""
     pid = "jtao.1700.1"
@@ -497,8 +519,8 @@ def test_store_metadata_metadata_none(store):
         store.store_metadata(pid, format_id, syspath_string)


-def test_store_metadata_ab_id(pids, store):
-    """Test store metadata returns expected ab_id."""
+def test_store_metadata_metadata_cid(pids, store):
+    """Test store metadata returns expected metadata_cid."""
     test_dir = "tests/testdata/"
     format_id = "http://ns.dataone.org/service/types/v2.0"
     for pid in pids.keys():
@@ -506,8 +528,8 @@ def test_store_metadata_ab_id(pids, store):
         filename = pid.replace("/", "_") + ".xml"
         syspath = Path(test_dir) / filename
         _hash_address = store.store_object(pid, path)
-        ab_id = store.store_metadata(pid, format_id, syspath)
-        assert ab_id == pids[pid]["ab_format_id"]
+        metadata_cid = store.store_metadata(pid, format_id, syspath)
+        assert metadata_cid == pids[pid]["metadata_cid"]


 def test_store_metadata_thread_lock(store):
@@ -577,7 +599,7 @@ def test_retrieve_metadata(store):
     filename = pid + ".xml"
     syspath = Path(test_dir) / filename
     _hash_address = store.store_object(pid, path)
-    _ab_id = store.store_metadata(pid, format_id, syspath)
+    _metadata_cid = store.store_metadata(pid, format_id, syspath)
     sysmeta_ret = store.retrieve_metadata(pid, format_id)
     sysmeta = 
syspath.read_bytes() assert sysmeta.decode("utf-8") == sysmeta_ret @@ -610,7 +632,7 @@ def test_delete_objects(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename _hash_address = store.store_object(pid, path) - _ab_id = store.store_metadata(pid, format_id, syspath) + _metadata_cid = store.store_metadata(pid, format_id, syspath) store.delete_object(pid) assert store.count(entity) == 0 @@ -639,7 +661,7 @@ def test_delete_metadata(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename _hash_address = store.store_object(pid, path) - _ab_id = store.store_metadata(pid, format_id, syspath) + _metadata_cid = store.store_metadata(pid, format_id, syspath) store.delete_metadata(pid, format_id) assert store.count(entity) == 0 @@ -669,7 +691,7 @@ def test_get_hex_digest(store): filename = pid + ".xml" syspath = Path(test_dir) / filename _hash_address = store.store_object(pid, path) - _ab_id = store.store_metadata(pid, format_id, syspath) + _metadata_cid = store.store_metadata(pid, format_id, syspath) sha3_256_hex_digest = ( "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) From 4b4e8315d41d149b8d2b350f57e1363a7cbe3cfe Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 20 Jun 2023 12:09:54 -0700 Subject: [PATCH 007/165] Clean up 'retrieve_metadata' & 'delete_metadata' refactor, remove all references to sysmeta, and clarify variables & comments for ab_id (object_cid vs. metadata_cid) --- src/hashstore/filehashstore/filehashstore.py | 92 ++++++++++---------- tests/filehashstore/test_filehashstore.py | 4 +- 2 files changed, 48 insertions(+), 48 deletions(-) diff --git a/src/hashstore/filehashstore/filehashstore.py b/src/hashstore/filehashstore/filehashstore.py index 754660ff..48c5dea9 100644 --- a/src/hashstore/filehashstore/filehashstore.py +++ b/src/hashstore/filehashstore/filehashstore.py @@ -505,16 +505,16 @@ def retrieve_object(self, pid): raise ValueError(exception_string) entity = "objects" - ab_id = self.get_sha256_hex_digest(pid) - sysmeta_exists = self.exists(entity, ab_id) - if sysmeta_exists: + metadata_cid = self.get_sha256_hex_digest(pid) + metadata_exists = self.exists(entity, metadata_cid) + if metadata_exists: logging.debug( - "FileHashStore - retrieve_object: Sysmeta exists for pid: %s, retrieving object.", + "FileHashStore - retrieve_object: Metadata exists for pid: %s, retrieving object.", pid, ) - obj_stream = self.open(entity, ab_id) + obj_stream = self.open(entity, metadata_cid) else: - exception_string = f"No sysmeta found for pid: {pid}" + exception_string = f"No metadata found for pid: {pid}" logging.error("FileHashStore - retrieve_object: %s", exception_string) raise ValueError(exception_string) logging.info( @@ -524,7 +524,7 @@ def retrieve_object(self, pid): def retrieve_metadata(self, pid, format_id): logging.debug( - "FileHashStore - retrieve_metadata: Request to retrieve sysmeta for pid: %s", + "FileHashStore - retrieve_metadata: Request to retrieve metadata for pid: %s", pid, ) if pid is None or pid.replace(" ", "") == "": @@ -533,26 +533,26 @@ def retrieve_metadata(self, pid, format_id): raise ValueError(exception_string) entity = "metadata" - ab_id = self.get_sha256_hex_digest(pid + format_id) - sysmeta_exists = self.exists(entity, ab_id) - if sysmeta_exists: + metadata_cid = self.get_sha256_hex_digest(pid + format_id) + metadata_exists = self.exists(entity, metadata_cid) + if metadata_exists: logging.debug( - "FileHashStore - retrieve_metadata: Metadata exists for pid: %s, 
retrieving sysmeta.", + "FileHashStore - retrieve_metadata: Metadata exists for pid: %s, retrieving metadata.", pid, ) - ab_id = self.get_sha256_hex_digest(pid + format_id) - s_path = self.open(entity, ab_id) + metadata_cid = self.get_sha256_hex_digest(pid + format_id) + s_path = self.open(entity, metadata_cid) s_content = s_path.read().decode("utf-8").split("\x00", 1) s_path.close() - sysmeta = s_content[1] + metadata = s_content[1] else: - exception_string = f"No sysmeta found for pid: {pid}" + exception_string = f"No metadata found for pid: {pid}" logging.error("FileHashStore - retrieve_metadata: %s", exception_string) raise ValueError(exception_string) logging.info( "FileHashStore - retrieve_metadata: Retrieved metadata for pid: %s", pid ) - return sysmeta + return metadata def delete_object(self, pid): logging.debug( @@ -564,8 +564,8 @@ def delete_object(self, pid): raise ValueError(exception_string) entity = "objects" - ab_id = self.get_sha256_hex_digest(pid) - self.delete(entity, ab_id) + object_cid = self.get_sha256_hex_digest(pid) + self.delete(entity, object_cid) logging.info( "FileHashStore - delete_object: Successfully deleted object for pid: %s", pid, @@ -574,19 +574,19 @@ def delete_object(self, pid): def delete_metadata(self, pid, format_id): logging.debug( - "FileHashStore - delete_sysmeta: Request to delete sysmeta for pid: %s", + "FileHashStore - delete_metadata: Request to delete metadata for pid: %s", pid, ) if pid is None or pid.replace(" ", "") == "": exception_string = f"Pid cannot be None or empty, pid: {pid}" - logging.error("FileHashStore - delete_sysmeta: %s", exception_string) + logging.error("FileHashStore - delete_metadata: %s", exception_string) raise ValueError(exception_string) entity = "metadata" - ab_id = self.get_sha256_hex_digest(pid + format_id) - self.delete(entity, ab_id) + metadata_cid = self.get_sha256_hex_digest(pid + format_id) + self.delete(entity, metadata_cid) logging.info( - "FileHashStore - delete_sysmeta: Successfully deleted sysmeta for pid: %s", + "FileHashStore - delete_metadata: Successfully deleted metadata for pid: %s", pid, ) return True @@ -607,12 +607,12 @@ def get_hex_digest(self, pid, algorithm): entity = "objects" algorithm = self.clean_algorithm(algorithm) - ab_id = self.get_sha256_hex_digest(pid) - if not self.exists(entity, ab_id): + object_cid = self.get_sha256_hex_digest(pid) + if not self.exists(entity, object_cid): exception_string = f"No object found for pid: {pid}" logging.error("FileHashStore - get_hex_digest: %s", exception_string) raise ValueError(exception_string) - c_stream = self.open(entity, ab_id) + c_stream = self.open(entity, object_cid) hex_digest = self.computehash(c_stream, algorithm=algorithm) logging_info_statement = ( @@ -658,7 +658,7 @@ def put_object( ) with closing(stream): ( - ab_id, + object_cid, rel_path, abs_path, is_duplicate, @@ -673,7 +673,7 @@ def put_object( ) hash_address = HashAddress( - ab_id, rel_path, abs_path, is_duplicate, hex_digest_dict + object_cid, rel_path, abs_path, is_duplicate, hex_digest_dict ) logging.debug( "FileHashStore - put_object: Successfully put object for pid: %s", @@ -716,8 +716,8 @@ def _move_and_get_checksums( digest dictionary. 
""" entity = "objects" - ab_id = self.get_sha256_hex_digest(pid) - abs_file_path = self.build_abs_path(entity, ab_id, extension) + object_cid = self.get_sha256_hex_digest(pid) + abs_file_path = self.build_abs_path(entity, object_cid, extension) self.create_path(os.path.dirname(abs_file_path)) # Only put file if it doesn't exist if os.path.isfile(abs_file_path): @@ -801,7 +801,7 @@ def _move_and_get_checksums( self.delete(entity, tmp_file_name) err_msg = ( "Aborting store_object upload - an unexpected error has occurred when moving" - + f" file to: {ab_id} - Error: {err}" + + f" file to: {object_cid} - Error: {err}" ) logging.error("FileHashStore - _move_and_get_checksums: %s", err_msg) raise @@ -815,7 +815,7 @@ def _move_and_get_checksums( is_duplicate = True self.delete(entity, tmp_file_name) - return ab_id, rel_file_path, abs_file_path, is_duplicate, hex_digests + return object_cid, rel_file_path, abs_file_path, is_duplicate, hex_digests def _mktempfile(self, stream, algorithm=None): """Create a named temporary file from a `Stream` object and @@ -897,7 +897,7 @@ def put_metadata(self, pid, format_id, metadata): metadata (mixed): String or path to metadata document. Returns: - ab_id (string): Address of the metadata document. + metadata_cid (string): Address of the metadata document. """ logging.debug( "FileHashStore - put_metadata: Request to put metadata for pid: %s", pid @@ -908,8 +908,8 @@ def put_metadata(self, pid, format_id, metadata): metadata_tmp = self._mktmpmetadata(metadata_stream, format_id) # Get target and related paths (permanent location) - ab_format_id = self.get_sha256_hex_digest(pid + format_id) - rel_path = "/".join(self.shard(ab_format_id)) + metadata_cid = self.get_sha256_hex_digest(pid + format_id) + rel_path = "/".join(self.shard(metadata_cid)) full_path = self.get_store_path("metadata") / rel_path # Move metadata to target path @@ -923,7 +923,7 @@ def put_metadata(self, pid, format_id, metadata): "FileHashStore - put_metadata: Successfully put metadata for pid: %s", pid, ) - return ab_format_id + return metadata_cid except Exception as err: exception_string = f"Unexpected {err=}, {type(err)=}" logging.error("FileHashStore - put_metadata: %s", exception_string) @@ -1022,7 +1022,7 @@ def computehash(self, stream, algorithm=None): or with optional algorithm supported. Args: - stream (io.BufferedReader): A buffered stream of an ab_id object. \n + stream (io.BufferedReader): A buffered stream of an object_cid object. \n algorithm (string): Algorithm of hex digest to generate. Returns: @@ -1057,7 +1057,7 @@ def exists(self, entity, file): """Check whether a given file id or path exists on disk. Args: - entity (str): Desired entity type (ex. "objects", "sysmeta"). \n + entity (str): Desired entity type (ex. "objects", "metadata"). \n file (str): The name of the file to check. Returns: @@ -1099,7 +1099,7 @@ def open(self, entity, file, mode="rb"): for closing the stream. Args: - entity (str): Desired entity type (ex. "objects", "sysmeta"). \n + entity (str): Desired entity type (ex. "objects", "metadata"). \n file (str): Address ID or path of file. \n mode (str, optional): Mode to open file in. Defaults to 'rb'. @@ -1120,7 +1120,7 @@ def delete(self, entity, file): deleting. No exception is raised if file doesn't exist. Args: - entity (str): Desired entity type (ex. "objects", "sysmeta"). \n + entity (str): Desired entity type (ex. "objects", "metadata"). \n file (str): Address ID or path of file. 
""" realpath = self.get_real_path(entity, file) @@ -1207,7 +1207,7 @@ def get_real_path(self, entity, file): rel_root = self.metadata else: raise ValueError( - f"entity: {entity} does not exist. Do you mean 'objects' or 'sysmeta'?" + f"entity: {entity} does not exist. Do you mean 'objects' or 'metadata'?" ) relpath = os.path.join(rel_root, file) if os.path.isfile(relpath): @@ -1221,18 +1221,18 @@ def get_real_path(self, entity, file): # Could not determine a match. return None - def build_abs_path(self, entity, ab_id, extension=""): + def build_abs_path(self, entity, cid, extension=""): """Build the absolute file path for a given hash id with an optional file extension. Args: - entity (str): Desired entity type (ex. "objects", "sysmeta"). \n - ab_id (str): A hash id to build a file path for. \n + entity (str): Desired entity type (ex. "objects", "metadata"). \n + cid (str): A hash id to build a file path for. \n extension (str): An optional file extension to append to the file path. Returns: absolute_path (str): An absolute file path for the specified hash id. """ - paths = self.shard(ab_id) + paths = self.shard(cid) root_dir = self.get_store_path(entity) if extension and not extension.startswith(os.extsep): @@ -1247,7 +1247,7 @@ def count(self, entity): """Return count of the number of files in the `root` directory. Args: - entity (str): Desired entity type (ex. "objects", "sysmeta"). + entity (str): Desired entity type (ex. "objects", "metadata"). Returns: count (int): Number of files in the directory. diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index 761f338d..3902963b 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -414,8 +414,8 @@ def test_put_metadata_cid(pids, store): for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - ab_id = store.store_metadata(pid, format_id, syspath) - assert ab_id == pids[pid]["metadata_cid"] + metadata_cid = store.store_metadata(pid, format_id, syspath) + assert metadata_cid == pids[pid]["metadata_cid"] def test_mktmpmetadata(pids, store): From 59b0b747a16edfe426c40155896cec8140eb1021 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 20 Jun 2023 14:05:01 -0700 Subject: [PATCH 008/165] Clean up code base, add new pytests, fix minor bugs and update comments & doc strings --- src/hashstore/filehashstore/filehashstore.py | 52 +++++++---- src/hashstore/hashstore.py | 2 +- .../test_filehashstore_interface.py | 91 ++++++++++++++++--- 3 files changed, 114 insertions(+), 31 deletions(-) diff --git a/src/hashstore/filehashstore/filehashstore.py b/src/hashstore/filehashstore/filehashstore.py index 48c5dea9..a96feabb 100644 --- a/src/hashstore/filehashstore/filehashstore.py +++ b/src/hashstore/filehashstore/filehashstore.py @@ -20,7 +20,7 @@ class FileHashStore(HashStore): an authority-based identifier's hex digest with a given hash algorithm value to address files. - FileHashStore initializes by providing a properties dictionary containing the + FileHashStore initializes using a given properties dictionary containing the required keys (see Args). Upon initialization, FileHashStore verifies the provided properties and attempts to write a configuration file 'hashstore.yaml' to the given store path directory. 
Properties must always be supplied to ensure consistent @@ -362,8 +362,9 @@ def store_object( ) additional_algorithm_checked = None if additional_algorithm != self.algorithm and additional_algorithm is not None: + # Set additional_algorithm additional_algorithm_checked = self.clean_algorithm(additional_algorithm) - # Checksum and checksum_algorithm must both be supplied + # Checksum and checksum_algorithm must both be supplied if one is supplied if checksum is not None: if checksum_algorithm is None or checksum_algorithm.replace(" ", "") == "": exception_string = ( @@ -374,7 +375,6 @@ def store_object( raise ValueError(exception_string) checksum_algorithm_checked = None if checksum_algorithm is not None: - checksum_algorithm_checked = self.clean_algorithm(checksum_algorithm) if checksum is None or checksum.replace(" ", "") == "": exception_string = ( "checksum cannot be None or empty if checksum_algorithm is" @@ -382,6 +382,8 @@ def store_object( ) logging.error("FileHashStore - store_object: %s", exception_string) raise ValueError(exception_string) + # Set checksum_algorithm + checksum_algorithm_checked = self.clean_algorithm(checksum_algorithm) # Wait for the pid to release if it's in use while pid in self.object_locked_pids: @@ -505,16 +507,16 @@ def retrieve_object(self, pid): raise ValueError(exception_string) entity = "objects" - metadata_cid = self.get_sha256_hex_digest(pid) - metadata_exists = self.exists(entity, metadata_cid) - if metadata_exists: + object_cid = self.get_sha256_hex_digest(pid) + object_exists = self.exists(entity, object_cid) + if object_exists: logging.debug( "FileHashStore - retrieve_object: Metadata exists for pid: %s, retrieving object.", pid, ) - obj_stream = self.open(entity, metadata_cid) + obj_stream = self.open(entity, object_cid) else: - exception_string = f"No metadata found for pid: {pid}" + exception_string = f"No object found for pid: {pid}" logging.error("FileHashStore - retrieve_object: %s", exception_string) raise ValueError(exception_string) logging.info( @@ -531,20 +533,26 @@ def retrieve_metadata(self, pid, format_id): exception_string = f"Pid cannot be None or empty, pid: {pid}" logging.error("FileHashStore - retrieve_metadata: %s", exception_string) raise ValueError(exception_string) + if format_id is None or format_id.replace(" ", "") == "": + exception_string = f"Format_id cannot be None or empty, format_id: {format_id}" + logging.error("FileHashStore - retrieve_metadata: %s", exception_string) + raise ValueError(exception_string) entity = "metadata" metadata_cid = self.get_sha256_hex_digest(pid + format_id) metadata_exists = self.exists(entity, metadata_cid) if metadata_exists: logging.debug( - "FileHashStore - retrieve_metadata: Metadata exists for pid: %s, retrieving metadata.", - pid, + "FileHashStore - retrieve_metadata: Metadata exists for pid: %s", + pid + ", retrieving metadata.", ) metadata_cid = self.get_sha256_hex_digest(pid + format_id) - s_path = self.open(entity, metadata_cid) - s_content = s_path.read().decode("utf-8").split("\x00", 1) - s_path.close() - metadata = s_content[1] + metadata_cid_stream = self.open(entity, metadata_cid) + metadata_cid_content = ( + metadata_cid_stream.read().decode("utf-8").split("\x00", 1) + ) + metadata_cid_stream.close() + metadata = metadata_cid_content[1] else: exception_string = f"No metadata found for pid: {pid}" logging.error("FileHashStore - retrieve_metadata: %s", exception_string) @@ -581,6 +589,12 @@ def delete_metadata(self, pid, format_id): exception_string = f"Pid cannot be None or 
empty, pid: {pid}" logging.error("FileHashStore - delete_metadata: %s", exception_string) raise ValueError(exception_string) + if format_id is None or format_id.replace(" ", "") == "": + exception_string = ( + f"Format_id cannot be None or empty, format_id: {format_id}" + ) + logging.error("FileHashStore - delete_metadata: %s", exception_string) + raise ValueError(exception_string) entity = "metadata" metadata_cid = self.get_sha256_hex_digest(pid + format_id) @@ -612,8 +626,8 @@ def get_hex_digest(self, pid, algorithm): exception_string = f"No object found for pid: {pid}" logging.error("FileHashStore - get_hex_digest: %s", exception_string) raise ValueError(exception_string) - c_stream = self.open(entity, object_cid) - hex_digest = self.computehash(c_stream, algorithm=algorithm) + cid_stream = self.open(entity, object_cid) + hex_digest = self.computehash(cid_stream, algorithm=algorithm) logging_info_statement = ( f"FileHashStore - get_hex_digest: Successfully calculated hex digest for pid: {pid}." @@ -888,8 +902,8 @@ def _mktempfile(self, stream, algorithm=None): return hex_digest_dict, tmp.name def put_metadata(self, pid, format_id, metadata): - """Store contents of metadata on disk using the hash of the given pid - and format_id as the permanent address. + """Store contents of metadata to `[self.root]/metadata` using the hash of the + given pid and format_id as the permanent address. Args: pid (string): Authority-based identifier. @@ -946,7 +960,7 @@ def put_metadata(self, pid, format_id, metadata): raise FileNotFoundError() def _mktmpmetadata(self, stream, format_id): - """Create a named temporary file with `metadata` bytes and `namespace`. + """Create a named temporary file with `stream` (metadata) and `format_id`. Args: stream (io.BufferedReader): Metadata stream. diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index d5ebfa9e..562c3d26 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -26,7 +26,7 @@ def store_object( """The `store_object` method is responsible for the atomic storage of objects to disk using a given InputStream and a persistent identifier (pid). Upon successful storage, the method returns a HashAddress object containing - relevant file information, such as the file's id, relative path, absolute + relevant file information, such as the file's cid, relative path, absolute path, duplicate object status, and hex digest map of algorithms and checksums. `store_object` also ensures that an object is stored only once by synchronizing multiple calls and rejecting calls to store duplicate objects. 
diff --git a/tests/filehashstore/test_filehashstore_interface.py b/tests/filehashstore/test_filehashstore_interface.py index 60af8e59..1c5a25f0 100644 --- a/tests/filehashstore/test_filehashstore_interface.py +++ b/tests/filehashstore/test_filehashstore_interface.py @@ -27,6 +27,21 @@ def test_store_address_length(pids, store): assert len(object_cid) == 64 +def test_store_object(pids, store): + """Test store object.""" + test_dir = "tests/testdata/" + entity = "objects" + format_id = "http://ns.dataone.org/service/types/v2.0" + for pid in pids.keys(): + path = Path(test_dir + pid.replace("/", "_")) + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + hash_address = store.store_object(pid, path) + _metadata_cid = store.store_metadata(pid, format_id, syspath) + assert hash_address.id == pids[pid]["object_cid"] + assert store.count(entity) == 3 + + def test_store_object_files_path(pids, store): """Test store object when given a path.""" test_dir = "tests/testdata/" @@ -37,7 +52,7 @@ def test_store_object_files_path(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename _hash_address = store.store_object(pid, path) - _metadata_id = store.store_metadata(pid, format_id, syspath) + _metadata_cid = store.store_metadata(pid, format_id, syspath) assert store.exists(entity, pids[pid]["object_cid"]) assert store.count(entity) == 3 @@ -52,7 +67,7 @@ def test_store_object_files_string(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename _hash_address = store.store_object(pid, path_string) - _metadata_id = store.store_metadata(pid, format_id, syspath) + _metadata_cid = store.store_metadata(pid, format_id, syspath) assert store.exists(entity, pids[pid]["object_cid"]) assert store.count(entity) == 3 @@ -411,6 +426,19 @@ def test_store_object_sparse_large_file(store): assert hash_address_id == pid_sha256_hex_digest +def test_store_metadata(pids, store): + """Test store metadata.""" + test_dir = "tests/testdata/" + format_id = "http://ns.dataone.org/service/types/v2.0" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + _hash_address = store.store_object(pid, path) + metadata_cid = store.store_metadata(pid, format_id, syspath) + assert metadata_cid == pids[pid]["metadata_cid"] + + def test_store_metadata_files_path(pids, store): """Test store metadata with path.""" test_dir = "tests/testdata/" @@ -423,6 +451,7 @@ def test_store_metadata_files_path(pids, store): _hash_address = store.store_object(pid, path) metadata_cid = store.store_metadata(pid, format_id, syspath) assert store.exists(entity, metadata_cid) + assert metadata_cid == pids[pid]["metadata_cid"] assert store.count(entity) == 3 @@ -591,7 +620,7 @@ def test_retrieve_object_pid_invalid(store): def test_retrieve_metadata(store): - """Test retrieve_metadata returns correct metadata data.""" + """Test retrieve_metadata returns correct metadata.""" test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" pid = "jtao.1700.1" @@ -600,13 +629,13 @@ def test_retrieve_metadata(store): syspath = Path(test_dir) / filename _hash_address = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, format_id, syspath) - sysmeta_ret = store.retrieve_metadata(pid, format_id) - sysmeta = syspath.read_bytes() - assert sysmeta.decode("utf-8") == sysmeta_ret + metadata_bytes = store.retrieve_metadata(pid, format_id) + metadata 
= syspath.read_bytes()
+    assert metadata.decode("utf-8") == metadata_bytes


-def test_retrieve_sysmeta_pid_invalid(store):
-    """Test retrieve_sysmeta raises error when supplied with bad pid."""
+def test_retrieve_metadata_bytes_pid_invalid(store):
+    """Test retrieve_metadata raises error when supplied with bad pid."""
     format_id = "http://ns.dataone.org/service/types/v2.0"
     pid = "jtao.1700.1"
     pid_does_not_exist = pid + "test"
@@ -614,14 +643,38 @@
         store.retrieve_metadata(pid_does_not_exist, format_id)


-def test_retrieve_sysmeta_pid_empty(store):
-    """Test retrieve_sysmeta raises error when supplied with empty pid."""
+def test_retrieve_metadata_bytes_pid_empty(store):
+    """Test retrieve_metadata raises error when supplied with empty pid."""
     format_id = "http://ns.dataone.org/service/types/v2.0"
     pid = " "
     with pytest.raises(ValueError):
         store.retrieve_metadata(pid, format_id)


+def test_retrieve_metadata_format_id_none(store):
+    """Test retrieve_metadata raises error when supplied with None format_id."""
+    format_id = None
+    pid = "jtao.1700.1"
+    with pytest.raises(ValueError):
+        store.retrieve_metadata(pid, format_id)
+
+
+def test_retrieve_metadata_format_id_empty(store):
+    """Test retrieve_metadata raises error when supplied with empty format_id."""
+    format_id = ""
+    pid = "jtao.1700.1"
+    with pytest.raises(ValueError):
+        store.retrieve_metadata(pid, format_id)
+
+
+def test_retrieve_metadata_format_id_empty_spaces(store):
+    """Test retrieve_metadata raises error when supplied with a format_id of empty spaces."""
+    format_id = " "
+    pid = "jtao.1700.1"
+    with pytest.raises(ValueError):
+        store.retrieve_metadata(pid, format_id)
+
+
 def test_delete_objects(pids, store):
     """Test delete_object successfully deletes objects."""
     test_dir = "tests/testdata/"
@@ -652,7 +705,7 @@ def test_delete_object_pid_none(store):


 def test_delete_metadata(pids, store):
-    """Test delete_metadata successfully deletes sysmeta."""
+    """Test delete_metadata successfully deletes metadata."""
     test_dir = "tests/testdata/"
     entity = "metadata"
     format_id = "http://ns.dataone.org/service/types/v2.0"
@@ -682,6 +735,22 @@ def test_delete_metadata_pid_none(store):
         store.delete_metadata(pid, format_id)


+def test_delete_metadata_format_id_empty(store):
+    """Test delete_metadata raises error when an empty format_id is supplied."""
+    format_id = " "
+    pid = "jtao.1700.1"
+    with pytest.raises(ValueError):
+        store.delete_metadata(pid, format_id)
+
+
+def test_delete_metadata_format_id_none(store):
+    """Test delete_metadata raises error when format_id is 'None'."""
+    format_id = None
+    pid = "jtao.1700.1"
+    with pytest.raises(ValueError):
+        store.delete_metadata(pid, format_id)
+
+
 def test_get_hex_digest(store):
     """Test get_hex_digest for expected value."""
     test_dir = "tests/testdata/"

From 917388ebdd334c895b7204fe29a67a03baafc975 Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Wed, 21 Jun 2023 09:36:33 -0700
Subject: [PATCH 009/165] Refactor store_metadata to use default format_id/namespace when 'None' is provided

---
 src/hashstore/filehashstore/filehashstore.py | 22 +++--
 src/hashstore/hashstore.py                   |  2 +-
 tests/conftest.py                            |  4 +-
 tests/filehashstore/test_filehashstore.py    |  6 +-
 .../test_filehashstore_interface.py          | 88 ++++++++++++-----
 5 files changed, 84 insertions(+), 38 deletions(-)

diff --git a/src/hashstore/filehashstore/filehashstore.py b/src/hashstore/filehashstore/filehashstore.py
index a96feabb..ef208c4a 100644
--- a/src/hashstore/filehashstore/filehashstore.py
+++ 
b/src/hashstore/filehashstore/filehashstore.py @@ -425,7 +425,7 @@ def store_object( ) return hash_address - def store_metadata(self, pid, format_id, metadata): + def store_metadata(self, pid, format_id=None, metadata): logging.debug( "FileHashStore - store_metadata: Request to store metadata for pid: %s", pid ) @@ -436,12 +436,16 @@ def store_metadata(self, pid, format_id, metadata): logging.error("FileHashStore - store_metadata: %s", exception_string) raise ValueError(exception_string) # Then format_id of the metadata - if format_id is None or format_id.replace(" ", "") == "": - exception_string = ( - f"Format_id cannot be None or empty, format_id: {format_id}" - ) + checked_format_id = None + if format_id is not None and format_id.replace(" ", "") == "": + exception_string = "Format_id cannot be empty." logging.error("FileHashStore - store_metadata: %s", exception_string) raise ValueError(exception_string) + elif format_id is None: + # Use default value set by hashstore config + checked_format_id = self.sysmeta_ns + else: + checked_format_id = format_id # Metadata content must be a str, path or stream and cannot be empty if isinstance(metadata, str): if metadata.replace(" ", "") == "": @@ -481,7 +485,7 @@ def store_metadata(self, pid, format_id, metadata): "FileHashStore - store_metadata: Attempting to store metadata for pid: %s", pid, ) - metadata_cid = self.put_metadata(pid, format_id, metadata) + metadata_cid = self.put_metadata(metadata, pid, checked_format_id) finally: # Release pid with self.metadata_lock: @@ -534,7 +538,9 @@ def retrieve_metadata(self, pid, format_id): logging.error("FileHashStore - retrieve_metadata: %s", exception_string) raise ValueError(exception_string) if format_id is None or format_id.replace(" ", "") == "": - exception_string = f"Format_id cannot be None or empty, format_id: {format_id}" + exception_string = ( + f"Format_id cannot be None or empty, format_id: {format_id}" + ) logging.error("FileHashStore - retrieve_metadata: %s", exception_string) raise ValueError(exception_string) @@ -901,7 +907,7 @@ def _mktempfile(self, stream, algorithm=None): logging.debug("FileHashStore - _mktempfile: Hex digests calculated.") return hex_digest_dict, tmp.name - def put_metadata(self, pid, format_id, metadata): + def put_metadata(self, metadata, pid, format_id): """Store contents of metadata to `[self.root]/metadata` using the hash of the given pid and format_id as the permanent address. diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 562c3d26..19c4d609 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -63,7 +63,7 @@ def store_object( raise NotImplementedError() @abstractmethod - def store_metadata(self, pid, format_id, metadata): + def store_metadata(self, pid, metadata, format_id): """The `store_metadata` method is responsible for adding and/or updating metadata (ex. `sysmeta`) to disk using a given path/stream, a persistent identifier `pid` and a metadata `format_id`. The metadata object consists of a header and a body diff --git a/tests/conftest.py b/tests/conftest.py index 49fb8468..d15e8875 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -41,8 +41,8 @@ def init_store(props): @pytest.fixture(name="pids") def init_pids(): """Shared test harness data. 
- object_cid: hex digest of the pid - metadata_cid: hex digest of the pid + store_metadata_namespace + - object_cid: hex digest of the pid + - metadata_cid: hex digest of the pid + store_metadata_namespace """ test_pids = { "doi:10.18739/A2901ZH2M": { diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index 3902963b..597156ac 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -389,7 +389,7 @@ def test_put_metadata_with_path(pids, store): for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - metadata_cid = store.store_metadata(pid, format_id, syspath) + metadata_cid = store.store_metadata(pid, syspath, format_id) assert store.exists(entity, metadata_cid) assert store.count(entity) == 3 @@ -402,7 +402,7 @@ def test_put_metadata_with_string(pids, store): for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = str(Path(test_dir) / filename) - metadata_cid = store.store_metadata(pid, format_id, syspath) + metadata_cid = store.store_metadata(pid, syspath, format_id) assert store.exists(entity, metadata_cid) assert store.count(entity) == 3 @@ -414,7 +414,7 @@ def test_put_metadata_cid(pids, store): for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - metadata_cid = store.store_metadata(pid, format_id, syspath) + metadata_cid = store.store_metadata(pid, syspath, format_id) assert metadata_cid == pids[pid]["metadata_cid"] diff --git a/tests/filehashstore/test_filehashstore_interface.py b/tests/filehashstore/test_filehashstore_interface.py index 1c5a25f0..56314419 100644 --- a/tests/filehashstore/test_filehashstore_interface.py +++ b/tests/filehashstore/test_filehashstore_interface.py @@ -37,7 +37,7 @@ def test_store_object(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename hash_address = store.store_object(pid, path) - _metadata_cid = store.store_metadata(pid, format_id, syspath) + _metadata_cid = store.store_metadata(pid, syspath, format_id) assert hash_address.id == pids[pid]["object_cid"] assert store.count(entity) == 3 @@ -52,7 +52,7 @@ def test_store_object_files_path(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename _hash_address = store.store_object(pid, path) - _metadata_cid = store.store_metadata(pid, format_id, syspath) + _metadata_cid = store.store_metadata(pid, syspath, format_id) assert store.exists(entity, pids[pid]["object_cid"]) assert store.count(entity) == 3 @@ -67,7 +67,7 @@ def test_store_object_files_string(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename _hash_address = store.store_object(pid, path_string) - _metadata_cid = store.store_metadata(pid, format_id, syspath) + _metadata_cid = store.store_metadata(pid, syspath, format_id) assert store.exists(entity, pids[pid]["object_cid"]) assert store.count(entity) == 3 @@ -435,10 +435,50 @@ def test_store_metadata(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename _hash_address = store.store_object(pid, path) - metadata_cid = store.store_metadata(pid, format_id, syspath) + metadata_cid = store.store_metadata(pid, syspath, format_id) assert metadata_cid == pids[pid]["metadata_cid"] +def test_store_metadata_format_id_is_none(pids, store): + """Confirm default name space is used when format_id is not supplied""" + test_dir = "tests/testdata/" + 
format_id = "http://ns.dataone.org/service/types/v2.0"
+    entity = "metadata"
+    for pid in pids.keys():
+        path = test_dir + pid.replace("/", "_")
+        filename = pid.replace("/", "_") + ".xml"
+        syspath = Path(test_dir) / filename
+        _hash_address = store.store_object(pid, path)
+        metadata_cid = store.store_metadata(pid, syspath)
+        metadata_cid_stream = store.open(entity, metadata_cid)
+        metadata_cid_content = (
+            metadata_cid_stream.read().decode("utf-8").split("\x00", 1)
+        )
+        metadata_cid_stream.close()
+        metadata_format = metadata_cid_content[0]
+        assert metadata_format == format_id
+
+
+def test_store_metadata_format_id_is_custom(pids, store):
+    """Confirm custom format_id is used when it is supplied."""
+    test_dir = "tests/testdata/"
+    format_id = "http://hashstore.world.com/types/v1.0"
+    entity = "metadata"
+    for pid in pids.keys():
+        path = test_dir + pid.replace("/", "_")
+        filename = pid.replace("/", "_") + ".xml"
+        syspath = Path(test_dir) / filename
+        _hash_address = store.store_object(pid, path)
+        metadata_cid = store.store_metadata(pid, syspath, format_id)
+        metadata_cid_stream = store.open(entity, metadata_cid)
+        metadata_cid_content = (
+            metadata_cid_stream.read().decode("utf-8").split("\x00", 1)
+        )
+        metadata_cid_stream.close()
+        metadata_format = metadata_cid_content[0]
+        assert metadata_format == format_id
+
+
 def test_store_metadata_files_path(pids, store):
     """Test store metadata with path."""
     test_dir = "tests/testdata/"
@@ -449,7 +489,7 @@ def test_store_metadata_files_path(pids, store):
         filename = pid.replace("/", "_") + ".xml"
         syspath = Path(test_dir) / filename
         _hash_address = store.store_object(pid, path)
-        metadata_cid = store.store_metadata(pid, format_id, syspath)
+        metadata_cid = store.store_metadata(pid, syspath, format_id)
         assert store.exists(entity, metadata_cid)
         assert metadata_cid == pids[pid]["metadata_cid"]
         assert store.count(entity) == 3

@@ -465,7 +505,7 @@ def test_store_metadata_files_string(pids, store):
         filename = pid.replace("/", "_") + ".xml"
         syspath_string = str(Path(test_dir) / filename)
         _hash_address = store.store_object(pid, path_string)
-        metadata_cid = store.store_metadata(pid, format_id, syspath_string)
+        metadata_cid = store.store_metadata(pid, syspath_string, format_id)
         assert store.exists(entity, metadata_cid)
         assert store.count(entity) == 3

@@ -481,7 +521,7 @@ def test_store_metadata_files_input_stream(pids, store):
         filename = pid.replace("/", "_") + ".xml"
         syspath_string = str(Path(test_dir) / filename)
         syspath_stream = io.open(syspath_string, "rb")
-        _metadata_cid = store.store_metadata(pid, format_id, syspath_stream)
+        _metadata_cid = store.store_metadata(pid, syspath_stream, format_id)
         syspath_stream.close()
         assert store.count(entity) == 3

@@ -494,7 +534,7 @@ def test_store_metadata_pid_empty(store):
     filename = pid.replace("/", "_") + ".xml"
     syspath_string = str(Path(test_dir) / filename)
     with pytest.raises(ValueError):
-        store.store_metadata(pid, format_id, syspath_string)
+        store.store_metadata(pid, syspath_string, format_id)


 def test_store_metadata_pid_empty_spaces(store):
@@ -505,7 +545,7 @@ def test_store_metadata_pid_empty_spaces(store):
     filename = pid.replace("/", "_") + ".xml"
     syspath_string = str(Path(test_dir) / filename)
     with pytest.raises(ValueError):
-        store.store_metadata(pid, format_id, syspath_string)
+        store.store_metadata(pid, syspath_string, format_id)


 def test_store_metadata_format_id_empty(store):
@@ -516,7 +556,7 @@ def test_store_metadata_format_id_empty(store):
     filename = pid.replace("/", "_") + 
".xml" syspath_string = str(Path(test_dir) / filename) with pytest.raises(ValueError): - store.store_metadata(pid, format_id, syspath_string) + store.store_metadata(pid, syspath_string, format_id) def test_store_metadata_pid_format_id_spaces(store): @@ -527,7 +567,7 @@ def test_store_metadata_pid_format_id_spaces(store): filename = pid.replace("/", "_") + ".xml" syspath_string = str(Path(test_dir) / filename) with pytest.raises(ValueError): - store.store_metadata(pid, format_id, syspath_string) + store.store_metadata(pid, syspath_string, format_id) def test_store_metadata_metadata_empty(store): @@ -536,7 +576,7 @@ def test_store_metadata_metadata_empty(store): format_id = "http://ns.dataone.org/service/types/v2.0" syspath_string = " " with pytest.raises(TypeError): - store.store_metadata(pid, format_id, syspath_string) + store.store_metadata(pid, syspath_string, format_id) def test_store_metadata_metadata_none(store): @@ -545,7 +585,7 @@ def test_store_metadata_metadata_none(store): format_id = "http://ns.dataone.org/service/types/v2.0" syspath_string = None with pytest.raises(TypeError): - store.store_metadata(pid, format_id, syspath_string) + store.store_metadata(pid, syspath_string, format_id) def test_store_metadata_metadata_cid(pids, store): @@ -557,7 +597,7 @@ def test_store_metadata_metadata_cid(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename _hash_address = store.store_object(pid, path) - metadata_cid = store.store_metadata(pid, format_id, syspath) + metadata_cid = store.store_metadata(pid, syspath, format_id) assert metadata_cid == pids[pid]["metadata_cid"] @@ -571,12 +611,12 @@ def test_store_metadata_thread_lock(store): filename = pid + ".xml" syspath = Path(test_dir) / filename _hash_address = store.store_object(pid, path) - store.store_metadata(pid, format_id, syspath) + store.store_metadata(pid, syspath, format_id) # Start threads - thread1 = Thread(target=store.store_metadata, args=(pid, format_id, syspath)) - thread2 = Thread(target=store.store_metadata, args=(pid, format_id, syspath)) - thread3 = Thread(target=store.store_metadata, args=(pid, format_id, syspath)) - thread4 = Thread(target=store.store_metadata, args=(pid, format_id, syspath)) + thread1 = Thread(target=store.store_metadata, args=(pid, syspath, format_id)) + thread2 = Thread(target=store.store_metadata, args=(pid, syspath, format_id)) + thread3 = Thread(target=store.store_metadata, args=(pid, syspath, format_id)) + thread4 = Thread(target=store.store_metadata, args=(pid, syspath, format_id)) thread1.start() thread2.start() thread3.start() @@ -597,7 +637,7 @@ def test_retrieve_object(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename hash_address = store.store_object(pid, path) - store.store_metadata(pid, format_id, syspath) + store.store_metadata(pid, syspath, format_id) obj_stream = store.retrieve_object(pid) sha256_hex = store.computehash(obj_stream) obj_stream.close() @@ -628,7 +668,7 @@ def test_retrieve_metadata(store): filename = pid + ".xml" syspath = Path(test_dir) / filename _hash_address = store.store_object(pid, path) - _metadata_cid = store.store_metadata(pid, format_id, syspath) + _metadata_cid = store.store_metadata(pid, syspath, format_id) metadata_bytes = store.retrieve_metadata(pid, format_id) metadata = syspath.read_bytes() assert metadata.decode("utf-8") == metadata_bytes @@ -685,7 +725,7 @@ def test_delete_objects(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename 
_hash_address = store.store_object(pid, path)
-        _metadata_cid = store.store_metadata(pid, format_id, syspath)
+        _metadata_cid = store.store_metadata(pid, syspath, format_id)
         store.delete_object(pid)
     assert store.count(entity) == 0

@@ -714,7 +754,7 @@ def test_delete_metadata(pids, store):
         filename = pid.replace("/", "_") + ".xml"
         syspath = Path(test_dir) / filename
         _hash_address = store.store_object(pid, path)
-        _metadata_cid = store.store_metadata(pid, format_id, syspath)
+        _metadata_cid = store.store_metadata(pid, syspath, format_id)
         store.delete_metadata(pid, format_id)
     assert store.count(entity) == 0

@@ -760,7 +800,7 @@ def test_get_hex_digest(store):
     filename = pid + ".xml"
     syspath = Path(test_dir) / filename
     _hash_address = store.store_object(pid, path)
-    _metadata_cid = store.store_metadata(pid, format_id, syspath)
+    _metadata_cid = store.store_metadata(pid, syspath, format_id)
     sha3_256_hex_digest = (
        "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf"
    )

From 0db5f549cd6074ca48a5780f8f1c6905d81d6257 Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Wed, 21 Jun 2023 09:39:41 -0700
Subject: [PATCH 010/165] Fix bug left behind from testing RE: non-default arguments

---
 src/hashstore/filehashstore/filehashstore.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/hashstore/filehashstore/filehashstore.py b/src/hashstore/filehashstore/filehashstore.py
index ef208c4a..daa41997 100644
--- a/src/hashstore/filehashstore/filehashstore.py
+++ b/src/hashstore/filehashstore/filehashstore.py
@@ -425,7 +425,7 @@ def store_object(
         )
         return hash_address

-    def store_metadata(self, pid, format_id=None, metadata):
+    def store_metadata(self, pid, metadata, format_id=None):
         logging.debug(
             "FileHashStore - store_metadata: Request to store metadata for pid: %s", pid
         )

From 0fe4f7b2d978a8d265e19cadcc676ad2c9b743a5 Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Wed, 21 Jun 2023 12:36:54 -0700
Subject: [PATCH 011/165] Refactor default_algo_list to be initialized by translating DataONE standardized algorithm values, fix latent bug with '_mktempfile' and add new pytests

---
 src/hashstore/filehashstore/filehashstore.py | 111 ++++++++++++------
 tests/filehashstore/test_filehashstore.py    |  74 +++++++++++-
 .../test_filehashstore_interface.py          |  24 +++-
 3 files changed, 167 insertions(+), 42 deletions(-)

diff --git a/src/hashstore/filehashstore/filehashstore.py b/src/hashstore/filehashstore/filehashstore.py
index daa41997..5a7fa684 100644
--- a/src/hashstore/filehashstore/filehashstore.py
+++ b/src/hashstore/filehashstore/filehashstore.py
@@ -46,10 +46,6 @@ class FileHashStore(HashStore):
     # Permissions settings for writing files and creating directories
     fmode = 0o664
     dmode = 0o755
-    # Default and other algorithm list for FileHashStore
-    # The default algorithm list includes the hash algorithms calculated when
-    # storing an object to disk and returned to the caller after successful storage.
-    default_algo_list = ["sha1", "sha256", "sha384", "sha512", "md5"]
     # The other algorithm list consists of additional algorithms that can be included
     # for calculating when storing objects, in addition to the default list.
     other_algo_list = [
@@ -125,6 +121,8 @@ def __init__(self, properties=None):
                + " Writing configuration file."
) self.put_properties(properties) + # Default algorithm list for FileHashStore based on config file written + self._set_default_algorithms() # Complete initialization/instantiation by setting store directories self.objects = self.root + "/objects" self.metadata = self.root + "/metadata" @@ -139,7 +137,7 @@ def __init__(self, properties=None): # Cannot instantiate or initialize FileHashStore without config raise ValueError(exception_string) - # Configuration Methods + # Configuration and Related Methods def get_properties(self): """Get and return the contents of the current HashStore configuration. @@ -269,22 +267,12 @@ def _build_hashstore_yaml_string( # Algorithm values supported by python hashlib 3.9.0+ for File Hash Store (FHS) # The default algorithm list includes the hash algorithms calculated when storing an # object to disk and returned to the caller after successful storage. - filehashstore_default_algo_list: - - "sha1" - - "sha256" - - "sha384" - - "sha512" - - "md5" - # The other algorithm list consists of additional algorithms that can be included for - # calculating when storing objects, in addition to the default list. - filehashstore_other_algo_list: - - "sha224" - - "sha3_224" - - "sha3_256" - - "sha3_384" - - "sha3_512" - - "blake2b" - - "blake2s" + store_default_algo_list: + - "MD5" + - "SHA-1" + - "SHA-256" + - "SHA-384" + - "SHA-512" """ return hashstore_configuration_yaml @@ -321,13 +309,47 @@ def _validate_properties(self, properties): raise ValueError(exception_string) return properties + def _set_default_algorithms(self): + """Set the default algorithms to calculate when storing objects.""" + + def lookup_algo(algo): + """Translate DataONE controlled algorithms to python hashlib values: + https://dataoneorg.github.io/api-documentation/apis/Types.html#Types.ChecksumAlgorithm + """ + dataone_algo_translation = { + "MD5": "md5", + "SHA-1": "sha1", + "SHA-256": "sha256", + "SHA-384": "sha384", + "SHA-512": "sha512", + } + return dataone_algo_translation[algo] + + if not os.path.exists(self.hashstore_configuration_yaml): + exception_string = "hashstore.yaml not found in store root path." 
+            logging.critical(
+                "FileHashStore - set_default_algorithms: %s", exception_string
+            )
+            raise FileNotFoundError(exception_string)
+        with open(self.hashstore_configuration_yaml, "r", encoding="utf-8") as file:
+            yaml_data = yaml.safe_load(file)
+
+        yaml_store_default_algo_list = yaml_data["store_default_algo_list"]
+        translated_default_algo_list = []
+        for algo in yaml_store_default_algo_list:
+            translated_default_algo_list.append(lookup_algo(algo))
+
+        # Set class variable
+        self.default_algo_list = translated_default_algo_list
+        return
+
     # Public API / HashStore Interface Methods

     def store_object(
         self,
         pid,
         data,
-        additional_algorithm="sha256",
+        additional_algorithm=None,
         checksum=None,
         checksum_algorithm=None,
     ):
@@ -755,7 +777,9 @@ def _move_and_get_checksums(
             + f" file and calculating checksums for pid: {pid}"
         )
         logging.debug(debug_tmp_file_str)
-        hex_digests, tmp_file_name = self._mktempfile(stream, additional_algorithm)
+        hex_digests, tmp_file_name = self._mktempfile(
+            stream, additional_algorithm, checksum_algorithm
+        )
         logging.debug(
             "FileHashStore - _move_and_get_checksums: Temp file created: %s",
             tmp_file_name,
@@ -837,15 +861,15 @@ def _move_and_get_checksums(

         return object_cid, rel_file_path, abs_file_path, is_duplicate, hex_digests

-    def _mktempfile(self, stream, algorithm=None):
-        """Create a named temporary file from a `Stream` object and
-        return its filename and a dictionary of its algorithms and hex digests.
-        If an algorithm is provided, it will add the respective hex digest to
-        the dictionary.
+    def _mktempfile(self, stream, additional_algorithm=None, checksum_algorithm=None):
+        """Create a named temporary file from a `Stream` object and return its filename
+        and a dictionary of its algorithms and hex digests. If an additional and/or checksum
+        algorithm is provided, it will add the respective hex digest to the dictionary.

         Args:
             stream (io.BufferedReader): Object stream.
-            algorithm (string): Algorithm of additional hex digest to generate.
+            additional_algorithm (string): Algorithm of additional hex digest to generate.
+            checksum_algorithm (string): Algorithm of checksum hex digest to generate.

         Returns:
             hex_digest_dict, tmp.name (tuple pack):
@@ -869,16 +893,27 @@ def _mktempfile(self, stream, algorithm=None):
         finally:
             os.umask(oldmask)

-        # Additional hash object to digest
-        if algorithm is not None:
-            self.clean_algorithm(algorithm)
-            if algorithm in self.other_algo_list:
+        # Additional hash objects to digest
+        if checksum_algorithm is not None:
+            self.clean_algorithm(checksum_algorithm)
+            if checksum_algorithm in self.other_algo_list:
+                debug_additional_other_algo_str = (
+                    f"FileHashStore - _mktempfile: checksum algorithm: {checksum_algorithm}"
+                    + " found in other_algo_lists, adding to list of algorithms to calculate."
+                )
+                logging.debug(debug_additional_other_algo_str)
+                algorithm_list_to_calculate.append(checksum_algorithm)
+        if additional_algorithm is not None:
+            self.clean_algorithm(additional_algorithm)
+            if additional_algorithm in self.other_algo_list:
                 debug_additional_other_algo_str = (
-                    f"FileHashStore - _mktempfile: additional algorithm: {algorithm} found"
-                    + " in other_algo_lists, adding to list of algorithms to calculate."
+                    f"FileHashStore - _mktempfile: additional algorithm: {additional_algorithm}"
+                    + " found in other_algo_lists, adding to list of algorithms to calculate."
) logging.debug(debug_additional_other_algo_str) - algorithm_list_to_calculate.append(algorithm) + algorithm_list_to_calculate.append(additional_algorithm) + # Remove duplicates + algorithm_list_to_calculate = set(algorithm_list_to_calculate) logging.debug( "FileHashStore - _mktempfile: tmp file created: %s, calculating hex digests.", @@ -925,7 +960,7 @@ def put_metadata(self, metadata, pid, format_id): # Create metadata tmp file and write to it metadata_stream = Stream(metadata) with closing(metadata_stream): - metadata_tmp = self._mktmpmetadata(metadata_stream, format_id) + metadata_tmp = self._mktempmetadata(metadata_stream, format_id) # Get target and related paths (permanent location) metadata_cid = self.get_sha256_hex_digest(pid + format_id) @@ -965,7 +1000,7 @@ def put_metadata(self, metadata, pid, format_id): logging.error("FileHashStore - put_metadata: %s", exception_string) raise FileNotFoundError() - def _mktmpmetadata(self, stream, format_id): + def _mktempmetadata(self, stream, format_id): """Create a named temporary file with `stream` (metadata) and `format_id`. Args: diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index 597156ac..da4a9e18 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -111,6 +111,19 @@ def test_validate_properties_incorrect_type(store): store._validate_properties(properties) +def test_set_default_algorithms_missing_yaml(store, pids): + """Confirm set_default_algorithms raises FileNotFoundError when hashstore.yaml + not found.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + store.put_object(pid, path) + os.remove(store.hashstore_configuration_yaml) + with pytest.raises(FileNotFoundError): + # pylint: disable=W0212 + store._set_default_algorithms() + + def test_pids_length(pids): """Ensure test harness pids are present.""" assert len(pids) == 3 @@ -327,6 +340,63 @@ def test_move_and_get_checksums_duplicates_raises_error(pids, store): assert store.count(entity) == 3 +def test_mktempfile_additional_algo(store): + """Test _mktempfile returns correct hex digests for additional algorithm.""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid + input_stream = io.open(path, "rb") + checksum_algo = "sha3_256" + checksum_correct = ( + "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" + ) + # pylint: disable=W0212 + hex_digests, _ = store._mktempfile(input_stream, additional_algorithm=checksum_algo) + input_stream.close() + assert hex_digests.get("sha3_256") == checksum_correct + + +def test_mktempfile_checksum_algo(store): + """Test _mktempfile returns correct hex digests for checksum algorithm.""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid + input_stream = io.open(path, "rb") + checksum_algo = "sha3_256" + checksum_correct = ( + "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" + ) + # pylint: disable=W0212 + hex_digests, _ = store._mktempfile(input_stream, checksum_algorithm=checksum_algo) + input_stream.close() + assert hex_digests.get("sha3_256") == checksum_correct + + +def test_mktempfile_checksum_and_additional_algo(store): + """Test _mktempfile returns correct hex digests for checksum algorithm.""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid + input_stream = io.open(path, "rb") + additional_algo = "sha224" + additional_algo_checksum = ( + 
"9b3a96f434f3c894359193a63437ef86fbd5a1a1a6cc37f1d5013ac1" + ) + checksum_algo = "sha3_256" + checksum_correct = ( + "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" + ) + # pylint: disable=W0212 + hex_digests, _ = store._mktempfile( + input_stream, + additional_algorithm=additional_algo, + checksum_algorithm=checksum_algo, + ) + input_stream.close() + assert hex_digests.get("sha3_256") == checksum_correct + assert hex_digests.get("sha224") == additional_algo_checksum + + def test_mktempfile_hex_digests(pids, store): """Test _mktempfile returns correct hex digests.""" test_dir = "tests/testdata/" @@ -418,7 +488,7 @@ def test_put_metadata_cid(pids, store): assert metadata_cid == pids[pid]["metadata_cid"] -def test_mktmpmetadata(pids, store): +def test_mktempmetadata(pids, store): """Test mktmpmetadata creates tmpFile.""" test_dir = "tests/testdata/" entity = "metadata" @@ -428,7 +498,7 @@ def test_mktmpmetadata(pids, store): sys_stream = io.open(syspath, "rb") format_id = "http://ns.dataone.org/service/types/v2.0" # pylint: disable=W0212 - tmp_name = store._mktmpmetadata(sys_stream, format_id) + tmp_name = store._mktempmetadata(sys_stream, format_id) sys_stream.close() assert store.exists(entity, tmp_name) diff --git a/tests/filehashstore/test_filehashstore_interface.py b/tests/filehashstore/test_filehashstore_interface.py index 56314419..bb65e8b2 100644 --- a/tests/filehashstore/test_filehashstore_interface.py +++ b/tests/filehashstore/test_filehashstore_interface.py @@ -255,16 +255,36 @@ def test_store_object_checksum_correct(store): entity = "objects" pid = "jtao.1700.1" path = test_dir + pid - algorithm_other = "sha3_256" + checksum_algo = "sha3_256" checksum_correct = ( "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) _hash_address = store.store_object( - pid, path, checksum=checksum_correct, checksum_algorithm=algorithm_other + pid, path, checksum=checksum_correct, checksum_algorithm=checksum_algo ) assert store.count(entity) == 1 +def test_store_object_checksum_correct_and_additional_algo(store): + """Test store object successfully stores with good checksum and same additional algorithm""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid + algorithm_additional = "sha3_256" + algorithm_checksum = "sha3_256" + checksum_correct = ( + "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" + ) + hash_address = store.store_object( + pid, + path, + additional_algorithm=algorithm_additional, + checksum=checksum_correct, + checksum_algorithm=algorithm_checksum, + ) + assert hash_address.hex_digests.get("sha3_256") == checksum_correct + + def test_store_object_checksum_algorithm_empty(store): """Test store object raises error when checksum supplied with no checksum_algorithm.""" test_dir = "tests/testdata/" From 155be3393bc4795b51402a9f971c85eb9499f0e6 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 21 Jun 2023 12:41:58 -0700 Subject: [PATCH 012/165] Update README.md to add information on how to run tests --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index fd9bd193..7d77c837 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,10 @@ HashStore is a python package, and built using the [Python Poetry](https://pytho To install `hashstore` locally, create a virtual environment for python 3.9+, install poetry, and then install or build the package with `poetry install` or `poetry build`, respectively. +To run tests, navigate to the root directory and run `pytest -s`. 
The test suite contains tests that +take a longer time to run (relating to the storage of large files) - to execute all tests, run +`pytest --run-slow`. To see detailed + ## License ``` Copyright [2022] [Regents of the University of California] From 90aa9475f3339806b3e969055f97c4521945c663 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 21 Jun 2023 12:52:01 -0700 Subject: [PATCH 013/165] Review and cleanup 'test_filehashstore' pytests and comments --- tests/filehashstore/test_filehashstore.py | 67 +++++++++++++---------- 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index da4a9e18..5552f194 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -6,6 +6,11 @@ from hashstore.filehashstore.filehashstore import FileHashStore +def test_pids_length(pids): + """Ensure test harness pids are present.""" + assert len(pids) == 3 + + def test_init_put_properties_hashstore_yaml_exists(store): """Verify properties file present in store root directory.""" assert os.path.exists(store.hashstore_configuration_yaml) @@ -90,7 +95,7 @@ def test_validate_properties_missing_key(store): def test_validate_properties_key_value_is_none(store): - """Confirm exception raised when value from key is 'None'""" + """Confirm exception raised when value from key is 'None'.""" properties = { "store_path": "/etc/test", "store_depth": 3, @@ -105,7 +110,7 @@ def test_validate_properties_key_value_is_none(store): def test_validate_properties_incorrect_type(store): """Confirm exception raised when key missing in properties.""" - properties = "etc/filehashstore" + properties = "etc/filehashstore/hashstore.yaml" with pytest.raises(ValueError): # pylint: disable=W0212 store._validate_properties(properties) @@ -124,11 +129,6 @@ def test_set_default_algorithms_missing_yaml(store, pids): store._set_default_algorithms() -def test_pids_length(pids): - """Ensure test harness pids are present.""" - assert len(pids) == 3 - - def test_put_object_files_path(pids, store): """Test put objects with path object.""" test_dir = "tests/testdata/" @@ -225,7 +225,7 @@ def test_put_object_hex_digests(pids, store): def test_put_object_additional_algorithm(pids, store): - """Check put returns additional algorithm in hex digests.""" + """Check put_object returns additional algorithm in hex digests.""" test_dir = "tests/testdata/" for pid in pids.keys(): algo = "sha224" @@ -237,7 +237,7 @@ def test_put_object_additional_algorithm(pids, store): def test_put_object_with_correct_checksums(pids, store): - """Check put success with good checksum supplied.""" + """Check put_object success with valid checksum and checksum algorithm supplied.""" test_dir = "tests/testdata/" for pid in pids.keys(): algo = "sha224" @@ -275,8 +275,8 @@ def test_move_and_get_checksums_id(pids, store): _, ) = store._move_and_get_checksums(pid, input_stream) input_stream.close() - metadata_cid = store.get_sha256_hex_digest(pid) - assert move_id == metadata_cid + object_cid = store.get_sha256_hex_digest(pid) + assert move_id == object_cid def test_move_and_get_checksums_hex_digests(pids, store): @@ -302,7 +302,7 @@ def test_move_and_get_checksums_hex_digests(pids, store): def test_move_and_get_checksums_abs_path(pids, store): - """Test move returns correct absolute path.""" + """Test move returns correct absolute path that exists.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -397,6 
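The `object_cid` rename above reflects how FileHashStore derives permanent addresses:
`get_sha256_hex_digest(pid)` for data objects, and `get_sha256_hex_digest(pid + format_id)`
for metadata documents. The sketch below mirrors that derivation as an illustration only,
assuming the UTF-8 encoding implied by the rest of the codebase; the pid and format_id are
the ones used in these tests.
```
import hashlib

def get_sha256_hex_digest(string):
    # Mirrors FileHashStore.get_sha256_hex_digest (UTF-8 encoding assumed)
    return hashlib.sha256(string.encode("utf-8")).hexdigest()

pid = "jtao.1700.1"
format_id = "http://ns.dataone.org/service/types/v2.0"
object_cid = get_sha256_hex_digest(pid)  # permanent address of the data object
metadata_cid = get_sha256_hex_digest(pid + format_id)  # permanent address of its metadata
```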
@@ -397,6 +397,25 @@ def test_mktempfile_checksum_and_additional_algo(store):
 assert hex_digests.get("sha224") == additional_algo_checksum


+def test_mktempfile_checksum_and_additional_algo_duplicate(store):
+ """Test _mktempfile succeeds with duplicate algorithms (de-duplicates)."""
+ test_dir = "tests/testdata/"
+ pid = "jtao.1700.1"
+ path = test_dir + pid
+ input_stream = io.open(path, "rb")
+ additional_algo = "sha224"
+ checksum_algo = "sha224"
+ checksum_correct = "9b3a96f434f3c894359193a63437ef86fbd5a1a1a6cc37f1d5013ac1"
+ # pylint: disable=W0212
+ hex_digests, _ = store._mktempfile(
+ input_stream,
+ additional_algorithm=additional_algo,
+ checksum_algorithm=checksum_algo,
+ )
+ input_stream.close()
+ assert hex_digests.get("sha224") == checksum_correct
+
+
 def test_mktempfile_hex_digests(pids, store):
 """Test _mktempfile returns correct hex digests."""
 test_dir = "tests/testdata/"
@@ -413,7 +432,7 @@ def test_mktempfile_hex_digests(pids, store):
 assert hex_digests.get("sha512") == pids[pid]["sha512"]


-def test_mktempfile_object(pids, store):
+def test_mktempfile_tmpfile_object(pids, store):
 """Test _mktempfile creates file successfully."""
 test_dir = "tests/testdata/"
 for pid in pids.keys():
@@ -425,19 +444,6 @@
 assert os.path.isfile(tmp_file_name) is True


-def test_mktempfile_with_algorithm(pids, store):
- """Test _mktempfile returns additional hex digest when supplied."""
- test_dir = "tests/testdata/"
- for pid in pids.keys():
- path = test_dir + pid.replace("/", "_")
- input_stream = io.open(path, "rb")
- algo = "sha224"
- # pylint: disable=W0212
- hex_digests, _ = store._mktempfile(input_stream, algo)
- input_stream.close()
- assert hex_digests.get("sha224") == pids[pid]["sha224"]
-
-
 def test_mktempfile_with_unsupported_algorithm(pids, store):
 """Test _mktempfile raises error when bad algorithm supplied."""
 test_dir = "tests/testdata/"
@@ -447,12 +453,15 @@
 algo = "md2"
 with pytest.raises(ValueError):
 # pylint: disable=W0212
- _, _ = store._mktempfile(input_stream, algo)
+ _, _ = store._mktempfile(input_stream, additional_algorithm=algo)
+ with pytest.raises(ValueError):
+ # pylint: disable=W0212
+ _, _ = store._mktempfile(input_stream, checksum_algorithm=algo)
 input_stream.close()


 def test_put_metadata_with_path(pids, store):
- """Test put metadata with path object."""
+ """Test put_metadata with path object."""
 entity = "metadata"
 test_dir = "tests/testdata/"
 format_id = "http://ns.dataone.org/service/types/v2.0"
@@ -465,7 +474,7 @@

 def test_put_metadata_with_string(pids, store):
- """Test put metadata with string."""
+ """Test put_metadata with string."""
 entity = "metadata"
 test_dir = "tests/testdata/"
 format_id = "http://ns.dataone.org/service/types/v2.0"

From a4a012095a6ea90696eed03a4614e78069330f9f Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Wed, 21 Jun 2023 13:00:15 -0700
Subject: [PATCH 014/165] Review and cleanup 'test_filehashstore_interface'
 pytests and comments

---
 .../test_filehashstore_interface.py | 38 +++++++++++++++----
 1 file changed, 31 insertions(+), 7 deletions(-)

diff --git a/tests/filehashstore/test_filehashstore_interface.py b/tests/filehashstore/test_filehashstore_interface.py
index bb65e8b2..35ad3d7e 100644
--- a/tests/filehashstore/test_filehashstore_interface.py
+++ b/tests/filehashstore/test_filehashstore_interface.py
@@ -266,7 +290,31 @@ def test_store_object_checksum_correct(store):

 def test_store_object_checksum_correct_and_additional_algo(store):
- """Test store object successfully stores with good checksum and same additional algorithm"""
+ """Test store object successfully stores with good checksum and an additional algorithm."""
+ test_dir = "tests/testdata/"
+ pid = "jtao.1700.1"
+ path = test_dir + pid
+ algorithm_additional = "sha224"
+ sha224_additional_checksum = (
+ "9b3a96f434f3c894359193a63437ef86fbd5a1a1a6cc37f1d5013ac1"
+ )
+ algorithm_checksum = "sha3_256"
+ checksum_correct = (
+ "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf"
+ )
+ hash_address = store.store_object(
+ pid,
+ path,
+ additional_algorithm=algorithm_additional,
+ checksum=checksum_correct,
+ checksum_algorithm=algorithm_checksum,
+ )
+ assert hash_address.hex_digests.get("sha224") == sha224_additional_checksum
+ assert hash_address.hex_digests.get("sha3_256") == checksum_correct
+
+
+def test_store_object_checksum_correct_and_additional_algo_duplicate(store):
+ """Test store object successfully stores with good checksum and a duplicate additional algorithm."""
 test_dir = "tests/testdata/"
 pid = "jtao.1700.1"
 path = test_dir + pid
@@ -298,7 +322,7 @@

 def test_store_object_checksum_empty(store):
- """Test store object raises error when checksum_algorithm supplied and checksum is empty."""
+ """Test store object raises error when checksum_algorithm supplied with empty checksum."""
 test_dir = "tests/testdata/"
 pid = "jtao.1700.1"
 path = test_dir + pid
@@ -480,7 +504,7 @@ def test_store_metadata_format_id_is_none(pids, store):

 def test_store_metadata_format_id_is_custom(pids, store):
- """Confirm default name space is used when format_id is not supplied"""
+ """Confirm new format_id is stored when default 'None' is overridden."""
 test_dir = "tests/testdata/"
 format_id = "http://hashstore.world.com/types/v1.0"
 entity = "metadata"
@@ -558,7 +582,7 @@

 def test_store_metadata_pid_empty_spaces(store):
- """Test store metadata raises error with empty string."""
+ """Test store metadata raises error with empty spaces."""
 test_dir = "tests/testdata/"
 format_id = "http://ns.dataone.org/service/types/v2.0"
 pid = " "
@@ -580,7 +604,7 @@

 def test_store_metadata_pid_format_id_spaces(store):
- """Test store metadata raises error with empty string."""
+ """Test store metadata raises error with empty spaces."""
 test_dir = "tests/testdata/"
 format_id = " "
 pid = "jtao.1700.1"
@@ -600,7 +624,7 @@

 def test_store_metadata_metadata_none(store):
- """Test store metadata raises error with empty metadata string."""
+ """Test store metadata raises error with 'None' metadata."""
 pid = "jtao.1700.1"
 format_id = "http://ns.dataone.org/service/types/v2.0"
 syspath_string = None
@@ -728,7 +752,7 @@

 def test_retrieve_metadata_format_id_empty_spaces(store):
- """Test retrieve_metadata raises error when supplied with empty format_id."""
+ """Test retrieve_metadata raises error when supplied with empty spaces format_id."""
 format_id = " "
 pid = "jtao.1700.1"
 with pytest.raises(ValueError):
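Patch 014 above pins down how `store_object` combines checksum verification with an
extra requested digest. A minimal usage sketch follows, assuming an initialized
FileHashStore instance named `store` (the pytest fixture used in these tests), with the
pid, path, and digest values borrowed directly from the test data:
```
# Verify the object against a known checksum while also requesting
# an extra sha224 digest in the returned hex_digests dictionary.
hash_address = store.store_object(
    "jtao.1700.1",
    "tests/testdata/jtao.1700.1",
    additional_algorithm="sha224",
    checksum="b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf",
    checksum_algorithm="sha3_256",
)
assert hash_address.hex_digests.get("sha224") == (
    "9b3a96f434f3c894359193a63437ef86fbd5a1a1a6cc37f1d5013ac1"
)
```
A mismatched checksum raises a ValueError from `_move_and_get_checksums` instead of
storing the object.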
From dd5a559ef512eb8f0df0808e773dffd066435480 Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Thu, 22 Jun 2023 08:29:25 -0700
Subject: [PATCH 015/165] Update README.md with usage example

---
 README.md | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/README.md b/README.md
index 7d77c837..7046f7d2 100644
--- a/README.md
+++ b/README.md
@@ -27,6 +27,47 @@ To run tests, navigate to the root directory and run `pytest -s`. The test suite
 take a longer time to run (relating to the storage of large files) - to execute all tests, run
 `pytest --run-slow`. To see detailed output while the tests run, include the `-s` flag.

+## Usage Example
+```
+# To view more details about the Public API - see `hashstore.py` interface documentation
+
+# Instantiate a factory
+factory = HashStoreFactory()
+
+# Create a properties dictionary with the required fields
+hashstore_path = "/path/to/your/store"
+properties = {
+ "store_path": hashstore_path,
+ "store_depth": 3,
+ "store_width": 2,
+ "store_algorithm": "sha256",
+ "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0",
+}
+
+# Get HashStore from factory
+module_name = "hashstore.filehashstore.filehashstore"
+class_name = "FileHashStore"
+my_store = factory.get_hashstore(module_name, class_name, properties)
+
+# Store objects (.../[hashstore_path]/objects/)
+pid = "j.tao.1700.1"
+object = "/path/to/your/object.data"
+object_cid = mystore.store_object(pid, object)
+
+# Store metadata (.../[hashstore_path]/metadata/)
+# By default, storing metadata will use the given properties namespace `format_id`
+pid = "j.tao.1700.1"
+sysmeta = "/path/to/your/metadata/document.xml"
+metadata_cid = mystore.store_metadata(pid, sysmeta)
+
+# If you want to store other types of metadata, add an additional `format_id`
+pid = "j.tao.1700.1"
+sysmeta = "/path/to/your/metadata/document.xml"
+format_id = "http://custom.metadata.format/type/v1.0"
+metadata_cid = mystore.store_metadata(pid, sysmeta, format_id)
+
+```
+
 ## License
 ```
 Copyright [2022] [Regents of the University of California]

From 646579fb47a556e3018b4619da2113f8e4a92792 Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Thu, 22 Jun 2023 08:35:07 -0700
Subject: [PATCH 016/165] Update 'Usage Example' in README.md

---
 README.md | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 7046f7d2..ddbeab2c 100644
--- a/README.md
+++ b/README.md
@@ -28,9 +28,9 @@ take a longer time to run (relating to the storage of large files) - to execute
 `pytest --run-slow`. To see detailed output while the tests run, include the `-s` flag.

 ## Usage Example
-```
-# To view more details about the Public API - see `hashstore.py` interface documentation
+To view more details about the Public API - see `hashstore.py` interface documentation

+```
 # Instantiate a factory
 factory = HashStoreFactory()

@@ -59,12 +59,14 @@ object_cid = mystore.store_object(pid, object)
 pid = "j.tao.1700.1"
 sysmeta = "/path/to/your/metadata/document.xml"
 metadata_cid = mystore.store_metadata(pid, sysmeta)
+```

-# If you want to store other types of metadata, add an additional `format_id`
+If you want to store other types of metadata, add an additional `format_id`
+```
 pid = "j.tao.1700.1"
-sysmeta = "/path/to/your/metadata/document.xml"
+metadata = "/path/to/your/metadata/document.xml"
 format_id = "http://custom.metadata.format/type/v1.0"
-metadata_cid = mystore.store_metadata(pid, sysmeta, format_id)
+metadata_cid = mystore.store_metadata(pid, metadata, format_id)

 ```

From cb16f7ed1bbddb90fab2d34d0f5d88f3ba2f9f90 Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Thu, 22 Jun 2023 08:43:46 -0700
Subject: [PATCH 017/165] Clean-up usage example in README.md

---
 README.md | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index ddbeab2c..2314ee5a 100644
--- a/README.md
+++ b/README.md
@@ -52,22 +52,21 @@ my_store = factory.get_hashstore(module_name, class_name, properties)
 # Store objects (.../[hashstore_path]/objects/)
 pid = "j.tao.1700.1"
 object = "/path/to/your/object.data"
-object_cid = mystore.store_object(pid, object)
+object_cid = my_store.store_object(pid, object)

 # Store metadata (.../[hashstore_path]/metadata/)
 # By default, storing metadata will use the given properties namespace `format_id`
 pid = "j.tao.1700.1"
-sysmeta = "/path/to/your/metadata/document.xml"
+sysmeta = "/path/to/your/sysmeta/document.xml"
-metadata_cid = mystore.store_metadata(pid, sysmeta)
+metadata_cid = my_store.store_metadata(pid, sysmeta)
 ```

-If you want to store other types of metadata, add an additional `format_id`
+If you want to store other types of metadata, add an additional `format_id`.
``` pid = "j.tao.1700.1" -metadata = "/path/to/your/metadata/document.xml" -format_id = "http://custom.metadata.format/type/v1.0" -metadata_cid = mystore.store_metadata(pid, metadata, format_id) - +metadata = "/path/to/your/metadata/document.json" +format_id = "http://custom.metadata.com/json/type/v1.0" +metadata_cid = my_store.store_metadata(pid, metadata, format_id) ``` ## License From 9573fa6c0aa67fd245ddcda8b9eaa04d7fd8555d Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 22 Jun 2023 15:29:38 -0700 Subject: [PATCH 018/165] Update 'README.md' and '__init__.py' --- README.md | 3 ++- src/hashstore/__init__.py | 10 ++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 2314ee5a..48a81833 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,8 @@ my_store = factory.get_hashstore(module_name, class_name, properties) # Store objects (.../[hashstore_path]/objects/) pid = "j.tao.1700.1" object = "/path/to/your/object.data" -object_cid = my_store.store_object(pid, object) +hash_address = my_store.store_object(pid, object) +object_cid = hash_address.id # Store metadata (.../[hashstore_path]/metadata/) # By default, storing metadata will use the given properties namespace `format_id` diff --git a/src/hashstore/__init__.py b/src/hashstore/__init__.py index 2902348a..11db5e95 100644 --- a/src/hashstore/__init__.py +++ b/src/hashstore/__init__.py @@ -11,10 +11,12 @@ - Data objects are named using the SHA-256, base64-encoded hash of their contents (thus, a content-identifier) - Metadata objects are stored with the formatId, a null character and its contents -- Metadata objects are named using the SHA-256, base64-encoded hash of their - persistent identifier (PID) -- An object's persistent identifier can be used to read both metadata and contents - of the object +- Metadata objects are named using the SHA-256 + formatId, base64-encoded hash of + their persistent identifier (PID) """ from hashstore.hashstore import HashStore +from hashstore.hashaddress import HashAddress +from hashstore.hashstore_factory import HashStoreFactory + +__all__ = ("HashStore", "HashAddress", "HashStoreFactory") From ad1d4afd7742908a165b62c12e60857ce0729724 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 5 Jul 2023 12:22:05 -0700 Subject: [PATCH 019/165] Update 'hashstore.py' interface doc strings --- src/hashstore/hashstore.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 19c4d609..cac31905 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -66,12 +66,8 @@ def store_object( def store_metadata(self, pid, metadata, format_id): """The `store_metadata` method is responsible for adding and/or updating metadata (ex. `sysmeta`) to disk using a given path/stream, a persistent identifier `pid` - and a metadata `format_id`. The metadata object consists of a header and a body - section, which is split by a null character `\x00`. - - The header contains the metadata object's permanent address, which is determined - by calculating the SHA-256 hex digest of the provided `pid` + `format_id`; and the - body contains the metadata content (ex. `sysmeta`). + and a metadata `format_id`. The metadata object's permanent address, which is + determined by calculating the SHA-256 hex digest of the provided `pid` + `format_id`. Upon successful storage of metadata, `store_metadata` returns a string that represents the file's permanent address. 
Lastly, the metadata objects are stored @@ -98,7 +94,7 @@ def retrieve_object(self, pid): pid (string): Authority-based identifier. Returns: - obj_stream (io.BufferedReader): A buffered stream of an ab_id object. + obj_stream (io.BufferedReader): A buffered stream of a data object. """ raise NotImplementedError() @@ -112,7 +108,7 @@ def retrieve_metadata(self, pid, format_id): format_id (string): Metadata format Returns: - metadata (string): Sysmeta content. + metadata_stream (io.BufferedReader): A buffered stream of a metadata object. """ raise NotImplementedError() From eeec1aa1af37531f82f1a2c5cd04e38bc23e6b98 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 5 Jul 2023 12:28:44 -0700 Subject: [PATCH 020/165] Refactor 'store_metadata()' to only store metadata content and update affected methods and pytests --- src/hashstore/filehashstore/filehashstore.py | 16 ++----- tests/filehashstore/test_filehashstore.py | 2 +- .../test_filehashstore_interface.py | 46 ++----------------- 3 files changed, 9 insertions(+), 55 deletions(-) diff --git a/src/hashstore/filehashstore/filehashstore.py b/src/hashstore/filehashstore/filehashstore.py index 5a7fa684..a558d72a 100644 --- a/src/hashstore/filehashstore/filehashstore.py +++ b/src/hashstore/filehashstore/filehashstore.py @@ -574,13 +574,7 @@ def retrieve_metadata(self, pid, format_id): "FileHashStore - retrieve_metadata: Metadata exists for pid: %s", pid + ", retrieving metadata.", ) - metadata_cid = self.get_sha256_hex_digest(pid + format_id) - metadata_cid_stream = self.open(entity, metadata_cid) - metadata_cid_content = ( - metadata_cid_stream.read().decode("utf-8").split("\x00", 1) - ) - metadata_cid_stream.close() - metadata = metadata_cid_content[1] + metadata_stream = self.open(entity, metadata_cid) else: exception_string = f"No metadata found for pid: {pid}" logging.error("FileHashStore - retrieve_metadata: %s", exception_string) @@ -588,7 +582,7 @@ def retrieve_metadata(self, pid, format_id): logging.info( "FileHashStore - retrieve_metadata: Retrieved metadata for pid: %s", pid ) - return metadata + return metadata_stream def delete_object(self, pid): logging.debug( @@ -960,7 +954,7 @@ def put_metadata(self, metadata, pid, format_id): # Create metadata tmp file and write to it metadata_stream = Stream(metadata) with closing(metadata_stream): - metadata_tmp = self._mktempmetadata(metadata_stream, format_id) + metadata_tmp = self._mktempmetadata(metadata_stream) # Get target and related paths (permanent location) metadata_cid = self.get_sha256_hex_digest(pid + format_id) @@ -1000,7 +994,7 @@ def put_metadata(self, metadata, pid, format_id): logging.error("FileHashStore - put_metadata: %s", exception_string) raise FileNotFoundError() - def _mktempmetadata(self, stream, format_id): + def _mktempmetadata(self, stream): """Create a named temporary file with `stream` (metadata) and `format_id`. 
Args: @@ -1031,8 +1025,6 @@ def _mktempmetadata(self, stream, format_id): tmp.name, ) with tmp as tmp_file: - tmp_file.write(format_id.encode("utf-8")) - tmp_file.write(b"\x00") for data in stream: tmp_file.write(self._to_bytes(data)) diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index 5552f194..5a051921 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -507,7 +507,7 @@ def test_mktempmetadata(pids, store): sys_stream = io.open(syspath, "rb") format_id = "http://ns.dataone.org/service/types/v2.0" # pylint: disable=W0212 - tmp_name = store._mktempmetadata(sys_stream, format_id) + tmp_name = store._mktempmetadata(sys_stream) sys_stream.close() assert store.exists(entity, tmp_name) diff --git a/tests/filehashstore/test_filehashstore_interface.py b/tests/filehashstore/test_filehashstore_interface.py index 35ad3d7e..25188ae0 100644 --- a/tests/filehashstore/test_filehashstore_interface.py +++ b/tests/filehashstore/test_filehashstore_interface.py @@ -483,46 +483,6 @@ def test_store_metadata(pids, store): assert metadata_cid == pids[pid]["metadata_cid"] -def test_store_metadata_format_id_is_none(pids, store): - """Confirm default name space is used when format_id is not supplied""" - test_dir = "tests/testdata/" - format_id = "http://ns.dataone.org/service/types/v2.0" - entity = "metadata" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - filename = pid.replace("/", "_") + ".xml" - syspath = Path(test_dir) / filename - _hash_address = store.store_object(pid, path) - metadata_cid = store.store_metadata(pid, syspath) - metadata_cid_stream = store.open(entity, metadata_cid) - metadata_cid_content = ( - metadata_cid_stream.read().decode("utf-8").split("\x00", 1) - ) - metadata_cid_stream.close() - metadata_format = metadata_cid_content[0] - assert metadata_format == format_id - - -def test_store_metadata_format_id_is_custom(pids, store): - """Confirm new format_id is stored when default 'None' is overridden.""" - test_dir = "tests/testdata/" - format_id = "http://hashstore.world.com/types/v1.0" - entity = "metadata" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - filename = pid.replace("/", "_") + ".xml" - syspath = Path(test_dir) / filename - _hash_address = store.store_object(pid, path) - metadata_cid = store.store_metadata(pid, syspath, format_id) - metadata_cid_stream = store.open(entity, metadata_cid) - metadata_cid_content = ( - metadata_cid_stream.read().decode("utf-8").split("\x00", 1) - ) - metadata_cid_stream.close() - metadata_format = metadata_cid_content[0] - assert metadata_format == format_id - - def test_store_metadata_files_path(pids, store): """Test store metadata with path.""" test_dir = "tests/testdata/" @@ -713,9 +673,11 @@ def test_retrieve_metadata(store): syspath = Path(test_dir) / filename _hash_address = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath, format_id) - metadata_bytes = store.retrieve_metadata(pid, format_id) + metadata_stream = store.retrieve_metadata(pid, format_id) + metadata_content = metadata_stream.read().decode("utf-8") + metadata_stream.close() metadata = syspath.read_bytes() - assert metadata.decode("utf-8") == metadata_bytes + assert metadata.decode("utf-8") == metadata_content def test_retrieve_metadata_bytes_pid_invalid(store): From 5beb69172817464adbe91e516983e37734784052 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 5 Jul 2023 13:04:28 -0700 Subject: [PATCH 021/165] 
Refactor 'retrieve_metadata()' and revise/add junit tests --- src/hashstore/filehashstore/filehashstore.py | 14 ++++-- tests/filehashstore/test_filehashstore.py | 1 - .../test_filehashstore_interface.py | 47 +++++++++++-------- 3 files changed, 38 insertions(+), 24 deletions(-) diff --git a/src/hashstore/filehashstore/filehashstore.py b/src/hashstore/filehashstore/filehashstore.py index a558d72a..9cea906e 100644 --- a/src/hashstore/filehashstore/filehashstore.py +++ b/src/hashstore/filehashstore/filehashstore.py @@ -550,7 +550,7 @@ def retrieve_object(self, pid): ) return obj_stream - def retrieve_metadata(self, pid, format_id): + def retrieve_metadata(self, pid, format_id=None): logging.debug( "FileHashStore - retrieve_metadata: Request to retrieve metadata for pid: %s", pid, @@ -559,15 +559,21 @@ def retrieve_metadata(self, pid, format_id): exception_string = f"Pid cannot be None or empty, pid: {pid}" logging.error("FileHashStore - retrieve_metadata: %s", exception_string) raise ValueError(exception_string) - if format_id is None or format_id.replace(" ", "") == "": + checked_format_id = None + if format_id is None: + checked_format_id = self.sysmeta_ns + elif format_id.replace(" ", "") == "": exception_string = ( - f"Format_id cannot be None or empty, format_id: {format_id}" + "Format_id cannot empty, must be 'None'" + + "for default HashStore format or supplied." ) logging.error("FileHashStore - retrieve_metadata: %s", exception_string) raise ValueError(exception_string) + else: + checked_format_id = format_id entity = "metadata" - metadata_cid = self.get_sha256_hex_digest(pid + format_id) + metadata_cid = self.get_sha256_hex_digest(pid + checked_format_id) metadata_exists = self.exists(entity, metadata_cid) if metadata_exists: logging.debug( diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index 5a051921..b01bf7c2 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -505,7 +505,6 @@ def test_mktempmetadata(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename sys_stream = io.open(syspath, "rb") - format_id = "http://ns.dataone.org/service/types/v2.0" # pylint: disable=W0212 tmp_name = store._mktempmetadata(sys_stream) sys_stream.close() diff --git a/tests/filehashstore/test_filehashstore_interface.py b/tests/filehashstore/test_filehashstore_interface.py index 25188ae0..45fcd3e9 100644 --- a/tests/filehashstore/test_filehashstore_interface.py +++ b/tests/filehashstore/test_filehashstore_interface.py @@ -483,6 +483,18 @@ def test_store_metadata(pids, store): assert metadata_cid == pids[pid]["metadata_cid"] +def test_store_metadata_default_format_id(pids, store): + """Test store metadata returns expected id when storing with default format_id""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + _hash_address = store.store_object(pid, path) + metadata_cid = store.store_metadata(pid, syspath) + assert metadata_cid == pids[pid]["metadata_cid"] + + def test_store_metadata_files_path(pids, store): """Test store metadata with path.""" test_dir = "tests/testdata/" @@ -552,17 +564,6 @@ def test_store_metadata_pid_empty_spaces(store): store.store_metadata(pid, syspath_string, format_id) -def test_store_metadata_format_id_empty(store): - """Test store metadata raises error with empty string.""" - test_dir = "tests/testdata/" - 
format_id = "" - pid = "jtao.1700.1" - filename = pid.replace("/", "_") + ".xml" - syspath_string = str(Path(test_dir) / filename) - with pytest.raises(ValueError): - store.store_metadata(pid, syspath_string, format_id) - - def test_store_metadata_pid_format_id_spaces(store): """Test store metadata raises error with empty spaces.""" test_dir = "tests/testdata/" @@ -680,6 +681,22 @@ def test_retrieve_metadata(store): assert metadata.decode("utf-8") == metadata_content +def test_retrieve_metadata_default_format_id(store): + """Test retrieve_metadata retrieves expected metadata when format_id is none""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid + filename = pid + ".xml" + syspath = Path(test_dir) / filename + _hash_address = store.store_object(pid, path) + _metadata_cid = store.store_metadata(pid, syspath) + metadata_stream = store.retrieve_metadata(pid) + metadata_content = metadata_stream.read().decode("utf-8") + metadata_stream.close() + metadata = syspath.read_bytes() + assert metadata.decode("utf-8") == metadata_content + + def test_retrieve_metadata_bytes_pid_invalid(store): """Test retrieve_metadata raises error when supplied with bad pid.""" format_id = "http://ns.dataone.org/service/types/v2.0" @@ -697,14 +714,6 @@ def test_retrieve_metadata_bytes_pid_empty(store): store.retrieve_metadata(pid, format_id) -def test_retrieve_metadata_format_id_none(store): - """Test retrieve_metadata raises error when supplied with None format_id""" - format_id = None - pid = "jtao.1700.1" - with pytest.raises(ValueError): - store.retrieve_metadata(pid, format_id) - - def test_retrieve_metadata_format_id_empty(store): """Test retrieve_metadata raises error when supplied with empty format_id.""" format_id = "" From 26a396c7c3d037ff1050da9eb31fa91121c72b89 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 5 Jul 2023 13:15:15 -0700 Subject: [PATCH 022/165] Refactor 'delete_metadata()' and revise/add junit tests --- src/hashstore/filehashstore/filehashstore.py | 16 ++++++++---- .../test_filehashstore_interface.py | 26 ++++++++++++------- 2 files changed, 27 insertions(+), 15 deletions(-) diff --git a/src/hashstore/filehashstore/filehashstore.py b/src/hashstore/filehashstore/filehashstore.py index 9cea906e..f0c50532 100644 --- a/src/hashstore/filehashstore/filehashstore.py +++ b/src/hashstore/filehashstore/filehashstore.py @@ -565,7 +565,7 @@ def retrieve_metadata(self, pid, format_id=None): elif format_id.replace(" ", "") == "": exception_string = ( "Format_id cannot empty, must be 'None'" - + "for default HashStore format or supplied." + + " for default HashStore format or supplied." 
) logging.error("FileHashStore - retrieve_metadata: %s", exception_string) raise ValueError(exception_string) @@ -608,7 +608,7 @@ def delete_object(self, pid): ) return True - def delete_metadata(self, pid, format_id): + def delete_metadata(self, pid, format_id=None): logging.debug( "FileHashStore - delete_metadata: Request to delete metadata for pid: %s", pid, @@ -617,15 +617,21 @@ def delete_metadata(self, pid, format_id): exception_string = f"Pid cannot be None or empty, pid: {pid}" logging.error("FileHashStore - delete_metadata: %s", exception_string) raise ValueError(exception_string) - if format_id is None or format_id.replace(" ", "") == "": + checked_format_id = None + if format_id is None: + checked_format_id = self.sysmeta_ns + elif format_id.replace(" ", "") == "": exception_string = ( - f"Format_id cannot be None or empty, format_id: {format_id}" + "Format_id cannot empty, must be 'None'" + + " for default HashStore format or supplied." ) logging.error("FileHashStore - delete_metadata: %s", exception_string) raise ValueError(exception_string) + else: + checked_format_id = format_id entity = "metadata" - metadata_cid = self.get_sha256_hex_digest(pid + format_id) + metadata_cid = self.get_sha256_hex_digest(pid + checked_format_id) self.delete(entity, metadata_cid) logging.info( "FileHashStore - delete_metadata: Successfully deleted metadata for pid: %s", diff --git a/tests/filehashstore/test_filehashstore_interface.py b/tests/filehashstore/test_filehashstore_interface.py index 45fcd3e9..f8e4b328 100644 --- a/tests/filehashstore/test_filehashstore_interface.py +++ b/tests/filehashstore/test_filehashstore_interface.py @@ -484,7 +484,7 @@ def test_store_metadata(pids, store): def test_store_metadata_default_format_id(pids, store): - """Test store metadata returns expected id when storing with default format_id""" + """Test store metadata returns expected id when storing with default format_id.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -682,7 +682,7 @@ def test_retrieve_metadata(store): def test_retrieve_metadata_default_format_id(store): - """Test retrieve_metadata retrieves expected metadata when format_id is none""" + """Test retrieve_metadata retrieves expected metadata with default format_id.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -774,6 +774,20 @@ def test_delete_metadata(pids, store): assert store.count(entity) == 0 +def test_delete_metadata_default_format_id(store, pids): + """Test delete_metadata deletes successfully with default format_id.""" + test_dir = "tests/testdata/" + entity = "metadata" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + _hash_address = store.store_object(pid, path) + _metadata_cid = store.store_metadata(pid, syspath) + store.delete_metadata(pid) + assert store.count(entity) == 0 + + def test_delete_metadata_pid_empty(store): """Test delete_object raises error when empty pid supplied.""" format_id = "http://ns.dataone.org/service/types/v2.0" @@ -798,14 +812,6 @@ def test_delete_metadata_format_id_empty(store): store.delete_metadata(pid, format_id) -def test_delete_metadata_format_id_none(store): - """Test delete_object raises error when format_id is 'None'.""" - format_id = None - pid = "jtao.1700.1" - with pytest.raises(ValueError): - store.delete_metadata(pid, format_id) - - def test_get_hex_digest(store): """Test get_hex_digest for expected value.""" 
test_dir = "tests/testdata/" From 0b30226d455a40ed91814917d49d1143538771bd Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 12 Jul 2023 09:28:10 -0700 Subject: [PATCH 023/165] Refactor init method by extracting method to verify HashStore properties --- src/hashstore/filehashstore/filehashstore.py | 67 ++++++++++++-------- 1 file changed, 42 insertions(+), 25 deletions(-) diff --git a/src/hashstore/filehashstore/filehashstore.py b/src/hashstore/filehashstore/filehashstore.py index f0c50532..13a0835e 100644 --- a/src/hashstore/filehashstore/filehashstore.py +++ b/src/hashstore/filehashstore/filehashstore.py @@ -81,31 +81,7 @@ def __init__(self, properties=None): # Check to see if a configuration is present in the given store path self.hashstore_configuration_yaml = prop_store_path + "/hashstore.yaml" - if os.path.exists(self.hashstore_configuration_yaml): - logging.debug( - "FileHashStore - Config found (hashstore.yaml) at {%s}. Verifying properties.", - self.hashstore_configuration_yaml, - ) - # If 'hashstore.yaml' is found, verify given properties before init - hashstore_yaml_dict = self.get_properties() - for key in self.property_required_keys: - if hashstore_yaml_dict[key] != properties[key]: - exception_string = ( - f"Given properties ({key}: {properties[key]}) does not match " - + f"HashStore configuration ({key}: {hashstore_yaml_dict[key]})" - + f"found at: {self.hashstore_configuration_yaml}" - ) - logging.critical("FileHashStore - %s", exception_string) - raise ValueError(exception_string) - else: - # Check if HashStore exists and throw exception if found - if any(Path(prop_store_path).iterdir()): - exception_string = ( - f"HashStore directories and/or objects found at: {prop_store_path} but" - + f" missing configuration file at: {self.hashstore_configuration_yaml}." - ) - logging.critical("FileHashStore - %s", exception_string) - raise FileNotFoundError(exception_string) + self.verify_hashstore_properties(properties, prop_store_path) logging.debug("FileHashStore - Initializing, properties verified.") self.root = prop_store_path @@ -139,6 +115,47 @@ def __init__(self, properties=None): # Configuration and Related Methods + def verify_hashstore_properties(self, properties, prop_store_path): + """Determines whether FileHashStore can instantiate by validating a set of arguments + and throwing exceptions. HashStore will not instantiate if an existing configuration + file's properties (`hashstore.yaml`) are different from what is supplied - or if an + object store exists at the given path, but it is missing the `hashstore.yaml` config file. + + If `hashstore.yaml` exists, it will retrieve its properties and compare them with the + given values; and if there is a mismatch, an exception will be thrown. If not, it will + look to see if any directories/files exist in the given store path and throw an exception + if any file or directory is found. + + Args: + properties (dict): HashStore properties + prop_store_path (string): Store path to check + """ + if os.path.exists(self.hashstore_configuration_yaml): + logging.debug( + "FileHashStore - Config found (hashstore.yaml) at {%s}. 
Verifying properties.", + self.hashstore_configuration_yaml, + ) + # If 'hashstore.yaml' is found, verify given properties before init + hashstore_yaml_dict = self.get_properties() + for key in self.property_required_keys: + if hashstore_yaml_dict[key] != properties[key]: + exception_string = ( + f"Given properties ({key}: {properties[key]}) does not match " + + f"HashStore configuration ({key}: {hashstore_yaml_dict[key]})" + + f"found at: {self.hashstore_configuration_yaml}" + ) + logging.critical("FileHashStore - %s", exception_string) + raise ValueError(exception_string) + else: + # Check if HashStore exists and throw exception if found + if any(Path(prop_store_path).iterdir()): + exception_string = ( + f"HashStore directories and/or objects found at: {prop_store_path} but" + + f" missing configuration file at: {self.hashstore_configuration_yaml}." + ) + logging.critical("FileHashStore - %s", exception_string) + raise FileNotFoundError(exception_string) + def get_properties(self): """Get and return the contents of the current HashStore configuration. From 48dd286a61ac9d30e642d278f915550bf83cd0e4 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 12 Jul 2023 10:19:04 -0700 Subject: [PATCH 024/165] Revise init process to create store directories if they do not exist and rename config methods --- src/hashstore/filehashstore/filehashstore.py | 24 +++++++++++++------- tests/filehashstore/test_filehashstore.py | 21 ++++++++++++----- 2 files changed, 31 insertions(+), 14 deletions(-) diff --git a/src/hashstore/filehashstore/filehashstore.py b/src/hashstore/filehashstore/filehashstore.py index 13a0835e..5e4405f5 100644 --- a/src/hashstore/filehashstore/filehashstore.py +++ b/src/hashstore/filehashstore/filehashstore.py @@ -79,10 +79,12 @@ def __init__(self, properties=None): for property_name in self.property_required_keys ] + # Check to see if a configuration is present in the given store path self.hashstore_configuration_yaml = prop_store_path + "/hashstore.yaml" self.verify_hashstore_properties(properties, prop_store_path) + # If no exceptions thrown, FileHashStore ready for initialization logging.debug("FileHashStore - Initializing, properties verified.") self.root = prop_store_path self.depth = prop_store_depth @@ -96,12 +98,18 @@ def __init__(self, properties=None): "FileHashStore - HashStore does not exist & configuration file not found." + " Writing configuration file." ) - self.put_properties(properties) + self.write_properties(properties) # Default algorithm list for FileHashStore based on config file written self._set_default_algorithms() - # Complete initialization/instantiation by setting store directories + # Complete initialization/instantiation by setting and creating store directories self.objects = self.root + "/objects" self.metadata = self.root + "/metadata" + if not os.path.exists(self.root): + self.create_path(self.root) + if not os.path.exists(self.objects): + self.create_path(self.objects + "/tmp") + if not os.path.exists(self.metadata): + self.create_path(self.metadata + "/tmp") logging.debug( "FileHashStore - Initialization success. 
Store root: %s", self.root ) @@ -136,7 +144,7 @@ def verify_hashstore_properties(self, properties, prop_store_path): self.hashstore_configuration_yaml, ) # If 'hashstore.yaml' is found, verify given properties before init - hashstore_yaml_dict = self.get_properties() + hashstore_yaml_dict = self.load_properties() for key in self.property_required_keys: if hashstore_yaml_dict[key] != properties[key]: exception_string = ( @@ -156,7 +164,7 @@ def verify_hashstore_properties(self, properties, prop_store_path): logging.critical("FileHashStore - %s", exception_string) raise FileNotFoundError(exception_string) - def get_properties(self): + def load_properties(self): """Get and return the contents of the current HashStore configuration. Returns: @@ -169,7 +177,7 @@ def get_properties(self): """ if not os.path.exists(self.hashstore_configuration_yaml): exception_string = "hashstore.yaml not found in store root path." - logging.critical("FileHashStore - get_properties: %s", exception_string) + logging.critical("FileHashStore - load_properties: %s", exception_string) raise FileNotFoundError(exception_string) # Open file with open(self.hashstore_configuration_yaml, "r", encoding="utf-8") as file: @@ -179,11 +187,11 @@ def get_properties(self): for key in self.property_required_keys: hashstore_yaml_dict[key] = yaml_data[key] logging.debug( - "FileHashStore - get_properties: Successfully retrieved 'hashstore.yaml' properties." + "FileHashStore - load_properties: Successfully retrieved 'hashstore.yaml' properties." ) return hashstore_yaml_dict - def put_properties(self, properties): + def write_properties(self, properties): """Writes 'hashstore.yaml' to FileHashStore's root directory with the respective properties object supplied. @@ -1245,7 +1253,7 @@ def _has_subdir(self, path): return is_subdir def create_path(self, path): - """Physically create the folder path on disk. + """Physically create the folder path (and all intermediate ones) on disk. Args: path (str): The path to create. 
diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index b01bf7c2..09cf07b9 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -11,7 +11,16 @@ def test_pids_length(pids): assert len(pids) == 3 -def test_init_put_properties_hashstore_yaml_exists(store): +def test_init_directories_created(store): + """Confirm that object and metadata directories have been created.""" + assert os.path.exists(store.root) + assert os.path.exists(store.objects) + assert os.path.exists(store.objects + "/tmp") + assert os.path.exists(store.metadata) + assert os.path.exists(store.metadata + "/tmp") + + +def test_init_write_properties_hashstore_yaml_exists(store): """Verify properties file present in store root directory.""" assert os.path.exists(store.hashstore_configuration_yaml) @@ -48,9 +57,9 @@ def test_init_with_existing_hashstore_missing_yaml(store, pids): FileHashStore(properties) -def test_get_properties(store): - """Verify dictionary returned from get_properties matches initialization.""" - hashstore_yaml_dict = store.get_properties() +def test_load_properties(store): + """Verify dictionary returned from load_properties matches initialization.""" + hashstore_yaml_dict = store.load_properties() assert hashstore_yaml_dict.get("store_path") == store.root assert hashstore_yaml_dict.get("store_depth") == 3 assert hashstore_yaml_dict.get("store_width") == 2 @@ -61,11 +70,11 @@ def test_get_properties(store): ) -def test_get_properties_hashstore_yaml_missing(store): +def test_load_properties_hashstore_yaml_missing(store): """Confirm FileNotFoundError is raised when hashstore.yaml does not exist.""" os.remove(store.hashstore_configuration_yaml) with pytest.raises(FileNotFoundError): - store.get_properties() + store.load_properties() def test_validate_properties(store): From 9d4c9e7737076f263093826c0c22bc8f431afc1c Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 12 Jul 2023 10:54:59 -0700 Subject: [PATCH 025/165] Fix latent bug with init process when verifying HashStore properties and iterating over a directory that does not exist --- src/hashstore/filehashstore/filehashstore.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/hashstore/filehashstore/filehashstore.py b/src/hashstore/filehashstore/filehashstore.py index 5e4405f5..cd972a7d 100644 --- a/src/hashstore/filehashstore/filehashstore.py +++ b/src/hashstore/filehashstore/filehashstore.py @@ -155,14 +155,15 @@ def verify_hashstore_properties(self, properties, prop_store_path): logging.critical("FileHashStore - %s", exception_string) raise ValueError(exception_string) else: - # Check if HashStore exists and throw exception if found - if any(Path(prop_store_path).iterdir()): - exception_string = ( - f"HashStore directories and/or objects found at: {prop_store_path} but" - + f" missing configuration file at: {self.hashstore_configuration_yaml}." - ) - logging.critical("FileHashStore - %s", exception_string) - raise FileNotFoundError(exception_string) + if os.path.exists(prop_store_path): + # Check if HashStore exists and throw exception if found + if any(Path(prop_store_path).iterdir()): + exception_string = ( + f"HashStore directories and/or objects found at: {prop_store_path} but" + + f" missing configuration file at: {self.hashstore_configuration_yaml}." 
+ )
+ logging.critical("FileHashStore - %s", exception_string)
+ raise FileNotFoundError(exception_string)

From bbf1ffadcfcbaa1cbac0de759af351c0ad738816 Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Wed, 12 Jul 2023 11:22:28 -0700
Subject: [PATCH 026/165] Refactor 'store_object' by extracting method to
 validate checksum arguments

---
 .vscode/settings.json | 10 ++--
 src/hashstore/filehashstore/filehashstore.py | 54 ++++++++++++--------
 2 files changed, 40 insertions(+), 24 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index c31c8612..cb3d2335 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,9 +1,13 @@
 {
 "python.terminal.activateEnvInCurrentTerminal": true,
- "python.formatting.provider": "black",
+ "python.formatting.provider": "none",
 "python.testing.pytestArgs": [
 "tests"
 ],
 "python.testing.unittestEnabled": false,
- "python.testing.pytestEnabled": true
-}
+ "python.testing.pytestEnabled": true,
+ "editor.formatOnSave": true,
+ "[python]": {
+ "editor.defaultFormatter": "ms-python.black-formatter"
+ }
+} \ No newline at end of file
diff --git a/src/hashstore/filehashstore/filehashstore.py b/src/hashstore/filehashstore/filehashstore.py
index cd972a7d..12993459 100644
--- a/src/hashstore/filehashstore/filehashstore.py
+++ b/src/hashstore/filehashstore/filehashstore.py
@@ -79,7 +79,6 @@ def __init__(self, properties=None):
 for property_name in self.property_required_keys
 ]
-
 # Check to see if a configuration is present in the given store path
 self.hashstore_configuration_yaml = prop_store_path + "/hashstore.yaml"
 self.verify_hashstore_properties(properties, prop_store_path)
@@ -413,25 +412,9 @@
 # Set additional_algorithm
 additional_algorithm_checked = self.clean_algorithm(additional_algorithm)
 # Checksum and checksum_algorithm must both be supplied if one is supplied
- if checksum is not None:
- if checksum_algorithm is None or checksum_algorithm.replace(" ", "") == "":
- exception_string = (
- "checksum_algorithm cannot be None or empty if checksum is"
- + "supplied."
- )
- logging.error("FileHashStore - store_object: %s", exception_string)
- raise ValueError(exception_string)
- checksum_algorithm_checked = None
- if checksum_algorithm is not None:
- if checksum is None or checksum.replace(" ", "") == "":
- exception_string = (
- "checksum cannot be None or empty if checksum_algorithm is"
- + " supplied."
- )
- logging.error("FileHashStore - store_object: %s", exception_string)
- raise ValueError(exception_string)
- # Set checksum_algorithm
- checksum_algorithm_checked = self.clean_algorithm(checksum_algorithm)
+ checksum_algorithm_checked = self.validate_checksum_args(
+ checksum, checksum_algorithm
+ )

 # Wait for the pid to release if it's in use
 while pid in self.object_locked_pids:
@@ -698,6 +681,35 @@

 # FileHashStore Core Methods

+ def validate_checksum_args(self, checksum, checksum_algorithm):
+ """Determines whether the calling app has supplied the necessary arguments to
+ validate an object with a checksum value.
+
+ Args:
+ checksum (string): Value of checksum.
+ checksum_algorithm (string): Algorithm of checksum.
+
+ Returns:
+ checksum_algorithm_checked (string): Validated checksum algorithm.
+ """
+ checksum_algorithm_checked = None
+ if checksum is not None:
+ if checksum_algorithm is None or checksum_algorithm.replace(" ", "") == "":
+ exception_string = (
+ "checksum_algorithm cannot be None or empty if checksum is"
+ + " supplied."
+ ) + logging.error("FileHashStore - store_object: %s", exception_string) + raise ValueError(exception_string) + if checksum_algorithm is not None: + if checksum is None or checksum.replace(" ", "") == "": + exception_string = ( + "checksum cannot be None or empty if checksum_algorithm is" + + " supplied." + ) + logging.error("FileHashStore - store_object: %s", exception_string) + raise ValueError(exception_string) + # Set checksum_algorithm + checksum_algorithm_checked = self.clean_algorithm(checksum_algorithm) + return checksum_algorithm_checked + def put_object( self, pid, @@ -820,6 +832,7 @@ def _move_and_get_checksums( # Only move file if it doesn't exist. # Files are stored once and only once if not os.path.isfile(abs_file_path): + is_duplicate = False if checksum_algorithm is not None and checksum is not None: hex_digest_stored = hex_digests[checksum_algorithm] if hex_digest_stored != checksum: @@ -833,7 +846,6 @@ def _move_and_get_checksums( "FileHashStore - _move_and_get_checksums: %s", exception_string ) raise ValueError(exception_string) - is_duplicate = False try: debug_move_tmp_file_str = ( "FileHashStore - _move_and_get_checksums: Moving temp file to permanent" From 9e2451bd7af37039c744c5ae5deba801b479b63c Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 12 Jul 2023 11:40:30 -0700 Subject: [PATCH 027/165] Re-organize methods --- src/hashstore/filehashstore/filehashstore.py | 152 ++++++++++--------- 1 file changed, 79 insertions(+), 73 deletions(-) diff --git a/src/hashstore/filehashstore/filehashstore.py b/src/hashstore/filehashstore/filehashstore.py index 12993459..edb35b84 100644 --- a/src/hashstore/filehashstore/filehashstore.py +++ b/src/hashstore/filehashstore/filehashstore.py @@ -81,7 +81,7 @@ def __init__(self, properties=None): # Check to see if a configuration is present in the given store path self.hashstore_configuration_yaml = prop_store_path + "/hashstore.yaml" - self.verify_hashstore_properties(properties, prop_store_path) + self._verify_hashstore_properties(properties, prop_store_path) # If no exceptions thrown, FileHashStore ready for initialization logging.debug("FileHashStore - Initializing, properties verified.") @@ -122,48 +122,6 @@ def __init__(self, properties=None): # Configuration and Related Methods - def verify_hashstore_properties(self, properties, prop_store_path): - """Determines whether FileHashStore can instantiate by validating a set of arguments - and throwing exceptions. HashStore will not instantiate if an existing configuration - file's properties (`hashstore.yaml`) are different from what is supplied - or if an - object store exists at the given path, but it is missing the `hashstore.yaml` config file. - - If `hashstore.yaml` exists, it will retrieve its properties and compare them with the - given values; and if there is a mismatch, an exception will be thrown. If not, it will - look to see if any directories/files exist in the given store path and throw an exception - if any file or directory is found. - - Args: - properties (dict): HashStore properties - prop_store_path (string): Store path to check - """ - if os.path.exists(self.hashstore_configuration_yaml): - logging.debug( - "FileHashStore - Config found (hashstore.yaml) at {%s}. 
Verifying properties.", - self.hashstore_configuration_yaml, - ) - # If 'hashstore.yaml' is found, verify given properties before init - hashstore_yaml_dict = self.load_properties() - for key in self.property_required_keys: - if hashstore_yaml_dict[key] != properties[key]: - exception_string = ( - f"Given properties ({key}: {properties[key]}) does not match " - + f"HashStore configuration ({key}: {hashstore_yaml_dict[key]})" - + f"found at: {self.hashstore_configuration_yaml}" - ) - logging.critical("FileHashStore - %s", exception_string) - raise ValueError(exception_string) - else: - if os.path.exists(prop_store_path): - # Check if HashStore exists and throw exception if found - if any(Path(prop_store_path).iterdir()): - exception_string = ( - f"HashStore directories and/or objects found at: {prop_store_path} but" - + f" missing configuration file at: {self.hashstore_configuration_yaml}." - ) - logging.critical("FileHashStore - %s", exception_string) - raise FileNotFoundError(exception_string) - def load_properties(self): """Get and return the contents of the current HashStore configuration. @@ -301,6 +259,48 @@ def _build_hashstore_yaml_string( """ return hashstore_configuration_yaml + def _verify_hashstore_properties(self, properties, prop_store_path): + """Determines whether FileHashStore can instantiate by validating a set of arguments + and throwing exceptions. HashStore will not instantiate if an existing configuration + file's properties (`hashstore.yaml`) are different from what is supplied - or if an + object store exists at the given path, but it is missing the `hashstore.yaml` config file. + + If `hashstore.yaml` exists, it will retrieve its properties and compare them with the + given values; and if there is a mismatch, an exception will be thrown. If not, it will + look to see if any directories/files exist in the given store path and throw an exception + if any file or directory is found. + + Args: + properties (dict): HashStore properties + prop_store_path (string): Store path to check + """ + if os.path.exists(self.hashstore_configuration_yaml): + logging.debug( + "FileHashStore - Config found (hashstore.yaml) at {%s}. Verifying properties.", + self.hashstore_configuration_yaml, + ) + # If 'hashstore.yaml' is found, verify given properties before init + hashstore_yaml_dict = self.load_properties() + for key in self.property_required_keys: + if hashstore_yaml_dict[key] != properties[key]: + exception_string = ( + f"Given properties ({key}: {properties[key]}) does not match " + + f"HashStore configuration ({key}: {hashstore_yaml_dict[key]})" + + f"found at: {self.hashstore_configuration_yaml}" + ) + logging.critical("FileHashStore - %s", exception_string) + raise ValueError(exception_string) + else: + if os.path.exists(prop_store_path): + # Check if HashStore exists and throw exception if found + if any(Path(prop_store_path).iterdir()): + exception_string = ( + f"HashStore directories and/or objects found at: {prop_store_path} but" + + f" missing configuration file at: {self.hashstore_configuration_yaml}." + ) + logging.critical("FileHashStore - %s", exception_string) + raise FileNotFoundError(exception_string) + def _validate_properties(self, properties): """Validate a properties dictionary by checking if it contains all the required keys and non-None values. 
@@ -412,7 +412,7 @@ def store_object( # Set additional_algorithm additional_algorithm_checked = self.clean_algorithm(additional_algorithm) # Checksum and checksum_algorithm must both be supplied if one is supplied - checksum_algorithm_checked = self.validate_checksum_args( + checksum_algorithm_checked = self._validate_checksum_args( checksum, checksum_algorithm ) @@ -681,35 +681,6 @@ def get_hex_digest(self, pid, algorithm): # FileHashStore Core Methods - def validate_checksum_args(self, checksum, checksum_algorithm): - """Determines whether calling app has supplied the necessary arguments to validate - an object with a checksum value - - Args: - checksum (string): value of checksum - checksum_algorithm (string): algorithm of checksum - """ - checksum_algorithm_checked = None - if checksum is not None: - if checksum_algorithm is None or checksum_algorithm.replace(" ", "") == "": - exception_string = ( - "checksum_algorithm cannot be None or empty if checksum is" - + "supplied." - ) - logging.error("FileHashStore - store_object: %s", exception_string) - raise ValueError(exception_string) - if checksum_algorithm is not None: - if checksum is None or checksum.replace(" ", "") == "": - exception_string = ( - "checksum cannot be None or empty if checksum_algorithm is" - + " supplied." - ) - logging.error("FileHashStore - store_object: %s", exception_string) - raise ValueError(exception_string) - # Set checksum_algorithm - checksum_algorithm_checked = self.clean_algorithm(checksum_algorithm) - return checksum_algorithm_checked - def put_object( self, pid, @@ -1086,6 +1057,41 @@ def _mktempmetadata(self, stream): # FileHashStore Utility & Supporting Methods + def _validate_checksum_args(self, checksum, checksum_algorithm): + """Determines whether calling app has supplied the necessary arguments to validate + an object with a checksum value + + Args: + checksum (string): value of checksum + checksum_algorithm (string): algorithm of checksum + """ + checksum_algorithm_checked = None + if checksum is not None: + if checksum_algorithm is None or checksum_algorithm.replace(" ", "") == "": + exception_string = ( + "checksum_algorithm cannot be None or empty if checksum is" + + "supplied." + ) + logging.error( + "FileHashStore - validate_checksum_args (store_object): %s", + exception_string, + ) + raise ValueError(exception_string) + if checksum_algorithm is not None: + if checksum is None or checksum.replace(" ", "") == "": + exception_string = ( + "checksum cannot be None or empty if checksum_algorithm is" + + " supplied." + ) + logging.error( + "FileHashStore - validate_checksum_args (store_object): %s", + exception_string, + ) + raise ValueError(exception_string) + # Set checksum_algorithm + checksum_algorithm_checked = self.clean_algorithm(checksum_algorithm) + return checksum_algorithm_checked + def clean_algorithm(self, algorithm_string): """Format a string and ensure that it is supported and compatible with the python hashlib library. 
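Aside: the rule that patch 026 extracts and patch 027 relocates is that `checksum` and `checksum_algorithm` are an all-or-nothing pair, with a whitespace-only string treated as empty. The following is a minimal standalone sketch of that rule, not code from the patches themselves; the free function is hypothetical and stands in for the FileHashStore method visible in the diffs above:

    def validate_checksum_args(checksum, checksum_algorithm):
        # Both values must be supplied together; whitespace-only counts as empty
        if checksum is not None and (
            checksum_algorithm is None or checksum_algorithm.replace(" ", "") == ""
        ):
            raise ValueError(
                "checksum_algorithm cannot be None or empty if checksum is supplied."
            )
        if checksum_algorithm is not None and (
            checksum is None or checksum.replace(" ", "") == ""
        ):
            raise ValueError(
                "checksum cannot be None or empty if checksum_algorithm is supplied."
            )

    validate_checksum_args(None, None)  # passes: neither value supplied
    validate_checksum_args("some-hex-digest", "sha256")  # passes: both supplied
    try:
        validate_checksum_args("some-hex-digest", "   ")  # blank algorithm
    except ValueError as err:
        print(err)
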
From 2fc60c6df37cb15b43a581f08177cef0456bd6c4 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 12 Jul 2023 11:47:24 -0700 Subject: [PATCH 028/165] Add new static method 'is_string_none_or_empty()' and refactor relevant methods --- src/hashstore/filehashstore/filehashstore.py | 85 +++++++++----------- 1 file changed, 37 insertions(+), 48 deletions(-) diff --git a/src/hashstore/filehashstore/filehashstore.py b/src/hashstore/filehashstore/filehashstore.py index edb35b84..186d62e5 100644 --- a/src/hashstore/filehashstore/filehashstore.py +++ b/src/hashstore/filehashstore/filehashstore.py @@ -383,10 +383,7 @@ def store_object( ) # Validate input parameters logging.debug("FileHashStore - store_object: Validating arguments.") - if pid is None or pid.replace(" ", "") == "": - exception_string = f"Pid cannot be None or empty, pid: {pid}." - logging.error("FileHashStore - store_object: %s", exception_string) - raise ValueError(exception_string) + self._is_string_none_or_empty(pid, "pid", "store_object") if ( not isinstance(data, str) and not isinstance(data, Path) @@ -462,10 +459,7 @@ def store_metadata(self, pid, metadata, format_id=None): ) # Validate input parameters, begin with persistent identifier (pid) logging.debug("FileHashStore - store_metadata: Validating arguments.") - if pid is None or pid.replace(" ", "") == "": - exception_string = f"Pid cannot be None or empty, pid: {pid}" - logging.error("FileHashStore - store_metadata: %s", exception_string) - raise ValueError(exception_string) + self._is_string_none_or_empty(pid, "pid", "store_metadata") # Then format_id of the metadata checked_format_id = None if format_id is not None and format_id.replace(" ", "") == "": @@ -536,10 +530,7 @@ def retrieve_object(self, pid): "FileHashStore - retrieve_object: Request to retrieve object for pid: %s", pid, ) - if pid is None or pid.replace(" ", "") == "": - exception_string = f"Pid cannot be None or empty, pid: {pid}" - logging.error("FileHashStore - retrieve_object: %s", exception_string) - raise ValueError(exception_string) + self._is_string_none_or_empty(pid, "pid", "retrieve_object") entity = "objects" object_cid = self.get_sha256_hex_digest(pid) @@ -568,6 +559,8 @@ def retrieve_metadata(self, pid, format_id=None): exception_string = f"Pid cannot be None or empty, pid: {pid}" logging.error("FileHashStore - retrieve_metadata: %s", exception_string) raise ValueError(exception_string) + self._is_string_none_or_empty(pid, "pid", "retrieve_metadata") + checked_format_id = None if format_id is None: checked_format_id = self.sysmeta_ns @@ -603,10 +596,7 @@ def delete_object(self, pid): logging.debug( "FileHashStore - delete_object: Request to delete object for pid: %s", pid ) - if pid is None or pid.replace(" ", "") == "": - exception_string = f"Pid cannot be None or empty, pid: {pid}" - logging.error("FileHashStore - delete_object: %s", exception_string) - raise ValueError(exception_string) + self._is_string_none_or_empty(pid, "pid", "delete_object") entity = "objects" object_cid = self.get_sha256_hex_digest(pid) @@ -622,10 +612,7 @@ def delete_metadata(self, pid, format_id=None): "FileHashStore - delete_metadata: Request to delete metadata for pid: %s", pid, ) - if pid is None or pid.replace(" ", "") == "": - exception_string = f"Pid cannot be None or empty, pid: {pid}" - logging.error("FileHashStore - delete_metadata: %s", exception_string) - raise ValueError(exception_string) + self._is_string_none_or_empty(pid, "pid", "delete_metadata") checked_format_id = None if format_id is None: 
checked_format_id = self.sysmeta_ns @@ -653,14 +640,8 @@ def get_hex_digest(self, pid, algorithm): "FileHashStore - get_hex_digest: Request to get hex digest for object with pid: %s", pid, ) - if pid is None or pid.replace(" ", "") == "": - exception_string = f"Pid cannot be None or empty, pid: {pid}" - logging.error("FileHashStore - get_hex_digest: %s", exception_string) - raise ValueError(exception_string) - if algorithm is None or algorithm.replace(" ", "") == "": - exception_string = f"Algorithm cannot be None or empty, pid: {pid}" - logging.error("FileHashStore - get_hex_digest: %s", exception_string) - raise ValueError(exception_string) + self._is_string_none_or_empty(pid, "pid", "get_hex_digest") + self._is_string_none_or_empty(algorithm, "algorithm", "get_hex_digest") entity = "objects" algorithm = self.clean_algorithm(algorithm) @@ -1067,27 +1048,17 @@ def _validate_checksum_args(self, checksum, checksum_algorithm): """ checksum_algorithm_checked = None if checksum is not None: - if checksum_algorithm is None or checksum_algorithm.replace(" ", "") == "": - exception_string = ( - "checksum_algorithm cannot be None or empty if checksum is" - + "supplied." - ) - logging.error( - "FileHashStore - validate_checksum_args (store_object): %s", - exception_string, - ) - raise ValueError(exception_string) + self._is_string_none_or_empty( + checksum_algorithm, + "checksum_algorithm", + "validate_checksum_args (store_object)", + ) if checksum_algorithm is not None: - if checksum is None or checksum.replace(" ", "") == "": - exception_string = ( - "checksum cannot be None or empty if checksum_algorithm is" - + " supplied." - ) - logging.error( - "FileHashStore - validate_checksum_args (store_object): %s", - exception_string, - ) - raise ValueError(exception_string) + self._is_string_none_or_empty( + checksum, + "checksum", + "validate_checksum_args (store_object)", + ) # Set checksum_algorithm checksum_algorithm_checked = self.clean_algorithm(checksum_algorithm) return checksum_algorithm_checked @@ -1373,6 +1344,24 @@ def count(self, entity): # Other Static Methods + @staticmethod + def _is_string_none_or_empty(string, arg, method): + """Checks whether a string is None or empty and throws an exception if so. + + Args: + string (string): Value to check + arg (): Name of argument to check + method (string): Calling method for logging purposes + + """ + if string is None or string.replace(" ", "") == "": + exception_string = ( + f"FileHashStore - {method}: {arg} cannot be None" + + f" or empty, {arg}: {string}." + ) + logging.error(exception_string) + raise ValueError(exception_string) + @staticmethod def _to_bytes(text): """Convert text to sequence of bytes using utf-8 encoding. 
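For orientation, the addressing scheme these refactors leave untouched: an object's permanent address is the SHA-256 hex digest of its pid, while a metadata document's address is the digest of the pid concatenated with the metadata namespace (format_id), as seen in the calls to `get_sha256_hex_digest(pid + checked_format_id)` above. A minimal sketch of that derivation, assuming `get_sha256_hex_digest` hashes the UTF-8 bytes of the string; the pid is one of the test fixtures and the format_id is the default DataONE namespace:

    import hashlib

    def sha256_hex(value):
        # Assumed mirror of get_sha256_hex_digest: SHA-256 over UTF-8 bytes
        return hashlib.sha256(value.encode("utf-8")).hexdigest()

    pid = "doi:10.18739/A2901ZH2M"
    format_id = "http://ns.dataone.org/service/types/v2.0"

    object_cid = sha256_hex(pid)  # permanent address of the data object
    metadata_cid = sha256_hex(pid + format_id)  # address of its metadata

Because the format_id is folded into the digest, the same pid can carry one metadata document per namespace without address collisions.
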
From 3b9243b11e73a5c41f271ee3045404d755eb6e86 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 12 Jul 2023 11:56:50 -0700 Subject: [PATCH 029/165] Refactor public api metadata methods by extracting new method to determine what metadata namespace to use --- src/hashstore/filehashstore/filehashstore.py | 66 ++++++++------------ 1 file changed, 26 insertions(+), 40 deletions(-) diff --git a/src/hashstore/filehashstore/filehashstore.py b/src/hashstore/filehashstore/filehashstore.py index 186d62e5..958981c4 100644 --- a/src/hashstore/filehashstore/filehashstore.py +++ b/src/hashstore/filehashstore/filehashstore.py @@ -460,17 +460,7 @@ def store_metadata(self, pid, metadata, format_id=None): # Validate input parameters, begin with persistent identifier (pid) logging.debug("FileHashStore - store_metadata: Validating arguments.") self._is_string_none_or_empty(pid, "pid", "store_metadata") - # Then format_id of the metadata - checked_format_id = None - if format_id is not None and format_id.replace(" ", "") == "": - exception_string = "Format_id cannot be empty." - logging.error("FileHashStore - store_metadata: %s", exception_string) - raise ValueError(exception_string) - elif format_id is None: - # Use default value set by hashstore config - checked_format_id = self.sysmeta_ns - else: - checked_format_id = format_id + checked_format_id = self._validate_format_id(format_id, "store_metadata") # Metadata content must be a str, path or stream and cannot be empty if isinstance(metadata, str): if metadata.replace(" ", "") == "": @@ -555,24 +545,8 @@ def retrieve_metadata(self, pid, format_id=None): "FileHashStore - retrieve_metadata: Request to retrieve metadata for pid: %s", pid, ) - if pid is None or pid.replace(" ", "") == "": - exception_string = f"Pid cannot be None or empty, pid: {pid}" - logging.error("FileHashStore - retrieve_metadata: %s", exception_string) - raise ValueError(exception_string) self._is_string_none_or_empty(pid, "pid", "retrieve_metadata") - - checked_format_id = None - if format_id is None: - checked_format_id = self.sysmeta_ns - elif format_id.replace(" ", "") == "": - exception_string = ( - "Format_id cannot empty, must be 'None'" - + " for default HashStore format or supplied." - ) - logging.error("FileHashStore - retrieve_metadata: %s", exception_string) - raise ValueError(exception_string) - else: - checked_format_id = format_id + checked_format_id = self._validate_format_id(format_id, "retrieve_metadata") entity = "metadata" metadata_cid = self.get_sha256_hex_digest(pid + checked_format_id) @@ -613,18 +587,7 @@ def delete_metadata(self, pid, format_id=None): pid, ) self._is_string_none_or_empty(pid, "pid", "delete_metadata") - checked_format_id = None - if format_id is None: - checked_format_id = self.sysmeta_ns - elif format_id.replace(" ", "") == "": - exception_string = ( - "Format_id cannot empty, must be 'None'" - + " for default HashStore format or supplied." 
-            )
-            logging.error("FileHashStore - delete_metadata: %s", exception_string)
-            raise ValueError(exception_string)
-        else:
-            checked_format_id = format_id
+        checked_format_id = self._validate_format_id(format_id, "delete_metadata")
 
         entity = "metadata"
         metadata_cid = self.get_sha256_hex_digest(pid + checked_format_id)
         self.delete(entity, metadata_cid)
@@ -1063,6 +1026,29 @@ def _validate_checksum_args(self, checksum, checksum_algorithm):
         checksum_algorithm_checked = self.clean_algorithm(checksum_algorithm)
         return checksum_algorithm_checked
 
+    def _validate_format_id(self, format_id, method):
+        """Determines the metadata namespace (format_id) to use for storing,
+        retrieving and deleting metadata.
+
+        Args:
+            format_id (string): Metadata namespace to review
+            method (string): Calling method for logging purposes
+
+        Returns:
+            checked_format_id (string): Valid metadata namespace
+        """
+        checked_format_id = None
+        if format_id is not None and format_id.replace(" ", "") == "":
+            exception_string = f"FileHashStore - {method}: Format_id cannot be empty."
+            logging.error(exception_string)
+            raise ValueError(exception_string)
+        elif format_id is None:
+            # Use default value set by hashstore config
+            checked_format_id = self.sysmeta_ns
+        else:
+            checked_format_id = format_id
+        return checked_format_id
+
     def clean_algorithm(self, algorithm_string):
         """Format a string and ensure that it is supported and compatible
         with the python hashlib library.

From 5cef488c6d6171138ad612935ac63f0308b16003 Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Wed, 12 Jul 2023 13:08:12 -0700
Subject: [PATCH 030/165] Refactor 'FileHashStore' codebase by extracting
 implementation details into new methods & removing and/or standardizing usage
 of logs and comments

---
 src/hashstore/filehashstore/filehashstore.py | 390 +++++++++++--------
 1 file changed, 238 insertions(+), 152 deletions(-)

diff --git a/src/hashstore/filehashstore/filehashstore.py b/src/hashstore/filehashstore/filehashstore.py
index 958981c4..04d69aa2 100644
--- a/src/hashstore/filehashstore/filehashstore.py
+++ b/src/hashstore/filehashstore/filehashstore.py
@@ -113,11 +113,12 @@ def __init__(self, properties=None):
                 "FileHashStore - Initialization success. Store root: %s", self.root
             )
         else:
+            # Cannot instantiate or initialize FileHashStore without config
             exception_string = (
-                f"HashStore properties must be supplied. Properties: {properties}"
+                "FileHashStore - HashStore properties must be supplied."
+                + f" Properties: {properties}"
             )
-            logging.debug("FileHashStore - %s", exception_string)
-            # Cannot instantiate or initialize FileHashStore without config
+            logging.debug(exception_string)
             raise ValueError(exception_string)
 
     # Configuration and Related Methods
 
@@ -134,12 +135,16 @@ def load_properties(self):
             store_metadata_namespace (str): Namespace for the HashStore's system metadata.
         """
         if not os.path.exists(self.hashstore_configuration_yaml):
-            exception_string = "hashstore.yaml not found in store root path."
-            logging.critical("FileHashStore - load_properties: %s", exception_string)
+            exception_string = (
+                "FileHashStore - load_properties: hashstore.yaml not found"
+                + " in store root path."
+ ) + logging.critical(exception_string) raise FileNotFoundError(exception_string) # Open file with open(self.hashstore_configuration_yaml, "r", encoding="utf-8") as file: yaml_data = yaml.safe_load(file) + # Get hashstore properties hashstore_yaml_dict = {} for key in self.property_required_keys: @@ -164,9 +169,10 @@ def write_properties(self, properties): # If hashstore.yaml already exists, must throw exception and proceed with caution if os.path.exists(self.hashstore_configuration_yaml): exception_string = ( - "FileHashStore configuration file 'hashstore.yaml' already exists." + "FileHashStore - put_properties: configuration file 'hashstore.yaml'" + + " already exists." ) - logging.error("FileHashStore - put_properties: %s", exception_string) + logging.error(exception_string) raise FileExistsError(exception_string) # Validate properties checked_properties = self._validate_properties(properties) @@ -196,6 +202,7 @@ def write_properties(self, properties): self.hashstore_configuration_yaml, "w", encoding="utf-8" ) as hashstore_yaml: hashstore_yaml.write(hashstore_configuration_yaml) + logging.debug( "FileHashStore - put_properties: Configuration file written to: %s", self.hashstore_configuration_yaml, @@ -284,21 +291,22 @@ def _verify_hashstore_properties(self, properties, prop_store_path): for key in self.property_required_keys: if hashstore_yaml_dict[key] != properties[key]: exception_string = ( - f"Given properties ({key}: {properties[key]}) does not match " - + f"HashStore configuration ({key}: {hashstore_yaml_dict[key]})" + f"FileHashStore - Given properties ({key}: {properties[key]}) does not" + + f" match. HashStore configuration ({key}: {hashstore_yaml_dict[key]})" + f"found at: {self.hashstore_configuration_yaml}" ) - logging.critical("FileHashStore - %s", exception_string) + logging.critical(exception_string) raise ValueError(exception_string) else: if os.path.exists(prop_store_path): # Check if HashStore exists and throw exception if found if any(Path(prop_store_path).iterdir()): exception_string = ( - f"HashStore directories and/or objects found at: {prop_store_path} but" - + f" missing configuration file at: {self.hashstore_configuration_yaml}." + "FileHashStore - HashStore directories and/or objects found at:" + + f" {prop_store_path} but missing configuration file at: " + + self.hashstore_configuration_yaml ) - logging.critical("FileHashStore - %s", exception_string) + logging.critical(exception_string) raise FileNotFoundError(exception_string) def _validate_properties(self, properties): @@ -316,21 +324,27 @@ def _validate_properties(self, properties): properties (dict): The given properties object (that has been validated). """ if not isinstance(properties, dict): - exception_string = "Invalid argument - expected a dictionary." - logging.debug("FileHashStore - _validate_properties: %s", exception_string) + exception_string = ( + "FileHashStore - _validate_properties: Invalid argument -" + + " expected a dictionary." + ) + logging.debug(exception_string) raise ValueError(exception_string) + for key in self.property_required_keys: if key not in properties: - exception_string = f"Missing required key: {key}." - logging.debug( - "FileHashStore - _validate_properties: %s", exception_string + exception_string = ( + "FileHashStore - _validate_properties: Missing required" + + f" key: {key}." ) + logging.debug(exception_string) raise KeyError(exception_string) if properties.get(key) is None: - exception_string = f"Value for key: {key} is none." 
- logging.debug( - "FileHashStore - _validate_properties: %s", exception_string + exception_string = ( + "FileHashStore - _validate_properties: Value for key:" + + f" {key} is none." ) + logging.debug(exception_string) raise ValueError(exception_string) return properties @@ -351,10 +365,11 @@ def lookup_algo(algo): return dataone_algo_translation[algo] if not os.path.exists(self.hashstore_configuration_yaml): - exception_string = "hashstore.yaml not found in store root path." - logging.critical( - "FileHashStore - set_default_algorithms: %s", exception_string + exception_string = ( + "FileHashStore - set_default_algorithms: hashstore.yaml not found" + + " in store root path." ) + logging.critical(exception_string) raise FileNotFoundError(exception_string) with open(self.hashstore_configuration_yaml, "r", encoding="utf-8") as file: yaml_data = yaml.safe_load(file) @@ -382,35 +397,13 @@ def store_object( "FileHashStore - store_object: Request to store object for pid: %s", pid ) # Validate input parameters - logging.debug("FileHashStore - store_object: Validating arguments.") self._is_string_none_or_empty(pid, "pid", "store_object") - if ( - not isinstance(data, str) - and not isinstance(data, Path) - and not isinstance(data, io.BufferedIOBase) - ): - exception_string = ( - "Data must be a path, string or buffered stream type." - + f" data type supplied: {type(data)}" - ) - logging.error("FileHashStore - store_object: %s", exception_string) - raise TypeError(exception_string) - if isinstance(data, str): - if data.replace(" ", "") == "": - exception_string = "Data string cannot be empty." - logging.error("FileHashStore - store_object: %s", exception_string) - raise TypeError(exception_string) - # Format additional algorithm if supplied - logging.debug( - "FileHashStore - store_object: Validating algorithm and checksum args." - ) - additional_algorithm_checked = None - if additional_algorithm != self.algorithm and additional_algorithm is not None: - # Set additional_algorithm - additional_algorithm_checked = self.clean_algorithm(additional_algorithm) - # Checksum and checksum_algorithm must both be supplied if one is supplied - checksum_algorithm_checked = self._validate_checksum_args( - checksum, checksum_algorithm + self._validate_data_to_store(data) + ( + additional_algorithm_checked, + checksum_algorithm_checked, + ) = self._validate_algorithms_and_checksum( + additional_algorithm, checksum, checksum_algorithm ) # Wait for the pid to release if it's in use @@ -451,33 +444,17 @@ def store_object( "FileHashStore - store_object: Successfully stored object for pid: %s", pid, ) + return hash_address def store_metadata(self, pid, metadata, format_id=None): logging.debug( "FileHashStore - store_metadata: Request to store metadata for pid: %s", pid ) - # Validate input parameters, begin with persistent identifier (pid) - logging.debug("FileHashStore - store_metadata: Validating arguments.") + # Validate input parameters self._is_string_none_or_empty(pid, "pid", "store_metadata") checked_format_id = self._validate_format_id(format_id, "store_metadata") - # Metadata content must be a str, path or stream and cannot be empty - if isinstance(metadata, str): - if metadata.replace(" ", "") == "": - exception_string = "Given string path to metadata cannot be empty." 
- logging.error("FileHashStore - store_metadata: %s", exception_string) - raise TypeError(exception_string) - if ( - not isinstance(metadata, str) - and not isinstance(metadata, Path) - and not isinstance(metadata, io.BufferedIOBase) - ): - exception_string = ( - "Metadata must be a path or string type, data type supplied: " - + {type(metadata)} - ) - logging.error("FileHashStore - store_metadata: %s", exception_string) - raise TypeError(exception_string) + self._validate_metadata_to_store(metadata) # Wait for the pid to release if it's in use while pid in self.metadata_locked_pids: @@ -513,6 +490,7 @@ def store_metadata(self, pid, metadata, format_id=None): "FileHashStore - store_metadata: Successfully stored metadata for pid: %s", pid, ) + return metadata_cid def retrieve_object(self, pid): @@ -525,6 +503,7 @@ def retrieve_object(self, pid): entity = "objects" object_cid = self.get_sha256_hex_digest(pid) object_exists = self.exists(entity, object_cid) + if object_exists: logging.debug( "FileHashStore - retrieve_object: Metadata exists for pid: %s, retrieving object.", @@ -532,12 +511,15 @@ def retrieve_object(self, pid): ) obj_stream = self.open(entity, object_cid) else: - exception_string = f"No object found for pid: {pid}" - logging.error("FileHashStore - retrieve_object: %s", exception_string) + exception_string = ( + f"FileHashStore - retrieve_object: No object found for pid: {pid}" + ) + logging.error(exception_string) raise ValueError(exception_string) logging.info( "FileHashStore - retrieve_object: Retrieved object for pid: %s", pid ) + return obj_stream def retrieve_metadata(self, pid, format_id=None): @@ -552,15 +534,14 @@ def retrieve_metadata(self, pid, format_id=None): metadata_cid = self.get_sha256_hex_digest(pid + checked_format_id) metadata_exists = self.exists(entity, metadata_cid) if metadata_exists: - logging.debug( - "FileHashStore - retrieve_metadata: Metadata exists for pid: %s", - pid + ", retrieving metadata.", - ) metadata_stream = self.open(entity, metadata_cid) else: - exception_string = f"No metadata found for pid: {pid}" - logging.error("FileHashStore - retrieve_metadata: %s", exception_string) + exception_string = ( + f"FileHashStore - retrieve_metadata: No metadata found for pid: {pid}" + ) + logging.error(exception_string) raise ValueError(exception_string) + logging.info( "FileHashStore - retrieve_metadata: Retrieved metadata for pid: %s", pid ) @@ -575,6 +556,7 @@ def delete_object(self, pid): entity = "objects" object_cid = self.get_sha256_hex_digest(pid) self.delete(entity, object_cid) + logging.info( "FileHashStore - delete_object: Successfully deleted object for pid: %s", pid, @@ -592,6 +574,7 @@ def delete_metadata(self, pid, format_id=None): entity = "metadata" metadata_cid = self.get_sha256_hex_digest(pid + checked_format_id) self.delete(entity, metadata_cid) + logging.info( "FileHashStore - delete_metadata: Successfully deleted metadata for pid: %s", pid, @@ -610,17 +593,19 @@ def get_hex_digest(self, pid, algorithm): algorithm = self.clean_algorithm(algorithm) object_cid = self.get_sha256_hex_digest(pid) if not self.exists(entity, object_cid): - exception_string = f"No object found for pid: {pid}" - logging.error("FileHashStore - get_hex_digest: %s", exception_string) + exception_string = ( + f"FileHashStore - get_hex_digest: No object found for pid: {pid}" + ) + logging.error(exception_string) raise ValueError(exception_string) cid_stream = self.open(entity, object_cid) hex_digest = self.computehash(cid_stream, algorithm=algorithm) - 
logging_info_statement = ( + info_msg = ( f"FileHashStore - get_hex_digest: Successfully calculated hex digest for pid: {pid}." + f" Hex Digest: {hex_digest}", ) - logging.info(logging_info_statement) + logging.info(info_msg) return hex_digest # FileHashStore Core Methods @@ -720,22 +705,24 @@ def _move_and_get_checksums( object_cid = self.get_sha256_hex_digest(pid) abs_file_path = self.build_abs_path(entity, object_cid, extension) self.create_path(os.path.dirname(abs_file_path)) - # Only put file if it doesn't exist + + # Only create tmp file to be moved if target destination doesn't exist if os.path.isfile(abs_file_path): - exception_string = f"File already exists for pid: {pid} at {abs_file_path}" - logging.error( - "FileHashStore - _move_and_get_checksums: %s", exception_string + exception_string = ( + "FileHashStore - _move_and_get_checksums: File already exists" + + f" for pid: {pid} at {abs_file_path}" ) + logging.error(exception_string) raise FileExistsError(exception_string) rel_file_path = os.path.relpath(abs_file_path, self.objects) # Create temporary file and calculate hex digests - debug_tmp_file_str = ( + debug_msg = ( "FileHashStore - _move_and_get_checksums: Creating temp" + f" file and calculating checksums for pid: {pid}" ) - logging.debug(debug_tmp_file_str) + logging.debug(debug_msg) hex_digests, tmp_file_name = self._mktempfile( stream, additional_algorithm, checksum_algorithm ) @@ -746,56 +733,48 @@ def _move_and_get_checksums( # Only move file if it doesn't exist. # Files are stored once and only once + is_object_duplicate = False if not os.path.isfile(abs_file_path): - is_duplicate = False - if checksum_algorithm is not None and checksum is not None: - hex_digest_stored = hex_digests[checksum_algorithm] - if hex_digest_stored != checksum: - self.delete(entity, tmp_file_name) - exception_string = ( - "Hex digest and checksum do not match - file not stored." - + f" Algorithm: {checksum_algorithm}." 
- + f" Checksum provided: {checksum} != Hex Digest: {hex_digest_stored}" - ) - logging.error( - "FileHashStore - _move_and_get_checksums: %s", exception_string - ) - raise ValueError(exception_string) + self._validate_object( + checksum, checksum_algorithm, entity, hex_digests, tmp_file_name + ) + try: - debug_move_tmp_file_str = ( + debug_msg = ( "FileHashStore - _move_and_get_checksums: Moving temp file to permanent" + f" location: {abs_file_path}", ) - logging.debug(debug_move_tmp_file_str) + logging.debug(debug_msg) shutil.move(tmp_file_name, abs_file_path) except Exception as err: # Revert storage process - exception_string = f"Unexpected {err=}, {type(err)=}" - logging.error( - "FileHashStore - _move_and_get_checksums: %s", exception_string + exception_string = ( + "FileHashStore - _move_and_get_checksums:" + + f" Unexpected {err=}, {type(err)=}" ) + logging.error(exception_string) if os.path.isfile(abs_file_path): # Check to see if object has moved successfully before deleting - debug_file_found_exception_str = ( + debug_msg = ( "FileHashStore - _move_and_get_checksums: Permanent file" + f" found during exception, checking hex digest for pid: {pid}" ) - logging.debug(debug_file_found_exception_str) + logging.debug(debug_msg) pid_checksum = self.get_hex_digest(pid, self.algorithm) if pid_checksum == hex_digests.get(self.algorithm): # If the checksums match, return and log warning - warning_file_stored_str = ( + warning_msg = ( "FileHashStore - _move_and_get_checksums: File moved" + f" successfully but unexpected issue encountered: {exception_string}", ) - logging.warning(warning_file_stored_str) + logging.warning(warning_msg) return else: - debug_file_incomplete_state_str = ( + debug_msg = ( "FileHashStore - _move_and_get_checksums: Permanent file" + f" found but with incomplete state, deleting file: {abs_file_path}", ) - logging.debug(debug_file_incomplete_state_str) + logging.debug(debug_msg) self.delete(entity, abs_file_path) logging.debug( "FileHashStore - _move_and_get_checksums: Deleting temporary file: %s", @@ -810,15 +789,21 @@ def _move_and_get_checksums( raise else: # Else delete temporary file - warning_duplicate_file_str = ( + warning_msg = ( f"FileHashStore - _move_and_get_checksums: Object exists at: {abs_file_path}," + " deleting temporary file." ) - logging.warning(warning_duplicate_file_str) - is_duplicate = True + logging.warning(warning_msg) + is_object_duplicate = True self.delete(entity, tmp_file_name) - return object_cid, rel_file_path, abs_file_path, is_duplicate, hex_digests + return ( + object_cid, + rel_file_path, + abs_file_path, + is_object_duplicate, + hex_digests, + ) def _mktempfile(self, stream, additional_algorithm=None, checksum_algorithm=None): """Create a named temporary file from a `Stream` object and return its filename @@ -835,9 +820,6 @@ def _mktempfile(self, stream, additional_algorithm=None, checksum_algorithm=None hex_digest_dict (dictionary): Algorithms and their hex digests. tmp.name: Name of temporary file created and written into. 
""" - algorithm_list_to_calculate = self.default_algo_list - - # Create temporary file in .../{store_path}/tmp tmp_root_path = self.get_store_path("objects") / "tmp" # Physically create directory if it doesn't exist if os.path.exists(tmp_root_path) is False: @@ -853,26 +835,9 @@ def _mktempfile(self, stream, additional_algorithm=None, checksum_algorithm=None os.umask(oldmask) # Additional hash objects to digest - if checksum_algorithm is not None: - self.clean_algorithm(checksum_algorithm) - if checksum_algorithm in self.other_algo_list: - debug_additional_other_algo_str = ( - f"FileHashStore - _mktempfile: checksum algorithm: {checksum_algorithm}" - + " found in other_algo_lists, adding to list of algorithms to calculate." - ) - logging.debug(debug_additional_other_algo_str) - algorithm_list_to_calculate.append(checksum_algorithm) - if additional_algorithm is not None: - self.clean_algorithm(additional_algorithm) - if additional_algorithm in self.other_algo_list: - debug_additional_other_algo_str = ( - f"FileHashStore - _mktempfile: additional algorithm: {additional_algorithm}" - + " found in other_algo_lists, adding to list of algorithms to calculate." - ) - logging.debug(debug_additional_other_algo_str) - algorithm_list_to_calculate.append(additional_algorithm) - # Remove duplicates - algorithm_list_to_calculate = set(algorithm_list_to_calculate) + algorithm_list_to_calculate = self._refine_algorithm_list( + additional_algorithm, checksum_algorithm + ) logging.debug( "FileHashStore - _mktempfile: tmp file created: %s, calculating hex digests.", @@ -939,8 +904,10 @@ def put_metadata(self, metadata, pid, format_id): ) return metadata_cid except Exception as err: - exception_string = f"Unexpected {err=}, {type(err)=}" - logging.error("FileHashStore - put_metadata: %s", exception_string) + exception_string = ( + f"FileHashStore - put_metadata: Unexpected {err=}, {type(err)=}" + ) + logging.error(exception_string) if os.path.exists(metadata_tmp): # Remove tmp metadata, calling app must re-upload logging.debug( @@ -948,16 +915,14 @@ def put_metadata(self, metadata, pid, format_id): pid, ) self.metadata.delete(metadata_tmp) - err_msg = f"Aborting store_metadata upload - unexpected error: {err}" - logging.error("FileHashStore - put_metadata: %s", err_msg) raise else: exception_string = ( - f"Attempt to move metadata for pid: {pid}" + f"FileHashStore - put_metadata: Attempt to move metadata for pid: {pid}" + f", but metadata temp file not found: {metadata_tmp}" ) - logging.error("FileHashStore - put_metadata: %s", exception_string) - raise FileNotFoundError() + logging.error(exception_string) + raise FileNotFoundError(exception_string) def _mktempmetadata(self, stream): """Create a named temporary file with `stream` (metadata) and `format_id`. @@ -1001,14 +966,47 @@ def _mktempmetadata(self, stream): # FileHashStore Utility & Supporting Methods - def _validate_checksum_args(self, checksum, checksum_algorithm): + def _validate_data_to_store(self, data): + """Evaluates a data argument to ensure that it is either a string, path or + stream object before attempting to store it. + + Args: + data (string, path, stream): object to validate + """ + if ( + not isinstance(data, str) + and not isinstance(data, Path) + and not isinstance(data, io.BufferedIOBase) + ): + exception_string = ( + "FileHashStore - store_object: Data must be a path, string or buffered" + + f" stream type. 
Data type supplied: {type(data)}"
+            )
+            logging.error(exception_string)
+            raise TypeError(exception_string)
+        if isinstance(data, str):
+            if data.replace(" ", "") == "":
+                exception_string = (
+                    "FileHashStore - store_object: Data string cannot be empty."
+                )
+                logging.error(exception_string)
+                raise TypeError(exception_string)
+
+    def _validate_algorithms_and_checksum(
+        self, additional_algorithm, checksum, checksum_algorithm
+    ):
+        """Determines whether calling app has supplied the necessary arguments to validate
+        an object with a checksum value
+
+        Args:
+            additional_algorithm: value of additional algorithm to calculate
+            checksum (string): value of checksum
+            checksum_algorithm (string): algorithm of checksum
+        """
+        additional_algorithm_checked = None
+        if additional_algorithm != self.algorithm and additional_algorithm is not None:
+            # Set additional_algorithm
+            additional_algorithm_checked = self.clean_algorithm(additional_algorithm)
         checksum_algorithm_checked = None
         if checksum is not None:
             self._is_string_none_or_empty(
                 checksum_algorithm,
                 "checksum_algorithm",
                 "validate_checksum_args (store_object)",
             )
         if checksum_algorithm is not None:
             self._is_string_none_or_empty(
                 checksum,
                 "checksum",
                 "validate_checksum_args (store_object)",
             )
         # Set checksum_algorithm
         checksum_algorithm_checked = self.clean_algorithm(checksum_algorithm)
-        return checksum_algorithm_checked
+        return additional_algorithm_checked, checksum_algorithm_checked
+
+    def _refine_algorithm_list(self, additional_algorithm, checksum_algorithm):
+        """Create the final list of hash algorithms to calculate
+
+        Args:
+            additional_algorithm (string)
+            checksum_algorithm (string)
+
+        Return:
+            algorithm_list_to_calculate (set): De-duplicated list of hash algorithms
+        """
+        algorithm_list_to_calculate = self.default_algo_list
+        if checksum_algorithm is not None:
+            self.clean_algorithm(checksum_algorithm)
+            if checksum_algorithm in self.other_algo_list:
+                debug_additional_other_algo_str = (
+                    f"FileHashStore - _mktempfile: checksum algorithm: {checksum_algorithm}"
+                    + " found in other_algo_lists, adding to list of algorithms to calculate."
+                )
+                logging.debug(debug_additional_other_algo_str)
+                algorithm_list_to_calculate.append(checksum_algorithm)
+        if additional_algorithm is not None:
+            self.clean_algorithm(additional_algorithm)
+            if additional_algorithm in self.other_algo_list:
+                debug_additional_other_algo_str = (
+                    f"FileHashStore - _mktempfile: additional algorithm: {additional_algorithm}"
+                    + " found in other_algo_lists, adding to list of algorithms to calculate."
+                )
+                logging.debug(debug_additional_other_algo_str)
+                algorithm_list_to_calculate.append(additional_algorithm)
+
+        # Remove duplicates
+        algorithm_list_to_calculate = set(algorithm_list_to_calculate)
+        return algorithm_list_to_calculate
+
+    def _validate_object(
+        self, checksum, checksum_algorithm, entity, hex_digests, tmp_file_name
+    ):
+        """Evaluates an object's integrity
+
+        Args:
+            checksum: Value of checksum
+            checksum_algorithm: Algorithm of checksum
+            entity: Type of object
+            hex_digests: Dictionary of hex digests to select from
+            tmp_file_name: Name of tmp file
+        """
+        if checksum_algorithm is not None and checksum is not None:
+            hex_digest_stored = hex_digests[checksum_algorithm]
+            if hex_digest_stored != checksum:
+                self.delete(entity, tmp_file_name)
+                exception_string = (
+                    "FileHashStore - _move_and_get_checksums: Hex digest and checksum"
+                    + f" do not match - file not stored. Algorithm: {checksum_algorithm}."
+ + f" Checksum provided: {checksum} != Hex Digest: {hex_digest_stored}" + ) + logging.error(exception_string) + raise ValueError(exception_string) + + def _validate_metadata_to_store(self, metadata): + """Evaluates a metadata argument to ensure that it is either a string, path or + stream object before attempting to store it. + + Args: + metadata (string, path, stream): metadata to validate + """ + if isinstance(metadata, str): + if metadata.replace(" ", "") == "": + exception_string = ( + "FileHashStore - store_metadata: Given string path to" + + " metadata cannot be empty." + ) + logging.error(exception_string) + raise TypeError(exception_string) + if ( + not isinstance(metadata, str) + and not isinstance(metadata, Path) + and not isinstance(metadata, io.BufferedIOBase) + ): + exception_string = ( + "FileHashStore - store_metadata: Metadata must be a path or string" + + f" type, data type supplied: {type(metadata)}" + ) + logging.error(exception_string) + raise TypeError(exception_string) def _validate_format_id(self, format_id, method): """Determines the metadata namespace (format_id) to use for storing, @@ -1072,8 +1155,11 @@ def clean_algorithm(self, algorithm_string): cleaned_string not in self.default_algo_list and cleaned_string not in self.other_algo_list ): - exception_string = f"Algorithm not supported: {cleaned_string}" - logging.error("FileHashStore: clean_algorithm: %s", exception_string) + exception_string = ( + "FileHashStore: clean_algorithm: Algorithm not supported:" + + cleaned_string + ) + logging.error(exception_string) raise ValueError(exception_string) return cleaned_string From d2962b20c695763f7fe2e09d158ac9f7af49099a Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 12 Jul 2023 13:14:06 -0700 Subject: [PATCH 031/165] Update HashStore interface doc string for 'retrieve_metadata' --- src/hashstore/hashstore.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index cac31905..e782d87d 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -100,8 +100,11 @@ def retrieve_object(self, pid): @abstractmethod def retrieve_metadata(self, pid, format_id): - """The 'retrieve_metadata' method retrieves the metadata content from disk and - returns it in the form of a String using a given persistent identifier and format_id. + """The 'retrieve_metadata' method retrieves the metadata object from disk using + a given persistent identifier (pid) and metadata namespace (format_id). + If the object exists (determined by calculating the metadata object's permanent + address using the SHA-256 hash of the given pid+format_id), the method will open + and return a buffered metadata stream ready to read from. 
Args: pid (string): Authority-based identifier From e1da0ec0c55e8d21360a881f4f8b060beaa3d60b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 12 Jul 2023 13:42:07 -0700 Subject: [PATCH 032/165] Standardized algorithm value to store in 'hashstore.yaml' for cross-language compatibility --- src/hashstore/filehashstore/filehashstore.py | 16 +++++++++++++--- tests/filehashstore/test_filehashstore.py | 4 +++- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/hashstore/filehashstore/filehashstore.py b/src/hashstore/filehashstore/filehashstore.py index 04d69aa2..81335830 100644 --- a/src/hashstore/filehashstore/filehashstore.py +++ b/src/hashstore/filehashstore/filehashstore.py @@ -86,6 +86,8 @@ def __init__(self, properties=None): # If no exceptions thrown, FileHashStore ready for initialization logging.debug("FileHashStore - Initializing, properties verified.") self.root = prop_store_path + if not os.path.exists(self.root): + self.create_path(self.root) self.depth = prop_store_depth self.width = prop_store_width self.algorithm = prop_store_algorithm @@ -103,8 +105,6 @@ def __init__(self, properties=None): # Complete initialization/instantiation by setting and creating store directories self.objects = self.root + "/objects" self.metadata = self.root + "/metadata" - if not os.path.exists(self.root): - self.create_path(self.root) if not os.path.exists(self.objects): self.create_path(self.objects + "/tmp") if not os.path.exists(self.metadata): @@ -189,6 +189,16 @@ def write_properties(self, properties): for property_name in self.property_required_keys ] + # Standardize algorithm value for cross-language compatibility + dataone_algo_translation = { + "md5": "MD5", + "sha1": "SHA-1", + "sha256": "SHA-256", + "sha384": "SHA-384", + "sha512": "SHA-512", + } + store_algorithm = dataone_algo_translation[store_algorithm] + # .yaml file to write hashstore_configuration_yaml = self._build_hashstore_yaml_string( store_path, @@ -293,7 +303,7 @@ def _verify_hashstore_properties(self, properties, prop_store_path): exception_string = ( f"FileHashStore - Given properties ({key}: {properties[key]}) does not" + f" match. HashStore configuration ({key}: {hashstore_yaml_dict[key]})" - + f"found at: {self.hashstore_configuration_yaml}" + + f" found at: {self.hashstore_configuration_yaml}" ) logging.critical(exception_string) raise ValueError(exception_string) diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index 09cf07b9..66923f7d 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -63,7 +63,9 @@ def test_load_properties(store): assert hashstore_yaml_dict.get("store_path") == store.root assert hashstore_yaml_dict.get("store_depth") == 3 assert hashstore_yaml_dict.get("store_width") == 2 - assert hashstore_yaml_dict.get("store_algorithm") == "sha256" + # Note, the store_algorithm from `hashstore.yaml` gets translated to a standardized value + # Ex. 
"sha256" is supplied but is written into the file as "SHA-256" + assert hashstore_yaml_dict.get("store_algorithm") == "SHA-256" assert ( hashstore_yaml_dict.get("store_metadata_namespace") == "http://ns.dataone.org/service/types/v2.0" From 4a90f3b1e27b136e6250b95e03f258aec5d8e0eb Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 12 Jul 2023 14:01:36 -0700 Subject: [PATCH 033/165] Refactored 'write_properties' to check cross-language compatibility for store_algorithm before writing 'hashstore.yaml' and added new pytest --- src/hashstore/filehashstore/filehashstore.py | 21 +++++++++++++++----- tests/conftest.py | 4 ++-- tests/test_hashstore.py | 17 ++++++++++++++++ 3 files changed, 35 insertions(+), 7 deletions(-) diff --git a/src/hashstore/filehashstore/filehashstore.py b/src/hashstore/filehashstore/filehashstore.py index 81335830..79a169bb 100644 --- a/src/hashstore/filehashstore/filehashstore.py +++ b/src/hashstore/filehashstore/filehashstore.py @@ -169,7 +169,7 @@ def write_properties(self, properties): # If hashstore.yaml already exists, must throw exception and proceed with caution if os.path.exists(self.hashstore_configuration_yaml): exception_string = ( - "FileHashStore - put_properties: configuration file 'hashstore.yaml'" + "FileHashStore - write_properties: configuration file 'hashstore.yaml'" + " already exists." ) logging.error(exception_string) @@ -190,21 +190,32 @@ def write_properties(self, properties): ] # Standardize algorithm value for cross-language compatibility - dataone_algo_translation = { + checked_store_algoritm = None + # Note, this must be declared here because HashStore has not yet been initialized + accepted_store_algorithms = { "md5": "MD5", "sha1": "SHA-1", "sha256": "SHA-256", "sha384": "SHA-384", "sha512": "SHA-512", } - store_algorithm = dataone_algo_translation[store_algorithm] + if store_algorithm in accepted_store_algorithms: + checked_store_algoritm = accepted_store_algorithms[store_algorithm] + else: + exception_string = ( + "FileHashStore - write_properties: algorithm supplied cannot" + + " be used as default for HashStore. Must be one of:" + + " md5, sha1, sha256, sha384, sha512" + ) + logging.error(exception_string) + raise ValueError(exception_string) # .yaml file to write hashstore_configuration_yaml = self._build_hashstore_yaml_string( store_path, store_depth, store_width, - store_algorithm, + checked_store_algoritm, store_metadata_namespace, ) # Write 'hashstore.yaml' @@ -214,7 +225,7 @@ def write_properties(self, properties): hashstore_yaml.write(hashstore_configuration_yaml) logging.debug( - "FileHashStore - put_properties: Configuration file written to: %s", + "FileHashStore - write_properties: Configuration file written to: %s", self.hashstore_configuration_yaml, ) return diff --git a/tests/conftest.py b/tests/conftest.py index d15e8875..e18826d7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -41,8 +41,8 @@ def init_store(props): @pytest.fixture(name="pids") def init_pids(): """Shared test harness data. 
- - object_cid: hex digest of the pid - - metadata_cid: hex digest of the pid + store_metadata_namespace + - object_cid: hex digest of the pid + - metadata_cid: hex digest of the pid + store_metadata_namespace """ test_pids = { "doi:10.18739/A2901ZH2M": { diff --git a/tests/test_hashstore.py b/tests/test_hashstore.py index 8bd6b44f..45344156 100644 --- a/tests/test_hashstore.py +++ b/tests/test_hashstore.py @@ -1,5 +1,6 @@ """Test module for HashStore (and HashStoreFactory)""" import pytest +import os from hashstore.filehashstore.filehashstore import FileHashStore from hashstore.hashstore_factory import HashStoreFactory @@ -39,3 +40,19 @@ def test_factory_get_hashstore_unsupported_module(factory): module_name = "hashstore.s3filestore.s3filestore" class_name = "FileHashStore" factory.get_hashstore(module_name, class_name) + + +def test_factory_get_hashstore_filehashstore_unsupported_algorithm(factory): + """Check factory creates instance of FileHashStore.""" + module_name = "hashstore.filehashstore.filehashstore" + class_name = "FileHashStore" + + properties = { + "store_path": os.getcwd() + "/metacat/test", + "store_depth": 3, + "store_width": 2, + "store_algorithm": "md2", + "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", + } + with pytest.raises(ValueError): + factory.get_hashstore(module_name, class_name, properties) From 8bab09a8b8c9d516134e16c0890c7bbf2eea4bef Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 12 Jul 2023 15:19:28 -0700 Subject: [PATCH 034/165] Refactored initialization process to require a DataONE controlled 'store_algorithm' property value and add new pytests --- src/hashstore/filehashstore/filehashstore.py | 14 ++++------- tests/conftest.py | 2 +- tests/filehashstore/test_filehashstore.py | 26 ++++++++++++++++++++ 3 files changed, 32 insertions(+), 10 deletions(-) diff --git a/src/hashstore/filehashstore/filehashstore.py b/src/hashstore/filehashstore/filehashstore.py index 79a169bb..b09f2e1a 100644 --- a/src/hashstore/filehashstore/filehashstore.py +++ b/src/hashstore/filehashstore/filehashstore.py @@ -192,20 +192,15 @@ def write_properties(self, properties): # Standardize algorithm value for cross-language compatibility checked_store_algoritm = None # Note, this must be declared here because HashStore has not yet been initialized - accepted_store_algorithms = { - "md5": "MD5", - "sha1": "SHA-1", - "sha256": "SHA-256", - "sha384": "SHA-384", - "sha512": "SHA-512", - } + accepted_store_algorithms = ["MD5", "SHA-1", "SHA-256", "SHA-384", "SHA-512"] if store_algorithm in accepted_store_algorithms: - checked_store_algoritm = accepted_store_algorithms[store_algorithm] + checked_store_algoritm = store_algorithm else: exception_string = ( "FileHashStore - write_properties: algorithm supplied cannot" + " be used as default for HashStore. 
Must be one of:" - + " md5, sha1, sha256, sha384, sha512" + + " MD5, SHA-1, SHA-256, SHA-384, SHA-512 which are DataONE" + + " controlled algorithm values" ) logging.error(exception_string) raise ValueError(exception_string) @@ -395,6 +390,7 @@ def lookup_algo(algo): with open(self.hashstore_configuration_yaml, "r", encoding="utf-8") as file: yaml_data = yaml.safe_load(file) + # Takes DataOne controlled algorithm values and translates to hashlib supported values yaml_store_default_algo_list = yaml_data["store_default_algo_list"] translated_default_algo_list = [] for algo in yaml_store_default_algo_list: diff --git a/tests/conftest.py b/tests/conftest.py index e18826d7..a6f0f3d7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -25,7 +25,7 @@ def init_props(tmp_path): "store_path": hashstore_path, "store_depth": 3, "store_width": 2, - "store_algorithm": "sha256", + "store_algorithm": "SHA-256", "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", } return properties diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index 66923f7d..26a64e2f 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -20,6 +20,32 @@ def test_init_directories_created(store): assert os.path.exists(store.metadata + "/tmp") +def test_init_existing_store_incorrect_algorithm_format(store): + """Confirm that exception is thrown when store_algorithm is not a DataONE controlled value""" + properties = { + "store_path": store.root, + "store_depth": 3, + "store_width": 2, + "store_algorithm": "sha256", + "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", + } + with pytest.raises(ValueError): + FileHashStore(properties) + + +def test_init_existing_store_correct_algorithm_format(store): + """Confirm second instance of HashStore with DataONE controlled value""" + properties = { + "store_path": store.root, + "store_depth": 3, + "store_width": 2, + "store_algorithm": "SHA-256", + "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", + } + hashstore_instance = FileHashStore(properties) + assert isinstance(hashstore_instance, FileHashStore) + + def test_init_write_properties_hashstore_yaml_exists(store): """Verify properties file present in store root directory.""" assert os.path.exists(store.hashstore_configuration_yaml) From a89390badbb6f3ea5840ad5aa8939e44578e5c76 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 12 Jul 2023 15:27:47 -0700 Subject: [PATCH 035/165] Add new pytest to check initialization with incorrectly formatted algorithm value and clean up existing init tests for accuracy --- tests/filehashstore/test_filehashstore.py | 10 +++++----- tests/test_hashstore.py | 18 +++++++++++++++++- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index 26a64e2f..5859c16f 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -76,7 +76,7 @@ def test_init_with_existing_hashstore_missing_yaml(store, pids): "store_path": store.root, "store_depth": 3, "store_width": 2, - "store_algorithm": "sha256", + "store_algorithm": "SHA-256", "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", } with pytest.raises(FileNotFoundError): @@ -90,7 +90,7 @@ def test_load_properties(store): assert hashstore_yaml_dict.get("store_depth") == 3 assert hashstore_yaml_dict.get("store_width") == 2 # Note, the store_algorithm from 
`hashstore.yaml` gets translated to a standardized value
-    # Ex. "sha256" is supplied but is written into the file as "SHA-256"
+    # Ex. "SHA-256" is supplied and is written into the file as "SHA-256"
     assert hashstore_yaml_dict.get("store_algorithm") == "SHA-256"
     assert (
         hashstore_yaml_dict.get("store_metadata_namespace")
@@ -111,7 +111,7 @@ def test_validate_properties(store):
         "store_path": "/etc/test",
         "store_depth": 3,
         "store_width": 2,
-        "store_algorithm": "sha256",
+        "store_algorithm": "SHA-256",
         "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0",
     }
     # pylint: disable=W0212
@@ -124,7 +124,7 @@ def test_validate_properties_missing_key(store):
         "store_path": "/etc/test",
         "store_depth": 3,
         "store_width": 2,
-        "store_algorithm": "sha256",
+        "store_algorithm": "SHA-256",
     }
     with pytest.raises(KeyError):
         # pylint: disable=W0212
@@ -137,7 +137,7 @@ def test_validate_properties_key_value_is_none(store):
         "store_path": "/etc/test",
         "store_depth": 3,
         "store_width": 2,
-        "store_algorithm": "sha256",
+        "store_algorithm": "SHA-256",
         "store_metadata_namespace": None,
     }
     with pytest.raises(ValueError):
diff --git a/tests/test_hashstore.py b/tests/test_hashstore.py
index 45344156..14aae669 100644
--- a/tests/test_hashstore.py
+++ b/tests/test_hashstore.py
@@ -43,7 +43,7 @@ def test_factory_get_hashstore_unsupported_module(factory):
 
 
 def test_factory_get_hashstore_filehashstore_unsupported_algorithm(factory):
-    """Check factory creates instance of FileHashStore."""
+    """Check factory raises exception with a store algorithm value that is not part of the default list."""
     module_name = "hashstore.filehashstore.filehashstore"
     class_name = "FileHashStore"
 
@@ -56,3 +56,19 @@ def test_factory_get_hashstore_filehashstore_unsupported_algorithm(factory):
     }
     with pytest.raises(ValueError):
         factory.get_hashstore(module_name, class_name, properties)
+
+
+def test_factory_get_hashstore_filehashstore_incorrect_algorithm_format(factory):
+    """Check factory raises exception with an incorrectly formatted algorithm value."""
+    module_name = "hashstore.filehashstore.filehashstore"
+    class_name = "FileHashStore"
+
+    properties = {
+        "store_path": os.getcwd() + "/metacat/test",
+        "store_depth": 3,
+        "store_width": 2,
+        "store_algorithm": "sha256",
+        "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0",
+    }
+    with pytest.raises(ValueError):
+        factory.get_hashstore(module_name, class_name, properties)

From 7e8ea6260afed4fa86c798afcfd05dac054a05a1 Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Thu, 13 Jul 2023 09:37:46 -0700
Subject: [PATCH 036/165] Refactor HashAddress into HashStore module, update pytests and update poetry dependencies

---
 poetry.lock                                  | 75 +++++++++++++-------
 src/hashstore/__init__.py                    |  3 +-
 src/hashstore/filehashstore/filehashstore.py |  3 +-
 src/hashstore/hashaddress.py                 | 27 -------
 src/hashstore/hashstore.py                   | 26 +++++++
 tests/test_hashaddress.py                    | 27 -------
 tests/test_hashstore.py                      | 26 +++++++
 7 files changed, 103 insertions(+), 84 deletions(-)
 delete mode 100644 src/hashstore/hashaddress.py
 delete mode 100644 tests/test_hashaddress.py

diff --git a/poetry.lock b/poetry.lock
index a1f53156..338b1304 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,14 +1,15 @@
-# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
 
 [[package]]
 name = "astroid"
-version = "2.15.5"
+version = "2.15.6"
 description = "An abstract syntax tree for Python with inference support."
+category = "dev" optional = false python-versions = ">=3.7.2" files = [ - {file = "astroid-2.15.5-py3-none-any.whl", hash = "sha256:078e5212f9885fa85fbb0cf0101978a336190aadea6e13305409d099f71b2324"}, - {file = "astroid-2.15.5.tar.gz", hash = "sha256:1039262575027b441137ab4a62a793a9b43defb42c32d5670f38686207cd780f"}, + {file = "astroid-2.15.6-py3-none-any.whl", hash = "sha256:389656ca57b6108f939cf5d2f9a2a825a3be50ba9d589670f393236e0a03b91c"}, + {file = "astroid-2.15.6.tar.gz", hash = "sha256:903f024859b7c7687d7a7f3a3f73b17301f8e42dfd9cc9df9d4418172d3e2dbd"}, ] [package.dependencies] @@ -23,6 +24,7 @@ wrapt = [ name = "black" version = "22.12.0" description = "The uncompromising code formatter." +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -56,13 +58,14 @@ uvloop = ["uvloop (>=0.15.2)"] [[package]] name = "click" -version = "8.1.3" +version = "8.1.5" description = "Composable command line interface toolkit" +category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"}, - {file = "click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"}, + {file = "click-8.1.5-py3-none-any.whl", hash = "sha256:e576aa487d679441d7d30abb87e1b43d24fc53bffb8758443b1a9e1cee504548"}, + {file = "click-8.1.5.tar.gz", hash = "sha256:4be4b1af8d665c6d942909916d31a213a106800c47d0eeba73d34da3cbc11367"}, ] [package.dependencies] @@ -72,6 +75,7 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} name = "colorama" version = "0.4.6" description = "Cross-platform colored terminal text." +category = "dev" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" files = [ @@ -83,6 +87,7 @@ files = [ name = "dill" version = "0.3.6" description = "serialize all of python" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -95,13 +100,14 @@ graph = ["objgraph (>=1.7.2)"] [[package]] name = "exceptiongroup" -version = "1.1.1" +version = "1.1.2" description = "Backport of PEP 654 (exception groups)" +category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "exceptiongroup-1.1.1-py3-none-any.whl", hash = "sha256:232c37c63e4f682982c8b6459f33a8981039e5fb8756b2074364e5055c498c9e"}, - {file = "exceptiongroup-1.1.1.tar.gz", hash = "sha256:d484c3090ba2889ae2928419117447a14daf3c1231d5e30d0aae34f354f01785"}, + {file = "exceptiongroup-1.1.2-py3-none-any.whl", hash = "sha256:e346e69d186172ca7cf029c8c1d16235aa0e04035e5750b4b95039e65204328f"}, + {file = "exceptiongroup-1.1.2.tar.gz", hash = "sha256:12c3e887d6485d16943a309616de20ae5582633e0a2eda17f4e10fd61c1e8af5"}, ] [package.extras] @@ -111,6 +117,7 @@ test = ["pytest (>=6)"] name = "iniconfig" version = "2.0.0" description = "brain-dead simple config-ini parsing" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -122,6 +129,7 @@ files = [ name = "isort" version = "5.12.0" description = "A Python utility / library to sort Python imports." +category = "dev" optional = false python-versions = ">=3.8.0" files = [ @@ -139,6 +147,7 @@ requirements-deprecated-finder = ["pip-api", "pipreqs"] name = "lazy-object-proxy" version = "1.9.0" description = "A fast and thorough lazy object proxy." 
+category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -184,6 +193,7 @@ files = [ name = "mccabe" version = "0.7.0" description = "McCabe checker, plugin for flake8" +category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -195,6 +205,7 @@ files = [ name = "mypy-extensions" version = "1.0.0" description = "Type system extensions for programs checked with the mypy type checker." +category = "dev" optional = false python-versions = ">=3.5" files = [ @@ -206,6 +217,7 @@ files = [ name = "packaging" version = "23.1" description = "Core utilities for Python packages" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -217,6 +229,7 @@ files = [ name = "pathlib" version = "1.0.1" description = "Object-oriented filesystem paths" +category = "main" optional = false python-versions = "*" files = [ @@ -228,6 +241,7 @@ files = [ name = "pathspec" version = "0.11.1" description = "Utility library for gitignore style pattern matching of file paths." +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -237,28 +251,30 @@ files = [ [[package]] name = "platformdirs" -version = "3.5.1" +version = "3.8.1" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." +category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "platformdirs-3.5.1-py3-none-any.whl", hash = "sha256:e2378146f1964972c03c085bb5662ae80b2b8c06226c54b2ff4aa9483e8a13a5"}, - {file = "platformdirs-3.5.1.tar.gz", hash = "sha256:412dae91f52a6f84830f39a8078cecd0e866cb72294a5c66808e74d5e88d251f"}, + {file = "platformdirs-3.8.1-py3-none-any.whl", hash = "sha256:cec7b889196b9144d088e4c57d9ceef7374f6c39694ad1577a0aab50d27ea28c"}, + {file = "platformdirs-3.8.1.tar.gz", hash = "sha256:f87ca4fcff7d2b0f81c6a748a77973d7af0f4d526f98f308477c3c436c74d528"}, ] [package.extras] -docs = ["furo (>=2023.3.27)", "proselint (>=0.13)", "sphinx (>=6.2.1)", "sphinx-autodoc-typehints (>=1.23,!=1.23.4)"] -test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest-cov (>=4)", "pytest-mock (>=3.10)"] +docs = ["furo (>=2023.5.20)", "proselint (>=0.13)", "sphinx (>=7.0.1)", "sphinx-autodoc-typehints (>=1.23,!=1.23.4)"] +test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest-cov (>=4.1)", "pytest-mock (>=3.10)"] [[package]] name = "pluggy" -version = "1.0.0" +version = "1.2.0" description = "plugin and hook calling mechanisms for python" +category = "dev" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" files = [ - {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, - {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, + {file = "pluggy-1.2.0-py3-none-any.whl", hash = "sha256:c2fd55a7d7a3863cba1a013e4e2414658b1d07b6bc57b3919e0c63c9abb99849"}, + {file = "pluggy-1.2.0.tar.gz", hash = "sha256:d12f0c4b579b15f5e054301bb226ee85eeeba08ffec228092f8defbaa3a4c4b3"}, ] [package.extras] @@ -269,6 +285,7 @@ testing = ["pytest", "pytest-benchmark"] name = "pylint" version = "2.17.4" description = "python code static checker" +category = "dev" optional = false python-versions = ">=3.7.2" files = [ @@ -296,13 +313,14 @@ testutils = ["gitpython (>3)"] [[package]] name = "pytest" -version = "7.3.1" +version = "7.4.0" description = "pytest: simple powerful testing with Python" +category = "dev" optional = false python-versions = 
">=3.7" files = [ - {file = "pytest-7.3.1-py3-none-any.whl", hash = "sha256:3799fa815351fea3a5e96ac7e503a96fa51cc9942c3753cda7651b93c1cfa362"}, - {file = "pytest-7.3.1.tar.gz", hash = "sha256:434afafd78b1d78ed0addf160ad2b77a30d35d4bdf8af234fe621919d9ed15e3"}, + {file = "pytest-7.4.0-py3-none-any.whl", hash = "sha256:78bf16451a2eb8c7a2ea98e32dc119fd2aa758f1d5d66dbf0a59d69a3969df32"}, + {file = "pytest-7.4.0.tar.gz", hash = "sha256:b4bf8c45bd59934ed84001ad51e11b4ee40d40a1229d2c79f9c592b0a3f6bd8a"}, ] [package.dependencies] @@ -314,12 +332,13 @@ pluggy = ">=0.12,<2.0" tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] -testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] +testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] [[package]] name = "pyyaml" version = "6.0" description = "YAML parser and emitter for Python" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -369,6 +388,7 @@ files = [ name = "tomli" version = "2.0.1" description = "A lil' TOML parser" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -380,6 +400,7 @@ files = [ name = "tomlkit" version = "0.11.8" description = "Style preserving TOML library" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -389,19 +410,21 @@ files = [ [[package]] name = "typing-extensions" -version = "4.6.3" +version = "4.7.1" description = "Backported and Experimental Type Hints for Python 3.7+" +category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "typing_extensions-4.6.3-py3-none-any.whl", hash = "sha256:88a4153d8505aabbb4e13aacb7c486c2b4a33ca3b3f807914a9b4c844c471c26"}, - {file = "typing_extensions-4.6.3.tar.gz", hash = "sha256:d91d5919357fe7f681a9f2b5b4cb2a5f1ef0a1e9f59c4d8ff0d3491e05c0ffd5"}, + {file = "typing_extensions-4.7.1-py3-none-any.whl", hash = "sha256:440d5dd3af93b060174bf433bccd69b0babc3b15b1a8dca43789fd7f61514b36"}, + {file = "typing_extensions-4.7.1.tar.gz", hash = "sha256:b75ddc264f0ba5615db7ba217daeb99701ad295353c45f9e95963337ceeeffb2"}, ] [[package]] name = "wrapt" version = "1.15.0" description = "Module for decorators, wrappers and monkey patching." 
+category = "dev" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" files = [ diff --git a/src/hashstore/__init__.py b/src/hashstore/__init__.py index 11db5e95..1e07f875 100644 --- a/src/hashstore/__init__.py +++ b/src/hashstore/__init__.py @@ -15,8 +15,7 @@ their persistent identifier (PID) """ -from hashstore.hashstore import HashStore -from hashstore.hashaddress import HashAddress +from hashstore.hashstore import HashStore, HashAddress from hashstore.hashstore_factory import HashStoreFactory __all__ = ("HashStore", "HashAddress", "HashStoreFactory") diff --git a/src/hashstore/filehashstore/filehashstore.py b/src/hashstore/filehashstore/filehashstore.py index b09f2e1a..250224c9 100644 --- a/src/hashstore/filehashstore/filehashstore.py +++ b/src/hashstore/filehashstore/filehashstore.py @@ -10,8 +10,7 @@ from contextlib import closing from tempfile import NamedTemporaryFile import yaml -from hashstore import HashStore -from hashstore.hashaddress import HashAddress +from hashstore import HashStore, HashAddress class FileHashStore(HashStore): diff --git a/src/hashstore/hashaddress.py b/src/hashstore/hashaddress.py deleted file mode 100644 index 71cbac48..00000000 --- a/src/hashstore/hashaddress.py +++ /dev/null @@ -1,27 +0,0 @@ -"""HashAddress must be returned for all HashStore implementations""" -from collections import namedtuple - - -class HashAddress( - namedtuple( - "HashAddress", ["id", "relpath", "abspath", "is_duplicate", "hex_digests"] - ) -): - """File address containing file's path on disk and its content hash ID. - - Args: - ab_id (str): Hash ID (hexdigest) of file contents. - relpath (str): Relative path location to :attr:`HashFS.root`. - abspath (str): Absolute path location of file on disk. - is_duplicate (boolean, optional): Whether the hash address created was - a duplicate of a previously existing file. Can only be ``True`` - after a put operation. Defaults to ``False``. - hex_digests (dict, optional): A list of hex digests to validate objects - (md5, sha1, sha256, sha384, sha512) - """ - - # Default value to prevent dangerous default value - def __new__(cls, ab_id, relpath, abspath, is_duplicate=False, hex_digests=None): - return super(HashAddress, cls).__new__( - cls, ab_id, relpath, abspath, is_duplicate, hex_digests - ) diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index e782d87d..84adb3ae 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -1,5 +1,6 @@ """Hashstore Interface""" from abc import ABC, abstractmethod +from collections import namedtuple import importlib.metadata @@ -155,3 +156,28 @@ def get_hex_digest(self, pid, algorithm): hex_digest (string): Hex digest of the object. """ raise NotImplementedError() + + +class HashAddress( + namedtuple( + "HashAddress", ["id", "relpath", "abspath", "is_duplicate", "hex_digests"] + ) +): + """File address containing file's path on disk and its content hash ID. + + Args: + ab_id (str): Hash ID (hexdigest) of file contents. + relpath (str): Relative path location to :attr:`HashFS.root`. + abspath (str): Absolute path location of file on disk. + is_duplicate (boolean, optional): Whether the hash address created was + a duplicate of a previously existing file. Can only be ``True`` + after a put operation. Defaults to ``False``. 
+ hex_digests (dict, optional): A list of hex digests to validate objects + (md5, sha1, sha256, sha384, sha512) + """ + + # Default value to prevent dangerous default value + def __new__(cls, ab_id, relpath, abspath, is_duplicate=False, hex_digests=None): + return super(HashAddress, cls).__new__( + cls, ab_id, relpath, abspath, is_duplicate, hex_digests + ) diff --git a/tests/test_hashaddress.py b/tests/test_hashaddress.py deleted file mode 100644 index b7ea6971..00000000 --- a/tests/test_hashaddress.py +++ /dev/null @@ -1,27 +0,0 @@ -"""Test module for HashAddress""" -from hashstore.hashaddress import HashAddress - - -def test_hashaddress(): - """Test class returns correct values via dot notation.""" - ab_id = "hashstoretest" - rel_path = "rel/path/to/object" - abs_path = "abs/path/to/object" - is_duplicate = "false" - hex_digest_dict = { - "md5": "md5value", - "sha1": "sha1value", - "sha224": "sha224value", - "sha256": "sha256value", - "sha512": "sha512value", - } - hash_address = HashAddress(ab_id, rel_path, abs_path, is_duplicate, hex_digest_dict) - assert hash_address.id == ab_id - assert hash_address.relpath == rel_path - assert hash_address.abspath == abs_path - assert hash_address.is_duplicate == is_duplicate - assert hash_address.hex_digests.get("md5") == hex_digest_dict["md5"] - assert hash_address.hex_digests.get("sha1") == hex_digest_dict["sha1"] - assert hash_address.hex_digests.get("sha224") == hex_digest_dict["sha224"] - assert hash_address.hex_digests.get("sha256") == hex_digest_dict["sha256"] - assert hash_address.hex_digests.get("sha512") == hex_digest_dict["sha512"] diff --git a/tests/test_hashstore.py b/tests/test_hashstore.py index 14aae669..350dfa9e 100644 --- a/tests/test_hashstore.py +++ b/tests/test_hashstore.py @@ -1,6 +1,7 @@ """Test module for HashStore (and HashStoreFactory)""" import pytest import os +from hashstore.hashstore import HashAddress from hashstore.filehashstore.filehashstore import FileHashStore from hashstore.hashstore_factory import HashStoreFactory @@ -72,3 +73,28 @@ def test_factory_get_hashstore_filehashstore_incorrect_algorithm_format(factory) } with pytest.raises(ValueError): factory.get_hashstore(module_name, class_name, properties) + + +def test_hashaddress(): + """Test class returns correct values via dot notation.""" + ab_id = "hashstoretest" + rel_path = "rel/path/to/object" + abs_path = "abs/path/to/object" + is_duplicate = "false" + hex_digest_dict = { + "md5": "md5value", + "sha1": "sha1value", + "sha224": "sha224value", + "sha256": "sha256value", + "sha512": "sha512value", + } + hash_address = HashAddress(ab_id, rel_path, abs_path, is_duplicate, hex_digest_dict) + assert hash_address.id == ab_id + assert hash_address.relpath == rel_path + assert hash_address.abspath == abs_path + assert hash_address.is_duplicate == is_duplicate + assert hash_address.hex_digests.get("md5") == hex_digest_dict["md5"] + assert hash_address.hex_digests.get("sha1") == hex_digest_dict["sha1"] + assert hash_address.hex_digests.get("sha224") == hex_digest_dict["sha224"] + assert hash_address.hex_digests.get("sha256") == hex_digest_dict["sha256"] + assert hash_address.hex_digests.get("sha512") == hex_digest_dict["sha512"] From 596d7b33fd72a31011bba4cba6b46000dad1474e Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 13 Jul 2023 09:40:10 -0700 Subject: [PATCH 037/165] Refactor HashStoreFactory into HashStore module and update pytests --- src/hashstore/__init__.py | 3 +- src/hashstore/hashstore.py | 50 ++++++++++++++++++++++++++++ 
src/hashstore/hashstore_factory.py | 52 ------------------------------ tests/test_hashstore.py | 7 ++-- 4 files changed, 54 insertions(+), 58 deletions(-) delete mode 100644 src/hashstore/hashstore_factory.py diff --git a/src/hashstore/__init__.py b/src/hashstore/__init__.py index 1e07f875..5e58af70 100644 --- a/src/hashstore/__init__.py +++ b/src/hashstore/__init__.py @@ -15,7 +15,6 @@ their persistent identifier (PID) """ -from hashstore.hashstore import HashStore, HashAddress -from hashstore.hashstore_factory import HashStoreFactory +from hashstore.hashstore import HashStore, HashStoreFactory, HashAddress __all__ = ("HashStore", "HashAddress", "HashStoreFactory") diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 84adb3ae..ec15af06 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -158,6 +158,56 @@ def get_hex_digest(self, pid, algorithm): raise NotImplementedError() +class HashStoreFactory: + """A factory class for creating `HashStore`-like objects (classes + that implement the 'HashStore' abstract methods) + + This factory class provides a method to retrieve a `HashStore` object + based on a given module (ex. "hashstore.filehashstore.filehashstore") + and class name (ex. "FileHashStore"). + """ + + @staticmethod + def get_hashstore(module_name, class_name, properties=None): + """Get a `HashStore`-like object based on the specified `module_name` and `class_name`. + + Args: + module_name (str): Name of package (ex. "hashstore.filehashstore.filehashstore") \n + class_name (str): Name of class in the given module (ex. "FileHashStore") \n + properties (dict, optional): Desired HashStore properties, if 'None', default values + will be used. \n + Example Properties Dictionary: + { + "store_path": "var/metacat",\n + "store_depth": 3,\n + "store_width": 2,\n + "store_algorithm": "sha256",\n + "store_sysmeta_namespace": "http://ns.dataone.org/service/types/v2.0"\n + } + + Returns: + HashStore: A hash store object based on the given `module_name` and `class_name` + + Raises: + ModuleNotFoundError: If module is not found + AttributeError: If class does not exist within the module + """ + # Validate module + if importlib.util.find_spec(module_name) is None: + raise ModuleNotFoundError(f"No module found for '{module_name}'") + + # Get HashStore + imported_module = importlib.import_module(module_name) + + # If class is not part of module, raise error + if hasattr(imported_module, class_name): + hashstore_class = getattr(imported_module, class_name) + return hashstore_class(properties=properties) + raise AttributeError( + f"Class name '{class_name}' is not an attribute of module '{module_name}'" + ) + + class HashAddress( namedtuple( "HashAddress", ["id", "relpath", "abspath", "is_duplicate", "hex_digests"] diff --git a/src/hashstore/hashstore_factory.py b/src/hashstore/hashstore_factory.py deleted file mode 100644 index bcdeff0c..00000000 --- a/src/hashstore/hashstore_factory.py +++ /dev/null @@ -1,52 +0,0 @@ -"""Core module for HashStore Factory""" -import importlib - - -class HashStoreFactory: - """A factory class for creating `HashStore`-like objects (classes - that implement the 'HashStore' abstract methods) - - This factory class provides a method to retrieve a `HashStore` object - based on a given module (ex. "hashstore.filehashstore.filehashstore") - and class name (ex. "FileHashStore"). 
- """ - - @staticmethod - def get_hashstore(module_name, class_name, properties=None): - """Get a `HashStore`-like object based on the specified `module_name` and `class_name`. - - Args: - module_name (str): Name of package (ex. "hashstore.filehashstore.filehashstore") \n - class_name (str): Name of class in the given module (ex. "FileHashStore") \n - properties (dict, optional): Desired HashStore properties, if 'None', default values - will be used. \n - Example Properties Dictionary: - { - "store_path": "var/metacat",\n - "store_depth": 3,\n - "store_width": 2,\n - "store_algorithm": "sha256",\n - "store_sysmeta_namespace": "http://ns.dataone.org/service/types/v2.0"\n - } - - Returns: - HashStore: A hash store object based on the given `module_name` and `class_name` - - Raises: - ModuleNotFoundError: If module is not found - AttributeError: If class does not exist within the module - """ - # Validate module - if importlib.util.find_spec(module_name) is None: - raise ModuleNotFoundError(f"No module found for '{module_name}'") - - # Get HashStore - imported_module = importlib.import_module(module_name) - - # If class is not part of module, raise error - if hasattr(imported_module, class_name): - hashstore_class = getattr(imported_module, class_name) - return hashstore_class(properties=properties) - raise AttributeError( - f"Class name '{class_name}' is not an attribute of module '{module_name}'" - ) diff --git a/tests/test_hashstore.py b/tests/test_hashstore.py index 350dfa9e..a16fa422 100644 --- a/tests/test_hashstore.py +++ b/tests/test_hashstore.py @@ -1,9 +1,8 @@ -"""Test module for HashStore (and HashStoreFactory)""" -import pytest +"""Test module for HashStore Module""" import os -from hashstore.hashstore import HashAddress +import pytest +from hashstore.hashstore import HashAddress, HashStoreFactory from hashstore.filehashstore.filehashstore import FileHashStore -from hashstore.hashstore_factory import HashStoreFactory @pytest.fixture(name="factory") From 2a20e53d77c36f66a570a18e08b37afe9682ac8c Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 13 Jul 2023 09:50:53 -0700 Subject: [PATCH 038/165] Move FileHashStore module into HashStore directory and update pytests --- src/hashstore/{filehashstore => }/filehashstore.py | 0 src/hashstore/hashstore.py | 2 +- tests/conftest.py | 2 +- tests/{filehashstore => }/test_filehashstore.py | 2 +- .../test_filehashstore_interface.py | 0 .../test_stream.py => test_filehashstore_stream.py} | 2 +- tests/test_hashstore.py | 10 +++++----- 7 files changed, 9 insertions(+), 9 deletions(-) rename src/hashstore/{filehashstore => }/filehashstore.py (100%) rename tests/{filehashstore => }/test_filehashstore.py (99%) rename tests/{filehashstore => }/test_filehashstore_interface.py (100%) rename tests/{filehashstore/test_stream.py => test_filehashstore_stream.py} (96%) diff --git a/src/hashstore/filehashstore/filehashstore.py b/src/hashstore/filehashstore.py similarity index 100% rename from src/hashstore/filehashstore/filehashstore.py rename to src/hashstore/filehashstore.py diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index ec15af06..f46e1253 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -172,7 +172,7 @@ def get_hashstore(module_name, class_name, properties=None): """Get a `HashStore`-like object based on the specified `module_name` and `class_name`. Args: - module_name (str): Name of package (ex. "hashstore.filehashstore.filehashstore") \n + module_name (str): Name of package (ex. 
"hashstore.filehashstore") \n class_name (str): Name of class in the given module (ex. "FileHashStore") \n properties (dict, optional): Desired HashStore properties, if 'None', default values will be used. \n diff --git a/tests/conftest.py b/tests/conftest.py index a6f0f3d7..e4e0635c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,6 @@ """Pytest overall configuration file for fixtures""" import pytest -from hashstore.filehashstore.filehashstore import FileHashStore +from hashstore.filehashstore import FileHashStore def pytest_addoption(parser): diff --git a/tests/filehashstore/test_filehashstore.py b/tests/test_filehashstore.py similarity index 99% rename from tests/filehashstore/test_filehashstore.py rename to tests/test_filehashstore.py index 5859c16f..049a8f7f 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -3,7 +3,7 @@ import os from pathlib import Path import pytest -from hashstore.filehashstore.filehashstore import FileHashStore +from hashstore.filehashstore import FileHashStore def test_pids_length(pids): diff --git a/tests/filehashstore/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py similarity index 100% rename from tests/filehashstore/test_filehashstore_interface.py rename to tests/test_filehashstore_interface.py diff --git a/tests/filehashstore/test_stream.py b/tests/test_filehashstore_stream.py similarity index 96% rename from tests/filehashstore/test_stream.py rename to tests/test_filehashstore_stream.py index a60960e7..8cf4a7d0 100644 --- a/tests/filehashstore/test_stream.py +++ b/tests/test_filehashstore_stream.py @@ -3,7 +3,7 @@ import io from pathlib import Path import pytest -from hashstore.filehashstore.filehashstore import Stream +from hashstore.filehashstore import Stream def test_stream_reads_file(pids): diff --git a/tests/test_hashstore.py b/tests/test_hashstore.py index a16fa422..4414470b 100644 --- a/tests/test_hashstore.py +++ b/tests/test_hashstore.py @@ -2,7 +2,7 @@ import os import pytest from hashstore.hashstore import HashAddress, HashStoreFactory -from hashstore.filehashstore.filehashstore import FileHashStore +from hashstore.filehashstore import FileHashStore @pytest.fixture(name="factory") @@ -19,7 +19,7 @@ def test_init(factory): def test_factory_get_hashstore_filehashstore(factory, props): """Check factory creates instance of FileHashStore.""" - module_name = "hashstore.filehashstore.filehashstore" + module_name = "hashstore.filehashstore" class_name = "FileHashStore" # These props can be found in tests/conftest.py store = factory.get_hashstore(module_name, class_name, props) @@ -29,7 +29,7 @@ def test_factory_get_hashstore_filehashstore(factory, props): def test_factory_get_hashstore_unsupported_class(factory): """Check that AttributeError is raised when provided with unsupported class.""" with pytest.raises(AttributeError): - module_name = "hashstore.filehashstore.filehashstore" + module_name = "hashstore.filehashstore" class_name = "S3HashStore" factory.get_hashstore(module_name, class_name) @@ -44,7 +44,7 @@ def test_factory_get_hashstore_unsupported_module(factory): def test_factory_get_hashstore_filehashstore_unsupported_algorithm(factory): """Check factory raises exception with store algorithm value that part of the default list""" - module_name = "hashstore.filehashstore.filehashstore" + module_name = "hashstore.filehashstore" class_name = "FileHashStore" properties = { @@ -60,7 +60,7 @@ def 
test_factory_get_hashstore_filehashstore_unsupported_algorithm(factory):
 
 
 def test_factory_get_hashstore_filehashstore_incorrect_algorithm_format(factory):
     """Check factory raises exception with an incorrectly formatted algorithm value."""
-    module_name = "hashstore.filehashstore.filehashstore"
+    module_name = "hashstore.filehashstore"
     class_name = "FileHashStore"

From f622fafb3f2146eec95807d7d9d3bda46f57ff4a Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Thu, 13 Jul 2023 11:04:00 -0700
Subject: [PATCH 039/165] Rename 'HashAddress' class to 'ObjectMetadata' to accurately reflect its usage

---
 src/hashstore/__init__.py      | 4 ++--
 src/hashstore/filehashstore.py | 4 ++--
 src/hashstore/hashstore.py     | 4 ++--
 tests/test_hashstore.py        | 6 ++++--
 4 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/hashstore/__init__.py b/src/hashstore/__init__.py
index 5e58af70..9435077e 100644
--- a/src/hashstore/__init__.py
+++ b/src/hashstore/__init__.py
@@ -15,6 +15,6 @@
 their persistent identifier (PID)
 """
 
-from hashstore.hashstore import HashStore, HashStoreFactory, HashAddress
+from hashstore.hashstore import HashStore, HashStoreFactory, ObjectMetadata
 
-__all__ = ("HashStore", "HashAddress", "HashStoreFactory")
+__all__ = ("HashStore", "ObjectMetadata", "HashStoreFactory")
diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py
index 250224c9..474aab49 100644
--- a/src/hashstore/filehashstore.py
+++ b/src/hashstore/filehashstore.py
@@ -10,7 +10,7 @@
 from contextlib import closing
 from tempfile import NamedTemporaryFile
 import yaml
-from hashstore import HashStore, HashAddress
+from hashstore import HashStore, ObjectMetadata
 
 
 class FileHashStore(HashStore):
@@ -674,7 +674,7 @@ def put_object(
             checksum_algorithm,
         )
 
-        hash_address = HashAddress(
+        hash_address = ObjectMetadata(
             object_cid, rel_path, abs_path, is_duplicate, hex_digest_dict
         )
         logging.debug(
diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py
index f46e1253..477c3350 100644
--- a/src/hashstore/hashstore.py
+++ b/src/hashstore/hashstore.py
@@ -208,7 +208,7 @@ def get_hashstore(module_name, class_name, properties=None):
     )
 
 
-class HashAddress(
+class ObjectMetadata(
     namedtuple(
         "HashAddress", ["id", "relpath", "abspath", "is_duplicate", "hex_digests"]
     )
@@ -228,6 +228,6 @@ class HashAddress(
 
     # Default value to prevent dangerous default value
     def __new__(cls, ab_id, relpath, abspath, is_duplicate=False, hex_digests=None):
-        return super(HashAddress, cls).__new__(
+        return super(ObjectMetadata, cls).__new__(
             cls, ab_id, relpath, abspath, is_duplicate, hex_digests
         )
diff --git a/tests/test_hashstore.py b/tests/test_hashstore.py
index 4414470b..8346c3d1 100644
--- a/tests/test_hashstore.py
+++ b/tests/test_hashstore.py
@@ -1,7 +1,7 @@
 """Test module for HashStore Module"""
 import os
 import pytest
-from hashstore.hashstore import HashAddress, HashStoreFactory
+from hashstore.hashstore import ObjectMetadata, HashStoreFactory
 from hashstore.filehashstore import FileHashStore
 
 
@@ -87,7 +87,9 @@ def test_hashaddress():
     "sha256": "sha256value",
     "sha512": "sha512value",
     }
-    hash_address = HashAddress(ab_id, rel_path, abs_path, is_duplicate, hex_digest_dict)
+    hash_address = ObjectMetadata(
+        ab_id, rel_path, abs_path, is_duplicate, hex_digest_dict
+    )
     assert hash_address.id == ab_id
     assert hash_address.relpath == rel_path
     assert hash_address.abspath == abs_path

From 07abda50747f5bb09aee1dfd4843a434c993c0d5 Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Thu, 13 Jul 2023 12:21:32 -0700
Subject: [PATCH 
040/165] Refactor 'FileHashStore' for new (renamed) class ObjectMetadata and requirements to return object file size when calling 'store_object' --- src/hashstore/filehashstore.py | 38 +++++++++++++--------------------- src/hashstore/hashstore.py | 15 ++++++-------- 2 files changed, 20 insertions(+), 33 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 474aab49..f3ac64ed 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -441,7 +441,7 @@ def store_object( "FileHashStore - store_object: Attempting to store object for pid: %s", pid, ) - hash_address = self.put_object( + object_metadata = self.put_object( pid, data, additional_algorithm=additional_algorithm_checked, @@ -461,7 +461,7 @@ def store_object( pid, ) - return hash_address + return object_metadata def store_metadata(self, pid, metadata, format_id=None): logging.debug( @@ -649,9 +649,8 @@ def put_object( checksum_algorithm (str, optional): Algorithm value of given checksum. Returns: - hash_address (HashAddress): object that contains the permanent address, - relative file path, absolute file path, duplicate file boolean and hex - digest dictionary. + object_metadata (ObjectMetadata): object that contains the object id, + object file size, duplicate file boolean and hex digest dictionary. """ stream = Stream(file) @@ -661,8 +660,7 @@ def put_object( with closing(stream): ( object_cid, - rel_path, - abs_path, + file_size, is_duplicate, hex_digest_dict, ) = self._move_and_get_checksums( @@ -674,14 +672,14 @@ def put_object( checksum_algorithm, ) - hash_address = ObjectMetadata( - object_cid, rel_path, abs_path, is_duplicate, hex_digest_dict + object_metadata = ObjectMetadata( + object_cid, file_size, is_duplicate, hex_digest_dict ) logging.debug( "FileHashStore - put_object: Successfully put object for pid: %s", pid, ) - return hash_address + return object_metadata def _move_and_get_checksums( self, @@ -713,9 +711,8 @@ def _move_and_get_checksums( checksum_algorithm (str, optional): Algorithm value of given checksum. \n Returns: - hash_address (HashAddress): object that contains the permanent address, - relative file path, absolute file path, duplicate file boolean and hex - digest dictionary. + object_metadata (tuple): object id, object file size, duplicate file + boolean and hex digest dictionary. 
""" entity = "objects" object_cid = self.get_sha256_hex_digest(pid) @@ -731,15 +728,13 @@ def _move_and_get_checksums( logging.error(exception_string) raise FileExistsError(exception_string) - rel_file_path = os.path.relpath(abs_file_path, self.objects) - # Create temporary file and calculate hex digests debug_msg = ( "FileHashStore - _move_and_get_checksums: Creating temp" + f" file and calculating checksums for pid: {pid}" ) logging.debug(debug_msg) - hex_digests, tmp_file_name = self._mktempfile( + hex_digests, tmp_file_name, tmp_file_size = self._mktempfile( stream, additional_algorithm, checksum_algorithm ) logging.debug( @@ -813,13 +808,7 @@ def _move_and_get_checksums( is_object_duplicate = True self.delete(entity, tmp_file_name) - return ( - object_cid, - rel_file_path, - abs_file_path, - is_object_duplicate, - hex_digests, - ) + return (object_cid, tmp_file_size, is_object_duplicate, hex_digests) def _mktempfile(self, stream, additional_algorithm=None, checksum_algorithm=None): """Create a named temporary file from a `Stream` object and return its filename @@ -878,9 +867,10 @@ def _mktempfile(self, stream, additional_algorithm=None, checksum_algorithm=None hash_algorithm.hexdigest() for hash_algorithm in hash_algorithms ] hex_digest_dict = dict(zip(algorithm_list_to_calculate, hex_digest_list)) + tmp_file_size = os.path.getsize(tmp.name) logging.debug("FileHashStore - _mktempfile: Hex digests calculated.") - return hex_digest_dict, tmp.name + return hex_digest_dict, tmp.name, tmp_file_size def put_metadata(self, metadata, pid, format_id): """Store contents of metadata to `[self.root]/metadata` using the hash of the diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 477c3350..4e74b4ed 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -58,8 +58,8 @@ def store_object( checksum_algorithm (string): Algorithm of supplied checksum. Returns: - address (HashAddress): Object that contains the permanent address, relative - file path, absolute file path, duplicate file boolean and hex digest dictionary. + object_metadata (ObjectMetadata): Object that contains the permanent address, + file size, duplicate file boolean and hex digest dictionary. """ raise NotImplementedError() @@ -209,16 +209,13 @@ def get_hashstore(module_name, class_name, properties=None): class ObjectMetadata( - namedtuple( - "HashAddress", ["id", "relpath", "abspath", "is_duplicate", "hex_digests"] - ) + namedtuple("ObjectMetadata", ["id", "obj_size", "is_duplicate", "hex_digests"]) ): """File address containing file's path on disk and its content hash ID. Args: ab_id (str): Hash ID (hexdigest) of file contents. - relpath (str): Relative path location to :attr:`HashFS.root`. - abspath (str): Absolute path location of file on disk. + obj_size (bytes): Size of the object is_duplicate (boolean, optional): Whether the hash address created was a duplicate of a previously existing file. Can only be ``True`` after a put operation. Defaults to ``False``. 
@@ -227,7 +224,7 @@ class ObjectMetadata( """ # Default value to prevent dangerous default value - def __new__(cls, ab_id, relpath, abspath, is_duplicate=False, hex_digests=None): + def __new__(cls, ab_id, obj_size, is_duplicate=False, hex_digests=None): return super(ObjectMetadata, cls).__new__( - cls, ab_id, relpath, abspath, is_duplicate, hex_digests + cls, ab_id, obj_size, is_duplicate, hex_digests ) From 3134331ec2d43605fb6382fe921aa45a97f17c39 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 13 Jul 2023 12:26:36 -0700 Subject: [PATCH 041/165] Add new pytests to verify stored object file size and update all references for 'hash_address' to 'object_metadata' --- tests/conftest.py | 3 + tests/test_filehashstore.py | 244 ++++++++++++-------------- tests/test_filehashstore_interface.py | 132 +++++++------- tests/test_hashstore.py | 24 ++- 4 files changed, 188 insertions(+), 215 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index e4e0635c..9b25c520 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -46,6 +46,7 @@ def init_pids(): """ test_pids = { "doi:10.18739/A2901ZH2M": { + "file_size_bytes": 39993, "object_cid": "0d555ed77052d7e166017f779cbc193357c3a5006ee8b8457230bcf7abcef65e", "metadata_cid": "323e0799524cec4c7e14d31289cefd884b563b5c052f154a066de5ec1e477da7", "md5": "db91c910a3202478c8def1071c54aae5", @@ -56,6 +57,7 @@ def init_pids(): "sha512": "e9bcd6b91b102ef5803d1bd60c7a5d2dbec1a2baf5f62f7da60de07607ad6797d6a9b740d97a257fd2774f2c26503d455d8f2a03a128773477dfa96ab96a2e54", }, "jtao.1700.1": { + "file_size_bytes": 8724, "object_cid": "a8241925740d5dcd719596639e780e0a090c9d55a5d0372b0eaf55ed711d4edf", "metadata_cid": "ddf07952ef28efc099d10d8b682480f7d2da60015f5d8873b6e1ea75b4baf689", "md5": "f4ea2d07db950873462a064937197b0f", @@ -66,6 +68,7 @@ def init_pids(): "sha512": "bf9e7f4d4e66bd082817d87659d1d57c2220c376cd032ed97cadd481cf40d78dd479cbed14d34d98bae8cebc603b40c633d088751f07155a94468aa59e2ad109", }, "urn:uuid:1b35d0a5-b17a-423b-a2ed-de2b18dc367a": { + "file_size_bytes": 18699, "object_cid": "7f5cc18f0b04e812a3b4c8f686ce34e6fec558804bf61e54b176742a7f6368d6", "metadata_cid": "9a2e08c666b728e6cbd04d247b9e556df3de5b2ca49f7c5a24868eb27cddbff2", "md5": "e1932fc75ca94de8b64f1d73dc898079", diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 049a8f7f..70418f9c 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -172,9 +172,9 @@ def test_put_object_files_path(pids, store): entity = "objects" for pid in pids.keys(): path = Path(test_dir) / pid.replace("/", "_") - hash_address = store.put_object(pid, path) - hashaddress_id = hash_address.id - assert store.exists(entity, hashaddress_id) + object_metadata = store.put_object(pid, path) + object_metadata_id = object_metadata.id + assert store.exists(entity, object_metadata_id) def test_put_object_files_string(pids, store): @@ -183,9 +183,9 @@ def test_put_object_files_string(pids, store): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - hash_address = store.put_object(pid, path) - hashaddress_id = hash_address.id - assert store.exists(entity, hashaddress_id) + object_metadata = store.put_object(pid, path) + object_metadata_id = object_metadata.id + assert store.exists(entity, object_metadata_id) def test_put_object_files_stream(pids, store): @@ -195,10 +195,10 @@ def test_put_object_files_stream(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") - hash_address = 
store.put_object(pid, input_stream) + object_metadata = store.put_object(pid, input_stream) input_stream.close() - hashaddress_id = hash_address.id - assert store.exists(entity, hashaddress_id) + object_metadata_id = object_metadata.id + assert store.exists(entity, object_metadata_id) assert store.count(entity) == 3 @@ -207,34 +207,19 @@ def test_put_object_cid(pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - hashaddress = store.put_object(pid, path) - hashaddress_id = hashaddress.id - assert hashaddress_id == pids[pid]["object_cid"] + object_metadata = store.put_object(pid, path) + object_metadata_id = object_metadata.id + assert object_metadata_id == pids[pid]["object_cid"] -def test_put_object_relpath(pids, store): - """Check put returns correct relative path.""" +def test_put_object_file_size(pids, store): + """Check put returns correct file size.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - hashaddress = store.put_object(pid, path) - hashaddress_id = hashaddress.id - hashaddress_relpath = hashaddress.relpath - shard_id_path = "/".join(store.shard(hashaddress_id)) - assert hashaddress_relpath == shard_id_path - - -def test_put_object_abspath(pids, store): - """Check put returns correct absolute path.""" - test_dir = "tests/testdata/" - entity = "objects" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - hashaddress = store.put_object(pid, path) - hashaddress_id = hashaddress.id - hashaddress_abspath = hashaddress.abspath - id_abs_path = store.get_real_path(entity, hashaddress_id) - assert hashaddress_abspath == id_abs_path + object_metadata = store.put_object(pid, path) + object_size = object_metadata.obj_size + assert object_size == pids[pid]["file_size_bytes"] def test_put_object_is_duplicate(pids, store): @@ -242,9 +227,9 @@ def test_put_object_is_duplicate(pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - hashaddress = store.put_object(pid, path) - hashaddress_is_duplicate = hashaddress.is_duplicate - assert hashaddress_is_duplicate is False + object_metadata = store.put_object(pid, path) + object_metadata_is_duplicate = object_metadata.is_duplicate + assert object_metadata_is_duplicate is False def test_put_object_hex_digests(pids, store): @@ -252,13 +237,13 @@ def test_put_object_hex_digests(pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - hashaddress = store.put_object(pid, path) - hashaddress_hex_digests = hashaddress.hex_digests - assert hashaddress_hex_digests.get("md5") == pids[pid]["md5"] - assert hashaddress_hex_digests.get("sha1") == pids[pid]["sha1"] - assert hashaddress_hex_digests.get("sha256") == pids[pid]["sha256"] - assert hashaddress_hex_digests.get("sha384") == pids[pid]["sha384"] - assert hashaddress_hex_digests.get("sha512") == pids[pid]["sha512"] + object_metadata = store.put_object(pid, path) + object_metadata_hex_digests = object_metadata.hex_digests + assert object_metadata_hex_digests.get("md5") == pids[pid]["md5"] + assert object_metadata_hex_digests.get("sha1") == pids[pid]["sha1"] + assert object_metadata_hex_digests.get("sha256") == pids[pid]["sha256"] + assert object_metadata_hex_digests.get("sha384") == pids[pid]["sha384"] + assert object_metadata_hex_digests.get("sha512") == pids[pid]["sha512"] def test_put_object_additional_algorithm(pids, store): @@ -267,8 +252,8 @@ def 
test_put_object_additional_algorithm(pids, store): for pid in pids.keys(): algo = "sha224" path = test_dir + pid.replace("/", "_") - hash_address = store.put_object(pid, path, additional_algorithm=algo) - hex_digests = hash_address.hex_digests + object_metadata = store.put_object(pid, path, additional_algorithm=algo) + hex_digests = object_metadata.hex_digests sha224_hash = hex_digests.get(algo) assert sha224_hash == pids[pid][algo] @@ -309,15 +294,14 @@ def test_move_and_get_checksums_id(pids, store): _, _, _, - _, ) = store._move_and_get_checksums(pid, input_stream) input_stream.close() object_cid = store.get_sha256_hex_digest(pid) assert move_id == object_cid -def test_move_and_get_checksums_hex_digests(pids, store): - """Test move returns correct hex digests.""" +def test_move_and_get_checksums_file_size(pids, store): + """Test move returns correct file size.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -325,21 +309,16 @@ def test_move_and_get_checksums_hex_digests(pids, store): # pylint: disable=W0212 ( _, + tmp_file_size, _, _, - _, - hex_digests, ) = store._move_and_get_checksums(pid, input_stream) input_stream.close() - assert hex_digests.get("md5") == pids[pid]["md5"] - assert hex_digests.get("sha1") == pids[pid]["sha1"] - assert hex_digests.get("sha256") == pids[pid]["sha256"] - assert hex_digests.get("sha384") == pids[pid]["sha384"] - assert hex_digests.get("sha512") == pids[pid]["sha512"] + assert tmp_file_size == pids[pid]["file_size_bytes"] -def test_move_and_get_checksums_abs_path(pids, store): - """Test move returns correct absolute path that exists.""" +def test_move_and_get_checksums_hex_digests(pids, store): + """Test move returns correct hex digests.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -348,13 +327,15 @@ def test_move_and_get_checksums_abs_path(pids, store): ( _, _, - abs_path, - _, _, + hex_digests, ) = store._move_and_get_checksums(pid, input_stream) input_stream.close() - store.get_sha256_hex_digest(pid) - assert os.path.isfile(abs_path) is True + assert hex_digests.get("md5") == pids[pid]["md5"] + assert hex_digests.get("sha1") == pids[pid]["sha1"] + assert hex_digests.get("sha256") == pids[pid]["sha256"] + assert hex_digests.get("sha384") == pids[pid]["sha384"] + assert hex_digests.get("sha512") == pids[pid]["sha512"] def test_move_and_get_checksums_duplicates_raises_error(pids, store): @@ -388,7 +369,9 @@ def test_mktempfile_additional_algo(store): "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) # pylint: disable=W0212 - hex_digests, _ = store._mktempfile(input_stream, additional_algorithm=checksum_algo) + hex_digests, _, _ = store._mktempfile( + input_stream, additional_algorithm=checksum_algo + ) input_stream.close() assert hex_digests.get("sha3_256") == checksum_correct @@ -404,7 +387,9 @@ def test_mktempfile_checksum_algo(store): "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) # pylint: disable=W0212 - hex_digests, _ = store._mktempfile(input_stream, checksum_algorithm=checksum_algo) + hex_digests, _, _ = store._mktempfile( + input_stream, checksum_algorithm=checksum_algo + ) input_stream.close() assert hex_digests.get("sha3_256") == checksum_correct @@ -424,7 +409,7 @@ def test_mktempfile_checksum_and_additional_algo(store): "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) # pylint: disable=W0212 - hex_digests, _ = store._mktempfile( + hex_digests, _, _ = store._mktempfile( 
input_stream, additional_algorithm=additional_algo, checksum_algorithm=checksum_algo, @@ -444,7 +429,7 @@ def test_mktempfile_checksum_and_additional_algo_duplicate(store): checksum_algo = "sha224" checksum_correct = "9b3a96f434f3c894359193a63437ef86fbd5a1a1a6cc37f1d5013ac1" # pylint: disable=W0212 - hex_digests, _ = store._mktempfile( + hex_digests, _, _ = store._mktempfile( input_stream, additional_algorithm=additional_algo, checksum_algorithm=checksum_algo, @@ -453,6 +438,18 @@ def test_mktempfile_checksum_and_additional_algo_duplicate(store): assert hex_digests.get("sha224") == checksum_correct +def test_mktempfile_file_size(pids, store): + """Test _mktempfile returns correct file size.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + input_stream = io.open(path, "rb") + # pylint: disable=W0212 + _, _, tmp_file_size = store._mktempfile(input_stream) + input_stream.close() + assert tmp_file_size == pids[pid]["file_size_bytes"] + + def test_mktempfile_hex_digests(pids, store): """Test _mktempfile returns correct hex digests.""" test_dir = "tests/testdata/" @@ -460,7 +457,7 @@ def test_mktempfile_hex_digests(pids, store): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") # pylint: disable=W0212 - hex_digests, _ = store._mktempfile(input_stream) + hex_digests, _, _ = store._mktempfile(input_stream) input_stream.close() assert hex_digests.get("md5") == pids[pid]["md5"] assert hex_digests.get("sha1") == pids[pid]["sha1"] @@ -476,7 +473,7 @@ def test_mktempfile_tmpfile_object(pids, store): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") # pylint: disable=W0212 - _, tmp_file_name = store._mktempfile(input_stream) + _, tmp_file_name, _ = store._mktempfile(input_stream) input_stream.close() assert os.path.isfile(tmp_file_name) is True @@ -490,10 +487,10 @@ def test_mktempfile_with_unsupported_algorithm(pids, store): algo = "md2" with pytest.raises(ValueError): # pylint: disable=W0212 - _, _ = store._mktempfile(input_stream, additional_algorithm=algo) + _, _, _ = store._mktempfile(input_stream, additional_algorithm=algo) with pytest.raises(ValueError): # pylint: disable=W0212 - _, _ = store._mktempfile(input_stream, checksum_algorithm=algo) + _, _, _ = store._mktempfile(input_stream, checksum_algorithm=algo) input_stream.close() @@ -588,26 +585,14 @@ def test_get_store_path_metadata(store): assert path_metadata_string.endswith("/metacat/metadata") -def test_exists_with_absolute_path(pids, store): +def test_exists_with_object_metadata_id(pids, store): """Test exists method with an absolute file path.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - hashaddress = store.put_object(pid, path) - hashaddress_abspath = hashaddress.abspath - assert store.exists(entity, hashaddress_abspath) - - -def test_exists_with_relative_path(pids, store): - """Test exists method with an absolute file path.""" - test_dir = "tests/testdata/" - entity = "objects" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - hashaddress = store.put_object(pid, path) - hashaddress_relpath = hashaddress.relpath - assert store.exists(entity, hashaddress_relpath) + object_metadata = store.put_object(pid, path) + assert store.exists(entity, object_metadata.id) def test_exists_with_sharded_path(pids, store): @@ -616,10 +601,10 @@ def test_exists_with_sharded_path(pids, store): entity = "objects" for pid in pids.keys(): path = test_dir + 
pid.replace("/", "_")
-        hashaddress = store.put_object(pid, path)
-        hashaddress_shard = store.shard(hashaddress.id)
-        hashaddress_shard_path = "/".join(hashaddress_shard)
-        assert store.exists(entity, hashaddress_shard_path)
+        object_metadata = store.put_object(pid, path)
+        object_metadata_shard = store.shard(object_metadata.id)
+        object_metadata_shard_path = "/".join(object_metadata_shard)
+        assert store.exists(entity, object_metadata_shard_path)
 
 
 def test_exists_with_nonexistent_file(store):
@@ -649,34 +634,22 @@ def test_open_objects(pids, store):
     entity = "objects"
     for pid in pids.keys():
         path = test_dir + pid.replace("/", "_")
-        hash_address = store.put_object(pid, path)
-        hashaddress_id = hash_address.id
-        io_buffer = store.open(entity, hashaddress_id)
+        object_metadata = store.put_object(pid, path)
+        object_metadata_id = object_metadata.id
+        io_buffer = store.open(entity, object_metadata_id)
     assert isinstance(io_buffer, io.BufferedReader)
     io_buffer.close()
 
 
-def test_delete_by_id(pids, store):
-    """Check objects are deleted after calling delete with id."""
-    test_dir = "tests/testdata/"
-    entity = "objects"
-    for pid in pids.keys():
-        path = test_dir + pid.replace("/", "_")
-        hash_address = store.put_object(pid, path)
-        hashaddress_id = hash_address.id
-        store.delete(entity, hashaddress_id)
-    assert store.count(entity) == 0
-
-
-def test_delete_by_path(pids, store):
-    """Check objects are deleted after calling delete with path."""
+def test_delete_by_object_metadata_id(pids, store):
+    """Check objects are deleted after calling delete with object metadata id."""
     test_dir = "tests/testdata/"
     entity = "objects"
     for pid in pids.keys():
         path = test_dir + pid.replace("/", "_")
-        hash_address = store.put_object(pid, path)
-        hashaddress_relpath = hash_address.relpath
-        store.delete(entity, hashaddress_relpath)
+        object_metadata = store.put_object(pid, path)
+        object_metadata_id = object_metadata.id
+        store.delete(entity, object_metadata_id)
     assert store.count(entity) == 0
 
@@ -723,10 +696,11 @@ def test_remove_empty_does_not_remove_nonempty_folders(pids, store):
     test_dir = "tests/testdata/"
     for pid in pids.keys():
         path = test_dir + pid.replace("/", "_")
-        hash_address = store.put_object(pid, path)
-        hashaddress_relpath = hash_address.relpath
+        object_metadata = store.put_object(pid, path)
+        object_metadata_shard = store.shard(object_metadata.id)
+        object_metadata_shard_path = "/".join(object_metadata_shard)
         # Get parent directory of the relative path
-        parent_dir = os.path.dirname(hashaddress_relpath)
+        parent_dir = os.path.dirname(object_metadata_shard_path)
         # Attempt to remove the parent directory
         store.remove_empty(parent_dir)
         abs_parent_dir = store.objects + "/" + parent_dir
@@ -779,40 +753,52 @@ def test_get_real_path_file_does_not_exist(store):
     assert real_path_exists is None
 
 
-def test_get_real_path_absolute_path(store, pids):
-    """Test get_real_path returns path (is truthy) when absolute path exists."""
+def test_get_real_path_with_object_id(store, pids):
+    """Test get_real_path returns absolute path given an object id."""
     test_dir = "tests/testdata/"
     entity = "objects"
     for pid in pids.keys():
         path = test_dir + pid.replace("/", "_")
-        hashaddress = store.put_object(pid, path)
-        hashaddress_abspath = hashaddress.abspath
-        abs_path = store.get_real_path(entity, hashaddress_abspath)
-        assert abs_path
+        object_metadata = store.put_object(pid, path)
+        obj_abs_path = store.get_real_path(entity, object_metadata.id)
+        assert os.path.exists(obj_abs_path)
 
 
-def test_get_real_path_relative_path(store, 
pids):
-    """Test get_real_path returns path (is truthy) when rel path exists."""
+def test_get_real_path_with_object_id_sharded(pids, store):
+    """Test get_real_path returns absolute path given a sharded path (relative path)."""
     test_dir = "tests/testdata/"
     entity = "objects"
     for pid in pids.keys():
         path = test_dir + pid.replace("/", "_")
-        hashaddress = store.put_object(pid, path)
-        hashaddress_relpath = hashaddress.relpath
-        rel_path = store.get_real_path(entity, hashaddress_relpath)
-        assert rel_path
+        object_metadata = store.put_object(pid, path)
+        object_metadata_shard = store.shard(object_metadata.id)
+        object_metadata_shard_path = "/".join(object_metadata_shard)
+        obj_abs_path = store.get_real_path(entity, object_metadata_shard_path)
+        assert os.path.exists(obj_abs_path)
 
 
-def test_get_real_path_hex_digest_path(store, pids):
-    """Test get_real_path returns path (is truthy) when rel path exists."""
+def test_get_real_path_with_metadata_id(store, pids):
+    """Test get_real_path returns absolute path given a metadata id."""
+    entity = "metadata"
     test_dir = "tests/testdata/"
-    entity = "objects"
+    format_id = "http://ns.dataone.org/service/types/v2.0"
+    for pid in pids.keys():
+        filename = pid.replace("/", "_") + ".xml"
+        syspath = Path(test_dir) / filename
+        metadata_cid = store.store_metadata(pid, syspath, format_id)
+        metadata_abs_path = store.get_real_path(entity, metadata_cid)
+        assert os.path.exists(metadata_abs_path)
+
+
+def test_get_real_path_with_bad_entity(store, pids):
+    """Test get_real_path raises ValueError when given a bad entity."""
+    test_dir = "tests/testdata/"
+    entity = "bad_entity"
     for pid in pids.keys():
         path = test_dir + pid.replace("/", "_")
-        hashaddress = store.put_object(pid, path)
-        hashaddress_id = hashaddress.id
-        hex_digest = store.get_real_path(entity, hashaddress_id)
-        assert hex_digest
+        object_metadata = store.put_object(pid, path)
+        with pytest.raises(ValueError):
+            store.get_real_path(entity, object_metadata.id)
 
 
 def test_build_abs_path(store, pids):
diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py
index f8e4b328..6ffb5b1c 100644
--- a/tests/test_filehashstore_interface.py
+++ b/tests/test_filehashstore_interface.py
@@ -22,8 +22,8 @@ def test_store_address_length(pids, store):
     test_dir = "tests/testdata/"
     for pid in pids.keys():
         path = test_dir + pid.replace("/", "_")
-        hash_address = store.store_object(pid, path)
-        object_cid = hash_address.id
+        object_metadata = store.store_object(pid, path)
+        object_cid = object_metadata.id
     assert len(object_cid) == 64
 
 
@@ -36,9 +36,9 @@ def test_store_object(pids, store):
         path = Path(test_dir + pid.replace("/", "_"))
         filename = pid.replace("/", "_") + ".xml"
         syspath = Path(test_dir) / filename
-        hash_address = store.store_object(pid, path)
+        object_metadata = store.store_object(pid, path)
         _metadata_cid = store.store_metadata(pid, syspath, format_id)
-        assert hash_address.id == pids[pid]["object_cid"]
+        assert object_metadata.id == pids[pid]["object_cid"]
     assert store.count(entity) == 3
 
 
@@ -51,7 +51,7 @@ def test_store_object_files_path(pids, store):
         path = Path(test_dir + pid.replace("/", "_"))
         filename = pid.replace("/", "_") + ".xml"
         syspath = Path(test_dir) / filename
-        _hash_address = store.store_object(pid, path)
+        _object_metadata = store.store_object(pid, path)
         _metadata_cid = store.store_metadata(pid, syspath, format_id)
         assert store.exists(entity, pids[pid]["object_cid"])
         assert store.count(entity) == 3
@@ -66,7 +66,7 @@ def test_store_object_files_string(pids, store):
         path_string = test_dir 
+ pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - _hash_address = store.store_object(pid, path_string) + _object_metadata = store.store_object(pid, path_string) _metadata_cid = store.store_metadata(pid, syspath, format_id) assert store.exists(entity, pids[pid]["object_cid"]) assert store.count(entity) == 3 @@ -79,7 +79,7 @@ def test_store_object_files_input_stream(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") - _hash_address = store.store_object(pid, input_stream) + _object_metadata = store.store_object(pid, input_stream) input_stream.close() object_cid = store.get_sha256_hex_digest(pid) assert store.exists(entity, object_cid) @@ -91,31 +91,18 @@ def test_store_object_id(pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - hash_address = store.store_object(pid, path) - assert hash_address.id == pids[pid]["object_cid"] + object_metadata = store.store_object(pid, path) + assert object_metadata.id == pids[pid]["object_cid"] -def test_store_object_rel_path(pids, store): - """Test store object returns expected relative path.""" +def test_store_object_obj_size(pids, store): + """Test store object returns expected file size.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - hash_address = store.store_object(pid, path) - object_cid = pids[pid]["object_cid"] - object_cid_rel_path = "/".join(store.shard(object_cid)) - assert hash_address.relpath == object_cid_rel_path - - -def test_store_object_abs_path(pids, store): - """Test store object returns expected absolute path.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - hash_address = store.store_object(pid, path) - object_cid = pids[pid]["object_cid"] - object_cid_rel_path = "/".join(store.shard(object_cid)) - object_cid_abs_path = store.objects + "/" + object_cid_rel_path - assert hash_address.abspath == object_cid_abs_path + object_metadata = store.store_object(pid, path) + object_size = object_metadata.obj_size + assert object_size == pids[pid]["file_size_bytes"] def test_store_object_is_duplicate(pids, store): @@ -123,8 +110,8 @@ def test_store_object_is_duplicate(pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - hash_address = store.store_object(pid, path) - assert hash_address.is_duplicate is False + object_metadata = store.store_object(pid, path) + assert object_metadata.is_duplicate is False def test_store_object_hex_digests(pids, store): @@ -132,12 +119,12 @@ def test_store_object_hex_digests(pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - hash_address = store.store_object(pid, path) - assert hash_address.hex_digests.get("md5") == pids[pid]["md5"] - assert hash_address.hex_digests.get("sha1") == pids[pid]["sha1"] - assert hash_address.hex_digests.get("sha256") == pids[pid]["sha256"] - assert hash_address.hex_digests.get("sha384") == pids[pid]["sha384"] - assert hash_address.hex_digests.get("sha512") == pids[pid]["sha512"] + object_metadata = store.store_object(pid, path) + assert object_metadata.hex_digests.get("md5") == pids[pid]["md5"] + assert object_metadata.hex_digests.get("sha1") == pids[pid]["sha1"] + assert object_metadata.hex_digests.get("sha256") == pids[pid]["sha256"] + assert object_metadata.hex_digests.get("sha384") == 
pids[pid]["sha384"] + assert object_metadata.hex_digests.get("sha512") == pids[pid]["sha512"] def test_store_object_pid_empty(store): @@ -208,8 +195,8 @@ def test_store_object_additional_algorithm_hyphen_uppercase(pids, store): pid = "jtao.1700.1" path = test_dir + pid algorithm_with_hyphen_and_upper = "SHA-384" - hash_address = store.store_object(pid, path, algorithm_with_hyphen_and_upper) - sha256_cid = hash_address.hex_digests.get("sha384") + object_metadata = store.store_object(pid, path, algorithm_with_hyphen_and_upper) + sha256_cid = object_metadata.hex_digests.get("sha384") assert sha256_cid == pids[pid]["sha384"] object_cid = store.get_sha256_hex_digest(pid) assert store.exists(entity, object_cid) @@ -222,8 +209,8 @@ def test_store_object_additional_algorithm_hyphen_lowercase(store): pid = "jtao.1700.1" path = test_dir + pid algorithm_other = "sha3-256" - hash_address = store.store_object(pid, path, algorithm_other) - additional_sha3_256_hex_digest = hash_address.hex_digests.get("sha3_256") + object_metadata = store.store_object(pid, path, algorithm_other) + additional_sha3_256_hex_digest = object_metadata.hex_digests.get("sha3_256") sha3_256_checksum = ( "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) @@ -239,8 +226,8 @@ def test_store_object_additional_algorithm_underscore(store): pid = "jtao.1700.1" path = test_dir + pid algorithm_other = "sha3_256" - hash_address = store.store_object(pid, path, algorithm_other) - additional_sha3_256_hex_digest = hash_address.hex_digests.get("sha3_256") + object_metadata = store.store_object(pid, path, algorithm_other) + additional_sha3_256_hex_digest = object_metadata.hex_digests.get("sha3_256") sha3_256_checksum = ( "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) @@ -259,7 +246,7 @@ def test_store_object_checksum_correct(store): checksum_correct = ( "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) - _hash_address = store.store_object( + _object_metadata = store.store_object( pid, path, checksum=checksum_correct, checksum_algorithm=checksum_algo ) assert store.count(entity) == 1 @@ -278,15 +265,15 @@ def test_store_object_checksum_correct_and_additional_algo(store): checksum_correct = ( "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) - hash_address = store.store_object( + object_metadata = store.store_object( pid, path, additional_algorithm=algorithm_additional, checksum=checksum_correct, checksum_algorithm=algorithm_checksum, ) - assert hash_address.hex_digests.get("sha224") == sha224_additional_checksum - assert hash_address.hex_digests.get("sha3_256") == checksum_correct + assert object_metadata.hex_digests.get("sha224") == sha224_additional_checksum + assert object_metadata.hex_digests.get("sha3_256") == checksum_correct def test_store_object_checksum_correct_and_additional_algo_duplicate(store): @@ -299,14 +286,14 @@ def test_store_object_checksum_correct_and_additional_algo_duplicate(store): checksum_correct = ( "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) - hash_address = store.store_object( + object_metadata = store.store_object( pid, path, additional_algorithm=algorithm_additional, checksum=checksum_correct, checksum_algorithm=algorithm_checksum, ) - assert hash_address.hex_digests.get("sha3_256") == checksum_correct + assert object_metadata.hex_digests.get("sha3_256") == checksum_correct def test_store_object_checksum_algorithm_empty(store): @@ -382,10 +369,10 @@ def test_store_object_duplicate_raises_error(store): path = 
test_dir + pid entity = "objects" # Store first blob - _hash_address_one = store.store_object(pid, path) + _object_metadata_one = store.store_object(pid, path) # Store second blob with pytest.raises(FileExistsError): - _hash_address_two = store.store_object(pid, path) + _object_metadata_two = store.store_object(pid, path) assert store.count(entity) == 1 object_cid = store.get_sha256_hex_digest(pid) assert store.exists(entity, object_cid) @@ -445,16 +432,17 @@ def test_store_object_large_file(store): remaining_bytes -= bytes_to_write # Store object pid = "testfile_filehashstore" - hash_address = store.store_object(pid, file_path) - hash_address_id = hash_address.id + object_metadata = store.store_object(pid, file_path) + object_metadata_id = object_metadata.id pid_sha256_hex_digest = store.get_sha256_hex_digest(pid) - assert hash_address_id == pid_sha256_hex_digest + assert object_metadata_id == pid_sha256_hex_digest @slow_test def test_store_object_sparse_large_file(store): """Test storing a large object (4GB) via sparse file. This test has also been - executed with a 10GB file and the test classes succeeded locally in 117.03s (0:01:57).""" + executed with a 10GB file and the test classes succeeded locally in 117.03s (0:01:57). + """ # file_size = 10 * 1024 * 1024 * 1024 # 10GB file_size = 4 * 1024 * 1024 * 1024 # 4GB file_path = store.root + "random_file.bin" @@ -464,10 +452,10 @@ def test_store_object_sparse_large_file(store): file.write(b"\0") # Store object pid = "testfile_filehashstore" - hash_address = store.store_object(pid, file_path) - hash_address_id = hash_address.id + object_metadata = store.store_object(pid, file_path) + object_metadata_id = object_metadata.id pid_sha256_hex_digest = store.get_sha256_hex_digest(pid) - assert hash_address_id == pid_sha256_hex_digest + assert object_metadata_id == pid_sha256_hex_digest def test_store_metadata(pids, store): @@ -478,7 +466,7 @@ def test_store_metadata(pids, store): path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - _hash_address = store.store_object(pid, path) + _object_metadata = store.store_object(pid, path) metadata_cid = store.store_metadata(pid, syspath, format_id) assert metadata_cid == pids[pid]["metadata_cid"] @@ -490,7 +478,7 @@ def test_store_metadata_default_format_id(pids, store): path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - _hash_address = store.store_object(pid, path) + _object_metadata = store.store_object(pid, path) metadata_cid = store.store_metadata(pid, syspath) assert metadata_cid == pids[pid]["metadata_cid"] @@ -504,7 +492,7 @@ def test_store_metadata_files_path(pids, store): path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - _hash_address = store.store_object(pid, path) + _object_metadata = store.store_object(pid, path) metadata_cid = store.store_metadata(pid, syspath, format_id) assert store.exists(entity, metadata_cid) assert metadata_cid == pids[pid]["metadata_cid"] @@ -520,7 +508,7 @@ def test_store_metadata_files_string(pids, store): path_string = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath_string = str(Path(test_dir) / filename) - _hash_address = store.store_object(pid, path_string) + _object_metadata = store.store_object(pid, path_string) metadata_cid = store.store_metadata(pid, syspath_string, format_id) assert store.exists(entity, metadata_cid) assert 
store.count(entity) == 3 @@ -533,7 +521,7 @@ def test_store_metadata_files_input_stream(pids, store): format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - _hash_address = store.store_object(pid, path) + _object_metadata = store.store_object(pid, path) filename = pid.replace("/", "_") + ".xml" syspath_string = str(Path(test_dir) / filename) syspath_stream = io.open(syspath_string, "rb") @@ -601,7 +589,7 @@ def test_store_metadata_metadata_cid(pids, store): path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - _hash_address = store.store_object(pid, path) + _object_metadata = store.store_object(pid, path) metadata_cid = store.store_metadata(pid, syspath, format_id) assert metadata_cid == pids[pid]["metadata_cid"] @@ -615,7 +603,7 @@ def test_store_metadata_thread_lock(store): path = test_dir + pid filename = pid + ".xml" syspath = Path(test_dir) / filename - _hash_address = store.store_object(pid, path) + _object_metadata = store.store_object(pid, path) store.store_metadata(pid, syspath, format_id) # Start threads thread1 = Thread(target=store.store_metadata, args=(pid, syspath, format_id)) @@ -641,12 +629,12 @@ def test_retrieve_object(pids, store): path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - hash_address = store.store_object(pid, path) + object_metadata = store.store_object(pid, path) store.store_metadata(pid, syspath, format_id) obj_stream = store.retrieve_object(pid) sha256_hex = store.computehash(obj_stream) obj_stream.close() - assert sha256_hex == hash_address.hex_digests.get("sha256") + assert sha256_hex == object_metadata.hex_digests.get("sha256") def test_retrieve_object_pid_empty(store): @@ -672,7 +660,7 @@ def test_retrieve_metadata(store): path = test_dir + pid filename = pid + ".xml" syspath = Path(test_dir) / filename - _hash_address = store.store_object(pid, path) + _object_metadata = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath, format_id) metadata_stream = store.retrieve_metadata(pid, format_id) metadata_content = metadata_stream.read().decode("utf-8") @@ -688,7 +676,7 @@ def test_retrieve_metadata_default_format_id(store): path = test_dir + pid filename = pid + ".xml" syspath = Path(test_dir) / filename - _hash_address = store.store_object(pid, path) + _object_metadata = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath) metadata_stream = store.retrieve_metadata(pid) metadata_content = metadata_stream.read().decode("utf-8") @@ -739,7 +727,7 @@ def test_delete_objects(pids, store): path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - _hash_address = store.store_object(pid, path) + _object_metadata = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath, format_id) store.delete_object(pid) assert store.count(entity) == 0 @@ -768,7 +756,7 @@ def test_delete_metadata(pids, store): path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - _hash_address = store.store_object(pid, path) + _object_metadata = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath, format_id) store.delete_metadata(pid, format_id) assert store.count(entity) == 0 @@ -782,7 +770,7 @@ def test_delete_metadata_default_format_id(store, pids): path = test_dir + 
pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - _hash_address = store.store_object(pid, path) + _object_metadata = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath) store.delete_metadata(pid) assert store.count(entity) == 0 @@ -820,7 +808,7 @@ def test_get_hex_digest(store): path = test_dir + pid filename = pid + ".xml" syspath = Path(test_dir) / filename - _hash_address = store.store_object(pid, path) + _object_metadata = store.store_object(pid, path) _metadata_cid = store.store_metadata(pid, syspath, format_id) sha3_256_hex_digest = ( "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" @@ -846,7 +834,7 @@ def test_get_hex_digest_pid_unsupported_algorithm(store): filename = pid + ".xml" syspath = Path(test_dir) / filename syspath.read_bytes() - _hash_address = store.store_object(pid, path) + _object_metadata = store.store_object(pid, path) algorithm = "sm3" with pytest.raises(ValueError): store.get_hex_digest(pid, algorithm) diff --git a/tests/test_hashstore.py b/tests/test_hashstore.py index 8346c3d1..59b1d1c8 100644 --- a/tests/test_hashstore.py +++ b/tests/test_hashstore.py @@ -77,9 +77,8 @@ def test_factory_get_hashstore_filehashstore_incorrect_algorithm_format(factory) def test_hashaddress(): """Test class returns correct values via dot notation.""" ab_id = "hashstoretest" - rel_path = "rel/path/to/object" - abs_path = "abs/path/to/object" is_duplicate = "false" + obj_size = 1234 hex_digest_dict = { "md5": "md5value", "sha1": "sha1value", @@ -87,15 +86,12 @@ def test_hashaddress(): "sha256": "sha256value", "sha512": "sha512value", } - hash_address = ObjectMetadata( - ab_id, rel_path, abs_path, is_duplicate, hex_digest_dict - ) - assert hash_address.id == ab_id - assert hash_address.relpath == rel_path - assert hash_address.abspath == abs_path - assert hash_address.is_duplicate == is_duplicate - assert hash_address.hex_digests.get("md5") == hex_digest_dict["md5"] - assert hash_address.hex_digests.get("sha1") == hex_digest_dict["sha1"] - assert hash_address.hex_digests.get("sha224") == hex_digest_dict["sha224"] - assert hash_address.hex_digests.get("sha256") == hex_digest_dict["sha256"] - assert hash_address.hex_digests.get("sha512") == hex_digest_dict["sha512"] + object_metadata = ObjectMetadata(ab_id, obj_size, is_duplicate, hex_digest_dict) + assert object_metadata.id == ab_id + assert object_metadata.obj_size == obj_size + assert object_metadata.is_duplicate == is_duplicate + assert object_metadata.hex_digests.get("md5") == hex_digest_dict["md5"] + assert object_metadata.hex_digests.get("sha1") == hex_digest_dict["sha1"] + assert object_metadata.hex_digests.get("sha224") == hex_digest_dict["sha224"] + assert object_metadata.hex_digests.get("sha256") == hex_digest_dict["sha256"] + assert object_metadata.hex_digests.get("sha512") == hex_digest_dict["sha512"] From c46db04194e3efd81525f5a10e52ae47f24a8ad4 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 13 Jul 2023 14:19:48 -0700 Subject: [PATCH 042/165] Updated 'store_object' signature for 'expected_object_size', added verification logic, refactored related methods and added new pytests --- src/hashstore/filehashstore.py | 71 ++++++++++++++++++++++++--- src/hashstore/hashstore.py | 6 ++- tests/test_filehashstore.py | 20 ++++++++ tests/test_filehashstore_interface.py | 43 ++++++++++++++++ 4 files changed, 131 insertions(+), 9 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 
f3ac64ed..995d2805 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -408,6 +408,7 @@ def store_object( additional_algorithm=None, checksum=None, checksum_algorithm=None, + expected_object_size=None, ): logging.debug( "FileHashStore - store_object: Request to store object for pid: %s", pid @@ -415,6 +416,7 @@ def store_object( # Validate input parameters self._is_string_none_or_empty(pid, "pid", "store_object") self._validate_data_to_store(data) + self._validate_file_size(expected_object_size) ( additional_algorithm_checked, checksum_algorithm_checked, @@ -447,6 +449,7 @@ def store_object( additional_algorithm=additional_algorithm_checked, checksum=checksum, checksum_algorithm=checksum_algorithm_checked, + file_size_to_validate=expected_object_size, ) finally: # Release pid @@ -634,6 +637,7 @@ def put_object( additional_algorithm=None, checksum=None, checksum_algorithm=None, + file_size_to_validate=None, ): """Store contents of `file` on disk using the hash of the given pid @@ -660,7 +664,7 @@ def put_object( with closing(stream): ( object_cid, - file_size, + obj_file_size, is_duplicate, hex_digest_dict, ) = self._move_and_get_checksums( @@ -670,10 +674,11 @@ def put_object( additional_algorithm, checksum, checksum_algorithm, + file_size_to_validate, ) object_metadata = ObjectMetadata( - object_cid, file_size, is_duplicate, hex_digest_dict + object_cid, obj_file_size, is_duplicate, hex_digest_dict ) logging.debug( "FileHashStore - put_object: Successfully put object for pid: %s", @@ -689,6 +694,7 @@ def _move_and_get_checksums( additional_algorithm=None, checksum=None, checksum_algorithm=None, + file_size_to_validate=None, ): """Copy the contents of `stream` onto disk with an optional file extension appended. The copy process uses a temporary file to store the @@ -747,7 +753,14 @@ def _move_and_get_checksums( is_object_duplicate = False if not os.path.isfile(abs_file_path): self._validate_object( - checksum, checksum_algorithm, entity, hex_digests, tmp_file_name + pid, + checksum, + checksum_algorithm, + entity, + hex_digests, + tmp_file_name, + tmp_file_size, + file_size_to_validate, ) try: @@ -1065,25 +1078,48 @@ def _refine_algorithm_list(self, additional_algorithm, checksum_algorithm): return algorithm_list_to_calculate def _validate_object( - self, checksum, checksum_algorithm, entity, hex_digests, tmp_file_name + self, + pid, + checksum, + checksum_algorithm, + entity, + hex_digests, + tmp_file_name, + tmp_file_size, + file_size_to_validate, ): """Evaluates an object's integrity Args: + pid: For logging purposes checksum: Value of checksum checksum_algoritm: Algorithm of checksum entity: Type of object hex_digests: Dictionary of hex digests to select from tmp_file_name: Name of tmp file + tmp_file_size: Size of the tmp file + file_size_to_validate: Expected size of the object """ + if file_size_to_validate is not None and file_size_to_validate > 0: + if file_size_to_validate != tmp_file_size: + self.delete(entity, tmp_file_name) + exception_string = ( + "FileHashStore - _move_and_get_checksums: Object file size calculated: " + + f" {tmp_file_size} does not match with expected size:" + + f"{file_size_to_validate}. 
Tmp file deleted and file not stored for" + + f" pid: {pid}" + ) + logging.error(exception_string) + raise ValueError(exception_string) if checksum_algorithm is not None and checksum is not None: hex_digest_stored = hex_digests[checksum_algorithm] if hex_digest_stored != checksum: self.delete(entity, tmp_file_name) exception_string = ( "FileHashStore - _move_and_get_checksums: Hex digest and checksum" - + f" do not match - file not stored. Algorithm: {checksum_algorithm}." - + f" Checksum provided: {checksum} != Hex Digest: {hex_digest_stored}" + + f" do not match - file not stored for pid: {pid}. Algorithm:" + + f" {checksum_algorithm}. Checksum provided: {checksum} !=" + + f"HexDigest: {hex_digest_stored}. Tmp file deleted." ) logging.error(exception_string) raise ValueError(exception_string) @@ -1422,6 +1458,28 @@ def count(self, entity): # Other Static Methods + @staticmethod + def _validate_file_size(file_size): + """Checks whether a file size is > 0 and an int and throws exception if not. + + Args: + file_size (int): file size to check + """ + if file_size is not None: + if not isinstance(file_size, int): + exception_string = ( + "FileHashStore - _is_file_size_valid: size given must be an integer." + + f" File size: {file_size}. Arg Type: {type(file_size)}." + ) + logging.error(exception_string) + raise TypeError(exception_string) + if file_size < 1 or not isinstance(file_size, int): + exception_string = ( + "FileHashStore - _is_file_size_valid: size given must be > 0" + ) + logging.error(exception_string) + raise ValueError(exception_string) + @staticmethod def _is_string_none_or_empty(string, arg, method): """Checks whether a string is None or empty and throws an exception if so. @@ -1430,7 +1488,6 @@ def _is_string_none_or_empty(string, arg, method): string (string): Value to check arg (): Name of argument to check method (string): Calling method for logging purposes - """ if string is None or string.replace(" ", "") == "": exception_string = ( diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 4e74b4ed..96fbe99f 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -23,6 +23,7 @@ def store_object( additional_algorithm, checksum, checksum_algorithm, + expected_object_size, ): """The `store_object` method is responsible for the atomic storage of objects to disk using a given InputStream and a persistent identifier (pid). Upon @@ -46,8 +47,8 @@ def store_object( with its corresponding hex digest. An algorithm is considered "supported" if it is recognized as a valid hash algorithm in the `hashlib` library. - Similarly, if a checksum and a checksumAlgorithm value are provided, - `store_object` validates the object to ensure it matches what is provided + Similarly, if a file size and/or checksum & checksumAlgorithm value are provided, + `store_object` validates the object to ensure it matches the given arguments before moving the file to its permanent address. Args: @@ -56,6 +57,7 @@ def store_object( additional_algorithm (string): Additional hex digest to include. checksum (string): Checksum to validate against. checksum_algorithm (string): Algorithm of supplied checksum. 
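            Taken together with the size check documented below, a fully
            validated call might look like this sketch (argument values are
            placeholders and the checksum is truncated for readability):

                store.store_object(
                    pid,
                    path,
                    checksum="b748069c...8cf484cf",
                    checksum_algorithm="sha3_256",
                    expected_object_size=1234,
                )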
+ expected_object_size (int): Size of object to verify Returns: object_metadata (ObjectMetadata): Object that contains the permanent address, diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 70418f9c..22264e43 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -358,6 +358,26 @@ def test_move_and_get_checksums_duplicates_raises_error(pids, store): assert store.count(entity) == 3 +def test_move_and_get_checksums_file_size_raises_error(pids, store): + """Test move and get checksum raises error with incorrect file size""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + with pytest.raises(ValueError): + path = test_dir + pid.replace("/", "_") + input_stream = io.open(path, "rb") + incorrect_file_size = 1000 + # pylint: disable=W0212 + ( + _, + _, + _, + _, + ) = store._move_and_get_checksums( + pid, input_stream, file_size_to_validate=incorrect_file_size + ) + input_stream.close() + + def test_mktempfile_additional_algo(store): """Test _mktempfile returns correct hex digests for additional algorithm.""" test_dir = "tests/testdata/" diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 6ffb5b1c..e214f24c 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -378,6 +378,49 @@ def test_store_object_duplicate_raises_error(store): assert store.exists(entity, object_cid) +def test_store_object_with_obj_file_size(store, pids): + """Test store object with correct file sizes.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + obj_file_size = pids[pid]["file_size_bytes"] + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object( + pid, path, expected_object_size=obj_file_size + ) + object_size = object_metadata.obj_size + assert object_size == obj_file_size + + +def test_store_object_with_obj_file_size_incorrect(store, pids): + """Test store object throws exception with incorrect file size.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + obj_file_size = 1234 + path = test_dir + pid.replace("/", "_") + with pytest.raises(ValueError): + store.store_object(pid, path, expected_object_size=obj_file_size) + + +def test_store_object_with_obj_file_size_non_integer(store, pids): + """Test store object throws exception with a non integer value as the file size.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + obj_file_size = "Bob" + path = test_dir + pid.replace("/", "_") + with pytest.raises(TypeError): + store.store_object(pid, path, expected_object_size=obj_file_size) + + +def test_store_object_with_obj_file_size_zero(store, pids): + """Test store object throws exception with a non integer value as the file size.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + obj_file_size = 0 + path = test_dir + pid.replace("/", "_") + with pytest.raises(ValueError): + store.store_object(pid, path, expected_object_size=obj_file_size) + + def test_store_object_duplicates_threads(store): """Test store object thread lock.""" test_dir = "tests/testdata/" From bd9d3dff26b5f220f3e709922bf21412246e6c91 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 13 Jul 2023 14:30:38 -0700 Subject: [PATCH 043/165] Clean up pytests --- src/hashstore/filehashstore.py | 8 ++--- tests/test_filehashstore.py | 42 ++++++++++++--------------- tests/test_filehashstore_interface.py | 2 +- tests/test_hashstore.py | 4 +-- 4 files changed, 26 insertions(+), 30 deletions(-) diff --git a/src/hashstore/filehashstore.py 
b/src/hashstore/filehashstore.py index 995d2805..b77f26d2 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -740,7 +740,7 @@ def _move_and_get_checksums( + f" file and calculating checksums for pid: {pid}" ) logging.debug(debug_msg) - hex_digests, tmp_file_name, tmp_file_size = self._mktempfile( + hex_digests, tmp_file_name, tmp_file_size = self._mktmpfile( stream, additional_algorithm, checksum_algorithm ) logging.debug( @@ -823,7 +823,7 @@ def _move_and_get_checksums( return (object_cid, tmp_file_size, is_object_duplicate, hex_digests) - def _mktempfile(self, stream, additional_algorithm=None, checksum_algorithm=None): + def _mktmpfile(self, stream, additional_algorithm=None, checksum_algorithm=None): """Create a named temporary file from a `Stream` object and return its filename and a dictionary of its algorithms and hex digests. If an additionak and/or checksum algorithm is provided, it will add the respective hex digest to the dictionary. @@ -903,7 +903,7 @@ def put_metadata(self, metadata, pid, format_id): # Create metadata tmp file and write to it metadata_stream = Stream(metadata) with closing(metadata_stream): - metadata_tmp = self._mktempmetadata(metadata_stream) + metadata_tmp = self._mktmpmetadata(metadata_stream) # Get target and related paths (permanent location) metadata_cid = self.get_sha256_hex_digest(pid + format_id) @@ -943,7 +943,7 @@ def put_metadata(self, metadata, pid, format_id): logging.error(exception_string) raise FileNotFoundError(exception_string) - def _mktempmetadata(self, stream): + def _mktmpmetadata(self, stream): """Create a named temporary file with `stream` (metadata) and `format_id`. Args: diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 22264e43..f7b9b9ab 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -47,7 +47,7 @@ def test_init_existing_store_correct_algorithm_format(store): def test_init_write_properties_hashstore_yaml_exists(store): - """Verify properties file present in store root directory.""" + """Verify config file present in store root directory.""" assert os.path.exists(store.hashstore_configuration_yaml) @@ -57,7 +57,7 @@ def test_init_with_existing_hashstore_mismatched_config(store): "store_path": store.root, "store_depth": 1, "store_width": 2, - "store_algorithm": "sha256", + "store_algorithm": "SHA-256", "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", } with pytest.raises(ValueError): @@ -89,8 +89,6 @@ def test_load_properties(store): assert hashstore_yaml_dict.get("store_path") == store.root assert hashstore_yaml_dict.get("store_depth") == 3 assert hashstore_yaml_dict.get("store_width") == 2 - # Note, the store_algorithm from `hashstore.yaml` gets translated to a standardized value - # Ex. 
"SHA-256" is supplied but is written into the file as "SHA-256" assert hashstore_yaml_dict.get("store_algorithm") == "SHA-256" assert ( hashstore_yaml_dict.get("store_metadata_namespace") @@ -389,7 +387,7 @@ def test_mktempfile_additional_algo(store): "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) # pylint: disable=W0212 - hex_digests, _, _ = store._mktempfile( + hex_digests, _, _ = store._mktmpfile( input_stream, additional_algorithm=checksum_algo ) input_stream.close() @@ -407,9 +405,7 @@ def test_mktempfile_checksum_algo(store): "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) # pylint: disable=W0212 - hex_digests, _, _ = store._mktempfile( - input_stream, checksum_algorithm=checksum_algo - ) + hex_digests, _, _ = store._mktmpfile(input_stream, checksum_algorithm=checksum_algo) input_stream.close() assert hex_digests.get("sha3_256") == checksum_correct @@ -429,7 +425,7 @@ def test_mktempfile_checksum_and_additional_algo(store): "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) # pylint: disable=W0212 - hex_digests, _, _ = store._mktempfile( + hex_digests, _, _ = store._mktmpfile( input_stream, additional_algorithm=additional_algo, checksum_algorithm=checksum_algo, @@ -440,7 +436,7 @@ def test_mktempfile_checksum_and_additional_algo(store): def test_mktempfile_checksum_and_additional_algo_duplicate(store): - """Test _mktempfile succeeds with duplicate algorithms (de-duplicates)""" + """Test _mktempfile succeeds with duplicate algorithms (de-duplicates).""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -449,7 +445,7 @@ def test_mktempfile_checksum_and_additional_algo_duplicate(store): checksum_algo = "sha224" checksum_correct = "9b3a96f434f3c894359193a63437ef86fbd5a1a1a6cc37f1d5013ac1" # pylint: disable=W0212 - hex_digests, _, _ = store._mktempfile( + hex_digests, _, _ = store._mktmpfile( input_stream, additional_algorithm=additional_algo, checksum_algorithm=checksum_algo, @@ -465,7 +461,7 @@ def test_mktempfile_file_size(pids, store): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") # pylint: disable=W0212 - _, _, tmp_file_size = store._mktempfile(input_stream) + _, _, tmp_file_size = store._mktmpfile(input_stream) input_stream.close() assert tmp_file_size == pids[pid]["file_size_bytes"] @@ -477,7 +473,7 @@ def test_mktempfile_hex_digests(pids, store): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") # pylint: disable=W0212 - hex_digests, _, _ = store._mktempfile(input_stream) + hex_digests, _, _ = store._mktmpfile(input_stream) input_stream.close() assert hex_digests.get("md5") == pids[pid]["md5"] assert hex_digests.get("sha1") == pids[pid]["sha1"] @@ -493,7 +489,7 @@ def test_mktempfile_tmpfile_object(pids, store): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") # pylint: disable=W0212 - _, tmp_file_name, _ = store._mktempfile(input_stream) + _, tmp_file_name, _ = store._mktmpfile(input_stream) input_stream.close() assert os.path.isfile(tmp_file_name) is True @@ -507,10 +503,10 @@ def test_mktempfile_with_unsupported_algorithm(pids, store): algo = "md2" with pytest.raises(ValueError): # pylint: disable=W0212 - _, _, _ = store._mktempfile(input_stream, additional_algorithm=algo) + _, _, _ = store._mktmpfile(input_stream, additional_algorithm=algo) with pytest.raises(ValueError): # pylint: disable=W0212 - _, _, _ = store._mktempfile(input_stream, checksum_algorithm=algo) + _, _, _ = store._mktmpfile(input_stream, 
checksum_algorithm=algo) input_stream.close() @@ -551,7 +547,7 @@ def test_put_metadata_cid(pids, store): assert metadata_cid == pids[pid]["metadata_cid"] -def test_mktempmetadata(pids, store): +def test_mktmpmetadata(pids, store): """Test mktmpmetadata creates tmpFile.""" test_dir = "tests/testdata/" entity = "metadata" @@ -560,7 +556,7 @@ def test_mktempmetadata(pids, store): syspath = Path(test_dir) / filename sys_stream = io.open(syspath, "rb") # pylint: disable=W0212 - tmp_name = store._mktempmetadata(sys_stream) + tmp_name = store._mktmpmetadata(sys_stream) sys_stream.close() assert store.exists(entity, tmp_name) @@ -662,7 +658,7 @@ def test_open_objects(pids, store): def test_delete_by_object_metadata_id(pids, store): - """Check objects are deleted after calling delete with hash addres id""" + """Check objects are deleted after calling delete with hash address id.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -774,7 +770,7 @@ def test_get_real_path_file_does_not_exist(store): def test_get_real_path_with_object_id(store, pids): - """Test get_real_path returns absolute path given an object id""" + """Test get_real_path returns absolute path given an object id.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -785,7 +781,7 @@ def test_get_real_path_with_object_id(store, pids): def test_get_real_path_with_object_id_sharded(pids, store): - """Test exists method with a sharded path (relative path)""" + """Test exists method with a sharded path (relative path).""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): @@ -798,7 +794,7 @@ def test_get_real_path_with_object_id_sharded(pids, store): def test_get_real_path_with_metadata_id(store, pids): - """Test get_real_path returns absolute path given a metadata id""" + """Test get_real_path returns absolute path given a metadata id.""" entity = "metadata" test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" @@ -811,7 +807,7 @@ def test_get_real_path_with_metadata_id(store, pids): def test_get_real_path_with_bad_entity(store, pids): - """Test get_real_path returns absolute path given an object id""" + """Test get_real_path returns absolute path given an object id.""" test_dir = "tests/testdata/" entity = "bad_entity" for pid in pids.keys(): diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index e214f24c..ecadca95 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -412,7 +412,7 @@ def test_store_object_with_obj_file_size_non_integer(store, pids): def test_store_object_with_obj_file_size_zero(store, pids): - """Test store object throws exception with a non integer value as the file size.""" + """Test store object throws exception with zero as the file size.""" test_dir = "tests/testdata/" for pid in pids.keys(): obj_file_size = 0 diff --git a/tests/test_hashstore.py b/tests/test_hashstore.py index 59b1d1c8..1d618810 100644 --- a/tests/test_hashstore.py +++ b/tests/test_hashstore.py @@ -37,7 +37,7 @@ def test_factory_get_hashstore_unsupported_class(factory): def test_factory_get_hashstore_unsupported_module(factory): """Check that ModuleNotFoundError is raised when provided with unsupported module.""" with pytest.raises(ModuleNotFoundError): - module_name = "hashstore.s3filestore.s3filestore" + module_name = "hashstore.s3filestore" class_name = "FileHashStore" factory.get_hashstore(module_name, class_name) @@ -51,7 +51,7 @@ def 
test_factory_get_hashstore_filehashstore_unsupported_algorithm(factory): "store_path": os.getcwd() + "/metacat/test", "store_depth": 3, "store_width": 2, - "store_algorithm": "md2", + "store_algorithm": "MD2", "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", } with pytest.raises(ValueError): From 9eb0a2aef967db486be2c1ad6f027a85715a4b7d Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 13 Jul 2023 14:46:56 -0700 Subject: [PATCH 044/165] Re-arrange order of '__all__' values in init.py --- src/hashstore/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hashstore/__init__.py b/src/hashstore/__init__.py index 9435077e..352bd3d3 100644 --- a/src/hashstore/__init__.py +++ b/src/hashstore/__init__.py @@ -17,4 +17,4 @@ from hashstore.hashstore import HashStore, HashStoreFactory, ObjectMetadata -__all__ = ("HashStore", "ObjectMetadata", "HashStoreFactory") +__all__ = ("HashStore", "HashStoreFactory", "ObjectMetadata") From caa75d85e8c17ace356766b8bec3363654c08b1b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 14 Jul 2023 10:12:19 -0700 Subject: [PATCH 045/165] Add new python module 'client.py' --- src/hashstore/client.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 src/hashstore/client.py diff --git a/src/hashstore/client.py b/src/hashstore/client.py new file mode 100644 index 00000000..bb27d06b --- /dev/null +++ b/src/hashstore/client.py @@ -0,0 +1,17 @@ +"""HashStore Command Line App""" +from argparse import ArgumentParser + +if __name__ == "__main__": + PROGRAM_NAME = "HashStore Command Line Client" + DESCRIPTION = ( + "A command-line tool to convert a directory of data objects" + + " into a hashstore and perform operations to store, retrieve," + + " and delete the objects." + ) + EPILOG = "By Dou Mok" + parser = ArgumentParser( + prog=PROGRAM_NAME, + description=DESCRIPTION, + epilog=EPILOG, + ) + parser.parse_args() From d83d31010cac99a423cf078dfd4db39bd2e21b69 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 14 Jul 2023 12:06:10 -0700 Subject: [PATCH 046/165] Add initial positional and optional arguments --- src/hashstore/client.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index bb27d06b..4a233490 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -8,10 +8,23 @@ + " into a hashstore and perform operations to store, retrieve," + " and delete the objects." 
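    # For orientation: argparse renders these three values in `--help` output
    # roughly as follows (shape only, not verbatim):
    #
    #   usage: HashStore Command Line Client [-h] ...
    #   <DESCRIPTION text>
    #   <options list>
    #   <EPILOG text>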
) - EPILOG = "By Dou Mok" + EPILOG = "Created for DataONE (NCEAS)" parser = ArgumentParser( prog=PROGRAM_NAME, description=DESCRIPTION, epilog=EPILOG, ) - parser.parse_args() + + # Positional Arguments + # Path of the HashStore to create and/or store/delete objects to/from + parser.add_argument("store_path", help="Path of the HashStore") + + # Optional Arguments + parser.add_argument("-chs", dest="action", help="Create a HashStore") + parser.add_argument("-sobj", dest="action", help="Store an object to the HashStore") + parser.add_argument( + "-dobj", dest="action", help="Delete an object to the HashStore" + ) + parser.add_argument("-pid", dest="pid", help="Object Identifier") + + parser.parse_args(["--help"]) From 5ec71a0b5ed2a2a89bd3c64e099227355143883e Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 14 Jul 2023 12:22:30 -0700 Subject: [PATCH 047/165] Add missing optional arguments --- src/hashstore/client.py | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 4a233490..3eb7e272 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -15,16 +15,47 @@ epilog=EPILOG, ) - # Positional Arguments - # Path of the HashStore to create and/or store/delete objects to/from + ### Positional Arguments + + # Path of the HashStore parser.add_argument("store_path", help="Path of the HashStore") - # Optional Arguments + ### Optional Arguments + + # HashStore creation and property arguments parser.add_argument("-chs", dest="action", help="Create a HashStore") + parser.add_argument("-store_depth", dest="action", help="Depth of HashStore") + parser.add_argument("-store_width", dest="action", help="Width of HashStore") + parser.add_argument( + "-store_algorithm", + dest="action", + help="Algorithm to use when calculating object address", + ) + parser.add_argument( + "-store_namespace", + dest="action", + help="Default metadata namespace for metadata", + ) + + # Directory to convert into a HashStore + parser.add_argument( + "-dir", + dest="action", + help="Directory of objects to convert to a HashStore", + ) + + # Public API Equivalent Methods + # object identifier + parser.add_argument("-pid", dest="pid", help="Object Identifier") + # store_object parser.add_argument("-sobj", dest="action", help="Store an object to the HashStore") + # delete_object parser.add_argument( "-dobj", dest="action", help="Delete an object to the HashStore" ) - parser.add_argument("-pid", dest="pid", help="Object Identifier") + # TODO: Add methods and functionality parser.parse_args(["--help"]) + + # args = parser.parse_args() + # print(args) From c2a0b322faccfb8f3bc8e090fa734e949988ed3a Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 14 Jul 2023 13:36:46 -0700 Subject: [PATCH 048/165] Fix 'dest' keys for optional arguments --- src/hashstore/client.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 3eb7e272..3773bd6e 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -24,16 +24,16 @@ # HashStore creation and property arguments parser.add_argument("-chs", dest="action", help="Create a HashStore") - parser.add_argument("-store_depth", dest="action", help="Depth of HashStore") - parser.add_argument("-store_width", dest="action", help="Width of HashStore") + parser.add_argument("-store_depth", dest="depth", help="Depth of HashStore") + parser.add_argument("-store_width", dest="width", help="Width of HashStore") 
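    # Illustrative aside on why distinct `dest` values matter here: argparse
    # stores each option's value on the namespace attribute named by `dest`,
    # so options sharing dest="action" silently overwrite one another. A
    # standalone sketch of the clash the change above removes:
    #
    #   p = ArgumentParser()
    #   p.add_argument("-store_depth", dest="action")
    #   p.add_argument("-store_width", dest="action")
    #   ns = p.parse_args(["-store_depth", "3", "-store_width", "2"])
    #   ns.action  # "2" -- the depth value was lost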
parser.add_argument( "-store_algorithm", - dest="action", + dest="algorithm", help="Algorithm to use when calculating object address", ) parser.add_argument( "-store_namespace", - dest="action", + dest="formatid", help="Default metadata namespace for metadata", ) From 561ba3c593e2fecb67987019ef732c91bf565a96 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 14 Jul 2023 14:25:07 -0700 Subject: [PATCH 049/165] Add functionality to create a HashStore in a given directory via client --- src/hashstore/client.py | 107 ++++++++++++++++++++++++++-------------- 1 file changed, 69 insertions(+), 38 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 3773bd6e..b0d97896 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -1,61 +1,92 @@ """HashStore Command Line App""" from argparse import ArgumentParser +from hashstore import HashStoreFactory -if __name__ == "__main__": - PROGRAM_NAME = "HashStore Command Line Client" - DESCRIPTION = ( - "A command-line tool to convert a directory of data objects" - + " into a hashstore and perform operations to store, retrieve," - + " and delete the objects." - ) - EPILOG = "Created for DataONE (NCEAS)" - parser = ArgumentParser( - prog=PROGRAM_NAME, - description=DESCRIPTION, - epilog=EPILOG, - ) - ### Positional Arguments +def add_client_optional_arguments(argp): + """Adds the optional arguments for HashStore Client. - # Path of the HashStore - parser.add_argument("store_path", help="Path of the HashStore") + Args: + argp (parser): argparse Parser object - ### Optional Arguments - - # HashStore creation and property arguments - parser.add_argument("-chs", dest="action", help="Create a HashStore") - parser.add_argument("-store_depth", dest="depth", help="Depth of HashStore") - parser.add_argument("-store_width", dest="width", help="Width of HashStore") - parser.add_argument( + """ + argp.add_argument( + "-chs", + dest="create_hashstore", + action="store_true", + help="Create a HashStore", + ) + argp.add_argument("-dp", "-store_depth", dest="depth", help="Depth of HashStore") + argp.add_argument("-wp", "-store_width", dest="width", help="Width of HashStore") + argp.add_argument( + "-ap", "-store_algorithm", dest="algorithm", help="Algorithm to use when calculating object address", ) - parser.add_argument( + argp.add_argument( + "-nsp", "-store_namespace", dest="formatid", help="Default metadata namespace for metadata", ) # Directory to convert into a HashStore - parser.add_argument( + argp.add_argument( "-dir", - dest="action", + dest="directory_to_convert", help="Directory of objects to convert to a HashStore", ) - # Public API Equivalent Methods - # object identifier - parser.add_argument("-pid", dest="pid", help="Object Identifier") - # store_object - parser.add_argument("-sobj", dest="action", help="Store an object to the HashStore") - # delete_object - parser.add_argument( - "-dobj", dest="action", help="Delete an object to the HashStore" + +def get_hashstore(properties): + """Create a HashStore instance with the supplied properties. 
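    A thin convenience wrapper over HashStoreFactory. A typical call looks
    like this sketch (property values are placeholders drawn from the
    defaults used elsewhere in this module):

        props = {
            "store_path": "/var/metacat/hashstore",
            "store_depth": 3,
            "store_width": 2,
            "store_algorithm": "SHA-256",
            "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0",
        }
        store = get_hashstore(props)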
+ + Args: + properties: HashStore properties (see 'FileHashStore' module for details) + + Returns: + hashstore (FileHashStore): HashStore + """ + store = HashStoreFactory() + + # Get HashStore from factory + module_name = "filehashstore" + class_name = "FileHashStore" + + # Class variables + hashstore = store.get_hashstore(module_name, class_name, properties) + return hashstore + + +if __name__ == "__main__": + PROGRAM_NAME = "HashStore Command Line Client" + DESCRIPTION = ( + "A command-line tool to convert a directory of data objects" + + " into a hashstore and perform operations to store, retrieve," + + " and delete the objects." + ) + EPILOG = "Created for DataONE (NCEAS)" + parser = ArgumentParser( + prog=PROGRAM_NAME, + description=DESCRIPTION, + epilog=EPILOG, ) - # TODO: Add methods and functionality - parser.parse_args(["--help"]) + ### Add Positional and Optional Arguments + parser.add_argument("store_path", help="Path of the HashStore") + add_client_optional_arguments(parser) - # args = parser.parse_args() - # print(args) + # Client entry point + args = parser.parse_args() + if getattr(args, "create_hashstore"): + # Create a HashStore at the given directory + # Get store attributes and validate properties + props = { + "store_path": getattr(args, "store_path"), + "store_depth": getattr(args, "depth"), + "store_width": getattr(args, "width"), + "store_algorithm": getattr(args, "algorithm"), + "store_metadata_namespace": getattr(args, "formatid"), + } + my_store = get_hashstore(props) From 37eed903f0a4a1ab47675fe60bac6040952f2be0 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 14 Jul 2023 14:37:02 -0700 Subject: [PATCH 050/165] Code Check-in: Add skeleton to convert a directory (store objects) into a HashStore at a given store path --- src/hashstore/client.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index b0d97896..441d44f7 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -33,8 +33,8 @@ def add_client_optional_arguments(argp): # Directory to convert into a HashStore argp.add_argument( - "-dir", - dest="directory_to_convert", + "-cvd", + dest="convert_directory", help="Directory of objects to convert to a HashStore", ) @@ -59,6 +59,12 @@ def get_hashstore(properties): return hashstore +def convert_directory_to_hashstore(directory, hashstore_path): + """Store objects in a given directory into HashStore.""" + print(directory) + print(hashstore_path) + + if __name__ == "__main__": PROGRAM_NAME = "HashStore Command Line Client" DESCRIPTION = ( @@ -90,3 +96,9 @@ def get_hashstore(properties): "store_metadata_namespace": getattr(args, "formatid"), } my_store = get_hashstore(props) + if getattr(args, "convert_directory"): + # If HashStore does not exist, raise exception + ## Calling app must create HashStore first + # Get list of files from directory + # Store them into HashStore + pass From dbbba3b6abf32aa528dbfb6577d7d2d1f5476810 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 14 Jul 2023 14:39:59 -0700 Subject: [PATCH 051/165] Reword comments --- src/hashstore/client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 441d44f7..987d90bf 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -4,7 +4,7 @@ def add_client_optional_arguments(argp): - """Adds the optional arguments for HashStore Client. + """Adds the optional arguments for the HashStore Client. 
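    Intended to be composed with the positional arguments in the module's
    `__main__` block, e.g. (sketch mirroring that usage):

        parser = ArgumentParser(prog=PROGRAM_NAME)
        parser.add_argument("store_path", help="Path of the HashStore")
        add_client_optional_arguments(parser)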
Args: argp (parser): argparse Parser object @@ -87,7 +87,7 @@ def convert_directory_to_hashstore(directory, hashstore_path): args = parser.parse_args() if getattr(args, "create_hashstore"): # Create a HashStore at the given directory - # Get store attributes and validate properties + # Get store attributes, HashStore will validate properties props = { "store_path": getattr(args, "store_path"), "store_depth": getattr(args, "depth"), From 206ac674f93bc26bd9a64fdadf409d8e8b9da4d4 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 17 Jul 2023 11:32:45 -0700 Subject: [PATCH 052/165] Fix bug in '_verify_hashstore_properties' where instantiation via CLI fails due to str vs int comparison --- src/hashstore/filehashstore.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index b77f26d2..7b09d51c 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -304,7 +304,10 @@ def _verify_hashstore_properties(self, properties, prop_store_path): # If 'hashstore.yaml' is found, verify given properties before init hashstore_yaml_dict = self.load_properties() for key in self.property_required_keys: - if hashstore_yaml_dict[key] != properties[key]: + checked_key = properties[key] + if key is "store_depth" or key is "store_width": + checked_key = int(properties[key]) + if hashstore_yaml_dict[key] != checked_key: exception_string = ( f"FileHashStore - Given properties ({key}: {properties[key]}) does not" + f" match. HashStore configuration ({key}: {hashstore_yaml_dict[key]})" From c5fd845d6cec9d9f3c034ceb72f02f0b41bb3469 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 17 Jul 2023 11:57:53 -0700 Subject: [PATCH 053/165] Add method to store/convert a directory of objects into HashStore via 'client.py' --- src/hashstore/client.py | 100 ++++++++++++++++++++++++++++++++++------ 1 file changed, 85 insertions(+), 15 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 987d90bf..c001c994 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -1,4 +1,6 @@ """HashStore Command Line App""" +import os +import yaml from argparse import ArgumentParser from hashstore import HashStoreFactory @@ -48,21 +50,73 @@ def get_hashstore(properties): Returns: hashstore (FileHashStore): HashStore """ - store = HashStoreFactory() + factory = HashStoreFactory() # Get HashStore from factory module_name = "filehashstore" class_name = "FileHashStore" # Class variables - hashstore = store.get_hashstore(module_name, class_name, properties) + hashstore = factory.get_hashstore(module_name, class_name, properties) return hashstore -def convert_directory_to_hashstore(directory, hashstore_path): - """Store objects in a given directory into HashStore.""" - print(directory) - print(hashstore_path) +def load_properties(hashstore_yaml): + """Get and return the contents of the current HashStore configuration. + + Returns: + hashstore_yaml_dict (dict): HashStore properties with the following keys (and values): + store_path (str): Path to the HashStore directory. + store_depth (int): Depth when sharding an object's hex digest. + store_width (int): Width of directories when sharding an object's hex digest. + store_algorithm (str): Hash algorithm used for calculating the object's hex digest. + store_metadata_namespace (str): Namespace for the HashStore's system metadata. 
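        Example (illustrative; the store path is a placeholder):
            >>> props = load_properties("/var/metacat/hashstore/hashstore.yaml")
            >>> (props["store_depth"], props["store_width"])
            (3, 2)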
+ """ + property_required_keys = [ + "store_path", + "store_depth", + "store_width", + "store_algorithm", + "store_metadata_namespace", + ] + + if not os.path.exists(hashstore_yaml): + exception_string = ( + "HashStore CLI Client - load_properties: hashstore.yaml not found" + + " in store root path." + ) + raise FileNotFoundError(exception_string) + # Open file + with open(hashstore_yaml, "r", encoding="utf-8") as file: + yaml_data = yaml.safe_load(file) + + # Get hashstore properties + hashstore_yaml_dict = {} + for key in property_required_keys: + checked_property = yaml_data[key] + if key == "store_depth" or key == "store_width": + checked_property = int(yaml_data[key]) + hashstore_yaml_dict[key] = checked_property + return hashstore_yaml_dict + + +def convert_directory_to_hashstore(config_yaml): + """Store objects in a given directory into HashStore with a random pid.""" + properties = load_properties(config_yaml) + store = get_hashstore(properties) + + # Get list of files from directory + obj_list = os.listdir(directory_to_convert) + + # Store them into HashStore + # pylint: disable=C0103 + pid_count = 1 + for obj in obj_list: + # Temporary unique identifier + pid = f"dou.test.{pid_count}" + pid_count += 1 + obj_file_path = directory_to_convert + "/" + obj + _hash_address = store.store_object(pid, obj_file_path) if __name__ == "__main__": @@ -85,20 +139,36 @@ def convert_directory_to_hashstore(directory, hashstore_path): # Client entry point args = parser.parse_args() + + # Create HashStore if -chs flag is true if getattr(args, "create_hashstore"): # Create a HashStore at the given directory # Get store attributes, HashStore will validate properties props = { "store_path": getattr(args, "store_path"), - "store_depth": getattr(args, "depth"), - "store_width": getattr(args, "width"), + "store_depth": int(getattr(args, "depth")), + "store_width": int(getattr(args, "width")), "store_algorithm": getattr(args, "algorithm"), "store_metadata_namespace": getattr(args, "formatid"), } - my_store = get_hashstore(props) - if getattr(args, "convert_directory"): - # If HashStore does not exist, raise exception - ## Calling app must create HashStore first - # Get list of files from directory - # Store them into HashStore - pass + get_hashstore(props) + + # Convert a directory into HashStore if config file and directory exist + elif getattr(args, "convert_directory") is not None: + directory_to_convert = getattr(args, "convert_directory") + if os.path.exists(directory_to_convert): + store_path = getattr(args, "store_path") + store_path_config_yaml = store_path + "/hashstore.yaml" + if os.path.exists(store_path_config_yaml): + convert_directory_to_hashstore(store_path_config_yaml) + else: + # If HashStore does not exist, raise exception + # Calling app must create HashStore first before calling methods + raise FileNotFoundError( + f"Missing config file (hashstore.yaml) at store path: {store_path}." + + " HashStore must be initialized, use `--help` for more information." + ) + else: + raise FileNotFoundError( + f"Directory to convert does not exist: {getattr(args, 'convert_directory')}." 
+ ) From 4392e21bad3ac6175e44506cd369f3c663bf557f Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 17 Jul 2023 13:22:28 -0700 Subject: [PATCH 054/165] Add new optional argument '-nobj' to allow storing of a specific number of objs in a directory to convert --- src/hashstore/client.py | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index c001c994..c542ad73 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -1,7 +1,7 @@ """HashStore Command Line App""" import os -import yaml from argparse import ArgumentParser +import yaml from hashstore import HashStoreFactory @@ -39,6 +39,11 @@ def add_client_optional_arguments(argp): dest="convert_directory", help="Directory of objects to convert to a HashStore", ) + argp.add_argument( + "-nobj", + dest="num_obj_to_convert", + help="Number of objects to convert", + ) def get_hashstore(properties): @@ -100,22 +105,27 @@ def load_properties(hashstore_yaml): return hashstore_yaml_dict -def convert_directory_to_hashstore(config_yaml): - """Store objects in a given directory into HashStore with a random pid.""" +def convert_directory_to_hashstore(config_yaml, num): + """Store objects in a given directory into HashStore with a random pid. + + Args: + config_yaml (str): Path to HashStore config file `hashstore.yaml` + num (int): Number of files to store + """ properties = load_properties(config_yaml) store = get_hashstore(properties) # Get list of files from directory obj_list = os.listdir(directory_to_convert) + if num is None: + checked_num = len(obj_list) + else: + checked_num = int(num) # Store them into HashStore - # pylint: disable=C0103 - pid_count = 1 - for obj in obj_list: - # Temporary unique identifier - pid = f"dou.test.{pid_count}" - pid_count += 1 - obj_file_path = directory_to_convert + "/" + obj + for i in range(0, checked_num): + pid = f"dou.test.{i}" + obj_file_path = directory_to_convert + "/" + obj_list[i] _hash_address = store.store_object(pid, obj_file_path) @@ -157,10 +167,13 @@ def convert_directory_to_hashstore(config_yaml): elif getattr(args, "convert_directory") is not None: directory_to_convert = getattr(args, "convert_directory") if os.path.exists(directory_to_convert): + number_of_objects_to_convert = getattr(args, "num_obj_to_convert") store_path = getattr(args, "store_path") store_path_config_yaml = store_path + "/hashstore.yaml" if os.path.exists(store_path_config_yaml): - convert_directory_to_hashstore(store_path_config_yaml) + convert_directory_to_hashstore( + store_path_config_yaml, number_of_objects_to_convert + ) else: # If HashStore does not exist, raise exception # Calling app must create HashStore first before calling methods From 788f688d937c52f237207da1b594b422b024ac91 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 19 Jul 2023 10:45:54 -0700 Subject: [PATCH 055/165] Revise usage of comparison operatior 'is' to '==' for string comparison --- src/hashstore/filehashstore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 7b09d51c..a8092e9f 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -305,7 +305,7 @@ def _verify_hashstore_properties(self, properties, prop_store_path): hashstore_yaml_dict = self.load_properties() for key in self.property_required_keys: checked_key = properties[key] - if key is "store_depth" or key is "store_width": + if key == "store_depth" or key == 
"store_width": checked_key = int(properties[key]) if hashstore_yaml_dict[key] != checked_key: exception_string = ( From 18e063243c4b813aed5cb8602ff1b8a606819120 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 19 Jul 2023 12:02:30 -0700 Subject: [PATCH 056/165] Add basic metadata from 'client.py' store conversion into 'client_metadata.txt' at store path --- src/hashstore/client.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index c542ad73..28a42d18 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -3,6 +3,7 @@ from argparse import ArgumentParser import yaml from hashstore import HashStoreFactory +from datetime import datetime def add_client_optional_arguments(argp): @@ -105,6 +106,17 @@ def load_properties(hashstore_yaml): return hashstore_yaml_dict +def write_command_metadata(directory, filename, content): + """Write a text file to a given directory.""" + # Combine the directory path and filename + file_path = f"{directory}/{filename}" + + # Open the file in write mode ('w') + with open(file_path, "w", encoding="utf-8") as file: + # Write the content to the file + file.write(content) + + def convert_directory_to_hashstore(config_yaml, num): """Store objects in a given directory into HashStore with a random pid. @@ -123,10 +135,15 @@ def convert_directory_to_hashstore(config_yaml, num): checked_num = int(num) # Store them into HashStore + start_time = datetime.now() for i in range(0, checked_num): pid = f"dou.test.{i}" obj_file_path = directory_to_convert + "/" + obj_list[i] _hash_address = store.store_object(pid, obj_file_path) + end_time = datetime.now() + + content = f"Start Time: {start_time}\nEnd Time: {end_time}" + write_command_metadata(properties["store_path"], "client_metadata.txt", content) if __name__ == "__main__": From da7f3da389f9459147ffa4f1b1ee50df38646308 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 20 Jul 2023 09:38:28 -0700 Subject: [PATCH 057/165] Refactor 'client.py' to process a queue of objects to store using 5 threads total --- src/hashstore/client.py | 60 ++++++++++++++++++++++++++++++++--------- 1 file changed, 48 insertions(+), 12 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 28a42d18..d459b02d 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -1,9 +1,11 @@ """HashStore Command Line App""" import os from argparse import ArgumentParser +from datetime import datetime +import queue +import threading import yaml from hashstore import HashStoreFactory -from datetime import datetime def add_client_optional_arguments(argp): @@ -106,7 +108,7 @@ def load_properties(hashstore_yaml): return hashstore_yaml_dict -def write_command_metadata(directory, filename, content): +def write_text_to_path(directory, filename, content): """Write a text file to a given directory.""" # Combine the directory path and filename file_path = f"{directory}/{filename}" @@ -117,33 +119,65 @@ def write_command_metadata(directory, filename, content): file.write(content) -def convert_directory_to_hashstore(config_yaml, num): +def convert_directory_to_hashstore(obj_directory, config_yaml, num): """Store objects in a given directory into HashStore with a random pid. 
Args: + obj_directory (str): Directory to convert config_yaml (str): Path to HashStore config file `hashstore.yaml` num (int): Number of files to store """ + properties = load_properties(config_yaml) store = get_hashstore(properties) + def process_store_obj_queue(my_queue): + """Store object to HashStore""" + while not my_queue.empty(): + queue_item = my_queue.get() + pid = queue_item["pid"] + obj_path = queue_item["obj_path"] + _hash_address = store.store_object(pid, obj_path) + # Get list of files from directory - obj_list = os.listdir(directory_to_convert) + obj_list = os.listdir(obj_directory) + # Create queue + store_obj_queue = queue.Queue(maxsize=len(obj_list)) + + # Check number of files to store if num is None: checked_num = len(obj_list) else: checked_num = int(num) - # Store them into HashStore - start_time = datetime.now() + # Make a queue of objects to store for i in range(0, checked_num): - pid = f"dou.test.{i}" - obj_file_path = directory_to_convert + "/" + obj_list[i] - _hash_address = store.store_object(pid, obj_file_path) - end_time = datetime.now() + item_dict = { + "pid": f"dou.test.{i}", + "obj_path": obj_directory + "/" + obj_list[i], + } + store_obj_queue.put(item_dict) + + # Number of threads + num_threads = 5 + # Create and start threads + start_time = datetime.now() + threads = [] + for _ in range(num_threads): + thread = threading.Thread( + target=process_store_obj_queue, args=(store_obj_queue,) + ) + thread.start() + threads.append(thread) + + # Wait for all threads to finish + for thread in threads: + thread.join() + + end_time = datetime.now() content = f"Start Time: {start_time}\nEnd Time: {end_time}" - write_command_metadata(properties["store_path"], "client_metadata.txt", content) + write_text_to_path(properties["store_path"], "client_metadata.txt", content) if __name__ == "__main__": @@ -189,7 +223,9 @@ def convert_directory_to_hashstore(config_yaml, num): store_path_config_yaml = store_path + "/hashstore.yaml" if os.path.exists(store_path_config_yaml): convert_directory_to_hashstore( - store_path_config_yaml, number_of_objects_to_convert + directory_to_convert, + store_path_config_yaml, + number_of_objects_to_convert, ) else: # If HashStore does not exist, raise exception From 016b4fbdd910bbd32f8cd682c1e59347ceb0612f Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 20 Jul 2023 11:39:26 -0700 Subject: [PATCH 058/165] Refactor 'client.py' to use asyncio to test storing objects concurrently --- src/hashstore/client.py | 69 +++++++++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 26 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index d459b02d..34b83d15 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -2,8 +2,8 @@ import os from argparse import ArgumentParser from datetime import datetime +import asyncio import queue -import threading import yaml from hashstore import HashStoreFactory @@ -119,7 +119,7 @@ def write_text_to_path(directory, filename, content): file.write(content) -def convert_directory_to_hashstore(obj_directory, config_yaml, num): +async def convert_directory_to_hashstore(obj_directory, config_yaml, num): """Store objects in a given directory into HashStore with a random pid. 
Args: @@ -131,18 +131,25 @@ def convert_directory_to_hashstore(obj_directory, config_yaml, num): properties = load_properties(config_yaml) store = get_hashstore(properties) - def process_store_obj_queue(my_queue): + # def process_store_obj_queue_thread(my_queue): + # """Store object to HashStore""" + # while not my_queue.empty(): + # queue_item = my_queue.get() + # pid = queue_item["pid"] + # obj_path = queue_item["obj_path"] + # _hash_address = store.store_object(pid, obj_path) + + async def store_obj(item): """Store object to HashStore""" - while not my_queue.empty(): - queue_item = my_queue.get() - pid = queue_item["pid"] - obj_path = queue_item["obj_path"] - _hash_address = store.store_object(pid, obj_path) + pid = item["pid"] + obj_path = item["obj_path"] + store.store_object(pid, obj_path) # Get list of files from directory obj_list = os.listdir(obj_directory) # Create queue store_obj_queue = queue.Queue(maxsize=len(obj_list)) + store_obj_list = [] # Check number of files to store if num is None: @@ -157,24 +164,32 @@ def process_store_obj_queue(my_queue): "obj_path": obj_directory + "/" + obj_list[i], } store_obj_queue.put(item_dict) + store_obj_list.append(item_dict) - # Number of threads - num_threads = 5 - - # Create and start threads + # Start start_time = datetime.now() - threads = [] - for _ in range(num_threads): - thread = threading.Thread( - target=process_store_obj_queue, args=(store_obj_queue,) - ) - thread.start() - threads.append(thread) - # Wait for all threads to finish - for thread in threads: - thread.join() + coroutines = [store_obj(item) for item in store_obj_list] + await asyncio.gather(*coroutines) + + # # Number of threads + # num_threads = 5 + # threads = [] + # for _ in range(num_threads): + # thread = threading.Thread( + # target=process_store_obj_queue, args=(store_obj_queue,) + # ) + # thread.start() + # threads.append(thread) + + # # Wait for all threads to finish + # for thread in threads: + # thread.join() + + # for _ in range(0, checked_num): + # process_store_obj_queue(store_obj_queue) + # End end_time = datetime.now() content = f"Start Time: {start_time}\nEnd Time: {end_time}" write_text_to_path(properties["store_path"], "client_metadata.txt", content) @@ -222,10 +237,12 @@ def process_store_obj_queue(my_queue): store_path = getattr(args, "store_path") store_path_config_yaml = store_path + "/hashstore.yaml" if os.path.exists(store_path_config_yaml): - convert_directory_to_hashstore( - directory_to_convert, - store_path_config_yaml, - number_of_objects_to_convert, + asyncio.run( + convert_directory_to_hashstore( + directory_to_convert, + store_path_config_yaml, + number_of_objects_to_convert, + ) ) else: # If HashStore does not exist, raise exception From d4ad44be15c21c162871f4b19b468e59c3c6c42b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 20 Jul 2023 12:25:16 -0700 Subject: [PATCH 059/165] Refactor 'client.py' by adding await statement when calling store object to test improving speed --- src/hashstore/client.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 34b83d15..d5f884b9 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -143,7 +143,11 @@ async def store_obj(item): """Store object to HashStore""" pid = item["pid"] obj_path = item["obj_path"] - store.store_object(pid, obj_path) + + async def store_obj_await(pid, path): + store.store_object(pid, path) + + await store_obj_await(pid, obj_path) # Get list of files from directory obj_list = 
os.listdir(obj_directory) @@ -191,7 +195,7 @@ async def store_obj(item): # End end_time = datetime.now() - content = f"Start Time: {start_time}\nEnd Time: {end_time}" + content = f"Start Time: {start_time}\nEnd Time: {end_time}\nTotal Time to Store {checked_num} Objects: {end_time - start_time}" write_text_to_path(properties["store_path"], "client_metadata.txt", content) From ae43b5d75a9ab2c1fa02c77c3bf1176b97945bf3 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 20 Jul 2023 14:09:55 -0700 Subject: [PATCH 060/165] Refactor 'client.py' to test multiprocessing --- src/hashstore/client.py | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index d5f884b9..f8fa31a6 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -6,6 +6,7 @@ import queue import yaml from hashstore import HashStoreFactory +import multiprocessing def add_client_optional_arguments(argp): @@ -131,6 +132,12 @@ async def convert_directory_to_hashstore(obj_directory, config_yaml, num): properties = load_properties(config_yaml) store = get_hashstore(properties) + def store_obj(obj_name): + """Store object to HashStore""" + pid = f"dou.test.{obj_name}" + obj_path = (obj_directory + "/" + obj_name,) + _hash_address = store.store_object(pid, obj_path) + # def process_store_obj_queue_thread(my_queue): # """Store object to HashStore""" # while not my_queue.empty(): @@ -139,15 +146,15 @@ async def convert_directory_to_hashstore(obj_directory, config_yaml, num): # obj_path = queue_item["obj_path"] # _hash_address = store.store_object(pid, obj_path) - async def store_obj(item): - """Store object to HashStore""" - pid = item["pid"] - obj_path = item["obj_path"] + # async def store_obj(item): + # """Store object to HashStore""" + # pid = item["pid"] + # obj_path = item["obj_path"] - async def store_obj_await(pid, path): - store.store_object(pid, path) + # async def store_obj_await(pid, path): + # store.store_object(pid, path) - await store_obj_await(pid, obj_path) + # await store_obj_await(pid, obj_path) # Get list of files from directory obj_list = os.listdir(obj_directory) @@ -173,8 +180,14 @@ async def store_obj_await(pid, path): # Start start_time = datetime.now() - coroutines = [store_obj(item) for item in store_obj_list] - await asyncio.gather(*coroutines) + num_processes = 4 + pool = multiprocessing.Pool(processes=num_processes) + + pool.close() + pool.join() + + # coroutines = [store_obj(item) for item in store_obj_list] + # await asyncio.gather(*coroutines) # # Number of threads # num_threads = 5 From 4fecb2f8375e566ab1c56b36ec61b70f84007f83 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 20 Jul 2023 14:50:41 -0700 Subject: [PATCH 061/165] Clean up code and separate test methods to store directories --- src/hashstore/client.py | 181 +++++++++++++++++++++++++++++----------- 1 file changed, 130 insertions(+), 51 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index f8fa31a6..b3124861 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -2,11 +2,12 @@ import os from argparse import ArgumentParser from datetime import datetime +import threading import asyncio import queue import yaml -from hashstore import HashStoreFactory import multiprocessing +from hashstore import HashStoreFactory def add_client_optional_arguments(argp): @@ -120,7 +121,7 @@ def write_text_to_path(directory, filename, content): file.write(content) -async def convert_directory_to_hashstore(obj_directory, 
config_yaml, num): +async def convert_dir_to_hs_async(obj_directory, config_yaml, num): """Store objects in a given directory into HashStore with a random pid. Args: @@ -132,35 +133,71 @@ async def convert_directory_to_hashstore(obj_directory, config_yaml, num): properties = load_properties(config_yaml) store = get_hashstore(properties) - def store_obj(obj_name): + async def store_obj(item): """Store object to HashStore""" - pid = f"dou.test.{obj_name}" - obj_path = (obj_directory + "/" + obj_name,) - _hash_address = store.store_object(pid, obj_path) + pid = item["pid"] + obj_path = item["obj_path"] + + async def store_obj_await(pid, path): + store.store_object(pid, path) + + await store_obj_await(pid, obj_path) + + # Get list of files from directory + obj_list = os.listdir(obj_directory) + store_obj_list = [] + + # Check number of files to store + if num is None: + checked_num = len(obj_list) + else: + checked_num = int(num) + + # Make a queue of objects to store + for i in range(0, checked_num): + item_dict = { + "pid": f"dou.test.{i}", + "obj_path": obj_directory + "/" + obj_list[i], + } + store_obj_list.append(item_dict) + + start_time = datetime.now() + + coroutines = [store_obj(item) for item in store_obj_list] + await asyncio.gather(*coroutines) + + end_time = datetime.now() + content = ( + f"Start Time: {start_time}\nEnd Time: {end_time}\n" + + f"Total Time to Store {checked_num} Objects: {end_time - start_time}" + ) + write_text_to_path(properties["store_path"], "client_metadata.txt", content) + - # def process_store_obj_queue_thread(my_queue): - # """Store object to HashStore""" - # while not my_queue.empty(): - # queue_item = my_queue.get() - # pid = queue_item["pid"] - # obj_path = queue_item["obj_path"] - # _hash_address = store.store_object(pid, obj_path) +def convert_dir_to_hs_thread(obj_directory, config_yaml, num): + """Store objects in a given directory into HashStore with a random pid. 
- # async def store_obj(item): - # """Store object to HashStore""" - # pid = item["pid"] - # obj_path = item["obj_path"] + Args: + obj_directory (str): Directory to convert + config_yaml (str): Path to HashStore config file `hashstore.yaml` + num (int): Number of files to store + """ - # async def store_obj_await(pid, path): - # store.store_object(pid, path) + properties = load_properties(config_yaml) + store = get_hashstore(properties) - # await store_obj_await(pid, obj_path) + def process_store_obj_queue(my_queue): + """Store object to HashStore""" + while not my_queue.empty(): + queue_item = my_queue.get() + pid = queue_item["pid"] + obj_path = queue_item["obj_path"] + _hash_address = store.store_object(pid, obj_path) # Get list of files from directory obj_list = os.listdir(obj_directory) # Create queue store_obj_queue = queue.Queue(maxsize=len(obj_list)) - store_obj_list = [] # Check number of files to store if num is None: @@ -175,40 +212,77 @@ def store_obj(obj_name): "obj_path": obj_directory + "/" + obj_list[i], } store_obj_queue.put(item_dict) - store_obj_list.append(item_dict) - # Start start_time = datetime.now() - num_processes = 4 - pool = multiprocessing.Pool(processes=num_processes) + # Number of threads + num_threads = 5 + threads = [] + for _ in range(num_threads): + thread = threading.Thread( + target=process_store_obj_queue, args=(store_obj_queue,) + ) + thread.start() + threads.append(thread) - pool.close() - pool.join() + # Wait for all threads to finish + for thread in threads: + thread.join() + + for _ in range(0, checked_num): + process_store_obj_queue(store_obj_queue) - # coroutines = [store_obj(item) for item in store_obj_list] - # await asyncio.gather(*coroutines) + # End + end_time = datetime.now() + content = ( + f"Start Time: {start_time}\nEnd Time: {end_time}\n" + + f"Total Time to Store {checked_num} Objects: {end_time - start_time}" + ) + write_text_to_path(properties["store_path"], "client_metadata.txt", content) - # # Number of threads - # num_threads = 5 - # threads = [] - # for _ in range(num_threads): - # thread = threading.Thread( - # target=process_store_obj_queue, args=(store_obj_queue,) - # ) - # thread.start() - # threads.append(thread) - # # Wait for all threads to finish - # for thread in threads: - # thread.join() +def convert_dir_to_hs_multi(obj_directory, config_yaml, num): + """Store objects in a given directory into HashStore with a random pid. 
- # for _ in range(0, checked_num): - # process_store_obj_queue(store_obj_queue) + Args: + obj_directory (str): Directory to convert + config_yaml (str): Path to HashStore config file `hashstore.yaml` + num (int): Number of files to store + """ + + properties = load_properties(config_yaml) + store = get_hashstore(properties) + + # Get list of files from directory + obj_list = os.listdir(obj_directory) + checked_num = len(obj_list) + checked_obj_list = [] + + # Check number of files to store + if num is not None: + checked_num = int(num) + + for i in range(0, checked_num): + tuple_item = (f"dou.test.{i}", obj_directory + "/" + obj_list[i]) + checked_obj_list.append(tuple_item) + + start_time = datetime.now() + + num_processes = 4 + pool = multiprocessing.Pool(processes=num_processes) + + # Map the square_number function to the list of numbers using the process pool + pool.starmap(store.store_object, checked_obj_list) + + # Close the pool and wait for all processes to complete + pool.close() + pool.join() - # End end_time = datetime.now() - content = f"Start Time: {start_time}\nEnd Time: {end_time}\nTotal Time to Store {checked_num} Objects: {end_time - start_time}" + content = ( + f"Start Time: {start_time}\nEnd Time: {end_time}\n" + + f"Total Time to Store {checked_num} Objects: {end_time - start_time}" + ) write_text_to_path(properties["store_path"], "client_metadata.txt", content) @@ -254,12 +328,17 @@ def store_obj(obj_name): store_path = getattr(args, "store_path") store_path_config_yaml = store_path + "/hashstore.yaml" if os.path.exists(store_path_config_yaml): - asyncio.run( - convert_directory_to_hashstore( - directory_to_convert, - store_path_config_yaml, - number_of_objects_to_convert, - ) + # asyncio.run( + # convert_dir_to_hs_async( + # directory_to_convert, + # store_path_config_yaml, + # number_of_objects_to_convert, + # ) + # ) + convert_dir_to_hs_multi( + directory_to_convert, + store_path_config_yaml, + number_of_objects_to_convert, ) else: # If HashStore does not exist, raise exception From cb0aed4de6727b6b689d5b8cbf0cf4ba7a963b1b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 20 Jul 2023 14:57:53 -0700 Subject: [PATCH 062/165] Clean up comments --- src/hashstore/client.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index b3124861..38feb3fa 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -262,16 +262,18 @@ def convert_dir_to_hs_multi(obj_directory, config_yaml, num): if num is not None: checked_num = int(num) + # Create tuple to pass to store.store_object via starmap for i in range(0, checked_num): tuple_item = (f"dou.test.{i}", obj_directory + "/" + obj_list[i]) checked_obj_list.append(tuple_item) start_time = datetime.now() + # Setup pool and processes num_processes = 4 pool = multiprocessing.Pool(processes=num_processes) - # Map the square_number function to the list of numbers using the process pool + # Call store object pool.starmap(store.store_object, checked_obj_list) # Close the pool and wait for all processes to complete From 78f888e4c60f058a03ccd81004ec510bae7f10e4 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 20 Jul 2023 15:30:12 -0700 Subject: [PATCH 063/165] Remove explicit declaration of 'num_processes' to allow Python to determine number of processes to use --- src/hashstore/client.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 38feb3fa..23c3af70 100644 --- 
a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -270,8 +270,9 @@ def convert_dir_to_hs_multi(obj_directory, config_yaml, num): start_time = datetime.now() # Setup pool and processes - num_processes = 4 - pool = multiprocessing.Pool(processes=num_processes) + # num_processes = os.cpu_count() - 2 + # pool = multiprocessing.Pool(processes=num_processes) + pool = multiprocessing.Pool() # Call store object pool.starmap(store.store_object, checked_obj_list) From 30c8356500ba3041e95604ebdc56695ec2443a05 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 26 Jul 2023 11:24:55 -0700 Subject: [PATCH 064/165] Refactor 'client.py' to retrieve obj list from metacat postgres db via pg8000 library --- poetry.lock | 72 ++++++++++++- pyproject.toml | 1 + src/hashstore/client.py | 220 ++++++++++++++++------------------------ 3 files changed, 159 insertions(+), 134 deletions(-) diff --git a/poetry.lock b/poetry.lock index 338b1304..85abf43e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,5 +1,17 @@ # This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. +[[package]] +name = "asn1crypto" +version = "1.5.1" +description = "Fast ASN.1 parser and serializer with definitions for private keys, public keys, certificates, CRL, OCSP, CMS, PKCS#3, PKCS#7, PKCS#8, PKCS#12, PKCS#5, X.509 and TSP" +category = "dev" +optional = false +python-versions = "*" +files = [ + {file = "asn1crypto-1.5.1-py2.py3-none-any.whl", hash = "sha256:db4e40728b728508912cbb3d44f19ce188f218e9eba635821bb4b68564f8fd67"}, + {file = "asn1crypto-1.5.1.tar.gz", hash = "sha256:13ae38502be632115abf8a24cbe5f4da52e3b5231990aff31123c805306ccb9c"}, +] + [[package]] name = "astroid" version = "2.15.6" @@ -249,6 +261,22 @@ files = [ {file = "pathspec-0.11.1.tar.gz", hash = "sha256:2798de800fa92780e33acca925945e9a19a133b715067cf165b8866c15a31687"}, ] +[[package]] +name = "pg8000" +version = "1.29.8" +description = "PostgreSQL interface library" +category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pg8000-1.29.8-py3-none-any.whl", hash = "sha256:962e9d6687f76057bd6d9c9c0f67f503a503216bf60b3a4d71e4cb8c97f8326d"}, + {file = "pg8000-1.29.8.tar.gz", hash = "sha256:609cfbccea783e15f111cc0cb2f6d4e6b4c349a695c59505a29baba6fc79ffa9"}, +] + +[package.dependencies] +python-dateutil = ">=2.8.2" +scramp = ">=1.4.3" + [[package]] name = "platformdirs" version = "3.8.1" @@ -334,6 +362,21 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +[[package]] +name = "python-dateutil" +version = "2.8.2" +description = "Extensions to the standard Python datetime module" +category = "dev" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +files = [ + {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, + {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, +] + +[package.dependencies] +six = ">=1.5" + [[package]] name = "pyyaml" version = "6.0" @@ -384,6 +427,33 @@ files = [ {file = "PyYAML-6.0.tar.gz", hash = "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2"}, ] +[[package]] +name = "scramp" +version = "1.4.4" +description = "An implementation of the SCRAM protocol." 
+category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "scramp-1.4.4-py3-none-any.whl", hash = "sha256:b142312df7c2977241d951318b7ee923d6b7a4f75ba0f05b621ece1ed616faa3"}, + {file = "scramp-1.4.4.tar.gz", hash = "sha256:b7022a140040f33cf863ab2657917ed05287a807b917950489b89b9f685d59bc"}, +] + +[package.dependencies] +asn1crypto = ">=1.5.1" + +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] + [[package]] name = "tomli" version = "2.0.1" @@ -508,4 +578,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "b04d8166655a79de94436d54e060f0e04c185ac067bc8579619a7f8444e70370" +content-hash = "6eeffad7b4becc9f995e576d3fc5db2a8640bfe60876d254a6b5854ddd0e283a" diff --git a/pyproject.toml b/pyproject.toml index b0df7426..5a41d3f9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ pyyaml = "^6.0" pytest = "^7.2.0" black = "^22.10.0" pylint = "^2.17.4" +pg8000 = "^1.29.8" [build-system] requires = ["poetry-core"] diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 23c3af70..5a949619 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -2,11 +2,9 @@ import os from argparse import ArgumentParser from datetime import datetime -import threading -import asyncio -import queue -import yaml import multiprocessing +import yaml +import pg8000 from hashstore import HashStoreFactory @@ -71,7 +69,7 @@ def get_hashstore(properties): return hashstore -def load_properties(hashstore_yaml): +def load_store_properties(hashstore_yaml): """Get and return the contents of the current HashStore configuration. Returns: @@ -92,7 +90,7 @@ def load_properties(hashstore_yaml): if not os.path.exists(hashstore_yaml): exception_string = ( - "HashStore CLI Client - load_properties: hashstore.yaml not found" + "HashStore CLI Client - load_store_properties: hashstore.yaml not found" + " in store root path." ) raise FileNotFoundError(exception_string) @@ -110,135 +108,101 @@ def load_properties(hashstore_yaml): return hashstore_yaml_dict -def write_text_to_path(directory, filename, content): - """Write a text file to a given directory.""" - # Combine the directory path and filename - file_path = f"{directory}/{filename}" - - # Open the file in write mode ('w') - with open(file_path, "w", encoding="utf-8") as file: - # Write the content to the file - file.write(content) - - -async def convert_dir_to_hs_async(obj_directory, config_yaml, num): - """Store objects in a given directory into HashStore with a random pid. 
+def load_db_properties(pgdb_yaml): + """Get and return the contents of a postgres config file Args: - obj_directory (str): Directory to convert - config_yaml (str): Path to HashStore config file `hashstore.yaml` - num (int): Number of files to store - """ - - properties = load_properties(config_yaml) - store = get_hashstore(properties) - - async def store_obj(item): - """Store object to HashStore""" - pid = item["pid"] - obj_path = item["obj_path"] + pgdb_yaml (string): Path to yaml file - async def store_obj_await(pid, path): - store.store_object(pid, path) + Returns: + hashstore_yaml_dict (dict): postgres db config properties + """ + db_keys = [ + "db_user", + "db_password", + "db_host", + "db_port", + "db_name", + ] - await store_obj_await(pid, obj_path) + if not os.path.exists(pgdb_yaml): + exception_string = ( + "HashStore CLI Client - load_db_properties: pgdb.yaml not found" + + " in store root path." + ) + raise FileNotFoundError(exception_string) + # Open file + with open(pgdb_yaml, "r", encoding="utf-8") as file: + yaml_data = yaml.safe_load(file) - # Get list of files from directory - obj_list = os.listdir(obj_directory) - store_obj_list = [] + # Get hashstore properties + db_yaml_dict = {} + for key in db_keys: + checked_property = yaml_data[key] + db_yaml_dict[key] = checked_property + return db_yaml_dict - # Check number of files to store - if num is None: - checked_num = len(obj_list) - else: - checked_num = int(num) - # Make a queue of objects to store - for i in range(0, checked_num): - item_dict = { - "pid": f"dou.test.{i}", - "obj_path": obj_directory + "/" + obj_list[i], - } - store_obj_list.append(item_dict) +def write_text_to_path(directory, filename, content): + """Write a text file to a given directory.""" + # Combine the directory path and filename + file_path = f"{directory}/{filename}.txt" - start_time = datetime.now() + # Open the file in write mode ('w') + with open(file_path, "w", encoding="utf-8") as file: + # Write the content to the file + file.write(content) - coroutines = [store_obj(item) for item in store_obj_list] - await asyncio.gather(*coroutines) - end_time = datetime.now() - content = ( - f"Start Time: {start_time}\nEnd Time: {end_time}\n" - + f"Total Time to Store {checked_num} Objects: {end_time - start_time}" +def get_objs_from_metacat_db(properties, obj_directory, num): + """Get the list of objects from knbvm's metacat db to store into HashStore""" + # Get db config from locally created file in store path (`pgdb.yaml`) + pgyaml_path = properties["store_path"] + "/pgdb.yaml" + print(f"Retrieving db config from: {pgyaml_path}") + + db_properties = load_db_properties(pgyaml_path) + db_user = db_properties["db_user"] + db_password = db_properties["db_password"] + db_host = db_properties["db_host"] + db_port = db_properties["db_port"] + db_name = db_properties["db_name"] + + # Create a connection to the database + conn = pg8000.connect( + user=db_user, + password=db_password, + host=db_host, + port=int(db_port), + database=db_name, ) - write_text_to_path(properties["store_path"], "client_metadata.txt", content) - - -def convert_dir_to_hs_thread(obj_directory, config_yaml, num): - """Store objects in a given directory into HashStore with a random pid. 
- - Args: - obj_directory (str): Directory to convert - config_yaml (str): Path to HashStore config file `hashstore.yaml` - num (int): Number of files to store - """ - - properties = load_properties(config_yaml) - store = get_hashstore(properties) - - def process_store_obj_queue(my_queue): - """Store object to HashStore""" - while not my_queue.empty(): - queue_item = my_queue.get() - pid = queue_item["pid"] - obj_path = queue_item["obj_path"] - _hash_address = store.store_object(pid, obj_path) - - # Get list of files from directory - obj_list = os.listdir(obj_directory) - # Create queue - store_obj_queue = queue.Queue(maxsize=len(obj_list)) - # Check number of files to store - if num is None: - checked_num = len(obj_list) - else: - checked_num = int(num) + # Create a cursor to execute queries + cursor = conn.cursor() - # Make a queue of objects to store - for i in range(0, checked_num): - item_dict = { - "pid": f"dou.test.{i}", - "obj_path": obj_directory + "/" + obj_list[i], - } - store_obj_queue.put(item_dict) + # Query to get rows from `identifier` table + query = f"SELECT * FROM identifier LIMIT {num};" + cursor.execute(query) - start_time = datetime.now() + # Fetch all rows from the result set + rows = cursor.fetchall() - # Number of threads - num_threads = 5 - threads = [] - for _ in range(num_threads): - thread = threading.Thread( - target=process_store_obj_queue, args=(store_obj_queue,) - ) - thread.start() - threads.append(thread) + # Create object list to store into HashStore + print("Creating list of objects to store into HashStore") + checked_obj_list = [] + for row in rows: + # Get pid and filename + pid_guid = row[0] + filepath_docid_rev = obj_directory + "/" + row[1] + "." + row[2] + tuple_item = (pid_guid, filepath_docid_rev) + # Only add to the list if it is an object, not metadata document + if os.path.exists(filepath_docid_rev): + checked_obj_list.append(tuple_item) - # Wait for all threads to finish - for thread in threads: - thread.join() + # Close the cursor and connection when done + cursor.close() + conn.close() - for _ in range(0, checked_num): - process_store_obj_queue(store_obj_queue) - - # End - end_time = datetime.now() - content = ( - f"Start Time: {start_time}\nEnd Time: {end_time}\n" - + f"Total Time to Store {checked_num} Objects: {end_time - start_time}" - ) - write_text_to_path(properties["store_path"], "client_metadata.txt", content) + return checked_obj_list def convert_dir_to_hs_multi(obj_directory, config_yaml, num): @@ -249,23 +213,19 @@ def convert_dir_to_hs_multi(obj_directory, config_yaml, num): config_yaml (str): Path to HashStore config file `hashstore.yaml` num (int): Number of files to store """ - - properties = load_properties(config_yaml) + properties = load_store_properties(config_yaml) store = get_hashstore(properties) # Get list of files from directory obj_list = os.listdir(obj_directory) checked_num = len(obj_list) - checked_obj_list = [] # Check number of files to store if num is not None: checked_num = int(num) - # Create tuple to pass to store.store_object via starmap - for i in range(0, checked_num): - tuple_item = (f"dou.test.{i}", obj_directory + "/" + obj_list[i]) - checked_obj_list.append(tuple_item) + # Get list of objects to store + checked_obj_list = get_objs_from_metacat_db(properties, obj_directory, checked_num) start_time = datetime.now() @@ -275,6 +235,7 @@ def convert_dir_to_hs_multi(obj_directory, config_yaml, num): pool = multiprocessing.Pool() # Call store object + print("Storing objects") 
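+    # Each entry in checked_obj_list is a (pid_guid, filepath) tuple, so
+    # starmap unpacks it into a store.store_object(pid_guid, filepath) call
+    # and spreads the calls across the pool's worker processes.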
pool.starmap(store.store_object, checked_obj_list) # Close the pool and wait for all processes to complete @@ -286,7 +247,7 @@ def convert_dir_to_hs_multi(obj_directory, config_yaml, num): f"Start Time: {start_time}\nEnd Time: {end_time}\n" + f"Total Time to Store {checked_num} Objects: {end_time - start_time}" ) - write_text_to_path(properties["store_path"], "client_metadata.txt", content) + write_text_to_path(properties["store_path"], "client_metadata", content) if __name__ == "__main__": @@ -331,13 +292,6 @@ def convert_dir_to_hs_multi(obj_directory, config_yaml, num): store_path = getattr(args, "store_path") store_path_config_yaml = store_path + "/hashstore.yaml" if os.path.exists(store_path_config_yaml): - # asyncio.run( - # convert_dir_to_hs_async( - # directory_to_convert, - # store_path_config_yaml, - # number_of_objects_to_convert, - # ) - # ) convert_dir_to_hs_multi( directory_to_convert, store_path_config_yaml, From 0c8d31be646ddb8eea962d40627e9dbff2f72859 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 26 Jul 2023 11:48:11 -0700 Subject: [PATCH 065/165] Fix bug when forming file path to store (concatenating integers) --- src/hashstore/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 5a949619..385bb33a 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -192,7 +192,7 @@ def get_objs_from_metacat_db(properties, obj_directory, num): for row in rows: # Get pid and filename pid_guid = row[0] - filepath_docid_rev = obj_directory + "/" + row[1] + "." + row[2] + filepath_docid_rev = obj_directory + "/" + row[1] + "." + str(row[2]) tuple_item = (pid_guid, filepath_docid_rev) # Only add to the list if it is an object, not metadata document if os.path.exists(filepath_docid_rev): From 16b81d6e5a6c4e1a962314002b103b5e443a64e5 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 26 Jul 2023 13:11:17 -0700 Subject: [PATCH 066/165] Update content to write to summary txt file 'client_metadata' --- src/hashstore/client.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 385bb33a..870efd6f 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -246,6 +246,7 @@ def convert_dir_to_hs_multi(obj_directory, config_yaml, num): content = ( f"Start Time: {start_time}\nEnd Time: {end_time}\n" + f"Total Time to Store {checked_num} Objects: {end_time - start_time}" + + f"Expected number of data objects stored: {len(checked_obj_list)}" ) write_text_to_path(properties["store_path"], "client_metadata", content) From f921d217c0aea934568561f26bd7fbdec57df2bf Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 26 Jul 2023 15:55:05 -0700 Subject: [PATCH 067/165] Refactor 'client.py' and add new functionality to store metadata with optional argument '-cvt' --- src/hashstore/client.py | 101 ++++++++++++++++++++++++++++++++++------ 1 file changed, 88 insertions(+), 13 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 870efd6f..6d76ecd7 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -42,6 +42,11 @@ def add_client_optional_arguments(argp): dest="convert_directory", help="Directory of objects to convert to a HashStore", ) + argp.add_argument( + "-cvt", + dest="convert_directory_type", + help="Type of directory to convert (ex. 
'object' or 'metadata')",
+    )
     argp.add_argument(
         "-nobj",
         dest="num_obj_to_convert",
@@ -135,7 +140,7 @@ def load_db_properties(pgdb_yaml):
     with open(pgdb_yaml, "r", encoding="utf-8") as file:
         yaml_data = yaml.safe_load(file)
 
-    # Get hashstore properties
+    # Get database values
     db_yaml_dict = {}
     for key in db_keys:
         checked_property = yaml_data[key]
@@ -205,11 +210,66 @@ def get_objs_from_metacat_db(properties, obj_directory, num):
     return checked_obj_list
 
 
+def get_metadata_from_metacat_db(properties, metadata_directory, num):
+    """Get the list of objects from knbvm's metacat db to store into HashStore"""
+    # Get db config from locally created file in store path (`pgdb.yaml`)
+    pgyaml_path = properties["store_path"] + "/pgdb.yaml"
+    print(f"Retrieving db config from: {pgyaml_path}")
+
+    db_properties = load_db_properties(pgyaml_path)
+    db_user = db_properties["db_user"]
+    db_password = db_properties["db_password"]
+    db_host = db_properties["db_host"]
+    db_port = db_properties["db_port"]
+    db_name = db_properties["db_name"]
+
+    # Create a connection to the database
+    conn = pg8000.connect(
+        user=db_user,
+        password=db_password,
+        host=db_host,
+        port=int(db_port),
+        database=db_name,
+    )
+
+    # Create a cursor to execute queries
+    cursor = conn.cursor()
+
+    # Query to refine matching rows between `identifier` and `systemmetadata`` table
+    query = f"""SELECT identifier.guid, identifier.docid, identifier.rev,
+        systemmetadata.object_format FROM identifier INNER JOIN systemmetadata
+        ON identifier.guid = systemmetadata.guid LIMIT {num};"""
+    cursor.execute(query)
+
+    # Fetch all rows from the result set
+    rows = cursor.fetchall()
+
+    # Create metadata list to store into HashStore
+    print("Creating list of metadata to store into HashStore")
+    checked_metadata_list = []
+    for row in rows:
+        # Get pid, filepath and formatId
+        pid_guid = row[0]
+        metadatapath_docid_rev = metadata_directory + "/" + row[1] + "."
+ str(row[2]) + metadata_namespace = row[3] + tuple_item = (pid_guid, metadatapath_docid_rev, metadata_namespace) + # Only add to the list if it is an object, not metadata document + if os.path.exists(metadatapath_docid_rev): + checked_metadata_list.append(tuple_item) + + # Close the cursor and connection when done + cursor.close() + conn.close() + + return checked_metadata_list + + +def store_to_hashstore(origin_dir, obj_type, config_yaml, num): + """Store objects in a given directory into HashStore Args: - obj_directory (str): Directory to convert + origin_dir (str): Directory to convert + obj_type (str): 'object' or 'metadata' config_yaml (str): Path to HashStore config file `hashstore.yaml` num (int): Number of files to store """ @@ -217,15 +277,22 @@ def convert_dir_to_hs_multi(obj_directory, config_yaml, num): store = get_hashstore(properties) # Get list of files from directory - obj_list = os.listdir(obj_directory) - checked_num = len(obj_list) + file_list = os.listdir(origin_dir) + checked_num_of_files = len(file_list) # Check number of files to store if num is not None: - checked_num = int(num) + checked_num_of_files = int(num) # Get list of objects to store - checked_obj_list = get_objs_from_metacat_db(properties, obj_directory, checked_num) + if obj_type == "object": + checked_obj_list = get_objs_from_metacat_db( + properties, origin_dir, checked_num_of_files + ) + if obj_type == "metadata": + checked_obj_list = get_metadata_from_metacat_db( + properties, origin_dir, checked_num_of_files + ) start_time = datetime.now() @@ -235,8 +302,12 @@ def convert_dir_to_hs_multi(obj_directory, config_yaml, num): pool = multiprocessing.Pool() # Call store object - print("Storing objects") - pool.starmap(store.store_object, checked_obj_list) + if obj_type == "object": + print("Storing objects") + pool.starmap(store.store_object, checked_obj_list) + if obj_type == "metadata": + print("Storing metadata") + pool.starmap(store.store_metadata, checked_obj_list) # Close the pool and wait for all processes to complete pool.close() @@ -245,8 +316,8 @@ def convert_dir_to_hs_multi(obj_directory, config_yaml, num): end_time = datetime.now() content = ( f"Start Time: {start_time}\nEnd Time: {end_time}\n" - + f"Total Time to Store {checked_num} Objects: {end_time - start_time}" - + f"Expected number of data objects stored: {len(checked_obj_list)}" + + f"Total Time to Store {checked_num_of_files} Objects: {end_time - start_time}" + + f"Expected number of data {obj_type} stored: {len(checked_obj_list)}" ) write_text_to_path(properties["store_path"], "client_metadata", content) @@ -292,9 +363,13 @@ def convert_dir_to_hs_multi(obj_directory, config_yaml, num): number_of_objects_to_convert = getattr(args, "num_obj_to_convert") store_path = getattr(args, "store_path") store_path_config_yaml = store_path + "/hashstore.yaml" + directory_type = getattr(args, "convert_directory_type") + if directory_type != "object" or directory_type != "metadata": + raise ValueError("Directory type must be 'object' or 'metadata'") if os.path.exists(store_path_config_yaml): - convert_dir_to_hs_multi( + store_to_hashstore( directory_to_convert, + directory_type, store_path_config_yaml, number_of_objects_to_convert, ) From 0013e628380c8d750752918566759f87dc8f13af Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 26 Jul 2023 16:07:12 -0700 Subject: [PATCH 068/165] Clean up comments and code --- src/hashstore/client.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/hashstore/client.py 
b/src/hashstore/client.py index 6d76ecd7..39390524 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -161,7 +161,7 @@ def write_text_to_path(directory, filename, content): def get_objs_from_metacat_db(properties, obj_directory, num): """Get the list of objects from knbvm's metacat db to store into HashStore""" - # Get db config from locally created file in store path (`pgdb.yaml`) + # Note: Manually create `pgdb.yaml` for security purposes pgyaml_path = properties["store_path"] + "/pgdb.yaml" print(f"Retrieving db config from: {pgyaml_path}") @@ -211,8 +211,8 @@ def get_objs_from_metacat_db(properties, obj_directory, num): def get_metadata_from_metacat_db(properties, metadata_directory, num): - """Get the list of objects from knbvm's metacat db to store into HashStore""" - # Get db config from locally created file in store path (`pgdb.yaml`) + """Get the list of metadata objs from knbvm's metacat db to store into HashStore""" + # Note: Manually create `pgdb.yaml` for security purposes pgyaml_path = properties["store_path"] + "/pgdb.yaml" print(f"Retrieving db config from: {pgyaml_path}") @@ -235,7 +235,7 @@ def get_metadata_from_metacat_db(properties, metadata_directory, num): # Create a cursor to execute queries cursor = conn.cursor() - # Query to refine matching rows between `identifier` and `systemmetadata`` table + # Query to refine rows between `identifier` and `systemmetadata`` table query = f"""SELECT identifier.guid, identifier.docid, identifier.rev, systemmetadata.object_format FROM identifier INNER JOIN systemmetadata ON identifier.guid = systemmetadata.guid LIMIT {num};""" @@ -279,12 +279,11 @@ def store_to_hashstore(origin_dir, obj_type, config_yaml, num): # Get list of files from directory file_list = os.listdir(origin_dir) checked_num_of_files = len(file_list) - # Check number of files to store if num is not None: checked_num_of_files = int(num) - # Get list of objects to store + # Get list of objects to store from metacat db if obj_type == "object": checked_obj_list = get_objs_from_metacat_db( properties, origin_dir, checked_num_of_files @@ -301,7 +300,7 @@ def store_to_hashstore(origin_dir, obj_type, config_yaml, num): # pool = multiprocessing.Pool(processes=num_processes) pool = multiprocessing.Pool() - # Call store object + # Call 'obj_type' respective public API methods if obj_type == "object": print("Storing objects") pool.starmap(store.store_object, checked_obj_list) @@ -356,7 +355,7 @@ def store_to_hashstore(origin_dir, obj_type, config_yaml, num): } get_hashstore(props) - # Convert a directory into HashStore if config file and directory exist + # Convert a directory to a HashStore if config file present elif getattr(args, "convert_directory") is not None: directory_to_convert = getattr(args, "convert_directory") if os.path.exists(directory_to_convert): @@ -365,7 +364,9 @@ def store_to_hashstore(origin_dir, obj_type, config_yaml, num): store_path_config_yaml = store_path + "/hashstore.yaml" directory_type = getattr(args, "convert_directory_type") if directory_type != "object" or directory_type != "metadata": - raise ValueError("Directory type must be 'object' or 'metadata'") + raise ValueError( + "Directory `-cvt` cannot be empty, must be 'object' or 'metadata'" + ) if os.path.exists(store_path_config_yaml): store_to_hashstore( directory_to_convert, From 5181441ad2f479160fec741a3ef3ac02f9c6f4f2 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 28 Jul 2023 09:21:40 -0700 Subject: [PATCH 069/165] Revise 'client.py' to store remaining objects due to 
permissions error --- src/hashstore/client.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 39390524..97e21b0d 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -2,6 +2,7 @@ import os from argparse import ArgumentParser from datetime import datetime +import hashlib import multiprocessing import yaml import pg8000 @@ -159,6 +160,19 @@ def write_text_to_path(directory, filename, content): file.write(content) +def get_sha256_hex_digest(string): + """Calculate the SHA-256 digest of a UTF-8 encoded string. + + Args: + string (string): String to convert. + + Returns: + hex (string): Hexadecimal string. + """ + hex_digest = hashlib.sha256(string.encode("utf-8")).hexdigest() + return hex_digest + + def get_objs_from_metacat_db(properties, obj_directory, num): """Get the list of objects from knbvm's metacat db to store into HashStore""" # Note: Manually create `pgdb.yaml` for security purposes @@ -201,7 +215,11 @@ def get_objs_from_metacat_db(properties, obj_directory, num): tuple_item = (pid_guid, filepath_docid_rev) # Only add to the list if it is an object, not metadata document if os.path.exists(filepath_docid_rev): - checked_obj_list.append(tuple_item) + # If the file has already been stored, skip it + if os.path.exists(get_sha256_hex_digest(pid_guid)): + pass + else: + checked_obj_list.append(tuple_item) # Close the cursor and connection when done cursor.close() @@ -255,6 +273,8 @@ def get_metadata_from_metacat_db(properties, metadata_directory, num): tuple_item = (pid_guid, metadatapath_docid_rev, metadata_namespace) # Only add to the list if it is an object, not metadata document if os.path.exists(metadatapath_docid_rev): + # If the file already exists, don't attempt to add it + checked_metadata_list.append(tuple_item) # Close the cursor and connection when done From 143a6a45528f7c57d130cb12ee2e2c8cb7638b0d Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 28 Jul 2023 09:27:46 -0700 Subject: [PATCH 070/165] Improve debug messaging in main method --- src/hashstore/client.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 97e21b0d..09fad97a 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -385,7 +385,8 @@ def store_to_hashstore(origin_dir, obj_type, config_yaml, num): directory_type = getattr(args, "convert_directory_type") if directory_type != "object" or directory_type != "metadata": raise ValueError( - "Directory `-cvt` cannot be empty, must be 'object' or 'metadata'" + "Directory `-cvt` cannot be empty, must be 'object' or 'metadata'." 
+ + f"convert_directory_type: {directory_type}" ) if os.path.exists(store_path_config_yaml): store_to_hashstore( From c472921af8eb0f7956f2afda06345b32e6846feb Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 28 Jul 2023 09:30:15 -0700 Subject: [PATCH 071/165] Fix if statement when checking directory type to convert --- src/hashstore/client.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 09fad97a..b938293b 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -383,10 +383,11 @@ def store_to_hashstore(origin_dir, obj_type, config_yaml, num): store_path = getattr(args, "store_path") store_path_config_yaml = store_path + "/hashstore.yaml" directory_type = getattr(args, "convert_directory_type") - if directory_type != "object" or directory_type != "metadata": + accepted_directory_types = ["object", "metadata"] + if directory_type not in accepted_directory_types: raise ValueError( "Directory `-cvt` cannot be empty, must be 'object' or 'metadata'." - + f"convert_directory_type: {directory_type}" + + f" convert_directory_type: {directory_type}" ) if os.path.exists(store_path_config_yaml): store_to_hashstore( From 86525c838e34e847e36d3422f19f9facffc1c33d Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 28 Jul 2023 09:48:42 -0700 Subject: [PATCH 072/165] Improve debug messaging in 'client.py' and refactor to use HashStore to check for object existence --- src/hashstore/client.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index b938293b..c6c3e523 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -173,7 +173,7 @@ def get_sha256_hex_digest(string): return hex_digest -def get_objs_from_metacat_db(properties, obj_directory, num): +def get_objs_from_metacat_db(properties, obj_directory, num, store): """Get the list of objects from knbvm's metacat db to store into HashStore""" # Note: Manually create `pgdb.yaml` for security purposes pgyaml_path = properties["store_path"] + "/pgdb.yaml" @@ -216,8 +216,10 @@ def get_objs_from_metacat_db(properties, obj_directory, num): # Only add to the list if it is an object, not metadata document if os.path.exists(filepath_docid_rev): # If the file has already been stored, skip it - if os.path.exists(get_sha256_hex_digest(pid_guid)): - pass + if os.path.exists( + store.get_real_path(store.get_sha256_hex_digest(pid_guid)) + ): + print(f"Object exists in HashStore for guid: {pid_guid}") else: checked_obj_list.append(tuple_item) @@ -306,7 +308,7 @@ def store_to_hashstore(origin_dir, obj_type, config_yaml, num): # Get list of objects to store from metacat db if obj_type == "object": checked_obj_list = get_objs_from_metacat_db( - properties, origin_dir, checked_num_of_files + properties, origin_dir, checked_num_of_files, store ) if obj_type == "metadata": checked_obj_list = get_metadata_from_metacat_db( From 74363a816fb58b237f6fddc03fb97b8354a23eae Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 28 Jul 2023 09:58:54 -0700 Subject: [PATCH 073/165] Revise if statement when checking for whether a file exists to use HashStore 'exists()' --- src/hashstore/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index c6c3e523..5f0c9ecb 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -217,7 +217,7 @@ def get_objs_from_metacat_db(properties, obj_directory, num, store): if 
os.path.exists(filepath_docid_rev): # If the file has already been stored, skip it if os.path.exists( - store.get_real_path(store.get_sha256_hex_digest(pid_guid)) + store.get_real_path("objects", store.get_sha256_hex_digest(pid_guid)) ): print(f"Object exists in HashStore for guid: {pid_guid}") else: From 8bc7b9946d7249ff33ce05adb84b9f7b4fdc9d61 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 28 Jul 2023 10:03:44 -0700 Subject: [PATCH 074/165] Add actual fix, did not stage changes --- douhs/hashstore.yaml | 38 ++++++++++++++++++++++++++++++++++++++ src/hashstore/client.py | 4 +--- 2 files changed, 39 insertions(+), 3 deletions(-) create mode 100644 douhs/hashstore.yaml diff --git a/douhs/hashstore.yaml b/douhs/hashstore.yaml new file mode 100644 index 00000000..c5b0a4f8 --- /dev/null +++ b/douhs/hashstore.yaml @@ -0,0 +1,38 @@ + + # Default configuration variables for HashStore + + ############### Store Path ############### + # Default path for `FileHashStore` if no path is provided + store_path: "/Users/doumok/Code/hashstore/douhs" + + ############### Directory Structure ############### + # Desired amount of directories when sharding an object to form the permanent address + store_depth: 3 # WARNING: DO NOT CHANGE UNLESS SETTING UP NEW HASHSTORE + # Width of directories created when sharding an object to form the permanent address + store_width: 2 # WARNING: DO NOT CHANGE UNLESS SETTING UP NEW HASHSTORE + # Example: + # Below, objects are shown listed in directories that are 3 levels deep (DIR_DEPTH=3), + # with each directory consisting of 2 characters (DIR_WIDTH=2). + # /var/filehashstore/objects + # ├── 7f + # │ └── 5c + # │ └── c1 + # │ └── 8f0b04e812a3b4c8f686ce34e6fec558804bf61e54b176742a7f6368d6 + + ############### Format of the Metadata ############### + # The default metadata format + store_metadata_namespace: "http://www.ns.test/v1" + + ############### Hash Algorithms ############### + # Hash algorithm to use when calculating object's hex digest for the permanent address + store_algorithm: "SHA-256" + # Algorithm values supported by python hashlib 3.9.0+ for File Hash Store (FHS) + # The default algorithm list includes the hash algorithms calculated when storing an + # object to disk and returned to the caller after successful storage. 
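+  # (md5, sha1, sha256, sha384 and sha512 are all members of
+  # hashlib.algorithms_guaranteed, so this default list is available on
+  # any Python 3.9+ build.)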
+ store_default_algo_list: + - "MD5" + - "SHA-1" + - "SHA-256" + - "SHA-384" + - "SHA-512" + \ No newline at end of file diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 5f0c9ecb..854d9ada 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -216,9 +216,7 @@ def get_objs_from_metacat_db(properties, obj_directory, num, store): # Only add to the list if it is an object, not metadata document if os.path.exists(filepath_docid_rev): # If the file has already been stored, skip it - if os.path.exists( - store.get_real_path("objects", store.get_sha256_hex_digest(pid_guid)) - ): + if store.exists("objects", store.get_sha256_hex_digest(pid_guid)): print(f"Object exists in HashStore for guid: {pid_guid}") else: checked_obj_list.append(tuple_item) From 63e32d6db3d39b6db1c4120e4c29beea4a333ce0 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 28 Jul 2023 10:04:28 -0700 Subject: [PATCH 075/165] Remove testing directory --- douhs/hashstore.yaml | 38 -------------------------------------- 1 file changed, 38 deletions(-) delete mode 100644 douhs/hashstore.yaml diff --git a/douhs/hashstore.yaml b/douhs/hashstore.yaml deleted file mode 100644 index c5b0a4f8..00000000 --- a/douhs/hashstore.yaml +++ /dev/null @@ -1,38 +0,0 @@ - - # Default configuration variables for HashStore - - ############### Store Path ############### - # Default path for `FileHashStore` if no path is provided - store_path: "/Users/doumok/Code/hashstore/douhs" - - ############### Directory Structure ############### - # Desired amount of directories when sharding an object to form the permanent address - store_depth: 3 # WARNING: DO NOT CHANGE UNLESS SETTING UP NEW HASHSTORE - # Width of directories created when sharding an object to form the permanent address - store_width: 2 # WARNING: DO NOT CHANGE UNLESS SETTING UP NEW HASHSTORE - # Example: - # Below, objects are shown listed in directories that are 3 levels deep (DIR_DEPTH=3), - # with each directory consisting of 2 characters (DIR_WIDTH=2). - # /var/filehashstore/objects - # ├── 7f - # │ └── 5c - # │ └── c1 - # │ └── 8f0b04e812a3b4c8f686ce34e6fec558804bf61e54b176742a7f6368d6 - - ############### Format of the Metadata ############### - # The default metadata format - store_metadata_namespace: "http://www.ns.test/v1" - - ############### Hash Algorithms ############### - # Hash algorithm to use when calculating object's hex digest for the permanent address - store_algorithm: "SHA-256" - # Algorithm values supported by python hashlib 3.9.0+ for File Hash Store (FHS) - # The default algorithm list includes the hash algorithms calculated when storing an - # object to disk and returned to the caller after successful storage. 
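-  # (md5, sha1, sha256, sha384 and sha512 are all members of
-  # hashlib.algorithms_guaranteed, so this default list is available on
-  # any Python 3.9+ build.)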
- store_default_algo_list: - - "MD5" - - "SHA-1" - - "SHA-256" - - "SHA-384" - - "SHA-512" - \ No newline at end of file From 488e5803054b309168df253b15006861ba824e9d Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 28 Jul 2023 15:37:36 -0700 Subject: [PATCH 076/165] Revise print statements to help with testing --- src/hashstore/client.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 854d9ada..d5868fc5 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -1,4 +1,5 @@ """HashStore Command Line App""" +import sys import os from argparse import ArgumentParser from datetime import datetime @@ -274,7 +275,7 @@ def get_metadata_from_metacat_db(properties, metadata_directory, num): # Only add to the list if it is an object, not metadata document if os.path.exists(metadatapath_docid_rev): # If the file already exists, don't attempt to add it - + print(f"Metadata doc found: {metadatapath_docid_rev} for pid: {pid_guid}") checked_metadata_list.append(tuple_item) # Close the cursor and connection when done @@ -335,8 +336,8 @@ def store_to_hashstore(origin_dir, obj_type, config_yaml, num): end_time = datetime.now() content = ( f"Start Time: {start_time}\nEnd Time: {end_time}\n" - + f"Total Time to Store {checked_num_of_files} Objects: {end_time - start_time}" - + f"Expected number of data {obj_type} stored: {len(checked_obj_list)}" + + f"Total Time to Store {len(checked_obj_list)} {obj_type}" + + f" Objects: {end_time - start_time}\n" ) write_text_to_path(properties["store_path"], "client_metadata", content) From 39c84fde4f30f6d7a2674b21e01413fa3b79358c Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 28 Jul 2023 15:49:43 -0700 Subject: [PATCH 077/165] Remove 'LIMIT' statement from sql to store metadata --- src/hashstore/client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index d5868fc5..bf2875ce 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -255,9 +255,9 @@ def get_metadata_from_metacat_db(properties, metadata_directory, num): cursor = conn.cursor() # Query to refine rows between `identifier` and `systemmetadata`` table - query = f"""SELECT identifier.guid, identifier.docid, identifier.rev, + query = """SELECT identifier.guid, identifier.docid, identifier.rev, systemmetadata.object_format FROM identifier INNER JOIN systemmetadata - ON identifier.guid = systemmetadata.guid LIMIT {num};""" + ON identifier.guid = systemmetadata.guid;""" cursor.execute(query) # Fetch all rows from the result set From d6137a4656306674ab249065d765278efc730d64 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 31 Jul 2023 12:14:23 -0700 Subject: [PATCH 078/165] Add logging to improve debugging and testing process --- src/hashstore/client.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index bf2875ce..f397dd56 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -1,5 +1,6 @@ """HashStore Command Line App""" import sys +import logging import os from argparse import ArgumentParser from datetime import datetime @@ -296,6 +297,12 @@ def store_to_hashstore(origin_dir, obj_type, config_yaml, num): """ properties = load_store_properties(config_yaml) store = get_hashstore(properties) + logging.basicConfig( + filename=properties["store_path"] + "/python/python_store.log", + level=logging.INFO, + format="%(asctime)s - 
%(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) # Get list of files from directory file_list = os.listdir(origin_dir) @@ -324,10 +331,24 @@ def store_to_hashstore(origin_dir, obj_type, config_yaml, num): # Call 'obj_type' respective public API methods if obj_type == "object": print("Storing objects") - pool.starmap(store.store_object, checked_obj_list) + results = pool.starmap(store.store_object, checked_obj_list) if obj_type == "metadata": print("Storing metadata") - pool.starmap(store.store_metadata, checked_obj_list) + results = pool.starmap(store.store_metadata, checked_obj_list) + + # Log exceptions + print("Checking results and logging exceptions") + for index, result in enumerate(results): + exception_type = type(result).__name__ + exception_message = result.args[0] + err_msg = f"Exception ({exception_type}): {exception_message}" + if isinstance(result, Exception): + logging.info(err_msg) + write_text_to_path( + properties["store_path"] + f"/python/errors/{obj_type}", + f"exception_{index}_{exception_type}", + err_msg, + ) # Close the pool and wait for all processes to complete pool.close() @@ -339,6 +360,7 @@ def store_to_hashstore(origin_dir, obj_type, config_yaml, num): + f"Total Time to Store {len(checked_obj_list)} {obj_type}" + f" Objects: {end_time - start_time}\n" ) + logging.info(content) write_text_to_path(properties["store_path"], "client_metadata", content) From 98c29d6c345a9527a459d06dea3ec5212d301738 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 31 Jul 2023 14:52:17 -0700 Subject: [PATCH 079/165] Refactor clean-up process and checking results of pool.starmap() --- src/hashstore/client.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index f397dd56..ccf810cb 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -338,17 +338,11 @@ def store_to_hashstore(origin_dir, obj_type, config_yaml, num): # Log exceptions print("Checking results and logging exceptions") - for index, result in enumerate(results): - exception_type = type(result).__name__ - exception_message = result.args[0] - err_msg = f"Exception ({exception_type}): {exception_message}" + for result in results: + result_type = type(result).__name__ if isinstance(result, Exception): - logging.info(err_msg) - write_text_to_path( - properties["store_path"] + f"/python/errors/{obj_type}", - f"exception_{index}_{exception_type}", - err_msg, - ) + print(f"{result_type}: {result}") + logging.info(result) # Close the pool and wait for all processes to complete pool.close() From 5045b513ab0d12dfb50f7919c366c2a215d2ff2e Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 31 Jul 2023 15:02:31 -0700 Subject: [PATCH 080/165] Add additional print statement to assist with terminal debugging --- src/hashstore/client.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index ccf810cb..49227197 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -339,6 +339,7 @@ def store_to_hashstore(origin_dir, obj_type, config_yaml, num): # Log exceptions print("Checking results and logging exceptions") for result in results: + print(result) result_type = type(result).__name__ if isinstance(result, Exception): print(f"{result_type}: {result}") From 66bd2158797891eb5c29e65d44bba1e76de7e858 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 31 Jul 2023 15:06:14 -0700 Subject: [PATCH 081/165] Change print statement to logging statement --- src/hashstore/client.py | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 49227197..d54fa5a1 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -339,7 +339,7 @@ def store_to_hashstore(origin_dir, obj_type, config_yaml, num): # Log exceptions print("Checking results and logging exceptions") for result in results: - print(result) + logging.info(result) result_type = type(result).__name__ if isinstance(result, Exception): print(f"{result_type}: {result}") From d6ee3d40da0e52e8791d4d5556ff81fb01728b8d Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 31 Jul 2023 15:46:40 -0700 Subject: [PATCH 082/165] Refactor to log debug messages where relevant --- src/hashstore/client.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index d54fa5a1..a713ca5b 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -66,6 +66,7 @@ def get_hashstore(properties): Returns: hashstore (FileHashStore): HashStore """ + logging.info("Initializing HashStore") factory = HashStoreFactory() # Get HashStore from factory @@ -297,12 +298,6 @@ def store_to_hashstore(origin_dir, obj_type, config_yaml, num): """ properties = load_store_properties(config_yaml) store = get_hashstore(properties) - logging.basicConfig( - filename=properties["store_path"] + "/python/python_store.log", - level=logging.INFO, - format="%(asctime)s - %(levelname)s - %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - ) # Get list of files from directory file_list = os.listdir(origin_dir) @@ -330,19 +325,19 @@ def store_to_hashstore(origin_dir, obj_type, config_yaml, num): # Call 'obj_type' respective public API methods if obj_type == "object": - print("Storing objects") + logging.info("Storing objects") results = pool.starmap(store.store_object, checked_obj_list) if obj_type == "metadata": - print("Storing metadata") + logging.info("Storing metadata") results = pool.starmap(store.store_metadata, checked_obj_list) # Log exceptions - print("Checking results and logging exceptions") + cleanup_msg = "Checking results and logging exceptions" + print(cleanup_msg) + logging.info(cleanup_msg) for result in results: - logging.info(result) - result_type = type(result).__name__ if isinstance(result, Exception): - print(f"{result_type}: {result}") + print(result) logging.info(result) # Close the pool and wait for all processes to complete @@ -356,7 +351,6 @@ def store_to_hashstore(origin_dir, obj_type, config_yaml, num): + f" Objects: {end_time - start_time}\n" ) logging.info(content) - write_text_to_path(properties["store_path"], "client_metadata", content) if __name__ == "__main__": @@ -372,7 +366,6 @@ def store_to_hashstore(origin_dir, obj_type, config_yaml, num): description=DESCRIPTION, epilog=EPILOG, ) - ### Add Positional and Optional Arguments parser.add_argument("store_path", help="Path of the HashStore") add_client_optional_arguments(parser) @@ -380,6 +373,15 @@ def store_to_hashstore(origin_dir, obj_type, config_yaml, num): # Client entry point args = parser.parse_args() + ### Initialize Logging + python_log_file_path = getattr(args, "store_path") + "/python_store.log" + logging.basicConfig( + filename=python_log_file_path, + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + # Create HashStore if -chs flag is true if getattr(args, "create_hashstore"): # Create a HashStore at the given directory From 
d65e10cc94eee446f76347484e20197dc2459f2f Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 1 Aug 2023 13:32:19 -0700 Subject: [PATCH 083/165] Add new optional parameters and method to 'client.py' to get the hex digest of a stored object --- src/hashstore/client.py | 43 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index a713ca5b..266577c5 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -56,6 +56,18 @@ def add_client_optional_arguments(argp): help="Number of objects to convert", ) + # Individual API calls + argp.add_argument( + "-pid", + dest="object_pid", + help="Pid/Guid of object to work with", + ) + argp.add_argument( + "-algo", + dest="object_algorithm", + help="Algorithm to work with", + ) + def get_hashstore(properties): """Create a HashStore instance with the supplied properties. @@ -353,6 +365,17 @@ def store_to_hashstore(origin_dir, obj_type, config_yaml, num): logging.info(content) +def get_obj_hex_digest_from_store(config_yaml, pid, algorithm): + """Given a pid and algorithm, get the hex digest of the object""" + properties = load_store_properties(config_yaml) + store = get_hashstore(properties) + + digest = store.get_hex_digest(pid, algorithm) + print(f"guid/pid: {pid}") + print(f"algorithm: {algorithm}") + print(f"digest: {digest}") + + if __name__ == "__main__": PROGRAM_NAME = "HashStore Command Line Client" DESCRIPTION = ( @@ -427,3 +450,23 @@ def store_to_hashstore(origin_dir, obj_type, config_yaml, num): raise FileNotFoundError( f"Directory to convert does not exist: {getattr(args, 'convert_directory')}." ) + + # Calculate the hex digest of a given pid with algorithm supplied + elif ( + getattr(args, "object_pid") is not None + and getattr(args, "object_algorithm") is not None + ): + pid = getattr(args, "object_pid") + algorithm = getattr(args, "object_algorithm") + store_path = getattr(args, "store_path") + store_path_config_yaml = store_path + "/hashstore.yaml" + + if os.path.exists(store_path_config_yaml): + get_obj_hex_digest_from_store(store_path_config_yaml, pid, algorithm) + else: + # If HashStore does not exist, raise exception + # Calling app must create HashStore first before calling methods + raise FileNotFoundError( + f"Missing config file (hashstore.yaml) at store path: {store_path}." + + " HashStore must be initialized, use `--help` for more information." + ) From 953e07678183cea085c5805519ef40350e085004 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 2 Aug 2023 10:34:45 -0700 Subject: [PATCH 084/165] Refactor 'client.py' and clean up code and comments --- src/hashstore/client.py | 224 +++++++++++++++++----------------------- 1 file changed, 97 insertions(+), 127 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 266577c5..fb699e7c 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -1,17 +1,18 @@ """HashStore Command Line App""" -import sys import logging import os from argparse import ArgumentParser from datetime import datetime -import hashlib import multiprocessing import yaml import pg8000 from hashstore import HashStoreFactory -def add_client_optional_arguments(argp): +# Supporting Methods + + +def _add_client_optional_arguments(argp): """Adds the optional arguments for the HashStore Client. Args: @@ -69,7 +70,7 @@ def add_client_optional_arguments(argp): ) -def get_hashstore(properties): +def _get_hashstore(properties): """Create a HashStore instance with the supplied properties. 
Args: @@ -90,7 +91,7 @@ def get_hashstore(properties): return hashstore -def load_store_properties(hashstore_yaml): +def _load_store_properties(hashstore_yaml): """Get and return the contents of the current HashStore configuration. Returns: @@ -111,7 +112,7 @@ def load_store_properties(hashstore_yaml): if not os.path.exists(hashstore_yaml): exception_string = ( - "HashStore CLI Client - load_store_properties: hashstore.yaml not found" + "HashStore CLI Client - _load_store_properties: hashstore.yaml not found" + " in store root path." ) raise FileNotFoundError(exception_string) @@ -129,8 +130,9 @@ def load_store_properties(hashstore_yaml): return hashstore_yaml_dict -def load_db_properties(pgdb_yaml): - """Get and return the contents of a postgres config file +def _load_metacat_db_properties(pgdb_yaml): + """Get and return the contents of a config file with credentials + to access a postgres db. Args: pgdb_yaml (string): Path to yaml file @@ -148,7 +150,7 @@ def load_db_properties(pgdb_yaml): if not os.path.exists(pgdb_yaml): exception_string = ( - "HashStore CLI Client - load_db_properties: pgdb.yaml not found" + "HashStore CLI Client - _load_metacat_db_properties: pgdb.yaml not found" + " in store root path." ) raise FileNotFoundError(exception_string) @@ -164,92 +166,13 @@ def load_db_properties(pgdb_yaml): return db_yaml_dict -def write_text_to_path(directory, filename, content): - """Write a text file to a given directory.""" - # Combine the directory path and filename - file_path = f"{directory}/{filename}.txt" - - # Open the file in write mode ('w') - with open(file_path, "w", encoding="utf-8") as file: - # Write the content to the file - file.write(content) - - -def get_sha256_hex_digest(string): - """Calculate the SHA-256 digest of a UTF-8 encoded string. - - Args: - string (string): String to convert. - - Returns: - hex (string): Hexadecimal string. - """ - hex_digest = hashlib.sha256(string.encode("utf-8")).hexdigest() - return hex_digest - - -def get_objs_from_metacat_db(properties, obj_directory, num, store): - """Get the list of objects from knbvm's metacat db to store into HashStore""" - # Note: Manually create `pgdb.yaml` for security purposes - pgyaml_path = properties["store_path"] + "/pgdb.yaml" - print(f"Retrieving db config from: {pgyaml_path}") - - db_properties = load_db_properties(pgyaml_path) - db_user = db_properties["db_user"] - db_password = db_properties["db_password"] - db_host = db_properties["db_host"] - db_port = db_properties["db_port"] - db_name = db_properties["db_name"] - - # Create a connection to the database - conn = pg8000.connect( - user=db_user, - password=db_password, - host=db_host, - port=int(db_port), - database=db_name, - ) - - # Create a cursor to execute queries - cursor = conn.cursor() - - # Query to get rows from `identifier` table - query = f"SELECT * FROM identifier LIMIT {num};" - cursor.execute(query) - - # Fetch all rows from the result set - rows = cursor.fetchall() - - # Create object list to store into HashStore - print("Creating list of objects to store into HashStore") - checked_obj_list = [] - for row in rows: - # Get pid and filename - pid_guid = row[0] - filepath_docid_rev = obj_directory + "/" + row[1] + "." 
+ str(row[2]) - tuple_item = (pid_guid, filepath_docid_rev) - # Only add to the list if it is an object, not metadata document - if os.path.exists(filepath_docid_rev): - # If the file has already been stored, skip it - if store.exists("objects", store.get_sha256_hex_digest(pid_guid)): - print(f"Object exists in HashStore for guid: {pid_guid}") - else: - checked_obj_list.append(tuple_item) - - # Close the cursor and connection when done - cursor.close() - conn.close() - - return checked_obj_list - - -def get_metadata_from_metacat_db(properties, metadata_directory, num): - """Get the list of metadata objs from knbvm's metacat db to store into HashStore""" +def _get_full_obj_list_from_metacat_db(properties, metadata_directory, num): + """Get the list of objects and metadata from knbvm's metacat db""" # Note: Manually create `pgdb.yaml` for security purposes pgyaml_path = properties["store_path"] + "/pgdb.yaml" print(f"Retrieving db config from: {pgyaml_path}") - db_properties = load_db_properties(pgyaml_path) + db_properties = _load_metacat_db_properties(pgyaml_path) db_user = db_properties["db_user"] db_password = db_properties["db_password"] db_host = db_properties["db_host"] @@ -269,37 +192,86 @@ def get_metadata_from_metacat_db(properties, metadata_directory, num): cursor = conn.cursor() # Query to refine rows between `identifier` and `systemmetadata`` table - query = """SELECT identifier.guid, identifier.docid, identifier.rev, - systemmetadata.object_format FROM identifier INNER JOIN systemmetadata - ON identifier.guid = systemmetadata.guid;""" + if num is None: + limit_query = "" + else: + limit_query = f" LIMIT {num}" + query = f"""SELECT identifier.guid, identifier.docid, identifier.rev, + systemmetadata.object_format, systemmetadata.checksum, + systemmetadata.checksum_algorithm FROM identifier INNER JOIN systemmetadata + ON identifier.guid = systemmetadata.guid{limit_query};""" cursor.execute(query) # Fetch all rows from the result set rows = cursor.fetchall() - # Create metadata list to store into HashStore - print("Creating list of metadata to store into HashStore") - checked_metadata_list = [] + # Create full object list to store into HashStore + print("Creating list of objects and metadata from metacat db") + object_metadata_list = [] for row in rows: # Get pid, filepath and formatId pid_guid = row[0] metadatapath_docid_rev = metadata_directory + "/" + row[1] + "." 
+ str(row[2]) metadata_namespace = row[3] - tuple_item = (pid_guid, metadatapath_docid_rev, metadata_namespace) - # Only add to the list if it is an object, not metadata document - if os.path.exists(metadatapath_docid_rev): - # If the file already exists, don't attempt to add it - print(f"Metadata doc found: {metadatapath_docid_rev} for pid: {pid_guid}") - checked_metadata_list.append(tuple_item) + checksum = row[4] + checksum_algorithm = row[5] + tuple_item = ( + pid_guid, + metadatapath_docid_rev, + metadata_namespace, + checksum, + checksum_algorithm, + ) + object_metadata_list.append(tuple_item) # Close the cursor and connection when done cursor.close() conn.close() - return checked_metadata_list + return object_metadata_list + +def _refine_object_list(store, metacat_obj_list): + """Refine a list of objects by checking for file existence and removing duplicates.""" + refined_list = [] + for obj in metacat_obj_list: + pid_guid = obj[0] + filepath_docid_rev = obj[1] + if os.path.exists(filepath_docid_rev): + # If the file has already been stored, skip it + if store.exists("objects", store.get_sha256_hex_digest(pid_guid)): + print( + f"Skipping store_object for {pid_guid} - object exists in HashStore" + ) + else: + tuple_item = (pid_guid, filepath_docid_rev) + refined_list.append(tuple_item) + return refined_list + + +def _refine_metadata_list(store, metacat_obj_list): + """Refine a list of metadata by checking for file existence and removing duplicates.""" + refined_list = [] + for obj in metacat_obj_list: + pid_guid = obj[0] + filepath_docid_rev = obj[1] + metadata_namespace = obj[2] + if os.path.exists(filepath_docid_rev): + # If the file has already been stored, skip it + if store.exists("metadata", store.get_sha256_hex_digest(pid_guid)): + print( + f"Skipping store_metadata for {pid_guid} - metadata exists in HashStore" + ) + else: + tuple_item = (pid_guid, metadata_namespace, filepath_docid_rev) + refined_list.append(tuple_item) + return refined_list -def store_to_hashstore(origin_dir, obj_type, config_yaml, num): + +# Concrete Methods + + +def store_to_hashstore_from_list(origin_dir, obj_type, config_yaml, num): """Store objects in a given directory into HashStore Args: @@ -308,8 +280,8 @@ def store_to_hashstore(origin_dir, obj_type, config_yaml, num): config_yaml (str): Path to HashStore config file `hashstore.yaml` num (int): Number of files to store """ - properties = load_store_properties(config_yaml) - store = get_hashstore(properties) + properties = _load_store_properties(config_yaml) + store = _get_hashstore(properties) # Get list of files from directory file_list = os.listdir(origin_dir) @@ -318,15 +290,16 @@ def store_to_hashstore(origin_dir, obj_type, config_yaml, num): if num is not None: checked_num_of_files = int(num) + # Object and Metadata list + metacat_obj_list = _get_full_obj_list_from_metacat_db( + properties, origin_dir, checked_num_of_files + ) + # Get list of objects to store from metacat db if obj_type == "object": - checked_obj_list = get_objs_from_metacat_db( - properties, origin_dir, checked_num_of_files, store - ) + checked_obj_list = _refine_object_list(store, metacat_obj_list) if obj_type == "metadata": - checked_obj_list = get_metadata_from_metacat_db( - properties, origin_dir, checked_num_of_files - ) + checked_obj_list = _refine_metadata_list(store, metacat_obj_list) start_time = datetime.now() @@ -365,14 +338,14 @@ def store_to_hashstore(origin_dir, obj_type, config_yaml, num): logging.info(content) -def get_obj_hex_digest_from_store(config_yaml, 
pid, algorithm): +def get_obj_hex_digest_from_store(config_yaml, pid_guid, obj_algo): """Given a pid and algorithm, get the hex digest of the object""" - properties = load_store_properties(config_yaml) - store = get_hashstore(properties) + properties = _load_store_properties(config_yaml) + store = _get_hashstore(properties) digest = store.get_hex_digest(pid, algorithm) - print(f"guid/pid: {pid}") - print(f"algorithm: {algorithm}") + print(f"guid/pid: {pid_guid}") + print(f"algorithm: {obj_algo}") print(f"digest: {digest}") @@ -391,7 +364,7 @@ def get_obj_hex_digest_from_store(config_yaml, pid, algorithm): ) ### Add Positional and Optional Arguments parser.add_argument("store_path", help="Path of the HashStore") - add_client_optional_arguments(parser) + _add_client_optional_arguments(parser) # Client entry point args = parser.parse_args() @@ -405,9 +378,8 @@ def get_obj_hex_digest_from_store(config_yaml, pid, algorithm): datefmt="%Y-%m-%d %H:%M:%S", ) - # Create HashStore if -chs flag is true if getattr(args, "create_hashstore"): - # Create a HashStore at the given directory + # Create HashStore if -chs flag is true in a given directory # Get store attributes, HashStore will validate properties props = { "store_path": getattr(args, "store_path"), @@ -416,10 +388,9 @@ def get_obj_hex_digest_from_store(config_yaml, pid, algorithm): "store_algorithm": getattr(args, "algorithm"), "store_metadata_namespace": getattr(args, "formatid"), } - get_hashstore(props) - - # Convert a directory to a HashStore if config file present + _get_hashstore(props) elif getattr(args, "convert_directory") is not None: + # Convert a directory to a HashStore if config file present directory_to_convert = getattr(args, "convert_directory") if os.path.exists(directory_to_convert): number_of_objects_to_convert = getattr(args, "num_obj_to_convert") @@ -433,7 +404,7 @@ def get_obj_hex_digest_from_store(config_yaml, pid, algorithm): + f" convert_directory_type: {directory_type}" ) if os.path.exists(store_path_config_yaml): - store_to_hashstore( + store_to_hashstore_from_list( directory_to_convert, directory_type, store_path_config_yaml, @@ -450,12 +421,11 @@ def get_obj_hex_digest_from_store(config_yaml, pid, algorithm): raise FileNotFoundError( f"Directory to convert does not exist: {getattr(args, 'convert_directory')}." 
) - - # Calculate the hex digest of a given pid with algorithm supplied elif ( getattr(args, "object_pid") is not None and getattr(args, "object_algorithm") is not None ): + # Calculate the hex digest of a given pid with algorithm supplied pid = getattr(args, "object_pid") algorithm = getattr(args, "object_algorithm") store_path = getattr(args, "store_path") From 6f8c1c564974963e3eb79439da2022403a19b80e Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 2 Aug 2023 11:18:30 -0700 Subject: [PATCH 085/165] Add new optional arguments and methods to retrieve and validate objects from a HashStore --- src/hashstore/client.py | 112 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 101 insertions(+), 11 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index fb699e7c..dbe8266f 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -56,6 +56,12 @@ def _add_client_optional_arguments(argp): dest="num_obj_to_convert", help="Number of objects to convert", ) + argp.add_argument( + "-rav", + dest="retrieve_and_validate", + action="store_true", + help="Retrieve and validate objects in HashStore", + ) # Individual API calls argp.add_argument( @@ -166,7 +172,7 @@ def _load_metacat_db_properties(pgdb_yaml): return db_yaml_dict -def _get_full_obj_list_from_metacat_db(properties, metadata_directory, num): +def _get_full_obj_list_from_metacat_db(properties, metacat_dir, num): """Get the list of objects and metadata from knbvm's metacat db""" # Note: Manually create `pgdb.yaml` for security purposes pgyaml_path = properties["store_path"] + "/pgdb.yaml" @@ -211,7 +217,7 @@ def _get_full_obj_list_from_metacat_db(properties, metadata_directory, num): for row in rows: # Get pid, filepath and formatId pid_guid = row[0] - metadatapath_docid_rev = metadata_directory + "/" + row[1] + "." + str(row[2]) + metadatapath_docid_rev = metacat_dir + "/" + row[1] + "." + str(row[2]) metadata_namespace = row[3] checksum = row[4] checksum_algorithm = row[5] @@ -323,7 +329,7 @@ def store_to_hashstore_from_list(origin_dir, obj_type, config_yaml, num): for result in results: if isinstance(result, Exception): print(result) - logging.info(result) + logging.error(result) # Close the pool and wait for all processes to complete pool.close() @@ -331,6 +337,7 @@ def store_to_hashstore_from_list(origin_dir, obj_type, config_yaml, num): end_time = datetime.now() content = ( + f"store_to_hashstore_from_list:\n" f"Start Time: {start_time}\nEnd Time: {end_time}\n" + f"Total Time to Store {len(checked_obj_list)} {obj_type}" + f" Objects: {end_time - start_time}\n" @@ -338,6 +345,79 @@ def store_to_hashstore_from_list(origin_dir, obj_type, config_yaml, num): logging.info(content) +def retrieve_and_validate_from_hashstore(origin_dir, obj_type, config_yaml, num): + "Retrieve objects or metadata from a Hashstore and validate the content." 
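+    # Outline of the steps below: load the store configuration, build the
+    # candidate list from the metacat db, then (for objects) recompute each
+    # stored object's hex digest in a multiprocessing pool and compare it
+    # against the checksum recorded in the db.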
+ properties = _load_store_properties(config_yaml) + store = _get_hashstore(properties) + + checked_num_of_files = None + # Check number of files to store + if num is not None: + checked_num_of_files = int(num) + + # Object and Metadata list + metacat_obj_list = _get_full_obj_list_from_metacat_db( + properties, origin_dir, checked_num_of_files + ) + + # Get list of objects to store from metacat db + if obj_type == "object": + checked_obj_list = _refine_object_list(store, metacat_obj_list) + if obj_type == "metadata": + checked_obj_list = _refine_metadata_list(store, metacat_obj_list) + + start_time = datetime.now() + + # Retrieve, validate and close stream + def retrieve_and_validate(obj_tuple): + pid_guid = obj_tuple[0] + algo = obj_tuple[4] + checksum = obj_tuple[3] + obj_stream = store.retrieve_object(pid_guid) + digest = store.computehash(obj_stream, algo) + obj_stream.close() + # Check algorithm + if digest != checksum: + err_msg = ( + f"Unexpected Exception for pid/guid: {pid_guid} -" + + f" Digest calcualted from stream ({digest}) does not match" + + f" checksum from metacata db: {checksum}" + ) + raise AssertionError(err_msg) + + # Setup pool and processes + pool = multiprocessing.Pool() + + if obj_type == "object": + logging.info("Storing objects") + results = pool.starmap(retrieve_and_validate, checked_obj_list) + if obj_type == "metadata": + logging.info("Storing metadata") + # TODO + + # Log exceptions + cleanup_msg = "Checking results and logging exceptions" + print(cleanup_msg) + logging.info(cleanup_msg) + for result in results: + if isinstance(result, Exception): + print(result) + logging.info(result) + + # Close the pool and wait for all processes to complete + pool.close() + pool.join() + + end_time = datetime.now() + content = ( + f"retrieve_and_validate_from_hashstore:\n" + f"Start Time: {start_time}\nEnd Time: {end_time}\n" + + f"Total Time to retrieve and validate {len(checked_obj_list)} {obj_type}" + + f" Objects: {end_time - start_time}\n" + ) + logging.info(content) + + def get_obj_hex_digest_from_store(config_yaml, pid_guid, obj_algo): """Given a pid and algorithm, get the hex digest of the object""" properties = _load_store_properties(config_yaml) @@ -373,7 +453,7 @@ def get_obj_hex_digest_from_store(config_yaml, pid_guid, obj_algo): python_log_file_path = getattr(args, "store_path") + "/python_store.log" logging.basicConfig( filename=python_log_file_path, - level=logging.INFO, + level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", ) @@ -389,8 +469,9 @@ def get_obj_hex_digest_from_store(config_yaml, pid_guid, obj_algo): "store_metadata_namespace": getattr(args, "formatid"), } _get_hashstore(props) + elif getattr(args, "convert_directory") is not None: - # Convert a directory to a HashStore if config file present + # Perform operations to a HashStore if config file present directory_to_convert = getattr(args, "convert_directory") if os.path.exists(directory_to_convert): number_of_objects_to_convert = getattr(args, "num_obj_to_convert") @@ -404,12 +485,20 @@ def get_obj_hex_digest_from_store(config_yaml, pid_guid, obj_algo): + f" convert_directory_type: {directory_type}" ) if os.path.exists(store_path_config_yaml): - store_to_hashstore_from_list( - directory_to_convert, - directory_type, - store_path_config_yaml, - number_of_objects_to_convert, - ) + if getattr(args, "retrieve_and_validate"): + retrieve_and_validate_from_hashstore( + directory_to_convert, + directory_type, + store_path_config_yaml, + 
number_of_objects_to_convert, + ) + else: + store_to_hashstore_from_list( + directory_to_convert, + directory_type, + store_path_config_yaml, + number_of_objects_to_convert, + ) else: # If HashStore does not exist, raise exception # Calling app must create HashStore first before calling methods @@ -421,6 +510,7 @@ def get_obj_hex_digest_from_store(config_yaml, pid_guid, obj_algo): raise FileNotFoundError( f"Directory to convert does not exist: {getattr(args, 'convert_directory')}." ) + elif ( getattr(args, "object_pid") is not None and getattr(args, "object_algorithm") is not None From 65f0d1ae279d362257fdb1597854cc40635e7c7d Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 2 Aug 2023 12:59:23 -0700 Subject: [PATCH 086/165] Change 'retrieve_and_validate_from_hashstore' to use pool.map() from .starmap() --- src/hashstore/client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index dbe8266f..ee48c227 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -247,7 +247,7 @@ def _refine_object_list(store, metacat_obj_list): # If the file has already been stored, skip it if store.exists("objects", store.get_sha256_hex_digest(pid_guid)): print( - f"Skipping store_object for {pid_guid} - object exists in HashStore" + f"Refining Object List: Skipping {pid_guid} - object exists in HashStore" ) else: tuple_item = (pid_guid, filepath_docid_rev) @@ -390,7 +390,7 @@ def retrieve_and_validate(obj_tuple): if obj_type == "object": logging.info("Storing objects") - results = pool.starmap(retrieve_and_validate, checked_obj_list) + results = pool.map(retrieve_and_validate, checked_obj_list) if obj_type == "metadata": logging.info("Storing metadata") # TODO From 8e6b2b77bb647845a8bc089ae5cbef85a94eb1cd Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 3 Aug 2023 10:05:22 -0700 Subject: [PATCH 087/165] Test resolving multiprocessing pickling issue by instantiating HashStore through class declaration --- src/hashstore/client.py | 44 ++++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index ee48c227..0fa386d7 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -12,6 +12,21 @@ # Supporting Methods +class HashStoreClient: + """Create a HashStore""" + + def __init__(self, properties): + logging.info("Initializing HashStore") + factory = HashStoreFactory() + + # Get HashStore from factory + module_name = "filehashstore" + class_name = "FileHashStore" + + # Class variables + self.hashstore = factory.get_hashstore(module_name, class_name, properties) + + def _add_client_optional_arguments(argp): """Adds the optional arguments for the HashStore Client. 
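A note on the class introduced above: `multiprocessing.Pool` pickles every task
it dispatches to a worker, and a function defined inside another function cannot
be pickled, while a bound method on a module-level class can. A minimal sketch of
that constraint, using a stand-in `Worker` class rather than any HashStore code:

    import multiprocessing

    class Worker:
        """Stand-in class; instances carry no state, so they pickle cleanly."""

        def double(self, item):
            return item * 2

    def main():
        def local_double(item):  # defined inside main(), so not picklable
            return item * 2

        with multiprocessing.Pool(2) as pool:
            print(pool.map(Worker().double, [1, 2, 3]))  # prints [2, 4, 6]
            # pool.map(local_double, [1, 2, 3]) would raise:
            # AttributeError: Can't pickle local object 'main.<locals>.local_double'

    if __name__ == "__main__":
        main()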
@@ -237,19 +252,23 @@ def _get_full_obj_list_from_metacat_db(properties, metacat_dir, num): return object_metadata_list -def _refine_object_list(store, metacat_obj_list): +def _refine_object_list(store, metacat_obj_list, action): """Refine a list of objects by checking for file existence and removing duplicates.""" refined_list = [] for obj in metacat_obj_list: pid_guid = obj[0] filepath_docid_rev = obj[1] if os.path.exists(filepath_docid_rev): - # If the file has already been stored, skip it - if store.exists("objects", store.get_sha256_hex_digest(pid_guid)): - print( - f"Refining Object List: Skipping {pid_guid} - object exists in HashStore" - ) - else: + if action == "store": + # If the file has already been stored, skip it + if store.exists("objects", store.get_sha256_hex_digest(pid_guid)): + print( + f"Refining Object List: Skipping {pid_guid} - object exists in HashStore" + ) + else: + tuple_item = (pid_guid, filepath_docid_rev) + refined_list.append(tuple_item) + if action == "retrieve": tuple_item = (pid_guid, filepath_docid_rev) refined_list.append(tuple_item) return refined_list @@ -303,7 +322,7 @@ def store_to_hashstore_from_list(origin_dir, obj_type, config_yaml, num): # Get list of objects to store from metacat db if obj_type == "object": - checked_obj_list = _refine_object_list(store, metacat_obj_list) + checked_obj_list = _refine_object_list(store, metacat_obj_list, "store") if obj_type == "metadata": checked_obj_list = _refine_metadata_list(store, metacat_obj_list) @@ -348,7 +367,8 @@ def store_to_hashstore_from_list(origin_dir, obj_type, config_yaml, num): def retrieve_and_validate_from_hashstore(origin_dir, obj_type, config_yaml, num): "Retrieve objects or metadata from a Hashstore and validate the content." properties = _load_store_properties(config_yaml) - store = _get_hashstore(properties) + # store = _get_hashstore(properties) + store = HashStoreClient(properties).hashstore checked_num_of_files = None # Check number of files to store @@ -362,7 +382,7 @@ def retrieve_and_validate_from_hashstore(origin_dir, obj_type, config_yaml, num) # Get list of objects to store from metacat db if obj_type == "object": - checked_obj_list = _refine_object_list(store, metacat_obj_list) + checked_obj_list = _refine_object_list(store, metacat_obj_list, "retrieve") if obj_type == "metadata": checked_obj_list = _refine_metadata_list(store, metacat_obj_list) @@ -389,10 +409,10 @@ def retrieve_and_validate(obj_tuple): pool = multiprocessing.Pool() if obj_type == "object": - logging.info("Storing objects") + logging.info("Retrieving objects") results = pool.map(retrieve_and_validate, checked_obj_list) if obj_type == "metadata": - logging.info("Storing metadata") + logging.info("Retrieiving metadata") # TODO # Log exceptions From 8e982804e2e169ff9893179462846f2da561c64b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 3 Aug 2023 10:11:06 -0700 Subject: [PATCH 088/165] Move 'retrieve_and_validate()' to new HashStoreClient class --- src/hashstore/client.py | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 0fa386d7..907f05c6 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -26,6 +26,24 @@ def __init__(self, properties): # Class variables self.hashstore = factory.get_hashstore(module_name, class_name, properties) + def retrieve_and_validate(self, obj_tuple): + """Retrieve and validate a list of objects.""" + pid_guid = obj_tuple[0] + algo = obj_tuple[4] + checksum = 
obj_tuple[3]
+        obj_stream = self.hashstore.retrieve_object(pid_guid)
+        digest = self.hashstore.computehash(obj_stream, algo)
+        obj_stream.close()
+        # Check algorithm
+        if digest != checksum:
+            err_msg = (
+                f"Unexpected Exception for pid/guid: {pid_guid} -"
+                + f" Digest calculated from stream ({digest}) does not match"
+                + f" checksum from metacat db: {checksum}"
+            )
+            logging.info(err_msg)
+            raise AssertionError(err_msg)
 
 
 def _add_client_optional_arguments(argp):
     """Adds the optional arguments for the HashStore Client.
@@ -388,29 +406,12 @@ def retrieve_and_validate_from_hashstore(origin_dir, obj_type, config_yaml, num)
 
     start_time = datetime.now()
 
-    # Retrieve, validate and close stream
-    def retrieve_and_validate(obj_tuple):
-        pid_guid = obj_tuple[0]
-        algo = obj_tuple[4]
-        checksum = obj_tuple[3]
-        obj_stream = store.retrieve_object(pid_guid)
-        digest = store.computehash(obj_stream, algo)
-        obj_stream.close()
-        # Check algorithm
-        if digest != checksum:
-            err_msg = (
-                f"Unexpected Exception for pid/guid: {pid_guid} -"
-                + f" Digest calcualted from stream ({digest}) does not match"
-                + f" checksum from metacata db: {checksum}"
-            )
-            raise AssertionError(err_msg)
-
     # Setup pool and processes
     pool = multiprocessing.Pool()
 
     if obj_type == "object":
         logging.info("Retrieving objects")
-        results = pool.map(retrieve_and_validate, checked_obj_list)
+        results = pool.map(store.retrieve_and_validate, checked_obj_list)
     if obj_type == "metadata":
         logging.info("Retrieiving metadata")
         # TODO

From 471a9b2bb13262a481c3b1bc4f8a8ab58bf2e77d Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Thu, 3 Aug 2023 10:13:56 -0700
Subject: [PATCH 089/165] Remove incorrect class variable reference when instantiating 'HashStoreClient'

---
 src/hashstore/client.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/hashstore/client.py b/src/hashstore/client.py
index 907f05c6..f5ab313a 100644
--- a/src/hashstore/client.py
+++ b/src/hashstore/client.py
@@ -386,7 +386,7 @@ def retrieve_and_validate_from_hashstore(origin_dir, obj_type, config_yaml, num)
     "Retrieve objects or metadata from a Hashstore and validate the content."
     properties = _load_store_properties(config_yaml)
     # store = _get_hashstore(properties)
-    store = HashStoreClient(properties).hashstore
+    store = HashStoreClient(properties)
 
     checked_num_of_files = None
     # Check number of files to store

From 6296d487543d488904912ce700fdc2d88aa269a0 Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Thu, 3 Aug 2023 10:22:34 -0700
Subject: [PATCH 090/165] Fix missing attributes in tuple objects when refining object list

---
 src/hashstore/client.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/hashstore/client.py b/src/hashstore/client.py
index f5ab313a..2be7bda6 100644
--- a/src/hashstore/client.py
+++ b/src/hashstore/client.py
@@ -273,9 +273,9 @@ def _get_full_obj_list_from_metacat_db(properties, metacat_dir, num):
 def _refine_object_list(store, metacat_obj_list, action):
     """Refine a list of objects by checking for file existence and removing duplicates."""
     refined_list = []
-    for obj in metacat_obj_list:
-        pid_guid = obj[0]
-        filepath_docid_rev = obj[1]
+    for tuple_item in metacat_obj_list:
+        pid_guid = tuple_item[0]
+        filepath_docid_rev = tuple_item[1]
         if os.path.exists(filepath_docid_rev):
             if action == "store":
                 # If the file has already been stored, skip it
@@ -284,10 +284,8 @@ def _refine_object_list(store, metacat_obj_list, action):
                         f"Refining Object List: Skipping {pid_guid} - object exists in HashStore"
                     )
                 else:
-                    tuple_item = (pid_guid, filepath_docid_rev)
                     refined_list.append(tuple_item)
         if action == "retrieve":
-            tuple_item = (pid_guid, filepath_docid_rev)
             refined_list.append(tuple_item)

From 511f731f1e56182cbcc14ab77cb2d6f85eb9f779 Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Thu, 3 Aug 2023 10:26:08 -0700
Subject: [PATCH 091/165] Remove redundant logging message and add new message to confirm checksums match

---
 src/hashstore/client.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/hashstore/client.py b/src/hashstore/client.py
index 2be7bda6..d6e983a1 100644
--- a/src/hashstore/client.py
+++ b/src/hashstore/client.py
@@ -41,8 +41,14 @@ def retrieve_and_validate(self, obj_tuple):
                 + f" Digest calculated from stream ({digest}) does not match"
                 + f" checksum from metacat db: {checksum}"
             )
-            logging.info(err_msg)
             raise AssertionError(err_msg)
+        else:
+            info_msg = (
+                f"Checksums match for pid/guid:{pid_guid} -"
+                + f" Digest calculated from stream: {digest}."
+                + f" Checksum from metacat db: {checksum}."
+            )
+            logging.info(info_msg)

From 0a65cc70f89956544a6bfc50b2c0a24a5f2224be Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Thu, 3 Aug 2023 10:33:56 -0700
Subject: [PATCH 092/165] Fix bug in '_refine_object_list' that added non-existent objects to working list

---
 src/hashstore/client.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/hashstore/client.py b/src/hashstore/client.py
index d6e983a1..d4728828 100644
--- a/src/hashstore/client.py
+++ b/src/hashstore/client.py
@@ -44,7 +44,7 @@ def retrieve_and_validate(self, obj_tuple):
             raise AssertionError(err_msg)
         else:
             info_msg = (
-                f"Checksums match for pid/guid:{pid_guid} -"
+                f"Checksums match for pid/guid: {pid_guid} -"
                 + f" Digest calculated from stream: {digest}."
                 + f" Checksum from metacat db: {checksum}."
) @@ -292,7 +292,9 @@ def _refine_object_list(store, metacat_obj_list, action): else: refined_list.append(tuple_item) if action == "retrieve": - refined_list.append(tuple_item) + if store.exists("objects", store.get_sha256_hex_digest(pid_guid)): + refined_list.append(tuple_item) + return refined_list From 3315ff64ea0185d7f0dae267bf3c3ef76db29e73 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 3 Aug 2023 10:36:20 -0700 Subject: [PATCH 093/165] Fix incorrect reference when calling HashStore to check for existence of objects --- src/hashstore/client.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index d4728828..b7ed9e4f 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -406,9 +406,11 @@ def retrieve_and_validate_from_hashstore(origin_dir, obj_type, config_yaml, num) # Get list of objects to store from metacat db if obj_type == "object": - checked_obj_list = _refine_object_list(store, metacat_obj_list, "retrieve") + checked_obj_list = _refine_object_list( + store.hashstore, metacat_obj_list, "retrieve" + ) if obj_type == "metadata": - checked_obj_list = _refine_metadata_list(store, metacat_obj_list) + checked_obj_list = _refine_metadata_list(store.hashstore, metacat_obj_list) start_time = datetime.now() From 5e2a99d481eaed7b8ec3df4b0bfb6b5ccc08973d Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 3 Aug 2023 12:19:13 -0700 Subject: [PATCH 094/165] Initial refactor for 'client.py' into classes --- src/hashstore/client.py | 75 +++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 48 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index b7ed9e4f..85d779ce 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -9,14 +9,10 @@ from hashstore import HashStoreFactory -# Supporting Methods - - class HashStoreClient: - """Create a HashStore""" + """Create a HashStore Client to use through the command line.""" def __init__(self, properties): - logging.info("Initializing HashStore") factory = HashStoreFactory() # Get HashStore from factory @@ -50,6 +46,16 @@ def retrieve_and_validate(self, obj_tuple): ) logging.info(info_msg) + def get_obj_hex_digest_from_store(self, pid_guid, obj_algo): + """Given a pid and algorithm, get the hex digest of the object""" + digest = self.hashstore.get_hex_digest(pid, algorithm) + print(f"guid/pid: {pid_guid}") + print(f"algorithm: {obj_algo}") + print(f"digest: {digest}") + + +# Supporting Methods + def _add_client_optional_arguments(argp): """Adds the optional arguments for the HashStore Client. @@ -115,27 +121,6 @@ def _add_client_optional_arguments(argp): ) -def _get_hashstore(properties): - """Create a HashStore instance with the supplied properties. - - Args: - properties: HashStore properties (see 'FileHashStore' module for details) - - Returns: - hashstore (FileHashStore): HashStore - """ - logging.info("Initializing HashStore") - factory = HashStoreFactory() - - # Get HashStore from factory - module_name = "filehashstore" - class_name = "FileHashStore" - - # Class variables - hashstore = factory.get_hashstore(module_name, class_name, properties) - return hashstore - - def _load_store_properties(hashstore_yaml): """Get and return the contents of the current HashStore configuration. 
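A short usage sketch of the refactored `HashStoreClient` above; the property
values here are hypothetical, with keys mirroring those in `hashstore.yaml`:

    props = {
        "store_path": "/var/metacat/hashstore",  # hypothetical path
        "store_depth": 3,
        "store_width": 2,
        "store_algorithm": "SHA-256",
        "store_metadata_namespace": "http://www.ns.test/v1",
    }
    client = HashStoreClient(props)
    # Print the hex digest of a stored object (the pid is hypothetical)
    client.get_obj_hex_digest_from_store("doi:10.1234/example", "SHA-256")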
@@ -330,7 +315,7 @@ def store_to_hashstore_from_list(origin_dir, obj_type, config_yaml, num): num (int): Number of files to store """ properties = _load_store_properties(config_yaml) - store = _get_hashstore(properties) + store = HashStoreClient(properties).hashstore # Get list of files from directory file_list = os.listdir(origin_dir) @@ -392,7 +377,8 @@ def retrieve_and_validate_from_hashstore(origin_dir, obj_type, config_yaml, num) "Retrieve objects or metadata from a Hashstore and validate the content." properties = _load_store_properties(config_yaml) # store = _get_hashstore(properties) - store = HashStoreClient(properties) + hashstoreclient = HashStoreClient(properties) + store = hashstoreclient.hashstore checked_num_of_files = None # Check number of files to store @@ -406,11 +392,9 @@ def retrieve_and_validate_from_hashstore(origin_dir, obj_type, config_yaml, num) # Get list of objects to store from metacat db if obj_type == "object": - checked_obj_list = _refine_object_list( - store.hashstore, metacat_obj_list, "retrieve" - ) + checked_obj_list = _refine_object_list(store, metacat_obj_list, "retrieve") if obj_type == "metadata": - checked_obj_list = _refine_metadata_list(store.hashstore, metacat_obj_list) + checked_obj_list = _refine_metadata_list(store, metacat_obj_list) start_time = datetime.now() @@ -419,7 +403,7 @@ def retrieve_and_validate_from_hashstore(origin_dir, obj_type, config_yaml, num) if obj_type == "object": logging.info("Retrieving objects") - results = pool.map(store.retrieve_and_validate, checked_obj_list) + results = pool.map(hashstoreclient.retrieve_and_validate, checked_obj_list) if obj_type == "metadata": logging.info("Retrieiving metadata") # TODO @@ -447,17 +431,6 @@ def retrieve_and_validate_from_hashstore(origin_dir, obj_type, config_yaml, num) logging.info(content) -def get_obj_hex_digest_from_store(config_yaml, pid_guid, obj_algo): - """Given a pid and algorithm, get the hex digest of the object""" - properties = _load_store_properties(config_yaml) - store = _get_hashstore(properties) - - digest = store.get_hex_digest(pid, algorithm) - print(f"guid/pid: {pid_guid}") - print(f"algorithm: {obj_algo}") - print(f"digest: {digest}") - - if __name__ == "__main__": PROGRAM_NAME = "HashStore Command Line Client" DESCRIPTION = ( @@ -497,15 +470,18 @@ def get_obj_hex_digest_from_store(config_yaml, pid_guid, obj_algo): "store_algorithm": getattr(args, "algorithm"), "store_metadata_namespace": getattr(args, "formatid"), } - _get_hashstore(props) + HashStoreClient(props) elif getattr(args, "convert_directory") is not None: # Perform operations to a HashStore if config file present directory_to_convert = getattr(args, "convert_directory") + # Check if the directory to convert exists if os.path.exists(directory_to_convert): + # If -nobj is supplied, limit the objects we work with number_of_objects_to_convert = getattr(args, "num_obj_to_convert") store_path = getattr(args, "store_path") store_path_config_yaml = store_path + "/hashstore.yaml" + # Determine if we are working with objects or metadata directory_type = getattr(args, "convert_directory_type") accepted_directory_types = ["object", "metadata"] if directory_type not in accepted_directory_types: @@ -513,6 +489,7 @@ def get_obj_hex_digest_from_store(config_yaml, pid_guid, obj_algo): "Directory `-cvt` cannot be empty, must be 'object' or 'metadata'." 
+ f" convert_directory_type: {directory_type}" ) + # HashStore can only be called if a configuration file is present if os.path.exists(store_path_config_yaml): if getattr(args, "retrieve_and_validate"): retrieve_and_validate_from_hashstore( @@ -550,11 +527,13 @@ def get_obj_hex_digest_from_store(config_yaml, pid_guid, obj_algo): store_path = getattr(args, "store_path") store_path_config_yaml = store_path + "/hashstore.yaml" + # If HashStore does not exist, raise exception if os.path.exists(store_path_config_yaml): - get_obj_hex_digest_from_store(store_path_config_yaml, pid, algorithm) + props = _load_store_properties(store_path_config_yaml) + hs = HashStoreClient(props).hashstore + hs.get_obj_hex_digest_from_store(pid, algorithm) else: - # If HashStore does not exist, raise exception - # Calling app must create HashStore first before calling methods + # Calling app must initialize HashStore first before calling methods raise FileNotFoundError( f"Missing config file (hashstore.yaml) at store path: {store_path}." + " HashStore must be initialized, use `--help` for more information." From c9ac83b62b1c78f6f9108ab591dea05da695aa74 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 3 Aug 2023 12:48:45 -0700 Subject: [PATCH 095/165] Create new class 'MetacatDB' to encapsulate methods to access metacat postgres db --- src/hashstore/client.py | 285 ++++++++++++++++++++-------------------- 1 file changed, 142 insertions(+), 143 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 85d779ce..28aee709 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -54,7 +54,148 @@ def get_obj_hex_digest_from_store(self, pid_guid, obj_algo): print(f"digest: {digest}") -# Supporting Methods +class MetacatDB: + """Adapter class to interact with Metacat's Postgres DB""" + + def __init__(self, pgdb_yaml, hashstore): + """Initialize credentials to access metacat pgdb.""" + db_keys = [ + "db_user", + "db_password", + "db_host", + "db_port", + "db_name", + ] + + if not os.path.exists(pgdb_yaml): + exception_string = ( + "HashStore CLI Client - _load_metacat_db_properties: pgdb.yaml not found" + + " in store root path. Must be manually created with the following keys:" + + " db_user, db_password, db_host, db_port, db_name" + ) + raise FileNotFoundError(exception_string) + # Open file + with open(pgdb_yaml, "r", encoding="utf-8") as file: + yaml_data = yaml.safe_load(file) + + # Get database values + self.hashstore = hashstore + self.db_yaml_dict = {} + for key in db_keys: + checked_property = yaml_data[key] + self.db_yaml_dict[key] = checked_property + + def get_object_metadata_list(self, origin_directory, num): + """Query the metacat db for the full obj and metadata list. 
+
+        Args:
+            origin_directory (string): 'var/metacat/data' or 'var/metacat/documents'
+            num (int): Number of rows to retrieve from metacat db
+        """
+        # Create a connection to the database
+        db_user = self.db_yaml_dict["db_user"]
+        db_password = self.db_yaml_dict["db_password"]
+        db_host = self.db_yaml_dict["db_host"]
+        db_port = self.db_yaml_dict["db_port"]
+        db_name = self.db_yaml_dict["db_name"]
+
+        conn = pg8000.connect(
+            user=db_user,
+            password=db_password,
+            host=db_host,
+            port=int(db_port),
+            database=db_name,
+        )
+
+        # Create a cursor to execute queries
+        cursor = conn.cursor()
+
+        # Query to refine rows between `identifier` and `systemmetadata` table
+        if num is None:
+            limit_query = ""
+        else:
+            limit_query = f" LIMIT {num}"
+        query = f"""SELECT identifier.guid, identifier.docid, identifier.rev,
+        systemmetadata.object_format, systemmetadata.checksum,
+        systemmetadata.checksum_algorithm FROM identifier INNER JOIN systemmetadata
+        ON identifier.guid = systemmetadata.guid{limit_query};"""
+        cursor.execute(query)
+
+        # Fetch all rows from the result set
+        rows = cursor.fetchall()
+
+        # Create full object list to store into HashStore
+        print("Creating list of objects and metadata from metacat db")
+        object_metadata_list = []
+        for row in rows:
+            # Get pid, filepath and formatId
+            pid_guid = row[0]
+            metadatapath_docid_rev = origin_directory + "/" + row[1] + "." + str(row[2])
+            metadata_namespace = row[3]
+            checksum = row[4]
+            checksum_algorithm = row[5]
+            tuple_item = (
+                pid_guid,
+                metadatapath_docid_rev,
+                metadata_namespace,
+                checksum,
+                checksum_algorithm,
+            )
+            object_metadata_list.append(tuple_item)
+
+        # Close the cursor and connection when done
+        cursor.close()
+        conn.close()
+
+        return object_metadata_list
+
+    def refine_list_for_objects(self, metacat_obj_list, action):
+        """Refine a list of objects by checking for file existence and removing duplicates.
+
+        Args:
+            metacat_obj_list (List): List of tuple objects representing rows from metacat db
+            action (string): "store" or "retrieve".
+                "store" will create a list of objects to store that do not exist in HashStore.
+                "retrieve" will create a list of objects that exist in HashStore.
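+                Each tuple is assumed to follow the layout produced by
+                get_object_metadata_list above: (pid_guid, filepath_docid_rev,
+                metadata_namespace, checksum, checksum_algorithm).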
+
+        Returns:
+            refined_object_list (List): List of tuple objects based on "action"
+        """
+        refined_object_list = []
+        for tuple_item in metacat_obj_list:
+            pid_guid = tuple_item[0]
+            filepath_docid_rev = tuple_item[1]
+            if os.path.exists(filepath_docid_rev):
+                if action == "store":
+                    # If the file has already been stored, skip it
+                    if not self.hashstore.exists(
+                        "objects", self.hashstore.get_sha256_hex_digest(pid_guid)
+                    ):
+                        refined_object_list.append(tuple_item)
+                if action == "retrieve":
+                    if self.hashstore.exists(
+                        "objects", self.hashstore.get_sha256_hex_digest(pid_guid)
+                    ):
+                        refined_object_list.append(tuple_item)
+
+        return refined_object_list
+
+    def refine_list_for_metadata(self, metacat_obj_list):
+        """Refine a list of metadata by checking for file existence and removing duplicates."""
+        refined_metadata_list = []
+        for obj in metacat_obj_list:
+            pid_guid = obj[0]
+            filepath_docid_rev = obj[1]
+            metadata_namespace = obj[2]
+            if os.path.exists(filepath_docid_rev):
+                # If the file has already been stored, skip it
+                if not self.hashstore.exists(
+                    "metadata", self.hashstore.get_sha256_hex_digest(pid_guid)
+                ):
+                    tuple_item = (pid_guid, metadata_namespace, filepath_docid_rev)
+                    refined_metadata_list.append(tuple_item)
+        return refined_metadata_list
 
 
 def _add_client_optional_arguments(argp):
@@ -160,148 +301,6 @@ def _load_store_properties(hashstore_yaml):
     return hashstore_yaml_dict
 
 
-def _load_metacat_db_properties(pgdb_yaml):
-    """Get and return the contents of a config file with credentials
-    to access a postgres db.
-
-    Args:
-        pgdb_yaml (string): Path to yaml file
-
-    Returns:
-        hashstore_yaml_dict (dict): postgres db config properties
-    """
-    db_keys = [
-        "db_user",
-        "db_password",
-        "db_host",
-        "db_port",
-        "db_name",
-    ]
-
-    if not os.path.exists(pgdb_yaml):
-        exception_string = (
-            "HashStore CLI Client - _load_metacat_db_properties: pgdb.yaml not found"
-            + " in store root path."
- ) - raise FileNotFoundError(exception_string) - # Open file - with open(pgdb_yaml, "r", encoding="utf-8") as file: - yaml_data = yaml.safe_load(file) - - # Get database values - db_yaml_dict = {} - for key in db_keys: - checked_property = yaml_data[key] - db_yaml_dict[key] = checked_property - return db_yaml_dict - - -def _get_full_obj_list_from_metacat_db(properties, metacat_dir, num): - """Get the list of objects and metadata from knbvm's metacat db""" - # Note: Manually create `pgdb.yaml` for security purposes - pgyaml_path = properties["store_path"] + "/pgdb.yaml" - print(f"Retrieving db config from: {pgyaml_path}") - - db_properties = _load_metacat_db_properties(pgyaml_path) - db_user = db_properties["db_user"] - db_password = db_properties["db_password"] - db_host = db_properties["db_host"] - db_port = db_properties["db_port"] - db_name = db_properties["db_name"] - - # Create a connection to the database - conn = pg8000.connect( - user=db_user, - password=db_password, - host=db_host, - port=int(db_port), - database=db_name, - ) - - # Create a cursor to execute queries - cursor = conn.cursor() - - # Query to refine rows between `identifier` and `systemmetadata`` table - if num is None: - limit_query = "" - else: - limit_query = f" LIMIT {num}" - query = f"""SELECT identifier.guid, identifier.docid, identifier.rev, - systemmetadata.object_format, systemmetadata.checksum, - systemmetadata.checksum_algorithm FROM identifier INNER JOIN systemmetadata - ON identifier.guid = systemmetadata.guid{limit_query};""" - cursor.execute(query) - - # Fetch all rows from the result set - rows = cursor.fetchall() - - # Create full object list to store into HashStore - print("Creating list of objects and metadata from metacat db") - object_metadata_list = [] - for row in rows: - # Get pid, filepath and formatId - pid_guid = row[0] - metadatapath_docid_rev = metacat_dir + "/" + row[1] + "." 
+ str(row[2])
-        metadata_namespace = row[3]
-        checksum = row[4]
-        checksum_algorithm = row[5]
-        tuple_item = (
-            pid_guid,
-            metadatapath_docid_rev,
-            metadata_namespace,
-            checksum,
-            checksum_algorithm,
-        )
-        object_metadata_list.append(tuple_item)
-
-    # Close the cursor and connection when done
-    cursor.close()
-    conn.close()
-
-    return object_metadata_list
-
-
-def _refine_object_list(store, metacat_obj_list, action):
-    """Refine a list of objects by checking for file existence and removing duplicates."""
-    refined_list = []
-    for tuple_item in metacat_obj_list:
-        pid_guid = tuple_item[0]
-        filepath_docid_rev = tuple_item[1]
-        if os.path.exists(filepath_docid_rev):
-            if action == "store":
-                # If the file has already been stored, skip it
-                if store.exists("objects", store.get_sha256_hex_digest(pid_guid)):
-                    print(
-                        f"Refining Object List: Skipping {pid_guid} - object exists in HashStore"
-                    )
-                else:
-                    refined_list.append(tuple_item)
-            if action == "retrieve":
-                if store.exists("objects", store.get_sha256_hex_digest(pid_guid)):
-                    refined_list.append(tuple_item)
-
-    return refined_list
-
-
-def _refine_metadata_list(store, metacat_obj_list):
-    """Refine a list of metadata by checking for file existence and removing duplicates."""
-    refined_list = []
-    for obj in metacat_obj_list:
-        pid_guid = obj[0]
-        filepath_docid_rev = obj[1]
-        metadata_namespace = obj[2]
-        if os.path.exists(filepath_docid_rev):
-            # If the file has already been stored, skip it
-            if store.exists("metadata", store.get_sha256_hex_digest(pid_guid)):
-                print(
-                    f"Skipping store_metadata for {pid_guid} - metadata exists in HashStore"
-                )
-            else:
-                tuple_item = (pid_guid, metadata_namespace, filepath_docid_rev)
-                refined_list.append(tuple_item)
-    return refined_list
-
-
-# Concrete Methods

From 7f5ec61b424e353a580cd8e8847b86cdb2fd9447 Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Thu, 3 Aug 2023 14:05:52 -0700
Subject: [PATCH 096/165] Refactor 'client.py' and organize methods

---
 src/hashstore/client.py | 270 ++++++++++++++++++++--------------------
 1 file changed, 132 insertions(+), 138 deletions(-)

diff --git a/src/hashstore/client.py b/src/hashstore/client.py
index 28aee709..830b8674 100644
--- a/src/hashstore/client.py
+++ b/src/hashstore/client.py
@@ -13,6 +13,7 @@ class HashStoreClient:
     """Create a HashStore Client to use through the command line."""
 
     def __init__(self, properties):
+        """Initialize HashStore and MetacatDB adapters"""
        factory = HashStoreFactory()
 
         # Get HashStore from factory
@@ -21,8 +22,129 @@ def __init__(self, properties):
 
         # Class variables
         self.hashstore = factory.get_hashstore(module_name, class_name, properties)
+        self.metacatdb = MetacatDB(properties["store_path"], self.hashstore)
 
-    def retrieve_and_validate(self, obj_tuple):
+    def store_to_hashstore_from_list(self, origin_dir, obj_type, num):
+        """Store objects in a given directory into HashStore
+
+        Args:
+            origin_dir (str): Directory to convert
+            obj_type (str): 'object' or 'metadata'
+            num (int): Number of files to store
+        """
+        # Get list of files from directory
+        file_list = os.listdir(origin_dir)
+        checked_num_of_files = len(file_list)
+        # Check number of files to store
+        if num is not None:
+            checked_num_of_files = int(num)
+
+        # Object and Metadata list
+        metacat_obj_list = self.metacatdb.get_object_metadata_list(
+            origin_dir, checked_num_of_files
+        )
+
+        # Get list of objects to store from metacat db
+        if obj_type == "object":
+            checked_obj_list = 
self.metacatdb.refine_list_for_objects( + metacat_obj_list, "store" + ) + if obj_type == "metadata": + checked_obj_list = self.metacatdb.refine_list_for_metadata(metacat_obj_list) + + start_time = datetime.now() + + # Setup pool and processes + # num_processes = os.cpu_count() - 2 + # pool = multiprocessing.Pool(processes=num_processes) + pool = multiprocessing.Pool() + + # Call 'obj_type' respective public API methods + if obj_type == "object": + logging.info("Storing objects") + results = pool.starmap(self.hashstore.store_object, checked_obj_list) + if obj_type == "metadata": + logging.info("Storing metadata") + results = pool.starmap(self.hashstore.store_metadata, checked_obj_list) + + # Log exceptions + cleanup_msg = "Checking results and logging exceptions" + print(cleanup_msg) + logging.info(cleanup_msg) + for result in results: + if isinstance(result, Exception): + print(result) + logging.error(result) + + # Close the pool and wait for all processes to complete + pool.close() + pool.join() + + end_time = datetime.now() + content = ( + f"HashStoreClient (store_to_hashstore_from_list):\n" + f"Start Time: {start_time}\nEnd Time: {end_time}\n" + + f"Total Time to Store {len(checked_obj_list)} {obj_type}" + + f" Objects: {end_time - start_time}\n" + ) + logging.info(content) + + def retrieve_and_validate_from_hashstore(self, origin_dir, obj_type, num): + "Retrieve objects or metadata from a Hashstore and validate the content." + checked_num_of_files = None + # Check number of files to store + if num is not None: + checked_num_of_files = int(num) + + # Object and Metadata list + metacat_obj_list = self.metacatdb.get_object_metadata_list( + origin_dir, checked_num_of_files + ) + + # Get list of objects to store from metacat db + if obj_type == "object": + checked_obj_list = self.metacatdb.refine_list_for_objects( + metacat_obj_list, "retrieve" + ) + if obj_type == "metadata": + checked_obj_list = self.metacatdb.refine_list_for_metadata(metacat_obj_list) + + start_time = datetime.now() + + # Setup pool and processes + pool = multiprocessing.Pool() + + if obj_type == "object": + logging.info("Retrieving objects") + results = pool.map(self.validate, checked_obj_list) + if obj_type == "metadata": + logging.info("Retrieiving metadata") + # TODO + + # Log exceptions + cleanup_msg = "Checking results and logging exceptions" + print(cleanup_msg) + logging.info(cleanup_msg) + for result in results: + if isinstance(result, Exception): + print(result) + logging.info(result) + + # Close the pool and wait for all processes to complete + pool.close() + pool.join() + + end_time = datetime.now() + content = ( + f"retrieve_and_validate_from_hashstore:\n" + f"Start Time: {start_time}\nEnd Time: {end_time}\n" + + f"Total Time to retrieve and validate {len(checked_obj_list)} {obj_type}" + + f" Objects: {end_time - start_time}\n" + ) + logging.info(content) + + def validate(self, obj_tuple): """Retrieve and validate a list of objects.""" pid_guid = obj_tuple[0] algo = obj_tuple[4] @@ -57,7 +179,7 @@ def get_obj_hex_digest_from_store(self, pid_guid, obj_algo): class MetacatDB: """Adapter class to interact with Metacat's Postgres DB""" - def __init__(self, pgdb_yaml, hashstore): + def __init__(self, hashstore_path, hashstore): """Initialize credentials to access metacat pgdb.""" db_keys = [ "db_user", @@ -67,7 +189,8 @@ def __init__(self, pgdb_yaml, hashstore): "db_name", ] - if not os.path.exists(pgdb_yaml): + pgyaml_path = hashstore_path + "/pgdb.yaml" + if not os.path.exists(pgyaml_path): exception_string = ( 
"HashStore CLI Client - _load_metacat_db_properties: pgdb.yaml not found" + " in store root path. Must be manually created with the following keys:" @@ -75,7 +198,7 @@ def __init__(self, pgdb_yaml, hashstore): ) raise FileNotFoundError(exception_string) # Open file - with open(pgdb_yaml, "r", encoding="utf-8") as file: + with open(pgyaml_path, "r", encoding="utf-8") as file: yaml_data = yaml.safe_load(file) # Get database values @@ -181,7 +304,7 @@ def refine_list_for_objects(self, metacat_obj_list, action): return refined_object_list - def refine_list_for_metadta(self, metacat_obj_list): + def refine_list_for_metadata(self, metacat_obj_list): """Refine a list of metadata by checking for file existence and removing duplicates.""" refined_metadta_list = [] for obj in metacat_obj_list: @@ -301,135 +424,6 @@ def _load_store_properties(hashstore_yaml): return hashstore_yaml_dict -# Concrete Methods - - -def store_to_hashstore_from_list(origin_dir, obj_type, config_yaml, num): - """Store objects in a given directory into HashStore - - Args: - origin_dir (str): Directory to convert - obj_type (str): 'object' or 'metadata' - config_yaml (str): Path to HashStore config file `hashstore.yaml` - num (int): Number of files to store - """ - properties = _load_store_properties(config_yaml) - store = HashStoreClient(properties).hashstore - - # Get list of files from directory - file_list = os.listdir(origin_dir) - checked_num_of_files = len(file_list) - # Check number of files to store - if num is not None: - checked_num_of_files = int(num) - - # Object and Metadata list - metacat_obj_list = _get_full_obj_list_from_metacat_db( - properties, origin_dir, checked_num_of_files - ) - - # Get list of objects to store from metacat db - if obj_type == "object": - checked_obj_list = _refine_object_list(store, metacat_obj_list, "store") - if obj_type == "metadata": - checked_obj_list = _refine_metadata_list(store, metacat_obj_list) - - start_time = datetime.now() - - # Setup pool and processes - # num_processes = os.cpu_count() - 2 - # pool = multiprocessing.Pool(processes=num_processes) - pool = multiprocessing.Pool() - - # Call 'obj_type' respective public API methods - if obj_type == "object": - logging.info("Storing objects") - results = pool.starmap(store.store_object, checked_obj_list) - if obj_type == "metadata": - logging.info("Storing metadata") - results = pool.starmap(store.store_metadata, checked_obj_list) - - # Log exceptions - cleanup_msg = "Checking results and logging exceptions" - print(cleanup_msg) - logging.info(cleanup_msg) - for result in results: - if isinstance(result, Exception): - print(result) - logging.error(result) - - # Close the pool and wait for all processes to complete - pool.close() - pool.join() - - end_time = datetime.now() - content = ( - f"store_to_hashstore_from_list:\n" - f"Start Time: {start_time}\nEnd Time: {end_time}\n" - + f"Total Time to Store {len(checked_obj_list)} {obj_type}" - + f" Objects: {end_time - start_time}\n" - ) - logging.info(content) - - -def retrieve_and_validate_from_hashstore(origin_dir, obj_type, config_yaml, num): - "Retrieve objects or metadata from a Hashstore and validate the content." 
- properties = _load_store_properties(config_yaml) - # store = _get_hashstore(properties) - hashstoreclient = HashStoreClient(properties) - store = hashstoreclient.hashstore - - checked_num_of_files = None - # Check number of files to store - if num is not None: - checked_num_of_files = int(num) - - # Object and Metadata list - metacat_obj_list = _get_full_obj_list_from_metacat_db( - properties, origin_dir, checked_num_of_files - ) - - # Get list of objects to store from metacat db - if obj_type == "object": - checked_obj_list = _refine_object_list(store, metacat_obj_list, "retrieve") - if obj_type == "metadata": - checked_obj_list = _refine_metadata_list(store, metacat_obj_list) - - start_time = datetime.now() - - # Setup pool and processes - pool = multiprocessing.Pool() - - if obj_type == "object": - logging.info("Retrieving objects") - results = pool.map(hashstoreclient.retrieve_and_validate, checked_obj_list) - if obj_type == "metadata": - logging.info("Retrieiving metadata") - # TODO - - # Log exceptions - cleanup_msg = "Checking results and logging exceptions" - print(cleanup_msg) - logging.info(cleanup_msg) - for result in results: - if isinstance(result, Exception): - print(result) - logging.info(result) - - # Close the pool and wait for all processes to complete - pool.close() - pool.join() - - end_time = datetime.now() - content = ( - f"retrieve_and_validate_from_hashstore:\n" - f"Start Time: {start_time}\nEnd Time: {end_time}\n" - + f"Total Time to retrieve and validate {len(checked_obj_list)} {obj_type}" - + f" Objects: {end_time - start_time}\n" - ) - logging.info(content) - - if __name__ == "__main__": PROGRAM_NAME = "HashStore Command Line Client" DESCRIPTION = ( @@ -490,18 +484,18 @@ def retrieve_and_validate_from_hashstore(origin_dir, obj_type, config_yaml, num) ) # HashStore can only be called if a configuration file is present if os.path.exists(store_path_config_yaml): + props = _load_store_properties(store_path_config_yaml) + hs = HashStoreClient(props) if getattr(args, "retrieve_and_validate"): - retrieve_and_validate_from_hashstore( + hs.retrieve_and_validate_from_hashstore( directory_to_convert, directory_type, - store_path_config_yaml, number_of_objects_to_convert, ) else: - store_to_hashstore_from_list( + hs.store_to_hashstore_from_list( directory_to_convert, directory_type, - store_path_config_yaml, number_of_objects_to_convert, ) else: From 61a3809228b93aaadd2e9b54d48844a28b3f4832 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 3 Aug 2023 14:25:37 -0700 Subject: [PATCH 097/165] Refactor 'client.py' entry point and fix bug when storing objects from list --- src/hashstore/client.py | 126 +++++++++++++++++++++------------------- 1 file changed, 67 insertions(+), 59 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 830b8674..6f04a176 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -289,13 +289,24 @@ def refine_list_for_objects(self, metacat_obj_list, action): for tuple_item in metacat_obj_list: pid_guid = tuple_item[0] filepath_docid_rev = tuple_item[1] + checksum = tuple_item[3] + checksum_algorithm = tuple_item[4] if os.path.exists(filepath_docid_rev): if action == "store": # If the file has already been stored, skip it if not self.hashstore.exists( "objects", self.hashstore.get_sha256_hex_digest(pid_guid) ): - refined_object_list.append(tuple_item) + # This tuple is formed to match 'HashStore' store_object's signature + # Which is '.starmap()'ed when called + store_object_tuple_item = ( + pid_guid, + 
filepath_docid_rev, + None, + checksum, + checksum_algorithm, + ) + refined_object_list.append(store_object_tuple_item) if action == "retrieve": if self.hashstore.exists( "objects", self.hashstore.get_sha256_hex_digest(pid_guid) @@ -465,69 +476,66 @@ def _load_store_properties(hashstore_yaml): } HashStoreClient(props) - elif getattr(args, "convert_directory") is not None: - # Perform operations to a HashStore if config file present - directory_to_convert = getattr(args, "convert_directory") - # Check if the directory to convert exists - if os.path.exists(directory_to_convert): - # If -nobj is supplied, limit the objects we work with - number_of_objects_to_convert = getattr(args, "num_obj_to_convert") - store_path = getattr(args, "store_path") - store_path_config_yaml = store_path + "/hashstore.yaml" - # Determine if we are working with objects or metadata - directory_type = getattr(args, "convert_directory_type") - accepted_directory_types = ["object", "metadata"] - if directory_type not in accepted_directory_types: - raise ValueError( - "Directory `-cvt` cannot be empty, must be 'object' or 'metadata'." - + f" convert_directory_type: {directory_type}" - ) - # HashStore can only be called if a configuration file is present - if os.path.exists(store_path_config_yaml): - props = _load_store_properties(store_path_config_yaml) - hs = HashStoreClient(props) - if getattr(args, "retrieve_and_validate"): - hs.retrieve_and_validate_from_hashstore( - directory_to_convert, - directory_type, - number_of_objects_to_convert, + else: + # Initialize HashStore + store_path = getattr(args, "store_path") + store_path_config_yaml = store_path + "/hashstore.yaml" + props = _load_store_properties(store_path_config_yaml) + hs = HashStoreClient(props) + + if getattr(args, "convert_directory") is not None: + directory_to_convert = getattr(args, "convert_directory") + # Check if the directory to convert exists + if os.path.exists(directory_to_convert): + # If -nobj is supplied, limit the objects we work with + number_of_objects_to_convert = getattr(args, "num_obj_to_convert") + # Determine if we are working with objects or metadata + directory_type = getattr(args, "convert_directory_type") + accepted_directory_types = ["object", "metadata"] + if directory_type not in accepted_directory_types: + raise ValueError( + "Directory `-cvt` cannot be empty, must be 'object' or 'metadata'." + + f" convert_directory_type: {directory_type}" ) + # HashStore can only be called if a configuration file is present + if os.path.exists(store_path_config_yaml): + if getattr(args, "retrieve_and_validate"): + hs.retrieve_and_validate_from_hashstore( + directory_to_convert, + directory_type, + number_of_objects_to_convert, + ) + else: + hs.store_to_hashstore_from_list( + directory_to_convert, + directory_type, + number_of_objects_to_convert, + ) else: - hs.store_to_hashstore_from_list( - directory_to_convert, - directory_type, - number_of_objects_to_convert, + # If HashStore does not exist, raise exception + # Calling app must create HashStore first before calling methods + raise FileNotFoundError( + f"Missing config file (hashstore.yaml) at store path: {store_path}." + + " HashStore must be initialized, use `--help` for more information." ) else: - # If HashStore does not exist, raise exception - # Calling app must create HashStore first before calling methods raise FileNotFoundError( - f"Missing config file (hashstore.yaml) at store path: {store_path}." - + " HashStore must be initialized, use `--help` for more information." 
+ f"Directory to convert does not exist: {getattr(args, 'convert_directory')}." ) - else: - raise FileNotFoundError( - f"Directory to convert does not exist: {getattr(args, 'convert_directory')}." - ) - elif ( - getattr(args, "object_pid") is not None - and getattr(args, "object_algorithm") is not None - ): - # Calculate the hex digest of a given pid with algorithm supplied - pid = getattr(args, "object_pid") - algorithm = getattr(args, "object_algorithm") - store_path = getattr(args, "store_path") - store_path_config_yaml = store_path + "/hashstore.yaml" + elif ( + getattr(args, "object_pid") is not None + and getattr(args, "object_algorithm") is not None + ): + # Calculate the hex digest of a given pid with algorithm supplied + pid = getattr(args, "object_pid") + algorithm = getattr(args, "object_algorithm") - # If HashStore does not exist, raise exception - if os.path.exists(store_path_config_yaml): - props = _load_store_properties(store_path_config_yaml) - hs = HashStoreClient(props).hashstore - hs.get_obj_hex_digest_from_store(pid, algorithm) - else: - # Calling app must initialize HashStore first before calling methods - raise FileNotFoundError( - f"Missing config file (hashstore.yaml) at store path: {store_path}." - + " HashStore must be initialized, use `--help` for more information." - ) + if os.path.exists(store_path_config_yaml): + hs.get_obj_hex_digest_from_store(pid, algorithm) + else: + # Calling app must initialize HashStore first before calling methods + raise FileNotFoundError( + f"Missing config file (hashstore.yaml) at store path: {store_path}." + + " HashStore must be initialized, use `--help` for more information." + ) From 2e2281f54f0c89ddd32802e933362d6aaaec6476 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sun, 6 Aug 2023 10:24:18 -0700 Subject: [PATCH 098/165] Comment out old logging process, change logging level to error --- src/hashstore/client.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 6f04a176..b2d5bb3d 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -123,13 +123,13 @@ def retrieve_and_validate_from_hashstore(self, origin_dir, obj_type, num): # TODO # Log exceptions - cleanup_msg = "Checking results and logging exceptions" - print(cleanup_msg) - logging.info(cleanup_msg) - for result in results: - if isinstance(result, Exception): - print(result) - logging.info(result) + # cleanup_msg = "Checking results and logging exceptions" + # print(cleanup_msg) + # logging.info(cleanup_msg) + # for result in results: + # if isinstance(result, Exception): + # print(result) + # logging.error(result) # Close the pool and wait for all processes to complete pool.close() @@ -155,11 +155,11 @@ def validate(self, obj_tuple): # Check algorithm if digest != checksum: err_msg = ( - f"Unexpected Exception for pid/guid: {pid_guid} -" + f"Assertion Error for pid/guid: {pid_guid} -" + f" Digest calcualted from stream ({digest}) does not match" + f" checksum from metacata db: {checksum}" ) - raise AssertionError(err_msg) + logging.error(err_msg) else: info_msg = ( f"Checksums match for pid/guid: {pid_guid} -" @@ -459,7 +459,7 @@ def _load_store_properties(hashstore_yaml): python_log_file_path = getattr(args, "store_path") + "/python_store.log" logging.basicConfig( filename=python_log_file_path, - level=logging.DEBUG, + level=logging.ERROR, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", ) From 
76960cc3db11e6b2110de57ba79028dfc55ab979 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sun, 6 Aug 2023 13:40:17 -0700 Subject: [PATCH 099/165] Uncomment clean up logging process --- src/hashstore/client.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index b2d5bb3d..5df5727a 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -116,20 +116,18 @@ def retrieve_and_validate_from_hashstore(self, origin_dir, obj_type, num): pool = multiprocessing.Pool() if obj_type == "object": - logging.info("Retrieving objects") results = pool.map(self.validate, checked_obj_list) - if obj_type == "metadata": - logging.info("Retrieiving metadata") - # TODO + # if obj_type == "metadata": + # TODO # Log exceptions - # cleanup_msg = "Checking results and logging exceptions" - # print(cleanup_msg) - # logging.info(cleanup_msg) - # for result in results: - # if isinstance(result, Exception): - # print(result) - # logging.error(result) + cleanup_msg = "Checking results and logging exceptions" + print(cleanup_msg) + logging.info(cleanup_msg) + for result in results: + if isinstance(result, Exception): + print(result) + logging.error(result) # Close the pool and wait for all processes to complete pool.close() From 2b3fe9177905a62152f547b14f9a2f524354da42 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 7 Aug 2023 14:57:02 -0700 Subject: [PATCH 100/165] Change logging level back to info, add new print statements to help with testing --- src/hashstore/client.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 5df5727a..af15d237 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -150,7 +150,9 @@ def validate(self, obj_tuple): obj_stream = self.hashstore.retrieve_object(pid_guid) digest = self.hashstore.computehash(obj_stream, algo) obj_stream.close() + # Check algorithm + print(f"Validating pid: {pid_guid}") if digest != checksum: err_msg = ( f"Assertion Error for pid/guid: {pid_guid} -" @@ -158,13 +160,14 @@ def validate(self, obj_tuple): + f" checksum from metacata db: {checksum}" ) logging.error(err_msg) + print(err_msg) else: info_msg = ( f"Checksums match for pid/guid: {pid_guid} -" + f" Digest calcualted from stream: {digest}." + f" Checksum from metacata db: {checksum}." 
) - logging.info(info_msg) + print(info_msg) def get_obj_hex_digest_from_store(self, pid_guid, obj_algo): """Given a pid and algorithm, get the hex digest of the object""" @@ -175,7 +178,7 @@ def get_obj_hex_digest_from_store(self, pid_guid, obj_algo): class MetacatDB: - """Adapter class to interact with Metacat's Postgres DB""" + """Class to interact with Metacat's Postgres DB""" def __init__(self, hashstore_path, hashstore): """Initialize credentials to access metacat pgdb.""" @@ -457,7 +460,7 @@ def _load_store_properties(hashstore_yaml): python_log_file_path = getattr(args, "store_path") + "/python_store.log" logging.basicConfig( filename=python_log_file_path, - level=logging.ERROR, + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", ) From d030035f3e2b949e0cdb66ab1b746a36a787a63e Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 7 Aug 2023 15:20:47 -0700 Subject: [PATCH 101/165] Refactor 'validate' method to use context manager to ensure streams are closed --- src/hashstore/client.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index af15d237..11a877e5 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -91,7 +91,7 @@ def store_to_hashstore_from_list(self, origin_dir, obj_type, num): logging.info(content) def retrieve_and_validate_from_hashstore(self, origin_dir, obj_type, num): - "Retrieve objects or metadata from a Hashstore and validate the content." + """Retrieve objects or metadata from a Hashstore and validate the content.""" checked_num_of_files = None # Check number of files to store if num is not None: @@ -113,7 +113,9 @@ def retrieve_and_validate_from_hashstore(self, origin_dir, obj_type, num): start_time = datetime.now() # Setup pool and processes - pool = multiprocessing.Pool() + # pool = multiprocessing.Pool() + num_processes = os.cpu_count() - 2 + pool = multiprocessing.Pool(processes=num_processes) if obj_type == "object": results = pool.map(self.validate, checked_obj_list) @@ -147,25 +149,25 @@ def validate(self, obj_tuple): pid_guid = obj_tuple[0] algo = obj_tuple[4] checksum = obj_tuple[3] - obj_stream = self.hashstore.retrieve_object(pid_guid) - digest = self.hashstore.computehash(obj_stream, algo) - obj_stream.close() + with self.hashstore.retrieve_object(pid_guid) as obj_stream: + digest = self.hashstore.computehash(obj_stream, algo) + obj_stream.close() # Check algorithm print(f"Validating pid: {pid_guid}") if digest != checksum: err_msg = ( f"Assertion Error for pid/guid: {pid_guid} -" - + f" Digest calcualted from stream ({digest}) does not match" - + f" checksum from metacata db: {checksum}" + + f" Digest calculated from stream ({digest}) does not match" + + f" checksum from metacat db: {checksum}" ) logging.error(err_msg) print(err_msg) else: info_msg = ( f"Checksums match for pid/guid: {pid_guid} -" - + f" Digest calcualted from stream: {digest}." - + f" Checksum from metacata db: {checksum}." + + f" Digest calculated from stream: {digest}." + + f" Checksum from metacat db: {checksum}." 
) print(info_msg) From 40a4a8f6922c7ebdd811fbf120caf67b6690f1a2 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 7 Aug 2023 15:28:56 -0700 Subject: [PATCH 102/165] Add additional print statements to assist with testing and debugging --- src/hashstore/client.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 11a877e5..01d02ea1 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -311,10 +311,12 @@ def refine_list_for_objects(self, metacat_obj_list, action): ) refined_object_list.append(store_object_tuple_item) if action == "retrieve": + print(f"Checking for pid: {pid_guid} in HashStore") if self.hashstore.exists( "objects", self.hashstore.get_sha256_hex_digest(pid_guid) ): refined_object_list.append(tuple_item) + print("Found! Adding to refined list") return refined_object_list From 16f4d0aacc06fab98b584884cf1df95f7004a755 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 7 Aug 2023 15:32:14 -0700 Subject: [PATCH 103/165] Remove explicit declaration of processes to use in multiprocessing.Pool() --- src/hashstore/client.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 01d02ea1..659e558f 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -113,9 +113,9 @@ def retrieve_and_validate_from_hashstore(self, origin_dir, obj_type, num): start_time = datetime.now() # Setup pool and processes - # pool = multiprocessing.Pool() - num_processes = os.cpu_count() - 2 - pool = multiprocessing.Pool(processes=num_processes) + pool = multiprocessing.Pool() + # num_processes = os.cpu_count() - 2 + # pool = multiprocessing.Pool(processes=num_processes) if obj_type == "object": results = pool.map(self.validate, checked_obj_list) From 3345473893b2a7fdb8105c22f9993599155c01a5 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 8 Aug 2023 09:40:22 -0700 Subject: [PATCH 104/165] Change multiprocessing process to 'spawn' to test resolving deadlocking when retrieving all files --- src/hashstore/client.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 659e558f..61b38c9d 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -113,12 +113,15 @@ def retrieve_and_validate_from_hashstore(self, origin_dir, obj_type, num): start_time = datetime.now() # Setup pool and processes - pool = multiprocessing.Pool() + # pool = multiprocessing.Pool() # num_processes = os.cpu_count() - 2 # pool = multiprocessing.Pool(processes=num_processes) - if obj_type == "object": - results = pool.map(self.validate, checked_obj_list) + # https://pythonspeed.com/articles/python-multiprocessing/ + multiprocessing.set_start_method("spawn") + with multiprocessing.get_context("spawn").Pool() as pool: + if obj_type == "object": + results = pool.map(self.validate, checked_obj_list) # if obj_type == "metadata": # TODO @@ -164,11 +167,7 @@ def validate(self, obj_tuple): logging.error(err_msg) print(err_msg) else: - info_msg = ( - f"Checksums match for pid/guid: {pid_guid} -" - + f" Digest calculated from stream: {digest}." - + f" Checksum from metacat db: {checksum}." - ) + info_msg = f"Checksums match for pid/guid: {pid_guid}!" 
print(info_msg) def get_obj_hex_digest_from_store(self, pid_guid, obj_algo): From 15b8e0ae48ee340c1bb9ad4321dcfe49b006f3c6 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 8 Aug 2023 10:41:41 -0700 Subject: [PATCH 105/165] Revert 'spawn' process (memory issues), reduce logging statements and call '.imap()' to start processing results earlier --- src/hashstore/client.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 61b38c9d..c35347a3 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -92,6 +92,7 @@ def store_to_hashstore_from_list(self, origin_dir, obj_type, num): def retrieve_and_validate_from_hashstore(self, origin_dir, obj_type, num): """Retrieve objects or metadata from a Hashstore and validate the content.""" + logging.info("HashStore Client - Begin retrieving and validating objects.") checked_num_of_files = None # Check number of files to store if num is not None: @@ -103,6 +104,7 @@ def retrieve_and_validate_from_hashstore(self, origin_dir, obj_type, num): ) # Get list of objects to store from metacat db + logging.info("HashStore Client - Refining object list for %s", obj_type) if obj_type == "object": checked_obj_list = self.metacatdb.refine_list_for_objects( metacat_obj_list, "retrieve" @@ -113,15 +115,11 @@ def retrieve_and_validate_from_hashstore(self, origin_dir, obj_type, num): start_time = datetime.now() # Setup pool and processes - # pool = multiprocessing.Pool() # num_processes = os.cpu_count() - 2 # pool = multiprocessing.Pool(processes=num_processes) - - # https://pythonspeed.com/articles/python-multiprocessing/ - multiprocessing.set_start_method("spawn") - with multiprocessing.get_context("spawn").Pool() as pool: - if obj_type == "object": - results = pool.map(self.validate, checked_obj_list) + pool = multiprocessing.Pool() + if obj_type == "object": + results = pool.imap(self.validate, checked_obj_list) # if obj_type == "metadata": # TODO @@ -152,12 +150,12 @@ def validate(self, obj_tuple): pid_guid = obj_tuple[0] algo = obj_tuple[4] checksum = obj_tuple[3] + + print(f"Validating pid: {pid_guid}") with self.hashstore.retrieve_object(pid_guid) as obj_stream: digest = self.hashstore.computehash(obj_stream, algo) obj_stream.close() - # Check algorithm - print(f"Validating pid: {pid_guid}") if digest != checksum: err_msg = ( f"Assertion Error for pid/guid: {pid_guid} -" From 33cf73797b029fd2b923e0afecceeb4dd7dbdfb3 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 11 Aug 2023 09:21:03 -0700 Subject: [PATCH 106/165] Refactor hashstore client --- src/hashstore/client.py | 284 +++++++++++++++++++++------------------- 1 file changed, 151 insertions(+), 133 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index c35347a3..45711253 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -4,13 +4,141 @@ from argparse import ArgumentParser from datetime import datetime import multiprocessing +from pathlib import Path import yaml import pg8000 from hashstore import HashStoreFactory +class HashStoreParser: + """Class to setup client arguments""" + + PROGRAM_NAME = "HashStore Command Line Client" + DESCRIPTION = ( + "A command-line tool to convert a directory of data objects" + + " into a hashstore and perform operations to store, retrieve," + + " and delete the objects." 
+ ) + EPILOG = "Created for DataONE (NCEAS)" + + parser = ArgumentParser( + prog=PROGRAM_NAME, + description=DESCRIPTION, + epilog=EPILOG, + ) + + def __init__(self): + """Initialize the argparse 'parser'.""" + + # Add positional argument + self.parser.add_argument("store_path", help="Path of the HashStore") + + # Add optional arguments + self.parser.add_argument( + "-chs", + dest="create_hashstore", + action="store_true", + help="Create a HashStore", + ) + self.parser.add_argument( + "-dp", "-store_depth", dest="depth", help="Depth of HashStore" + ) + self.parser.add_argument( + "-wp", "-store_width", dest="width", help="Width of HashStore" + ) + self.parser.add_argument( + "-ap", + "-store_algorithm", + dest="algorithm", + help="Algorithm to use when calculating object address", + ) + self.parser.add_argument( + "-nsp", + "-store_namespace", + dest="formatid", + help="Default metadata namespace for metadata", + ) + + # Testing related arguments + self.parser.add_argument( + "-cvd", + dest="convert_directory", + help="Directory of objects to convert to a HashStore", + ) + self.parser.add_argument( + "-cvt", + dest="convert_directory_type", + help="Type of directory to convert (ex. 'objects' or 'metadata')", + ) + self.parser.add_argument( + "-nobj", + dest="num_obj_to_convert", + help="Number of objects to convert", + ) + self.parser.add_argument( + "-rav", + dest="retrieve_and_validate", + action="store_true", + help="Retrieve and validate objects in HashStore", + ) + + # Individual API call related arguments + self.parser.add_argument( + "-pid", + dest="object_pid", + help="Pid/Guid of object to work with", + ) + self.parser.add_argument( + "-algo", + dest="object_algorithm", + help="Algorithm to work with", + ) + + def load_store_properties(self, hashstore_yaml): + """Get and return the contents of the current HashStore configuration. + + Returns: + hashstore_yaml_dict (dict): HashStore properties with the following keys (and values): + store_path (str): Path to the HashStore directory. + store_depth (int): Depth when sharding an object's hex digest. + store_width (int): Width of directories when sharding an object's hex digest. + store_algorithm (str): Hash algorithm used for calculating the object's hex digest. + store_metadata_namespace (str): Namespace for the HashStore's system metadata. + """ + property_required_keys = [ + "store_path", + "store_depth", + "store_width", + "store_algorithm", + "store_metadata_namespace", + ] + + if not os.path.exists(hashstore_yaml): + exception_string = ( + "HashStoreParser - load_store_properties: hashstore.yaml not found" + + " in store root path." 
+ ) + raise FileNotFoundError(exception_string) + # Open file + with open(hashstore_yaml, "r", encoding="utf-8") as file: + yaml_data = yaml.safe_load(file) + + # Get hashstore properties + hashstore_yaml_dict = {} + for key in property_required_keys: + checked_property = yaml_data[key] + if key == "store_depth" or key == "store_width": + checked_property = int(yaml_data[key]) + hashstore_yaml_dict[key] = checked_property + return hashstore_yaml_dict + + def get_parser_args(self): + """Get command line arguments""" + return self.parser.parse_args() + + class HashStoreClient: - """Create a HashStore Client to use through the command line.""" + """Create a HashStore to use through the command line.""" def __init__(self, properties): """Initialize HashStore and MetacatDB adapters""" @@ -20,9 +148,26 @@ def __init__(self, properties): module_name = "filehashstore" class_name = "FileHashStore" - # Class variables + # Instance attributes self.hashstore = factory.get_hashstore(module_name, class_name, properties) + + # Setup logging + python_log_file_path = getattr(args, "store_path") + "/python_hashstore.log" + if not os.path.exists(python_log_file_path): + Path(python_log_file_path).parent.mkdir(parents=True, exist_ok=True) + open(python_log_file_path, "w", encoding="utf-8").close() + + # Create log if it doesn't exist + logging.basicConfig( + filename=python_log_file_path, + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + + # Setup access to Metacat postgres db self.metacatdb = MetacatDB(properties["store_path"], self.hashstore) + logging.info("HashStoreClient - HashStore, Logger and MetacatDB initialized.") def store_to_hashstore_from_list(self, origin_dir, obj_type, num): """Store objects in a given directory into HashStore @@ -334,137 +479,10 @@ def refine_list_for_metadata(self, metacat_obj_list): return refined_metadta_list -def _add_client_optional_arguments(argp): - """Adds the optional arguments for the HashStore Client. - - Args: - argp (parser): argparse Parser object - - """ - argp.add_argument( - "-chs", - dest="create_hashstore", - action="store_true", - help="Create a HashStore", - ) - argp.add_argument("-dp", "-store_depth", dest="depth", help="Depth of HashStore") - argp.add_argument("-wp", "-store_width", dest="width", help="Width of HashStore") - argp.add_argument( - "-ap", - "-store_algorithm", - dest="algorithm", - help="Algorithm to use when calculating object address", - ) - argp.add_argument( - "-nsp", - "-store_namespace", - dest="formatid", - help="Default metadata namespace for metadata", - ) - - # Directory to convert into a HashStore - argp.add_argument( - "-cvd", - dest="convert_directory", - help="Directory of objects to convert to a HashStore", - ) - argp.add_argument( - "-cvt", - dest="convert_directory_type", - help="Type of directory to convert (ex. 'objects' or 'metadata')", - ) - argp.add_argument( - "-nobj", - dest="num_obj_to_convert", - help="Number of objects to convert", - ) - argp.add_argument( - "-rav", - dest="retrieve_and_validate", - action="store_true", - help="Retrieve and validate objects in HashStore", - ) - - # Individual API calls - argp.add_argument( - "-pid", - dest="object_pid", - help="Pid/Guid of object to work with", - ) - argp.add_argument( - "-algo", - dest="object_algorithm", - help="Algorithm to work with", - ) - - -def _load_store_properties(hashstore_yaml): - """Get and return the contents of the current HashStore configuration. 
- - Returns: - hashstore_yaml_dict (dict): HashStore properties with the following keys (and values): - store_path (str): Path to the HashStore directory. - store_depth (int): Depth when sharding an object's hex digest. - store_width (int): Width of directories when sharding an object's hex digest. - store_algorithm (str): Hash algorithm used for calculating the object's hex digest. - store_metadata_namespace (str): Namespace for the HashStore's system metadata. - """ - property_required_keys = [ - "store_path", - "store_depth", - "store_width", - "store_algorithm", - "store_metadata_namespace", - ] - - if not os.path.exists(hashstore_yaml): - exception_string = ( - "HashStore CLI Client - _load_store_properties: hashstore.yaml not found" - + " in store root path." - ) - raise FileNotFoundError(exception_string) - # Open file - with open(hashstore_yaml, "r", encoding="utf-8") as file: - yaml_data = yaml.safe_load(file) - - # Get hashstore properties - hashstore_yaml_dict = {} - for key in property_required_keys: - checked_property = yaml_data[key] - if key == "store_depth" or key == "store_width": - checked_property = int(yaml_data[key]) - hashstore_yaml_dict[key] = checked_property - return hashstore_yaml_dict - - if __name__ == "__main__": - PROGRAM_NAME = "HashStore Command Line Client" - DESCRIPTION = ( - "A command-line tool to convert a directory of data objects" - + " into a hashstore and perform operations to store, retrieve," - + " and delete the objects." - ) - EPILOG = "Created for DataONE (NCEAS)" - parser = ArgumentParser( - prog=PROGRAM_NAME, - description=DESCRIPTION, - epilog=EPILOG, - ) - ### Add Positional and Optional Arguments - parser.add_argument("store_path", help="Path of the HashStore") - _add_client_optional_arguments(parser) - - # Client entry point - args = parser.parse_args() - - ### Initialize Logging - python_log_file_path = getattr(args, "store_path") + "/python_store.log" - logging.basicConfig( - filename=python_log_file_path, - level=logging.INFO, - format="%(asctime)s - %(levelname)s - %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - ) + # Parse arguments + parser = HashStoreParser() + args = parser.get_parser_args() if getattr(args, "create_hashstore"): # Create HashStore if -chs flag is true in a given directory @@ -482,7 +500,7 @@ def _load_store_properties(hashstore_yaml): # Initialize HashStore store_path = getattr(args, "store_path") store_path_config_yaml = store_path + "/hashstore.yaml" - props = _load_store_properties(store_path_config_yaml) + props = parser.load_store_properties_(store_path_config_yaml) hs = HashStoreClient(props) if getattr(args, "convert_directory") is not None: From 71159da46543e5eacf490cae1a4a4a615571fa9e Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 11 Aug 2023 09:27:02 -0700 Subject: [PATCH 107/165] Fix typo when calling 'HashStoreParser' load_store_properties() --- src/hashstore/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 45711253..891448e5 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -500,7 +500,7 @@ def refine_list_for_metadata(self, metacat_obj_list): # Initialize HashStore store_path = getattr(args, "store_path") store_path_config_yaml = store_path + "/hashstore.yaml" - props = parser.load_store_properties_(store_path_config_yaml) + props = parser.load_store_properties(store_path_config_yaml) hs = HashStoreClient(props) if getattr(args, "convert_directory") is not None: From 
038a3cb9fecc029ebe848e31168e5c8882bccafd Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 11 Aug 2023 09:45:02 -0700 Subject: [PATCH 108/165] Refactor logging setup --- src/hashstore/client.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 891448e5..6fd5e707 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -151,20 +151,6 @@ def __init__(self, properties): # Instance attributes self.hashstore = factory.get_hashstore(module_name, class_name, properties) - # Setup logging - python_log_file_path = getattr(args, "store_path") + "/python_hashstore.log" - if not os.path.exists(python_log_file_path): - Path(python_log_file_path).parent.mkdir(parents=True, exist_ok=True) - open(python_log_file_path, "w", encoding="utf-8").close() - - # Create log if it doesn't exist - logging.basicConfig( - filename=python_log_file_path, - level=logging.INFO, - format="%(asctime)s - %(levelname)s - %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - ) - # Setup access to Metacat postgres db self.metacatdb = MetacatDB(properties["store_path"], self.hashstore) logging.info("HashStoreClient - HashStore, Logger and MetacatDB initialized.") @@ -484,6 +470,18 @@ def refine_list_for_metadata(self, metacat_obj_list): parser = HashStoreParser() args = parser.get_parser_args() + # Setup logging + python_log_file_path = getattr(args, "store_path") + "/logs/python_hashstore.log" + if not os.path.exists(python_log_file_path): + Path(python_log_file_path).parent.mkdir(parents=True, exist_ok=True) + open(python_log_file_path, "w", encoding="utf-8").close() + logging.basicConfig( + filename=python_log_file_path, + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + if getattr(args, "create_hashstore"): # Create HashStore if -chs flag is true in a given directory # Get store attributes, HashStore will validate properties From 2b0cd1f37937854dbb3b1b889e7f893ad2e23f75 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 11 Aug 2023 11:58:45 -0700 Subject: [PATCH 109/165] Add new optional argument to explicitly test with knbvm and revise logging setup --- src/hashstore/client.py | 67 ++++++++++++++++++++++++++++++++--------- 1 file changed, 52 insertions(+), 15 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 6fd5e707..ef7007e4 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -34,6 +34,12 @@ def __init__(self): self.parser.add_argument("store_path", help="Path of the HashStore") # Add optional arguments + self.parser.add_argument( + "-knbvm", + dest="knbvm_flag", + action="store_true", + help="Flag for testing with knbvm", + ) self.parser.add_argument( "-chs", dest="create_hashstore", @@ -136,12 +142,35 @@ def get_parser_args(self): """Get command line arguments""" return self.parser.parse_args() + def initialize_logging(self, hashstore_path): + """Initialize logging for HashStore client.""" + hashstore_py_log = hashstore_path + "/logs/python_hashstore.log" + python_log_file_path = Path(hashstore_py_log) + + if not os.path.exists(python_log_file_path): + python_log_file_path.parent.mkdir(parents=True, exist_ok=True) + open(python_log_file_path, "w", encoding="utf-8").close() + logging.basicConfig( + filename=python_log_file_path, + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + + for handler in logging.root.handlers[:]: + print(handler) + class 
HashStoreClient: """Create a HashStore to use through the command line.""" - def __init__(self, properties): - """Initialize HashStore and MetacatDB adapters""" + def __init__(self, properties, testflag=None): + """Initialize HashStore and MetacatDB + + Args: + properties: See FileHashStore for dictionary example + testflag (str): "knbvm" to initialize MetacatDB + """ factory = HashStoreFactory() # Get HashStore from factory @@ -150,10 +179,13 @@ def __init__(self, properties): # Instance attributes self.hashstore = factory.get_hashstore(module_name, class_name, properties) + # ClientLogger(properties["store_path"]) + logging.info("HashStoreClient - HashStore initialized.") # Setup access to Metacat postgres db - self.metacatdb = MetacatDB(properties["store_path"], self.hashstore) - logging.info("HashStoreClient - HashStore, Logger and MetacatDB initialized.") + if testflag: + self.metacatdb = MetacatDB(properties["store_path"], self.hashstore) + logging.info("HashStoreClient - MetacatDB initialized.") def store_to_hashstore_from_list(self, origin_dir, obj_type, num): """Store objects in a given directory into HashStore @@ -250,7 +282,7 @@ def retrieve_and_validate_from_hashstore(self, origin_dir, obj_type, num): # pool = multiprocessing.Pool(processes=num_processes) pool = multiprocessing.Pool() if obj_type == "object": - results = pool.imap(self.validate, checked_obj_list) + results = pool.imap(self.validate_object, checked_obj_list) # if obj_type == "metadata": # TODO @@ -276,8 +308,8 @@ def retrieve_and_validate_from_hashstore(self, origin_dir, obj_type, num): ) logging.info(content) - def validate(self, obj_tuple): - """Retrieve and validate a list of objects.""" + def validate_object(self, obj_tuple): + """Retrieves an object from HashStore and validates its checksum.""" pid_guid = obj_tuple[0] algo = obj_tuple[4] checksum = obj_tuple[3] @@ -465,15 +497,13 @@ def refine_list_for_metadata(self, metacat_obj_list): return refined_metadta_list -if __name__ == "__main__": - # Parse arguments - parser = HashStoreParser() - args = parser.get_parser_args() +def initialize_logging(hashstore_path): + """Initialize logging for HashStore client.""" + hashstore_py_log = hashstore_path + "/logs/python_hashstore.log" + python_log_file_path = Path(hashstore_py_log) - # Setup logging - python_log_file_path = getattr(args, "store_path") + "/logs/python_hashstore.log" if not os.path.exists(python_log_file_path): - Path(python_log_file_path).parent.mkdir(parents=True, exist_ok=True) + python_log_file_path.parent.mkdir(parents=True, exist_ok=True) open(python_log_file_path, "w", encoding="utf-8").close() logging.basicConfig( filename=python_log_file_path, @@ -482,6 +512,12 @@ def refine_list_for_metadata(self, metacat_obj_list): datefmt="%Y-%m-%d %H:%M:%S", ) + +if __name__ == "__main__": + # Parse arguments + parser = HashStoreParser() + args = parser.get_parser_args() + if getattr(args, "create_hashstore"): # Create HashStore if -chs flag is true in a given directory # Get store attributes, HashStore will validate properties @@ -499,7 +535,8 @@ def refine_list_for_metadata(self, metacat_obj_list): store_path = getattr(args, "store_path") store_path_config_yaml = store_path + "/hashstore.yaml" props = parser.load_store_properties(store_path_config_yaml) - hs = HashStoreClient(props) + hs = HashStoreClient(props, getattr(args, "knbvm_flag")) + initialize_logging(store_path) if getattr(args, "convert_directory") is not None: directory_to_convert = getattr(args, "convert_directory") From 
f2db018d39d9262d85a3a02fe155053cd5c0b044 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 11 Aug 2023 12:12:18 -0700 Subject: [PATCH 110/165] Refactor logging initialization process for python client --- src/hashstore/client.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index ef7007e4..493867cc 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -499,7 +499,7 @@ def refine_list_for_metadata(self, metacat_obj_list): def initialize_logging(hashstore_path): """Initialize logging for HashStore client.""" - hashstore_py_log = hashstore_path + "/logs/python_hashstore.log" + hashstore_py_log = hashstore_path + "/python_hashstore.log" python_log_file_path = Path(hashstore_py_log) if not os.path.exists(python_log_file_path): @@ -534,9 +534,15 @@ def initialize_logging(hashstore_path): # Initialize HashStore store_path = getattr(args, "store_path") store_path_config_yaml = store_path + "/hashstore.yaml" + if not os.path.exists(store_path_config_yaml): + raise FileNotFoundError( + f"Missing config file (hashstore.yaml) at store path: {store_path}." + + " HashStore must be initialized, use `--help` for more information." + ) + initialize_logging(getattr(args, "store_path")) + props = parser.load_store_properties(store_path_config_yaml) hs = HashStoreClient(props, getattr(args, "knbvm_flag")) - initialize_logging(store_path) if getattr(args, "convert_directory") is not None: directory_to_convert = getattr(args, "convert_directory") From 1d066908be48784a7b57d9ea482fe54264440554 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 11 Aug 2023 12:35:53 -0700 Subject: [PATCH 111/165] Refactor and clean up 'client.py' --- src/hashstore/client.py | 179 ++++++++++++++++------------------------ 1 file changed, 71 insertions(+), 108 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 493867cc..70dca887 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -89,6 +89,12 @@ def __init__(self): ) # Individual API call related arguments + self.parser.add_argument( + "-getchecksum", + dest="client_getchecksum", + action="store_true", + help="Flag to call ", + ) self.parser.add_argument( "-pid", dest="object_pid", @@ -142,24 +148,6 @@ def get_parser_args(self): """Get command line arguments""" return self.parser.parse_args() - def initialize_logging(self, hashstore_path): - """Initialize logging for HashStore client.""" - hashstore_py_log = hashstore_path + "/logs/python_hashstore.log" - python_log_file_path = Path(hashstore_py_log) - - if not os.path.exists(python_log_file_path): - python_log_file_path.parent.mkdir(parents=True, exist_ok=True) - open(python_log_file_path, "w", encoding="utf-8").close() - logging.basicConfig( - filename=python_log_file_path, - level=logging.INFO, - format="%(asctime)s - %(levelname)s - %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - ) - - for handler in logging.root.handlers[:]: - print(handler) - class HashStoreClient: """Create a HashStore to use through the command line.""" @@ -179,7 +167,6 @@ def __init__(self, properties, testflag=None): # Instance attributes self.hashstore = factory.get_hashstore(module_name, class_name, properties) - # ClientLogger(properties["store_path"]) logging.info("HashStoreClient - HashStore initialized.") # Setup access to Metacat postgres db @@ -314,7 +301,6 @@ def validate_object(self, obj_tuple): algo = obj_tuple[4] checksum = obj_tuple[3] - print(f"Validating pid: {pid_guid}") with 
self.hashstore.retrieve_object(pid_guid) as obj_stream: digest = self.hashstore.computehash(obj_stream, algo) obj_stream.close() @@ -327,9 +313,7 @@ def validate_object(self, obj_tuple): ) logging.error(err_msg) print(err_msg) - else: - info_msg = f"Checksums match for pid/guid: {pid_guid}!" - print(info_msg) + return def get_obj_hex_digest_from_store(self, pid_guid, obj_algo): """Given a pid and algorithm, get the hex digest of the object""" @@ -471,12 +455,10 @@ def refine_list_for_objects(self, metacat_obj_list, action): ) refined_object_list.append(store_object_tuple_item) if action == "retrieve": - print(f"Checking for pid: {pid_guid} in HashStore") if self.hashstore.exists( "objects", self.hashstore.get_sha256_hex_digest(pid_guid) ): refined_object_list.append(tuple_item) - print("Found! Adding to refined list") return refined_object_list @@ -497,22 +479,6 @@ def refine_list_for_metadata(self, metacat_obj_list): return refined_metadta_list -def initialize_logging(hashstore_path): - """Initialize logging for HashStore client.""" - hashstore_py_log = hashstore_path + "/python_hashstore.log" - python_log_file_path = Path(hashstore_py_log) - - if not os.path.exists(python_log_file_path): - python_log_file_path.parent.mkdir(parents=True, exist_ok=True) - open(python_log_file_path, "w", encoding="utf-8").close() - logging.basicConfig( - filename=python_log_file_path, - level=logging.INFO, - format="%(asctime)s - %(levelname)s - %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - ) - - if __name__ == "__main__": # Parse arguments parser = HashStoreParser() @@ -530,73 +496,70 @@ def initialize_logging(hashstore_path): } HashStoreClient(props) - else: - # Initialize HashStore - store_path = getattr(args, "store_path") - store_path_config_yaml = store_path + "/hashstore.yaml" - if not os.path.exists(store_path_config_yaml): - raise FileNotFoundError( - f"Missing config file (hashstore.yaml) at store path: {store_path}." - + " HashStore must be initialized, use `--help` for more information." - ) - initialize_logging(getattr(args, "store_path")) - - props = parser.load_store_properties(store_path_config_yaml) - hs = HashStoreClient(props, getattr(args, "knbvm_flag")) - - if getattr(args, "convert_directory") is not None: - directory_to_convert = getattr(args, "convert_directory") - # Check if the directory to convert exists - if os.path.exists(directory_to_convert): - # If -nobj is supplied, limit the objects we work with - number_of_objects_to_convert = getattr(args, "num_obj_to_convert") - # Determine if we are working with objects or metadata - directory_type = getattr(args, "convert_directory_type") - accepted_directory_types = ["object", "metadata"] - if directory_type not in accepted_directory_types: - raise ValueError( - "Directory `-cvt` cannot be empty, must be 'object' or 'metadata'." - + f" convert_directory_type: {directory_type}" - ) - # HashStore can only be called if a configuration file is present - if os.path.exists(store_path_config_yaml): - if getattr(args, "retrieve_and_validate"): - hs.retrieve_and_validate_from_hashstore( - directory_to_convert, - directory_type, - number_of_objects_to_convert, - ) - else: - hs.store_to_hashstore_from_list( - directory_to_convert, - directory_type, - number_of_objects_to_convert, - ) - else: - # If HashStore does not exist, raise exception - # Calling app must create HashStore first before calling methods - raise FileNotFoundError( - f"Missing config file (hashstore.yaml) at store path: {store_path}." 
- + " HashStore must be initialized, use `--help` for more information." - ) - else: - raise FileNotFoundError( - f"Directory to convert does not exist: {getattr(args, 'convert_directory')}." + # Client setup process + # Can't use client app without first initializing HashStore + store_path = getattr(args, "store_path") + store_path_config_yaml = store_path + "/hashstore.yaml" + if not os.path.exists(store_path_config_yaml): + raise FileNotFoundError( + f"Missing config file (hashstore.yaml) at store path: {store_path}." + + " HashStore must first be initialized, use `--help` for more information." + ) + # Setup logging + # Create log file if it doesn't already exist + hashstore_py_log = store_path + "/python_hashstore.log" + python_log_file_path = Path(hashstore_py_log) + if not os.path.exists(python_log_file_path): + python_log_file_path.parent.mkdir(parents=True, exist_ok=True) + open(python_log_file_path, "w", encoding="utf-8").close() + logging.basicConfig( + filename=python_log_file_path, + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + # Instantiate HashStore Client + props = parser.load_store_properties(store_path_config_yaml) + hs = HashStoreClient(props, getattr(args, "knbvm_flag")) + + # Client entry point + if getattr(args, "convert_directory") is not None: + directory_to_convert = getattr(args, "convert_directory") + # Check if the directory to convert exists + if os.path.exists(directory_to_convert): + # If -nobj is supplied, limit the objects we work with + number_of_objects_to_convert = getattr(args, "num_obj_to_convert") + # Determine if we are working with objects or metadata + directory_type = getattr(args, "convert_directory_type") + accepted_directory_types = ["object", "metadata"] + if directory_type not in accepted_directory_types: + raise ValueError( + "Directory `-cvt` cannot be empty, must be 'object' or 'metadata'." + + f" convert_directory_type: {directory_type}" + ) + if getattr(args, "retrieve_and_validate"): + hs.retrieve_and_validate_from_hashstore( + directory_to_convert, + directory_type, + number_of_objects_to_convert, ) - - elif ( - getattr(args, "object_pid") is not None - and getattr(args, "object_algorithm") is not None - ): - # Calculate the hex digest of a given pid with algorithm supplied - pid = getattr(args, "object_pid") - algorithm = getattr(args, "object_algorithm") - - if os.path.exists(store_path_config_yaml): - hs.get_obj_hex_digest_from_store(pid, algorithm) else: - # Calling app must initialize HashStore first before calling methods - raise FileNotFoundError( - f"Missing config file (hashstore.yaml) at store path: {store_path}." - + " HashStore must be initialized, use `--help` for more information." + hs.store_to_hashstore_from_list( + directory_to_convert, + directory_type, + number_of_objects_to_convert, ) + else: + raise FileNotFoundError( + f"Directory to convert does not exist: {getattr(args, 'convert_directory')}." 
+ ) + + elif ( + getattr(args, "client_getchecksum") + and getattr(args, "object_pid") is not None + and getattr(args, "object_algorithm") is not None + ): + # Calculate the hex digest of a given pid with algorithm supplied + pid = getattr(args, "object_pid") + algorithm = getattr(args, "object_algorithm") + hs.get_obj_hex_digest_from_store(pid, algorithm) From f62365198a89b6c7dbcca71d309fea87ef52adf8 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 11 Aug 2023 12:44:09 -0700 Subject: [PATCH 112/165] Remove redundant print statements --- src/hashstore/client.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 70dca887..660f7b4c 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -220,11 +220,9 @@ def store_to_hashstore_from_list(self, origin_dir, obj_type, num): # Log exceptions cleanup_msg = "Checking results and logging exceptions" - print(cleanup_msg) logging.info(cleanup_msg) for result in results: if isinstance(result, Exception): - print(result) logging.error(result) # Close the pool and wait for all processes to complete @@ -275,11 +273,9 @@ def retrieve_and_validate_from_hashstore(self, origin_dir, obj_type, num): # Log exceptions cleanup_msg = "Checking results and logging exceptions" - print(cleanup_msg) logging.info(cleanup_msg) for result in results: if isinstance(result, Exception): - print(result) logging.error(result) # Close the pool and wait for all processes to complete From f56271a70c54ec4ebec526a9e589746624447fbe Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 11 Aug 2023 13:22:04 -0700 Subject: [PATCH 113/165] Test multiprocessing pool.starmap() with try-except to log exceptions --- src/hashstore/client.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 660f7b4c..f2a5986b 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -212,10 +212,12 @@ def store_to_hashstore_from_list(self, origin_dir, obj_type, num): # Call 'obj_type' respective public API methods if obj_type == "object": - logging.info("Storing objects") - results = pool.starmap(self.hashstore.store_object, checked_obj_list) + try: + results = pool.starmap(self.hashstore.store_object, checked_obj_list) + # pylint: disable=W0718 + except Exception as pool_exception: + logging.error(pool_exception) if obj_type == "metadata": - logging.info("Storing metadata") results = pool.starmap(self.hashstore.store_metadata, checked_obj_list) # Log exceptions From 613172390ee8a1b4214eb5fad73b64d9877ed280 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 11 Aug 2023 14:03:44 -0700 Subject: [PATCH 114/165] Test removing default logging handlers to capture errors not logging --- src/hashstore/client.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index f2a5986b..1ce51659 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -212,15 +212,12 @@ def store_to_hashstore_from_list(self, origin_dir, obj_type, num): # Call 'obj_type' respective public API methods if obj_type == "object": - try: - results = pool.starmap(self.hashstore.store_object, checked_obj_list) - # pylint: disable=W0718 - except Exception as pool_exception: - logging.error(pool_exception) + results = pool.starmap(self.hashstore.store_object, checked_obj_list) if obj_type == "metadata": results = pool.starmap(self.hashstore.store_metadata, checked_obj_list) # Log exceptions + # TODO: 
This process does not properly get logged. cleanup_msg = "Checking results and logging exceptions" logging.info(cleanup_msg) for result in results: @@ -273,13 +270,6 @@ def retrieve_and_validate_from_hashstore(self, origin_dir, obj_type, num): # if obj_type == "metadata": # TODO - # Log exceptions - cleanup_msg = "Checking results and logging exceptions" - logging.info(cleanup_msg) - for result in results: - if isinstance(result, Exception): - logging.error(result) - # Close the pool and wait for all processes to complete pool.close() pool.join() @@ -505,6 +495,8 @@ def refine_list_for_metadata(self, metacat_obj_list): ) # Setup logging # Create log file if it doesn't already exist + for handler in logging.root.handlers[:]: + logging.root.removeHandler(handler) hashstore_py_log = store_path + "/python_hashstore.log" python_log_file_path = Path(hashstore_py_log) if not os.path.exists(python_log_file_path): From 8d5bd099d6941831df817ef2cd8e6466e5484b16 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 11 Aug 2023 14:14:03 -0700 Subject: [PATCH 115/165] Test exception capturing through creating a list from results --- src/hashstore/client.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 1ce51659..06522df4 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -220,9 +220,13 @@ def store_to_hashstore_from_list(self, origin_dir, obj_type, num): # TODO: This process does not properly get logged. cleanup_msg = "Checking results and logging exceptions" logging.info(cleanup_msg) + exception_list = [] for result in results: if isinstance(result, Exception): - logging.error(result) + exception_list.append(result) + # logging.error(result) + for exception in exception_list: + logging.error(exception) # Close the pool and wait for all processes to complete pool.close() @@ -495,8 +499,6 @@ def refine_list_for_metadata(self, metacat_obj_list): ) # Setup logging # Create log file if it doesn't already exist - for handler in logging.root.handlers[:]: - logging.root.removeHandler(handler) hashstore_py_log = store_path + "/python_hashstore.log" python_log_file_path = Path(hashstore_py_log) if not os.path.exists(python_log_file_path): From 270428fc34afd2f0e1e52bbc6b5b370747508278 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 11 Aug 2023 14:32:00 -0700 Subject: [PATCH 116/165] Remove cleanup process for logging exceptions from pool.starmap() and add TODO item to investigate --- src/hashstore/client.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 06522df4..a6679ae4 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -216,17 +216,7 @@ def store_to_hashstore_from_list(self, origin_dir, obj_type, num): if obj_type == "metadata": results = pool.starmap(self.hashstore.store_metadata, checked_obj_list) - # Log exceptions - # TODO: This process does not properly get logged. 
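
The experiments in patches 113 through 116 circle a point worth stating plainly: `Pool.starmap` gathers results eagerly and re-raises the first worker exception at the `.starmap()` call itself, so the returned list can never contain `Exception` instances and the `isinstance(result, Exception)` scan above can never fire. Wrapping the pool call in try/except (patch 113) sees only that first exception and loses the rest of the batch. Catching inside a wrapper and returning the exception restores per-item reporting; a minimal, runnable sketch (names are hypothetical, not the client's API):

    import logging
    import multiprocessing

    def fragile_store(pid, path):
        if path is None:
            raise ValueError(f"no path for pid: {pid}")
        return f"stored {pid}"

    def try_store(pid, path):
        try:
            return fragile_store(pid, path)
        except ValueError as err:
            return err  # keep the pool alive; report after the batch

    if __name__ == "__main__":
        work = [("a", "/tmp/a"), ("b", None), ("c", "/tmp/c")]
        with multiprocessing.Pool() as pool:
            results = pool.starmap(try_store, work)
        for result in results:
            if isinstance(result, Exception):
                logging.error(result)  # logs only the failure for pid "b"

This is essentially where patch 120 below lands with its `store_object_to_hashstore` wrapper.
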
- cleanup_msg = "Checking results and logging exceptions" - logging.info(cleanup_msg) - exception_list = [] - for result in results: - if isinstance(result, Exception): - exception_list.append(result) - # logging.error(result) - for exception in exception_list: - logging.error(exception) + # TODO: Log exceptions from starmap() # Close the pool and wait for all processes to complete pool.close() From ef909128573542519a0300682e8f7a63d5f34fa5 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 14 Aug 2023 09:01:04 -0700 Subject: [PATCH 117/165] Test capturing exceptions in starmap results iterator with while loop --- src/hashstore/client.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index a6679ae4..b8fb319a 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -217,6 +217,13 @@ def store_to_hashstore_from_list(self, origin_dir, obj_type, num): results = pool.starmap(self.hashstore.store_metadata, checked_obj_list) # TODO: Log exceptions from starmap() + while True: + try: + yield next(results) + except StopIteration: + break + except Exception as exception_in_iterator: + logging.error(exception_in_iterator) # Close the pool and wait for all processes to complete pool.close() From b4da5f124506238831b0edae6afe98a7ae956583 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 14 Aug 2023 11:55:31 -0700 Subject: [PATCH 118/165] Fix logging message typo --- src/hashstore/filehashstore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index a8092e9f..d3323b1d 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1122,7 +1122,7 @@ def _validate_object( "FileHashStore - _move_and_get_checksums: Hex digest and checksum" + f" do not match - file not stored for pid: {pid}. Algorithm:" + f" {checksum_algorithm}. Checksum provided: {checksum} !=" - + f"HexDigest: {hex_digest_stored}. Tmp file deleted." + + f" HexDigest: {hex_digest_stored}. Tmp file deleted." 
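
Two pitfalls hide in the while-loop experiment of patch 117 above. First, introducing `yield` anywhere in `store_to_hashstore_from_list` turns the whole method into a generator function, so calling it no longer executes the body at all until something iterates the returned generator; the stores would silently stop happening. Second, even if the body did run, `starmap` returns a plain list, and `next()` on a list raises `TypeError: 'list' object is not an iterator`; the drain-and-log pattern only fits a lazy iterator such as `imap`. A self-contained sketch of the working form:

    import logging
    import multiprocessing

    def worker(n):
        if n == 2:
            raise ValueError(f"bad item: {n}")
        return n * n

    if __name__ == "__main__":
        with multiprocessing.Pool() as pool:
            results = pool.imap(worker, [1, 2, 3])  # lazy, unlike starmap's list
            while True:
                try:
                    next(results)  # re-raises that item's worker exception
                except StopIteration:
                    break
                except ValueError as err:
                    logging.error(err)  # log the failure, keep draining
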
) logging.error(exception_string) raise ValueError(exception_string) From c3dac14ad154b6f75f7eb2d696df0707ef5d498b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 14 Aug 2023 12:02:04 -0700 Subject: [PATCH 119/165] Change logging level to warning, revert changes --- src/hashstore/client.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index b8fb319a..d94b3892 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -216,19 +216,12 @@ def store_to_hashstore_from_list(self, origin_dir, obj_type, num): if obj_type == "metadata": results = pool.starmap(self.hashstore.store_metadata, checked_obj_list) - # TODO: Log exceptions from starmap() - while True: - try: - yield next(results) - except StopIteration: - break - except Exception as exception_in_iterator: - logging.error(exception_in_iterator) - # Close the pool and wait for all processes to complete pool.close() pool.join() + # TODO: Log exceptions from starmap() + end_time = datetime.now() content = ( f"HashStoreClient (store_to_hashstore_from_list):\n" @@ -267,7 +260,7 @@ def retrieve_and_validate_from_hashstore(self, origin_dir, obj_type, num): # pool = multiprocessing.Pool(processes=num_processes) pool = multiprocessing.Pool() if obj_type == "object": - results = pool.imap(self.validate_object, checked_obj_list) + results = pool.map(self.validate_object, checked_obj_list) # if obj_type == "metadata": # TODO @@ -503,7 +496,7 @@ def refine_list_for_metadata(self, metacat_obj_list): open(python_log_file_path, "w", encoding="utf-8").close() logging.basicConfig( filename=python_log_file_path, - level=logging.INFO, + level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", ) From b3a04f717b23989358e309fff25a848bdea486a7 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 14 Aug 2023 14:13:46 -0700 Subject: [PATCH 120/165] Update SQL query to retrieve knbvm obj list with 'ORDER BY' statement, refactor storing objects to HashStore from 'starmap' to 'imap' with try-except wrapper when calling 'store_objet' --- src/hashstore/client.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index d94b3892..b5e651f8 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -211,8 +211,11 @@ def store_to_hashstore_from_list(self, origin_dir, obj_type, num): pool = multiprocessing.Pool() # Call 'obj_type' respective public API methods + info_msg = f"HashStoreClient - Request to Store {len(checked_obj_list)} Objs" + logging.info(info_msg) if obj_type == "object": - results = pool.starmap(self.hashstore.store_object, checked_obj_list) + # results = pool.starmap(self.hashstore.store_object, checked_obj_list) + results = pool.imap(self.store_object_to_hashstore, checked_obj_list) if obj_type == "metadata": results = pool.starmap(self.hashstore.store_metadata, checked_obj_list) @@ -220,8 +223,6 @@ def store_to_hashstore_from_list(self, origin_dir, obj_type, num): pool.close() pool.join() - # TODO: Log exceptions from starmap() - end_time = datetime.now() content = ( f"HashStoreClient (store_to_hashstore_from_list):\n" @@ -231,6 +232,13 @@ def store_to_hashstore_from_list(self, origin_dir, obj_type, num): ) logging.info(content) + def store_object_to_hashstore(self, obj_tuple): + """Store an object to HashStore and log exceptions as warning.""" + try: + self.hashstore.store_object(*obj_tuple) + except Exception as 
so_exception: + logging.warning(so_exception) + def retrieve_and_validate_from_hashstore(self, origin_dir, obj_type, num): """Retrieve objects or metadata from a Hashstore and validate the content.""" logging.info("HashStore Client - Begin retrieving and validating objects.") @@ -370,7 +378,7 @@ def get_object_metadata_list(self, origin_directory, num): query = f"""SELECT identifier.guid, identifier.docid, identifier.rev, systemmetadata.object_format, systemmetadata.checksum, systemmetadata.checksum_algorithm FROM identifier INNER JOIN systemmetadata - ON identifier.guid = systemmetadata.guid{limit_query};""" + ON identifier.guid = systemmetadata.guid ORDER BY identifier.guid{limit_query};""" cursor.execute(query) # Fetch all rows from the result set From fd8c27c1d8a95bc5e91e10c3265090ea49a7d13b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 15 Aug 2023 10:10:34 -0700 Subject: [PATCH 121/165] Add new optional arguments and code to facilitate public api calls through the client app --- src/hashstore/client.py | 134 ++++++++++++++++++++++++++++++++++------ 1 file changed, 116 insertions(+), 18 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index b5e651f8..cc5ebe73 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -89,22 +89,72 @@ def __init__(self): ) # Individual API call related arguments - self.parser.add_argument( - "-getchecksum", - dest="client_getchecksum", - action="store_true", - help="Flag to call ", - ) self.parser.add_argument( "-pid", dest="object_pid", help="Pid/Guid of object to work with", ) + self.parser.add_argument( + "-path", + dest="object_path", + help="Path of the data or metadata object", + ) self.parser.add_argument( "-algo", dest="object_algorithm", help="Algorithm to work with", ) + self.parser.add_argument( + "-checksum", + dest="object_checksum", + help="Checksum of data object to validate", + ) + self.parser.add_argument( + "-checksum_algo", + dest="object_checksum_algorithm", + help="Algorithm of checksum to validate", + ) + self.parser.add_argument( + "-checksum_algo", + dest="object_checksum_algorithm", + help="Size of data object to validate", + ) + self.parser.add_argument( + "-formatid", + dest="object_formatid", + help="Format/namespace of the metadata", + ) + # Public API Flags + self.parser.add_argument( + "-getchecksum", + dest="client_getchecksum", + action="store_true", + help="Flag to get the hex digest of a data object in HashStore", + ) + self.parser.add_argument( + "-storeobject", + dest="client_storeobject", + action="store_true", + help="Flag to store an object to a HashStore", + ) + self.parser.add_argument( + "-storemetadata", + dest="client_storemetadata", + action="store_true", + help="Flag to store a metadata document to a HashStore", + ) + self.parser.add_argument( + "-deleteobject", + dest="client_deleteobject", + action="store_true", + help="Flag to delete on object from a HashStore", + ) + self.parser.add_argument( + "-deletemetadata", + dest="client_deletemetadata", + action="store_true", + help="Flag to dlete a metadata document from a HashStore", + ) def load_store_properties(self, hashstore_yaml): """Get and return the contents of the current HashStore configuration. 
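
One wrinkle in the hunk above: `-checksum_algo` is registered twice (the second copy's help text, "Size of data object to validate", suggests it was meant to be a size option, and patch 125 below renames it to `-obj_size`). argparse rejects duplicate option strings at parser-construction time, before any arguments are parsed, so the client would fail on startup. A short demonstration:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-checksum_algo", dest="object_checksum_algorithm")
    try:
        parser.add_argument("-checksum_algo", dest="object_checksum_algorithm")
    except argparse.ArgumentError as err:
        print(err)  # argument -checksum_algo: conflicting option string: -checksum_algo
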
@@ -174,6 +224,8 @@ def __init__(self, properties, testflag=None): self.metacatdb = MetacatDB(properties["store_path"], self.hashstore) logging.info("HashStoreClient - MetacatDB initialized.") + # Methods relating to testing HashStore with knbvm (test.arcticdata.io) + def store_to_hashstore_from_list(self, origin_dir, obj_type, num): """Store objects in a given directory into HashStore @@ -289,29 +341,22 @@ def validate_object(self, obj_tuple): """Retrieves an object from HashStore and validates its checksum.""" pid_guid = obj_tuple[0] algo = obj_tuple[4] - checksum = obj_tuple[3] + obj_db_checksum = obj_tuple[3] with self.hashstore.retrieve_object(pid_guid) as obj_stream: digest = self.hashstore.computehash(obj_stream, algo) obj_stream.close() - if digest != checksum: + if digest != obj_db_checksum: err_msg = ( f"Assertion Error for pid/guid: {pid_guid} -" + f" Digest calculated from stream ({digest}) does not match" - + f" checksum from metacat db: {checksum}" + + f" checksum from metacat db: {obj_db_checksum}" ) logging.error(err_msg) print(err_msg) return - def get_obj_hex_digest_from_store(self, pid_guid, obj_algo): - """Given a pid and algorithm, get the hex digest of the object""" - digest = self.hashstore.get_hex_digest(pid, algorithm) - print(f"guid/pid: {pid_guid}") - print(f"algorithm: {obj_algo}") - print(f"digest: {digest}") - class MetacatDB: """Class to interact with Metacat's Postgres DB""" @@ -543,7 +588,7 @@ def refine_list_for_metadata(self, metacat_obj_list): raise FileNotFoundError( f"Directory to convert does not exist: {getattr(args, 'convert_directory')}." ) - + # Get hex digest of an object elif ( getattr(args, "client_getchecksum") and getattr(args, "object_pid") is not None @@ -552,4 +597,57 @@ def refine_list_for_metadata(self, metacat_obj_list): # Calculate the hex digest of a given pid with algorithm supplied pid = getattr(args, "object_pid") algorithm = getattr(args, "object_algorithm") - hs.get_obj_hex_digest_from_store(pid, algorithm) + digest = hs.hashstore.get_hex_digest(pid, algorithm) + print(f"guid/pid: {pid}") + print(f"algorithm: {algorithm}") + print(f"Checksum/Hex Digest: {digest}") + + elif ( + getattr(args, "client_storeobject") + and getattr(args, "object_pid") is not None + and getattr(args, "object_path") is not None + ): + # Store object to HashStore + pid = getattr(args, "object_pid") + path = getattr(args, "object_path") + algorithm = getattr(args, "object_algorithm") + checksum = getattr(args, "checksum") + checksum_algorithm = getattr(args, "checksum_algo") + size = getattr(args, "object_size") + object_metadata = hs.hashstore.store_object( + pid, path, algorithm, checksum, checksum_algorithm, size + ) + print(f"Object Metadata:\n{object_metadata}") + + elif ( + getattr(args, "client_metadata") + and getattr(args, "object_pid") is not None + and getattr(args, "object_path") is not None + ): + # Store metadata to HashStore + pid = getattr(args, "object_pid") + path = getattr(args, "object_path") + formatid = getattr(args, "object_formatid") + metadata_cid = hs.hashstore.store_metadata(pid, path, formatid) + print(f"Metadata ID: {metadata_cid}") + + elif ( + getattr(args, "client_deleteobject") and getattr(args, "object_pid") is not None + ): + # Delete object from HashStore + pid = getattr(args, "object_pid") + delete_status = hs.hashstore.delete_object(pid) + if delete_status: + print("Object for pid: {pid} has been deleted.") + + elif ( + getattr(args, "client_deletemetadata") + and getattr(args, "object_pid") is not None + and 
getattr(args, "object_formatid") is not None + ): + # Delete metadata from HashStore + pid = getattr(args, "object_pid") + formatid = getattr(args, "object_formatid") + delete_status = hs.hashstore.delete_metadata(pid, formatid) + if delete_status: + print("Metadata for pid: {pid} with formatid: {formatid} has been deleted.") From b6de3772833337b6de6fdaba682fc89acaa6faa1 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 15 Aug 2023 10:28:37 -0700 Subject: [PATCH 122/165] Clean up code --- src/hashstore/client.py | 57 ++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 32 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index cc5ebe73..0053b3a3 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -267,9 +267,8 @@ def store_to_hashstore_from_list(self, origin_dir, obj_type, num): logging.info(info_msg) if obj_type == "object": # results = pool.starmap(self.hashstore.store_object, checked_obj_list) - results = pool.imap(self.store_object_to_hashstore, checked_obj_list) - if obj_type == "metadata": - results = pool.starmap(self.hashstore.store_metadata, checked_obj_list) + pool.imap(self.try_store_object, checked_obj_list) + # TODO: if obj_type == "metadata": # Close the pool and wait for all processes to complete pool.close() @@ -284,10 +283,10 @@ def store_to_hashstore_from_list(self, origin_dir, obj_type, num): ) logging.info(content) - def store_object_to_hashstore(self, obj_tuple): + def try_store_object(self, obj_tuple): """Store an object to HashStore and log exceptions as warning.""" try: - self.hashstore.store_object(*obj_tuple) + return self.hashstore.store_object(*obj_tuple) except Exception as so_exception: logging.warning(so_exception) @@ -320,9 +319,8 @@ def retrieve_and_validate_from_hashstore(self, origin_dir, obj_type, num): # pool = multiprocessing.Pool(processes=num_processes) pool = multiprocessing.Pool() if obj_type == "object": - results = pool.map(self.validate_object, checked_obj_list) - # if obj_type == "metadata": - # TODO + pool.map(self.validate_object, checked_obj_list) + # TODO: if obj_type == "metadata": # Close the pool and wait for all processes to complete pool.close() @@ -344,17 +342,18 @@ def validate_object(self, obj_tuple): obj_db_checksum = obj_tuple[3] with self.hashstore.retrieve_object(pid_guid) as obj_stream: - digest = self.hashstore.computehash(obj_stream, algo) + computed_digest = self.hashstore.computehash(obj_stream, algo) obj_stream.close() if digest != obj_db_checksum: err_msg = ( f"Assertion Error for pid/guid: {pid_guid} -" - + f" Digest calculated from stream ({digest}) does not match" + + f" Digest calculated from stream ({computed_digest}) does not match" + f" checksum from metacat db: {obj_db_checksum}" ) logging.error(err_msg) print(err_msg) + return @@ -437,14 +436,14 @@ def get_object_metadata_list(self, origin_directory, num): pid_guid = row[0] metadatapath_docid_rev = origin_directory + "/" + row[1] + "." 
+ str(row[2]) metadata_namespace = row[3] - checksum = row[4] - checksum_algorithm = row[5] + row_checksum = row[4] + row_checksum_algorithm = row[5] tuple_item = ( pid_guid, metadatapath_docid_rev, metadata_namespace, - checksum, - checksum_algorithm, + row_checksum, + row_checksum_algorithm, ) object_metadata_list.append(tuple_item) @@ -471,8 +470,8 @@ def refine_list_for_objects(self, metacat_obj_list, action): for tuple_item in metacat_obj_list: pid_guid = tuple_item[0] filepath_docid_rev = tuple_item[1] - checksum = tuple_item[3] - checksum_algorithm = tuple_item[4] + item_checksum = tuple_item[3] + item_checksum_algorithm = tuple_item[4] if os.path.exists(filepath_docid_rev): if action == "store": # If the file has already been stored, skip it @@ -485,8 +484,8 @@ def refine_list_for_objects(self, metacat_obj_list, action): pid_guid, filepath_docid_rev, None, - checksum, - checksum_algorithm, + item_checksum, + item_checksum_algorithm, ) refined_object_list.append(store_object_tuple_item) if action == "retrieve": @@ -557,7 +556,7 @@ def refine_list_for_metadata(self, metacat_obj_list): props = parser.load_store_properties(store_path_config_yaml) hs = HashStoreClient(props, getattr(args, "knbvm_flag")) - # Client entry point + # HashStore client entry point if getattr(args, "convert_directory") is not None: directory_to_convert = getattr(args, "convert_directory") # Check if the directory to convert exists @@ -588,64 +587,58 @@ def refine_list_for_metadata(self, metacat_obj_list): raise FileNotFoundError( f"Directory to convert does not exist: {getattr(args, 'convert_directory')}." ) - # Get hex digest of an object + # Calculate the hex digest of a given pid with algorithm supplied elif ( getattr(args, "client_getchecksum") and getattr(args, "object_pid") is not None and getattr(args, "object_algorithm") is not None ): - # Calculate the hex digest of a given pid with algorithm supplied pid = getattr(args, "object_pid") algorithm = getattr(args, "object_algorithm") digest = hs.hashstore.get_hex_digest(pid, algorithm) print(f"guid/pid: {pid}") print(f"algorithm: {algorithm}") print(f"Checksum/Hex Digest: {digest}") - + # Store object to HashStore elif ( getattr(args, "client_storeobject") and getattr(args, "object_pid") is not None and getattr(args, "object_path") is not None ): - # Store object to HashStore pid = getattr(args, "object_pid") path = getattr(args, "object_path") algorithm = getattr(args, "object_algorithm") checksum = getattr(args, "checksum") checksum_algorithm = getattr(args, "checksum_algo") size = getattr(args, "object_size") - object_metadata = hs.hashstore.store_object( - pid, path, algorithm, checksum, checksum_algorithm, size - ) + object_info_tuple = (pid, path, algorithm, checksum, checksum_algorithm, size) + object_metadata = hs.hashstore.store_object(*object_info_tuple) print(f"Object Metadata:\n{object_metadata}") - + # Store metadata to HashStore elif ( getattr(args, "client_metadata") and getattr(args, "object_pid") is not None and getattr(args, "object_path") is not None ): - # Store metadata to HashStore pid = getattr(args, "object_pid") path = getattr(args, "object_path") formatid = getattr(args, "object_formatid") metadata_cid = hs.hashstore.store_metadata(pid, path, formatid) print(f"Metadata ID: {metadata_cid}") - + # Delete object from HashStore elif ( getattr(args, "client_deleteobject") and getattr(args, "object_pid") is not None ): - # Delete object from HashStore pid = getattr(args, "object_pid") delete_status = hs.hashstore.delete_object(pid) if 
delete_status: print("Object for pid: {pid} has been deleted.") - + # Delete metadata from HashStore elif ( getattr(args, "client_deletemetadata") and getattr(args, "object_pid") is not None and getattr(args, "object_formatid") is not None ): - # Delete metadata from HashStore pid = getattr(args, "object_pid") formatid = getattr(args, "object_formatid") delete_status = hs.hashstore.delete_metadata(pid, formatid) From 13163777975ce4430bae81b3d2509449e78a1892 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 15 Aug 2023 10:45:50 -0700 Subject: [PATCH 123/165] Add client method to delete objects from HashStore with list retrieved from metacat db, revise documentation and further clean up code. --- src/hashstore/client.py | 110 ++++++++++++++++++++++++++++++++++------ 1 file changed, 95 insertions(+), 15 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 0053b3a3..d614b200 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -232,7 +232,6 @@ def store_to_hashstore_from_list(self, origin_dir, obj_type, num): Args: origin_dir (str): Directory to convert obj_type (str): 'object' or 'metadata' - config_yaml (str): Path to HashStore config file `hashstore.yaml` num (int): Number of files to store """ # Get list of files from directory @@ -258,8 +257,6 @@ def store_to_hashstore_from_list(self, origin_dir, obj_type, num): start_time = datetime.now() # Setup pool and processes - # num_processes = os.cpu_count() - 2 - # pool = multiprocessing.Pool(processes=num_processes) pool = multiprocessing.Pool() # Call 'obj_type' respective public API methods @@ -284,14 +281,24 @@ def store_to_hashstore_from_list(self, origin_dir, obj_type, num): logging.info(content) def try_store_object(self, obj_tuple): - """Store an object to HashStore and log exceptions as warning.""" + """Store an object to HashStore and log exceptions as warning. + + Args: + obj_tuple: See HashStore store_object signature for details. + """ try: return self.hashstore.store_object(*obj_tuple) except Exception as so_exception: logging.warning(so_exception) def retrieve_and_validate_from_hashstore(self, origin_dir, obj_type, num): - """Retrieve objects or metadata from a Hashstore and validate the content.""" + """Retrieve objects or metadata from a Hashstore and validate the content. + + Args: + origin_dir (str): Directory to convert + obj_type (str): 'object' or 'metadata' + num (int): Number of files to store + """ logging.info("HashStore Client - Begin retrieving and validating objects.") checked_num_of_files = None # Check number of files to store @@ -315,8 +322,6 @@ def retrieve_and_validate_from_hashstore(self, origin_dir, obj_type, num): start_time = datetime.now() # Setup pool and processes - # num_processes = os.cpu_count() - 2 - # pool = multiprocessing.Pool(processes=num_processes) pool = multiprocessing.Pool() if obj_type == "object": pool.map(self.validate_object, checked_obj_list) @@ -336,10 +341,14 @@ def retrieve_and_validate_from_hashstore(self, origin_dir, obj_type, num): logging.info(content) def validate_object(self, obj_tuple): - """Retrieves an object from HashStore and validates its checksum.""" + """Retrieves an object from HashStore and validates its checksum. 
+ + Args: + obj_tuple: pid_guid, obj_checksum_algo, obj_checksum + """ pid_guid = obj_tuple[0] - algo = obj_tuple[4] - obj_db_checksum = obj_tuple[3] + algo = obj_tuple[1] + obj_db_checksum = obj_tuple[2] with self.hashstore.retrieve_object(pid_guid) as obj_stream: computed_digest = self.hashstore.computehash(obj_stream, algo) @@ -356,6 +365,66 @@ def validate_object(self, obj_tuple): return + def delete_objects_from_list(self, origin_dir, obj_type, num): + """Store objects in a given directory into HashStore + Args: + origin_dir (str): Directory to convert + obj_type (str): 'object' or 'metadata' + num (int): Number of files to store + """ + # Get list of files from directory + file_list = os.listdir(origin_dir) + checked_num_of_files = len(file_list) + # Check number of files to store + if num is not None: + checked_num_of_files = int(num) + + # Object and Metadata list + metacat_obj_list = self.metacatdb.get_object_metadata_list( + origin_dir, checked_num_of_files + ) + + # Get list of objects to store from metacat db + if obj_type == "object": + checked_obj_list = self.metacatdb.refine_list_for_objects( + metacat_obj_list, "delete" + ) + if obj_type == "metadata": + checked_obj_list = self.metacatdb.refine_list_for_metadata(metacat_obj_list) + + start_time = datetime.now() + + # Setup pool and processes + pool = multiprocessing.Pool() + + # Call 'obj_type' respective public API methods + info_msg = f"HashStoreClient - Request to delete {len(checked_obj_list)} Objs" + logging.info(info_msg) + if obj_type == "object": + # results = pool.starmap(self.hashstore.store_object, checked_obj_list) + pool.imap(self.try_delete_object, checked_obj_list) + # TODO: if obj_type == "metadata": + + # Close the pool and wait for all processes to complete + pool.close() + pool.join() + + end_time = datetime.now() + content = ( + f"HashStoreClient (delete_objects_from_list):\n" + f"Start Time: {start_time}\nEnd Time: {end_time}\n" + + f"Total Time to Store {len(checked_obj_list)} {obj_type}" + + f" Objects: {end_time - start_time}\n" + ) + logging.info(content) + + def try_delete_object(self, obj_pid): + """Delete an object to HashStore and log exceptions as warning.""" + try: + return self.hashstore.delete_object(obj_pid) + except Exception as do_exception: + logging.warning(do_exception) + class MetacatDB: """Class to interact with Metacat's Postgres DB""" @@ -370,6 +439,7 @@ def __init__(self, hashstore_path, hashstore): "db_name", ] + # Note, 'pgdb.yaml' config file must be manually created for security pgyaml_path = hashstore_path + "/pgdb.yaml" if not os.path.exists(pgyaml_path): exception_string = ( @@ -390,7 +460,7 @@ def __init__(self, hashstore_path, hashstore): self.db_yaml_dict[key] = checked_property def get_object_metadata_list(self, origin_directory, num): - """Query the metacat db for the full obj and metadata list. + """Query the metacat db for the full obj and metadata list and order by guid. Args: origin_directory (string): 'var/metacat/data' or 'var/metacat/documents' @@ -457,14 +527,14 @@ def refine_list_for_objects(self, metacat_obj_list, action): """Refine a list of objects by checking for file existence and removing duplicates. Args: - store (HashStore): HashStore object metacat_obj_list (List): List of tuple objects representing rows from metacat db action (string): "store" or "retrieve". "store" will create a list of objects to store that do not exist in HashStore. - "retrieve" will create a list of objects that exist in HashStore. 
+ "retrieve" will create a list of objects (tuples) that exist in HashStore. + "delete" will create a list of object pids Returns: - refine_list (List): List of tuple objects based on "action" + refined_object_list (List): List of tuple objects based on "action" """ refined_object_list = [] for tuple_item in metacat_obj_list: @@ -492,7 +562,17 @@ def refine_list_for_objects(self, metacat_obj_list, action): if self.hashstore.exists( "objects", self.hashstore.get_sha256_hex_digest(pid_guid) ): - refined_object_list.append(tuple_item) + retrieve_object_tuple_item = ( + pid_guid, + item_checksum_algorithm, + item_checksum, + ) + refined_object_list.append(retrieve_object_tuple_item) + if action == "delete": + if self.hashstore.exists( + "objects", self.hashstore.get_sha256_hex_digest(pid_guid) + ): + refined_object_list.append(pid_guid) return refined_object_list From bae69fea80b2c5469c2b7d165da0b654b1a0ecc6 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 15 Aug 2023 11:02:11 -0700 Subject: [PATCH 124/165] Refactor and add missing optional arguments for deleting objects from a metacat db list --- src/hashstore/client.py | 45 +++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index d614b200..15b40a0b 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -67,26 +67,38 @@ def __init__(self): # Testing related arguments self.parser.add_argument( - "-cvd", - dest="convert_directory", - help="Directory of objects to convert to a HashStore", + "-sdir", + dest="source_directory", + help="Source directory of objects to work with", ) self.parser.add_argument( - "-cvt", - dest="convert_directory_type", - help="Type of directory to convert (ex. 'objects' or 'metadata')", + "-stype", + dest="source_directory_type", + help="Source directory type (ex. 'objects' or 'metadata')", ) self.parser.add_argument( "-nobj", dest="num_obj_to_convert", help="Number of objects to convert", ) + self.parser.add_argument( + "-sts", + dest="store_to_hashstore", + action="store_true", + help="Retrieve and validate objects in HashStore", + ) self.parser.add_argument( "-rav", dest="retrieve_and_validate", action="store_true", help="Retrieve and validate objects in HashStore", ) + self.parser.add_argument( + "-dfs", + dest="delete_from_hashstore", + action="store_true", + help="Retrieve and validate objects in HashStore", + ) # Individual API call related arguments self.parser.add_argument( @@ -637,19 +649,25 @@ def refine_list_for_metadata(self, metacat_obj_list): hs = HashStoreClient(props, getattr(args, "knbvm_flag")) # HashStore client entry point - if getattr(args, "convert_directory") is not None: - directory_to_convert = getattr(args, "convert_directory") + if getattr(args, "source_directory") is not None: + directory_to_convert = getattr(args, "source_directory") # Check if the directory to convert exists if os.path.exists(directory_to_convert): # If -nobj is supplied, limit the objects we work with number_of_objects_to_convert = getattr(args, "num_obj_to_convert") # Determine if we are working with objects or metadata - directory_type = getattr(args, "convert_directory_type") + directory_type = getattr(args, "source_directory_type") accepted_directory_types = ["object", "metadata"] if directory_type not in accepted_directory_types: raise ValueError( - "Directory `-cvt` cannot be empty, must be 'object' or 'metadata'." 
- + f" convert_directory_type: {directory_type}" + "Directory `-stype` cannot be empty, must be 'object' or 'metadata'." + + f" source_directory_type: {directory_type}" + ) + if getattr(args, "store_to_hashstore"): + hs.store_to_hashstore_from_list( + directory_to_convert, + directory_type, + number_of_objects_to_convert, ) if getattr(args, "retrieve_and_validate"): hs.retrieve_and_validate_from_hashstore( @@ -657,12 +675,13 @@ def refine_list_for_metadata(self, metacat_obj_list): directory_type, number_of_objects_to_convert, ) - else: - hs.store_to_hashstore_from_list( + if getattr(args, "delete_from_hashstore"): + hs.delete_objects_from_list( directory_to_convert, directory_type, number_of_objects_to_convert, ) + else: raise FileNotFoundError( f"Directory to convert does not exist: {getattr(args, 'convert_directory')}." From 693e2dc605ba1864993c5710a032b53a5fdaef98 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 15 Aug 2023 11:04:22 -0700 Subject: [PATCH 125/165] Fix bug with argument name --- src/hashstore/client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 15b40a0b..8bca2ac2 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -127,8 +127,8 @@ def __init__(self): help="Algorithm of checksum to validate", ) self.parser.add_argument( - "-checksum_algo", - dest="object_checksum_algorithm", + "-obj_size", + dest="object_size", help="Size of data object to validate", ) self.parser.add_argument( From 190c0244f4a18d04b4763a0d2d12047edad72e45 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 17 Aug 2023 10:41:13 -0700 Subject: [PATCH 126/165] Refactor client entry point into main method, clean up comments and add new optional arguments --- src/hashstore/client.py | 170 ++++++++++++++++++++++------------------ 1 file changed, 93 insertions(+), 77 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 8bca2ac2..7356a2f4 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -11,11 +11,11 @@ class HashStoreParser: - """Class to setup client arguments""" + """Class to setup parsing arguments via argparse.""" PROGRAM_NAME = "HashStore Command Line Client" DESCRIPTION = ( - "A command-line tool to convert a directory of data objects" + "Command-line tool to convert a directory of data objects" + " into a hashstore and perform operations to store, retrieve," + " and delete the objects." 
) @@ -40,6 +40,11 @@ def __init__(self): action="store_true", help="Flag for testing with knbvm", ) + self.parser.add_argument( + "-loglevel", + dest="logging_level", + help="Set logging level for the client", + ) self.parser.add_argument( "-chs", dest="create_hashstore", @@ -65,7 +70,7 @@ def __init__(self): help="Default metadata namespace for metadata", ) - # Testing related arguments + # KNBVM testing related arguments self.parser.add_argument( "-sdir", dest="source_directory", @@ -85,22 +90,22 @@ def __init__(self): "-sts", dest="store_to_hashstore", action="store_true", - help="Retrieve and validate objects in HashStore", + help="Store objects into a HashStore", ) self.parser.add_argument( "-rav", dest="retrieve_and_validate", action="store_true", - help="Retrieve and validate objects in HashStore", + help="Retrieve and validate objects in a HashStore", ) self.parser.add_argument( "-dfs", dest="delete_from_hashstore", action="store_true", - help="Retrieve and validate objects in HashStore", + help="Delete objects in a HashStore", ) - # Individual API call related arguments + # Individual API call related optional arguments self.parser.add_argument( "-pid", dest="object_pid", @@ -136,7 +141,8 @@ def __init__(self): dest="object_formatid", help="Format/namespace of the metadata", ) - # Public API Flags + + # Public API optional arguments self.parser.add_argument( "-getchecksum", dest="client_getchecksum", @@ -155,6 +161,18 @@ def __init__(self): action="store_true", help="Flag to store a metadata document to a HashStore", ) + self.parser.add_argument( + "-retrieveobject", + dest="client_retrieveobject", + action="store_true", + help="Flag to retrieve an object from a HashStore", + ) + self.parser.add_argument( + "-retrievemetadata", + dest="client_retrievemetadata", + action="store_true", + help="Flag to retrieve a metadata document from a HashStore", + ) self.parser.add_argument( "-deleteobject", dest="client_deleteobject", @@ -169,7 +187,7 @@ def __init__(self): ) def load_store_properties(self, hashstore_yaml): - """Get and return the contents of the current HashStore configuration. + """Get and return the contents of the current HashStore config file. Returns: hashstore_yaml_dict (dict): HashStore properties with the following keys (and values): @@ -207,7 +225,7 @@ def load_store_properties(self, hashstore_yaml): return hashstore_yaml_dict def get_parser_args(self): - """Get command line arguments""" + """Get command line arguments.""" return self.parser.parse_args() @@ -300,8 +318,9 @@ def try_store_object(self, obj_tuple): """ try: return self.hashstore.store_object(*obj_tuple) + # pylint: disable=W0718 except Exception as so_exception: - logging.warning(so_exception) + print(so_exception) def retrieve_and_validate_from_hashstore(self, origin_dir, obj_type, num): """Retrieve objects or metadata from a Hashstore and validate the content. 
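
The new `-loglevel` option above is eventually handed to `logging.basicConfig(level=...)` as a raw string (see the entry-point hunk below). That works for exact upper-case names such as "INFO" or "WARNING", but any other casing or a typo raises `ValueError: Unknown level` at startup. A small defensive mapping is cheap; a sketch, with `resolve_level` as a hypothetical helper:

    import logging

    def resolve_level(name, default=logging.INFO):
        """Map a user-supplied level name to a logging level, tolerantly."""
        if name is None:
            return default
        level = logging.getLevelName(name.upper())  # maps "DEBUG" -> 10
        return level if isinstance(level, int) else default

    assert resolve_level("debug") == logging.DEBUG
    assert resolve_level(None) == logging.INFO
    assert resolve_level("not-a-level") == logging.INFO
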
@@ -366,7 +385,7 @@ def validate_object(self, obj_tuple): computed_digest = self.hashstore.computehash(obj_stream, algo) obj_stream.close() - if digest != obj_db_checksum: + if computed_digest != obj_db_checksum: err_msg = ( f"Assertion Error for pid/guid: {pid_guid} -" + f" Digest calculated from stream ({computed_digest}) does not match" @@ -434,8 +453,9 @@ def try_delete_object(self, obj_pid): """Delete an object to HashStore and log exceptions as warning.""" try: return self.hashstore.delete_object(obj_pid) + # pylint: disable=W0718 except Exception as do_exception: - logging.warning(do_exception) + print(do_exception) class MetacatDB: @@ -540,7 +560,7 @@ def refine_list_for_objects(self, metacat_obj_list, action): Args: metacat_obj_list (List): List of tuple objects representing rows from metacat db - action (string): "store" or "retrieve". + action (string): "store", "retrieve" or "delete". "store" will create a list of objects to store that do not exist in HashStore. "retrieve" will create a list of objects (tuples) that exist in HashStore. "delete" will create a list of object pids @@ -605,11 +625,12 @@ def refine_list_for_metadata(self, metacat_obj_list): return refined_metadta_list -if __name__ == "__main__": - # Parse arguments +def main(): + """Main function of the HashStore client.""" parser = HashStoreParser() args = parser.get_parser_args() + # Client setup process if getattr(args, "create_hashstore"): # Create HashStore if -chs flag is true in a given directory # Get store attributes, HashStore will validate properties @@ -621,8 +642,6 @@ def refine_list_for_metadata(self, metacat_obj_list): "store_metadata_namespace": getattr(args, "formatid"), } HashStoreClient(props) - - # Client setup process # Can't use client app without first initializing HashStore store_path = getattr(args, "store_path") store_path_config_yaml = store_path + "/hashstore.yaml" @@ -631,25 +650,38 @@ def refine_list_for_metadata(self, metacat_obj_list): f"Missing config file (hashstore.yaml) at store path: {store_path}." + " HashStore must first be initialized, use `--help` for more information." 
) - # Setup logging - # Create log file if it doesn't already exist - hashstore_py_log = store_path + "/python_hashstore.log" + # Setup logging, create log file if it doesn't already exist + hashstore_py_log = store_path + "/python_client.log" python_log_file_path = Path(hashstore_py_log) if not os.path.exists(python_log_file_path): python_log_file_path.parent.mkdir(parents=True, exist_ok=True) open(python_log_file_path, "w", encoding="utf-8").close() + # Check for logging level + logging_level_arg = getattr(args, "logging_level") + if logging_level_arg is None: + logging_level = "INFO" + else: + logging_level = logging_level_arg logging.basicConfig( filename=python_log_file_path, - level=logging.WARNING, + level=logging_level, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", ) - # Instantiate HashStore Client - props = parser.load_store_properties(store_path_config_yaml) - hs = HashStoreClient(props, getattr(args, "knbvm_flag")) # HashStore client entry point - if getattr(args, "source_directory") is not None: + pid = getattr(args, "object_pid") + path = getattr(args, "object_path") + algorithm = getattr(args, "object_algorithm") + checksum = getattr(args, "object_checksum") + checksum_algorithm = getattr(args, "object_checksum_algorithm") + size = getattr(args, "object_size") + formatid = getattr(args, "object_formatid") + knbvm_test = getattr(args, "knbvm_flag") + # Instantiate HashStore Client + props = parser.load_store_properties(store_path_config_yaml) + hashstore_c = HashStoreClient(props, knbvm_test) + if knbvm_test: directory_to_convert = getattr(args, "source_directory") # Check if the directory to convert exists if os.path.exists(directory_to_convert): @@ -664,82 +696,66 @@ def refine_list_for_metadata(self, metacat_obj_list): + f" source_directory_type: {directory_type}" ) if getattr(args, "store_to_hashstore"): - hs.store_to_hashstore_from_list( + hashstore_c.store_to_hashstore_from_list( directory_to_convert, directory_type, number_of_objects_to_convert, ) if getattr(args, "retrieve_and_validate"): - hs.retrieve_and_validate_from_hashstore( + hashstore_c.retrieve_and_validate_from_hashstore( directory_to_convert, directory_type, number_of_objects_to_convert, ) if getattr(args, "delete_from_hashstore"): - hs.delete_objects_from_list( + hashstore_c.delete_objects_from_list( directory_to_convert, directory_type, number_of_objects_to_convert, ) - else: raise FileNotFoundError( - f"Directory to convert does not exist: {getattr(args, 'convert_directory')}." + f"Directory to convert is None or does not exist: {directory_to_convert}." 
) - # Calculate the hex digest of a given pid with algorithm supplied elif ( getattr(args, "client_getchecksum") - and getattr(args, "object_pid") is not None - and getattr(args, "object_algorithm") is not None + and pid is not None + and algorithm is not None ): - pid = getattr(args, "object_pid") - algorithm = getattr(args, "object_algorithm") - digest = hs.hashstore.get_hex_digest(pid, algorithm) + # Calculate the hex digest of a given pid with algorithm supplied + digest = hashstore_c.hashstore.get_hex_digest(pid, algorithm) print(f"guid/pid: {pid}") print(f"algorithm: {algorithm}") print(f"Checksum/Hex Digest: {digest}") - # Store object to HashStore - elif ( - getattr(args, "client_storeobject") - and getattr(args, "object_pid") is not None - and getattr(args, "object_path") is not None - ): - pid = getattr(args, "object_pid") - path = getattr(args, "object_path") - algorithm = getattr(args, "object_algorithm") - checksum = getattr(args, "checksum") - checksum_algorithm = getattr(args, "checksum_algo") - size = getattr(args, "object_size") - object_info_tuple = (pid, path, algorithm, checksum, checksum_algorithm, size) - object_metadata = hs.hashstore.store_object(*object_info_tuple) + + elif getattr(args, "client_storeobject") and pid is not None and path is not None: + # Store object to HashStore + object_metadata = hashstore_c.hashstore.store_object( + pid, path, algorithm, checksum, checksum_algorithm, size + ) print(f"Object Metadata:\n{object_metadata}") - # Store metadata to HashStore - elif ( - getattr(args, "client_metadata") - and getattr(args, "object_pid") is not None - and getattr(args, "object_path") is not None - ): - pid = getattr(args, "object_pid") - path = getattr(args, "object_path") - formatid = getattr(args, "object_formatid") - metadata_cid = hs.hashstore.store_metadata(pid, path, formatid) + + elif getattr(args, "client_storemetadata") and pid is not None and path is not None: + # Store metadata to HashStore + metadata_cid = hashstore_c.hashstore.store_metadata(pid, path, formatid) print(f"Metadata ID: {metadata_cid}") - # Delete object from HashStore - elif ( - getattr(args, "client_deleteobject") and getattr(args, "object_pid") is not None - ): - pid = getattr(args, "object_pid") - delete_status = hs.hashstore.delete_object(pid) - if delete_status: - print("Object for pid: {pid} has been deleted.") - # Delete metadata from HashStore + + elif getattr(args, "client_deleteobject") and pid is not None: + # Delete object from HashStore + delete_status = hashstore_c.hashstore.delete_object(pid) + print(f"Object Deleted (T/F): {delete_status}") + elif ( getattr(args, "client_deletemetadata") - and getattr(args, "object_pid") is not None - and getattr(args, "object_formatid") is not None + and pid is not None + and formatid is not None ): - pid = getattr(args, "object_pid") - formatid = getattr(args, "object_formatid") - delete_status = hs.hashstore.delete_metadata(pid, formatid) - if delete_status: - print("Metadata for pid: {pid} with formatid: {formatid} has been deleted.") + # Delete metadata from HashStore + delete_status = hashstore_c.hashstore.delete_metadata(pid, formatid) + print( + f"Metadata for pid: {pid} & formatid: {formatid}\nDeleted (T/F): {delete_status}" + ) + + +if __name__ == "__main__": + main() From 99acf8dfd1780056f47f6be51c1b887cfbcc8b20 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 17 Aug 2023 11:24:09 -0700 Subject: [PATCH 127/165] Improve debug messaging for 'write_properties()' in FileHashStore class --- src/hashstore/filehashstore.py 
| 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index d3323b1d..4871c731 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -196,8 +196,8 @@ def write_properties(self, properties): checked_store_algoritm = store_algorithm else: exception_string = ( - "FileHashStore - write_properties: algorithm supplied cannot" - + " be used as default for HashStore. Must be one of:" + f"FileHashStore - write_properties: algorithm supplied ({store_algorithm})" + + " cannot be used as default for HashStore. Must be one of:" + " MD5, SHA-1, SHA-256, SHA-384, SHA-512 which are DataONE" + " controlled algorithm values" ) From b391768d604a4ac7202765cb1a8de2b8c8e7d8a9 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 17 Aug 2023 11:33:48 -0700 Subject: [PATCH 128/165] Add new test class for python client --- tests/test_hashstore_client.py | 44 ++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 tests/test_hashstore_client.py diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py new file mode 100644 index 00000000..d14c0543 --- /dev/null +++ b/tests/test_hashstore_client.py @@ -0,0 +1,44 @@ +"""Test module for the Python client (Public API calls only)""" +import sys +import os +from pathlib import Path +from hashstore import client + + +def test_create_hashstore_via_client(tmp_path): + """Test creating a HashStore through client app.""" + client_directory = os.getcwd() + "/src/hashstore" + client_module_path = f"{client_directory}/client.py" + client_test_store = f"{tmp_path}/clienths" + create_hashstore_flag = "-chs" + store_depth = "-dp=3" + store_width = "-wp=2" + store_algorithm = "-ap=SHA-256" + store_namespace = "-nsp=http://www.ns.test/v1" + + # Add file path of HashStore to sys so modules can be discovered + sys.path.append(client_directory) + + chs_args = [ + client_module_path, + client_test_store, + create_hashstore_flag, + store_depth, + store_width, + store_algorithm, + store_namespace, + ] + + # Manually change sys args to simulate command line arguments + sys.argv = chs_args + + client.main() + + hashstore_yaml = Path(client_test_store + "/hashstore.yaml") + hashstore_object_path = Path(client_test_store + "/objects") + hashstore_metadata_path = Path(client_test_store + "/metadata") + hashstore_client_python_log = Path(client_test_store + "/python_client.log") + assert os.path.exists(hashstore_yaml) + assert os.path.exists(hashstore_object_path) + assert os.path.exists(hashstore_metadata_path) + assert os.path.exists(hashstore_client_python_log) From aca9e414c0abe469c66ca62ffa8ea75360b33329 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 17 Aug 2023 15:06:41 -0700 Subject: [PATCH 129/165] Add new client test and refactor 'HashStoreParser' init process --- src/hashstore/client.py | 29 +++++++++++----------- tests/test_hashstore_client.py | 45 +++++++++++++++++++++++++++++----- 2 files changed, 54 insertions(+), 20 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 7356a2f4..103228ce 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -13,23 +13,23 @@ class HashStoreParser: """Class to setup parsing arguments via argparse.""" - PROGRAM_NAME = "HashStore Command Line Client" - DESCRIPTION = ( - "Command-line tool to convert a directory of data objects" - + " into a hashstore and perform operations to store, retrieve," - + " and delete the objects." 
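
Patch 129's parser move, straddling this point in the diff, swaps class attributes for instance state. With the class-attribute form, `ArgumentParser()` executes once at import time and every `HashStoreParser` shares the same parser object; built in `__init__`, each instance gets a fresh one. Plausibly that matters here because the new tests in this same commit construct the client repeatedly in one process. A compact illustration of the difference:

    from argparse import ArgumentParser

    class SharedParser:
        parser = ArgumentParser()  # built once, at class-definition time

    class FreshParser:
        def __init__(self):
            self.parser = ArgumentParser()  # built per instance

    assert SharedParser().parser is SharedParser().parser
    assert FreshParser().parser is not FreshParser().parser
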
- ) - EPILOG = "Created for DataONE (NCEAS)" - - parser = ArgumentParser( - prog=PROGRAM_NAME, - description=DESCRIPTION, - epilog=EPILOG, - ) - def __init__(self): """Initialize the argparse 'parser'.""" + program_name = "HashStore Command Line Client" + description = ( + "Command-line tool to convert a directory of data objects" + + " into a hashstore and perform operations to store, retrieve," + + " and delete the objects." + ) + epilog = "Created for DataONE (NCEAS)" + + self.parser = ArgumentParser( + prog=program_name, + description=description, + epilog=epilog, + ) + # Add positional argument self.parser.add_argument("store_path", help="Path of the HashStore") @@ -627,6 +627,7 @@ def refine_list_for_metadata(self, metacat_obj_list): def main(): """Main function of the HashStore client.""" + parser = HashStoreParser() args = parser.get_parser_args() diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index d14c0543..2541b7b3 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -1,12 +1,13 @@ """Test module for the Python client (Public API calls only)""" +import multiprocessing import sys import os from pathlib import Path from hashstore import client -def test_create_hashstore_via_client(tmp_path): - """Test creating a HashStore through client app.""" +def test_create_hashstore(tmp_path): + """Test creating a HashStore through the client.""" client_directory = os.getcwd() + "/src/hashstore" client_module_path = f"{client_directory}/client.py" client_test_store = f"{tmp_path}/clienths" @@ -15,10 +16,6 @@ def test_create_hashstore_via_client(tmp_path): store_width = "-wp=2" store_algorithm = "-ap=SHA-256" store_namespace = "-nsp=http://www.ns.test/v1" - - # Add file path of HashStore to sys so modules can be discovered - sys.path.append(client_directory) - chs_args = [ client_module_path, client_test_store, @@ -29,6 +26,9 @@ def test_create_hashstore_via_client(tmp_path): store_namespace, ] + # Add file path of HashStore to sys so modules can be discovered + sys.path.append(client_directory) + # Manually change sys args to simulate command line arguments sys.argv = chs_args @@ -42,3 +42,36 @@ def test_create_hashstore_via_client(tmp_path): assert os.path.exists(hashstore_object_path) assert os.path.exists(hashstore_metadata_path) assert os.path.exists(hashstore_client_python_log) + + +def test_store_object_two(store): + """Test storing an object to HashStore through client app.""" + client_directory = os.getcwd() + "/src/hashstore" + client_module_path = f"{client_directory}/client.py" + test_dir = "tests/testdata/" + test_store = store.root + store_object_flag = "-storeobject" + pid = "jtao.1700.1" + client_pid_arg = f"-pid={pid}" + path = f'-path={test_dir + pid.replace("/", "_")}' + chs_args = [ + client_module_path, + test_store, + store_object_flag, + client_pid_arg, + path, + ] + + # Add file path of HashStore to sys so modules can be discovered + sys.path.append(client_directory) + + # Manually change sys args to simulate command line arguments + sys.argv = chs_args + + client.main() + + pid_sharded_path = ( + "a8/24/19/25740d5dcd719596639e780e0a090c9d55a5d0372b0eaf55ed711d4edf" + ) + expected_pid_abs_path = Path(test_store + f"/objects/{pid_sharded_path}") + assert os.path.exists(expected_pid_abs_path) From e85f487eedd9ec7479ffbf71add50b7f5bf67a99 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 17 Aug 2023 16:53:10 -0700 Subject: [PATCH 130/165] Add new tests for storing & deleting objects and metadata through client 
app --- tests/test_hashstore_client.py | 141 ++++++++++++++++++++++++++------- 1 file changed, 113 insertions(+), 28 deletions(-) diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index 2541b7b3..270236af 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -1,5 +1,4 @@ """Test module for the Python client (Public API calls only)""" -import multiprocessing import sys import os from pathlib import Path @@ -28,10 +27,8 @@ def test_create_hashstore(tmp_path): # Add file path of HashStore to sys so modules can be discovered sys.path.append(client_directory) - # Manually change sys args to simulate command line arguments sys.argv = chs_args - client.main() hashstore_yaml = Path(client_test_store + "/hashstore.yaml") @@ -44,34 +41,122 @@ def test_create_hashstore(tmp_path): assert os.path.exists(hashstore_client_python_log) -def test_store_object_two(store): - """Test storing an object to HashStore through client app.""" +def test_store_object(store, pids): + """Test storing objects to HashStore through client.""" client_directory = os.getcwd() + "/src/hashstore" - client_module_path = f"{client_directory}/client.py" test_dir = "tests/testdata/" - test_store = store.root - store_object_flag = "-storeobject" - pid = "jtao.1700.1" - client_pid_arg = f"-pid={pid}" - path = f'-path={test_dir + pid.replace("/", "_")}' - chs_args = [ - client_module_path, - test_store, - store_object_flag, - client_pid_arg, - path, - ] + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + client_module_path = f"{client_directory}/client.py" + test_store = store.root + store_object_flag = "-storeobject" + client_pid_arg = f"-pid={pid}" + path = f'-path={test_dir + pid.replace("/", "_")}' + chs_args = [ + client_module_path, + test_store, + store_object_flag, + client_pid_arg, + path, + ] - # Add file path of HashStore to sys so modules can be discovered - sys.path.append(client_directory) + # Add file path of HashStore to sys so modules can be discovered + sys.path.append(client_directory) + # Manually change sys args to simulate command line arguments + sys.argv = chs_args + client.main() - # Manually change sys args to simulate command line arguments - sys.argv = chs_args + assert store.exists("objects", pids[pid]["object_cid"]) - client.main() - pid_sharded_path = ( - "a8/24/19/25740d5dcd719596639e780e0a090c9d55a5d0372b0eaf55ed711d4edf" - ) - expected_pid_abs_path = Path(test_store + f"/objects/{pid_sharded_path}") - assert os.path.exists(expected_pid_abs_path) +def test_store_metadata(store, pids): + """Test storing metadata to HashStore through client.""" + client_directory = os.getcwd() + "/src/hashstore" + test_dir = "tests/testdata/" + namespace = "http://ns.dataone.org/service/types/v2.0" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + client_module_path = f"{client_directory}/client.py" + test_store = store.root + store_metadata_flag = "-storemetadata" + client_pid_arg = f"-pid={pid}" + path = f"-path={syspath}" + format_id = f"-formatid={namespace}" + chs_args = [ + client_module_path, + test_store, + store_metadata_flag, + client_pid_arg, + path, + format_id, + ] + + # Add file path of HashStore to sys so modules can be discovered + sys.path.append(client_directory) + # Manually change sys args to simulate command line arguments + sys.argv = chs_args + client.main() + + assert store.exists("metadata", pids[pid]["metadata_cid"]) + + 
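+# The delete tests below seed the store through the Public API first, then
+# drive client.main() with simulated sys.argv to exercise the matching
+# "-delete..." flag, asserting afterwards that the file is gone.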
+def test_delete_objects(pids, store): + """Test deleting objects from a HashStore through client.""" + client_directory = os.getcwd() + "/src/hashstore" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + _object_metadata = store.store_object(pid, path) + + client_module_path = f"{client_directory}/client.py" + test_store = store.root + delete_object_flag = "-deleteobject" + client_pid_arg = f"-pid={pid}" + chs_args = [ + client_module_path, + test_store, + delete_object_flag, + client_pid_arg, + ] + + # Add file path of HashStore to sys so modules can be discovered + sys.path.append(client_directory) + # Manually change sys args to simulate command line arguments + sys.argv = chs_args + client.main() + + assert not store.exists("objects", pids[pid]["object_cid"]) + + +def test_delete_metadata(pids, store): + """Test deleting metadata from a HashStore through client.""" + client_directory = os.getcwd() + "/src/hashstore" + test_dir = "tests/testdata/" + namespace = "http://ns.dataone.org/service/types/v2.0" + for pid in pids.keys(): + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + _metadata_cid = store.store_metadata(pid, syspath, namespace) + + client_module_path = f"{client_directory}/client.py" + test_store = store.root + delete_metadata_flag = "-deletemetadata" + client_pid_arg = f"-pid={pid}" + format_id = f"-formatid={namespace}" + chs_args = [ + client_module_path, + test_store, + delete_metadata_flag, + client_pid_arg, + format_id, + ] + + # Add file path of HashStore to sys so modules can be discovered + sys.path.append(client_directory) + # Manually change sys args to simulate command line arguments + sys.argv = chs_args + client.main() + + assert not store.exists("metadata", pids[pid]["metadata_cid"]) From d76fccae0afeb9abaea7d0122f193adf8273f073 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 17 Aug 2023 17:09:07 -0700 Subject: [PATCH 131/165] Update README.md for how to use client app --- README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/README.md b/README.md index 48a81833..98ceb227 100644 --- a/README.md +++ b/README.md @@ -70,6 +70,24 @@ format_id = "http://custom.metadata.com/json/type/v1.0" metadata_cid = my_store.store_metadata(pid, metadata, format_id) ``` +How to use HashStore client (command line app) +``` +# Step 1: Create a HashStore +> python './src/hashstore/client.py' /path/to/store/ -chs -dp=3 -wp=2 -ap="SHA-256" -nsp="http://www.ns.test/v1" + +# Store a data object +> python './src/hashstore/client.py' /path/to/store/ "-storeobject" -pid=content_identifier -path=/path/to/object + +# Store a metadata object +> python './src/hashstore/client.py' /path/to/store/ "-storemetadata" -pid=content_identifier -path=/path/to/metadata/object -formatid=http://ns.dataone.org/service/types/v2.0 + +# Delete a data object +> python './src/hashstore/client.py' /path/to/store/ "-deleteobject" -pid=content_identifier + +# Delete a metadata file +> python './src/hashstore/client.py' /path/to/store/ "-deleteobject" -pid=content_identifier -formatid=http://ns.dataone.org/service/types/v2.0 +``` + ## License ``` Copyright [2022] [Regents of the University of California] From 64bbddd7b7922782ba6bd985e888fbb9e263abe9 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 17 Aug 2023 17:56:50 -0700 Subject: [PATCH 132/165] Add client code to retrieve a data or metadata object from a Hashstore and display first 1000 bytes --- src/hashstore/client.py | 20 ++++++++++++++++++++ 1 
file changed, 20 insertions(+) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 103228ce..bd564f3e 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -741,6 +741,26 @@ def main(): metadata_cid = hashstore_c.hashstore.store_metadata(pid, path, formatid) print(f"Metadata ID: {metadata_cid}") + elif getattr(args, "client_retrieveobject") and pid is not None: + # Retrieve object from HashStore and display the first 1000 bytes + object_stream = hashstore_c.hashstore.retrieve_object(pid) + object_content = object_stream.read(1000).decode("utf-8") + object_stream.close() + print(f"Preview of object (pid: {pid}) content:") + print(object_content) + + elif ( + getattr(args, "client_retrievemetadata") + and pid is not None + and formatid is not None + ): + # Retrieve metadata from HashStore and display the first 1000 bytes + metadata_stream = hashstore_c.hashstore.retrieve_metadata(pid, formatid) + metadata_content = metadata_stream.read(1000).decode("utf-8") + metadata_stream.close() + print(f"Preview of metadata (pid: {pid}) content:") + print(metadata_content) + elif getattr(args, "client_deleteobject") and pid is not None: # Delete object from HashStore delete_status = hashstore_c.hashstore.delete_object(pid) From e7c998d0ed46577bab329fd43dd51ce4891c892b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 17 Aug 2023 17:58:34 -0700 Subject: [PATCH 133/165] Update README.md for how to retrieve a data or metadata object from a HashStore --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 98ceb227..97972c97 100644 --- a/README.md +++ b/README.md @@ -81,6 +81,12 @@ How to use HashStore client (command line app) # Store a metadata object > python './src/hashstore/client.py' /path/to/store/ "-storemetadata" -pid=content_identifier -path=/path/to/metadata/object -formatid=http://ns.dataone.org/service/types/v2.0 +# Retrieve a data object +> python './src/hashstore/client.py' /path/to/store/ "-retrieveobject" -pid=content_identifier + +# Retrieve a metadata object +> python './src/hashstore/client.py' /path/to/store/ "-retrieveobject" -pid=content_identifier -formatid=http://ns.dataone.org/service/types/v2.0 + # Delete a data object > python './src/hashstore/client.py' /path/to/store/ "-deleteobject" -pid=content_identifier From cb3102ac7016811e28112a6b12d8e0117700640b Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 17 Aug 2023 18:01:45 -0700 Subject: [PATCH 134/165] Fix typos in README.md and update for getting a data object's checksum through client app --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 97972c97..bc4ff70a 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,9 @@ How to use HashStore client (command line app) # Step 1: Create a HashStore > python './src/hashstore/client.py' /path/to/store/ -chs -dp=3 -wp=2 -ap="SHA-256" -nsp="http://www.ns.test/v1" +# Get the checksum of a data object +> python './src/hashstore/client.py' /path/to/store/ "-getchecksum" -pid=content_identifier -algorithm=SHA-256 + # Store a data object > python './src/hashstore/client.py' /path/to/store/ "-storeobject" -pid=content_identifier -path=/path/to/object @@ -85,13 +88,13 @@ How to use HashStore client (command line app) > python './src/hashstore/client.py' /path/to/store/ "-retrieveobject" -pid=content_identifier # Retrieve a metadata object -> python './src/hashstore/client.py' /path/to/store/ "-retrieveobject" -pid=content_identifier 
-formatid=http://ns.dataone.org/service/types/v2.0 +> python './src/hashstore/client.py' /path/to/store/ "-retrievemetadata" -pid=content_identifier -formatid=http://ns.dataone.org/service/types/v2.0 # Delete a data object > python './src/hashstore/client.py' /path/to/store/ "-deleteobject" -pid=content_identifier # Delete a metadata file -> python './src/hashstore/client.py' /path/to/store/ "-deleteobject" -pid=content_identifier -formatid=http://ns.dataone.org/service/types/v2.0 +> python './src/hashstore/client.py' /path/to/store/ "-deletemetadata" -pid=content_identifier -formatid=http://ns.dataone.org/service/types/v2.0 ``` ## License From 09f9806f2a1ef70e133afe3e03d6efc29f87de0d Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 18 Aug 2023 08:50:04 -0700 Subject: [PATCH 135/165] Fix typo in README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bc4ff70a..018e3b87 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ metadata_cid = my_store.store_metadata(pid, metadata, format_id) How to use HashStore client (command line app) ``` # Step 1: Create a HashStore -> python './src/hashstore/client.py' /path/to/store/ -chs -dp=3 -wp=2 -ap="SHA-256" -nsp="http://www.ns.test/v1" +> python './src/hashstore/client.py' /path/to/store/ -chs -dp=3 -wp=2 -ap=SHA-256 -nsp="http://www.ns.test/v1" # Get the checksum of a data object > python './src/hashstore/client.py' /path/to/store/ "-getchecksum" -pid=content_identifier -algorithm=SHA-256 From ce2b2b7926d93cbde419ab7c8f7916975889c777 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 18 Aug 2023 09:27:13 -0700 Subject: [PATCH 136/165] Remove redundant attribute from ObjectMetadata 'is_duplicate' and refactor affected code --- src/hashstore/filehashstore.py | 12 +++--------- src/hashstore/hashstore.py | 13 +++---------- tests/test_filehashstore.py | 13 ------------- tests/test_filehashstore_interface.py | 9 --------- tests/test_hashstore.py | 6 ++---- 5 files changed, 8 insertions(+), 45 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 4871c731..e69913ad 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -668,7 +668,6 @@ def put_object( ( object_cid, obj_file_size, - is_duplicate, hex_digest_dict, ) = self._move_and_get_checksums( pid, @@ -680,9 +679,7 @@ def put_object( file_size_to_validate, ) - object_metadata = ObjectMetadata( - object_cid, obj_file_size, is_duplicate, hex_digest_dict - ) + object_metadata = ObjectMetadata(object_cid, obj_file_size, hex_digest_dict) logging.debug( "FileHashStore - put_object: Successfully put object for pid: %s", pid, @@ -703,8 +700,7 @@ def _move_and_get_checksums( extension appended. The copy process uses a temporary file to store the initial contents and returns a dictionary of algorithms and their hex digest values. If the file already exists, the method will immediately - return with is_duplicate: True and "None" for the remaining HashAddress - attributes. If an algorithm and checksum is provided, it will proceed to + raise an exception. If an algorithm and checksum is provided, it will proceed to validate the object (and delete the tmpFile if the hex digest stored does not match what is provided). @@ -753,7 +749,6 @@ def _move_and_get_checksums( # Only move file if it doesn't exist. 
# Files are stored once and only once - is_object_duplicate = False if not os.path.isfile(abs_file_path): self._validate_object( pid, @@ -821,10 +816,9 @@ def _move_and_get_checksums( + " deleting temporary file." ) logging.warning(warning_msg) - is_object_duplicate = True self.delete(entity, tmp_file_name) - return (object_cid, tmp_file_size, is_object_duplicate, hex_digests) + return (object_cid, tmp_file_size, hex_digests) def _mktmpfile(self, stream, additional_algorithm=None, checksum_algorithm=None): """Create a named temporary file from a `Stream` object and return its filename diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 96fbe99f..63189d30 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -210,23 +210,16 @@ def get_hashstore(module_name, class_name, properties=None): ) -class ObjectMetadata( - namedtuple("ObjectMetadata", ["id", "obj_size", "is_duplicate", "hex_digests"]) -): +class ObjectMetadata(namedtuple("ObjectMetadata", ["id", "obj_size", "hex_digests"])): """File address containing file's path on disk and its content hash ID. Args: ab_id (str): Hash ID (hexdigest) of file contents. obj_size (bytes): Size of the object - is_duplicate (boolean, optional): Whether the hash address created was - a duplicate of a previously existing file. Can only be ``True`` - after a put operation. Defaults to ``False``. hex_digests (dict, optional): A list of hex digests to validate objects (md5, sha1, sha256, sha384, sha512) """ # Default value to prevent dangerous default value - def __new__(cls, ab_id, obj_size, is_duplicate=False, hex_digests=None): - return super(ObjectMetadata, cls).__new__( - cls, ab_id, obj_size, is_duplicate, hex_digests - ) + def __new__(cls, ab_id, obj_size, hex_digests=None): + return super(ObjectMetadata, cls).__new__(cls, ab_id, obj_size, hex_digests) diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index f7b9b9ab..0bfca1be 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -220,16 +220,6 @@ def test_put_object_file_size(pids, store): assert object_size == pids[pid]["file_size_bytes"] -def test_put_object_is_duplicate(pids, store): - """Check put returns expected is_duplicate boolean value.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) - object_metadata_is_duplicate = object_metadata.is_duplicate - assert object_metadata_is_duplicate is False - - def test_put_object_hex_digests(pids, store): """Check put successfully generates hex digests dictionary.""" test_dir = "tests/testdata/" @@ -291,7 +281,6 @@ def test_move_and_get_checksums_id(pids, store): move_id, _, _, - _, ) = store._move_and_get_checksums(pid, input_stream) input_stream.close() object_cid = store.get_sha256_hex_digest(pid) @@ -309,7 +298,6 @@ def test_move_and_get_checksums_file_size(pids, store): _, tmp_file_size, _, - _, ) = store._move_and_get_checksums(pid, input_stream) input_stream.close() assert tmp_file_size == pids[pid]["file_size_bytes"] @@ -323,7 +311,6 @@ def test_move_and_get_checksums_hex_digests(pids, store): input_stream = io.open(path, "rb") # pylint: disable=W0212 ( - _, _, _, hex_digests, diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index ecadca95..7ac53d23 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -105,15 +105,6 @@ def test_store_object_obj_size(pids, store): assert 
object_size == pids[pid]["file_size_bytes"] -def test_store_object_is_duplicate(pids, store): - """Test store object returns expected is_duplicate boolean.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - assert object_metadata.is_duplicate is False - - def test_store_object_hex_digests(pids, store): """Test store object returns expected hex digests dictionary.""" test_dir = "tests/testdata/" diff --git a/tests/test_hashstore.py b/tests/test_hashstore.py index 1d618810..68cd195a 100644 --- a/tests/test_hashstore.py +++ b/tests/test_hashstore.py @@ -74,10 +74,9 @@ def test_factory_get_hashstore_filehashstore_incorrect_algorithm_format(factory) factory.get_hashstore(module_name, class_name, properties) -def test_hashaddress(): +def test_objectmetadata(): """Test class returns correct values via dot notation.""" ab_id = "hashstoretest" - is_duplicate = "false" obj_size = 1234 hex_digest_dict = { "md5": "md5value", @@ -86,10 +85,9 @@ def test_hashaddress(): "sha256": "sha256value", "sha512": "sha512value", } - object_metadata = ObjectMetadata(ab_id, obj_size, is_duplicate, hex_digest_dict) + object_metadata = ObjectMetadata(ab_id, obj_size, hex_digest_dict) assert object_metadata.id == ab_id assert object_metadata.obj_size == obj_size - assert object_metadata.is_duplicate == is_duplicate assert object_metadata.hex_digests.get("md5") == hex_digest_dict["md5"] assert object_metadata.hex_digests.get("sha1") == hex_digest_dict["sha1"] assert object_metadata.hex_digests.get("sha224") == hex_digest_dict["sha224"] From 323674ead11673e65210a85e25af2699c96b9422 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 18 Aug 2023 09:51:23 -0700 Subject: [PATCH 137/165] Add client tests for retrieving objects and metadata --- src/hashstore/client.py | 2 - tests/test_hashstore_client.py | 90 ++++++++++++++++++++++++++++++---- 2 files changed, 80 insertions(+), 12 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index bd564f3e..9cd07fed 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -746,7 +746,6 @@ def main(): object_stream = hashstore_c.hashstore.retrieve_object(pid) object_content = object_stream.read(1000).decode("utf-8") object_stream.close() - print(f"Preview of object (pid: {pid}) content:") print(object_content) elif ( @@ -758,7 +757,6 @@ def main(): metadata_stream = hashstore_c.hashstore.retrieve_metadata(pid, formatid) metadata_content = metadata_stream.read(1000).decode("utf-8") metadata_stream.close() - print(f"Preview of metadata (pid: {pid}) content:") print(metadata_content) elif getattr(args, "client_deleteobject") and pid is not None: diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index 270236af..37a6a48e 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -10,7 +10,7 @@ def test_create_hashstore(tmp_path): client_directory = os.getcwd() + "/src/hashstore" client_module_path = f"{client_directory}/client.py" client_test_store = f"{tmp_path}/clienths" - create_hashstore_flag = "-chs" + create_hashstore_opt = "-chs" store_depth = "-dp=3" store_width = "-wp=2" store_algorithm = "-ap=SHA-256" @@ -18,7 +18,7 @@ def test_create_hashstore(tmp_path): chs_args = [ client_module_path, client_test_store, - create_hashstore_flag, + create_hashstore_opt, store_depth, store_width, store_algorithm, @@ -49,13 +49,13 @@ def test_store_object(store, pids): path = test_dir + pid.replace("/", "_") 
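        # Build the simulated command line for this pid and point the client
        # at the fixture's store root so it operates on the same HashStore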
client_module_path = f"{client_directory}/client.py" test_store = store.root - store_object_flag = "-storeobject" + store_object_opt = "-storeobject" client_pid_arg = f"-pid={pid}" path = f'-path={test_dir + pid.replace("/", "_")}' chs_args = [ client_module_path, test_store, - store_object_flag, + store_object_opt, client_pid_arg, path, ] @@ -80,14 +80,14 @@ def test_store_metadata(store, pids): syspath = Path(test_dir) / filename client_module_path = f"{client_directory}/client.py" test_store = store.root - store_metadata_flag = "-storemetadata" + store_metadata_opt = "-storemetadata" client_pid_arg = f"-pid={pid}" path = f"-path={syspath}" format_id = f"-formatid={namespace}" chs_args = [ client_module_path, test_store, - store_metadata_flag, + store_metadata_opt, client_pid_arg, path, format_id, @@ -102,6 +102,76 @@ def test_store_metadata(store, pids): assert store.exists("metadata", pids[pid]["metadata_cid"]) +def test_retrieve_objects(capsys, pids, store): + """Test retrieving objects from a HashStore through client.""" + client_directory = os.getcwd() + "/src/hashstore" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + _object_metadata = store.store_object(pid, path) + + client_module_path = f"{client_directory}/client.py" + test_store = store.root + delete_object_opt = "-retrieveobject" + client_pid_arg = f"-pid={pid}" + chs_args = [ + client_module_path, + test_store, + delete_object_opt, + client_pid_arg, + ] + + # Add file path of HashStore to sys so modules can be discovered + sys.path.append(client_directory) + # Manually change sys args to simulate command line arguments + sys.argv = chs_args + client.main() + + object_stream = store.retrieve_object(pid) + object_content = object_stream.read(1000).decode("utf-8") + "\n" + object_stream.close() + + capsystext = capsys.readouterr().out + assert capsystext == object_content + + +def test_retrieve_metadata(capsys, pids, store): + """Test retrieving metadata from a HashStore through client.""" + client_directory = os.getcwd() + "/src/hashstore" + test_dir = "tests/testdata/" + namespace = "http://ns.dataone.org/service/types/v2.0" + for pid in pids.keys(): + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + _metadata_cid = store.store_metadata(pid, syspath, namespace) + + client_module_path = f"{client_directory}/client.py" + test_store = store.root + retrieve_metadata_opt = "-retrievemetadata" + client_pid_arg = f"-pid={pid}" + format_id = f"-formatid={namespace}" + chs_args = [ + client_module_path, + test_store, + retrieve_metadata_opt, + client_pid_arg, + format_id, + ] + + # Add file path of HashStore to sys so modules can be discovered + sys.path.append(client_directory) + # Manually change sys args to simulate command line arguments + sys.argv = chs_args + client.main() + + metadata_stream = store.retrieve_metadata(pid, namespace) + metadata_content = metadata_stream.read(1000).decode("utf-8") + "\n" + metadata_stream.close() + + capsystext = capsys.readouterr().out + assert capsystext == metadata_content + + def test_delete_objects(pids, store): """Test deleting objects from a HashStore through client.""" client_directory = os.getcwd() + "/src/hashstore" @@ -112,12 +182,12 @@ def test_delete_objects(pids, store): client_module_path = f"{client_directory}/client.py" test_store = store.root - delete_object_flag = "-deleteobject" + delete_object_opt = "-deleteobject" client_pid_arg = f"-pid={pid}" chs_args = [ client_module_path, test_store, - 
delete_object_flag, + delete_object_opt, client_pid_arg, ] @@ -142,13 +212,13 @@ def test_delete_metadata(pids, store): client_module_path = f"{client_directory}/client.py" test_store = store.root - delete_metadata_flag = "-deletemetadata" + delete_metadata_opt = "-deletemetadata" client_pid_arg = f"-pid={pid}" format_id = f"-formatid={namespace}" chs_args = [ client_module_path, test_store, - delete_metadata_flag, + delete_metadata_opt, client_pid_arg, format_id, ] From 31894b1ca656d538cc2972c49009c59f32a46a0e Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 18 Aug 2023 10:38:04 -0700 Subject: [PATCH 138/165] Add methods to test retrieving and deleting metadata from knbvm test hashstore --- src/hashstore/client.py | 114 ++++++++++++++++++++++++++++++++++------ 1 file changed, 97 insertions(+), 17 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index 9cd07fed..ab94232e 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -282,7 +282,9 @@ def store_to_hashstore_from_list(self, origin_dir, obj_type, num): metacat_obj_list, "store" ) if obj_type == "metadata": - checked_obj_list = self.metacatdb.refine_list_for_metadata(metacat_obj_list) + checked_obj_list = self.metacatdb.refine_list_for_metadata( + metacat_obj_list, "store" + ) start_time = datetime.now() @@ -295,7 +297,8 @@ def store_to_hashstore_from_list(self, origin_dir, obj_type, num): if obj_type == "object": # results = pool.starmap(self.hashstore.store_object, checked_obj_list) pool.imap(self.try_store_object, checked_obj_list) - # TODO: if obj_type == "metadata": + if obj_type == "metadata": + pool.imap(self.try_store_metadata, checked_obj_list) # Close the pool and wait for all processes to complete pool.close() @@ -322,6 +325,18 @@ def try_store_object(self, obj_tuple): except Exception as so_exception: print(so_exception) + def try_store_metadata(self, obj_tuple): + """Store an object to HashStore and log exceptions as warning. + + Args: + obj_tuple: See HashStore store_object signature for details. + """ + try: + return self.hashstore.store_metadata(*obj_tuple) + # pylint: disable=W0718 + except Exception as so_exception: + print(so_exception) + def retrieve_and_validate_from_hashstore(self, origin_dir, obj_type, num): """Retrieve objects or metadata from a Hashstore and validate the content. 
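The validation pass is a round trip through the Public API: each stored file is streamed back out of HashStore, its digest is recomputed, and the result is compared against the checksum recorded in the metacat db. A minimal sketch of that round trip for one metadata document, assuming an initialized FileHashStore `store` and a `(pid, format_id, checksum, algo)` row from the db (the names here are illustrative):

```py
def validate_one(store, pid, format_id, checksum, algo):
    """Sketch: recompute a stored metadata document's digest and compare it
    against the checksum recorded in the metacat db."""
    # Stream the stored document back out of HashStore
    with store.retrieve_metadata(pid, format_id) as metadata_stream:
        computed_digest = store.computehash(metadata_stream, algo)
    # Mismatches are reported rather than raised, mirroring the client's
    # print-and-log style so one bad item does not stop the pool
    if computed_digest != checksum:
        print(f"Digest mismatch for pid: {pid} - {computed_digest} != {checksum}")
```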
@@ -348,7 +363,9 @@ def retrieve_and_validate_from_hashstore(self, origin_dir, obj_type, num): metacat_obj_list, "retrieve" ) if obj_type == "metadata": - checked_obj_list = self.metacatdb.refine_list_for_metadata(metacat_obj_list) + checked_obj_list = self.metacatdb.refine_list_for_metadata( + metacat_obj_list, "retrieve" + ) start_time = datetime.now() @@ -356,7 +373,8 @@ def retrieve_and_validate_from_hashstore(self, origin_dir, obj_type, num): pool = multiprocessing.Pool() if obj_type == "object": pool.map(self.validate_object, checked_obj_list) - # TODO: if obj_type == "metadata": + if obj_type == "metadata": + pool.map(self.validate_metadata, checked_obj_list) # Close the pool and wait for all processes to complete pool.close() @@ -396,6 +414,32 @@ def validate_object(self, obj_tuple): return + def validate_metadata(self, obj_tuple): + """Retrieves a metadata from HashStore and validates its checksum + + Args: + obj_tuple: pid_guid, obj_checksum_algo, obj_checksum + """ + pid_guid = obj_tuple[0] + namespace = obj_tuple[1] + metadata_db_checksum = obj_tuple[2] + algo = obj_tuple[3] + + with self.hashstore.retrieve_metadata(pid_guid, namespace) as metadata_stream: + computed_digest = self.hashstore.computehash(metadata_stream, algo) + metadata_stream.close() + + if computed_digest != metadata_db_checksum: + err_msg = ( + f"Assertion Error for pid/guid: {pid_guid} -" + + f" Digest calculated from stream ({computed_digest}) does not match" + + f" checksum from metacat db: {metadata_db_checksum}" + ) + logging.error(err_msg) + print(err_msg) + + return + def delete_objects_from_list(self, origin_dir, obj_type, num): """Store objects in a given directory into HashStore Args: @@ -421,7 +465,9 @@ def delete_objects_from_list(self, origin_dir, obj_type, num): metacat_obj_list, "delete" ) if obj_type == "metadata": - checked_obj_list = self.metacatdb.refine_list_for_metadata(metacat_obj_list) + checked_obj_list = self.metacatdb.refine_list_for_metadata( + metacat_obj_list, "delete" + ) start_time = datetime.now() @@ -434,7 +480,8 @@ def delete_objects_from_list(self, origin_dir, obj_type, num): if obj_type == "object": # results = pool.starmap(self.hashstore.store_object, checked_obj_list) pool.imap(self.try_delete_object, checked_obj_list) - # TODO: if obj_type == "metadata": + if obj_type == "metadata": + pool.imap(self.try_delete_metadata, checked_obj_list) # Close the pool and wait for all processes to complete pool.close() @@ -457,6 +504,14 @@ def try_delete_object(self, obj_pid): except Exception as do_exception: print(do_exception) + def try_delete_metadata(self, obj_pid, format_id): + """Delete an object to HashStore and log exceptions as warning.""" + try: + return self.hashstore.delete_metadata(obj_pid, format_id) + # pylint: disable=W0718 + except Exception as do_exception: + print(do_exception) + class MetacatDB: """Class to interact with Metacat's Postgres DB""" @@ -608,20 +663,45 @@ def refine_list_for_objects(self, metacat_obj_list, action): return refined_object_list - def refine_list_for_metadata(self, metacat_obj_list): + def refine_list_for_metadata(self, metacat_obj_list, action): """Refine a list of metadata by checking for file existence and removing duplicates.""" refined_metadta_list = [] - for obj in metacat_obj_list: - pid_guid = obj[0] - filepath_docid_rev = obj[1] - metadata_namespace = obj[2] + for tuple_item in metacat_obj_list: + pid_guid = tuple_item[0] + filepath_docid_rev = tuple_item[1] + metadata_namespace = tuple_item[2] + item_checksum = tuple_item[3] + 
item_checksum_algorithm = tuple_item[4] if os.path.exists(filepath_docid_rev): - # If the file has already been stored, skip it - if not self.hashstore.exists( - "metadata", self.hashstore.get_sha256_hex_digest(pid_guid) - ): - tuple_item = (pid_guid, metadata_namespace, filepath_docid_rev) - refined_metadta_list.append(tuple_item) + if action == "store": + # If the file has already been stored, skip it + if not self.hashstore.exists( + "metadata", self.hashstore.get_sha256_hex_digest(pid_guid) + ): + tuple_item = (pid_guid, filepath_docid_rev, metadata_namespace) + refined_metadta_list.append(tuple_item) + if action == "retrieve": + # If the file has already been stored, skip it + if not self.hashstore.exists( + "metadata", self.hashstore.get_sha256_hex_digest(pid_guid) + ): + tuple_item = ( + pid_guid, + metadata_namespace, + item_checksum, + item_checksum_algorithm, + ) + refined_metadta_list.append(tuple_item) + if action == "delete": + # If the file has already been stored, skip it + if not self.hashstore.exists( + "metadata", self.hashstore.get_sha256_hex_digest(pid_guid) + ): + tuple_item = ( + pid_guid, + metadata_namespace, + ) + refined_metadta_list.append(tuple_item) return refined_metadta_list From bdb1f46f9388054929f8e522b0caa07c6166fd36 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 18 Aug 2023 11:32:28 -0700 Subject: [PATCH 139/165] Fix minor bugs relating to retrieving and deleting metadata --- src/hashstore/client.py | 42 ++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index ab94232e..bbcac367 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -372,9 +372,9 @@ def retrieve_and_validate_from_hashstore(self, origin_dir, obj_type, num): # Setup pool and processes pool = multiprocessing.Pool() if obj_type == "object": - pool.map(self.validate_object, checked_obj_list) + pool.imap(self.validate_object, checked_obj_list) if obj_type == "metadata": - pool.map(self.validate_metadata, checked_obj_list) + pool.imap(self.validate_metadata, checked_obj_list) # Close the pool and wait for all processes to complete pool.close() @@ -491,7 +491,7 @@ def delete_objects_from_list(self, origin_dir, obj_type, num): content = ( f"HashStoreClient (delete_objects_from_list):\n" f"Start Time: {start_time}\nEnd Time: {end_time}\n" - + f"Total Time to Store {len(checked_obj_list)} {obj_type}" + + f"Total Time to Delete {len(checked_obj_list)} {obj_type}" + f" Objects: {end_time - start_time}\n" ) logging.info(content) @@ -499,15 +499,19 @@ def delete_objects_from_list(self, origin_dir, obj_type, num): def try_delete_object(self, obj_pid): """Delete an object to HashStore and log exceptions as warning.""" try: - return self.hashstore.delete_object(obj_pid) + self.hashstore.delete_object(obj_pid) + return # pylint: disable=W0718 except Exception as do_exception: print(do_exception) - def try_delete_metadata(self, obj_pid, format_id): + def try_delete_metadata(self, obj_tuple): """Delete an object to HashStore and log exceptions as warning.""" + pid_guid = obj_tuple[0] + namespace = obj_tuple[1] try: - return self.hashstore.delete_metadata(obj_pid, format_id) + self.hashstore.delete_metadata(pid_guid, namespace) + return # pylint: disable=W0718 except Exception as do_exception: print(do_exception) @@ -665,7 +669,7 @@ def refine_list_for_objects(self, metacat_obj_list, action): def refine_list_for_metadata(self, metacat_obj_list, action): """Refine a list of metadata by 
checking for file existence and removing duplicates.""" - refined_metadta_list = [] + refined_metadata_list = [] for tuple_item in metacat_obj_list: pid_guid = tuple_item[0] filepath_docid_rev = tuple_item[1] @@ -679,11 +683,13 @@ def refine_list_for_metadata(self, metacat_obj_list, action): "metadata", self.hashstore.get_sha256_hex_digest(pid_guid) ): tuple_item = (pid_guid, filepath_docid_rev, metadata_namespace) - refined_metadta_list.append(tuple_item) + refined_metadata_list.append(tuple_item) if action == "retrieve": - # If the file has already been stored, skip it - if not self.hashstore.exists( - "metadata", self.hashstore.get_sha256_hex_digest(pid_guid) + if self.hashstore.exists( + "metadata", + self.hashstore.get_sha256_hex_digest( + pid_guid + metadata_namespace + ), ): tuple_item = ( pid_guid, @@ -691,18 +697,20 @@ def refine_list_for_metadata(self, metacat_obj_list, action): item_checksum, item_checksum_algorithm, ) - refined_metadta_list.append(tuple_item) + refined_metadata_list.append(tuple_item) if action == "delete": - # If the file has already been stored, skip it - if not self.hashstore.exists( - "metadata", self.hashstore.get_sha256_hex_digest(pid_guid) + if self.hashstore.exists( + "metadata", + self.hashstore.get_sha256_hex_digest( + pid_guid + metadata_namespace + ), ): tuple_item = ( pid_guid, metadata_namespace, ) - refined_metadta_list.append(tuple_item) - return refined_metadta_list + refined_metadata_list.append(tuple_item) + return refined_metadata_list def main(): From ae4a94279082d8d82c55e3c6951ac854683c613f Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 18 Aug 2023 12:19:51 -0700 Subject: [PATCH 140/165] Clean up comments, add missing documentation and fix minor bugs --- src/hashstore/client.py | 50 ++++++++++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 13 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index bbcac367..f0a38c48 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -18,9 +18,9 @@ def __init__(self): program_name = "HashStore Command Line Client" description = ( - "Command-line tool to convert a directory of data objects" - + " into a hashstore and perform operations to store, retrieve," - + " and delete the objects." + "Command line tool to call store, retrieve and delete with a HashStore." + + " Additionally, methods are available to test functionality with a" + + " metacat postgres db." ) epilog = "Created for DataONE (NCEAS)" @@ -320,7 +320,8 @@ def try_store_object(self, obj_tuple): obj_tuple: See HashStore store_object signature for details. """ try: - return self.hashstore.store_object(*obj_tuple) + self.hashstore.store_object(*obj_tuple) + return # pylint: disable=W0718 except Exception as so_exception: print(so_exception) @@ -332,7 +333,8 @@ def try_store_metadata(self, obj_tuple): obj_tuple: See HashStore store_object signature for details. 
""" try: - return self.hashstore.store_metadata(*obj_tuple) + self.hashstore.store_metadata(*obj_tuple) + return # pylint: disable=W0718 except Exception as so_exception: print(so_exception) @@ -418,7 +420,7 @@ def validate_metadata(self, obj_tuple): """Retrieves a metadata from HashStore and validates its checksum Args: - obj_tuple: pid_guid, obj_checksum_algo, obj_checksum + obj_tuple: pid_guid, format_id, obj_checksum, obj_algorithm """ pid_guid = obj_tuple[0] namespace = obj_tuple[1] @@ -497,7 +499,11 @@ def delete_objects_from_list(self, origin_dir, obj_type, num): logging.info(content) def try_delete_object(self, obj_pid): - """Delete an object to HashStore and log exceptions as warning.""" + """Delete an object to HashStore and log exceptions as warning. + + Args: + obj_pid (str): Pid of object to delete + """ try: self.hashstore.delete_object(obj_pid) return @@ -506,7 +512,11 @@ def try_delete_object(self, obj_pid): print(do_exception) def try_delete_metadata(self, obj_tuple): - """Delete an object to HashStore and log exceptions as warning.""" + """Delete an object to HashStore and log exceptions as warning. + + Args: + obj_tuple: pid_guid, format_id (namespace) + """ pid_guid = obj_tuple[0] namespace = obj_tuple[1] try: @@ -621,7 +631,7 @@ def refine_list_for_objects(self, metacat_obj_list, action): metacat_obj_list (List): List of tuple objects representing rows from metacat db action (string): "store", "retrieve" or "delete". "store" will create a list of objects to store that do not exist in HashStore. - "retrieve" will create a list of objects (tuples) that exist in HashStore. + "retrieve" will create a list of objects that exist in HashStore. "delete" will create a list of object pids Returns: @@ -668,7 +678,18 @@ def refine_list_for_objects(self, metacat_obj_list, action): return refined_object_list def refine_list_for_metadata(self, metacat_obj_list, action): - """Refine a list of metadata by checking for file existence and removing duplicates.""" + """Refine a list of metadata by checking for file existence and removing duplicates. + + Args: + metacat_obj_list (List): List of tuple objects representing rows from metacat db + action (string): "store", "retrieve" or "delete". + "store" will create a list of metadata to store that do not exist in HashStore. + "retrieve" will create a list of metadata that exist in HashStore. 
+ "delete" will create a list of metadata pids with their format_ids + + Returns: + refined_object_list (List): List of tuple metadata based on "action" + """ refined_metadata_list = [] for tuple_item in metacat_obj_list: pid_guid = tuple_item[0] @@ -680,7 +701,10 @@ def refine_list_for_metadata(self, metacat_obj_list, action): if action == "store": # If the file has already been stored, skip it if not self.hashstore.exists( - "metadata", self.hashstore.get_sha256_hex_digest(pid_guid) + "metadata", + self.hashstore.get_sha256_hex_digest( + pid_guid + metadata_namespace + ), ): tuple_item = (pid_guid, filepath_docid_rev, metadata_namespace) refined_metadata_list.append(tuple_item) @@ -714,7 +738,7 @@ def refine_list_for_metadata(self, metacat_obj_list, action): def main(): - """Main function of the HashStore client.""" + """Entry point of the HashStore client.""" parser = HashStoreParser() args = parser.get_parser_args() @@ -758,7 +782,7 @@ def main(): datefmt="%Y-%m-%d %H:%M:%S", ) - # HashStore client entry point + # Collect arguments to process pid = getattr(args, "object_pid") path = getattr(args, "object_path") algorithm = getattr(args, "object_algorithm") From fc1f272c5a0606ce27f41e7e56f210384bfed5f4 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Wed, 23 Aug 2023 08:22:49 -0700 Subject: [PATCH 141/165] Remove redundant code that unintentionally limited sql query results from knbvm metacat db --- src/hashstore/client.py | 40 +++++++++++----------------------------- 1 file changed, 11 insertions(+), 29 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index f0a38c48..c1b3b904 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -264,17 +264,10 @@ def store_to_hashstore_from_list(self, origin_dir, obj_type, num): obj_type (str): 'object' or 'metadata' num (int): Number of files to store """ - # Get list of files from directory - file_list = os.listdir(origin_dir) - checked_num_of_files = len(file_list) - # Check number of files to store - if num is not None: - checked_num_of_files = int(num) - + info_msg = f"HashStore Client - Begin storing {obj_type} objects." + logging.info(info_msg) # Object and Metadata list - metacat_obj_list = self.metacatdb.get_object_metadata_list( - origin_dir, checked_num_of_files - ) + metacat_obj_list = self.metacatdb.get_object_metadata_list(origin_dir, num) # Get list of objects to store from metacat db if obj_type == "object": @@ -347,16 +340,12 @@ def retrieve_and_validate_from_hashstore(self, origin_dir, obj_type, num): obj_type (str): 'object' or 'metadata' num (int): Number of files to store """ - logging.info("HashStore Client - Begin retrieving and validating objects.") - checked_num_of_files = None - # Check number of files to store - if num is not None: - checked_num_of_files = int(num) - - # Object and Metadata list - metacat_obj_list = self.metacatdb.get_object_metadata_list( - origin_dir, checked_num_of_files + info_msg = ( + f"HashStore Client - Begin retrieving and validating {obj_type} objects." 
) + logging.info(info_msg) + # Object and Metadata list + metacat_obj_list = self.metacatdb.get_object_metadata_list(origin_dir, num) # Get list of objects to store from metacat db logging.info("HashStore Client - Refining object list for %s", obj_type) @@ -449,17 +438,10 @@ def delete_objects_from_list(self, origin_dir, obj_type, num): obj_type (str): 'object' or 'metadata' num (int): Number of files to store """ - # Get list of files from directory - file_list = os.listdir(origin_dir) - checked_num_of_files = len(file_list) - # Check number of files to store - if num is not None: - checked_num_of_files = int(num) - + info_msg = f"HashStore Client - Begin deleting {obj_type} objects." + logging.info(info_msg) # Object and Metadata list - metacat_obj_list = self.metacatdb.get_object_metadata_list( - origin_dir, checked_num_of_files - ) + metacat_obj_list = self.metacatdb.get_object_metadata_list(origin_dir, num) # Get list of objects to store from metacat db if obj_type == "object": From a6a5268218ede739261ba2575b0bf65beeefa7b1 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Thu, 7 Sep 2023 15:33:19 -0700 Subject: [PATCH 142/165] Update README.md code block formatting --- README.md | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 018e3b87..20c89c79 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,10 @@ take a longer time to run (relating to the storage of large files) - to execute ## Usage Example To view more details about the Public API - see 'hashstore.py` interface documentation -``` + +```py +from hashstore import HashStoreFactory + # Instantiate a factory hashstore_factory = HashStoreFactory() @@ -71,30 +74,30 @@ metadata_cid = my_store.store_metadata(pid, metadata, format_id) ``` How to use HashStore client (command line app) -``` +```sh # Step 1: Create a HashStore -> python './src/hashstore/client.py' /path/to/store/ -chs -dp=3 -wp=2 -ap=SHA-256 -nsp="http://www.ns.test/v1" +$ python './src/hashstore/client.py' /path/to/store/ -chs -dp=3 -wp=2 -ap=SHA-256 -nsp="http://www.ns.test/v1" # Get the checksum of a data object -> python './src/hashstore/client.py' /path/to/store/ "-getchecksum" -pid=content_identifier -algorithm=SHA-256 +$ python './src/hashstore/client.py' /path/to/store/ "-getchecksum" -pid=content_identifier -algorithm=SHA-256 # Store a data object -> python './src/hashstore/client.py' /path/to/store/ "-storeobject" -pid=content_identifier -path=/path/to/object +$ python './src/hashstore/client.py' /path/to/store/ "-storeobject" -pid=content_identifier -path=/path/to/object # Store a metadata object -> python './src/hashstore/client.py' /path/to/store/ "-storemetadata" -pid=content_identifier -path=/path/to/metadata/object -formatid=http://ns.dataone.org/service/types/v2.0 +$ python './src/hashstore/client.py' /path/to/store/ "-storemetadata" -pid=content_identifier -path=/path/to/metadata/object -formatid=http://ns.dataone.org/service/types/v2.0 # Retrieve a data object -> python './src/hashstore/client.py' /path/to/store/ "-retrieveobject" -pid=content_identifier +$ python './src/hashstore/client.py' /path/to/store/ "-retrieveobject" -pid=content_identifier # Retrieve a metadata object -> python './src/hashstore/client.py' /path/to/store/ "-retrievemetadata" -pid=content_identifier -formatid=http://ns.dataone.org/service/types/v2.0 +$ python './src/hashstore/client.py' /path/to/store/ "-retrievemetadata" -pid=content_identifier -formatid=http://ns.dataone.org/service/types/v2.0 # Delete a data 
object -> python './src/hashstore/client.py' /path/to/store/ "-deleteobject" -pid=content_identifier +$ python './src/hashstore/client.py' /path/to/store/ "-deleteobject" -pid=content_identifier # Delete a metadata file -> python './src/hashstore/client.py' /path/to/store/ "-deletemetadata" -pid=content_identifier -formatid=http://ns.dataone.org/service/types/v2.0 +$ python './src/hashstore/client.py' /path/to/store/ "-deletemetadata" -pid=content_identifier -formatid=http://ns.dataone.org/service/types/v2.0 ``` ## License From ae38be373327d27b17ab68ff66d0a30a585d6495 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 15 Sep 2023 14:22:54 -0700 Subject: [PATCH 143/165] Update README.md for formatting and typo --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 20c89c79..9e7df4b4 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,7 @@ metadata_cid = my_store.store_metadata(pid, sysmeta) ``` If you want to store other types of metadata, add an additional `format_id`. -``` +```py pid = "j.tao.1700.1" metadata = "/path/to/your/metadata/document.json" format_id = "http://custom.metadata.com/json/type/v1.0" @@ -121,7 +121,7 @@ limitations under the License. Work on this package was supported by: - DataONE Network -- Arctic Data Center: NSF-PLR grant #2042102 to M. B. Jones, A. Budden, M. Schildhauer, and J. Dozier +- Arctic Data Center: NSF-PLR grant #2042102 to M. B. Jones, A. Budden, M. Schildhauer, and J. Dozier Additional support was provided for collaboration by the National Center for Ecological Analysis and Synthesis, a Center funded by the University of California, Santa Barbara, and the State of California. From ef93cc6d4ce9f1f2fbde5893a24a113cc4227add Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 15 Sep 2023 14:23:59 -0700 Subject: [PATCH 144/165] Update HashStore version (1.0.0) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 5a41d3f9..1c9f80d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "hashstore" -version = "0.9.0" +version = "1.0.0" description = "HashStore, a hash-based object store for data packages." 
authors = ["Matt Jones ", "Dou Mok "] readme = "README.md" From cf75a9af1032b5717bbb4a7483295ebeb6967d67 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 18 Sep 2023 13:58:29 -0700 Subject: [PATCH 145/165] Fix bug with FileHashStore default store algorithm not being hashlib compatible --- src/hashstore/filehashstore.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index e69913ad..080c551e 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -71,7 +71,7 @@ def __init__(self, properties=None): prop_store_path, prop_store_depth, prop_store_width, - prop_store_algorithm, + _, prop_store_metadata_namespace, ) = [ checked_properties[property_name] @@ -89,7 +89,6 @@ def __init__(self, properties=None): self.create_path(self.root) self.depth = prop_store_depth self.width = prop_store_width - self.algorithm = prop_store_algorithm self.sysmeta_ns = prop_store_metadata_namespace # Write 'hashstore.yaml' to store path if not os.path.exists(self.hashstore_configuration_yaml): @@ -392,6 +391,8 @@ def lookup_algo(algo): with open(self.hashstore_configuration_yaml, "r", encoding="utf-8") as file: yaml_data = yaml.safe_load(file) + # Set default store algorithm + self.algorithm = lookup_algo(yaml_data["store_algorithm"]) # Takes DataOne controlled algorithm values and translates to hashlib supported values yaml_store_default_algo_list = yaml_data["store_default_algo_list"] translated_default_algo_list = [] From 471616c07e0288e4251759a9f3c2d01f801279d1 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 18 Sep 2023 14:05:19 -0700 Subject: [PATCH 146/165] Add missing docstring arg values for 'file_size_to_validate' parameter --- src/hashstore/filehashstore.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 080c551e..04d8392c 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -654,7 +654,8 @@ def put_object( when returning hex digests. \n checksum (str, optional): Optional checksum to validate object against hex digest before moving to permanent location. \n - checksum_algorithm (str, optional): Algorithm value of given checksum. + checksum_algorithm (str, optional): Algorithm value of given checksum. \n + file_size_to_validate (bytes, optional): Expected size of object Returns: object_metadata (ObjectMetadata): object that contains the object id, @@ -715,6 +716,7 @@ def _move_and_get_checksums( checksum (str, optional): Optional checksum to validate object against hex digest before moving to permanent location. \n checksum_algorithm (str, optional): Algorithm value of given checksum. 
\n + file_size_to_validate (bytes, optional): Expected size of object Returns: object_metadata (tuple): object id, object file size, duplicate file From 7a79f53170c8f75621a7fc0bf7e38a8bb559aafb Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 18 Sep 2023 14:11:19 -0700 Subject: [PATCH 147/165] Fix typo in variable name --- src/hashstore/filehashstore.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 04d8392c..892ba137 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -188,11 +188,11 @@ def write_properties(self, properties): ] # Standardize algorithm value for cross-language compatibility - checked_store_algoritm = None + checked_store_algorithm = None # Note, this must be declared here because HashStore has not yet been initialized accepted_store_algorithms = ["MD5", "SHA-1", "SHA-256", "SHA-384", "SHA-512"] if store_algorithm in accepted_store_algorithms: - checked_store_algoritm = store_algorithm + checked_store_algorithm = store_algorithm else: exception_string = ( f"FileHashStore - write_properties: algorithm supplied ({store_algorithm})" @@ -208,7 +208,7 @@ def write_properties(self, properties): store_path, store_depth, store_width, - checked_store_algoritm, + checked_store_algorithm, store_metadata_namespace, ) # Write 'hashstore.yaml' @@ -1093,7 +1093,7 @@ def _validate_object( Args: pid: For logging purposes checksum: Value of checksum - checksum_algoritm: Algorithm of checksum + checksum_algorithm: Algorithm of checksum entity: Type of object hex_digests: Dictionary of hex digests to select from tmp_file_name: Name of tmp file From 091d18d6db4b6810227cca106963f4cf3b8e2b97 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 18 Sep 2023 14:13:48 -0700 Subject: [PATCH 148/165] Fix grammar in client comments --- src/hashstore/client.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/hashstore/client.py b/src/hashstore/client.py index c1b3b904..50e67418 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -11,7 +11,7 @@ class HashStoreParser: - """Class to setup parsing arguments via argparse.""" + """Class to set up parsing arguments via argparse.""" def __init__(self): """Initialize the argparse 'parser'.""" @@ -183,7 +183,7 @@ def __init__(self): "-deletemetadata", dest="client_deletemetadata", action="store_true", - help="Flag to dlete a metadata document from a HashStore", + help="Flag to delete a metadata document from a HashStore", ) def load_store_properties(self, hashstore_yaml): @@ -249,7 +249,7 @@ def __init__(self, properties, testflag=None): self.hashstore = factory.get_hashstore(module_name, class_name, properties) logging.info("HashStoreClient - HashStore initialized.") - # Setup access to Metacat postgres db + # Set up access to Metacat postgres db if testflag: self.metacatdb = MetacatDB(properties["store_path"], self.hashstore) logging.info("HashStoreClient - MetacatDB initialized.") @@ -281,7 +281,7 @@ def store_to_hashstore_from_list(self, origin_dir, obj_type, num): start_time = datetime.now() - # Setup pool and processes + # Set up pool and processes pool = multiprocessing.Pool() # Call 'obj_type' respective public API methods @@ -360,7 +360,7 @@ def retrieve_and_validate_from_hashstore(self, origin_dir, obj_type, num): start_time = datetime.now() - # Setup pool and processes + # Set up pool and processes pool = multiprocessing.Pool() if obj_type == "object": 
             pool.imap(self.validate_object, checked_obj_list)

From 4e5c936ae3c8825325d390133c625ac6b82d7e1a Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Mon, 18 Sep 2023 14:16:24 -0700
Subject: [PATCH 149/165] Remove soon-to-be deprecated setting in VSCode
 settings.json

---
 .vscode/settings.json | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index cb3d2335..b15ffaa4 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,6 +1,5 @@
 {
     "python.terminal.activateEnvInCurrentTerminal": true,
-    "python.formatting.provider": "none",
     "python.testing.pytestArgs": [
         "tests"
     ],

From 8689dd44beb6a2d79764c2b773f5c708afd5d127 Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Mon, 18 Sep 2023 14:19:36 -0700
Subject: [PATCH 150/165] Update HashStore interface docstrings to fix an
 inaccurate description in 'storeObject'

---
 src/hashstore/hashstore.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py
index 63189d30..6c704209 100644
--- a/src/hashstore/hashstore.py
+++ b/src/hashstore/hashstore.py
@@ -27,11 +27,11 @@
     ):
         """The `store_object` method is responsible for the atomic storage of
         objects to disk using a given InputStream and a persistent identifier (pid). Upon
-        successful storage, the method returns a HashAddress object containing
-        relevant file information, such as the file's cid, relative path, absolute
-        path, duplicate object status, and hex digest map of algorithms and
-        checksums. `store_object` also ensures that an object is stored only once by
-        synchronizing multiple calls and rejecting calls to store duplicate objects.
+        successful storage, the method returns an ObjectMetadata object containing
+        relevant file information, such as the file's id (which can be used to locate the
+        object on disk), the file's size, and a hex digest map of algorithms and checksums.
+        `store_object` also ensures that an object is stored only once by synchronizing
+        multiple calls and rejecting calls to store duplicate objects.
 
         The file's id is determined by calculating the SHA-256 hex digest of the
         provided pid, which is also used as the permanent address of the file. The

From 1d152711dd3fa460f250cb82a0fee4f41076bd7e Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Mon, 18 Sep 2023 14:30:40 -0700
Subject: [PATCH 151/165] Refactor 'HashStoreClient' if statements to use
 class attributes/constants for 'object' and 'metadata' strings

---
 src/hashstore/client.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/src/hashstore/client.py b/src/hashstore/client.py
index 50e67418..6259d96b 100644
--- a/src/hashstore/client.py
+++ b/src/hashstore/client.py
@@ -232,6 +232,9 @@ def get_parser_args(self):
 class HashStoreClient:
     """Create a HashStore to use through the command line."""
 
+    OBJ_TYPE = "object"
+    MET_TYPE = "metadata"
+
     def __init__(self, properties, testflag=None):
         """Initialize HashStore and MetacatDB
 
@@ -270,11 +273,11 @@ def store_to_hashstore_from_list(self, origin_dir, obj_type, num):
         metacat_obj_list = self.metacatdb.get_object_metadata_list(origin_dir, num)
 
         # Get list of objects to store from metacat db
-        if obj_type == "object":
+        if obj_type == self.OBJ_TYPE:
             checked_obj_list = self.metacatdb.refine_list_for_objects(
                 metacat_obj_list, "store"
             )
-        if obj_type == "metadata":
+        if obj_type == self.MET_TYPE:
             checked_obj_list = self.metacatdb.refine_list_for_metadata(
                 metacat_obj_list, "store"
             )
@@ -287,10 +290,10 @@
         # Call 'obj_type' respective public API methods
         info_msg = f"HashStoreClient - Request to Store {len(checked_obj_list)} Objs"
         logging.info(info_msg)
-        if obj_type == "object":
+        if obj_type == self.OBJ_TYPE:
             # results = pool.starmap(self.hashstore.store_object, checked_obj_list)
             pool.imap(self.try_store_object, checked_obj_list)
-        if obj_type == "metadata":
+        if obj_type == self.MET_TYPE:
             pool.imap(self.try_store_metadata, checked_obj_list)
 
         # Close the pool and wait for all processes to complete
@@ -349,11 +352,11 @@ def retrieve_and_validate_from_hashstore(self, origin_dir, obj_type, num):
 
         # Get list of objects to store from metacat db
         logging.info("HashStore Client - Refining object list for %s", obj_type)
-        if obj_type == "object":
+        if obj_type == self.OBJ_TYPE:
             checked_obj_list = self.metacatdb.refine_list_for_objects(
                 metacat_obj_list, "retrieve"
             )
-        if obj_type == "metadata":
+        if obj_type == self.MET_TYPE:
             checked_obj_list = self.metacatdb.refine_list_for_metadata(
                 metacat_obj_list, "retrieve"
             )
@@ -444,11 +447,11 @@ def delete_objects_from_list(self, origin_dir, obj_type, num):
         metacat_obj_list = self.metacatdb.get_object_metadata_list(origin_dir, num)
 
         # Get list of objects to store from metacat db
-        if obj_type == "object":
+        if obj_type == self.OBJ_TYPE:
             checked_obj_list = self.metacatdb.refine_list_for_objects(
                 metacat_obj_list, "delete"
            )
-        if obj_type == "metadata":
+        if obj_type == self.MET_TYPE:
             checked_obj_list = self.metacatdb.refine_list_for_metadata(
                 metacat_obj_list, "delete"
             )
@@ -461,10 +464,10 @@
         # Call 'obj_type' respective public API methods
         info_msg = f"HashStoreClient - Request to delete {len(checked_obj_list)} Objs"
         logging.info(info_msg)
-        if obj_type == "object":
+        if obj_type == self.OBJ_TYPE:
             # results = pool.starmap(self.hashstore.store_object, checked_obj_list)
             pool.imap(self.try_delete_object, checked_obj_list)
-        if obj_type == "metadata":
+        if obj_type == self.MET_TYPE:
             pool.imap(self.try_delete_metadata, checked_obj_list)
 
         # Close the pool and wait for all processes to complete

From 54c93b13030d818ed1be6b9ec322173337dcbc23 Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Tue, 19 Sep 2023 15:34:26 -0700
Subject: [PATCH 152/165] Update README.md

---
 README.md | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 9e7df4b4..a45c36ef 100644
--- a/README.md
+++ b/README.md
@@ -79,25 +79,25 @@ How to use HashStore client (command line app)
 $ python './src/hashstore/client.py' /path/to/store/ -chs -dp=3 -wp=2 -ap=SHA-256 -nsp="http://www.ns.test/v1"
 
 # Get the checksum of a data object
-$ python './src/hashstore/client.py' /path/to/store/ "-getchecksum" -pid=content_identifier -algorithm=SHA-256
+$ python './src/hashstore/client.py' /path/to/store/ -getchecksum -pid=content_identifier -algo=SHA-256
 
 # Store a data object
-$ python './src/hashstore/client.py' /path/to/store/ "-storeobject" -pid=content_identifier -path=/path/to/object
+$ python './src/hashstore/client.py' /path/to/store/ -storeobject -pid=content_identifier -path=/path/to/object
 
 # Store a metadata object
-$ python './src/hashstore/client.py' /path/to/store/ "-storemetadata" -pid=content_identifier -path=/path/to/metadata/object -formatid=http://ns.dataone.org/service/types/v2.0
+$ python './src/hashstore/client.py' /path/to/store/ -storemetadata -pid=content_identifier -path=/path/to/metadata/object -formatid=http://ns.dataone.org/service/types/v2.0
 
 # Retrieve a data object
-$ python './src/hashstore/client.py' /path/to/store/ "-retrieveobject" -pid=content_identifier
+$ python './src/hashstore/client.py' /path/to/store/ -retrieveobject -pid=content_identifier
 
 # Retrieve a metadata object
-$ python './src/hashstore/client.py' /path/to/store/ "-retrievemetadata" -pid=content_identifier -formatid=http://ns.dataone.org/service/types/v2.0
+$ python './src/hashstore/client.py' /path/to/store/ -retrievemetadata -pid=content_identifier -formatid=http://ns.dataone.org/service/types/v2.0
 
 # Delete a data object
-$ python './src/hashstore/client.py' /path/to/store/ "-deleteobject" -pid=content_identifier
+$ python './src/hashstore/client.py' /path/to/store/ -deleteobject -pid=content_identifier
 
 # Delete a metadata file
-$ python './src/hashstore/client.py' /path/to/store/ "-deletemetadata" -pid=content_identifier -formatid=http://ns.dataone.org/service/types/v2.0
+$ python './src/hashstore/client.py' /path/to/store/ -deletemetadata -pid=content_identifier -formatid=http://ns.dataone.org/service/types/v2.0
 ```
 
 ## License

From 1f8135e794e98edeedbba93ba7fcbb447ed6b4fe Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Fri, 29 Sep 2023 16:27:19 -0700
Subject: [PATCH 153/165] Add code to clean up temporary files when
 'store_object' is interrupted or the Python interpreter exits

---
 src/hashstore/filehashstore.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py
index 892ba137..fba07f1e 100644
--- a/src/hashstore/filehashstore.py
+++ b/src/hashstore/filehashstore.py
@@ -1,4 +1,5 @@
 """Core module for FileHashStore"""
+import atexit
 import io
 import shutil
 import threading
@@ -844,6 +845,14 @@ def _mktmpfile(self, stream, additional_algorithm=None, checksum_algorithm=None)
             self.create_path(tmp_root_path)
         tmp = NamedTemporaryFile(dir=tmp_root_path, delete=False)
 
+        # Delete tmp file if the Python interpreter crashes or thread is interrupted
+        # when store_object is called
+        def delete_tmp_file():
+            if os.path.exists(tmp.name):
+                os.remove(tmp.name)
+
+        atexit.register(delete_tmp_file)
+
         # Ensure tmp file is created with desired permissions
         if self.fmode is not None:
             oldmask = os.umask(0)

From e4b36a14110379d55cee5fa26f7e69490b35ae93 Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Sat, 30 Sep 2023 10:24:54 -0700
Subject: [PATCH 154/165] Wrap code for writing to tmp file during
 'store_object' in try-except-finally block to improve cleanup process

---
 src/hashstore/filehashstore.py | 38 ++++++++++++++++++++++++++--------
 1 file changed, 29 insertions(+), 9 deletions(-)

diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py
index fba07f1e..e43f2bfb 100644
--- a/src/hashstore/filehashstore.py
+++ b/src/hashstore/filehashstore.py
@@ -875,15 +875,35 @@ def delete_tmp_file():
         ]
 
         # tmp is a file-like object that is already opened for writing by default
-        with tmp as tmp_file:
-            for data in stream:
-                tmp_file.write(self._to_bytes(data))
-                for hash_algorithm in hash_algorithms:
-                    hash_algorithm.update(self._to_bytes(data))
-        logging.debug(
-            "FileHashStore - _mktempfile: Object stream successfully written to tmp file: %s",
-            tmp.name,
-        )
+        tmp_file_completion_flag = False
+        try:
+            with tmp as tmp_file:
+                for data in stream:
+                    tmp_file.write(self._to_bytes(data))
+                    for hash_algorithm in hash_algorithms:
+                        hash_algorithm.update(self._to_bytes(data))
+            tmp_file_completion_flag = True
+            logging.debug(
+                "FileHashStore - _mktempfile: Object stream successfully written to tmp file: %s",
+                tmp.name,
+            )
+        # pylint: disable=W0718
+        except Exception as err:
+            exception_string = (
+                f"FileHashStore - _mktempfile: Unexpected {err=}, {type(err)=}"
+            )
+            logging.error(exception_string)
+        finally:
+            if not tmp_file_completion_flag:
+                try:
+                    os.remove(tmp.name)
+                # pylint: disable=W0718
+                except Exception as err:
+                    exception_string = (
+                        f"FileHashStore - _mktempfile: Unexpected {err=} while attempting to"
+                        + f" delete tmp file: {tmp.name}, {type(err)=}"
+                    )
+                    logging.error(exception_string)
 
         hex_digest_list = [
             hash_algorithm.hexdigest() for hash_algorithm in hash_algorithms

From b37e0c97a7cbb3a2935310361434609417d911a7 Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Sat, 30 Sep 2023 10:25:27 -0700
Subject: [PATCH 155/165] Add new (run-slow) pytest for checking temporary
 file cleanup

---
 tests/test_filehashstore_interface.py | 49 +++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py
index 7ac53d23..92b125cb 100644
--- a/tests/test_filehashstore_interface.py
+++ b/tests/test_filehashstore_interface.py
@@ -1,8 +1,11 @@
 """Test module for FileHashStore HashStore interface methods"""
 import io
+import os
 from pathlib import Path
 from threading import Thread
 import random
+import threading
+import time
 import pytest
 
 # Define a mark to be used to label slow tests
@@ -444,6 +447,52 @@ def store_object_wrapper(pid, path):
     assert file_exists_error_flag
 
 
+@slow_test
+def test_store_object_interrupt_process(store):
+    """Test that the tmp file created while storing a large object (2GB) is
+    cleaned up when the process is interrupted.
+    """
+    file_size = 2 * 1024 * 1024 * 1024  # 2GB
+    file_path = store.root + "/random_file_2.bin"
+
+    pid = "Testpid"
+    # Generate a random file with the specified size
+    with open(file_path, "wb") as file:
+        remaining_bytes = file_size
+        buffer_size = 1024 * 1024  # 1MB buffer size (adjust as needed)
+
+        while remaining_bytes > 0:
+            # Generate random data for the buffer
+            buffer = bytearray(random.getrandbits(8) for _ in range(buffer_size))
+            # Write the buffer to the file
+            bytes_to_write = min(buffer_size, remaining_bytes)
+            file.write(buffer[:bytes_to_write])
+            remaining_bytes -= bytes_to_write
+
+    interrupt_flag = False
+
+    def store_object_wrapper(pid, path):
+        print(store.root)
+        while not interrupt_flag:
+            store.store_object(pid, path)  # Call store_object inside the thread
+
+    # Create/start the thread
+    thread = threading.Thread(target=store_object_wrapper, args=(pid, file_path))
+    thread.start()
+
+    # Sleep for 5 seconds to let the thread run
+    time.sleep(5)
+
+    # Interrupt the thread
+    interrupt_flag = True
+
+    # Wait for the thread to finish
+    thread.join()
+
+    # Confirm no tmp objects found in objects/tmp directory
+    assert len(os.listdir(store.root + "/objects/tmp")) == 0
+
+
 @slow_test
 def test_store_object_large_file(store):
     """Test storing a large object (1GB). This test has also been executed with

From 543bfd86bc88a7bc527369aad9fe4c9fb520d067 Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Mon, 2 Oct 2023 13:16:21 -0700
Subject: [PATCH 156/165] Refactor '_mktmpfile' method

---
 src/hashstore/filehashstore.py | 51 +++++++++++++++++++++-------------
 1 file changed, 31 insertions(+), 20 deletions(-)

diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py
index e43f2bfb..d632152c 100644
--- a/src/hashstore/filehashstore.py
+++ b/src/hashstore/filehashstore.py
@@ -1,5 +1,6 @@
 """Core module for FileHashStore"""
 import atexit
+import atexit
 import io
 import shutil
 import threading
@@ -839,6 +840,11 @@ def _mktmpfile(self, stream, additional_algorithm=None, checksum_algorithm=None)
             hex_digest_dict (dictionary): Algorithms and their hex digests.
             tmp.name: Name of temporary file created and written into.
         """
+        # Review additional hash objects to digest and create new list
+        algorithm_list_to_calculate = self._refine_algorithm_list(
+            additional_algorithm, checksum_algorithm
+        )
+
         tmp_root_path = self.get_store_path("objects") / "tmp"
         # Physically create directory if it doesn't exist
         if os.path.exists(tmp_root_path) is False:
@@ -861,42 +867,56 @@ def delete_tmp_file():
         finally:
             os.umask(oldmask)
 
-        # Additional hash objects to digest
-        algorithm_list_to_calculate = self._refine_algorithm_list(
-            additional_algorithm, checksum_algorithm
-        )
-
         logging.debug(
             "FileHashStore - _mktempfile: tmp file created: %s, calculating hex digests.",
             tmp.name,
         )
 
-        hash_algorithms = [
-            hashlib.new(algorithm) for algorithm in algorithm_list_to_calculate
-        ]
-
-        # tmp is a file-like object that is already opened for writing by default
         tmp_file_completion_flag = False
         try:
+            hash_algorithms = [
+                hashlib.new(algorithm) for algorithm in algorithm_list_to_calculate
+            ]
+
+            # tmp is a file-like object that is already opened for writing by default
             with tmp as tmp_file:
                 for data in stream:
                     tmp_file.write(self._to_bytes(data))
                     for hash_algorithm in hash_algorithms:
                         hash_algorithm.update(self._to_bytes(data))
-            tmp_file_completion_flag = True
             logging.debug(
                 "FileHashStore - _mktempfile: Object stream successfully written to tmp file: %s",
                 tmp.name,
             )
+
+            hex_digest_list = [
+                hash_algorithm.hexdigest() for hash_algorithm in hash_algorithms
+            ]
+            hex_digest_dict = dict(zip(algorithm_list_to_calculate, hex_digest_list))
+            tmp_file_size = os.path.getsize(tmp.name)
+            # Ready for validation and atomic move
+            tmp_file_completion_flag = True
+
+            logging.debug("FileHashStore - _mktempfile: Hex digests calculated.")
+            return hex_digest_dict, tmp.name, tmp_file_size
         # pylint: disable=W0718
         except Exception as err:
             exception_string = (
                 f"FileHashStore - _mktempfile: Unexpected {err=}, {type(err)=}"
             )
             logging.error(exception_string)
+        except KeyboardInterrupt:
+            exception_string = (
+                "FileHashStore - _mktempfile: Keyboard interruption by user."
+            )
+            logging.error(exception_string)
+            if os.path.exists(tmp.name):
+                os.remove(tmp.name)
         finally:
             if not tmp_file_completion_flag:
                 try:
-                    os.remove(tmp.name)
+                    if os.path.exists(tmp.name):
+                        os.remove(tmp.name)
                 # pylint: disable=W0718
                 except Exception as err:
                     exception_string = (
@@ -905,15 +925,6 @@ def delete_tmp_file():
                     )
                     logging.error(exception_string)
 
-        hex_digest_list = [
-            hash_algorithm.hexdigest() for hash_algorithm in hash_algorithms
-        ]
-        hex_digest_dict = dict(zip(algorithm_list_to_calculate, hex_digest_list))
-        tmp_file_size = os.path.getsize(tmp.name)
-
-        logging.debug("FileHashStore - _mktempfile: Hex digests calculated.")
-        return hex_digest_dict, tmp.name, tmp_file_size
-
     def put_metadata(self, metadata, pid, format_id):
         """Store contents of metadata to `[self.root]/metadata` using the hash of the given
         pid and format_id as the permanent address.
From 6e5522a9cbc28b672134e592193fda0d921998ab Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Mon, 2 Oct 2023 14:25:35 -0700
Subject: [PATCH 157/165] Update client retrieve obj/metadata methods with an
 informative print statement and update pytest

---
 src/hashstore/client.py        |  2 ++
 tests/test_hashstore_client.py | 14 ++++++++++++--
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/hashstore/client.py b/src/hashstore/client.py
index 6259d96b..c65a1cc1 100644
--- a/src/hashstore/client.py
+++ b/src/hashstore/client.py
@@ -844,6 +844,7 @@
         object_content = object_stream.read(1000).decode("utf-8")
         object_stream.close()
         print(object_content)
+        print("...\n<-- Truncated for Display Purposes -->")
 
     elif (
         getattr(args, "client_retrievemetadata")
@@ -855,6 +856,7 @@
         metadata_content = metadata_stream.read(1000).decode("utf-8")
         metadata_stream.close()
         print(metadata_content)
+        print("...\n<-- Truncated for Display Purposes -->")
 
     elif getattr(args, "client_deleteobject") and pid is not None:
         # Delete object from HashStore
diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py
index 37a6a48e..7d73e524 100644
--- a/tests/test_hashstore_client.py
+++ b/tests/test_hashstore_client.py
@@ -128,7 +128,12 @@ def test_retrieve_objects(capsys, pids, store):
     client.main()
 
     object_stream = store.retrieve_object(pid)
-    object_content = object_stream.read(1000).decode("utf-8") + "\n"
+    object_content = (
+        object_stream.read(1000).decode("utf-8")
+        + "\n"
+        + "...\n<-- Truncated for Display Purposes -->"
+        + "\n"
+    )
     object_stream.close()
 
     capsystext = capsys.readouterr().out
@@ -165,7 +170,12 @@ def test_retrieve_metadata(capsys, pids, store):
     client.main()
 
     metadata_stream = store.retrieve_metadata(pid, namespace)
-    metadata_content = metadata_stream.read(1000).decode("utf-8") + "\n"
+    metadata_content = (
+        metadata_stream.read(1000).decode("utf-8")
+        + "\n"
+        + "...\n<-- Truncated for Display Purposes -->"
+        + "\n"
+    )
     metadata_stream.close()
 
     capsystext = capsys.readouterr().out

From 9bd735ee4816ac49c4126cff56d622832ff761aa Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Mon, 2 Oct 2023 14:50:45 -0700
Subject: [PATCH 158/165] Refactor client code calling Public API to raise
 exceptions when missing required options instead of directly exiting

---
 src/hashstore/client.py | 46 ++++++++++++++++++++++++-----------------
 1 file changed, 27 insertions(+), 19 deletions(-)

diff --git a/src/hashstore/client.py b/src/hashstore/client.py
index c65a1cc1..87d4c3ed 100644
--- a/src/hashstore/client.py
+++ b/src/hashstore/client.py
@@ -815,30 +815,40 @@ def main():
         raise FileNotFoundError(
             f"Directory to convert is None or does not exist: {directory_to_convert}."
         )
-    elif (
-        getattr(args, "client_getchecksum")
-        and pid is not None
-        and algorithm is not None
-    ):
+    elif getattr(args, "client_getchecksum"):
+        if pid is None:
+            raise ValueError("'-pid' option is required")
+        if algorithm is None:
+            raise ValueError("'-algo' option is required")
         # Calculate the hex digest of a given pid with algorithm supplied
         digest = hashstore_c.hashstore.get_hex_digest(pid, algorithm)
         print(f"guid/pid: {pid}")
         print(f"algorithm: {algorithm}")
         print(f"Checksum/Hex Digest: {digest}")
 
-    elif getattr(args, "client_storeobject") and pid is not None and path is not None:
+    elif getattr(args, "client_storeobject"):
+        if pid is None:
+            raise ValueError("'-pid' option is required")
+        if path is None:
+            raise ValueError("'-path' option is required")
         # Store object to HashStore
         object_metadata = hashstore_c.hashstore.store_object(
             pid, path, algorithm, checksum, checksum_algorithm, size
         )
         print(f"Object Metadata:\n{object_metadata}")
 
-    elif getattr(args, "client_storemetadata") and pid is not None and path is not None:
+    elif getattr(args, "client_storemetadata"):
+        if pid is None:
+            raise ValueError("'-pid' option is required")
+        if path is None:
+            raise ValueError("'-path' option is required")
         # Store metadata to HashStore
         metadata_cid = hashstore_c.hashstore.store_metadata(pid, path, formatid)
         print(f"Metadata ID: {metadata_cid}")
 
-    elif getattr(args, "client_retrieveobject") and pid is not None:
+    elif getattr(args, "client_retrieveobject"):
+        if pid is None:
+            raise ValueError("'-pid' option is required")
         # Retrieve object from HashStore and display the first 1000 bytes
         object_stream = hashstore_c.hashstore.retrieve_object(pid)
         object_content = object_stream.read(1000).decode("utf-8")
@@ -846,11 +856,9 @@ def main():
         print(object_content)
         print("...\n<-- Truncated for Display Purposes -->")
 
-    elif (
-        getattr(args, "client_retrievemetadata")
-        and pid is not None
-        and formatid is not None
-    ):
+    elif getattr(args, "client_retrievemetadata"):
+        if pid is None:
+            raise ValueError("'-pid' option is required")
         # Retrieve metadata from HashStore and display the first 1000 bytes
         metadata_stream = hashstore_c.hashstore.retrieve_metadata(pid, formatid)
         metadata_content = metadata_stream.read(1000).decode("utf-8")
@@ -858,16 +866,16 @@ def main():
         print(metadata_content)
         print("...\n<-- Truncated for Display Purposes -->")
 
-    elif getattr(args, "client_deleteobject") and pid is not None:
+    elif getattr(args, "client_deleteobject"):
+        if pid is None:
+            raise ValueError("'-pid' option is required")
         # Delete object from HashStore
         delete_status = hashstore_c.hashstore.delete_object(pid)
         print(f"Object Deleted (T/F): {delete_status}")
 
-    elif (
-        getattr(args, "client_deletemetadata")
-        and pid is not None
-        and formatid is not None
-    ):
+    elif getattr(args, "client_deletemetadata"):
+        if pid is None:
+            raise ValueError("'-pid' option is required")
         # Delete metadata from HashStore
         delete_status = hashstore_c.hashstore.delete_metadata(pid, formatid)
         print(

From c1defbb48a3a3a3f28f2a32779e753a65ba5afb5 Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Thu, 5 Oct 2023 11:49:27 -0700
Subject: [PATCH 159/165] Add raise exception statement in '_mktmpfile' to
 help with debugging and testing

---
 src/hashstore/filehashstore.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py
index d632152c..910b6987 100644
--- a/src/hashstore/filehashstore.py
+++ b/src/hashstore/filehashstore.py
@@ -905,6 +905,8 @@ def delete_tmp_file():
                 f"FileHashStore - _mktempfile: Unexpected {err=}, {type(err)=}"
             )
             logging.error(exception_string)
+            # pylint: disable=W0707,W0719
+            raise Exception(exception_string)
         except KeyboardInterrupt:
             exception_string = (
                 "FileHashStore - _mktempfile: Keyboard interruption by user."

From dfb8accd02bb6b9f38aa3261f95717786fd8e1b5 Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Thu, 5 Oct 2023 11:51:49 -0700
Subject: [PATCH 160/165] Move creation of file path and folders to before
 atomically moving file

---
 src/hashstore/filehashstore.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py
index 910b6987..054fb031 100644
--- a/src/hashstore/filehashstore.py
+++ b/src/hashstore/filehashstore.py
@@ -727,7 +727,6 @@ def _move_and_get_checksums(
         entity = "objects"
         object_cid = self.get_sha256_hex_digest(pid)
         abs_file_path = self.build_abs_path(entity, object_cid, extension)
-        self.create_path(os.path.dirname(abs_file_path))
 
         # Only create tmp file to be moved if target destination doesn't exist
         if os.path.isfile(abs_file_path):
@@ -765,7 +764,7 @@ def _move_and_get_checksums(
                 tmp_file_size,
                 file_size_to_validate,
             )
-
+        self.create_path(os.path.dirname(abs_file_path))
         try:
             debug_msg = (
                 "FileHashStore - _move_and_get_checksums: Moving temp file to permanent"

From 0c0586244f6cba6d28b15629d2db74e9a2ebc030 Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Thu, 12 Oct 2023 11:16:20 -0700
Subject: [PATCH 161/165] Add new tests for hashstore verification

---
 tests/test_filehashstore.py | 41 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py
index 0bfca1be..23c7e4a2 100644
--- a/tests/test_filehashstore.py
+++ b/tests/test_filehashstore.py
@@ -51,7 +51,7 @@ def test_init_write_properties_hashstore_yaml_exists(store):
     assert os.path.exists(store.hashstore_configuration_yaml)
 
 
-def test_init_with_existing_hashstore_mismatched_config(store):
+def test_init_with_existing_hashstore_mismatched_config_depth(store):
     """Test init with existing HashStore raises ValueError with mismatching properties."""
     properties = {
         "store_path": store.root,
@@ -64,6 +64,45 @@ def test_init_with_existing_hashstore_mismatched_config(store):
         FileHashStore(properties)
 
 
+def test_init_with_existing_hashstore_mismatched_config_width(store):
+    """Test init with existing HashStore raises ValueError with mismatching properties."""
+    properties = {
+        "store_path": store.root,
+        "store_depth": 3,
+        "store_width": 1,
+        "store_algorithm": "SHA-256",
+        "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0",
+    }
+    with pytest.raises(ValueError):
+        FileHashStore(properties)
+
+
+def test_init_with_existing_hashstore_mismatched_config_algo(store):
+    """Test init with existing HashStore raises ValueError with mismatching properties."""
+    properties = {
+        "store_path": store.root,
+        "store_depth": 3,
+        "store_width": 2,
+        "store_algorithm": "SHA-512",
+        "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0",
+    }
+    with pytest.raises(ValueError):
+        FileHashStore(properties)
+
+
+def test_init_with_existing_hashstore_mismatched_config_metadata_ns(store):
+    """Test init with existing HashStore raises ValueError with mismatching properties."""
+    properties = {
+        "store_path": store.root,
+        "store_depth": 3,
+        "store_width": 2,
+        "store_algorithm": "SHA-256",
+        "store_metadata_namespace": "http://ns.dataone.org/service/types/v5.0",
+    }
+    with pytest.raises(ValueError):
+        FileHashStore(properties)
+
+
 def test_init_with_existing_hashstore_missing_yaml(store, pids):
     """Test init with existing store raises FileNotFoundError when hashstore.yaml
     not found but objects exist."""

From 0b3ad8922cf0f5a5454100c1fa19f1a5cf0c6ce7 Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Thu, 12 Oct 2023 11:53:09 -0700
Subject: [PATCH 162/165] Remove 'store_path' as a key from hashstore config
 file

---
 src/hashstore/filehashstore.py | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py
index 054fb031..c22efe5b 100644
--- a/src/hashstore/filehashstore.py
+++ b/src/hashstore/filehashstore.py
@@ -1,6 +1,5 @@
 """Core module for FileHashStore"""
 import atexit
-import atexit
 import io
 import shutil
 import threading
@@ -207,7 +206,6 @@ def write_properties(self, properties):
 
         # .yaml file to write
         hashstore_configuration_yaml = self._build_hashstore_yaml_string(
-            store_path,
             store_depth,
             store_width,
             checked_store_algorithm,
@@ -227,7 +225,7 @@ def write_properties(self, properties):
 
     @staticmethod
     def _build_hashstore_yaml_string(
-        store_path, store_depth, store_width, store_algorithm, store_metadata_namespace
+        store_depth, store_width, store_algorithm, store_metadata_namespace
     ):
         """Build a YAML string representing the configuration for a HashStore.
 
@@ -245,10 +243,6 @@ def _build_hashstore_yaml_string(
         hashstore_configuration_yaml = f"""
         # Default configuration variables for HashStore
 
-        ############### Store Path ###############
-        # Default path for `FileHashStore` if no path is provided
-        store_path: "{store_path}"
-
         ############### Directory Structure ###############
         # Desired amount of directories when sharding an object to form the permanent address
        store_depth: {store_depth}  # WARNING: DO NOT CHANGE UNLESS SETTING UP NEW HASHSTORE
@@ -305,10 +299,10 @@ def _verify_hashstore_properties(self, properties, prop_store_path):
             # If 'hashstore.yaml' is found, verify given properties before init
             hashstore_yaml_dict = self.load_properties()
             for key in self.property_required_keys:
-                checked_key = properties[key]
+                supplied_key = properties[key]
                 if key == "store_depth" or key == "store_width":
-                    checked_key = int(properties[key])
-                if hashstore_yaml_dict[key] != checked_key:
+                    supplied_key = int(properties[key])
+                if hashstore_yaml_dict[key] != supplied_key:
                     exception_string = (
                         f"FileHashStore - Given properties ({key}: {properties[key]}) does not"
                         + f" match. HashStore configuration ({key}: {hashstore_yaml_dict[key]})"

From 29c4d839cd342b69abd1a7aa28d14f29aead96e7 Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Thu, 12 Oct 2023 12:46:41 -0700
Subject: [PATCH 163/165] Refactor 'FileHashStore' init process for removal of
 'store_path' from config yaml

---
 src/hashstore/filehashstore.py | 31 ++++++++++++++++---------------
 tests/test_filehashstore.py    |  1 -
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py
index c22efe5b..87f652e7 100644
--- a/src/hashstore/filehashstore.py
+++ b/src/hashstore/filehashstore.py
@@ -127,7 +127,6 @@ def load_properties(self):
         Returns:
             hashstore_yaml_dict (dict): HashStore properties with the following keys (and values):
-                store_path (str): Path to the HashStore directory.
                 store_depth (int): Depth when sharding an object's hex digest.
                 store_width (int): Width of directories when sharding an object's hex digest.
                 store_algorithm (str): Hash algorithm used for calculating the object's hex digest.
@@ -147,7 +146,8 @@ def load_properties(self):
         # Get hashstore properties
         hashstore_yaml_dict = {}
         for key in self.property_required_keys:
-            hashstore_yaml_dict[key] = yaml_data[key]
+            if key != "store_path":
+                hashstore_yaml_dict[key] = yaml_data[key]
         logging.debug(
             "FileHashStore - load_properties: Successfully retrieved 'hashstore.yaml' properties."
         )
@@ -159,7 +159,6 @@ def write_properties(self, properties):
 
         Args:
             properties (dict): A python dictionary with the following keys (and values):
-                store_path (str): Path to the HashStore directory.
                 store_depth (int): Depth when sharding an object's hex digest.
                 store_width (int): Width of directories when sharding an object's hex digest.
                 store_algorithm (str): Hash algorithm used for calculating the object's hex digest.
@@ -178,7 +177,7 @@ def write_properties(self, properties):
 
         # Collect configuration properties from validated & supplied dictionary
         (
-            store_path,
+            _,
             store_depth,
             store_width,
             store_algorithm,
@@ -299,17 +298,19 @@ def _verify_hashstore_properties(self, properties, prop_store_path):
             # If 'hashstore.yaml' is found, verify given properties before init
             hashstore_yaml_dict = self.load_properties()
             for key in self.property_required_keys:
-                supplied_key = properties[key]
-                if key == "store_depth" or key == "store_width":
-                    supplied_key = int(properties[key])
-                if hashstore_yaml_dict[key] != supplied_key:
-                    exception_string = (
-                        f"FileHashStore - Given properties ({key}: {properties[key]}) does not"
-                        + f" match. HashStore configuration ({key}: {hashstore_yaml_dict[key]})"
-                        + f" found at: {self.hashstore_configuration_yaml}"
-                    )
-                    logging.critical(exception_string)
-                    raise ValueError(exception_string)
+                # 'store_path' is required to init HashStore but not saved in `hashstore.yaml`
+                if key != "store_path":
+                    supplied_key = properties[key]
+                    if key == "store_depth" or key == "store_width":
+                        supplied_key = int(properties[key])
+                    if hashstore_yaml_dict[key] != supplied_key:
+                        exception_string = (
+                            f"FileHashStore - Given properties ({key}: {properties[key]}) does not"
+                            + f" match. HashStore configuration ({key}: {hashstore_yaml_dict[key]})"
+                            + f" found at: {self.hashstore_configuration_yaml}"
+                        )
+                        logging.critical(exception_string)
+                        raise ValueError(exception_string)
         else:
             if os.path.exists(prop_store_path):
                 # Check if HashStore exists and throw exception if found
diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py
index 23c7e4a2..a2f0fdfe 100644
--- a/tests/test_filehashstore.py
+++ b/tests/test_filehashstore.py
@@ -125,7 +125,6 @@ def test_load_properties(store):
     """Verify dictionary returned from load_properties matches initialization."""
     hashstore_yaml_dict = store.load_properties()
-    assert hashstore_yaml_dict.get("store_path") == store.root
     assert hashstore_yaml_dict.get("store_depth") == 3
     assert hashstore_yaml_dict.get("store_width") == 2
     assert hashstore_yaml_dict.get("store_algorithm") == "SHA-256"

From e097de0dc9ed600e347cdd2e391e5995b996695a Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Thu, 12 Oct 2023 13:00:52 -0700
Subject: [PATCH 164/165] Refactor client main method to handle 'store_path'
 being removed from hashstore config file

---
 src/hashstore/client.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/hashstore/client.py b/src/hashstore/client.py
index 87d4c3ed..c1e2e4b6 100644
--- a/src/hashstore/client.py
+++ b/src/hashstore/client.py
@@ -191,14 +191,12 @@ def load_store_properties(self, hashstore_yaml):
         Returns:
             hashstore_yaml_dict (dict): HashStore properties with the following keys (and values):
-                store_path (str): Path to the HashStore directory.
                 store_depth (int): Depth when sharding an object's hex digest.
                 store_width (int): Width of directories when sharding an object's hex digest.
                 store_algorithm (str): Hash algorithm used for calculating the object's hex digest.
                 store_metadata_namespace (str): Namespace for the HashStore's system metadata.
         """
         property_required_keys = [
-            "store_path",
             "store_depth",
             "store_width",
             "store_algorithm",
@@ -778,6 +776,8 @@ def main():
     knbvm_test = getattr(args, "knbvm_flag")
     # Instantiate HashStore Client
     props = parser.load_store_properties(store_path_config_yaml)
+    # Reminder: 'hashstore.yaml' only contains 4 of the required 5 properties
+    props["store_path"] = store_path
     hashstore_c = HashStoreClient(props, knbvm_test)
     if knbvm_test:
         directory_to_convert = getattr(args, "source_directory")

From 94778e50ec26ea525b1471d69de7931d81b10f7c Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Thu, 12 Oct 2023 14:13:14 -0700
Subject: [PATCH 165/165] Update README.md

---
 README.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/README.md b/README.md
index a45c36ef..0ddaac61 100644
--- a/README.md
+++ b/README.md
@@ -38,9 +38,8 @@ from hashstore import HashStoreFactory
 hashstore_factory = HashStoreFactory()
 
 # Create a properties dictionary with the required fields
-hashstore_path = "/path/to/your/store"
 properties = {
-    "store_path": hashstore_path,
+    "store_path": "/path/to/your/store",
     "store_depth": 3,
     "store_width": 2,
     "store_algorithm": "sha256",
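
After PATCH 162 through 164, 'hashstore.yaml' persists only four of the five required properties, so a caller must re-supply 'store_path' at runtime, exactly as the client's main() now does. A minimal sketch of that hand-off, assuming the config file sits at the store root, PyYAML is available, and the import path follows the repository layout; the store path shown is an example:

```python
# Sketch: rebuild the full five-property dict now that 'store_path' is no
# longer persisted in 'hashstore.yaml'. Path and filenames are illustrative.
import yaml

from hashstore.filehashstore import FileHashStore

store_path = "/path/to/your/store"  # supplied at runtime, not read from config
with open(f"{store_path}/hashstore.yaml", "r", encoding="utf-8") as hs_yaml:
    # Yields store_depth, store_width, store_algorithm, store_metadata_namespace
    properties = yaml.safe_load(hs_yaml)

# 'store_path' is required to init FileHashStore but is not saved in the config
properties["store_path"] = store_path
store = FileHashStore(properties)
```

Because '_verify_hashstore_properties' now skips the 'store_path' key when comparing against the loaded config, the same store can be mounted from a moved or remounted directory without tripping the mismatch check.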