diff --git a/.gitignore b/.gitignore index 09ccd077..c2a663ae 100644 --- a/.gitignore +++ b/.gitignore @@ -131,6 +131,7 @@ venv/ ENV/ env.bak/ venv.bak/ +.idea # Spyder project settings .spyderproject diff --git a/src/hashstore/client.py b/src/hashstore/client.py index c1e2e4b6..dac73fcf 100644 --- a/src/hashstore/client.py +++ b/src/hashstore/client.py @@ -392,7 +392,7 @@ def validate_object(self, obj_tuple): obj_db_checksum = obj_tuple[2] with self.hashstore.retrieve_object(pid_guid) as obj_stream: - computed_digest = self.hashstore.computehash(obj_stream, algo) + computed_digest = self.hashstore.get_hex_digest(obj_stream, algo) obj_stream.close() if computed_digest != obj_db_checksum: diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 87f652e7..6e6c11bb 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -10,6 +10,7 @@ from pathlib import Path from contextlib import closing from tempfile import NamedTemporaryFile +import fcntl import yaml from hashstore import HashStore, ObjectMetadata @@ -17,8 +18,7 @@ class FileHashStore(HashStore): """FileHashStore is a content addressable file manager based on Derrick Gilland's 'hashfs' library. It supports the storage of objects on disk using - an authority-based identifier's hex digest with a given hash algorithm value - to address files. + a content identifier to address files. FileHashStore initializes using a given properties dictionary containing the required keys (see Args). Upon initialization, FileHashStore verifies the provided @@ -61,8 +61,10 @@ class FileHashStore(HashStore): time_out_sec = 1 object_lock = threading.Lock() metadata_lock = threading.Lock() + reference_lock = threading.Lock() object_locked_pids = [] metadata_locked_pids = [] + reference_locked_cids = [] def __init__(self, properties=None): if properties: @@ -86,8 +88,6 @@ def __init__(self, properties=None): # If no exceptions thrown, FileHashStore ready for initialization logging.debug("FileHashStore - Initializing, properties verified.") self.root = prop_store_path - if not os.path.exists(self.root): - self.create_path(self.root) self.depth = prop_store_depth self.width = prop_store_width self.sysmeta_ns = prop_store_metadata_namespace @@ -104,10 +104,15 @@ def __init__(self, properties=None): # Complete initialization/instantiation by setting and creating store directories self.objects = self.root + "/objects" self.metadata = self.root + "/metadata" + self.refs = self.root + "/refs" if not os.path.exists(self.objects): self.create_path(self.objects + "/tmp") if not os.path.exists(self.metadata): self.create_path(self.metadata + "/tmp") + if not os.path.exists(self.refs): + self.create_path(self.refs + "/tmp") + self.create_path(self.refs + "/pid") + self.create_path(self.refs + "/cid") logging.debug( "FileHashStore - Initialization success. Store root: %s", self.root ) @@ -146,7 +151,7 @@ def load_properties(self): # Get hashstore properties hashstore_yaml_dict = {} for key in self.property_required_keys: - if key is not "store_path": + if key != "store_path": hashstore_yaml_dict[key] = yaml_data[key] logging.debug( "FileHashStore - load_properties: Successfully retrieved 'hashstore.yaml' properties." @@ -196,13 +201,16 @@ def write_properties(self, properties): else: exception_string = ( f"FileHashStore - write_properties: algorithm supplied ({store_algorithm})" - + " cannot be used as default for HashStore. 
Must be one of:" - + " MD5, SHA-1, SHA-256, SHA-384, SHA-512 which are DataONE" - + " controlled algorithm values" + f" cannot be used as default for HashStore. Must be one of: {', '.join(accepted_store_algorithms)}" + f" which are DataONE controlled algorithm values" ) logging.error(exception_string) raise ValueError(exception_string) + # If given store path doesn't exist yet, create it. + if not os.path.exists(self.root): + self.create_path(self.root) + # .yaml file to write hashstore_configuration_yaml = self._build_hashstore_yaml_string( store_depth, @@ -229,7 +237,6 @@ def _build_hashstore_yaml_string( """Build a YAML string representing the configuration for a HashStore. Args: - store_path (str): Path to the HashStore directory. store_depth (int): Depth when sharding an object's hex digest. store_width (int): Width of directories when sharding an object's hex digest. store_algorithm (str): Hash algorithm used for calculating the object's hex digest. @@ -299,7 +306,7 @@ def _verify_hashstore_properties(self, properties, prop_store_path): hashstore_yaml_dict = self.load_properties() for key in self.property_required_keys: # 'store_path' is required to init HashStore but not saved in `hashstore.yaml` - if key is not "store_path": + if key != "store_path": supplied_key = properties[key] if key == "store_depth" or key == "store_width": supplied_key = int(properties[key]) @@ -365,7 +372,7 @@ def _validate_properties(self, properties): def _set_default_algorithms(self): """Set the default algorithms to calculate when storing objects.""" - def lookup_algo(algo): + def lookup_algo(algo_to_translate): """Translate DataONE controlled algorithms to python hashlib values: https://dataoneorg.github.io/api-documentation/apis/Types.html#Types.ChecksumAlgorithm """ @@ -376,7 +383,7 @@ def lookup_algo(algo): "SHA-384": "sha384", "SHA-512": "sha512", } - return dataone_algo_translation[algo] + return dataone_algo_translation[algo_to_translate] if not os.path.exists(self.hashstore_configuration_yaml): exception_string = ( @@ -404,77 +411,227 @@ def lookup_algo(algo): def store_object( self, - pid, - data, + pid=None, + data=None, additional_algorithm=None, checksum=None, checksum_algorithm=None, expected_object_size=None, ): + if pid is None and self._validate_arg_data(data): + # If no pid is supplied, store the object only without tagging + logging.debug("FileHashStore - store_object: Request to store data only.") + object_metadata = self.store_data_only(data) + logging.info( + "FileHashStore - store_object: Successfully stored object for cid: %s", + object_metadata.id, + ) + else: + # Else the object will be stored and tagged + logging.debug( + "FileHashStore - store_object: Request to store object for pid: %s", pid + ) + # Validate input parameters + self._validate_string(pid, "pid", "store_object") + self._validate_arg_data(data) + self._is_int_and_non_negative(expected_object_size) + ( + additional_algorithm_checked, + checksum_algorithm_checked, + ) = self._validate_arg_algorithms_and_checksum( + additional_algorithm, checksum, checksum_algorithm + ) + + # Wait for the pid to release if it's in use + while pid in self.object_locked_pids: + logging.debug( + "FileHashStore - store_object: %s is currently being stored. 
Waiting.", + pid, + ) + time.sleep(self.time_out_sec) + # Modify object_locked_pids consecutively + with self.object_lock: + logging.debug( + "FileHashStore - store_object: Adding pid: %s to object_locked_pids.", + pid, + ) + self.object_locked_pids.append(pid) + try: + logging.debug( + "FileHashStore - store_object: Attempting to store object for pid: %s", + pid, + ) + object_metadata = self.store_and_validate_data( + pid, + data, + additional_algorithm=additional_algorithm_checked, + checksum=checksum, + checksum_algorithm=checksum_algorithm_checked, + file_size_to_validate=expected_object_size, + ) + self.tag_object(pid, object_metadata.id) + logging.info( + "FileHashStore - store_object: Successfully stored object for pid: %s", + pid, + ) + finally: + # Release pid + with self.object_lock: + logging.debug( + "FileHashStore - store_object: Removing pid: %s from object_locked_pids.", + pid, + ) + self.object_locked_pids.remove(pid) + + return object_metadata + + def verify_object( + self, object_metadata, checksum, checksum_algorithm, expected_file_size + ): + self._validate_string(checksum, "checksum", "verify_object") + self._validate_string(checksum_algorithm, "checksum_algorithm", "verify_object") + self._is_int_and_non_negative(expected_file_size) + if object_metadata is None or not isinstance(object_metadata, ObjectMetadata): + exception_string = ( + "FileHashStore - verify_object: 'object_metadata' cannot be None." + + " Must be a 'ObjectMetadata' object." + ) + logging.error(exception_string) + raise ValueError(exception_string) + else: + logging.info( + "FileHashStore - verify_object: Called to verify object with id: %s", + object_metadata.id, + ) + object_metadata_hex_digests = object_metadata.hex_digests + object_metadata_file_size = object_metadata.obj_size + checksum_algorithm_checked = self.clean_algorithm(checksum_algorithm) + self._validate_arg_object( + pid=None, + checksum=checksum, + checksum_algorithm=checksum_algorithm_checked, + entity=None, + hex_digests=object_metadata_hex_digests, + tmp_file_name=None, + tmp_file_size=object_metadata_file_size, + file_size_to_validate=expected_file_size, + ) + logging.info( + "FileHashStore - verify_object: object has been validated for cid: %s", + object_metadata.id, + ) + + def tag_object(self, pid, cid): logging.debug( - "FileHashStore - store_object: Request to store object for pid: %s", pid - ) - # Validate input parameters - self._is_string_none_or_empty(pid, "pid", "store_object") - self._validate_data_to_store(data) - self._validate_file_size(expected_object_size) - ( - additional_algorithm_checked, - checksum_algorithm_checked, - ) = self._validate_algorithms_and_checksum( - additional_algorithm, checksum, checksum_algorithm + "FileHashStore - tag_object: Tagging object cid: {%s} with pid: {%s}.", + cid, + pid, ) - - # Wait for the pid to release if it's in use - while pid in self.object_locked_pids: + self._validate_string(pid, "pid", "tag_object") + self._validate_string(cid, "cid", "tag_object") + # Wait for the cid to release if it's being tagged + while cid in self.reference_locked_cids: logging.debug( - "FileHashStore - store_object: %s is currently being stored. Waiting.", - pid, + "FileHashStore - tag_object: (cid) %s is currently locked. 
Waiting.", + cid, ) time.sleep(self.time_out_sec) - # Modify object_locked_pids consecutively - with self.object_lock: + # Modify reference_locked_cids consecutively + with self.reference_lock: logging.debug( - "FileHashStore - store_object: Adding pid: %s to object_locked_pids.", - pid, + "FileHashStore - tag_object: Adding cid: %s to reference_locked_cids.", + cid, ) - self.object_locked_pids.append(pid) + self.reference_locked_cids.append(cid) try: - logging.debug( - "FileHashStore - store_object: Attempting to store object for pid: %s", - pid, - ) - object_metadata = self.put_object( - pid, - data, - additional_algorithm=additional_algorithm_checked, - checksum=checksum, - checksum_algorithm=checksum_algorithm_checked, - file_size_to_validate=expected_object_size, - ) + pid_ref_abs_path = self.get_refs_abs_path("pid", pid) + cid_ref_abs_path = self.get_refs_abs_path("cid", cid) + if os.path.exists(pid_ref_abs_path): + # A pid reference file can only contain one cid + exception_string = ( + "FileHashStore - write_pid_refs_file: pid ref file already exists for %s", + pid_ref_abs_path, + ) + logging.error(exception_string) + raise FileExistsError(exception_string) + elif os.path.exists(cid_ref_abs_path): + # Update cid ref files if it already exists + self._update_cid_refs(cid_ref_abs_path, pid) + logging.info( + "FileHashStore - tag_object: Successfully updated cid: %s with pid: %s", + cid, + pid, + ) + else: + # All ref files begin as tmp files and get moved sequentially at once + # Ensure refs tmp folder exists + tmp_root_path = self.get_store_path("refs") / "tmp" + if os.path.exists(tmp_root_path) is False: + self.create_path(tmp_root_path) + + # Then write pid_refs_file content into tmp file + pid_tmp_file = self._mktmpfile(tmp_root_path) + pid_tmp_file_path = pid_tmp_file.name + self._write_pid_refs_file(pid_tmp_file_path, cid) + # Then write cid_refs_file content into another tmp file + cid_tmp_file = self._mktmpfile(tmp_root_path) + cid_tmp_file_path = cid_tmp_file.name + self._write_cid_refs_file(cid_tmp_file_path, pid) + + # Create path for pid ref file in '.../refs/pid' + self.create_path(os.path.dirname(pid_ref_abs_path)) + # Create path for cid ref file in '.../refs/cid' + self.create_path(os.path.dirname(cid_ref_abs_path)) + # Move both files + shutil.move(pid_tmp_file_path, pid_ref_abs_path) + shutil.move(cid_tmp_file_path, cid_ref_abs_path) + # Ensure that the reference files have been written as expected + # If there is an issue, client or user will have to manually review + self._verify_hashstore_references(pid, cid) + + logging.info( + "FileHashStore - tag_object: Successfully tagged cid: %s with pid %s", + cid, + pid, + ) + return True finally: - # Release pid - with self.object_lock: + # Release cid + with self.reference_lock: logging.debug( - "FileHashStore - store_object: Removing pid: %s from object_locked_pids.", - pid, + "FileHashStore - tag_object: Removing cid: %s from reference_locked_cids.", + cid, ) - self.object_locked_pids.remove(pid) - logging.info( - "FileHashStore - store_object: Successfully stored object for pid: %s", - pid, + self.reference_locked_cids.remove(cid) + + def find_object(self, pid): + logging.debug( + "FileHashStore - find_object: Request to find object for for pid: %s", pid + ) + self._validate_string(pid, "pid", "find_object") + + pid_ref_abs_path = self.get_refs_abs_path("pid", pid) + if not os.path.exists(pid_ref_abs_path): + err_msg = ( + f"FileHashStore - find_object: pid ({pid}) reference file not found: " + + pid_ref_abs_path, ) 
+ raise FileNotFoundError(err_msg) + else: + # Read the file to get the cid from the pid reference + with open(pid_ref_abs_path, "r", encoding="utf8") as f: + pid_refs_cid = f.read() - return object_metadata + return pid_refs_cid def store_metadata(self, pid, metadata, format_id=None): logging.debug( "FileHashStore - store_metadata: Request to store metadata for pid: %s", pid ) # Validate input parameters - self._is_string_none_or_empty(pid, "pid", "store_metadata") - checked_format_id = self._validate_format_id(format_id, "store_metadata") - self._validate_metadata_to_store(metadata) + self._validate_string(pid, "pid", "store_metadata") + checked_format_id = self._validate_arg_format_id(format_id, "store_metadata") + self._validate_arg_data(metadata) # Wait for the pid to release if it's in use while pid in self.metadata_locked_pids: @@ -498,6 +655,12 @@ def store_metadata(self, pid, metadata, format_id=None): pid, ) metadata_cid = self.put_metadata(metadata, pid, checked_format_id) + + logging.info( + "FileHashStore - store_metadata: Successfully stored metadata for pid: %s", + pid, + ) + return metadata_cid finally: # Release pid with self.metadata_lock: @@ -506,25 +669,18 @@ def store_metadata(self, pid, metadata, format_id=None): pid, ) self.metadata_locked_pids.remove(pid) - logging.info( - "FileHashStore - store_metadata: Successfully stored metadata for pid: %s", - pid, - ) - - return metadata_cid def retrieve_object(self, pid): logging.debug( "FileHashStore - retrieve_object: Request to retrieve object for pid: %s", pid, ) - self._is_string_none_or_empty(pid, "pid", "retrieve_object") + self._validate_string(pid, "pid", "retrieve_object") + object_cid = self.find_object(pid) entity = "objects" - object_cid = self.get_sha256_hex_digest(pid) - object_exists = self.exists(entity, object_cid) - if object_exists: + if object_cid: logging.debug( "FileHashStore - retrieve_object: Metadata exists for pid: %s, retrieving object.", pid, @@ -547,11 +703,11 @@ def retrieve_metadata(self, pid, format_id=None): "FileHashStore - retrieve_metadata: Request to retrieve metadata for pid: %s", pid, ) - self._is_string_none_or_empty(pid, "pid", "retrieve_metadata") - checked_format_id = self._validate_format_id(format_id, "retrieve_metadata") + self._validate_string(pid, "pid", "retrieve_metadata") + checked_format_id = self._validate_arg_format_id(format_id, "retrieve_metadata") entity = "metadata" - metadata_cid = self.get_sha256_hex_digest(pid + checked_format_id) + metadata_cid = self.computehash(pid + checked_format_id) metadata_exists = self.exists(entity, metadata_cid) if metadata_exists: metadata_stream = self.open(entity, metadata_cid) @@ -571,28 +727,62 @@ def delete_object(self, pid): logging.debug( "FileHashStore - delete_object: Request to delete object for pid: %s", pid ) - self._is_string_none_or_empty(pid, "pid", "delete_object") - - entity = "objects" - object_cid = self.get_sha256_hex_digest(pid) - self.delete(entity, object_cid) + self._validate_string(pid, "pid", "delete_object") + cid = self.find_object(pid) - logging.info( - "FileHashStore - delete_object: Successfully deleted object for pid: %s", - pid, - ) - return True + while cid in self.reference_locked_cids: + logging.debug( + "FileHashStore - delete_object: (cid) %s is currently locked. 
Waiting", + cid, + ) + time.sleep(self.time_out_sec) + # Modify reference_locked_cids consecutively + with self.reference_lock: + logging.debug( + "FileHashStore - delete_object: Adding cid: %s to reference_locked_cids.", + cid, + ) + self.reference_locked_cids.append(cid) + try: + # Remove pid from cid reference file + cid_ref_abs_path = self.get_refs_abs_path("cid", cid) + self._delete_cid_refs_pid(cid_ref_abs_path, pid) + # Delete cid reference file + # If the file is not empty, it will not be deleted. + cid_refs_deleted = self._delete_cid_refs_file(cid_ref_abs_path) + # Delete pid reference file + pid_ref_abs_path = self.get_refs_abs_path("pid", pid) + self._delete_pid_refs_file(pid_ref_abs_path) + # Finally, delete the object + if cid_refs_deleted: + entity = "objects" + self.delete(entity, cid) + + info_string = ( + "FileHashStore - delete_object: Successfully deleted references and/or" + + f" objects associated with pid: {pid}" + ) + logging.info(info_string) + return True + finally: + # Release cid + with self.reference_lock: + logging.debug( + "FileHashStore - delete_object: Removing cid: %s from reference_locked_cids.", + cid, + ) + self.reference_locked_cids.remove(cid) def delete_metadata(self, pid, format_id=None): logging.debug( "FileHashStore - delete_metadata: Request to delete metadata for pid: %s", pid, ) - self._is_string_none_or_empty(pid, "pid", "delete_metadata") - checked_format_id = self._validate_format_id(format_id, "delete_metadata") + self._validate_string(pid, "pid", "delete_metadata") + checked_format_id = self._validate_arg_format_id(format_id, "delete_metadata") entity = "metadata" - metadata_cid = self.get_sha256_hex_digest(pid + checked_format_id) + metadata_cid = self.computehash(pid + checked_format_id) self.delete(entity, metadata_cid) logging.info( @@ -606,12 +796,12 @@ def get_hex_digest(self, pid, algorithm): "FileHashStore - get_hex_digest: Request to get hex digest for object with pid: %s", pid, ) - self._is_string_none_or_empty(pid, "pid", "get_hex_digest") - self._is_string_none_or_empty(algorithm, "algorithm", "get_hex_digest") + self._validate_string(pid, "pid", "get_hex_digest") + self._validate_string(algorithm, "algorithm", "get_hex_digest") entity = "objects" algorithm = self.clean_algorithm(algorithm) - object_cid = self.get_sha256_hex_digest(pid) + object_cid = self.find_object(pid) if not self.exists(entity, object_cid): exception_string = ( f"FileHashStore - get_hex_digest: No object found for pid: {pid}" @@ -621,16 +811,16 @@ def get_hex_digest(self, pid, algorithm): cid_stream = self.open(entity, object_cid) hex_digest = self.computehash(cid_stream, algorithm=algorithm) - info_msg = ( + info_string = ( f"FileHashStore - get_hex_digest: Successfully calculated hex digest for pid: {pid}." + f" Hex Digest: {hex_digest}", ) - logging.info(info_msg) + logging.info(info_string) return hex_digest # FileHashStore Core Methods - def put_object( + def store_and_validate_data( self, pid, file, @@ -640,7 +830,8 @@ def put_object( checksum_algorithm=None, file_size_to_validate=None, ): - """Store contents of `file` on disk using the hash of the given pid + """Store contents of `file` on disk using, validate the object's parameters if + provided and tag/reference the object. Args: pid (string): Authority-based identifier. \n @@ -656,7 +847,7 @@ def put_object( Returns: object_metadata (ObjectMetadata): object that contains the object id, - object file size, duplicate file boolean and hex digest dictionary. 
+ object file size and hex digest dictionary. """ stream = Stream(file) @@ -685,6 +876,57 @@ def put_object( ) return object_metadata + def store_data_only(self, data): + """Store an object to HashStore and return the id and a hex digest + dictionary of the default algorithms. This method does not validate the + object and writes directly to /objects after the hex digests are calculated. + + Args: + data (mixed): String or path to object. + + Raises: + IOError: If object fails to store + FileExistsError: If file already exists + + Returns: + object_metadata (ObjectMetadata): object that contains the object id, + object file size and hex digest dictionary. + """ + logging.debug( + "FileHashStore - store_object: Request to store data object only." + ) + + try: + # Ensure the data is a stream + stream = Stream(data) + + # Get the hex digest dictionary + with closing(stream): + ( + object_ref_pid_location, + obj_file_size, + hex_digest_dict, + ) = self._move_and_get_checksums(None, stream) + + object_metadata = ObjectMetadata( + object_ref_pid_location, obj_file_size, hex_digest_dict + ) + # The permanent address of the data stored is based on the data's checksum + cid = hex_digest_dict.get(self.algorithm) + logging.debug( + "FileHashStore - store_object: Successfully stored object with cid: %s", + cid, + ) + return object_metadata + # pylint: disable=W0718 + except Exception as err: + exception_string = ( + "FileHashStore - store_object: failed to store object." + + f" Unexpected {err=}, {type(err)=}" + ) + logging.error(exception_string) + raise err + def _move_and_get_checksums( self, pid, @@ -716,29 +958,18 @@ def _move_and_get_checksums( file_size_to_validate (bytes, optional): Expected size of object Returns: - object_metadata (tuple): object id, object file size, duplicate file - boolean and hex digest dictionary. + object_metadata (tuple): object id, object file size and hex digest dictionary. """ - entity = "objects" - object_cid = self.get_sha256_hex_digest(pid) - abs_file_path = self.build_abs_path(entity, object_cid, extension) - - # Only create tmp file to be moved if target destination doesn't exist - if os.path.isfile(abs_file_path): - exception_string = ( - "FileHashStore - _move_and_get_checksums: File already exists" - + f" for pid: {pid} at {abs_file_path}" - ) - logging.error(exception_string) - raise FileExistsError(exception_string) - - # Create temporary file and calculate hex digests debug_msg = ( "FileHashStore - _move_and_get_checksums: Creating temp" + f" file and calculating checksums for pid: {pid}" ) logging.debug(debug_msg) - hex_digests, tmp_file_name, tmp_file_size = self._mktmpfile( + ( + hex_digests, + tmp_file_name, + tmp_file_size, + ) = self._write_to_tmp_file_and_get_hex_digests( stream, additional_algorithm, checksum_algorithm ) logging.debug( @@ -746,10 +977,16 @@ def _move_and_get_checksums( tmp_file_name, ) - # Only move file if it doesn't exist. - # Files are stored once and only once + # Objects are stored with their content identifier based on the store algorithm + entity = "objects" + object_cid = hex_digests.get(self.algorithm) + abs_file_path = self.build_abs_path(entity, object_cid, extension) + + # Only move file if it doesn't exist. We do not check before we create the tmp + # file and calculate the hex digests because the given checksum could be incorrect. 
if not os.path.isfile(abs_file_path): - self._validate_object( + # Files are stored once and only once + self._validate_arg_object( pid, checksum, checksum_algorithm, @@ -784,12 +1021,12 @@ def _move_and_get_checksums( pid_checksum = self.get_hex_digest(pid, self.algorithm) if pid_checksum == hex_digests.get(self.algorithm): # If the checksums match, return and log warning - warning_msg = ( + exception_string = ( "FileHashStore - _move_and_get_checksums: File moved" + f" successfully but unexpected issue encountered: {exception_string}", ) - logging.warning(warning_msg) - return + logging.error(exception_string) + raise err else: debug_msg = ( "FileHashStore - _move_and_get_checksums: Permanent file" @@ -810,23 +1047,27 @@ def _move_and_get_checksums( raise else: # Else delete temporary file - warning_msg = ( - f"FileHashStore - _move_and_get_checksums: Object exists at: {abs_file_path}," - + " deleting temporary file." + exception_string = ( + "FileHashStore - _move_and_get_checksums: Object already exists at:" + + f" {abs_file_path}, deleting temporary file." ) - logging.warning(warning_msg) + logging.error(exception_string) self.delete(entity, tmp_file_name) + raise FileExistsError(exception_string) - return (object_cid, tmp_file_size, hex_digests) + return object_cid, tmp_file_size, hex_digests - def _mktmpfile(self, stream, additional_algorithm=None, checksum_algorithm=None): + def _write_to_tmp_file_and_get_hex_digests( + self, stream, additional_algorithm=None, checksum_algorithm=None + ): """Create a named temporary file from a `Stream` object and return its filename - and a dictionary of its algorithms and hex digests. If an additionak and/or checksum - algorithm is provided, it will add the respective hex digest to the dictionary. + and a dictionary of its algorithms and hex digests. If an additional and/or checksum + algorithm is provided, it will add the respective hex digest to the dictionary if + it is supported. Args: stream (io.BufferedReader): Object stream. 
- algorithm (string): Algorithm of additional hex digest to generate + additional_algorithm (string): Algorithm of additional hex digest to generate checksum_algorithm (string): Algorithm of additional checksum algo to generate Returns: @@ -843,26 +1084,11 @@ def _mktmpfile(self, stream, additional_algorithm=None, checksum_algorithm=None) # Physically create directory if it doesn't exist if os.path.exists(tmp_root_path) is False: self.create_path(tmp_root_path) - tmp = NamedTemporaryFile(dir=tmp_root_path, delete=False) - - # Delete tmp file if python interpreter crashes or thread is interrupted - # when store_object is called - def delete_tmp_file(): - if os.path.exists(tmp.name): - os.remove(tmp.name) - - atexit.register(delete_tmp_file) - - # Ensure tmp file is created with desired permissions - if self.fmode is not None: - oldmask = os.umask(0) - try: - os.chmod(tmp.name, self.fmode) - finally: - os.umask(oldmask) + tmp = self._mktmpfile(tmp_root_path) logging.debug( - "FileHashStore - _mktempfile: tmp file created: %s, calculating hex digests.", + "FileHashStore - _write_to_tmp_file_and_get_hex_digests: tmp file created:" + + " %s, calculating hex digests.", tmp.name, ) @@ -879,7 +1105,8 @@ def delete_tmp_file(): for hash_algorithm in hash_algorithms: hash_algorithm.update(self._to_bytes(data)) logging.debug( - "FileHashStore - _mktempfile: Object stream successfully written to tmp file: %s", + "FileHashStore - _write_to_tmp_file_and_get_hex_digests: Object stream" + + " successfully written to tmp file: %s", tmp.name, ) @@ -891,19 +1118,23 @@ def delete_tmp_file(): # Ready for validation and atomic move tmp_file_completion_flag = True - logging.debug("FileHashStore - _mktempfile: Hex digests calculated.") + logging.debug( + "FileHashStore - _write_to_tmp_file_and_get_hex_digests: Hex digests calculated." + ) return hex_digest_dict, tmp.name, tmp_file_size # pylint: disable=W0718 except Exception as err: exception_string = ( - f"FileHashStore - _mktempfile: Unexpected {err=}, {type(err)=}" + "FileHashStore - _write_to_tmp_file_and_get_hex_digests:" + + f" Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) # pylint: disable=W0707,W0719 raise Exception(exception_string) except KeyboardInterrupt: exception_string = ( - "FileHashStore - _mktempfile: Keyboard interruption by user." + "FileHashStore - _write_to_tmp_file_and_get_hex_digests:" + + " Keyboard interruption by user." ) logging.error(exception_string) if os.path.exists(tmp.name): @@ -916,11 +1147,271 @@ def delete_tmp_file(): # pylint: disable=W0718 except Exception as err: exception_string = ( - f"FileHashStore - _mktempfile: Unexpected {err=} while attempting to" + "FileHashStore - _write_to_tmp_file_and_get_hex_digests:" + + f"Unexpected {err=} while attempting to" + f" delete tmp file: {tmp.name}, {type(err)=}" ) logging.error(exception_string) + def _mktmpfile(self, path): + """Create a temporary file at the given path ready to be written. 
+ + Args: + path (string): Path to the file location + + Returns: + tmp (file object): object with file-like interface + """ + tmp = NamedTemporaryFile(dir=path, delete=False) + + # Delete tmp file if python interpreter crashes or thread is interrupted + def delete_tmp_file(): + if os.path.exists(tmp.name): + os.remove(tmp.name) + + atexit.register(delete_tmp_file) + + # Ensure tmp file is created with desired permissions + if self.fmode is not None: + oldmask = os.umask(0) + try: + os.chmod(tmp.name, self.fmode) + finally: + os.umask(oldmask) + return tmp + + def _write_cid_refs_file(self, path, pid): + """Write the cid reference file in the supplied path to a file. A reference file + contains every pid that references a cid each on its own line. This method will + only write into an empty file, and will not write over an existing one. + + Args: + path (string): Path of file to be written into + pid (string): Authority-based or persistent identifier of object + """ + logging.debug( + "FileHashStore - write_cid_refs_file: Writing pid (%s) into file: %s", + pid, + path, + ) + + if os.path.isfile(path): + if os.path.getsize(path) != 0: + err_msg = ( + "FileHashStore - _write_cid_refs_file: Failed to write cid reference file." + + f" File is not empty: {path} " + ) + logging.error(err_msg) + raise OSError(err_msg) + + try: + with open(path, "w", encoding="utf8") as cid_ref_file: + fcntl.flock(cid_ref_file, fcntl.LOCK_EX) + cid_ref_file.write(pid + "\n") + # The context manager will take care of releasing the lock + # But the code to explicitly release the lock if desired is below + # fcntl.flock(f, fcntl.LOCK_UN) + return + + except Exception as err: + exception_string = ( + f"FileHashStore - write_cid_refs_file: failed to write pid ({pid})" + + f" into path: {path}. Unexpected {err=}, {type(err)=}" + ) + logging.error(exception_string) + raise err + + def _update_cid_refs(self, cid_ref_abs_path, pid): + """Update an existing cid reference file with the given pid. + + Args: + cid_ref_abs_path (string): Absolute path to the cid ref file + pid (string): Authority-based or persistent identifier of object + """ + logging.debug( + "FileHashStore - update_cid_refs: Adding pid (%s) into cid reference file: %s", + pid, + cid_ref_abs_path, + ) + if not os.path.exists(cid_ref_abs_path): + exception_string = ( + f"FileHashStore - update_cid_refs: {cid_ref_abs_path} does not exist." + + f" Cannot write pid: {[pid]}" + ) + logging.error(exception_string) + raise FileNotFoundError(exception_string) + + try: + with open(cid_ref_abs_path, "r", encoding="utf8") as f: + for _, line in enumerate(f, start=1): + value = line.strip() + if pid == value: + err_msg = ( + f"FileHashStore - update_cid_refs: pid ({pid}) already reference in" + + f" cid reference file: {cid_ref_abs_path} " + ) + raise ValueError(err_msg) + + with open(cid_ref_abs_path, "a+", encoding="utf8") as cid_ref_file: + fcntl.flock(cid_ref_file, fcntl.LOCK_EX) + cid_ref_file.write(pid + "\n") + # The context manager will take care of releasing the lock + # But the code to explicitly release the lock if desired is below + # fcntl.flock(f, fcntl.LOCK_UN) + return + + except Exception as err: + exception_string = ( + "FileHashStore - update_cid_refs: failed to update reference for cid:" + + f" {cid_ref_abs_path} for pid: {pid}. Unexpected {err=}, {type(err)=}" + ) + logging.error(exception_string) + raise err + + def _delete_cid_refs_pid(self, cid_ref_abs_path, pid): + """Delete a pid from a cid reference file. 
+ + Args: + cid_ref_abs_path (string): Absolute path to the cid ref file + pid (string): Authority-based or persistent identifier of object + """ + logging.debug( + "FileHashStore - _delete_cid_refs_pid: Deleting pid (%s) from cid reference file: %s", + pid, + cid_ref_abs_path, + ) + + try: + with open(cid_ref_abs_path, "r", encoding="utf8") as cid_ref_file: + fcntl.flock(cid_ref_file, fcntl.LOCK_EX) + # Read the ref file to see if the pid is already referencing the cid + cid_ref_file_content = cid_ref_file.read() + + if pid not in cid_ref_file_content: + err_msg = ( + f"FileHashStore - _delete_cid_refs_pid: pid ({pid}) does not exist in" + + f" cid reference file: {cid_ref_abs_path} " + ) + raise ValueError(err_msg) + + with open(cid_ref_abs_path, "w", encoding="utf8") as cid_ref_file: + fcntl.flock(cid_ref_file, fcntl.LOCK_EX) + cid_ref_file.write(cid_ref_file_content.replace(pid + "\n", "")) + # The context manager will take care of releasing the lock + # But the code to explicitly release the lock if desired is below + # fcntl.flock(f, fcntl.LOCK_UN) + return + + except Exception as err: + exception_string = ( + "FileHashStore - _delete_cid_refs_pid: failed to update reference for cid:" + + f" {cid_ref_abs_path} for pid: {pid}. Unexpected {err=}, {type(err)=}" + ) + logging.error(exception_string) + raise err + + def _delete_cid_refs_file(self, cid_ref_abs_path): + """Delete a cid reference file. There must be no references remaining. + + Args: + cid_ref_abs_path (string): Absolute path to the cid ref file + + Returns: + boolean: True if deleted, False if not + """ + logging.debug( + "FileHashStore - _delete_cid_refs_file: Deleting reference file: %s", + cid_ref_abs_path, + ) + + try: + if not os.path.exists(cid_ref_abs_path): + err_msg = ( + "FileHashStore - _delete_cid_refs_file: Cid reference file not found: %s", + cid_ref_abs_path, + ) + logging.error(err_msg) + raise FileNotFoundError(err_msg) + if os.path.getsize(cid_ref_abs_path) != 0: + err_msg = ( + "FileHashStore - _delete_cid_refs_file: Failed to delete cid reference file." + + f" File is not empty: {cid_ref_abs_path} " + ) + logging.error(err_msg) + raise OSError(err_msg) + else: + os.remove(cid_ref_abs_path) + return True + + except Exception as err: + exception_string = ( + "FileHashStore - _delete_cid_refs_file: failed to delete reference file:" + + f" {cid_ref_abs_path}. Unexpected {err=}, {type(err)=}" + ) + logging.error(exception_string) + raise err + + def _write_pid_refs_file(self, path, cid): + """Write the pid reference file in the supplied path for the given cid (content + identifier). A reference file for a pid contains the cid that it references. + + Args: + path (string): Path of file to be written into + cid (string): Content identifier + """ + logging.debug( + "FileHashStore - _write_pid_refs_file: Writing cid (%s) into file: %s", + cid, + path, + ) + + try: + with open(path, "w", encoding="utf8") as pid_ref_file: + fcntl.flock(pid_ref_file, fcntl.LOCK_EX) + pid_ref_file.write(cid) + # The context manager will take care of releasing the lock + # But the code to explicitly release the lock if desired is below + # fcntl.flock(f, fcntl.LOCK_UN) + return + + except Exception as err: + exception_string = ( + f"FileHashStore - _write_pid_refs_file: failed to write cid ({cid})" + + f" into path: {path}. Unexpected {err=}, {type(err)=}" + ) + logging.error(exception_string) + raise err + + def _delete_pid_refs_file(self, pid_ref_abs_path): + """Delete a pid reference file. 
+ + Args: + pid_ref_abs_path (string): Absolute path to the pid ref file + """ + logging.debug( + "FileHashStore - _delete_pid_refs_file: Deleting reference file: %s", + pid_ref_abs_path, + ) + + try: + if not os.path.exists(pid_ref_abs_path): + err_msg = ( + "FileHashStore - _delete_pid_refs_file: pid reference file not found: %s", + pid_ref_abs_path, + ) + raise FileNotFoundError(err_msg) + else: + os.remove(pid_ref_abs_path) + return + + except Exception as err: + exception_string = ( + "FileHashStore - _delete_pid_refs_file: failed to delete reference file:" + + f" {pid_ref_abs_path}. Unexpected {err=}, {type(err)=}" + ) + logging.error(exception_string) + raise err + def put_metadata(self, metadata, pid, format_id): """Store contents of metadata to `[self.root]/metadata` using the hash of the given pid and format_id as the permanent address. @@ -942,7 +1433,7 @@ def put_metadata(self, metadata, pid, format_id): metadata_tmp = self._mktmpmetadata(metadata_stream) # Get target and related paths (permanent location) - metadata_cid = self.get_sha256_hex_digest(pid + format_id) + metadata_cid = self.computehash(pid + format_id) rel_path = "/".join(self.shard(metadata_cid)) full_path = self.get_store_path("metadata") / rel_path @@ -980,11 +1471,10 @@ def put_metadata(self, metadata, pid, format_id): raise FileNotFoundError(exception_string) def _mktmpmetadata(self, stream): - """Create a named temporary file with `stream` (metadata) and `format_id`. + """Create a named temporary file with `stream` (metadata). Args: stream (io.BufferedReader): Metadata stream. - format_id (string): Format of metadata. Returns: tmp.name (string): Path/name of temporary file created and written into. @@ -995,14 +1485,7 @@ def _mktmpmetadata(self, stream): if os.path.exists(tmp_root_path) is False: self.create_path(tmp_root_path) - tmp = NamedTemporaryFile(dir=tmp_root_path, delete=False) - # Ensure tmp file is created with desired permissions - if self.fmode is not None: - oldmask = os.umask(0) - try: - os.chmod(tmp.name, self.fmode) - finally: - os.umask(oldmask) + tmp = self._mktmpfile(tmp_root_path) # tmp is a file-like object that is already opened for writing by default logging.debug( @@ -1021,12 +1504,15 @@ def _mktmpmetadata(self, stream): # FileHashStore Utility & Supporting Methods - def _validate_data_to_store(self, data): - """Evaluates a data argument to ensure that it is either a string, path or - stream object before attempting to store it. + def _validate_arg_data(self, data): + """Checks a data argument to ensure that it is either a string, path or stream + object. Args: data (string, path, stream): object to validate + + Returns: + boolean: True if valid. """ if ( not isinstance(data, str) @@ -1034,7 +1520,7 @@ def _validate_data_to_store(self, data): and not isinstance(data, io.BufferedIOBase) ): exception_string = ( - "FileHashStore - store_object: Data must be a path, string or buffered" + "FileHashStore - _validate_arg_data: Data must be a path, string or buffered" + f" stream type. Data type supplied: {type(data)}" ) logging.error(exception_string) @@ -1042,21 +1528,26 @@ def _validate_data_to_store(self, data): if isinstance(data, str): if data.replace(" ", "") == "": exception_string = ( - "FileHashStore - store_object: Data string cannot be empty." + "FileHashStore - _validate_arg_data: Data string cannot be empty." 
) logging.error(exception_string) raise TypeError(exception_string) + return True - def _validate_algorithms_and_checksum( + def _validate_arg_algorithms_and_checksum( self, additional_algorithm, checksum, checksum_algorithm ): - """Determines whether calling app has supplied the necessary arguments to validate - an object with a checksum value + """Determines whether caller has supplied the necessary arguments to validate + an object with a checksum value. Args: additional_algorithm: value of additional algorithm to calculate checksum (string): value of checksum checksum_algorithm (string): algorithm of checksum + + Returns: + additional_algorithm_checked (string): hashlib compatible string or 'None' + checksum_algorithm_checked (string): hashlib compatible string or 'None' """ additional_algorithm_checked = None if additional_algorithm != self.algorithm and additional_algorithm is not None: @@ -1064,56 +1555,22 @@ def _validate_algorithms_and_checksum( additional_algorithm_checked = self.clean_algorithm(additional_algorithm) checksum_algorithm_checked = None if checksum is not None: - self._is_string_none_or_empty( + self._validate_string( checksum_algorithm, "checksum_algorithm", - "validate_checksum_args (store_object)", + "_validate_arg_algorithms_and_checksum (store_object)", ) if checksum_algorithm is not None: - self._is_string_none_or_empty( + self._validate_string( checksum, "checksum", - "validate_checksum_args (store_object)", + "_validate_arg_algorithms_and_checksum (store_object)", ) # Set checksum_algorithm checksum_algorithm_checked = self.clean_algorithm(checksum_algorithm) return additional_algorithm_checked, checksum_algorithm_checked - def _refine_algorithm_list(self, additional_algorithm, checksum_algorithm): - """Create the final list of hash algorithms to calculate - - Args: - additional_algorithm (string) - checksum_algorithm (string) - - Return: - algorithm_list_to_calculate (set): De-duplicated list of hash algorithms - """ - algorithm_list_to_calculate = self.default_algo_list - if checksum_algorithm is not None: - self.clean_algorithm(checksum_algorithm) - if checksum_algorithm in self.other_algo_list: - debug_additional_other_algo_str = ( - f"FileHashStore - _mktempfile: checksum algorithm: {checksum_algorithm}" - + " found in other_algo_lists, adding to list of algorithms to calculate." - ) - logging.debug(debug_additional_other_algo_str) - algorithm_list_to_calculate.append(checksum_algorithm) - if additional_algorithm is not None: - self.clean_algorithm(additional_algorithm) - if additional_algorithm in self.other_algo_list: - debug_additional_other_algo_str = ( - f"FileHashStore - _mktempfile: additional algorithm: {additional_algorithm}" - + " found in other_algo_lists, adding to list of algorithms to calculate." - ) - logging.debug(debug_additional_other_algo_str) - algorithm_list_to_calculate.append(additional_algorithm) - - # Remove duplicates - algorithm_list_to_calculate = set(algorithm_list_to_calculate) - return algorithm_list_to_calculate - - def _validate_object( + def _validate_arg_object( self, pid, checksum, @@ -1124,70 +1581,65 @@ def _validate_object( tmp_file_size, file_size_to_validate, ): - """Evaluates an object's integrity + """Evaluates an object's integrity and throws exception if there is a mismatch. 
Args: - pid: For logging purposes - checksum: Value of checksum - checksum_algorithm: Algorithm of checksum - entity: Type of object - hex_digests: Dictionary of hex digests to select from - tmp_file_name: Name of tmp file - tmp_file_size: Size of the tmp file - file_size_to_validate: Expected size of the object + pid (string): For logging purposes + checksum (string): Value of checksum to check + checksum_algorithm (string): Algorithm of checksum + entity (string): Type of object ('objects' or 'metadata') + hex_digests (dictionary): Dictionary of hex digests to parse + tmp_file_name (string): Name of tmp file + tmp_file_size (int): Size of the tmp file + file_size_to_validate (int): Expected size of the object """ if file_size_to_validate is not None and file_size_to_validate > 0: if file_size_to_validate != tmp_file_size: - self.delete(entity, tmp_file_name) exception_string = ( - "FileHashStore - _move_and_get_checksums: Object file size calculated: " + "FileHashStore - _validate_arg_object: Object file size calculated: " + f" {tmp_file_size} does not match with expected size:" - + f"{file_size_to_validate}. Tmp file deleted and file not stored for" - + f" pid: {pid}" + + f"{file_size_to_validate}." ) - logging.error(exception_string) - raise ValueError(exception_string) + if pid is not None: + self.delete(entity, tmp_file_name) + exception_string_for_pid = ( + exception_string + + f" Tmp file deleted and file not stored for pid: {pid}" + ) + logging.error(exception_string_for_pid) + raise ValueError(exception_string_for_pid) + else: + logging.error(exception_string) + raise ValueError(exception_string) if checksum_algorithm is not None and checksum is not None: - hex_digest_stored = hex_digests[checksum_algorithm] - if hex_digest_stored != checksum: - self.delete(entity, tmp_file_name) + if checksum_algorithm not in hex_digests: exception_string = ( - "FileHashStore - _move_and_get_checksums: Hex digest and checksum" - + f" do not match - file not stored for pid: {pid}. Algorithm:" - + f" {checksum_algorithm}. Checksum provided: {checksum} !=" - + f" HexDigest: {hex_digest_stored}. Tmp file deleted." + "FileHashStore - _validate_arg_object: checksum_algorithm" + + f" ({checksum_algorithm}) cannot be found in the hex digests dictionary." ) logging.error(exception_string) - raise ValueError(exception_string) - - def _validate_metadata_to_store(self, metadata): - """Evaluates a metadata argument to ensure that it is either a string, path or - stream object before attempting to store it. - - Args: - metadata (string, path, stream): metadata to validate - """ - if isinstance(metadata, str): - if metadata.replace(" ", "") == "": - exception_string = ( - "FileHashStore - store_metadata: Given string path to" - + " metadata cannot be empty." - ) - logging.error(exception_string) - raise TypeError(exception_string) - if ( - not isinstance(metadata, str) - and not isinstance(metadata, Path) - and not isinstance(metadata, io.BufferedIOBase) - ): - exception_string = ( - "FileHashStore - store_metadata: Metadata must be a path or string" - + f" type, data type supplied: {type(metadata)}" - ) - logging.error(exception_string) - raise TypeError(exception_string) + raise KeyError(exception_string) + else: + hex_digest_stored = hex_digests[checksum_algorithm] + if hex_digest_stored != checksum: + exception_string = ( + "FileHashStore - _validate_arg_object: Hex digest and checksum" + + f" do not match - file not stored for pid: {pid}. Algorithm:" + + f" {checksum_algorithm}. 
Checksum provided: {checksum} !=" + + f" HexDigest: {hex_digest_stored}." + ) + if pid is not None: + self.delete(entity, tmp_file_name) + exception_string_for_pid = ( + exception_string + f"Tmp file ({tmp_file_name}) deleted." + ) + logging.error(exception_string_for_pid) + raise ValueError(exception_string_for_pid) + else: + logging.error(exception_string) + raise ValueError(exception_string) - def _validate_format_id(self, format_id, method): + def _validate_arg_format_id(self, format_id, method): """Determines the metadata namespace (format_id) to use for storing, retrieving and deleting metadata. @@ -1210,6 +1662,90 @@ def _validate_format_id(self, format_id, method): checked_format_id = format_id return checked_format_id + def _verify_hashstore_references(self, pid, cid): + """Verifies that the supplied pid and pid reference file and content have been + written successfully. + + Args: + pid (string): Authority-based or persistent identifier + cid (string): Content identifier + """ + # Check that reference files were created + pid_ref_abs_path = self.get_refs_abs_path("pid", pid) + cid_ref_abs_path = self.get_refs_abs_path("cid", cid) + if not os.path.exists(pid_ref_abs_path): + exception_string = ( + "FileHashStore - _verify_hashstore_references: Pid refs file missing: %s", + pid_ref_abs_path, + ) + logging.error(exception_string) + raise FileNotFoundError(exception_string) + if not os.path.exists(cid_ref_abs_path): + exception_string = ( + "FileHashStore - _verify_hashstore_references: Cid refs file missing: %s", + cid_ref_abs_path, + ) + logging.error(exception_string) + raise FileNotFoundError(exception_string) + # Check the content of the reference files + # Start with the cid + retrieved_cid = self.find_object(pid) + if retrieved_cid != cid: + exception_string = ( + "FileHashStore - _verify_hashstore_references: Pid refs file exists" + + f" ({pid_ref_abs_path}) but cid ({cid}) does not match." + ) + logging.error(exception_string) + raise ValueError(exception_string) + # Then the pid + pid_found = False + with open(cid_ref_abs_path, "r", encoding="utf8") as f: + for _, line in enumerate(f, start=1): + value = line.strip() + if value == pid: + pid_found = True + if not pid_found: + exception_string = ( + "FileHashStore - _verify_hashstore_references: Cid refs file exists" + + f" ({cid_ref_abs_path}) but pid ({pid}) not found." + ) + logging.error(exception_string) + raise ValueError(exception_string) + + def _refine_algorithm_list(self, additional_algorithm, checksum_algorithm): + """Create the final list of hash algorithms to calculate. + + Args: + additional_algorithm (string) + checksum_algorithm (string) + + Return: + algorithm_list_to_calculate (set): De-duplicated list of hash algorithms + """ + algorithm_list_to_calculate = self.default_algo_list + if checksum_algorithm is not None: + self.clean_algorithm(checksum_algorithm) + if checksum_algorithm in self.other_algo_list: + debug_additional_other_algo_str = ( + f"FileHashStore - _refine_algorithm_list: checksum algo: {checksum_algorithm}" + + " found in other_algo_lists, adding to list of algorithms to calculate." 
+ ) + logging.debug(debug_additional_other_algo_str) + algorithm_list_to_calculate.append(checksum_algorithm) + if additional_algorithm is not None: + self.clean_algorithm(additional_algorithm) + if additional_algorithm in self.other_algo_list: + debug_additional_other_algo_str = ( + f"FileHashStore - _refine_algorithm_list: addit algo: {additional_algorithm}" + + " found in other_algo_lists, adding to list of algorithms to calculate." + ) + logging.debug(debug_additional_other_algo_str) + algorithm_list_to_calculate.append(additional_algorithm) + + # Remove duplicates + algorithm_list_to_calculate = set(algorithm_list_to_calculate) + return algorithm_list_to_calculate + def clean_algorithm(self, algorithm_string): """Format a string and ensure that it is supported and compatible with the python hashlib library. @@ -1242,11 +1778,12 @@ def clean_algorithm(self, algorithm_string): return cleaned_string def computehash(self, stream, algorithm=None): - """Compute hash of a file-like object using :attr:`algorithm` by default - or with optional algorithm supported. + """Compute the hash of a file-like object (or string) using the store algorthm by + default or with optional algorithm supported. Args: - stream (io.BufferedReader): A buffered stream of an object_cid object. \n + stream (mixed): A buffered stream (io.BufferedReader) of an object. A string is + also acceptable as they are a sequence of characters (Python only).\n algorithm (string): Algorithm of hex digest to generate. Returns: @@ -1272,9 +1809,11 @@ def get_store_path(self, entity): return Path(self.objects) elif entity == "metadata": return Path(self.metadata) + elif entity == "refs": + return Path(self.refs) else: raise ValueError( - f"entity: {entity} does not exist. Do you mean 'objects' or 'metadata'?" + f"entity: {entity} does not exist. Do you mean 'objects', 'metadata' or 'refs'?" ) def exists(self, entity, file): @@ -1312,8 +1851,8 @@ def compact(items): # This creates a list of `depth` number of tokens with width # `width` from the first part of the id plus the remainder. hierarchical_list = compact( - [digest[i * self.width : self.width * (i + 1)] for i in range(self.depth)] - + [digest[self.depth * self.width :]] + [digest[i * self.width: self.width * (i + 1)] for i in range(self.depth)] + + [digest[self.depth * self.width:]] ) return hierarchical_list @@ -1356,9 +1895,9 @@ def delete(self, entity, file): except OSError: pass else: - self.remove_empty(os.path.dirname(realpath)) + self._remove_empty(os.path.dirname(realpath)) - def remove_empty(self, subpath): + def _remove_empty(self, subpath): """Successively remove all empty folders starting with `subpath` and proceeding "up" through directory tree until reaching the `root` folder. @@ -1445,18 +1984,18 @@ def get_real_path(self, entity, file): # Could not determine a match. return None - def build_abs_path(self, entity, cid, extension=""): + def build_abs_path(self, entity, hash_id, extension=""): """Build the absolute file path for a given hash id with an optional file extension. Args: entity (str): Desired entity type (ex. "objects", "metadata"). \n - cid (str): A hash id to build a file path for. \n + hash_id (str): A hash id to build a file path for. \n extension (str): An optional file extension to append to the file path. Returns: absolute_path (str): An absolute file path for the specified hash id. 
""" - paths = self.shard(cid) + paths = self.shard(hash_id) root_dir = self.get_store_path(entity) if extension and not extension.startswith(os.extsep): @@ -1467,6 +2006,28 @@ def build_abs_path(self, entity, cid, extension=""): absolute_path = os.path.join(root_dir, *paths) + extension return absolute_path + def get_refs_abs_path(self, ref_type, hash_id): + """Get the absolute path to the reference file for the given ref_type. If a + 'pid' is provided, this method will calculate the pid's hash based on the store + algorithm, and return the expected address of the pid reference file. If a + 'cid' is provided, this method will return the expected address by sharding the + cid based on HashStore's configuration. + + Args: + ref_type (string): 'pid' or 'cid' + hash_id (string): Authority-based, persistent or hash identifier + + Returns: + ref_file_abs_path (string): Path to the ref file for the given type and pid + """ + entity = "refs" + if ref_type == "pid": + hash_id = self.computehash(hash_id, self.algorithm) + ref_file_abs_path = self.build_abs_path(entity, hash_id).replace( + "/refs/", f"/refs/{ref_type}/" + ) + return ref_file_abs_path + def count(self, entity): """Return count of the number of files in the `root` directory. @@ -1495,8 +2056,8 @@ def count(self, entity): # Other Static Methods @staticmethod - def _validate_file_size(file_size): - """Checks whether a file size is > 0 and an int and throws exception if not. + def _is_int_and_non_negative(file_size): + """Checks whether a given argument is an integer and > 0 and throws exception if not. Args: file_size (int): file size to check @@ -1504,28 +2065,28 @@ def _validate_file_size(file_size): if file_size is not None: if not isinstance(file_size, int): exception_string = ( - "FileHashStore - _is_file_size_valid: size given must be an integer." + "FileHashStore - _is_int_and_non_negative: size given must be an integer." + f" File size: {file_size}. Arg Type: {type(file_size)}." ) logging.error(exception_string) raise TypeError(exception_string) - if file_size < 1 or not isinstance(file_size, int): + if file_size < 1: exception_string = ( - "FileHashStore - _is_file_size_valid: size given must be > 0" + "FileHashStore - _is_int_and_non_negative: size given must be > 0" ) logging.error(exception_string) raise ValueError(exception_string) @staticmethod - def _is_string_none_or_empty(string, arg, method): + def _validate_string(string, arg, method): """Checks whether a string is None or empty and throws an exception if so. Args: string (string): Value to check - arg (): Name of argument to check + arg (string): Name of argument to check method (string): Calling method for logging purposes """ - if string is None or string.replace(" ", "") == "": + if string is None or string.strip() == "": exception_string = ( f"FileHashStore - {method}: {arg} cannot be None" + f" or empty, {arg}: {string}." @@ -1547,19 +2108,6 @@ def _to_bytes(text): text = bytes(text, "utf8") return text - @staticmethod - def get_sha256_hex_digest(string): - """Calculate the SHA-256 digest of a UTF-8 encoded string. - - Args: - string (string): String to convert. - - Returns: - hex (string): Hexadecimal string. - """ - hex_digest = hashlib.sha256(string.encode("utf-8")).hexdigest() - return hex_digest - class Stream(object): """Common interface for file-like objects. 
diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py
index 6c704209..d1ff440c 100644
--- a/src/hashstore/hashstore.py
+++ b/src/hashstore/hashstore.py
@@ -2,12 +2,12 @@
 from abc import ABC, abstractmethod
 from collections import namedtuple
 import importlib.metadata
+import importlib.util


 class HashStore(ABC):
-    """HashStore is a content-addressable file management system that
-    utilizes a persistent identifier (PID) in the form of a hex digest
-    value to address files."""
+    """HashStore is a content-addressable file management system that utilizes
+    an object's content identifier (hex digest/checksum) to address files."""

     @staticmethod
     def version():
@@ -26,28 +26,32 @@ def store_object(
         expected_object_size,
     ):
         """The `store_object` method is responsible for the atomic storage of objects to
-        disk using a given InputStream and a persistent identifier (pid). Upon
-        successful storage, the method returns a ObjectMetadata object containing
-        relevant file information, such as the file's id (which can be used to locate the
-        object on disk), the file's size, and a hex digest map of algorithms and checksums.
-        `store_object` also ensures that an object is stored only once by synchronizing
-        multiple calls and rejecting calls to store duplicate objects.
-
-        The file's id is determined by calculating the SHA-256 hex digest of the
-        provided pid, which is also used as the permanent address of the file. The
-        file's identifier is then sharded using a depth of 3 and width of 2,
+        disk using a given stream. Upon successful storage, the method returns an ObjectMetadata
+        object containing relevant file information, such as the file's id (which can be
+        used to locate the object on disk), the file's size, and a hex digest dict of algorithms
+        and checksums. Storing an object with `store_object` also tags an object (creating
+        references) that allow the object to be discoverable.
+
+        `store_object` also ensures that an object is stored only once by synchronizing multiple
+        calls and rejecting calls to store duplicate objects. Note: `store_object` may be called
+        without a pid, in which case it only stores the object without tagging it. It is then
+        the caller's responsibility to finalize the process by calling `tag_object` after
+        verifying that the correct object is stored.
+
+        The file's id is determined by calculating the object's content identifier based on
+        the store's default algorithm, which is also used as the permanent address of the file.
+        The file's identifier is then sharded using the store's configured depth and width,
         delimited by '/' and concatenated to produce the final permanent address
         and is stored in the `/store_directory/objects/` directory.

         By default, the hex digest map includes the following hash algorithms:
-        Default algorithms and hex digests to return: md5, sha1, sha256, sha384, sha512,
-        which are the most commonly used algorithms in dataset submissions to DataONE
-        and the Arctic Data Center. If an additional algorithm is provided, the
-        `store_object` method checks if it is supported and adds it to the map along
-        with its corresponding hex digest. An algorithm is considered "supported" if it
-        is recognized as a valid hash algorithm in the `hashlib` library.
-
-        Similarly, if a file size and/or checksum & checksumAlgorithm value are provided,
+        md5, sha1, sha256, sha384, sha512 - which are the most commonly used algorithms in
+        dataset submissions to DataONE and the Arctic Data Center. If an additional algorithm
+        is provided, the `store_object` method checks if it is supported and adds it to the
+        hex digests dict along with its corresponding hex digest. An algorithm is considered
+        "supported" if it is recognized as a valid hash algorithm in the `hashlib` library.
+
+        Similarly, if a file size and/or checksum and checksum_algorithm values are provided,
         `store_object` validates the object to ensure it matches the given arguments
         before moving the file to its permanent address.

@@ -61,7 +65,50 @@ def store_object(
         Returns:
             object_metadata (ObjectMetadata): Object that contains the permanent address,
-                file size, duplicate file boolean and hex digest dictionary.
+                file size and hex digest dictionary.
         """
         raise NotImplementedError()

+    @abstractmethod
+    def tag_object(self, pid, cid):
+        """The `tag_object` method creates references that allow objects stored in HashStore
+        to be discoverable. Retrieving, deleting or calculating a hex digest of an object is
+        based on a pid argument; and to proceed, we must be able to find the object associated
+        with the pid.
+
+        Args:
+            pid (string): Authority-based or persistent identifier of object
+            cid (string): Content identifier of object
+
+        Returns:
+            boolean: `True` upon successful tagging.
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def verify_object(
+        self, object_metadata, checksum, checksum_algorithm, expected_file_size
+    ):
+        """Confirms that an object_metadata's content is equal to the given values.
+
+        Args:
+            object_metadata (ObjectMetadata): object_metadata object
+            checksum (string): Value of checksum
+            checksum_algorithm (string): Algorithm of checksum
+            expected_file_size (int): Expected size of the object in bytes
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def find_object(self, pid):
+        """The `find_object` method checks whether an object referenced by a pid exists
+        and returns the content identifier.
+
+        Args:
+            pid (string): Authority-based or persistent identifier of object
+
+        Returns:
+            cid (string): Content identifier of the object
+        """
         raise NotImplementedError()

@@ -89,9 +136,8 @@ def store_metadata(self, pid, metadata, format_id):
     @abstractmethod
     def retrieve_object(self, pid):
         """The `retrieve_object` method retrieves an object from disk using a given
-        persistent identifier (pid). If the object exists (determined by calculating
-        the object's permanent address using the SHA-256 hash of the given pid), the
-        method will open and return a buffered object stream ready to read from.
+        persistent identifier (pid). If the object exists, the method will open and return
+        a buffered object stream ready to read from.

         Args:
             pid (string): Authority-based identifier.
@@ -211,12 +257,12 @@ def get_hashstore(module_name, class_name, properties=None):


 class ObjectMetadata(namedtuple("ObjectMetadata", ["id", "obj_size", "hex_digests"])):
-    """File address containing file's path on disk and its content hash ID.
+    """Represents metadata associated with an object.

-    Args:
-        ab_id (str): Hash ID (hexdigest) of file contents.
-        obj_size (bytes): Size of the object
-        hex_digests (dict, optional): A list of hex digests to validate objects
+    Attributes:
+        id (str): A unique identifier for the object (Hash ID, hex digest).
+        obj_size (int): The size of the object in bytes.
+        hex_digests (dict, optional): A dictionary of hex digests to validate objects
         (md5, sha1, sha256, sha384, sha512)
     """
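The reworked contract above splits storage from discoverability. A short usage sketch of that flow, assuming a FileHashStore configured with the same properties the tests use (the store root and data path are illustrative):

```python
from hashstore.filehashstore import FileHashStore

properties = {
    "store_path": "/var/metacat",  # illustrative root
    "store_depth": 3,
    "store_width": 2,
    "store_algorithm": "SHA-256",
    "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0",
}
store = FileHashStore(properties)

# Store the bytes only; without a pid, no references are created yet.
object_metadata = store.store_object(data="tests/testdata/jtao.1700.1")

# Verify what was written before making it discoverable.
store.verify_object(
    object_metadata,
    object_metadata.hex_digests.get(store.algorithm),
    store.algorithm,
    object_metadata.obj_size,
)

# Tag the object so it can be found, retrieved and deleted by pid.
store.tag_object("jtao.1700.1", object_metadata.id)
assert store.find_object("jtao.1700.1") == object_metadata.id
```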
diff --git a/tests/conftest.py b/tests/conftest.py
index 9b25c520..54af3542 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -47,7 +47,6 @@ def init_pids():
     test_pids = {
         "doi:10.18739/A2901ZH2M": {
             "file_size_bytes": 39993,
-            "object_cid": "0d555ed77052d7e166017f779cbc193357c3a5006ee8b8457230bcf7abcef65e",
             "metadata_cid": "323e0799524cec4c7e14d31289cefd884b563b5c052f154a066de5ec1e477da7",
             "md5": "db91c910a3202478c8def1071c54aae5",
             "sha1": "1fe86e3c8043afa4c70857ca983d740ad8501ccd",
@@ -58,7 +57,6 @@ def init_pids():
         },
         "jtao.1700.1": {
             "file_size_bytes": 8724,
-            "object_cid": "a8241925740d5dcd719596639e780e0a090c9d55a5d0372b0eaf55ed711d4edf",
             "metadata_cid": "ddf07952ef28efc099d10d8b682480f7d2da60015f5d8873b6e1ea75b4baf689",
             "md5": "f4ea2d07db950873462a064937197b0f",
             "sha1": "3d25436c4490b08a2646e283dada5c60e5c0539d",
@@ -69,7 +67,6 @@ def init_pids():
         },
         "urn:uuid:1b35d0a5-b17a-423b-a2ed-de2b18dc367a": {
             "file_size_bytes": 18699,
-            "object_cid": "7f5cc18f0b04e812a3b4c8f686ce34e6fec558804bf61e54b176742a7f6368d6",
             "metadata_cid": "9a2e08c666b728e6cbd04d247b9e556df3de5b2ca49f7c5a24868eb27cddbff2",
             "md5": "e1932fc75ca94de8b64f1d73dc898079",
             "sha1": "c6d2a69a3f5adaf478ba796c114f57b990cf7ad1",
diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py
index a2f0fdfe..ce04ecec 100644
--- a/tests/test_filehashstore.py
+++ b/tests/test_filehashstore.py
@@ -1,4 +1,4 @@
-"""Test module for FileHashStore core, utility and supporting methods"""
+"""Test module for FileHashStore init, core, utility and supporting methods."""
 import io
 import os
 from pathlib import Path
@@ -6,6 +6,9 @@
 from hashstore.filehashstore import FileHashStore


+# Tests for HashStore Configuration and Related Methods
+
+
 def test_pids_length(pids):
     """Ensure test harness pids are present."""
     assert len(pids) == 3
@@ -18,12 +21,17 @@ def test_init_directories_created(store):
     assert os.path.exists(store.objects + "/tmp")
     assert os.path.exists(store.metadata)
     assert os.path.exists(store.metadata + "/tmp")
+    assert os.path.exists(store.refs)
+    assert os.path.exists(store.refs + "/tmp")
+    assert os.path.exists(store.refs + "/pid")
+    assert os.path.exists(store.refs + "/cid")


 def test_init_existing_store_incorrect_algorithm_format(store):
-    """Confirm that exception is thrown when store_algorithm is not a DataONE controlled value"""
+    """Confirm that an exception is thrown when store_algorithm is not a DataONE
+    controlled value."""
     properties = {
-        "store_path": store.root,
+        "store_path": store.root + "/incorrect_algo_format",
         "store_depth": 3,
         "store_width": 2,
         "store_algorithm": "sha256",
@@ -34,7 +42,7 @@


 def test_init_existing_store_correct_algorithm_format(store):
-    """Confirm second instance of HashStore with DataONE controlled value"""
+    """Confirm second instance of HashStore with DataONE controlled value."""
     properties = {
         "store_path": store.root,
         "store_depth": 3,
@@ -52,7 +60,8 @@ def test_init_write_properties_hashstore_yaml_exists(store):


 def test_init_with_existing_hashstore_mismatched_config_depth(store):
-    """Test init with existing HashStore raises ValueError with mismatching properties."""
+    """Test init with existing HashStore raises a ValueError when supplied with
+    mismatching depth."""
     properties = {
         "store_path": store.root,
         "store_depth": 1,
@@ -65,7 +74,8 @@ def
test_init_with_existing_hashstore_mismatched_config_depth(store): def test_init_with_existing_hashstore_mismatched_config_width(store): - """Test init with existing HashStore raises ValueError with mismatching properties.""" + """Test init with existing HashStore raises a ValueError when supplied with + mismatching width.""" properties = { "store_path": store.root, "store_depth": 3, @@ -78,7 +88,8 @@ def test_init_with_existing_hashstore_mismatched_config_width(store): def test_init_with_existing_hashstore_mismatched_config_algo(store): - """Test init with existing HashStore raises ValueError with mismatching properties.""" + """Test init with existing HashStore raises a ValueError when supplied with + mismatching default algorithm.""" properties = { "store_path": store.root, "store_depth": 3, @@ -91,7 +102,8 @@ def test_init_with_existing_hashstore_mismatched_config_algo(store): def test_init_with_existing_hashstore_mismatched_config_metadata_ns(store): - """Test init with existing HashStore raises ValueError with mismatching properties.""" + """Test init with existing HashStore raises a ValueError when supplied with + mismatching default name space.""" properties = { "store_path": store.root, "store_depth": 3, @@ -109,7 +121,7 @@ def test_init_with_existing_hashstore_missing_yaml(store, pids): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - store.put_object(pid, path) + store.store_and_validate_data(pid, path) os.remove(store.hashstore_configuration_yaml) properties = { "store_path": store.root, @@ -182,7 +194,7 @@ def test_validate_properties_key_value_is_none(store): def test_validate_properties_incorrect_type(store): - """Confirm exception raised when key missing in properties.""" + """Confirm exception raised when a bad properties value is given.""" properties = "etc/filehashstore/hashstore.yaml" with pytest.raises(ValueError): # pylint: disable=W0212 @@ -195,75 +207,78 @@ def test_set_default_algorithms_missing_yaml(store, pids): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - store.put_object(pid, path) + store.store_and_validate_data(pid, path) os.remove(store.hashstore_configuration_yaml) with pytest.raises(FileNotFoundError): # pylint: disable=W0212 store._set_default_algorithms() -def test_put_object_files_path(pids, store): - """Test put objects with path object.""" +# Tests for FileHashStore Core Methods + + +def test_store_and_validate_data_files_path(pids, store): + """Test store_and_validate_data objects with path object for the path arg.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): path = Path(test_dir) / pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) object_metadata_id = object_metadata.id assert store.exists(entity, object_metadata_id) -def test_put_object_files_string(pids, store): - """Test put objects with string.""" +def test_store_and_validate_data_files_string(pids, store): + """Test store_and_validate_data objects with string for the path arg.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) object_metadata_id = object_metadata.id assert store.exists(entity, object_metadata_id) -def test_put_object_files_stream(pids, store): - """Test put objects with stream.""" +def 
test_store_and_validate_data_files_stream(pids, store): + """Test store_and_validate_data objects with stream for the path arg.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") - object_metadata = store.put_object(pid, input_stream) + object_metadata = store.store_and_validate_data(pid, input_stream) input_stream.close() object_metadata_id = object_metadata.id assert store.exists(entity, object_metadata_id) assert store.count(entity) == 3 -def test_put_object_cid(pids, store): - """Check put returns correct id.""" +def test_store_and_validate_data_cid(pids, store): + """Check store_and_validate_data returns correct id.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) object_metadata_id = object_metadata.id - assert object_metadata_id == pids[pid]["object_cid"] + assert object_metadata_id == pids[pid][store.algorithm] -def test_put_object_file_size(pids, store): - """Check put returns correct file size.""" +def test_store_and_validate_data_file_size(pids, store): + """Check store_and_validate_data returns correct file size.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) object_size = object_metadata.obj_size assert object_size == pids[pid]["file_size_bytes"] -def test_put_object_hex_digests(pids, store): - """Check put successfully generates hex digests dictionary.""" +def test_store_and_validate_data_hex_digests(pids, store): + """Check store_and_validate_data successfully generates hex digests dictionary.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) object_metadata_hex_digests = object_metadata.hex_digests assert object_metadata_hex_digests.get("md5") == pids[pid]["md5"] assert object_metadata_hex_digests.get("sha1") == pids[pid]["sha1"] @@ -272,31 +287,35 @@ def test_put_object_hex_digests(pids, store): assert object_metadata_hex_digests.get("sha512") == pids[pid]["sha512"] -def test_put_object_additional_algorithm(pids, store): - """Check put_object returns additional algorithm in hex digests.""" +def test_store_and_validate_data_additional_algorithm(pids, store): + """Check store_and_validate_data returns additional algorithm in hex digests.""" test_dir = "tests/testdata/" for pid in pids.keys(): algo = "sha224" path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path, additional_algorithm=algo) + object_metadata = store.store_and_validate_data( + pid, path, additional_algorithm=algo + ) hex_digests = object_metadata.hex_digests sha224_hash = hex_digests.get(algo) assert sha224_hash == pids[pid][algo] -def test_put_object_with_correct_checksums(pids, store): - """Check put_object success with valid checksum and checksum algorithm supplied.""" +def test_store_and_validate_data_with_correct_checksums(pids, store): + """Check store_and_validate_data with valid checksum and checksum algorithm supplied.""" test_dir = "tests/testdata/" for pid in pids.keys(): algo = "sha224" algo_checksum = pids[pid][algo] path = test_dir + pid.replace("/", "_") - store.put_object(pid, path, checksum=algo_checksum, 
checksum_algorithm=algo)
+        store.store_and_validate_data(
+            pid, path, checksum=algo_checksum, checksum_algorithm=algo
+        )
     assert store.count("objects") == 3


-def test_put_object_with_incorrect_checksum(pids, store):
-    """Check put fails when bad checksum supplied."""
+def test_store_and_validate_data_with_incorrect_checksum(pids, store):
+    """Check store_and_validate_data fails when a bad checksum is supplied."""
     test_dir = "tests/testdata/"
     entity = "objects"
     for pid in pids.keys():
@@ -304,10 +323,46 @@
         algo_checksum = "badChecksumValue"
         path = test_dir + pid.replace("/", "_")
         with pytest.raises(ValueError):
-            store.put_object(pid, path, checksum=algo_checksum, checksum_algorithm=algo)
+            store.store_and_validate_data(
+                pid, path, checksum=algo_checksum, checksum_algorithm=algo
+            )
     assert store.count(entity) == 0


+def test_store_data_only_cid(pids, store):
+    """Check store_data_only returns correct id."""
+    test_dir = "tests/testdata/"
+    for pid in pids.keys():
+        path = test_dir + pid.replace("/", "_")
+        object_metadata = store.store_data_only(path)
+        object_metadata_id = object_metadata.id
+        assert object_metadata_id == pids[pid][store.algorithm]
+
+
+def test_store_data_only_file_size(pids, store):
+    """Check store_data_only returns correct file size."""
+    test_dir = "tests/testdata/"
+    for pid in pids.keys():
+        path = test_dir + pid.replace("/", "_")
+        object_metadata = store.store_data_only(path)
+        object_size = object_metadata.obj_size
+        assert object_size == pids[pid]["file_size_bytes"]
+
+
+def test_store_data_only_hex_digests(pids, store):
+    """Check store_data_only generates hex digests dictionary."""
+    test_dir = "tests/testdata/"
+    for pid in pids.keys():
+        path = test_dir + pid.replace("/", "_")
+        object_metadata = store.store_data_only(path)
+        object_metadata_hex_digests = object_metadata.hex_digests
+        assert object_metadata_hex_digests.get("md5") == pids[pid]["md5"]
+        assert object_metadata_hex_digests.get("sha1") == pids[pid]["sha1"]
+        assert object_metadata_hex_digests.get("sha256") == pids[pid]["sha256"]
+        assert object_metadata_hex_digests.get("sha384") == pids[pid]["sha384"]
+        assert object_metadata_hex_digests.get("sha512") == pids[pid]["sha512"]
+
+
 def test_move_and_get_checksums_id(pids, store):
     """Test move returns correct id."""
     test_dir = "tests/testdata/"
@@ -321,8 +376,7 @@
             _,
         ) = store._move_and_get_checksums(pid, input_stream)
         input_stream.close()
-        object_cid = store.get_sha256_hex_digest(pid)
-        assert move_id == object_cid
+        assert move_id == pids[pid][store.algorithm]


 def test_move_and_get_checksums_file_size(pids, store):
@@ -381,8 +435,8 @@ def test_move_and_get_checksums_duplicates_raises_error(pids, store):
     assert store.count(entity) == 3


-def test_move_and_get_checksums_file_size_raises_error(pids, store):
-    """Test move and get checksum raises error with incorrect file size"""
+def test_move_and_get_checksums_incorrect_file_size(pids, store):
+    """Test move and get checksum raises error with an incorrect file size."""
     test_dir = "tests/testdata/"
     for pid in pids.keys():
         with pytest.raises(ValueError):
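The renamed `_write_to_tmp_file_and_get_hex_digests` tests that follow all rely on computing several digests in a single pass over the input stream. A minimal sketch of that technique using only `hashlib` (the helper name and chunk size are illustrative, not the library's API):

```python
import hashlib

def multi_hex_digests(stream, algorithms=("md5", "sha1", "sha256", "sha384", "sha512")):
    # Feed each read chunk to every hash object so the stream is read only once.
    hashers = {algo: hashlib.new(algo) for algo in algorithms}
    for chunk in iter(lambda: stream.read(1024 * 1024), b""):
        for hasher in hashers.values():
            hasher.update(chunk)
    return {algo: hasher.hexdigest() for algo, hasher in hashers.items()}
```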
@@ -401,8 +455,8 @@
             input_stream.close()


-def test_mktempfile_additional_algo(store):
-    """Test _mktempfile returns correct hex digests for additional algorithm."""
+def test_write_to_tmp_file_and_get_hex_digests_additional_algo(store):
+    """Test _write...hex_digests returns correct hex digests with an additional
+    algorithm."""
     test_dir = "tests/testdata/"
     pid = "jtao.1700.1"
     path = test_dir + pid
@@ -412,15 +466,16 @@
         "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf"
     )
     # pylint: disable=W0212
-    hex_digests, _, _ = store._mktmpfile(
+    hex_digests, _, _ = store._write_to_tmp_file_and_get_hex_digests(
         input_stream, additional_algorithm=checksum_algo
     )
     input_stream.close()
     assert hex_digests.get("sha3_256") == checksum_correct


-def test_mktempfile_checksum_algo(store):
-    """Test _mktempfile returns correct hex digests for checksum algorithm."""
+def test_write_to_tmp_file_and_get_hex_digests_checksum_algo(store):
+    """Test _write...hex_digests returns correct hex digests when a checksum_algorithm
+    is provided."""
     test_dir = "tests/testdata/"
     pid = "jtao.1700.1"
     path = test_dir + pid
@@ -430,13 +485,16 @@
         "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf"
     )
     # pylint: disable=W0212
-    hex_digests, _, _ = store._mktmpfile(input_stream, checksum_algorithm=checksum_algo)
+    hex_digests, _, _ = store._write_to_tmp_file_and_get_hex_digests(
+        input_stream, checksum_algorithm=checksum_algo
+    )
     input_stream.close()
     assert hex_digests.get("sha3_256") == checksum_correct


-def test_mktempfile_checksum_and_additional_algo(store):
-    """Test _mktempfile returns correct hex digests for checksum algorithm."""
+def test_write_to_tmp_file_and_get_hex_digests_checksum_and_additional_algo(store):
+    """Test _write...hex_digests returns correct hex digests when both an additional
+    and a checksum algorithm are provided."""
     test_dir = "tests/testdata/"
     pid = "jtao.1700.1"
     path = test_dir + pid
@@ -450,7 +508,7 @@
         "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf"
     )
     # pylint: disable=W0212
-    hex_digests, _, _ = store._mktmpfile(
+    hex_digests, _, _ = store._write_to_tmp_file_and_get_hex_digests(
         input_stream,
         additional_algorithm=additional_algo,
         checksum_algorithm=checksum_algo,
@@ -460,8 +518,10 @@
     assert hex_digests.get("sha224") == additional_algo_checksum


-def test_mktempfile_checksum_and_additional_algo_duplicate(store):
-    """Test _mktempfile succeeds with duplicate algorithms (de-duplicates)."""
+def test_write_to_tmp_file_and_get_hex_digests_checksum_and_additional_algo_duplicate(
+    store,
+):
+    """Test _write...hex_digests succeeds with duplicate algorithms (de-duplicates)."""
     test_dir = "tests/testdata/"
     pid = "jtao.1700.1"
     path = test_dir + pid
@@ -470,7 +530,7 @@
     checksum_algo = "sha224"
     checksum_correct = "9b3a96f434f3c894359193a63437ef86fbd5a1a1a6cc37f1d5013ac1"
     # pylint: disable=W0212
-    hex_digests, _, _ = store._mktmpfile(
+    hex_digests, _, _ = store._write_to_tmp_file_and_get_hex_digests(
         input_stream,
         additional_algorithm=additional_algo,
         checksum_algorithm=checksum_algo,
@@ -479,26 +539,26 @@
     assert hex_digests.get("sha224") == checksum_correct


-def test_mktempfile_file_size(pids, store):
-    """Test _mktempfile returns correct file size."""
+def test_write_to_tmp_file_and_get_hex_digests_file_size(pids, store):
+    """Test _write...hex_digests returns correct file size."""
     test_dir = "tests/testdata/"
     for pid in pids.keys():
         path = test_dir + pid.replace("/", "_")
         input_stream = io.open(path, "rb")
         #
pylint: disable=W0212 - _, _, tmp_file_size = store._mktmpfile(input_stream) + _, _, tmp_file_size = store._write_to_tmp_file_and_get_hex_digests(input_stream) input_stream.close() assert tmp_file_size == pids[pid]["file_size_bytes"] -def test_mktempfile_hex_digests(pids, store): - """Test _mktempfile returns correct hex digests.""" +def test_write_to_tmp_file_and_get_hex_digests_hex_digests(pids, store): + """Test _write...hex_digests returns correct hex digests.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") # pylint: disable=W0212 - hex_digests, _, _ = store._mktmpfile(input_stream) + hex_digests, _, _ = store._write_to_tmp_file_and_get_hex_digests(input_stream) input_stream.close() assert hex_digests.get("md5") == pids[pid]["md5"] assert hex_digests.get("sha1") == pids[pid]["sha1"] @@ -507,20 +567,20 @@ def test_mktempfile_hex_digests(pids, store): assert hex_digests.get("sha512") == pids[pid]["sha512"] -def test_mktempfile_tmpfile_object(pids, store): - """Test _mktempfile creates file successfully.""" +def test_write_to_tmp_file_and_get_hex_digests_tmpfile_object(pids, store): + """Test _write...hex_digests returns a tmp file successfully.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") # pylint: disable=W0212 - _, tmp_file_name, _ = store._mktmpfile(input_stream) + _, tmp_file_name, _ = store._write_to_tmp_file_and_get_hex_digests(input_stream) input_stream.close() assert os.path.isfile(tmp_file_name) is True -def test_mktempfile_with_unsupported_algorithm(pids, store): - """Test _mktempfile raises error when bad algorithm supplied.""" +def test_write_to_tmp_file_and_get_hex_digests_with_unsupported_algorithm(pids, store): + """Test _write...hex_digests raises an exception when an unsupported algorithm supplied.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -528,35 +588,48 @@ def test_mktempfile_with_unsupported_algorithm(pids, store): algo = "md2" with pytest.raises(ValueError): # pylint: disable=W0212 - _, _, _ = store._mktmpfile(input_stream, additional_algorithm=algo) + _, _, _ = store._write_to_tmp_file_and_get_hex_digests( + input_stream, additional_algorithm=algo + ) with pytest.raises(ValueError): # pylint: disable=W0212 - _, _, _ = store._mktmpfile(input_stream, checksum_algorithm=algo) + _, _, _ = store._write_to_tmp_file_and_get_hex_digests( + input_stream, checksum_algorithm=algo + ) input_stream.close() +def test_mktmpfile(store): + """Test that _mktmpfile creates and returns a tmp file.""" + path = store.root + "/doutest/tmp/" + store.create_path(path) + # pylint: disable=W0212 + tmp = store._mktmpfile(path) + assert os.path.exists(tmp.name) + + def test_put_metadata_with_path(pids, store): - """Test put_metadata with path object.""" + """Test put_metadata with path object for the path arg.""" entity = "metadata" test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - metadata_cid = store.store_metadata(pid, syspath, format_id) + metadata_cid = store.put_metadata(syspath, pid, format_id) assert store.exists(entity, metadata_cid) assert store.count(entity) == 3 def test_put_metadata_with_string(pids, store): - """Test_put metadata with string.""" + """Test_put metadata with string for the path arg.""" entity = "metadata" 
test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = str(Path(test_dir) / filename) - metadata_cid = store.store_metadata(pid, syspath, format_id) + metadata_cid = store.put_metadata(syspath, pid, format_id) assert store.exists(entity, metadata_cid) assert store.count(entity) == 3 @@ -568,14 +641,13 @@ def test_put_metadata_cid(pids, store): for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - metadata_cid = store.store_metadata(pid, syspath, format_id) + metadata_cid = store.put_metadata(syspath, pid, format_id) assert metadata_cid == pids[pid]["metadata_cid"] def test_mktmpmetadata(pids, store): """Test mktmpmetadata creates tmpFile.""" test_dir = "tests/testdata/" - entity = "metadata" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename @@ -583,7 +655,109 @@ def test_mktmpmetadata(pids, store): # pylint: disable=W0212 tmp_name = store._mktmpmetadata(sys_stream) sys_stream.close() - assert store.exists(entity, tmp_name) + assert os.path.exists(tmp_name) + + +# Tests for FileHashStore Utility & Supporting Methods + + +def test_validate_arg_object(pids, store): + """Test _validate_arg_object succeeds given good arguments.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + hex_digests = object_metadata.hex_digests + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + expected_file_size = object_metadata.obj_size + # pylint: disable=W0212 + store._validate_arg_object( + None, + checksum, + checksum_algorithm, + None, + hex_digests, + None, + expected_file_size, + expected_file_size, + ) + + +def test_validate_arg_object_incorrect_size(pids, store): + """Test _validate_arg_object throws exception when size is incorrect.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + hex_digests = object_metadata.hex_digests + checksum = hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + with pytest.raises(ValueError): + # pylint: disable=W0212 + store._validate_arg_object( + None, + checksum, + checksum_algorithm, + None, + hex_digests, + None, + 1000, + 2000, + ) + + +def test_validate_arg_object_incorrect_size_with_pid(pids, store): + """Test _validate_arg_object deletes the expected tmp file if obj size does + not match and raises an exception.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + hex_digests = object_metadata.hex_digests + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + expected_file_size = object_metadata.obj_size + + objects_tmp_folder = store.objects + "/tmp" + # pylint: disable=W0212 + tmp_file = store._mktmpfile(objects_tmp_folder) + assert os.path.isfile(tmp_file.name) + with pytest.raises(ValueError): + store._validate_arg_object( + "Test_Pid", + checksum, + checksum_algorithm, + None, + hex_digests, + tmp_file.name, + 1000, + expected_file_size, + ) + assert not os.path.isfile(tmp_file.name) + + +def test_validate_arg_object_missing_key_in_hex_digests(pids, store): + """Test _validate_arg_object throws exception when algorithm is not found in hex 
digests.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = "blake2s" + expected_file_size = object_metadata.obj_size + with pytest.raises(KeyError): + # pylint: disable=W0212 + store._validate_arg_object( + None, + checksum, + checksum_algorithm, + None, + object_metadata.hex_digests, + None, + expected_file_size, + expected_file_size, + ) def test_clean_algorithm(store): @@ -626,13 +800,21 @@ def test_get_store_path_metadata(store): assert path_metadata_string.endswith("/metacat/metadata") +def test_get_store_path_refs(store): + """Check get_store_path for refs path.""" + # pylint: disable=W0212 + path_metadata = store.get_store_path("refs") + path_metadata_string = str(path_metadata) + assert path_metadata_string.endswith("/metacat/refs") + + def test_exists_with_object_metadata_id(pids, store): """Test exists method with an absolute file path.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) assert store.exists(entity, object_metadata.id) @@ -642,7 +824,7 @@ def test_exists_with_sharded_path(pids, store): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) object_metadata_shard = store.shard(object_metadata.id) object_metadata_shard_path = "/".join(object_metadata_shard) assert store.exists(entity, object_metadata_shard_path) @@ -675,7 +857,7 @@ def test_open_objects(pids, store): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) object_metadata_id = object_metadata.id io_buffer = store.open(entity, object_metadata_id) assert isinstance(io_buffer, io.BufferedReader) @@ -683,12 +865,12 @@ def test_open_objects(pids, store): def test_delete_by_object_metadata_id(pids, store): - """Check objects are deleted after calling delete with hash address id.""" + """Check objects are deleted after calling delete with object id.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) object_metadata_id = object_metadata.id store.delete(entity, object_metadata_id) assert store.count(entity) == 0 @@ -705,9 +887,10 @@ def test_remove_empty_removes_empty_folders_string(store): assert os.path.exists(os.path.join(store.root, three_dirs)) assert os.path.exists(os.path.join(store.root, two_dirs)) assert os.path.exists(os.path.join(store.root, one_dir)) - store.remove_empty(os.path.join(store.root, three_dirs)) - store.remove_empty(os.path.join(store.root, two_dirs)) - store.remove_empty(os.path.join(store.root, one_dir)) + # pylint: disable=W0212 + store._remove_empty(os.path.join(store.root, three_dirs)) + store._remove_empty(os.path.join(store.root, two_dirs)) + store._remove_empty(os.path.join(store.root, one_dir)) assert not os.path.exists(os.path.join(store.root, three_dirs)) assert not os.path.exists(os.path.join(store.root, two_dirs)) assert not os.path.exists(os.path.join(store.root, one_dir)) @@ -724,9 
+907,10 @@ def test_remove_empty_removes_empty_folders_path(store): assert (store.root / three_dirs).exists() assert (store.root / two_dirs).exists() assert (store.root / one_dir).exists() - store.remove_empty(store.root / three_dirs) - store.remove_empty(store.root / two_dirs) - store.remove_empty(store.root / one_dir) + # pylint: disable=W0212 + store._remove_empty(store.root / three_dirs) + store._remove_empty(store.root / two_dirs) + store._remove_empty(store.root / one_dir) assert not (store.root / three_dirs).exists() assert not (store.root / two_dirs).exists() assert not (store.root / one_dir).exists() @@ -737,13 +921,14 @@ def test_remove_empty_does_not_remove_nonempty_folders(pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) object_metadata_shard = store.shard(object_metadata.id) object_metadata_shard_path = "/".join(object_metadata_shard) # Get parent directory of the relative path parent_dir = os.path.dirname(object_metadata_shard_path) # Attempt to remove the parent directory - store.remove_empty(parent_dir) + # pylint: disable=W0212 + store._remove_empty(parent_dir) abs_parent_dir = store.objects + "/" + parent_dir assert os.path.exists(abs_parent_dir) @@ -800,7 +985,7 @@ def test_get_real_path_with_object_id(store, pids): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) obj_abs_path = store.get_real_path(entity, object_metadata.id) assert os.path.exists(obj_abs_path) @@ -811,7 +996,7 @@ def test_get_real_path_with_object_id_sharded(pids, store): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) object_metadata_shard = store.shard(object_metadata.id) object_metadata_shard_path = "/".join(object_metadata_shard) obj_abs_path = store.get_real_path(entity, object_metadata_shard_path) @@ -837,7 +1022,7 @@ def test_get_real_path_with_bad_entity(store, pids): entity = "bad_entity" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - object_metadata = store.put_object(pid, path) + object_metadata = store.store_and_validate_data(pid, path) with pytest.raises(ValueError): store.get_real_path(entity, object_metadata.id) @@ -848,10 +1033,10 @@ def test_build_abs_path(store, pids): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - _ = store.put_object(pid, path) + _ = store.store_and_validate_data(pid, path) # pylint: disable=W0212 - abs_path = store.build_abs_path(entity, pids[pid]["object_cid"]) - assert abs_path + abs_path = store.build_abs_path(entity, pids[pid][store.algorithm]) + assert os.path.exists(abs_path) def test_count(pids, store): @@ -860,7 +1045,7 @@ def test_count(pids, store): entity = "objects" for pid in pids.keys(): path_string = test_dir + pid.replace("/", "_") - store.put_object(pid, path_string) + store.store_and_validate_data(pid, path_string) assert store.count(entity) == 3 @@ -870,10 +1055,3 @@ def test_to_bytes(store): # pylint: disable=W0212 string_bytes = store._to_bytes(string) assert isinstance(string_bytes, bytes) - - -def test_get_sha256_hex_digest(pids, store): - """Test for correct sha256 return value.""" - for pid in pids: - hash_val = 
store.get_sha256_hex_digest(pid) - assert hash_val == pids[pid]["object_cid"] diff --git a/tests/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py index 92b125cb..c06c23d1 100644 --- a/tests/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -1,4 +1,4 @@ -"""Test module for FileHashStore HashStore interface methods""" +"""Test module for FileHashStore HashStore interface methods.""" import io import os from pathlib import Path @@ -34,14 +34,10 @@ def test_store_object(pids, store): """Test store object.""" test_dir = "tests/testdata/" entity = "objects" - format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): path = Path(test_dir + pid.replace("/", "_")) - filename = pid.replace("/", "_") + ".xml" - syspath = Path(test_dir) / filename object_metadata = store.store_object(pid, path) - _metadata_cid = store.store_metadata(pid, syspath, format_id) - assert object_metadata.id == pids[pid]["object_cid"] + assert object_metadata.id == pids[pid][store.algorithm] assert store.count(entity) == 3 @@ -49,14 +45,10 @@ def test_store_object_files_path(pids, store): """Test store object when given a path.""" test_dir = "tests/testdata/" entity = "objects" - format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): path = Path(test_dir + pid.replace("/", "_")) - filename = pid.replace("/", "_") + ".xml" - syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path) - _metadata_cid = store.store_metadata(pid, syspath, format_id) - assert store.exists(entity, pids[pid]["object_cid"]) + assert store.exists(entity, pids[pid][store.algorithm]) assert store.count(entity) == 3 @@ -64,14 +56,10 @@ def test_store_object_files_string(pids, store): """Test store object when given a string.""" test_dir = "tests/testdata/" entity = "objects" - format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): path_string = test_dir + pid.replace("/", "_") - filename = pid.replace("/", "_") + ".xml" - syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path_string) - _metadata_cid = store.store_metadata(pid, syspath, format_id) - assert store.exists(entity, pids[pid]["object_cid"]) + assert store.exists(entity, pids[pid][store.algorithm]) assert store.count(entity) == 3 @@ -84,18 +72,17 @@ def test_store_object_files_input_stream(pids, store): input_stream = io.open(path, "rb") _object_metadata = store.store_object(pid, input_stream) input_stream.close() - object_cid = store.get_sha256_hex_digest(pid) - assert store.exists(entity, object_cid) + assert store.exists(entity, pids[pid][store.algorithm]) assert store.count(entity) == 3 def test_store_object_id(pids, store): - """Test store object returns expected id (object_cid).""" + """Test store object returns expected id.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store.store_object(pid, path) - assert object_metadata.id == pids[pid]["object_cid"] + assert object_metadata.id == pids[pid][store.algorithm] def test_store_object_obj_size(pids, store): @@ -139,15 +126,6 @@ def test_store_object_pid_empty_spaces(store): store.store_object(" ", path) -def test_store_object_pid_none(store): - """Test store object raises error when supplied with 'None' pid.""" - test_dir = "tests/testdata/" - pid = "jtao.1700.1" - path = test_dir + pid - with pytest.raises(ValueError): - store.store_object(None, path) - - def 
test_store_object_data_incorrect_type_none(store): """Test store object raises error when data is 'None'.""" pid = "jtao.1700.1" @@ -192,11 +170,10 @@ def test_store_object_additional_algorithm_hyphen_uppercase(pids, store): object_metadata = store.store_object(pid, path, algorithm_with_hyphen_and_upper) sha256_cid = object_metadata.hex_digests.get("sha384") assert sha256_cid == pids[pid]["sha384"] - object_cid = store.get_sha256_hex_digest(pid) - assert store.exists(entity, object_cid) + assert store.exists(entity, pids[pid][store.algorithm]) -def test_store_object_additional_algorithm_hyphen_lowercase(store): +def test_store_object_additional_algorithm_hyphen_lowercase(pids, store): """Test store object with additional algorithm in lowercase.""" test_dir = "tests/testdata/" entity = "objects" @@ -209,11 +186,10 @@ def test_store_object_additional_algorithm_hyphen_lowercase(store): "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) assert additional_sha3_256_hex_digest == sha3_256_checksum - object_cid = store.get_sha256_hex_digest(pid) - assert store.exists(entity, object_cid) + assert store.exists(entity, pids[pid][store.algorithm]) -def test_store_object_additional_algorithm_underscore(store): +def test_store_object_additional_algorithm_underscore(pids, store): """Test store object with additional algorithm with underscore.""" test_dir = "tests/testdata/" entity = "objects" @@ -226,8 +202,7 @@ def test_store_object_additional_algorithm_underscore(store): "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) assert additional_sha3_256_hex_digest == sha3_256_checksum - pid_hash = store.get_sha256_hex_digest(pid) - assert store.exists(entity, pid_hash) + assert store.exists(entity, pids[pid][store.algorithm]) def test_store_object_checksum_correct(store): @@ -356,7 +331,7 @@ def test_store_object_checksum_incorrect_checksum(store): ) -def test_store_object_duplicate_raises_error(store): +def test_store_object_duplicate_raises_error(pids, store): """Test store duplicate object throws FileExistsError.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" @@ -368,8 +343,7 @@ def test_store_object_duplicate_raises_error(store): with pytest.raises(FileExistsError): _object_metadata_two = store.store_object(pid, path) assert store.count(entity) == 1 - object_cid = store.get_sha256_hex_digest(pid) - assert store.exists(entity, object_cid) + assert store.exists(entity, pids[pid][store.algorithm]) def test_store_object_with_obj_file_size(store, pids): @@ -415,7 +389,7 @@ def test_store_object_with_obj_file_size_zero(store, pids): store.store_object(pid, path, expected_object_size=obj_file_size) -def test_store_object_duplicates_threads(store): +def test_store_object_duplicates_threads(pids, store): """Test store object thread lock.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" @@ -424,10 +398,10 @@ def test_store_object_duplicates_threads(store): file_exists_error_flag = False - def store_object_wrapper(pid, path): + def store_object_wrapper(obj_pid, obj_path): nonlocal file_exists_error_flag try: - store.store_object(pid, path) # Call store_object inside the thread + store.store_object(obj_pid, obj_path) # Call store_object inside the thread except FileExistsError: file_exists_error_flag = True @@ -442,8 +416,7 @@ def store_object_wrapper(pid, path): thread3.join() # One thread will succeed, file count must still be 1 assert store.count(entity) == 1 - object_cid = store.get_sha256_hex_digest(pid) - assert store.exists(entity, object_cid) + assert 
store.exists(entity, pids[pid][store.algorithm]) assert file_exists_error_flag @@ -471,10 +444,10 @@ def test_store_object_interrupt_process(store): interrupt_flag = False - def store_object_wrapper(pid, path): + def store_object_wrapper(obj_pid, path): print(store.root) while not interrupt_flag: - store.store_object(pid, path) # Call store_object inside the thread + store.store_object(obj_pid, path) # Call store_object inside the thread # Create/start the thread thread = threading.Thread(target=store_object_wrapper, args=(pid, file_path)) @@ -517,8 +490,7 @@ def test_store_object_large_file(store): pid = "testfile_filehashstore" object_metadata = store.store_object(pid, file_path) object_metadata_id = object_metadata.id - pid_sha256_hex_digest = store.get_sha256_hex_digest(pid) - assert object_metadata_id == pid_sha256_hex_digest + assert object_metadata_id == object_metadata.hex_digests.get("sha256") @slow_test @@ -537,8 +509,35 @@ def test_store_object_sparse_large_file(store): pid = "testfile_filehashstore" object_metadata = store.store_object(pid, file_path) object_metadata_id = object_metadata.id - pid_sha256_hex_digest = store.get_sha256_hex_digest(pid) - assert object_metadata_id == pid_sha256_hex_digest + assert object_metadata_id == object_metadata.hex_digests.get("sha256") + + +def test_find_object(pids, store): + """Test find object returns the correct content identifier (cid).""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + cid = store.find_object(pid) + assert cid == object_metadata.hex_digests.get("sha256") + + +def test_find_object_pid_object_does_not_exist(store): + """Test find object throws exception when object doesn't exist.""" + with pytest.raises(FileNotFoundError): + store.find_object("dou.test.1") + + +def test_find_object_pid_none(store): + """Test find object throws exception when pid is None.""" + with pytest.raises(ValueError): + store.find_object(None) + + +def test_find_object_pid_empty(store): + """Test find object throws exception when pid is empty.""" + with pytest.raises(ValueError): + store.find_object("") def test_store_metadata(pids, store): @@ -546,10 +545,8 @@ def test_store_metadata(pids, store): test_dir = "tests/testdata/" format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - _object_metadata = store.store_object(pid, path) metadata_cid = store.store_metadata(pid, syspath, format_id) assert metadata_cid == pids[pid]["metadata_cid"] @@ -558,10 +555,8 @@ def test_store_metadata_default_format_id(pids, store): """Test store metadata returns expected id when storing with default format_id.""" test_dir = "tests/testdata/" for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - _object_metadata = store.store_object(pid, path) metadata_cid = store.store_metadata(pid, syspath) assert metadata_cid == pids[pid]["metadata_cid"] @@ -572,10 +567,8 @@ def test_store_metadata_files_path(pids, store): entity = "metadata" format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - _object_metadata = store.store_object(pid, path) metadata_cid = store.store_metadata(pid, syspath, format_id) 
assert store.exists(entity, metadata_cid)
         assert metadata_cid == pids[pid]["metadata_cid"]

@@ -588,10 +581,8 @@ def test_store_metadata_files_string(pids, store):
     entity = "metadata"
     format_id = "http://ns.dataone.org/service/types/v2.0"
     for pid in pids.keys():
-        path_string = test_dir + pid.replace("/", "_")
         filename = pid.replace("/", "_") + ".xml"
         syspath_string = str(Path(test_dir) / filename)
-        _object_metadata = store.store_object(pid, path_string)
         metadata_cid = store.store_metadata(pid, syspath_string, format_id)
         assert store.exists(entity, metadata_cid)
     assert store.count(entity) == 3
@@ -603,8 +594,6 @@ def test_store_metadata_files_input_stream(pids, store):
     entity = "metadata"
     format_id = "http://ns.dataone.org/service/types/v2.0"
     for pid in pids.keys():
-        path = test_dir + pid.replace("/", "_")
-        _object_metadata = store.store_object(pid, path)
         filename = pid.replace("/", "_") + ".xml"
         syspath_string = str(Path(test_dir) / filename)
         syspath_stream = io.open(syspath_string, "rb")
@@ -731,7 +720,7 @@
     """Test retrieve_object raises error when supplied with bad pid."""
     pid = "jtao.1700.1"
     pid_does_not_exist = pid + "test"
-    with pytest.raises(ValueError):
+    with pytest.raises(FileNotFoundError):
         store.retrieve_object(pid_does_not_exist)

@@ -802,7 +791,7 @@ def test_retrieve_metadata_format_id_empty_spaces(store):

 def test_delete_objects(pids, store):
-    """Test delete_object successfully deletes objects."""
+    """Test delete_object successfully deletes objects from /objects."""
     test_dir = "tests/testdata/"
     entity = "objects"
     format_id = "http://ns.dataone.org/service/types/v2.0"
@@ -816,6 +805,57 @@
     assert store.count(entity) == 0


+def test_delete_objects_pid_refs_file(pids, store):
+    """Test delete_object deletes the pid refs file containing the cid."""
+    test_dir = "tests/testdata/"
+    format_id = "http://ns.dataone.org/service/types/v2.0"
+    for pid in pids.keys():
+        path = test_dir + pid.replace("/", "_")
+        filename = pid.replace("/", "_") + ".xml"
+        syspath = Path(test_dir) / filename
+        _object_metadata = store.store_object(pid, path)
+        _metadata_cid = store.store_metadata(pid, syspath, format_id)
+        store.delete_object(pid)
+        pid_refs_file_path = store.get_refs_abs_path("pid", pid)
+        assert not os.path.exists(pid_refs_file_path)
+
+
+def test_delete_objects_cid_refs_file(pids, store):
+    """Test delete_object deletes the cid refs file for the object's cid."""
+    test_dir = "tests/testdata/"
+    format_id = "http://ns.dataone.org/service/types/v2.0"
+    for pid in pids.keys():
+        path = test_dir + pid.replace("/", "_")
+        filename = pid.replace("/", "_") + ".xml"
+        syspath = Path(test_dir) / filename
+        object_metadata = store.store_object(pid, path)
+        _metadata_cid = store.store_metadata(pid, syspath, format_id)
+        cid = object_metadata.id
+        store.delete_object(pid)
+        cid_refs_file_path = store.get_refs_abs_path("cid", cid)
+        assert not os.path.exists(cid_refs_file_path)
+
+
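The delete tests here pin down the deletion rule: an object's cid refs file may only be removed once no pids remain inside it, and `delete_object` raises an OSError otherwise (see the next test). A sketch of that emptiness check under those assumptions (`cid_refs_is_empty` is an illustrative helper, not the library's API):

```python
import os

def cid_refs_is_empty(cid_refs_path):
    # A cid refs file lists one referencing pid per line; the object and the
    # refs file are only safe to delete once no pids remain in the file.
    if not os.path.exists(cid_refs_path):
        raise FileNotFoundError(cid_refs_path)
    with open(cid_refs_path, "r", encoding="utf8") as f:
        return f.read().strip() == ""
```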
+def test_delete_objects_cid_refs_file_with_pid_refs_remaining(pids, store):
+    """Test delete_object does not delete a cid refs file that still contains
+    references."""
+    test_dir = "tests/testdata/"
+    format_id = "http://ns.dataone.org/service/types/v2.0"
+    for pid in pids.keys():
+        path = test_dir + pid.replace("/", "_")
+        filename = pid.replace("/", "_") + ".xml"
+        syspath = Path(test_dir) / filename
+        object_metadata = store.store_object(pid, path)
+        cid = object_metadata.id
+        cid_refs_abs_path = store.get_refs_abs_path("cid", cid)
+        # pylint: disable=W0212
+        store._update_cid_refs(cid_refs_abs_path, "dou.test.1")
+        _metadata_cid = store.store_metadata(pid, syspath, format_id)
+        with pytest.raises(OSError):
+            store.delete_object(pid)
+        cid_refs_file_path = store.get_refs_abs_path("cid", cid)
+        assert os.path.exists(cid_refs_file_path)
+
+
 def test_delete_object_pid_empty(store):
     """Test delete_object raises error when empty pid supplied."""
     pid = " "
@@ -905,7 +945,7 @@ def test_get_hex_digest_pid_not_found(store):
     pid = "jtao.1700.1"
     pid_does_not_exist = pid + "test"
     algorithm = "sha256"
-    with pytest.raises(ValueError):
+    with pytest.raises(FileNotFoundError):
         store.get_hex_digest(pid_does_not_exist, algorithm)

diff --git a/tests/test_filehashstore_references.py b/tests/test_filehashstore_references.py
new file mode 100644
index 00000000..e4974bcc
--- /dev/null
+++ b/tests/test_filehashstore_references.py
@@ -0,0 +1,477 @@
+"""Test module for FileHashStore's reference system to tag stored objects."""
+import os
+import pytest
+
+# pylint: disable=W0212
+
+
+def test_tag_object(pids, store):
+    """Test tag object returns `True` upon successful tagging."""
+    test_dir = "tests/testdata/"
+    for pid in pids.keys():
+        path = test_dir + pid.replace("/", "_")
+        object_metadata = store.store_object(None, path)
+        object_tagged = store.tag_object(pid, object_metadata.id)
+        assert object_tagged
+
+
+def test_tag_object_pid_refs_file(pids, store):
+    """Test tag object creates the pid reference file."""
+    test_dir = "tests/testdata/"
+    for pid in pids.keys():
+        path = test_dir + pid.replace("/", "_")
+        object_metadata = store.store_object(None, path)
+        store.tag_object(pid, object_metadata.id)
+        pid_refs_file_path = store.get_refs_abs_path("pid", pid)
+        assert os.path.exists(pid_refs_file_path)
+
+
+def test_tag_object_pid_refs_file_exists(pids, store):
+    """Test tag object throws exception when pid refs file already exists."""
+    test_dir = "tests/testdata/"
+    for pid in pids.keys():
+        path = test_dir + pid.replace("/", "_")
+        object_metadata = store.store_object(None, path)
+        cid = object_metadata.id
+        store.tag_object(pid, cid)
+        pid_refs_file_path = store.get_refs_abs_path("pid", pid)
+        assert os.path.exists(pid_refs_file_path)
+        cid_refs_file_path = store.get_refs_abs_path("cid", cid)
+        assert os.path.exists(cid_refs_file_path)
+        with pytest.raises(FileExistsError):
+            store.tag_object(pid, cid)
+
+
+def test_tag_object_pid_refs_file_content(pids, store):
+    """Test tag object creates a pid reference file that contains the correct cid."""
+    test_dir = "tests/testdata/"
+    for pid in pids.keys():
+        path = test_dir + pid.replace("/", "_")
+        object_metadata = store.store_object(None, path)
+        store.tag_object(pid, object_metadata.id)
+        pid_refs_file_path = store.get_refs_abs_path("pid", pid)
+        with open(pid_refs_file_path, "r", encoding="utf8") as f:
+            pid_refs_cid = f.read()
+        assert pid_refs_cid == object_metadata.id
+
+
+def test_tag_object_cid_refs_file(pids, store):
+    """Test tag object creates the cid reference file."""
+    test_dir = "tests/testdata/"
+    for pid in pids.keys():
+        path = test_dir + pid.replace("/", "_")
+        object_metadata = store.store_object(None, path)
+        cid = object_metadata.id
+        store.tag_object(pid, object_metadata.id)
+        cid_refs_file_path = store.get_refs_abs_path("cid", cid)
+        assert os.path.exists(cid_refs_file_path)
+
+
+def test_tag_object_cid_refs_file_content(pids, store):
+    """Test tag object tags cid reference file successfully with pid."""
+    test_dir = "tests/testdata/"
+    for pid in pids.keys():
+        path = test_dir +
pid.replace("/", "_") + object_metadata = store.store_object(None, path) + store.tag_object(pid, object_metadata.id) + cid_refs_file_path = store.get_refs_abs_path("cid", object_metadata.id) + with open(cid_refs_file_path, "r", encoding="utf8") as f: + pid_refs_cid = f.read().strip() + assert pid_refs_cid == pid + + +def test_tag_object_cid_refs_file_exists(pids, store): + """Test tag object raises exception when trying to add another cid to an + existing pid reference file and that a cid reference file is not created.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(None, path) + store.tag_object(pid, object_metadata.id) + another_cid = "dou.test.1" + with pytest.raises(FileExistsError): + store.tag_object(pid, another_cid) + + second_cid_hash = store.get_refs_abs_path("cid", another_cid) + assert not os.path.exists(second_cid_hash) + + +def test_tag_object_cid_refs_update(pids, store): + """Test tag object updates a cid reference file that already exists.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(None, path) + cid = object_metadata.id + store.tag_object(pid, cid) + store.tag_object("dou.test.1", cid) + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + with open(cid_ref_abs_path, "r", encoding="utf8") as f: + cid_ref_file_pid = f.read() + + assert "dou.test.1" in cid_ref_file_pid + + +def test_verify_object(pids, store): + """Test verify object succeeds given good arguments.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + cid = object_metadata.id + store.tag_object(pid, cid) + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + expected_file_size = object_metadata.obj_size + store.verify_object( + object_metadata, checksum, checksum_algorithm, expected_file_size + ) + + +def test_verify_object_exception_incorrect_object_metadata_type(pids, store): + """Test verify object raises exception when incorrect object is given to + object_metadata arg.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + cid = object_metadata.id + store.tag_object(pid, cid) + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + expected_file_size = object_metadata.obj_size + with pytest.raises(ValueError): + store.verify_object( + "bad_type", checksum, checksum_algorithm, expected_file_size + ) + + +def test_verify_object_exception_incorrect_size(pids, store): + """Test verify object raises exception when incorrect size is supplied.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + cid = object_metadata.id + store.tag_object(pid, cid) + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + with pytest.raises(ValueError): + store.verify_object(object_metadata, checksum, checksum_algorithm, 1000) + + +def test_verify_object_exception_incorrect_checksum(pids, store): + """Test verify object raises exception when incorrect checksum is supplied.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = 
store.store_object(data=path) + cid = object_metadata.id + store.tag_object(pid, cid) + checksum_algorithm = store.algorithm + expected_file_size = object_metadata.obj_size + with pytest.raises(ValueError): + store.verify_object( + object_metadata, "abc123", checksum_algorithm, expected_file_size + ) + + +def test_verify_object_exception_incorrect_checksum_algo(pids, store): + """Test verify object raises exception when incorrect algorithm is supplied.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + cid = object_metadata.id + store.tag_object(pid, cid) + checksum = object_metadata.hex_digests.get(store.algorithm) + expected_file_size = object_metadata.obj_size + with pytest.raises(ValueError): + store.verify_object(object_metadata, checksum, "md2", expected_file_size) + + +def test_write_cid_refs_file(pids, store): + """Test that write_cid_reference writes a reference file.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store._write_cid_refs_file(cid_ref_abs_path, pid) + assert os.path.exists(cid_ref_abs_path) + + +def test_write_cid_refs_file_content(pids, store): + """Test that write_cid_ref_file writes the expected content.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store._write_cid_refs_file(cid_ref_abs_path, pid) + + with open(cid_ref_abs_path, "r", encoding="utf8") as f: + cid_ref_file_pid = f.read() + + assert pid == cid_ref_file_pid.strip() + + +def test_write_cid_refs_file_into_empty_file(pids, store): + """Test that write_cid_reference writes an empty file.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + with open(cid_ref_abs_path, "w", encoding="utf8"): + pass + store._write_cid_refs_file(cid_ref_abs_path, pid) + assert os.path.exists(cid_ref_abs_path) + + +def test_write_cid_refs_file_file_not_empty(pids, store): + """Test that write_cid_reference does not overwrite an existing file.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store._write_cid_refs_file(cid_ref_abs_path, pid) + with pytest.raises(OSError): + store._write_cid_refs_file(cid_ref_abs_path, "other_pid") + + +def test_update_cid_refs_content(pids, store): + """Test that update_cid_ref updates the ref file as expected.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store._write_cid_refs_file(cid_ref_abs_path, pid) + + pid_other = "dou.test.1" + store._update_cid_refs(cid_ref_abs_path, pid_other) + + with open(cid_ref_abs_path, "r", encoding="utf8") as f: + for _, line in enumerate(f, start=1): + value = line.strip() + assert value == pid or value == pid_other + + +def test_update_cid_refs_content_multiple(pids, store): + """Test that update_cid_refs adds multiple references successfully.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + store._write_cid_refs_file(cid_ref_abs_path, 
+
+
+def test_update_cid_refs_content_multiple(pids, store):
+    """Test that _update_cid_refs adds multiple references successfully."""
+    for pid in pids.keys():
+        cid = pids[pid]["sha256"]
+        cid_ref_abs_path = store.get_refs_abs_path("cid", cid)
+        store.create_path(os.path.dirname(cid_ref_abs_path))
+        store._write_cid_refs_file(cid_ref_abs_path, pid)
+
+        cid_reference_list = [pid]
+        for i in range(5):
+            store._update_cid_refs(cid_ref_abs_path, f"dou.test.{i}")
+            cid_reference_list.append(f"dou.test.{i}")
+
+        line_count = 0
+        with open(cid_ref_abs_path, "r", encoding="utf8") as f:
+            for line in f:
+                line_count += 1
+                value = line.strip()
+                assert value in cid_reference_list
+
+        assert line_count == 6
+
+
+def test_update_cid_refs_content_pid_exists(pids, store):
+    """Test that _update_cid_refs raises a ValueError if the pid already exists."""
+    for pid in pids.keys():
+        cid = pids[pid]["sha256"]
+        cid_ref_abs_path = store.get_refs_abs_path("cid", cid)
+        store.create_path(os.path.dirname(cid_ref_abs_path))
+        store._write_cid_refs_file(cid_ref_abs_path, pid)
+        with pytest.raises(ValueError):
+            store._update_cid_refs(cid_ref_abs_path, pid)
+
+
+def test_update_cid_refs_content_cid_refs_does_not_exist(pids, store):
+    """Test that _update_cid_refs raises a FileNotFoundError if the cid refs file
+    doesn't exist."""
+    for pid in pids.keys():
+        cid = pids[pid]["sha256"]
+        cid_ref_abs_path = store.get_refs_abs_path("cid", cid)
+        with pytest.raises(FileNotFoundError):
+            store._update_cid_refs(cid_ref_abs_path, pid)
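+
+
+# Cleanup flow sketch for the deletion tests below (an assumption drawn from
+# how the helpers are exercised here, not a documented contract): pids are
+# removed one at a time, and the refs file itself may only be deleted once it
+# is empty, e.g.:
+#     store._delete_cid_refs_pid(cid_ref_abs_path, pid)  # drop the pid line
+#     store._delete_cid_refs_file(cid_ref_abs_path)      # returns True once empty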
+
+
+def test_delete_cid_refs_pid(pids, store):
+    """Test that _delete_cid_refs_pid deletes the given pid from the refs file."""
+    for pid in pids.keys():
+        cid = pids[pid]["sha256"]
+        cid_ref_abs_path = store.get_refs_abs_path("cid", cid)
+        store.create_path(os.path.dirname(cid_ref_abs_path))
+        store._write_cid_refs_file(cid_ref_abs_path, pid)
+
+        pid_other = "dou.test.1"
+        store._update_cid_refs(cid_ref_abs_path, pid_other)
+        store._delete_cid_refs_pid(cid_ref_abs_path, pid)
+
+        with open(cid_ref_abs_path, "r", encoding="utf8") as f:
+            for line in f:
+                value = line.strip()
+                assert value == pid_other
+
+
+def test_delete_cid_refs_pid_pid_not_found(pids, store):
+    """Test that _delete_cid_refs_pid raises an exception when the pid is not found."""
+    for pid in pids.keys():
+        cid = pids[pid]["sha256"]
+        cid_ref_abs_path = store.get_refs_abs_path("cid", cid)
+        store.create_path(os.path.dirname(cid_ref_abs_path))
+        store._write_cid_refs_file(cid_ref_abs_path, pid)
+
+        pid_other = "dou.test.1"
+        store._update_cid_refs(cid_ref_abs_path, pid_other)
+        with pytest.raises(ValueError):
+            store._delete_cid_refs_pid(cid_ref_abs_path, "dou.not.found.1")
+
+
+def test_delete_cid_refs_file(pids, store):
+    """Test that _delete_cid_refs_file deletes an emptied reference file."""
+    for pid in pids.keys():
+        cid = pids[pid]["sha256"]
+        cid_ref_abs_path = store.get_refs_abs_path("cid", cid)
+        store.create_path(os.path.dirname(cid_ref_abs_path))
+        store._write_cid_refs_file(cid_ref_abs_path, pid)
+        store._delete_cid_refs_pid(cid_ref_abs_path, pid)
+        cid_refs_deleted = store._delete_cid_refs_file(cid_ref_abs_path)
+
+        assert cid_refs_deleted
+        assert not os.path.exists(cid_ref_abs_path)
+
+
+def test_delete_cid_refs_file_file_not_empty(pids, store):
+    """Test that _delete_cid_refs_file raises an exception when the refs file is
+    not empty."""
+    for pid in pids.keys():
+        cid = pids[pid]["sha256"]
+        cid_ref_abs_path = store.get_refs_abs_path("cid", cid)
+        store.create_path(os.path.dirname(cid_ref_abs_path))
+        store._write_cid_refs_file(cid_ref_abs_path, pid)
+        with pytest.raises(OSError):
+            store._delete_cid_refs_file(cid_ref_abs_path)
+
+
+def test_delete_cid_refs_file_file_not_found(pids, store):
+    """Test that _delete_cid_refs_file raises an exception when the refs file is
+    not found."""
+    for pid in pids.keys():
+        cid = pids[pid]["sha256"]
+        cid_ref_abs_path = store.get_refs_abs_path("cid", cid)
+        with pytest.raises(FileNotFoundError):
+            store._delete_cid_refs_file(cid_ref_abs_path)
+
+
+def test_write_pid_refs_file(pids, store):
+    """Test that _write_pid_refs_file writes a reference file."""
+    for pid in pids.keys():
+        cid = pids[pid]["sha256"]
+        pid_ref_abs_path = store.get_refs_abs_path("pid", pid)
+        store.create_path(os.path.dirname(pid_ref_abs_path))
+        store._write_pid_refs_file(pid_ref_abs_path, cid)
+        assert os.path.exists(pid_ref_abs_path)
+
+
+def test_write_pid_refs_file_content(pids, store):
+    """Test that _write_pid_refs_file writes the expected cid into the refs file."""
+    for pid in pids.keys():
+        cid = pids[pid]["sha256"]
+        pid_ref_abs_path = store.get_refs_abs_path("pid", pid)
+        store.create_path(os.path.dirname(pid_ref_abs_path))
+        store._write_pid_refs_file(pid_ref_abs_path, cid)
+
+        with open(pid_ref_abs_path, "r", encoding="utf8") as f:
+            pid_refs_cid = f.read()
+
+        assert cid == pid_refs_cid
+
+
+def test_delete_pid_refs_file(pids, store):
+    """Test that _delete_pid_refs_file deletes a reference file."""
+    for pid in pids.keys():
+        cid = pids[pid]["sha256"]
+        pid_ref_abs_path = store.get_refs_abs_path("pid", pid)
+        store.create_path(os.path.dirname(pid_ref_abs_path))
+        store._write_pid_refs_file(pid_ref_abs_path, cid)
+        store._delete_pid_refs_file(pid_ref_abs_path)
+
+        assert not os.path.exists(pid_ref_abs_path)
+
+
+def test_delete_pid_refs_file_file_not_found(pids, store):
+    """Test that _delete_pid_refs_file raises an exception when the refs file is
+    not found."""
+    for pid in pids.keys():
+        pid_ref_abs_path = store.get_refs_abs_path("pid", pid)
+        with pytest.raises(FileNotFoundError):
+            store._delete_pid_refs_file(pid_ref_abs_path)
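+
+
+# Contract of _verify_hashstore_references as the tests below exercise it (a
+# reading of these tests, not a specification): a missing pid or cid refs
+# file raises FileNotFoundError, while a cid refs file that exists but does
+# not list the expected pid raises ValueError.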
store._write_cid_refs_file(cid_ref_abs_path, "bad_pid") + with pytest.raises(ValueError): + store._verify_hashstore_references(pid, cid) + + +def test_verify_hashstore_references_cid_refs_file_with_multiple_refs_missing_pid( + pids, store +): + """Test _verify_hashstore_references throws exception when cid refs file with multiple + references does not contain the expected pid.""" + for pid in pids.keys(): + cid = pids[pid]["sha256"] + cid_ref_abs_path = store.get_refs_abs_path("cid", cid) + store.create_path(os.path.dirname(cid_ref_abs_path)) + pid_ref_abs_path = store.get_refs_abs_path("pid", pid) + store.create_path(os.path.dirname(pid_ref_abs_path)) + store._write_pid_refs_file(pid_ref_abs_path, cid) + store.create_path(os.path.dirname(pid_ref_abs_path)) + store._write_cid_refs_file(cid_ref_abs_path, "bad_pid") + + cid_reference_list = [pid] + for i in range(0, 5): + store._update_cid_refs(cid_ref_abs_path, f"dou.test.{i}") + cid_reference_list.append(f"dou.test.{i}") + + with pytest.raises(ValueError): + store._verify_hashstore_references(pid, cid) diff --git a/tests/test_filehashstore_stream.py b/tests/test_filehashstore_stream.py index 8cf4a7d0..29fa4d20 100644 --- a/tests/test_filehashstore_stream.py +++ b/tests/test_filehashstore_stream.py @@ -1,4 +1,4 @@ -"""Test module for Stream""" +"""Test module for FileHashStore's Stream class.""" import hashlib import io from pathlib import Path @@ -15,6 +15,7 @@ def test_stream_reads_file(pids): hashobj = hashlib.new("sha256") for data in obj_stream: hashobj.update(data) + obj_stream.close() hex_digest = hashobj.hexdigest() assert pids[pid]["sha256"] == hex_digest @@ -28,6 +29,7 @@ def test_stream_reads_path_object(pids): hashobj = hashlib.new("sha256") for data in obj_stream: hashobj.update(data) + obj_stream.close() hex_digest = hashobj.hexdigest() assert pids[pid]["sha256"] == hex_digest diff --git a/tests/test_hashstore.py b/tests/test_hashstore.py index 68cd195a..e161c967 100644 --- a/tests/test_hashstore.py +++ b/tests/test_hashstore.py @@ -1,4 +1,4 @@ -"""Test module for HashStore Module""" +"""Test module for HashStore's HashStoreFactory and ObjectMetadata class.""" import os import pytest from hashstore.hashstore import ObjectMetadata, HashStoreFactory @@ -43,7 +43,8 @@ def test_factory_get_hashstore_unsupported_module(factory): def test_factory_get_hashstore_filehashstore_unsupported_algorithm(factory): - """Check factory raises exception with store algorithm value that part of the default list""" + """Check factory raises exception with store algorithm value that is not part of + the default list.""" module_name = "hashstore.filehashstore" class_name = "FileHashStore" @@ -59,7 +60,7 @@ def test_factory_get_hashstore_filehashstore_unsupported_algorithm(factory): def test_factory_get_hashstore_filehashstore_incorrect_algorithm_format(factory): - """Check factory raises exception with incorrectly formatted algorithm value""" + """Check factory raises exception with incorrectly formatted algorithm value.""" module_name = "hashstore.filehashstore" class_name = "FileHashStore" @@ -67,7 +68,7 @@ def test_factory_get_hashstore_filehashstore_incorrect_algorithm_format(factory) "store_path": os.getcwd() + "/metacat/test", "store_depth": 3, "store_width": 2, - "store_algorithm": "sha256", + "store_algorithm": "dou_algo", "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", } with pytest.raises(ValueError): @@ -75,7 +76,7 @@ def test_factory_get_hashstore_filehashstore_incorrect_algorithm_format(factory) def 
diff --git a/tests/test_filehashstore_stream.py b/tests/test_filehashstore_stream.py
index 8cf4a7d0..29fa4d20 100644
--- a/tests/test_filehashstore_stream.py
+++ b/tests/test_filehashstore_stream.py
@@ -1,4 +1,4 @@
-"""Test module for Stream"""
+"""Test module for FileHashStore's Stream class."""
 import hashlib
 import io
 from pathlib import Path
@@ -15,6 +15,7 @@ def test_stream_reads_file(pids):
         hashobj = hashlib.new("sha256")
         for data in obj_stream:
             hashobj.update(data)
+        obj_stream.close()
         hex_digest = hashobj.hexdigest()
         assert pids[pid]["sha256"] == hex_digest
@@ -28,6 +29,7 @@ def test_stream_reads_path_object(pids):
         hashobj = hashlib.new("sha256")
         for data in obj_stream:
             hashobj.update(data)
+        obj_stream.close()
         hex_digest = hashobj.hexdigest()
         assert pids[pid]["sha256"] == hex_digest
diff --git a/tests/test_hashstore.py b/tests/test_hashstore.py
index 68cd195a..e161c967 100644
--- a/tests/test_hashstore.py
+++ b/tests/test_hashstore.py
@@ -1,4 +1,4 @@
-"""Test module for HashStore Module"""
+"""Test module for HashStore's HashStoreFactory and ObjectMetadata class."""
 import os
 import pytest
 from hashstore.hashstore import ObjectMetadata, HashStoreFactory
@@ -43,7 +44,8 @@ def test_factory_get_hashstore_unsupported_module(factory):
 
 
 def test_factory_get_hashstore_filehashstore_unsupported_algorithm(factory):
-    """Check factory raises exception with store algorithm value that part of the default list"""
+    """Check factory raises exception with store algorithm value that is not part of
+    the default list."""
 
     module_name = "hashstore.filehashstore"
     class_name = "FileHashStore"
@@ -59,7 +60,7 @@ def test_factory_get_hashstore_filehashstore_unsupported_algorithm(factory):
 
 
 def test_factory_get_hashstore_filehashstore_incorrect_algorithm_format(factory):
-    """Check factory raises exception with incorrectly formatted algorithm value"""
+    """Check factory raises exception with incorrectly formatted algorithm value."""
 
     module_name = "hashstore.filehashstore"
     class_name = "FileHashStore"
@@ -67,7 +68,7 @@ def test_factory_get_hashstore_filehashstore_incorrect_algorithm_format(factory)
         "store_path": os.getcwd() + "/metacat/test",
         "store_depth": 3,
         "store_width": 2,
-        "store_algorithm": "sha256",
+        "store_algorithm": "dou_algo",
         "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0",
     }
     with pytest.raises(ValueError):
@@ -75,7 +76,7 @@ def test_objectmetadata():
-    """Test class returns correct values via dot notation."""
+    """Test ObjectMetadata class returns correct values via dot notation."""
     ab_id = "hashstoretest"
     obj_size = 1234
     hex_digest_dict = {
diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py
index 7d73e524..96c9ad45 100644
--- a/tests/test_hashstore_client.py
+++ b/tests/test_hashstore_client.py
@@ -1,4 +1,4 @@
-"""Test module for the Python client (Public API calls only)"""
+"""Test module for the Python client (Public API calls only)."""
 import sys
 import os
 from pathlib import Path
@@ -41,12 +41,47 @@ def test_create_hashstore(tmp_path):
     assert os.path.exists(hashstore_client_python_log)
 
 
+def test_get_checksum(capsys, store, pids):
+    """Test calculating a hash via HashStore through client."""
+    client_directory = os.getcwd() + "/src/hashstore"
+    test_dir = "tests/testdata/"
+    for pid in pids.keys():
+        path = test_dir + pid.replace("/", "_")
+        store.store_object(pid, path)
+
+        client_module_path = f"{client_directory}/client.py"
+        test_store = store.root
+        get_checksum_opt = "-getchecksum"
+        client_pid_arg = f"-pid={pid}"
+        algo_arg = f"-algo={store.algorithm}"
+        chs_args = [
+            client_module_path,
+            test_store,
+            get_checksum_opt,
+            client_pid_arg,
+            algo_arg,
+        ]
+
+        # Add file path of HashStore to sys so modules can be discovered
+        sys.path.append(client_directory)
+        # Manually change sys args to simulate command line arguments
+        sys.argv = chs_args
+        client.main()
+
+        capsystext = capsys.readouterr().out
+        expected_output = (
+            f"guid/pid: {pid}\n"
+            + f"algorithm: {store.algorithm}\n"
+            + f"Checksum/Hex Digest: {pids[pid][store.algorithm]}\n"
+        )
+        assert capsystext == expected_output
+
+
 def test_store_object(store, pids):
     """Test storing objects to HashStore through client."""
     client_directory = os.getcwd() + "/src/hashstore"
     test_dir = "tests/testdata/"
     for pid in pids.keys():
-        path = test_dir + pid.replace("/", "_")
         client_module_path = f"{client_directory}/client.py"
         test_store = store.root
         store_object_opt = "-storeobject"
@@ -66,7 +101,7 @@ def test_store_object(store, pids):
         sys.argv = chs_args
         client.main()
 
-    assert store.exists("objects", pids[pid]["object_cid"])
+    assert store.exists("objects", pids[pid][store.algorithm])
@@ -75,7 +110,6 @@ def test_store_metadata(store, pids):
     """Test storing metadata to HashStore through client."""
     client_directory = os.getcwd() + "/src/hashstore"
     test_dir = "tests/testdata/"
     namespace = "http://ns.dataone.org/service/types/v2.0"
     for pid in pids.keys():
-        path = test_dir + pid.replace("/", "_")
         filename = pid.replace("/", "_") + ".xml"
         syspath = Path(test_dir) / filename
         client_module_path = f"{client_directory}/client.py"
@@ -108,7 +142,7 @@ def test_retrieve_objects(capsys, pids, store):
     test_dir = "tests/testdata/"
     for pid in pids.keys():
         path = test_dir + pid.replace("/", "_")
-        _object_metadata = store.store_object(pid, path)
+        store.store_object(pid, path)
 
         client_module_path = f"{client_directory}/client.py"
         test_store = store.root
@@ -188,7 +222,7 @@ def test_delete_objects(pids, store):
     test_dir = "tests/testdata/"
     for pid in pids.keys():
         path = test_dir + pid.replace("/", "_")
-        _object_metadata = store.store_object(pid, path)
+        store.store_object(pid, path)
 
         client_module_path = f"{client_directory}/client.py"
         test_store = store.root
@@ -207,7 +241,7 @@ def test_delete_objects(pids, store):
         sys.argv = chs_args
         client.main()
 
-    assert not store.exists("objects", pids[pid]["object_cid"])
+    assert not store.exists("objects", pids[pid][store.algorithm])
 
 
 def test_delete_metadata(pids, store):