diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 357e8774..c7d727dd 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -65,8 +65,8 @@ class FileHashStore(HashStore): "store_metadata_namespace", ] # Permissions settings for writing files and creating directories - fmode = 0o664 - dmode = 0o755 + f_mode = 0o664 + d_mode = 0o755 # The other algorithm list consists of additional algorithms that can be included # for calculating when storing objects, in addition to the default list. other_algo_list = [ @@ -143,17 +143,17 @@ def __init__(self, properties=None): ] # Check to see if a configuration is present in the given store path - self.hashstore_configuration_yaml = prop_store_path + "/hashstore.yaml" + self.hashstore_configuration_yaml = Path(prop_store_path) / "hashstore.yaml" self._verify_hashstore_properties(properties, prop_store_path) # If no exceptions thrown, FileHashStore ready for initialization logging.debug("FileHashStore - Initializing, properties verified.") - self.root = prop_store_path + self.root = Path(prop_store_path) self.depth = prop_store_depth self.width = prop_store_width self.sysmeta_ns = prop_store_metadata_namespace # Write 'hashstore.yaml' to store path - if not os.path.exists(self.hashstore_configuration_yaml): + if not os.path.isfile(self.hashstore_configuration_yaml): # pylint: disable=W1201 logging.debug( "FileHashStore - HashStore does not exist & configuration file not found." @@ -163,19 +163,19 @@ def __init__(self, properties=None): # Default algorithm list for FileHashStore based on config file written self._set_default_algorithms() # Complete initialization/instantiation by setting and creating store directories - self.objects = self.root + "/objects" - self.metadata = self.root + "/metadata" - self.refs = self.root + "/refs" - self.cids = self.refs + "/cids" - self.pids = self.refs + "/pids" + self.objects = self.root / "objects" + self.metadata = self.root / "metadata" + self.refs = self.root / "refs" + self.cids = self.refs / "cids" + self.pids = self.refs / "pids" if not os.path.exists(self.objects): - self._create_path(self.objects + "/tmp") + self._create_path(self.objects / "tmp") if not os.path.exists(self.metadata): - self._create_path(self.metadata + "/tmp") + self._create_path(self.metadata / "tmp") if not os.path.exists(self.refs): - self._create_path(self.refs + "/tmp") - self._create_path(self.refs + "/pids") - self._create_path(self.refs + "/cids") + self._create_path(self.refs / "tmp") + self._create_path(self.refs / "pids") + self._create_path(self.refs / "cids") logging.debug( "FileHashStore - Initialization success. Store root: %s", self.root ) @@ -192,18 +192,17 @@ def __init__(self, properties=None): @staticmethod def _load_properties( - hashstore_yaml_path: str, hashstore_required_prop_keys: List[str] + hashstore_yaml_path: Path, hashstore_required_prop_keys: List[str] ) -> Dict[str, Union[str, int]]: """Get and return the contents of the current HashStore configuration. :return: HashStore properties with the following keys (and values): - - ``store_depth`` (int): Depth when sharding an object's hex digest. - - ``store_width`` (int): Width of directories when sharding an object's hex digest. - - ``store_algorithm`` (str): Hash algo used for calculating the object's hex digest. - - ``store_metadata_namespace`` (str): Namespace for the HashStore's system metadata. - :rtype: dict + - store_depth (int): Depth when sharding an object's hex digest. 
+ - store_width (int): Width of directories when sharding an object's hex digest. + - store_algorithm (str): Hash algo used for calculating the object's hex digest. + - store_metadata_namespace (str): Namespace for the HashStore's system metadata. """ - if not os.path.exists(hashstore_yaml_path): + if not os.path.isfile(hashstore_yaml_path): exception_string = ( "FileHashStore - load_properties: hashstore.yaml not found" + " in store root path." @@ -229,15 +228,14 @@ def _write_properties(self, properties: Dict[str, Union[str, int]]) -> None: """Writes 'hashstore.yaml' to FileHashStore's root directory with the respective properties object supplied. - :param properties: A Python dictionary with the following keys (and values): - - ``store_depth`` (int): Depth when sharding an object's hex digest. - - ``store_width`` (int): Width of directories when sharding an object's hex digest. - - ``store_algorithm`` (str): Hash algo used for calculating the object's hex digest. - - ``store_metadata_namespace`` (str): Namespace for the HashStore's system metadata. - :type properties: dict + :param dict properties: A Python dictionary with the following keys (and values): + - store_depth (int): Depth when sharding an object's hex digest. + - store_width (int): Width of directories when sharding an object's hex digest. + - store_algorithm (str): Hash algo used for calculating the object's hex digest. + - store_metadata_namespace (str): Namespace for the HashStore's system metadata. """ # If hashstore.yaml already exists, must throw exception and proceed with caution - if os.path.exists(self.hashstore_configuration_yaml): + if os.path.isfile(self.hashstore_configuration_yaml): exception_string = ( "FileHashStore - write_properties: configuration file 'hashstore.yaml'" + " already exists." @@ -306,7 +304,6 @@ def _build_hashstore_yaml_string( :param str store_metadata_namespace: Namespace for the HashStore's system metadata. :return: A YAML string representing the configuration for a HashStore. - :rtype: str """ hashstore_configuration_yaml = f""" # Default configuration variables for HashStore @@ -360,7 +357,7 @@ def _verify_hashstore_properties( :param dict properties: HashStore properties. :param str prop_store_path: Store path to check. """ - if os.path.exists(self.hashstore_configuration_yaml): + if os.path.isfile(self.hashstore_configuration_yaml): logging.debug( "FileHashStore - Config found (hashstore.yaml) at {%s}. Verifying properties.", self.hashstore_configuration_yaml, @@ -412,7 +409,6 @@ def _validate_properties( :raises ValueError: If value is missing for a required key. :return: The given properties object (that has been validated). - :rtype: dict """ if not isinstance(properties, dict): exception_string = ( @@ -477,7 +473,7 @@ def lookup_algo(algo_to_translate): } return dataone_algo_translation[algo_to_translate] - if not os.path.exists(self.hashstore_configuration_yaml): + if not os.path.isfile(self.hashstore_configuration_yaml): exception_string = ( "FileHashStore - set_default_algorithms: hashstore.yaml not found" + " in store root path." 
@@ -679,8 +675,8 @@ def store_metadata( ) # Validate input parameters self._check_string(pid, "pid") - checked_format_id = self._check_arg_format_id(format_id, "store_metadata") self._check_arg_data(metadata) + checked_format_id = self._check_arg_format_id(format_id, "store_metadata") pid_doc = self._computehash(pid + checked_format_id) sync_begin_debug_msg = ( @@ -776,12 +772,13 @@ def retrieve_metadata(self, pid: str, format_id: Optional[str] = None) -> IO[byt metadata_document_name = self._computehash(pid + self.sysmeta_ns) else: metadata_document_name = self._computehash(pid + checked_format_id) - rel_path = "/".join(self._shard(metadata_directory)) - metadata_rel_path = rel_path + "/" + metadata_document_name - metadata_exists = self._exists(entity, metadata_rel_path) + metadata_rel_path = ( + Path(*self._shard(metadata_directory)) / metadata_document_name + ) + metadata_exists = self._exists(entity, str(metadata_rel_path)) if metadata_exists: - metadata_stream = self._open(entity, metadata_rel_path) + metadata_stream = self._open(entity, str(metadata_rel_path)) logging.info( "FileHashStore - retrieve_metadata: Retrieved metadata for pid: %s", pid ) @@ -931,7 +928,7 @@ def delete_object(self, pid: str) -> None: return except OrphanPidRefsFileFound: # Delete pid refs file - pid_ref_abs_path = str(self._get_hashstore_pid_refs_path(pid)) + pid_ref_abs_path = self._get_hashstore_pid_refs_path(pid) objects_to_delete.append( self._rename_path_for_deletion(pid_ref_abs_path) ) @@ -948,10 +945,10 @@ def delete_object(self, pid: str) -> None: ) # Remove pid from cid refs file pid_refs_cid = self._read_small_file_content(pid_ref_abs_path) - cid_ref_abs_str = str(self._get_hashstore_cid_refs_path(pid_refs_cid)) + cid_ref_abs_path = self._get_hashstore_cid_refs_path(pid_refs_cid) # Remove if the pid refs is found - if self._is_string_in_refs_file(pid, Path(cid_ref_abs_str)): - self._update_refs_file(Path(cid_ref_abs_str), pid, "remove") + if self._is_string_in_refs_file(pid, cid_ref_abs_path): + self._update_refs_file(cid_ref_abs_path, pid, "remove") # Remove metadata files if they exist self.delete_metadata(pid) # Remove all files confirmed for deletion @@ -959,7 +956,7 @@ def delete_object(self, pid: str) -> None: return except PidNotFoundInCidRefsFile: # Add pid refs file to be permanently deleted - pid_ref_abs_path = str(self._get_hashstore_pid_refs_path(pid)) + pid_ref_abs_path = self._get_hashstore_pid_refs_path(pid) objects_to_delete.append( self._rename_path_for_deletion(pid_ref_abs_path) ) @@ -994,7 +991,7 @@ def delete_metadata(self, pid: str, format_id: Optional[str] = None) -> None: self._check_string(pid, "pid") checked_format_id = self._check_arg_format_id(format_id, "delete_metadata") metadata_directory = self._computehash(pid) - rel_path = "/".join(self._shard(metadata_directory)) + rel_path = Path(*self._shard(metadata_directory)) if format_id is None: # Delete all metadata documents @@ -1089,9 +1086,7 @@ def delete_metadata(self, pid: str, format_id: Optional[str] = None) -> None: logging.debug(sync_begin_debug_msg) self.metadata_locked_docs_th.append(pid_doc) try: - full_path_without_directory = ( - self.metadata + "/" + rel_path + "/" + pid_doc - ) + full_path_without_directory = Path(self.metadata / rel_path / pid_doc) self._delete("metadata", full_path_without_directory) info_string = ( "FileHashStore - delete_metadata: Successfully deleted metadata for pid:" @@ -1152,7 +1147,7 @@ def _find_object(self, pid: str) -> Dict[str, str]: :param str pid: Authority-based or 
persistent identifier of the object. - :return: obj_info_dict (dict): + :return: obj_info_dict: - cid: content identifier - cid_object_path: path to the object - cid_refs_path: path to the cid refs file @@ -1165,13 +1160,13 @@ def _find_object(self, pid: str) -> Dict[str, str]: self._check_string(pid, "pid") pid_ref_abs_path = self._get_hashstore_pid_refs_path(pid) - if os.path.exists(pid_ref_abs_path): + if os.path.isfile(pid_ref_abs_path): # Read the file to get the cid from the pid reference pid_refs_cid = self._read_small_file_content(pid_ref_abs_path) # Confirm that the cid reference file exists cid_ref_abs_path = self._get_hashstore_cid_refs_path(pid_refs_cid) - if os.path.exists(cid_ref_abs_path): + if os.path.isfile(cid_ref_abs_path): # Check that the pid is actually found in the cid reference file if self._is_string_in_refs_file(pid, cid_ref_abs_path): # Object must also exist in order to return the cid retrieved @@ -1186,7 +1181,7 @@ def _find_object(self, pid: str) -> Dict[str, str]: else: sysmeta_doc_name = self._computehash(pid + self.sysmeta_ns) metadata_directory = self._computehash(pid) - metadata_rel_path = "/".join(self._shard(metadata_directory)) + metadata_rel_path = Path(*self._shard(metadata_directory)) sysmeta_full_path = ( self._get_store_path("metadata") / metadata_rel_path @@ -1201,7 +1196,7 @@ def _find_object(self, pid: str) -> Dict[str, str]: "pid_refs_path": pid_ref_abs_path, "sysmeta_path": ( sysmeta_full_path - if os.path.exists(sysmeta_full_path) + if os.path.isfile(sysmeta_full_path) else "Does not exist." ), } @@ -1487,7 +1482,7 @@ def _move_and_get_checksums( finally: # Ensure that the tmp file has been removed, the data object already exists, so it # is redundant. No exception is thrown so 'store_object' can proceed to tag object - if os.path.exists(tmp_file_name): + if os.path.isfile(tmp_file_name): self._delete("tmp", tmp_file_name) return object_cid, tmp_file_size, hex_digests @@ -1571,12 +1566,12 @@ def _write_to_tmp_file_and_get_hex_digests( + " Keyboard interruption by user." 
) logging.error(exception_string) - if os.path.exists(tmp.name): + if os.path.isfile(tmp.name): os.remove(tmp.name) finally: if not tmp_file_completion_flag: try: - if os.path.exists(tmp.name): + if os.path.isfile(tmp.name): os.remove(tmp.name) # pylint: disable=W0718 except Exception as err: @@ -1602,18 +1597,18 @@ def _mktmpfile(self, path: Path) -> IO[bytes]: # Delete tmp file if python interpreter crashes or thread is interrupted def delete_tmp_file(): - if os.path.exists(tmp.name): + if os.path.isfile(tmp.name): os.remove(tmp.name) atexit.register(delete_tmp_file) # Ensure tmp file is created with desired permissions - if self.fmode is not None: - oldmask = os.umask(0) + if self.f_mode is not None: + old_mask = os.umask(0) try: - os.chmod(tmp.name, self.fmode) + os.chmod(tmp.name, self.f_mode) finally: - os.umask(oldmask) + os.umask(old_mask) return tmp def _store_hashstore_refs_files(self, pid: str, cid: str) -> None: @@ -1636,7 +1631,7 @@ def _store_hashstore_refs_files(self, pid: str, cid: str) -> None: self._create_path(Path(os.path.dirname(pid_refs_path))) self._create_path(Path(os.path.dirname(cid_refs_path))) - if os.path.exists(pid_refs_path) and os.path.exists(cid_refs_path): + if os.path.isfile(pid_refs_path) and os.path.isfile(cid_refs_path): # If both reference files exist, we confirm that reference files are where they # are expected to be and throw an exception to inform the client that everything # is in place - and include other issues for context @@ -1659,7 +1654,7 @@ def _store_hashstore_refs_files(self, pid: str, cid: str) -> None: logging.error(rev_msg) raise HashStoreRefsAlreadyExists(err_msg) - elif os.path.exists(pid_refs_path) and not os.path.exists( + elif os.path.isfile(pid_refs_path) and not os.path.isfile( cid_refs_path ): # If pid refs exists, the pid has already been claimed and cannot be tagged we @@ -1671,7 +1666,7 @@ def _store_hashstore_refs_files(self, pid: str, cid: str) -> None: logging.error(error_msg) raise PidRefsAlreadyExistsError(error_msg) - elif not os.path.exists(pid_refs_path) and os.path.exists( + elif not os.path.isfile(pid_refs_path) and os.path.isfile( cid_refs_path ): debug_msg = ( @@ -1868,7 +1863,6 @@ def _put_metadata( :param str metadata_doc_name: Metadata document name :return: Address of the metadata document. - :rtype: Path """ logging.debug( "FileHashStore - _put_metadata: Request to put metadata for pid: %s", pid @@ -1881,11 +1875,11 @@ def _put_metadata( # Get target and related paths (permanent location) metadata_directory = self._computehash(pid) metadata_document_name = metadata_doc_name - rel_path = "/".join(self._shard(metadata_directory)) + rel_path = Path(*self._shard(metadata_directory)) full_path = self._get_store_path("metadata") / rel_path / metadata_document_name # Move metadata to target path - if os.path.exists(metadata_tmp): + if os.path.isfile(metadata_tmp): try: parent = full_path.parent parent.mkdir(parents=True, exist_ok=True) @@ -1901,13 +1895,13 @@ def _put_metadata( f"FileHashStore - _put_metadata: Unexpected {err=}, {type(err)=}" ) logging.error(exception_string) - if os.path.exists(metadata_tmp): + if os.path.isfile(metadata_tmp): # Remove tmp metadata, calling app must re-upload logging.debug( "FileHashStore - _put_metadata: Deleting metadata for pid: %s", pid, ) - self.metadata.delete(metadata_tmp) + self._delete("metadata", metadata_tmp) raise else: exception_string = ( @@ -1923,7 +1917,6 @@ def _mktmpmetadata(self, stream: "Stream") -> str: :param Stream stream: Metadata stream. 
:return: Path/name of temporary file created and written into. - :rtype: str """ # Create temporary file in .../{store_path}/tmp tmp_root_path = self._get_store_path("metadata") / "tmp" @@ -2037,7 +2030,6 @@ def _write_refs_file(self, path: Path, ref_id: str, ref_type: str) -> str: :param str ref_type: 'cid' or 'pid' :return: tmp_file_path - Path to the tmp refs file - :rtype: string """ logging.debug( "FileHashStore - _write_refs_file: Writing id (%s) into a tmp file in: %s", @@ -2076,7 +2068,7 @@ def _update_refs_file( + f" at refs file: {refs_file_path}." ) logging.debug(debug_msg) - if not os.path.exists(refs_file_path): + if not os.path.isfile(refs_file_path): exception_string = ( f"FileHashStore - _update_refs_file: {refs_file_path} does not exist." + f" Cannot {update_type} ref_id: {ref_id}" @@ -2127,7 +2119,6 @@ def _is_string_in_refs_file(ref_id: str, refs_file_path: Path) -> bool: :param path refs_file_path: Path to the refs file :return: pid_found - :rtype: boolean """ with open(refs_file_path, "r", encoding="utf8") as ref_file: # Confirm that pid is not currently already tagged @@ -2254,7 +2245,7 @@ def _verify_hashstore_references( cid_refs_path = self._get_hashstore_cid_refs_path(cid) # Check that reference files were created - if not os.path.exists(pid_refs_path): + if not os.path.isfile(pid_refs_path): exception_string = ( "FileHashStore - _verify_hashstore_references: Pid refs file missing: " + str(pid_refs_path) @@ -2262,7 +2253,7 @@ def _verify_hashstore_references( ) logging.error(exception_string) raise PidRefsFileNotFound(exception_string) - if not os.path.exists(cid_refs_path): + if not os.path.isfile(cid_refs_path): exception_string = ( "FileHashStore - _verify_hashstore_references: Cid refs file missing: " + str(cid_refs_path) @@ -2300,7 +2291,7 @@ def _delete_object_only(self, cid: str) -> None: """ cid_refs_abs_path = self._get_hashstore_cid_refs_path(cid) # If the refs file still exists, do not delete the object - if not os.path.exists(cid_refs_abs_path): + if not os.path.isfile(cid_refs_abs_path): sync_begin_debug_msg = ( f"FileHashStore - delete_object: Cid ({cid}) to locked list." ) @@ -2361,7 +2352,6 @@ def _check_arg_algorithms_and_checksum( :return: Hashlib-compatible string or 'None' for additional_algorithm and checksum_algorithm. - :rtype: str """ additional_algorithm_checked = None if additional_algorithm != self.algorithm and additional_algorithm is not None: @@ -2384,7 +2374,6 @@ def _check_arg_format_id(self, format_id: str, method: str) -> str: :param str method: Calling method for logging purposes. :return: Valid metadata namespace. - :rtype: str """ if format_id and not format_id.strip(): exception_string = f"FileHashStore - {method}: Format_id cannot be empty." @@ -2406,7 +2395,6 @@ def _refine_algorithm_list( :param str checksum_algorithm: Checksum algorithm. :return: De-duplicated list of hash algorithms. - :rtype: set """ algorithm_list_to_calculate = self.default_algo_list if checksum_algorithm is not None: @@ -2439,7 +2427,6 @@ def _clean_algorithm(self, algorithm_string: str) -> str: :param str algorithm_string: Algorithm to validate. :return: `hashlib` supported algorithm string. - :rtype: str """ count = 0 for char in algorithm_string: @@ -2473,16 +2460,15 @@ def _computehash( :param str algorithm: Algorithm of hex digest to generate. :return: Hex digest. 
- :rtype: str """ if algorithm is None: - hashobj = hashlib.new(self.algorithm) + hash_obj = hashlib.new(self.algorithm) else: check_algorithm = self._clean_algorithm(algorithm) - hashobj = hashlib.new(check_algorithm) + hash_obj = hashlib.new(check_algorithm) for data in stream: - hashobj.update(self._cast_to_bytes(data)) - hex_digest = hashobj.hexdigest() + hash_obj.update(self._cast_to_bytes(data)) + hex_digest = hash_obj.hexdigest() return hex_digest def _shard(self, checksum: str) -> List[str]: @@ -2502,7 +2488,6 @@ def _shard(self, checksum: str) -> List[str]: :return: A list where each element is a token of fixed width, with any leftover characters as the last element. - :rtype: list """ def compact(items: List[Any]) -> List[Any]: @@ -2529,7 +2514,6 @@ def _count(self, entity: str) -> int: :param str entity: Desired entity type (ex. "objects", "metadata"). :return: Number of files in the directory. - :rtype: int """ count = 0 if entity == "objects": @@ -2541,7 +2525,7 @@ def _count(self, entity: str) -> int: elif entity == "cid": directory_to_count = self.cids elif entity == "tmp": - directory_to_count = self.objects + "tmp" + directory_to_count = self.objects / "tmp" else: raise ValueError( f"entity: {entity} does not exist. Do you mean 'objects' or 'metadata'?" @@ -2559,7 +2543,6 @@ def _exists(self, entity: str, file: str) -> bool: :param str file: The name of the file to check. :return: True if the file exists. - :rtype: bool """ if entity == "objects": try: @@ -2583,7 +2566,6 @@ def _open( :param str mode: Mode to open file in. Defaults to 'rb'. :return: An `io` stream dependent on the `mode`. - :rtype: io.BufferedReader """ realpath = None if entity == "objects": @@ -2616,7 +2598,7 @@ def _delete(self, entity: str, file: Union[str, Path]) -> None: except FileNotFoundError: # Swallow file not found exceptions for metadata realpath = None - elif os.path.exists(file): + elif os.path.isfile(file): # Check if the given path is an absolute path realpath = file else: @@ -2640,7 +2622,7 @@ def _create_path(self, path: Path) -> None: :raises AssertionError: If the path already exists but is not a directory. """ try: - os.makedirs(path, self.dmode) + os.makedirs(path, self.d_mode) except FileExistsError: assert os.path.isdir(path), f"expected {path} to be a directory" @@ -2651,7 +2633,6 @@ def _get_store_path(self, entity: str) -> Path: Note, "cid" and "pid" are refs specific directories. :return: Path to requested store entity type - :rtype: Path """ if entity == "objects": return Path(self.objects) @@ -2674,7 +2655,6 @@ def _build_hashstore_data_object_path(self, hash_id: str) -> str: :param str hash_id: A hash ID to build a file path for. :return: An absolute file path for the specified hash ID. - :rtype: str """ paths = self._shard(hash_id) root_dir = self._get_store_path("objects") @@ -2687,7 +2667,6 @@ def _get_hashstore_data_object_path(self, cid_or_relative_path: str) -> Path: :param str cid_or_relative_path: Content identifier or relative path in '/objects' to check :return: Path to the data object referenced by the pid - :rtype: Path """ expected_abs_data_obj_path = self._build_hashstore_data_object_path( cid_or_relative_path @@ -2713,11 +2692,10 @@ def _get_hashstore_data_object_path(self, cid_or_relative_path: str) -> Path: def _get_hashstore_metadata_path(self, metadata_relative_path: str) -> Path: """Return the expected metadata path to a hashstore metadata object that exists. 
- :param str metadata_relative_path: Metadata path to check or relative path in - '/metadata' to check + :param str metadata_relative_path: Metadata path to check or relative path in '/metadata' + to check :return: Path to the data object referenced by the pid - :rtype: Path """ # Form the absolute path to the metadata file expected_abs_metadata_path = os.path.join(self.metadata, metadata_relative_path) @@ -2731,7 +2709,7 @@ def _get_hashstore_metadata_path(self, metadata_relative_path: str) -> Path: raise FileNotFoundError( "FileHashStore - _get_hashstore_metadata_path: could not locate a" + "metadata object in '/metadata' for the supplied metadata_relative_path: " - + metadata_relative_path + + str(metadata_relative_path) ) def _get_hashstore_pid_refs_path(self, pid: str) -> Path: @@ -2740,7 +2718,6 @@ def _get_hashstore_pid_refs_path(self, pid: str) -> Path: :param str pid: Persistent or authority-based identifier :return: Path to pid reference file - :rtype: Path """ # The pid refs file is named after the hash of the pid using the store's algorithm hash_id = self._computehash(pid, self.algorithm) @@ -2755,7 +2732,6 @@ def _get_hashstore_cid_refs_path(self, cid: str) -> Path: :param str cid: Content identifier :return: Path to cid reference file - :rtype: Path """ root_dir = self._get_store_path("cid") # The content identifier is to be split into directories as is supplied @@ -2942,7 +2918,6 @@ def _read_small_file_content(path_to_file: Path): :param path path_to_file: Path to the file to read :return: Content of the given file - :rtype: str """ with open(path_to_file, "r", encoding="utf8") as opened_path: content = opened_path.read() @@ -2955,7 +2930,6 @@ def _rename_path_for_deletion(path: Union[Path, str]) -> str: :param Path path: Path to file to rename :return: Path to the renamed file - :rtype: str """ if isinstance(path, str): path = Path(path) @@ -2973,7 +2947,6 @@ def _get_file_paths(directory: Union[str, Path]) -> Optional[List[Path]]: :raises FileNotFoundError: If the directory doesn't exist :return: file_paths - File paths of the given directory or None if directory doesn't exist - :rtype: List """ if os.path.exists(directory): files = os.listdir(directory) @@ -2993,7 +2966,6 @@ def _check_arg_data(data: Union[str, os.PathLike, io.BufferedReader]) -> bool: :type data: str, os.PathLike, io.BufferedReader :return: True if valid. - :rtype: bool """ if ( not isinstance(data, str) @@ -3060,7 +3032,6 @@ def _cast_to_bytes(text: any) -> bytes: :param Any text: String to convert. :return: Bytes with utf-8 encoding. - :rtype: bytes """ if not isinstance(text, bytes): text = bytes(text, "utf8") @@ -3081,7 +3052,7 @@ class Stream: set its position back to ``0``. """ - def __init__(self, obj: Union[IO[bytes], str]): + def __init__(self, obj: Union[IO[bytes], str, Path]): if hasattr(obj, "read"): pos = obj.tell() elif os.path.isfile(obj): diff --git a/src/hashstore/hashstoreclient.py b/src/hashstore/hashstoreclient.py index 9575a908..b1c1cc67 100644 --- a/src/hashstore/hashstoreclient.py +++ b/src/hashstore/hashstoreclient.py @@ -193,7 +193,8 @@ def __init__(self): help="Flag to delete a metadata document from a HashStore", ) - def load_store_properties(self, hashstore_yaml): + @staticmethod + def load_store_properties(hashstore_yaml): """Get and return the contents of the current HashStore config file. 
:return: HashStore properties with the following keys (and values): @@ -291,6 +292,7 @@ def store_to_hashstore_from_list(self, origin_dir, obj_type, num, skip_obj_size) logging.info(info_msg) # Get list of objects to store from metacat db + checked_obj_list = None if obj_type == self.OBJ_TYPE: checked_obj_list = self.metacatdb.refine_list_for_objects( metacat_obj_list, "store" @@ -373,6 +375,7 @@ def retrieve_and_validate_from_hashstore( # Get list of objects to store from metacat db logging.info("HashStore Client - Refining object list for %s", obj_type) + checked_obj_list = None if obj_type == self.OBJ_TYPE: checked_obj_list = self.metacatdb.refine_list_for_objects( metacat_obj_list, "retrieve" @@ -469,6 +472,7 @@ def delete_objects_from_list(self, origin_dir, obj_type, num, skip_obj_size): ) # Get list of objects to store from metacat db + checked_obj_list = None if obj_type == self.OBJ_TYPE: checked_obj_list = self.metacatdb.refine_list_for_objects( metacat_obj_list, "delete" @@ -597,8 +601,9 @@ def get_object_metadata_list(self, origin_directory, num, skip_obj_size=None): limit_query = f" LIMIT {num}" query = f"""SELECT identifier.guid, identifier.docid, identifier.rev, systemmetadata.object_format, systemmetadata.checksum, - systemmetadata.checksum_algorithm, systemmetadata.size FROM identifier INNER JOIN systemmetadata - ON identifier.guid = systemmetadata.guid ORDER BY identifier.guid{limit_query};""" + systemmetadata.checksum_algorithm, systemmetadata.size FROM identifier INNER JOIN + systemmetadata ON identifier.guid = systemmetadata.guid ORDER BY + identifier.guid{limit_query};""" cursor.execute(query) # Fetch all rows from the result set @@ -639,7 +644,8 @@ def get_object_metadata_list(self, origin_directory, num, skip_obj_size=None): return object_metadata_list - def refine_list_for_objects(self, metacat_obj_list, action): + @staticmethod + def refine_list_for_objects(metacat_obj_list, action): """Refine a list of objects by checking for file existence and removing duplicates. :param List metacat_obj_list: List of tuple objects representing rows from Metacat database. @@ -681,7 +687,8 @@ def refine_list_for_objects(self, metacat_obj_list, action): return refined_object_list - def refine_list_for_metadata(self, metacat_obj_list, action): + @staticmethod + def refine_list_for_metadata(metacat_obj_list, action): """Refine a list of metadata by checking for file existence and removing duplicates. :param List metacat_obj_list: List of tuple objects representing rows from metacat db. 
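[Editor's note between file diffs] The recurring os.path.exists → os.path.isfile change throughout filehashstore.py matters because exists() also returns True for directories, so a directory accidentally created at a config or refs path would pass the old check. A minimal sketch of the distinction (the temp path below is illustrative only, not part of this diff):

import os
import tempfile

with tempfile.TemporaryDirectory() as tmp:
    cfg = os.path.join(tmp, "hashstore.yaml")
    os.mkdir(cfg)  # simulate a directory squatting on the config file's path
    print(os.path.exists(cfg))  # True  -- the old check would be fooled
    print(os.path.isfile(cfg))  # False -- the new check rejects non-files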
diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index ead33e2c..0b0e94c3 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -32,13 +32,13 @@ def test_init_directories_created(store): """Confirm that object and metadata directories have been created.""" assert os.path.exists(store.root) assert os.path.exists(store.objects) - assert os.path.exists(store.objects + "/tmp") + assert os.path.exists(store.objects / "tmp") assert os.path.exists(store.metadata) - assert os.path.exists(store.metadata + "/tmp") + assert os.path.exists(store.metadata / "tmp") assert os.path.exists(store.refs) - assert os.path.exists(store.refs + "/tmp") - assert os.path.exists(store.refs + "/pids") - assert os.path.exists(store.refs + "/cids") + assert os.path.exists(store.refs / "tmp") + assert os.path.exists(store.refs / "pids") + assert os.path.exists(store.refs / "cids") def test_init_existing_store_incorrect_algorithm_format(store): @@ -46,7 +46,7 @@ def test_init_existing_store_incorrect_algorithm_format(store): the string must exactly match the expected format). DataONE uses the library of congress vocabulary to standardize algorithm types.""" properties = { - "store_path": store.root + "/incorrect_algo_format", + "store_path": store.root / "incorrect_algo_format", "store_depth": 3, "store_width": 2, "store_algorithm": "sha256", @@ -364,7 +364,6 @@ def test_store_and_validate_data_files_path(pids, store): def test_store_and_validate_data_files_string(pids, store): """Test _store_and_validate_data accepts string for the path arg.""" test_dir = "tests/testdata/" - entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store._store_and_validate_data(pid, path) @@ -722,7 +721,7 @@ def test_write_to_tmp_file_and_get_hex_digests_with_unsupported_algorithm(pids, def test_mktmpfile(store): """Test that _mktmpfile creates and returns a tmp file.""" - path = store.root + "/doutest/tmp/" + path = store.root / "doutest" / "tmp" store._create_path(path) tmp = store._mktmpfile(path) assert os.path.exists(tmp.name) @@ -1059,7 +1058,7 @@ def test_put_metadata_stored_path(pids, store): # Manually calculate expected path metadata_directory = store._computehash(pid) - rel_path = "/".join(store._shard(metadata_directory)) + rel_path = Path(*store._shard(metadata_directory)) full_path = ( store._get_store_path("metadata") / rel_path / metadata_document_name ) @@ -1383,7 +1382,7 @@ def test_verify_object_information_incorrect_size_with_pid(pids, store): checksum_algorithm = store.algorithm expected_file_size = object_metadata.obj_size - objects_tmp_folder = store.objects + "/tmp" + objects_tmp_folder = store.objects / "tmp" tmp_file = store._mktmpfile(objects_tmp_folder) assert os.path.isfile(tmp_file.name) with pytest.raises(NonMatchingObjSize): @@ -1667,8 +1666,7 @@ def test_exists_object_with_sharded_path(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store._store_and_validate_data(pid, path) - object_metadata_shard = store._shard(object_metadata.cid) - object_metadata_shard_path = "/".join(object_metadata_shard) + object_metadata_shard_path = os.path.join(*store._shard(object_metadata.cid)) assert store._exists(entity, object_metadata_shard_path) @@ -1728,9 +1726,7 @@ def test_private_delete_metadata(pids, store): # Manually calculate expected path metadata_directory = store._computehash(pid) metadata_document_name = 
store._computehash(pid + format_id) - rel_path = ( - Path("/".join(store._shard(metadata_directory))) / metadata_document_name - ) + rel_path = Path(*store._shard(metadata_directory)) / metadata_document_name store._delete("metadata", rel_path) @@ -1758,7 +1754,7 @@ def test_create_path(pids, store): for pid in pids: root_directory = store.root pid_hex_digest_directory = pids[pid]["metadata_cid"][:2] - pid_directory = root_directory + pid_hex_digest_directory + pid_directory = root_directory / pid_hex_digest_directory store._create_path(pid_directory) assert os.path.isdir(pid_directory) @@ -1827,15 +1823,13 @@ def test_get_hashstore_metadata_path_relative_path(pids, store): metadata_directory = store._computehash(pid) metadata_document_name = store._computehash(pid + format_id) - rel_path = "/".join(store._shard(metadata_directory)) - full_path_without_dir = rel_path + "/" + metadata_document_name + rel_path = Path(*store._shard(metadata_directory)) + full_path_without_dir = Path(rel_path) / metadata_document_name metadata_resolved_path = store._get_hashstore_metadata_path( full_path_without_dir ) - calculated_metadata_path = ( - store.metadata + "/" + rel_path + "/" + metadata_document_name - ) + calculated_metadata_path = store.metadata / rel_path / metadata_document_name assert Path(calculated_metadata_path) == metadata_resolved_path @@ -1849,8 +1843,8 @@ def test_get_hashstore_pid_refs_path(pids, store): resolved_pid_ref_abs_path = store._get_hashstore_pid_refs_path(pid) pid_refs_metadata_hashid = store._computehash(pid) - calculated_pid_ref_path = ( - store.pids + "/" + "/".join(store._shard(pid_refs_metadata_hashid)) + calculated_pid_ref_path = store.pids / Path( + *store._shard(pid_refs_metadata_hashid) ) assert resolved_pid_ref_abs_path == Path(calculated_pid_ref_path) @@ -1865,7 +1859,7 @@ def test_get_hashstore_cid_refs_path(pids, store): cid = object_metadata.cid resolved_cid_ref_abs_path = store._get_hashstore_cid_refs_path(cid) - calculated_cid_ref_path = store.cids + "/" + "/".join(store._shard(cid)) + calculated_cid_ref_path = store.cids / Path(*store._shard(cid)) assert resolved_cid_ref_abs_path == Path(calculated_cid_ref_path) @@ -1922,11 +1916,11 @@ def test_stream_reads_path_object(pids): for pid in pids.keys(): path = Path(test_dir + pid.replace("/", "_")) obj_stream = Stream(path) - hashobj = hashlib.new("sha256") + hash_obj = hashlib.new("sha256") for data in obj_stream: - hashobj.update(data) + hash_obj.update(data) obj_stream.close() - hex_digest = hashobj.hexdigest() + hex_digest = hash_obj.hexdigest() assert pids[pid]["sha256"] == hex_digest @@ -1946,6 +1940,7 @@ def test_stream_returns_to_original_position_on_close(pids): input_stream.close() +# noinspection PyTypeChecker def test_stream_raises_error_for_invalid_object(): """Test that a stream raises ValueError for an invalid input object.""" with pytest.raises(ValueError): diff --git a/tests/filehashstore/test_filehashstore_interface.py b/tests/filehashstore/test_filehashstore_interface.py index 0f93a64a..381e1035 100644 --- a/tests/filehashstore/test_filehashstore_interface.py +++ b/tests/filehashstore/test_filehashstore_interface.py @@ -650,9 +650,9 @@ def get_number_of_files(folder_path): file_count += len(files) return file_count - assert get_number_of_files(store.refs + "/pids") == 6 - assert get_number_of_files(store.refs + "/cids") == 1 - assert folder_has_files(store.refs + "/tmp") is False + assert get_number_of_files(store.refs / "pids") == 6 + assert get_number_of_files(store.refs / "cids") == 1 + 
assert folder_has_files(store.refs / "tmp") is False @slow_test @@ -960,7 +960,7 @@ def test_store_metadata(pids, store): # Manually calculate expected path metadata_directory = store._computehash(pid) metadata_document_name = store._computehash(pid + format_id) - rel_path = "/".join(store._shard(metadata_directory)) + rel_path = Path(*store._shard(metadata_directory)) full_path = ( store._get_store_path("metadata") / rel_path / metadata_document_name ) @@ -976,7 +976,7 @@ def test_store_metadata_one_pid_multiple_docs_correct_location(store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename metadata_directory = store._computehash(pid) - rel_path = "/".join(store._shard(metadata_directory)) + rel_path = Path(*store._shard(metadata_directory)) format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" format_id3 = "http://ns.dataone.org/service/types/v3.0" format_id4 = "http://ns.dataone.org/service/types/v4.0" @@ -1008,7 +1008,7 @@ def test_store_metadata_default_format_id(pids, store): # Manually calculate expected path metadata_directory = store._computehash(pid) metadata_document_name = store._computehash(pid + format_id) - rel_path = "/".join(store._shard(metadata_directory)) + rel_path = Path(*store._shard(metadata_directory)) full_path = ( store._get_store_path("metadata") / rel_path / metadata_document_name ) @@ -1491,22 +1491,22 @@ def test_store_and_delete_objects_100_pids_1_cid(store): deletes all related files""" test_dir = "tests/testdata/" path = test_dir + "jtao.1700.1" + refs_pids_path = store.root / "refs" / "pids" + refs_cids_path = store.root / "refs" / "cids" # Store upper_limit = 101 for i in range(1, upper_limit): pid_modified = f"dou.test.{str(i)}" store.store_object(pid_modified, path) - assert ( - sum([len(files) for _, _, files in os.walk(store.root + "/refs/pids")]) == 100 - ) - assert sum([len(files) for _, _, files in os.walk(store.root + "/refs/cids")]) == 1 + assert sum([len(files) for _, _, files in os.walk(refs_pids_path)]) == 100 + assert sum([len(files) for _, _, files in os.walk(refs_cids_path)]) == 1 assert store._count("objects") == 1 # Delete for i in range(1, upper_limit): pid_modified = f"dou.test.{str(i)}" store.delete_object(pid_modified) - assert sum([len(files) for _, _, files in os.walk(store.root + "/refs/pids")]) == 0 - assert sum([len(files) for _, _, files in os.walk(store.root + "/refs/cids")]) == 0 + assert sum([len(files) for _, _, files in os.walk(refs_pids_path)]) == 0 + assert sum([len(files) for _, _, files in os.walk(refs_cids_path)]) == 0 assert store._count("objects") == 0 @@ -1557,6 +1557,8 @@ def delete_object_wrapper(pid_var): thread5.join() thread6.join() - assert sum([len(files) for _, _, files in os.walk(store.root + "/refs/pid")]) == 0 - assert sum([len(files) for _, _, files in os.walk(store.root + "/refs/cid")]) == 0 + refs_pids_path = store.root / "refs" / "pids" + refs_cids_path = store.root / "refs" / "cids" + assert sum([len(files) for _, _, files in os.walk(refs_pids_path)]) == 0 + assert sum([len(files) for _, _, files in os.walk(refs_cids_path)]) == 0 assert store._count("objects") == 0 diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py index 7e13f37a..ba0bd566 100644 --- a/tests/test_hashstore_client.py +++ b/tests/test_hashstore_client.py @@ -53,7 +53,7 @@ def test_get_checksum(capsys, store, pids): store.store_object(pid, path) client_module_path = f"{client_directory}/client.py" - test_store = store.root + test_store = str(store.root) 
get_checksum_opt = "-getchecksum" client_pid_arg = f"-pid={pid}" algo_arg = f"-algo={store.algorithm}" @@ -86,7 +86,7 @@ def test_store_object(store, pids): test_dir = "tests/testdata/" for pid in pids.keys(): client_module_path = f"{client_directory}/client.py" - test_store = store.root + test_store = str(store.root) store_object_opt = "-storeobject" client_pid_arg = f"-pid={pid}" path = f'-path={test_dir + pid.replace("/", "_")}' @@ -117,7 +117,7 @@ def test_store_metadata(capsys, store, pids): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename client_module_path = f"{client_directory}/client.py" - test_store = store.root + test_store = str(store.root) store_metadata_opt = "-storemetadata" client_pid_arg = f"-pid={pid}" path = f"-path={syspath}" @@ -139,7 +139,7 @@ def test_store_metadata(capsys, store, pids): metadata_directory = store._computehash(pid) metadata_document_name = store._computehash(pid + namespace) - rel_path = "/".join(store._shard(metadata_directory)) + rel_path = Path(*store._shard(metadata_directory)) full_path = ( store._get_store_path("metadata") / rel_path / metadata_document_name ) @@ -159,7 +159,7 @@ def test_retrieve_objects(capsys, pids, store): store.store_object(pid, path) client_module_path = f"{client_directory}/client.py" - test_store = store.root + test_store = str(store.root) delete_object_opt = "-retrieveobject" client_pid_arg = f"-pid={pid}" chs_args = [ @@ -199,7 +199,7 @@ def test_retrieve_metadata(capsys, pids, store): _metadata_cid = store.store_metadata(pid, syspath, namespace) client_module_path = f"{client_directory}/client.py" - test_store = store.root + test_store = str(store.root) retrieve_metadata_opt = "-retrievemetadata" client_pid_arg = f"-pid={pid}" format_id = f"-formatid={namespace}" @@ -239,7 +239,7 @@ def test_delete_objects(pids, store): store.store_object(pid, path) client_module_path = f"{client_directory}/client.py" - test_store = store.root + test_store = str(store.root) delete_object_opt = "-deleteobject" client_pid_arg = f"-pid={pid}" chs_args = [ @@ -269,7 +269,7 @@ def test_delete_metadata(pids, store): _metadata_cid = store.store_metadata(pid, syspath, namespace) client_module_path = f"{client_directory}/client.py" - test_store = store.root + test_store = str(store.root) delete_metadata_opt = "-deletemetadata" client_pid_arg = f"-pid={pid}" format_id = f"-formatid={namespace}"
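[Editor's note] For context on the pathlib migration above, here is a minimal sketch — with a stand-in for FileHashStore._shard and a made-up digest, not the store's actual values — showing that Path(*shard(digest)) / name builds the same relative location as the old "/".join(...) string concatenation, while keeping joins OS-agnostic:

from pathlib import Path

def shard(digest: str, depth: int = 3, width: int = 2) -> list:
    # Stand-in for FileHashStore._shard: fixed-width tokens, remainder last.
    tokens = [digest[i * width:(i + 1) * width] for i in range(depth)]
    tokens.append(digest[depth * width:])
    return [t for t in tokens if t]

digest = "0d555ed77052d7e166017f779cbc193357c3a5006ee8b8457230bcf7abcef65e"
old_style = "/".join(shard(digest)) + "/" + "sysmeta-doc-name"
new_style = Path(*shard(digest)) / "sysmeta-doc-name"
assert Path(old_style) == new_style  # same relative path either way
print(new_style)  # 0d/55/5e/d77052.../sysmeta-doc-name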