diff --git a/.vscode/settings.json b/.vscode/settings.json
index c31c8612..b15ffaa4 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,9 +1,12 @@
 {
     "python.terminal.activateEnvInCurrentTerminal": true,
-    "python.formatting.provider": "black",
     "python.testing.pytestArgs": [
         "tests"
     ],
     "python.testing.unittestEnabled": false,
-    "python.testing.pytestEnabled": true
-}
+    "python.testing.pytestEnabled": true,
+    "editor.formatOnSave": true,
+    "[python]": {
+        "editor.defaultFormatter": "ms-python.black-formatter"
+    }
+}
\ No newline at end of file
diff --git a/README.md b/README.md
index fd9bd193..0ddaac61 100644
--- a/README.md
+++ b/README.md
@@ -23,6 +23,82 @@ HashStore is a python package, and built using the [Python Poetry](https://pytho
 To install `hashstore` locally, create a virtual environment for python 3.9+, install poetry, and then install or build the package with `poetry install` or `poetry build`, respectively.
+To run tests, navigate to the root directory and run `pytest -s`. The test suite contains tests that
+take a longer time to run (relating to the storage of large files) - to execute all tests, run
+`pytest --run-slow`. To see detailed
+
+## Usage Example
+
+To view more details about the Public API - see the `hashstore.py` interface documentation.
+
+```py
+from hashstore import HashStoreFactory
+
+# Instantiate a factory
+hashstore_factory = HashStoreFactory()
+
+# Create a properties dictionary with the required fields
+properties = {
+    "store_path": "/path/to/your/store",
+    "store_depth": 3,
+    "store_width": 2,
+    "store_algorithm": "sha256",
+    "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0",
+}
+
+# Get HashStore from factory
+module_name = "hashstore.filehashstore.filehashstore"
+class_name = "FileHashStore"
+my_store = hashstore_factory.get_hashstore(module_name, class_name, properties)
+
+# Store objects (.../[hashstore_path]/objects/)
+pid = "j.tao.1700.1"
+object = "/path/to/your/object.data"
+hash_address = my_store.store_object(pid, object)
+object_cid = hash_address.id
+
+# Store metadata (.../[hashstore_path]/metadata/)
+# By default, storing metadata will use the given properties namespace `format_id`
+pid = "j.tao.1700.1"
+sysmeta = "/path/to/your/sysmeta/document.xml"
+metadata_cid = my_store.store_metadata(pid, sysmeta)
+```
+
+If you want to store other types of metadata, add an additional `format_id`.
+```py +pid = "j.tao.1700.1" +metadata = "/path/to/your/metadata/document.json" +format_id = "http://custom.metadata.com/json/type/v1.0" +metadata_cid = my_store.store_metadata(pid, metadata, format_id) +``` + +How to use HashStore client (command line app) +```sh +# Step 1: Create a HashStore +$ python './src/hashstore/client.py' /path/to/store/ -chs -dp=3 -wp=2 -ap=SHA-256 -nsp="http://www.ns.test/v1" + +# Get the checksum of a data object +$ python './src/hashstore/client.py' /path/to/store/ -getchecksum -pid=content_identifier -algo=SHA-256 + +# Store a data object +$ python './src/hashstore/client.py' /path/to/store/ -storeobject -pid=content_identifier -path=/path/to/object + +# Store a metadata object +$ python './src/hashstore/client.py' /path/to/store/ -storemetadata -pid=content_identifier -path=/path/to/metadata/object -formatid=http://ns.dataone.org/service/types/v2.0 + +# Retrieve a data object +$ python './src/hashstore/client.py' /path/to/store/ -retrieveobject -pid=content_identifier + +# Retrieve a metadata object +$ python './src/hashstore/client.py' /path/to/store/ -retrievemetadata -pid=content_identifier -formatid=http://ns.dataone.org/service/types/v2.0 + +# Delete a data object +$ python './src/hashstore/client.py' /path/to/store/ -deleteobject -pid=content_identifier + +# Delete a metadata file +$ python './src/hashstore/client.py' /path/to/store/ -deletemetadata -pid=content_identifier -formatid=http://ns.dataone.org/service/types/v2.0 +``` + ## License ``` Copyright [2022] [Regents of the University of California] @@ -44,7 +120,7 @@ limitations under the License. Work on this package was supported by: - DataONE Network -- Arctic Data Center: NSF-PLR grant #2042102 to M. B. Jones, A. Budden, M. Schildhauer, and J. Dozier +- Arctic Data Center: NSF-PLR grant #2042102 to M. B. Jones, A. Budden, M. Schildhauer, and J. Dozier Additional support was provided for collaboration by the National Center for Ecological Analysis and Synthesis, a Center funded by the University of California, Santa Barbara, and the State of California. diff --git a/poetry.lock b/poetry.lock index a1f53156..85abf43e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,14 +1,27 @@ -# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. + +[[package]] +name = "asn1crypto" +version = "1.5.1" +description = "Fast ASN.1 parser and serializer with definitions for private keys, public keys, certificates, CRL, OCSP, CMS, PKCS#3, PKCS#7, PKCS#8, PKCS#12, PKCS#5, X.509 and TSP" +category = "dev" +optional = false +python-versions = "*" +files = [ + {file = "asn1crypto-1.5.1-py2.py3-none-any.whl", hash = "sha256:db4e40728b728508912cbb3d44f19ce188f218e9eba635821bb4b68564f8fd67"}, + {file = "asn1crypto-1.5.1.tar.gz", hash = "sha256:13ae38502be632115abf8a24cbe5f4da52e3b5231990aff31123c805306ccb9c"}, +] [[package]] name = "astroid" -version = "2.15.5" +version = "2.15.6" description = "An abstract syntax tree for Python with inference support." 
+category = "dev" optional = false python-versions = ">=3.7.2" files = [ - {file = "astroid-2.15.5-py3-none-any.whl", hash = "sha256:078e5212f9885fa85fbb0cf0101978a336190aadea6e13305409d099f71b2324"}, - {file = "astroid-2.15.5.tar.gz", hash = "sha256:1039262575027b441137ab4a62a793a9b43defb42c32d5670f38686207cd780f"}, + {file = "astroid-2.15.6-py3-none-any.whl", hash = "sha256:389656ca57b6108f939cf5d2f9a2a825a3be50ba9d589670f393236e0a03b91c"}, + {file = "astroid-2.15.6.tar.gz", hash = "sha256:903f024859b7c7687d7a7f3a3f73b17301f8e42dfd9cc9df9d4418172d3e2dbd"}, ] [package.dependencies] @@ -23,6 +36,7 @@ wrapt = [ name = "black" version = "22.12.0" description = "The uncompromising code formatter." +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -56,13 +70,14 @@ uvloop = ["uvloop (>=0.15.2)"] [[package]] name = "click" -version = "8.1.3" +version = "8.1.5" description = "Composable command line interface toolkit" +category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"}, - {file = "click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"}, + {file = "click-8.1.5-py3-none-any.whl", hash = "sha256:e576aa487d679441d7d30abb87e1b43d24fc53bffb8758443b1a9e1cee504548"}, + {file = "click-8.1.5.tar.gz", hash = "sha256:4be4b1af8d665c6d942909916d31a213a106800c47d0eeba73d34da3cbc11367"}, ] [package.dependencies] @@ -72,6 +87,7 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} name = "colorama" version = "0.4.6" description = "Cross-platform colored terminal text." +category = "dev" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" files = [ @@ -83,6 +99,7 @@ files = [ name = "dill" version = "0.3.6" description = "serialize all of python" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -95,13 +112,14 @@ graph = ["objgraph (>=1.7.2)"] [[package]] name = "exceptiongroup" -version = "1.1.1" +version = "1.1.2" description = "Backport of PEP 654 (exception groups)" +category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "exceptiongroup-1.1.1-py3-none-any.whl", hash = "sha256:232c37c63e4f682982c8b6459f33a8981039e5fb8756b2074364e5055c498c9e"}, - {file = "exceptiongroup-1.1.1.tar.gz", hash = "sha256:d484c3090ba2889ae2928419117447a14daf3c1231d5e30d0aae34f354f01785"}, + {file = "exceptiongroup-1.1.2-py3-none-any.whl", hash = "sha256:e346e69d186172ca7cf029c8c1d16235aa0e04035e5750b4b95039e65204328f"}, + {file = "exceptiongroup-1.1.2.tar.gz", hash = "sha256:12c3e887d6485d16943a309616de20ae5582633e0a2eda17f4e10fd61c1e8af5"}, ] [package.extras] @@ -111,6 +129,7 @@ test = ["pytest (>=6)"] name = "iniconfig" version = "2.0.0" description = "brain-dead simple config-ini parsing" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -122,6 +141,7 @@ files = [ name = "isort" version = "5.12.0" description = "A Python utility / library to sort Python imports." +category = "dev" optional = false python-versions = ">=3.8.0" files = [ @@ -139,6 +159,7 @@ requirements-deprecated-finder = ["pip-api", "pipreqs"] name = "lazy-object-proxy" version = "1.9.0" description = "A fast and thorough lazy object proxy." 
+category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -184,6 +205,7 @@ files = [ name = "mccabe" version = "0.7.0" description = "McCabe checker, plugin for flake8" +category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -195,6 +217,7 @@ files = [ name = "mypy-extensions" version = "1.0.0" description = "Type system extensions for programs checked with the mypy type checker." +category = "dev" optional = false python-versions = ">=3.5" files = [ @@ -206,6 +229,7 @@ files = [ name = "packaging" version = "23.1" description = "Core utilities for Python packages" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -217,6 +241,7 @@ files = [ name = "pathlib" version = "1.0.1" description = "Object-oriented filesystem paths" +category = "main" optional = false python-versions = "*" files = [ @@ -228,6 +253,7 @@ files = [ name = "pathspec" version = "0.11.1" description = "Utility library for gitignore style pattern matching of file paths." +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -235,30 +261,48 @@ files = [ {file = "pathspec-0.11.1.tar.gz", hash = "sha256:2798de800fa92780e33acca925945e9a19a133b715067cf165b8866c15a31687"}, ] +[[package]] +name = "pg8000" +version = "1.29.8" +description = "PostgreSQL interface library" +category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pg8000-1.29.8-py3-none-any.whl", hash = "sha256:962e9d6687f76057bd6d9c9c0f67f503a503216bf60b3a4d71e4cb8c97f8326d"}, + {file = "pg8000-1.29.8.tar.gz", hash = "sha256:609cfbccea783e15f111cc0cb2f6d4e6b4c349a695c59505a29baba6fc79ffa9"}, +] + +[package.dependencies] +python-dateutil = ">=2.8.2" +scramp = ">=1.4.3" + [[package]] name = "platformdirs" -version = "3.5.1" +version = "3.8.1" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." 
+category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "platformdirs-3.5.1-py3-none-any.whl", hash = "sha256:e2378146f1964972c03c085bb5662ae80b2b8c06226c54b2ff4aa9483e8a13a5"}, - {file = "platformdirs-3.5.1.tar.gz", hash = "sha256:412dae91f52a6f84830f39a8078cecd0e866cb72294a5c66808e74d5e88d251f"}, + {file = "platformdirs-3.8.1-py3-none-any.whl", hash = "sha256:cec7b889196b9144d088e4c57d9ceef7374f6c39694ad1577a0aab50d27ea28c"}, + {file = "platformdirs-3.8.1.tar.gz", hash = "sha256:f87ca4fcff7d2b0f81c6a748a77973d7af0f4d526f98f308477c3c436c74d528"}, ] [package.extras] -docs = ["furo (>=2023.3.27)", "proselint (>=0.13)", "sphinx (>=6.2.1)", "sphinx-autodoc-typehints (>=1.23,!=1.23.4)"] -test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest-cov (>=4)", "pytest-mock (>=3.10)"] +docs = ["furo (>=2023.5.20)", "proselint (>=0.13)", "sphinx (>=7.0.1)", "sphinx-autodoc-typehints (>=1.23,!=1.23.4)"] +test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest-cov (>=4.1)", "pytest-mock (>=3.10)"] [[package]] name = "pluggy" -version = "1.0.0" +version = "1.2.0" description = "plugin and hook calling mechanisms for python" +category = "dev" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" files = [ - {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, - {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, + {file = "pluggy-1.2.0-py3-none-any.whl", hash = "sha256:c2fd55a7d7a3863cba1a013e4e2414658b1d07b6bc57b3919e0c63c9abb99849"}, + {file = "pluggy-1.2.0.tar.gz", hash = "sha256:d12f0c4b579b15f5e054301bb226ee85eeeba08ffec228092f8defbaa3a4c4b3"}, ] [package.extras] @@ -269,6 +313,7 @@ testing = ["pytest", "pytest-benchmark"] name = "pylint" version = "2.17.4" description = "python code static checker" +category = "dev" optional = false python-versions = ">=3.7.2" files = [ @@ -296,13 +341,14 @@ testutils = ["gitpython (>3)"] [[package]] name = "pytest" -version = "7.3.1" +version = "7.4.0" description = "pytest: simple powerful testing with Python" +category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "pytest-7.3.1-py3-none-any.whl", hash = "sha256:3799fa815351fea3a5e96ac7e503a96fa51cc9942c3753cda7651b93c1cfa362"}, - {file = "pytest-7.3.1.tar.gz", hash = "sha256:434afafd78b1d78ed0addf160ad2b77a30d35d4bdf8af234fe621919d9ed15e3"}, + {file = "pytest-7.4.0-py3-none-any.whl", hash = "sha256:78bf16451a2eb8c7a2ea98e32dc119fd2aa758f1d5d66dbf0a59d69a3969df32"}, + {file = "pytest-7.4.0.tar.gz", hash = "sha256:b4bf8c45bd59934ed84001ad51e11b4ee40d40a1229d2c79f9c592b0a3f6bd8a"}, ] [package.dependencies] @@ -314,12 +360,28 @@ pluggy = ">=0.12,<2.0" tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] -testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] +testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] + +[[package]] +name = "python-dateutil" +version = "2.8.2" +description = "Extensions to the standard Python datetime module" +category = "dev" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +files = [ + {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, + {file = 
"python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, +] + +[package.dependencies] +six = ">=1.5" [[package]] name = "pyyaml" version = "6.0" description = "YAML parser and emitter for Python" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -365,10 +427,38 @@ files = [ {file = "PyYAML-6.0.tar.gz", hash = "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2"}, ] +[[package]] +name = "scramp" +version = "1.4.4" +description = "An implementation of the SCRAM protocol." +category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "scramp-1.4.4-py3-none-any.whl", hash = "sha256:b142312df7c2977241d951318b7ee923d6b7a4f75ba0f05b621ece1ed616faa3"}, + {file = "scramp-1.4.4.tar.gz", hash = "sha256:b7022a140040f33cf863ab2657917ed05287a807b917950489b89b9f685d59bc"}, +] + +[package.dependencies] +asn1crypto = ">=1.5.1" + +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] + [[package]] name = "tomli" version = "2.0.1" description = "A lil' TOML parser" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -380,6 +470,7 @@ files = [ name = "tomlkit" version = "0.11.8" description = "Style preserving TOML library" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -389,19 +480,21 @@ files = [ [[package]] name = "typing-extensions" -version = "4.6.3" +version = "4.7.1" description = "Backported and Experimental Type Hints for Python 3.7+" +category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "typing_extensions-4.6.3-py3-none-any.whl", hash = "sha256:88a4153d8505aabbb4e13aacb7c486c2b4a33ca3b3f807914a9b4c844c471c26"}, - {file = "typing_extensions-4.6.3.tar.gz", hash = "sha256:d91d5919357fe7f681a9f2b5b4cb2a5f1ef0a1e9f59c4d8ff0d3491e05c0ffd5"}, + {file = "typing_extensions-4.7.1-py3-none-any.whl", hash = "sha256:440d5dd3af93b060174bf433bccd69b0babc3b15b1a8dca43789fd7f61514b36"}, + {file = "typing_extensions-4.7.1.tar.gz", hash = "sha256:b75ddc264f0ba5615db7ba217daeb99701ad295353c45f9e95963337ceeeffb2"}, ] [[package]] name = "wrapt" version = "1.15.0" description = "Module for decorators, wrappers and monkey patching." +category = "dev" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" files = [ @@ -485,4 +578,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "b04d8166655a79de94436d54e060f0e04c185ac067bc8579619a7f8444e70370" +content-hash = "6eeffad7b4becc9f995e576d3fc5db2a8640bfe60876d254a6b5854ddd0e283a" diff --git a/pyproject.toml b/pyproject.toml index b0df7426..1c9f80d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "hashstore" -version = "0.9.0" +version = "1.0.0" description = "HashStore, a hash-based object store for data packages." 
authors = ["Matt Jones ", "Dou Mok "] readme = "README.md" @@ -14,6 +14,7 @@ pyyaml = "^6.0" pytest = "^7.2.0" black = "^22.10.0" pylint = "^2.17.4" +pg8000 = "^1.29.8" [build-system] requires = ["poetry-core"] diff --git a/src/hashstore/__init__.py b/src/hashstore/__init__.py index 2902348a..352bd3d3 100644 --- a/src/hashstore/__init__.py +++ b/src/hashstore/__init__.py @@ -11,10 +11,10 @@ - Data objects are named using the SHA-256, base64-encoded hash of their contents (thus, a content-identifier) - Metadata objects are stored with the formatId, a null character and its contents -- Metadata objects are named using the SHA-256, base64-encoded hash of their - persistent identifier (PID) -- An object's persistent identifier can be used to read both metadata and contents - of the object +- Metadata objects are named using the SHA-256 + formatId, base64-encoded hash of + their persistent identifier (PID) """ -from hashstore.hashstore import HashStore +from hashstore.hashstore import HashStore, HashStoreFactory, ObjectMetadata + +__all__ = ("HashStore", "HashStoreFactory", "ObjectMetadata") diff --git a/src/hashstore/client.py b/src/hashstore/client.py new file mode 100644 index 00000000..c1e2e4b6 --- /dev/null +++ b/src/hashstore/client.py @@ -0,0 +1,887 @@ +"""HashStore Command Line App""" +import logging +import os +from argparse import ArgumentParser +from datetime import datetime +import multiprocessing +from pathlib import Path +import yaml +import pg8000 +from hashstore import HashStoreFactory + + +class HashStoreParser: + """Class to set up parsing arguments via argparse.""" + + def __init__(self): + """Initialize the argparse 'parser'.""" + + program_name = "HashStore Command Line Client" + description = ( + "Command line tool to call store, retrieve and delete with a HashStore." + + " Additionally, methods are available to test functionality with a" + + " metacat postgres db." + ) + epilog = "Created for DataONE (NCEAS)" + + self.parser = ArgumentParser( + prog=program_name, + description=description, + epilog=epilog, + ) + + # Add positional argument + self.parser.add_argument("store_path", help="Path of the HashStore") + + # Add optional arguments + self.parser.add_argument( + "-knbvm", + dest="knbvm_flag", + action="store_true", + help="Flag for testing with knbvm", + ) + self.parser.add_argument( + "-loglevel", + dest="logging_level", + help="Set logging level for the client", + ) + self.parser.add_argument( + "-chs", + dest="create_hashstore", + action="store_true", + help="Create a HashStore", + ) + self.parser.add_argument( + "-dp", "-store_depth", dest="depth", help="Depth of HashStore" + ) + self.parser.add_argument( + "-wp", "-store_width", dest="width", help="Width of HashStore" + ) + self.parser.add_argument( + "-ap", + "-store_algorithm", + dest="algorithm", + help="Algorithm to use when calculating object address", + ) + self.parser.add_argument( + "-nsp", + "-store_namespace", + dest="formatid", + help="Default metadata namespace for metadata", + ) + + # KNBVM testing related arguments + self.parser.add_argument( + "-sdir", + dest="source_directory", + help="Source directory of objects to work with", + ) + self.parser.add_argument( + "-stype", + dest="source_directory_type", + help="Source directory type (ex. 
'objects' or 'metadata')", + ) + self.parser.add_argument( + "-nobj", + dest="num_obj_to_convert", + help="Number of objects to convert", + ) + self.parser.add_argument( + "-sts", + dest="store_to_hashstore", + action="store_true", + help="Store objects into a HashStore", + ) + self.parser.add_argument( + "-rav", + dest="retrieve_and_validate", + action="store_true", + help="Retrieve and validate objects in a HashStore", + ) + self.parser.add_argument( + "-dfs", + dest="delete_from_hashstore", + action="store_true", + help="Delete objects in a HashStore", + ) + + # Individual API call related optional arguments + self.parser.add_argument( + "-pid", + dest="object_pid", + help="Pid/Guid of object to work with", + ) + self.parser.add_argument( + "-path", + dest="object_path", + help="Path of the data or metadata object", + ) + self.parser.add_argument( + "-algo", + dest="object_algorithm", + help="Algorithm to work with", + ) + self.parser.add_argument( + "-checksum", + dest="object_checksum", + help="Checksum of data object to validate", + ) + self.parser.add_argument( + "-checksum_algo", + dest="object_checksum_algorithm", + help="Algorithm of checksum to validate", + ) + self.parser.add_argument( + "-obj_size", + dest="object_size", + help="Size of data object to validate", + ) + self.parser.add_argument( + "-formatid", + dest="object_formatid", + help="Format/namespace of the metadata", + ) + + # Public API optional arguments + self.parser.add_argument( + "-getchecksum", + dest="client_getchecksum", + action="store_true", + help="Flag to get the hex digest of a data object in HashStore", + ) + self.parser.add_argument( + "-storeobject", + dest="client_storeobject", + action="store_true", + help="Flag to store an object to a HashStore", + ) + self.parser.add_argument( + "-storemetadata", + dest="client_storemetadata", + action="store_true", + help="Flag to store a metadata document to a HashStore", + ) + self.parser.add_argument( + "-retrieveobject", + dest="client_retrieveobject", + action="store_true", + help="Flag to retrieve an object from a HashStore", + ) + self.parser.add_argument( + "-retrievemetadata", + dest="client_retrievemetadata", + action="store_true", + help="Flag to retrieve a metadata document from a HashStore", + ) + self.parser.add_argument( + "-deleteobject", + dest="client_deleteobject", + action="store_true", + help="Flag to delete on object from a HashStore", + ) + self.parser.add_argument( + "-deletemetadata", + dest="client_deletemetadata", + action="store_true", + help="Flag to delete a metadata document from a HashStore", + ) + + def load_store_properties(self, hashstore_yaml): + """Get and return the contents of the current HashStore config file. + + Returns: + hashstore_yaml_dict (dict): HashStore properties with the following keys (and values): + store_depth (int): Depth when sharding an object's hex digest. + store_width (int): Width of directories when sharding an object's hex digest. + store_algorithm (str): Hash algorithm used for calculating the object's hex digest. + store_metadata_namespace (str): Namespace for the HashStore's system metadata. + """ + property_required_keys = [ + "store_depth", + "store_width", + "store_algorithm", + "store_metadata_namespace", + ] + + if not os.path.exists(hashstore_yaml): + exception_string = ( + "HashStoreParser - load_store_properties: hashstore.yaml not found" + + " in store root path." 
+ ) + raise FileNotFoundError(exception_string) + # Open file + with open(hashstore_yaml, "r", encoding="utf-8") as file: + yaml_data = yaml.safe_load(file) + + # Get hashstore properties + hashstore_yaml_dict = {} + for key in property_required_keys: + checked_property = yaml_data[key] + if key == "store_depth" or key == "store_width": + checked_property = int(yaml_data[key]) + hashstore_yaml_dict[key] = checked_property + return hashstore_yaml_dict + + def get_parser_args(self): + """Get command line arguments.""" + return self.parser.parse_args() + + +class HashStoreClient: + """Create a HashStore to use through the command line.""" + + OBJ_TYPE = "object" + MET_TYPE = "metadata" + + def __init__(self, properties, testflag=None): + """Initialize HashStore and MetacatDB + + Args: + properties: See FileHashStore for dictionary example + testflag (str): "knbvm" to initialize MetacatDB + """ + factory = HashStoreFactory() + + # Get HashStore from factory + module_name = "filehashstore" + class_name = "FileHashStore" + + # Instance attributes + self.hashstore = factory.get_hashstore(module_name, class_name, properties) + logging.info("HashStoreClient - HashStore initialized.") + + # Set up access to Metacat postgres db + if testflag: + self.metacatdb = MetacatDB(properties["store_path"], self.hashstore) + logging.info("HashStoreClient - MetacatDB initialized.") + + # Methods relating to testing HashStore with knbvm (test.arcticdata.io) + + def store_to_hashstore_from_list(self, origin_dir, obj_type, num): + """Store objects in a given directory into HashStore + + Args: + origin_dir (str): Directory to convert + obj_type (str): 'object' or 'metadata' + num (int): Number of files to store + """ + info_msg = f"HashStore Client - Begin storing {obj_type} objects." + logging.info(info_msg) + # Object and Metadata list + metacat_obj_list = self.metacatdb.get_object_metadata_list(origin_dir, num) + + # Get list of objects to store from metacat db + if obj_type == self.OBJ_TYPE: + checked_obj_list = self.metacatdb.refine_list_for_objects( + metacat_obj_list, "store" + ) + if obj_type == self.MET_TYPE: + checked_obj_list = self.metacatdb.refine_list_for_metadata( + metacat_obj_list, "store" + ) + + start_time = datetime.now() + + # Set up pool and processes + pool = multiprocessing.Pool() + + # Call 'obj_type' respective public API methods + info_msg = f"HashStoreClient - Request to Store {len(checked_obj_list)} Objs" + logging.info(info_msg) + if obj_type == self.OBJ_TYPE: + # results = pool.starmap(self.hashstore.store_object, checked_obj_list) + pool.imap(self.try_store_object, checked_obj_list) + if obj_type == self.MET_TYPE: + pool.imap(self.try_store_metadata, checked_obj_list) + + # Close the pool and wait for all processes to complete + pool.close() + pool.join() + + end_time = datetime.now() + content = ( + f"HashStoreClient (store_to_hashstore_from_list):\n" + f"Start Time: {start_time}\nEnd Time: {end_time}\n" + + f"Total Time to Store {len(checked_obj_list)} {obj_type}" + + f" Objects: {end_time - start_time}\n" + ) + logging.info(content) + + def try_store_object(self, obj_tuple): + """Store an object to HashStore and log exceptions as warning. + + Args: + obj_tuple: See HashStore store_object signature for details. + """ + try: + self.hashstore.store_object(*obj_tuple) + return + # pylint: disable=W0718 + except Exception as so_exception: + print(so_exception) + + def try_store_metadata(self, obj_tuple): + """Store an object to HashStore and log exceptions as warning. 
+ + Args: + obj_tuple: See HashStore store_object signature for details. + """ + try: + self.hashstore.store_metadata(*obj_tuple) + return + # pylint: disable=W0718 + except Exception as so_exception: + print(so_exception) + + def retrieve_and_validate_from_hashstore(self, origin_dir, obj_type, num): + """Retrieve objects or metadata from a Hashstore and validate the content. + + Args: + origin_dir (str): Directory to convert + obj_type (str): 'object' or 'metadata' + num (int): Number of files to store + """ + info_msg = ( + f"HashStore Client - Begin retrieving and validating {obj_type} objects." + ) + logging.info(info_msg) + # Object and Metadata list + metacat_obj_list = self.metacatdb.get_object_metadata_list(origin_dir, num) + + # Get list of objects to store from metacat db + logging.info("HashStore Client - Refining object list for %s", obj_type) + if obj_type == self.OBJ_TYPE: + checked_obj_list = self.metacatdb.refine_list_for_objects( + metacat_obj_list, "retrieve" + ) + if obj_type == self.MET_TYPE: + checked_obj_list = self.metacatdb.refine_list_for_metadata( + metacat_obj_list, "retrieve" + ) + + start_time = datetime.now() + + # Set up pool and processes + pool = multiprocessing.Pool() + if obj_type == "object": + pool.imap(self.validate_object, checked_obj_list) + if obj_type == "metadata": + pool.imap(self.validate_metadata, checked_obj_list) + + # Close the pool and wait for all processes to complete + pool.close() + pool.join() + + end_time = datetime.now() + content = ( + f"retrieve_and_validate_from_hashstore:\n" + f"Start Time: {start_time}\nEnd Time: {end_time}\n" + + f"Total Time to retrieve and validate {len(checked_obj_list)} {obj_type}" + + f" Objects: {end_time - start_time}\n" + ) + logging.info(content) + + def validate_object(self, obj_tuple): + """Retrieves an object from HashStore and validates its checksum. 
+
+        Args:
+            obj_tuple: pid_guid, obj_checksum_algo, obj_checksum
+        """
+        pid_guid = obj_tuple[0]
+        algo = obj_tuple[1]
+        obj_db_checksum = obj_tuple[2]
+
+        with self.hashstore.retrieve_object(pid_guid) as obj_stream:
+            computed_digest = self.hashstore.computehash(obj_stream, algo)
+            obj_stream.close()
+
+        if computed_digest != obj_db_checksum:
+            err_msg = (
+                f"Assertion Error for pid/guid: {pid_guid} -"
+                + f" Digest calculated from stream ({computed_digest}) does not match"
+                + f" checksum from metacat db: {obj_db_checksum}"
+            )
+            logging.error(err_msg)
+            print(err_msg)
+
+        return
+
+    def validate_metadata(self, obj_tuple):
+        """Retrieves a metadata document from HashStore and validates its checksum.
+
+        Args:
+            obj_tuple: pid_guid, format_id, obj_checksum, obj_algorithm
+        """
+        pid_guid = obj_tuple[0]
+        namespace = obj_tuple[1]
+        metadata_db_checksum = obj_tuple[2]
+        algo = obj_tuple[3]
+
+        with self.hashstore.retrieve_metadata(pid_guid, namespace) as metadata_stream:
+            computed_digest = self.hashstore.computehash(metadata_stream, algo)
+            metadata_stream.close()
+
+        if computed_digest != metadata_db_checksum:
+            err_msg = (
+                f"Assertion Error for pid/guid: {pid_guid} -"
+                + f" Digest calculated from stream ({computed_digest}) does not match"
+                + f" checksum from metacat db: {metadata_db_checksum}"
+            )
+            logging.error(err_msg)
+            print(err_msg)
+
+        return
+
+    def delete_objects_from_list(self, origin_dir, obj_type, num):
+        """Delete objects in a given directory from HashStore
+        Args:
+            origin_dir (str): Directory to convert
+            obj_type (str): 'object' or 'metadata'
+            num (int): Number of files to delete
+        """
+        info_msg = f"HashStore Client - Begin deleting {obj_type} objects."
+        logging.info(info_msg)
+        # Object and Metadata list
+        metacat_obj_list = self.metacatdb.get_object_metadata_list(origin_dir, num)
+
+        # Get list of objects to delete from metacat db
+        if obj_type == self.OBJ_TYPE:
+            checked_obj_list = self.metacatdb.refine_list_for_objects(
+                metacat_obj_list, "delete"
+            )
+        if obj_type == self.MET_TYPE:
+            checked_obj_list = self.metacatdb.refine_list_for_metadata(
+                metacat_obj_list, "delete"
+            )
+
+        start_time = datetime.now()
+
+        # Setup pool and processes
+        pool = multiprocessing.Pool()
+
+        # Call 'obj_type' respective public API methods
+        info_msg = f"HashStoreClient - Request to delete {len(checked_obj_list)} Objs"
+        logging.info(info_msg)
+        if obj_type == self.OBJ_TYPE:
+            # results = pool.starmap(self.hashstore.store_object, checked_obj_list)
+            pool.imap(self.try_delete_object, checked_obj_list)
+        if obj_type == self.MET_TYPE:
+            pool.imap(self.try_delete_metadata, checked_obj_list)
+
+        # Close the pool and wait for all processes to complete
+        pool.close()
+        pool.join()
+
+        end_time = datetime.now()
+        content = (
+            f"HashStoreClient (delete_objects_from_list):\n"
+            f"Start Time: {start_time}\nEnd Time: {end_time}\n"
+            + f"Total Time to Delete {len(checked_obj_list)} {obj_type}"
+            + f" Objects: {end_time - start_time}\n"
+        )
+        logging.info(content)
+
+    def try_delete_object(self, obj_pid):
+        """Delete an object from HashStore and log exceptions as warning.
+
+        Args:
+            obj_pid (str): Pid of object to delete
+        """
+        try:
+            self.hashstore.delete_object(obj_pid)
+            return
+        # pylint: disable=W0718
+        except Exception as do_exception:
+            print(do_exception)
+
+    def try_delete_metadata(self, obj_tuple):
+        """Delete a metadata document from HashStore and log exceptions as warning.
+ + Args: + obj_tuple: pid_guid, format_id (namespace) + """ + pid_guid = obj_tuple[0] + namespace = obj_tuple[1] + try: + self.hashstore.delete_metadata(pid_guid, namespace) + return + # pylint: disable=W0718 + except Exception as do_exception: + print(do_exception) + + +class MetacatDB: + """Class to interact with Metacat's Postgres DB""" + + def __init__(self, hashstore_path, hashstore): + """Initialize credentials to access metacat pgdb.""" + db_keys = [ + "db_user", + "db_password", + "db_host", + "db_port", + "db_name", + ] + + # Note, 'pgdb.yaml' config file must be manually created for security + pgyaml_path = hashstore_path + "/pgdb.yaml" + if not os.path.exists(pgyaml_path): + exception_string = ( + "HashStore CLI Client - _load_metacat_db_properties: pgdb.yaml not found" + + " in store root path. Must be manually created with the following keys:" + + " db_user, db_password, db_host, db_port, db_name" + ) + raise FileNotFoundError(exception_string) + # Open file + with open(pgyaml_path, "r", encoding="utf-8") as file: + yaml_data = yaml.safe_load(file) + + # Get database values + self.hashstore = hashstore + self.db_yaml_dict = {} + for key in db_keys: + checked_property = yaml_data[key] + self.db_yaml_dict[key] = checked_property + + def get_object_metadata_list(self, origin_directory, num): + """Query the metacat db for the full obj and metadata list and order by guid. + + Args: + origin_directory (string): 'var/metacat/data' or 'var/metacat/documents' + num (int): Number of rows to retrieve from metacat db + """ + # Create a connection to the database + db_user = self.db_yaml_dict["db_user"] + db_password = self.db_yaml_dict["db_password"] + db_host = self.db_yaml_dict["db_host"] + db_port = self.db_yaml_dict["db_port"] + db_name = self.db_yaml_dict["db_name"] + + conn = pg8000.connect( + user=db_user, + password=db_password, + host=db_host, + port=int(db_port), + database=db_name, + ) + + # Create a cursor to execute queries + cursor = conn.cursor() + + # Query to refine rows between `identifier` and `systemmetadata`` table + if num is None: + limit_query = "" + else: + limit_query = f" LIMIT {num}" + query = f"""SELECT identifier.guid, identifier.docid, identifier.rev, + systemmetadata.object_format, systemmetadata.checksum, + systemmetadata.checksum_algorithm FROM identifier INNER JOIN systemmetadata + ON identifier.guid = systemmetadata.guid ORDER BY identifier.guid{limit_query};""" + cursor.execute(query) + + # Fetch all rows from the result set + rows = cursor.fetchall() + + # Create full object list to store into HashStore + print("Creating list of objects and metadata from metacat db") + object_metadata_list = [] + for row in rows: + # Get pid, filepath and formatId + pid_guid = row[0] + metadatapath_docid_rev = origin_directory + "/" + row[1] + "." + str(row[2]) + metadata_namespace = row[3] + row_checksum = row[4] + row_checksum_algorithm = row[5] + tuple_item = ( + pid_guid, + metadatapath_docid_rev, + metadata_namespace, + row_checksum, + row_checksum_algorithm, + ) + object_metadata_list.append(tuple_item) + + # Close the cursor and connection when done + cursor.close() + conn.close() + + return object_metadata_list + + def refine_list_for_objects(self, metacat_obj_list, action): + """Refine a list of objects by checking for file existence and removing duplicates. + + Args: + metacat_obj_list (List): List of tuple objects representing rows from metacat db + action (string): "store", "retrieve" or "delete". 
+ "store" will create a list of objects to store that do not exist in HashStore. + "retrieve" will create a list of objects that exist in HashStore. + "delete" will create a list of object pids + + Returns: + refined_object_list (List): List of tuple objects based on "action" + """ + refined_object_list = [] + for tuple_item in metacat_obj_list: + pid_guid = tuple_item[0] + filepath_docid_rev = tuple_item[1] + item_checksum = tuple_item[3] + item_checksum_algorithm = tuple_item[4] + if os.path.exists(filepath_docid_rev): + if action == "store": + # If the file has already been stored, skip it + if not self.hashstore.exists( + "objects", self.hashstore.get_sha256_hex_digest(pid_guid) + ): + # This tuple is formed to match 'HashStore' store_object's signature + # Which is '.starmap()'ed when called + store_object_tuple_item = ( + pid_guid, + filepath_docid_rev, + None, + item_checksum, + item_checksum_algorithm, + ) + refined_object_list.append(store_object_tuple_item) + if action == "retrieve": + if self.hashstore.exists( + "objects", self.hashstore.get_sha256_hex_digest(pid_guid) + ): + retrieve_object_tuple_item = ( + pid_guid, + item_checksum_algorithm, + item_checksum, + ) + refined_object_list.append(retrieve_object_tuple_item) + if action == "delete": + if self.hashstore.exists( + "objects", self.hashstore.get_sha256_hex_digest(pid_guid) + ): + refined_object_list.append(pid_guid) + + return refined_object_list + + def refine_list_for_metadata(self, metacat_obj_list, action): + """Refine a list of metadata by checking for file existence and removing duplicates. + + Args: + metacat_obj_list (List): List of tuple objects representing rows from metacat db + action (string): "store", "retrieve" or "delete". + "store" will create a list of metadata to store that do not exist in HashStore. + "retrieve" will create a list of metadata that exist in HashStore. 
+ "delete" will create a list of metadata pids with their format_ids + + Returns: + refined_object_list (List): List of tuple metadata based on "action" + """ + refined_metadata_list = [] + for tuple_item in metacat_obj_list: + pid_guid = tuple_item[0] + filepath_docid_rev = tuple_item[1] + metadata_namespace = tuple_item[2] + item_checksum = tuple_item[3] + item_checksum_algorithm = tuple_item[4] + if os.path.exists(filepath_docid_rev): + if action == "store": + # If the file has already been stored, skip it + if not self.hashstore.exists( + "metadata", + self.hashstore.get_sha256_hex_digest( + pid_guid + metadata_namespace + ), + ): + tuple_item = (pid_guid, filepath_docid_rev, metadata_namespace) + refined_metadata_list.append(tuple_item) + if action == "retrieve": + if self.hashstore.exists( + "metadata", + self.hashstore.get_sha256_hex_digest( + pid_guid + metadata_namespace + ), + ): + tuple_item = ( + pid_guid, + metadata_namespace, + item_checksum, + item_checksum_algorithm, + ) + refined_metadata_list.append(tuple_item) + if action == "delete": + if self.hashstore.exists( + "metadata", + self.hashstore.get_sha256_hex_digest( + pid_guid + metadata_namespace + ), + ): + tuple_item = ( + pid_guid, + metadata_namespace, + ) + refined_metadata_list.append(tuple_item) + return refined_metadata_list + + +def main(): + """Entry point of the HashStore client.""" + + parser = HashStoreParser() + args = parser.get_parser_args() + + # Client setup process + if getattr(args, "create_hashstore"): + # Create HashStore if -chs flag is true in a given directory + # Get store attributes, HashStore will validate properties + props = { + "store_path": getattr(args, "store_path"), + "store_depth": int(getattr(args, "depth")), + "store_width": int(getattr(args, "width")), + "store_algorithm": getattr(args, "algorithm"), + "store_metadata_namespace": getattr(args, "formatid"), + } + HashStoreClient(props) + # Can't use client app without first initializing HashStore + store_path = getattr(args, "store_path") + store_path_config_yaml = store_path + "/hashstore.yaml" + if not os.path.exists(store_path_config_yaml): + raise FileNotFoundError( + f"Missing config file (hashstore.yaml) at store path: {store_path}." + + " HashStore must first be initialized, use `--help` for more information." 
+ ) + # Setup logging, create log file if it doesn't already exist + hashstore_py_log = store_path + "/python_client.log" + python_log_file_path = Path(hashstore_py_log) + if not os.path.exists(python_log_file_path): + python_log_file_path.parent.mkdir(parents=True, exist_ok=True) + open(python_log_file_path, "w", encoding="utf-8").close() + # Check for logging level + logging_level_arg = getattr(args, "logging_level") + if logging_level_arg is None: + logging_level = "INFO" + else: + logging_level = logging_level_arg + logging.basicConfig( + filename=python_log_file_path, + level=logging_level, + format="%(asctime)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + + # Collect arguments to process + pid = getattr(args, "object_pid") + path = getattr(args, "object_path") + algorithm = getattr(args, "object_algorithm") + checksum = getattr(args, "object_checksum") + checksum_algorithm = getattr(args, "object_checksum_algorithm") + size = getattr(args, "object_size") + formatid = getattr(args, "object_formatid") + knbvm_test = getattr(args, "knbvm_flag") + # Instantiate HashStore Client + props = parser.load_store_properties(store_path_config_yaml) + # Reminder: 'hashstore.yaml' only contains 4 of the required 5 properties + props["store_path"] = store_path + hashstore_c = HashStoreClient(props, knbvm_test) + if knbvm_test: + directory_to_convert = getattr(args, "source_directory") + # Check if the directory to convert exists + if os.path.exists(directory_to_convert): + # If -nobj is supplied, limit the objects we work with + number_of_objects_to_convert = getattr(args, "num_obj_to_convert") + # Determine if we are working with objects or metadata + directory_type = getattr(args, "source_directory_type") + accepted_directory_types = ["object", "metadata"] + if directory_type not in accepted_directory_types: + raise ValueError( + "Directory `-stype` cannot be empty, must be 'object' or 'metadata'." + + f" source_directory_type: {directory_type}" + ) + if getattr(args, "store_to_hashstore"): + hashstore_c.store_to_hashstore_from_list( + directory_to_convert, + directory_type, + number_of_objects_to_convert, + ) + if getattr(args, "retrieve_and_validate"): + hashstore_c.retrieve_and_validate_from_hashstore( + directory_to_convert, + directory_type, + number_of_objects_to_convert, + ) + if getattr(args, "delete_from_hashstore"): + hashstore_c.delete_objects_from_list( + directory_to_convert, + directory_type, + number_of_objects_to_convert, + ) + else: + raise FileNotFoundError( + f"Directory to convert is None or does not exist: {directory_to_convert}." 
+ ) + elif getattr(args, "client_getchecksum"): + if pid is None: + raise ValueError("'-pid' option is required") + if algorithm is None: + raise ValueError("'-algo' option is required") + # Calculate the hex digest of a given pid with algorithm supplied + digest = hashstore_c.hashstore.get_hex_digest(pid, algorithm) + print(f"guid/pid: {pid}") + print(f"algorithm: {algorithm}") + print(f"Checksum/Hex Digest: {digest}") + + elif getattr(args, "client_storeobject"): + if pid is None: + raise ValueError("'-pid' option is required") + if path is None: + raise ValueError("'-path' option is required") + # Store object to HashStore + object_metadata = hashstore_c.hashstore.store_object( + pid, path, algorithm, checksum, checksum_algorithm, size + ) + print(f"Object Metadata:\n{object_metadata}") + + elif getattr(args, "client_storemetadata"): + if pid is None: + raise ValueError("'-pid' option is required") + if path is None: + raise ValueError("'-path' option is required") + # Store metadata to HashStore + metadata_cid = hashstore_c.hashstore.store_metadata(pid, path, formatid) + print(f"Metadata ID: {metadata_cid}") + + elif getattr(args, "client_retrieveobject"): + if pid is None: + raise ValueError("'-pid' option is required") + # Retrieve object from HashStore and display the first 1000 bytes + object_stream = hashstore_c.hashstore.retrieve_object(pid) + object_content = object_stream.read(1000).decode("utf-8") + object_stream.close() + print(object_content) + print("...\n<-- Truncated for Display Purposes -->") + + elif getattr(args, "client_retrievemetadata"): + if pid is None: + raise ValueError("'-pid' option is required") + # Retrieve metadata from HashStore and display the first 1000 bytes + metadata_stream = hashstore_c.hashstore.retrieve_metadata(pid, formatid) + metadata_content = metadata_stream.read(1000).decode("utf-8") + metadata_stream.close() + print(metadata_content) + print("...\n<-- Truncated for Display Purposes -->") + + elif getattr(args, "client_deleteobject"): + if pid is None: + raise ValueError("'-pid' option is required") + # Delete object from HashStore + delete_status = hashstore_c.hashstore.delete_object(pid) + print(f"Object Deleted (T/F): {delete_status}") + + elif getattr(args, "client_deletemetadata"): + if pid is None: + raise ValueError("'-pid' option is required") + # Delete metadata from HashStore + delete_status = hashstore_c.hashstore.delete_metadata(pid, formatid) + print( + f"Metadata for pid: {pid} & formatid: {formatid}\nDeleted (T/F): {delete_status}" + ) + + +if __name__ == "__main__": + main() diff --git a/src/hashstore/filehashstore/filehashstore.py b/src/hashstore/filehashstore.py similarity index 52% rename from src/hashstore/filehashstore/filehashstore.py rename to src/hashstore/filehashstore.py index e5a72f3f..87f652e7 100644 --- a/src/hashstore/filehashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1,4 +1,5 @@ """Core module for FileHashStore""" +import atexit import io import shutil import threading @@ -10,8 +11,7 @@ from contextlib import closing from tempfile import NamedTemporaryFile import yaml -from hashstore import HashStore -from hashstore.hashaddress import HashAddress +from hashstore import HashStore, ObjectMetadata class FileHashStore(HashStore): @@ -20,7 +20,7 @@ class FileHashStore(HashStore): an authority-based identifier's hex digest with a given hash algorithm value to address files. 
- FileHashStore initializes by providing a properties dictionary containing the + FileHashStore initializes using a given properties dictionary containing the required keys (see Args). Upon initialization, FileHashStore verifies the provided properties and attempts to write a configuration file 'hashstore.yaml' to the given store path directory. Properties must always be supplied to ensure consistent @@ -32,7 +32,7 @@ class FileHashStore(HashStore): store_depth (int): Depth when sharding an object's hex digest. store_width (int): Width of directories when sharding an object's hex digest. store_algorithm (str): Hash algorithm used for calculating the object's hex digest. - store_sysmeta_namespace (str): Namespace for the HashStore's system metadata. + store_metadata_namespace (str): Namespace for the HashStore's system metadata. """ # Property (hashstore configuration) requirements @@ -41,15 +41,11 @@ class FileHashStore(HashStore): "store_depth", "store_width", "store_algorithm", - "store_sysmeta_namespace", + "store_metadata_namespace", ] # Permissions settings for writing files and creating directories fmode = 0o664 dmode = 0o755 - # Default and other algorithm list for FileHashStore - # The default algorithm list includes the hash algorithms calculated when - # storing an object to disk and returned to the caller after successful storage. - default_algo_list = ["sha1", "sha256", "sha384", "sha512", "md5"] # The other algorithm list consists of additional algorithms that can be included # for calculating when storing objects, in addition to the default list. other_algo_list = [ @@ -64,9 +60,9 @@ class FileHashStore(HashStore): # Variables to orchestrate thread locking and object store synchronization time_out_sec = 1 object_lock = threading.Lock() - sysmeta_lock = threading.Lock() + metadata_lock = threading.Lock() object_locked_pids = [] - sysmeta_locked_pids = [] + metadata_locked_pids = [] def __init__(self, properties=None): if properties: @@ -76,8 +72,8 @@ def __init__(self, properties=None): prop_store_path, prop_store_depth, prop_store_width, - prop_store_algorithm, - prop_store_sysmeta_namespace, + _, + prop_store_metadata_namespace, ) = [ checked_properties[property_name] for property_name in self.property_required_keys @@ -85,38 +81,16 @@ def __init__(self, properties=None): # Check to see if a configuration is present in the given store path self.hashstore_configuration_yaml = prop_store_path + "/hashstore.yaml" - if os.path.exists(self.hashstore_configuration_yaml): - logging.debug( - "FileHashStore - Config found (hashstore.yaml) at {%s}. Verifying properties.", - self.hashstore_configuration_yaml, - ) - # If 'hashstore.yaml' is found, verify given properties before init - hashstore_yaml_dict = self.get_properties() - for key in self.property_required_keys: - if hashstore_yaml_dict[key] != properties[key]: - exception_string = ( - f"Given properties ({key}: {properties[key]}) does not match " - + f"HashStore configuration ({key}: {hashstore_yaml_dict[key]})" - + f"found at: {self.hashstore_configuration_yaml}" - ) - logging.critical("FileHashStore - %s", exception_string) - raise ValueError(exception_string) - else: - # Check if HashStore exists and throw exception if found - if any(Path(prop_store_path).iterdir()): - exception_string = ( - f"HashStore directories and/or objects found at: {prop_store_path} but" - + f" missing configuration file at: {self.hashstore_configuration_yaml}." 
- ) - logging.critical("FileHashStore - %s", exception_string) - raise FileNotFoundError(exception_string) + self._verify_hashstore_properties(properties, prop_store_path) + # If no exceptions thrown, FileHashStore ready for initialization logging.debug("FileHashStore - Initializing, properties verified.") self.root = prop_store_path + if not os.path.exists(self.root): + self.create_path(self.root) self.depth = prop_store_depth self.width = prop_store_width - self.algorithm = prop_store_algorithm - self.sysmeta_ns = prop_store_sysmeta_namespace + self.sysmeta_ns = prop_store_metadata_namespace # Write 'hashstore.yaml' to store path if not os.path.exists(self.hashstore_configuration_yaml): # pylint: disable=W1201 @@ -124,102 +98,133 @@ def __init__(self, properties=None): "FileHashStore - HashStore does not exist & configuration file not found." + " Writing configuration file." ) - self.put_properties(properties) - # Complete initialization/instantiation by setting store directories + self.write_properties(properties) + # Default algorithm list for FileHashStore based on config file written + self._set_default_algorithms() + # Complete initialization/instantiation by setting and creating store directories self.objects = self.root + "/objects" - self.sysmeta = self.root + "/sysmeta" + self.metadata = self.root + "/metadata" + if not os.path.exists(self.objects): + self.create_path(self.objects + "/tmp") + if not os.path.exists(self.metadata): + self.create_path(self.metadata + "/tmp") logging.debug( "FileHashStore - Initialization success. Store root: %s", self.root ) else: + # Cannot instantiate or initialize FileHashStore without config exception_string = ( - f"HashStore properties must be supplied. Properties: {properties}" + "FileHashStore - HashStore properties must be supplied." + + f" Properties: {properties}" ) - logging.debug("FileHashStore - %s", exception_string) - # Cannot instantiate or initialize FileHashStore without config + logging.debug(exception_string) raise ValueError(exception_string) - # Configuration Methods + # Configuration and Related Methods - def get_properties(self): + def load_properties(self): """Get and return the contents of the current HashStore configuration. Returns: - hashstore_yaml_dict (dict): HashStore properties with the following keys/values: - "store_path", "store_depth", "store_width", "store_algorithm","store_sysmeta_namespace". + hashstore_yaml_dict (dict): HashStore properties with the following keys (and values): + store_depth (int): Depth when sharding an object's hex digest. + store_width (int): Width of directories when sharding an object's hex digest. + store_algorithm (str): Hash algorithm used for calculating the object's hex digest. + store_metadata_namespace (str): Namespace for the HashStore's system metadata. """ if not os.path.exists(self.hashstore_configuration_yaml): - exception_string = "hashstore.yaml not found in store root path." - logging.critical("FileHashStore - get_properties: %s", exception_string) + exception_string = ( + "FileHashStore - load_properties: hashstore.yaml not found" + + " in store root path." 
+ ) + logging.critical(exception_string) raise FileNotFoundError(exception_string) # Open file with open(self.hashstore_configuration_yaml, "r", encoding="utf-8") as file: yaml_data = yaml.safe_load(file) + # Get hashstore properties hashstore_yaml_dict = {} for key in self.property_required_keys: - hashstore_yaml_dict[key] = yaml_data[key] + if key is not "store_path": + hashstore_yaml_dict[key] = yaml_data[key] logging.debug( - "FileHashStore - get_properties: Successfully retrieved 'hashstore.yaml' properties." + "FileHashStore - load_properties: Successfully retrieved 'hashstore.yaml' properties." ) return hashstore_yaml_dict - def put_properties(self, properties): + def write_properties(self, properties): """Writes 'hashstore.yaml' to FileHashStore's root directory with the respective properties object supplied. Args: properties (dict): A python dictionary with the following keys (and values): - store_path (str): Path to the HashStore directory. store_depth (int): Depth when sharding an object's hex digest. store_width (int): Width of directories when sharding an object's hex digest. store_algorithm (str): Hash algorithm used for calculating the object's hex digest. - store_sysmeta_namespace (str): Namespace for the HashStore's system metadata. + store_metadata_namespace (str): Namespace for the HashStore's system metadata. """ # If hashstore.yaml already exists, must throw exception and proceed with caution if os.path.exists(self.hashstore_configuration_yaml): exception_string = ( - "FileHashStore configuration file 'hashstore.yaml' already exists." + "FileHashStore - write_properties: configuration file 'hashstore.yaml'" + + " already exists." ) - logging.error("FileHashStore - put_properties: %s", exception_string) + logging.error(exception_string) raise FileExistsError(exception_string) # Validate properties checked_properties = self._validate_properties(properties) # Collect configuration properties from validated & supplied dictionary ( - store_path, + _, store_depth, store_width, store_algorithm, - store_sysmeta_namespace, + store_metadata_namespace, ) = [ checked_properties[property_name] for property_name in self.property_required_keys ] + # Standardize algorithm value for cross-language compatibility + checked_store_algorithm = None + # Note, this must be declared here because HashStore has not yet been initialized + accepted_store_algorithms = ["MD5", "SHA-1", "SHA-256", "SHA-384", "SHA-512"] + if store_algorithm in accepted_store_algorithms: + checked_store_algorithm = store_algorithm + else: + exception_string = ( + f"FileHashStore - write_properties: algorithm supplied ({store_algorithm})" + + " cannot be used as default for HashStore. 
Must be one of:" + + " MD5, SHA-1, SHA-256, SHA-384, SHA-512 which are DataONE" + + " controlled algorithm values" + ) + logging.error(exception_string) + raise ValueError(exception_string) + # .yaml file to write hashstore_configuration_yaml = self._build_hashstore_yaml_string( - store_path, store_depth, store_width, - store_algorithm, - store_sysmeta_namespace, + checked_store_algorithm, + store_metadata_namespace, ) # Write 'hashstore.yaml' with open( self.hashstore_configuration_yaml, "w", encoding="utf-8" ) as hashstore_yaml: hashstore_yaml.write(hashstore_configuration_yaml) + logging.debug( - "FileHashStore - put_properties: Configuration file written to: %s", + "FileHashStore - write_properties: Configuration file written to: %s", self.hashstore_configuration_yaml, ) return @staticmethod def _build_hashstore_yaml_string( - store_path, store_depth, store_width, store_algorithm, store_sysmeta_namespace + store_depth, store_width, store_algorithm, store_metadata_namespace ): """Build a YAML string representing the configuration for a HashStore. @@ -228,7 +233,7 @@ def _build_hashstore_yaml_string( store_depth (int): Depth when sharding an object's hex digest. store_width (int): Width of directories when sharding an object's hex digest. store_algorithm (str): Hash algorithm used for calculating the object's hex digest. - store_sysmeta_namespace (str): Namespace for the HashStore's system metadata. + store_metadata_namespace (str): Namespace for the HashStore's system metadata. Returns: hashstore_configuration_yaml (str): A YAML string representing the configuration for @@ -237,10 +242,6 @@ def _build_hashstore_yaml_string( hashstore_configuration_yaml = f""" # Default configuration variables for HashStore - ############### Store Path ############### - # Default path for `FileHashStore` if no path is provided - store_path: "{store_path}" - ############### Directory Structure ############### # Desired amount of directories when sharding an object to form the permanent address store_depth: {store_depth} # WARNING: DO NOT CHANGE UNLESS SETTING UP NEW HASHSTORE @@ -256,7 +257,8 @@ def _build_hashstore_yaml_string( # │ └── 8f0b04e812a3b4c8f686ce34e6fec558804bf61e54b176742a7f6368d6 ############### Format of the Metadata ############### - store_sysmeta_namespace: "{store_sysmeta_namespace}" + # The default metadata format + store_metadata_namespace: "{store_metadata_namespace}" ############### Hash Algorithms ############### # Hash algorithm to use when calculating object's hex digest for the permanent address @@ -264,25 +266,63 @@ def _build_hashstore_yaml_string( # Algorithm values supported by python hashlib 3.9.0+ for File Hash Store (FHS) # The default algorithm list includes the hash algorithms calculated when storing an # object to disk and returned to the caller after successful storage. - filehashstore_default_algo_list: - - "sha1" - - "sha256" - - "sha384" - - "sha512" - - "md5" - # The other algorithm list consists of additional algorithms that can be included for - # calculating when storing objects, in addition to the default list. - filehashstore_other_algo_list: - - "sha224" - - "sha3_224" - - "sha3_256" - - "sha3_384" - - "sha3_512" - - "blake2b" - - "blake2s" + store_default_algo_list: + - "MD5" + - "SHA-1" + - "SHA-256" + - "SHA-384" + - "SHA-512" """ return hashstore_configuration_yaml + def _verify_hashstore_properties(self, properties, prop_store_path): + """Determines whether FileHashStore can instantiate by validating a set of arguments + and throwing exceptions. 
HashStore will not instantiate if an existing configuration + file's properties (`hashstore.yaml`) are different from what is supplied - or if an + object store exists at the given path, but it is missing the `hashstore.yaml` config file. + + If `hashstore.yaml` exists, it will retrieve its properties and compare them with the + given values; and if there is a mismatch, an exception will be thrown. If not, it will + look to see if any directories/files exist in the given store path and throw an exception + if any file or directory is found. + + Args: + properties (dict): HashStore properties + prop_store_path (string): Store path to check + """ + if os.path.exists(self.hashstore_configuration_yaml): + logging.debug( + "FileHashStore - Config found (hashstore.yaml) at {%s}. Verifying properties.", + self.hashstore_configuration_yaml, + ) + # If 'hashstore.yaml' is found, verify given properties before init + hashstore_yaml_dict = self.load_properties() + for key in self.property_required_keys: + # 'store_path' is required to init HashStore but not saved in `hashstore.yaml` + if key is not "store_path": + supplied_key = properties[key] + if key == "store_depth" or key == "store_width": + supplied_key = int(properties[key]) + if hashstore_yaml_dict[key] != supplied_key: + exception_string = ( + f"FileHashStore - Given properties ({key}: {properties[key]}) does not" + + f" match. HashStore configuration ({key}: {hashstore_yaml_dict[key]})" + + f" found at: {self.hashstore_configuration_yaml}" + ) + logging.critical(exception_string) + raise ValueError(exception_string) + else: + if os.path.exists(prop_store_path): + # Check if HashStore exists and throw exception if found + if any(Path(prop_store_path).iterdir()): + exception_string = ( + "FileHashStore - HashStore directories and/or objects found at:" + + f" {prop_store_path} but missing configuration file at: " + + self.hashstore_configuration_yaml + ) + logging.critical(exception_string) + raise FileNotFoundError(exception_string) + def _validate_properties(self, properties): """Validate a properties dictionary by checking if it contains all the required keys and non-None values. @@ -298,85 +338,92 @@ def _validate_properties(self, properties): properties (dict): The given properties object (that has been validated). """ if not isinstance(properties, dict): - exception_string = "Invalid argument - expected a dictionary." - logging.debug("FileHashStore - _validate_properties: %s", exception_string) + exception_string = ( + "FileHashStore - _validate_properties: Invalid argument -" + + " expected a dictionary." + ) + logging.debug(exception_string) raise ValueError(exception_string) + for key in self.property_required_keys: if key not in properties: - exception_string = f"Missing required key: {key}." - logging.debug( - "FileHashStore - _validate_properties: %s", exception_string + exception_string = ( + "FileHashStore - _validate_properties: Missing required" + + f" key: {key}." ) + logging.debug(exception_string) raise KeyError(exception_string) if properties.get(key) is None: - exception_string = f"Value for key: {key} is none." - logging.debug( - "FileHashStore - _validate_properties: %s", exception_string + exception_string = ( + "FileHashStore - _validate_properties: Value for key:" + + f" {key} is none." 
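# --- Illustrative sketch (not part of the patch) -------------------------------
# Filtering 'store_path' out of the persisted properties; plain equality ("!=") is
# the reliable way to compare the key strings here. The helper name is an
# assumption for this example.
property_required_keys = [
    "store_path", "store_depth", "store_width",
    "store_algorithm", "store_metadata_namespace",
]

def persisted_properties(yaml_data):
    # 'store_path' is required to initialize HashStore but is not saved in hashstore.yaml
    return {key: yaml_data[key] for key in property_required_keys if key != "store_path"}
# --------------------------------------------------------------------------------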
) + logging.debug(exception_string) raise ValueError(exception_string) return properties + def _set_default_algorithms(self): + """Set the default algorithms to calculate when storing objects.""" + + def lookup_algo(algo): + """Translate DataONE controlled algorithms to python hashlib values: + https://dataoneorg.github.io/api-documentation/apis/Types.html#Types.ChecksumAlgorithm + """ + dataone_algo_translation = { + "MD5": "md5", + "SHA-1": "sha1", + "SHA-256": "sha256", + "SHA-384": "sha384", + "SHA-512": "sha512", + } + return dataone_algo_translation[algo] + + if not os.path.exists(self.hashstore_configuration_yaml): + exception_string = ( + "FileHashStore - set_default_algorithms: hashstore.yaml not found" + + " in store root path." + ) + logging.critical(exception_string) + raise FileNotFoundError(exception_string) + with open(self.hashstore_configuration_yaml, "r", encoding="utf-8") as file: + yaml_data = yaml.safe_load(file) + + # Set default store algorithm + self.algorithm = lookup_algo(yaml_data["store_algorithm"]) + # Takes DataOne controlled algorithm values and translates to hashlib supported values + yaml_store_default_algo_list = yaml_data["store_default_algo_list"] + translated_default_algo_list = [] + for algo in yaml_store_default_algo_list: + translated_default_algo_list.append(lookup_algo(algo)) + + # Set class variable + self.default_algo_list = translated_default_algo_list + return + # Public API / HashStore Interface Methods def store_object( self, pid, data, - additional_algorithm="sha256", + additional_algorithm=None, checksum=None, checksum_algorithm=None, + expected_object_size=None, ): logging.debug( "FileHashStore - store_object: Request to store object for pid: %s", pid ) # Validate input parameters - logging.debug("FileHashStore - store_object: Validating arguments.") - if pid is None or pid.replace(" ", "") == "": - exception_string = f"Pid cannot be None or empty, pid: {pid}." - logging.error("FileHashStore - store_object: %s", exception_string) - raise ValueError(exception_string) - if ( - not isinstance(data, str) - and not isinstance(data, Path) - and not isinstance(data, io.BufferedIOBase) - ): - exception_string = ( - "Data must be a path, string or buffered stream type." - + f" data type supplied: {type(data)}" - ) - logging.error("FileHashStore - store_object: %s", exception_string) - raise TypeError(exception_string) - if isinstance(data, str): - if data.replace(" ", "") == "": - exception_string = "Data string cannot be empty." - logging.error("FileHashStore - store_object: %s", exception_string) - raise TypeError(exception_string) - # Format additional algorithm if supplied - logging.debug( - "FileHashStore - store_object: Validating algorithm and checksum args." + self._is_string_none_or_empty(pid, "pid", "store_object") + self._validate_data_to_store(data) + self._validate_file_size(expected_object_size) + ( + additional_algorithm_checked, + checksum_algorithm_checked, + ) = self._validate_algorithms_and_checksum( + additional_algorithm, checksum, checksum_algorithm ) - additional_algorithm_checked = None - if additional_algorithm != self.algorithm and additional_algorithm is not None: - additional_algorithm_checked = self.clean_algorithm(additional_algorithm) - # Checksum and checksum_algorithm must both be supplied - if checksum is not None: - if checksum_algorithm is None or checksum_algorithm.replace(" ", "") == "": - exception_string = ( - "checksum_algorithm cannot be None or empty if checksum is" - + "supplied." 
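# --- Illustrative sketch (not part of the patch) -------------------------------
# The DataONE-to-hashlib translation used by _set_default_algorithms above; the
# translated names are the ones hashlib.new() accepts.
import hashlib

dataone_algo_translation = {
    "MD5": "md5",
    "SHA-1": "sha1",
    "SHA-256": "sha256",
    "SHA-384": "sha384",
    "SHA-512": "sha512",
}

def lookup_algo(algo):
    return dataone_algo_translation[algo]

hashlib.new(lookup_algo("SHA-256"), b"example").hexdigest()
# --------------------------------------------------------------------------------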
- ) - logging.error("FileHashStore - store_object: %s", exception_string) - raise ValueError(exception_string) - checksum_algorithm_checked = None - if checksum_algorithm is not None: - checksum_algorithm_checked = self.clean_algorithm(checksum_algorithm) - if checksum is None or checksum.replace(" ", "") == "": - exception_string = ( - "checksum cannot be None or empty if checksum_algorithm is" - + " supplied." - ) - logging.error("FileHashStore - store_object: %s", exception_string) - raise ValueError(exception_string) # Wait for the pid to release if it's in use while pid in self.object_locked_pids: @@ -397,12 +444,13 @@ def store_object( "FileHashStore - store_object: Attempting to store object for pid: %s", pid, ) - hash_address = self.put_object( + object_metadata = self.put_object( pid, data, additional_algorithm=additional_algorithm_checked, checksum=checksum, checksum_algorithm=checksum_algorithm_checked, + file_size_to_validate=expected_object_size, ) finally: # Release pid @@ -416,162 +464,139 @@ def store_object( "FileHashStore - store_object: Successfully stored object for pid: %s", pid, ) - return hash_address - def store_sysmeta(self, pid, sysmeta): + return object_metadata + + def store_metadata(self, pid, metadata, format_id=None): logging.debug( - "FileHashStore - store_sysmeta: Request to store sysmeta for pid: %s", pid + "FileHashStore - store_metadata: Request to store metadata for pid: %s", pid ) # Validate input parameters - logging.debug("FileHashStore - store_sysmeta: Validating arguments.") - if pid is None or pid.replace(" ", "") == "": - exception_string = f"Pid cannot be None or empty, pid: {pid}" - logging.error("FileHashStore - store_sysmeta: %s", exception_string) - raise ValueError(exception_string) - if ( - not isinstance(sysmeta, str) - and not isinstance(sysmeta, Path) - and not isinstance(sysmeta, io.BufferedIOBase) - ): - exception_string = ( - "Sysmeta must be a path or string type, data type supplied: " - + {type(sysmeta)} - ) - logging.error("FileHashStore - store_sysmeta: %s", exception_string) - raise TypeError(exception_string) - if isinstance(sysmeta, str): - if sysmeta.replace(" ", "") == "": - exception_string = "Given string path to sysmeta cannot be empty." - logging.error("FileHashStore - store_sysmeta: %s", exception_string) - raise TypeError(exception_string) + self._is_string_none_or_empty(pid, "pid", "store_metadata") + checked_format_id = self._validate_format_id(format_id, "store_metadata") + self._validate_metadata_to_store(metadata) # Wait for the pid to release if it's in use - while pid in self.sysmeta_locked_pids: + while pid in self.metadata_locked_pids: logging.debug( - "FileHashStore - store_sysmeta: %s is currently being stored. Waiting.", + "FileHashStore - store_metadata: %s is currently being stored. 
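# --- Illustrative sketch (not part of the patch) -------------------------------
# The wait/lock pattern store_object and store_metadata use above so that a pid is
# handled by only one thread at a time; names are simplified for this example.
import threading
import time

metadata_lock = threading.Lock()
metadata_locked_pids = []
time_out_sec = 1

def run_locked(pid, work):
    while pid in metadata_locked_pids:        # wait for the pid to release
        time.sleep(time_out_sec)
    with metadata_lock:                        # modify locked pids consecutively
        metadata_locked_pids.append(pid)
    try:
        return work()
    finally:
        with metadata_lock:
            metadata_locked_pids.remove(pid)
# --------------------------------------------------------------------------------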
Waiting.", pid, ) time.sleep(self.time_out_sec) - # Modify sysmeta_locked_pids consecutively - with self.sysmeta_lock: + + with self.metadata_lock: logging.debug( - "FileHashStore - store_sysmeta: Adding pid: %s to sysmeta_locked_pids.", + "FileHashStore - store_metadata: Adding pid: %s to metadata_locked_pids.", pid, ) - self.sysmeta_locked_pids.append(pid) + # Modify metadata_locked_pids consecutively + self.metadata_locked_pids.append(pid) + try: logging.debug( - "FileHashStore - store_sysmeta: Attempting to store sysmeta for pid: %s", + "FileHashStore - store_metadata: Attempting to store metadata for pid: %s", pid, ) - sysmeta_cid = self.put_sysmeta(pid, sysmeta) + metadata_cid = self.put_metadata(metadata, pid, checked_format_id) finally: # Release pid - with self.sysmeta_lock: + with self.metadata_lock: logging.debug( - "FileHashStore - store_sysmeta: Removing pid: %s from sysmeta_locked_pids.", + "FileHashStore - store_metadata: Removing pid: %s from metadata_locked_pids.", pid, ) - self.sysmeta_locked_pids.remove(pid) + self.metadata_locked_pids.remove(pid) logging.info( - "FileHashStore - store_sysmeta: Successfully stored sysmeta for pid: %s", + "FileHashStore - store_metadata: Successfully stored metadata for pid: %s", pid, ) - return sysmeta_cid + + return metadata_cid def retrieve_object(self, pid): logging.debug( "FileHashStore - retrieve_object: Request to retrieve object for pid: %s", pid, ) - if pid is None or pid.replace(" ", "") == "": - exception_string = f"Pid cannot be None or empty, pid: {pid}" - logging.error("FileHashStore - retrieve_object: %s", exception_string) - raise ValueError(exception_string) + self._is_string_none_or_empty(pid, "pid", "retrieve_object") entity = "objects" - ab_id = self.get_sha256_hex_digest(pid) - sysmeta_exists = self.exists(entity, ab_id) - if sysmeta_exists: + object_cid = self.get_sha256_hex_digest(pid) + object_exists = self.exists(entity, object_cid) + + if object_exists: logging.debug( - "FileHashStore - retrieve_object: Sysmeta exists for pid: %s, retrieving object.", + "FileHashStore - retrieve_object: Metadata exists for pid: %s, retrieving object.", pid, ) - obj_stream = self.open(entity, ab_id) + obj_stream = self.open(entity, object_cid) else: - exception_string = f"No sysmeta found for pid: {pid}" - logging.error("FileHashStore - retrieve_object: %s", exception_string) + exception_string = ( + f"FileHashStore - retrieve_object: No object found for pid: {pid}" + ) + logging.error(exception_string) raise ValueError(exception_string) logging.info( "FileHashStore - retrieve_object: Retrieved object for pid: %s", pid ) + return obj_stream - def retrieve_sysmeta(self, pid): + def retrieve_metadata(self, pid, format_id=None): logging.debug( - "FileHashStore - retrieve_sysmeta: Request to retrieve sysmeta for pid: %s", + "FileHashStore - retrieve_metadata: Request to retrieve metadata for pid: %s", pid, ) - if pid is None or pid.replace(" ", "") == "": - exception_string = f"Pid cannot be None or empty, pid: {pid}" - logging.error("FileHashStore - retrieve_sysmeta: %s", exception_string) - raise ValueError(exception_string) - - entity = "sysmeta" - ab_id = self.get_sha256_hex_digest(pid) - sysmeta_exists = self.exists(entity, ab_id) - if sysmeta_exists: - logging.debug( - "FileHashStore - retrieve_sysmeta: Sysmeta exists for pid: %s, retrieving sysmeta.", - pid, - ) - ab_id = self.get_sha256_hex_digest(pid) - s_path = self.open(entity, ab_id) - s_content = s_path.read().decode("utf-8").split("\x00", 1) - s_path.close() - sysmeta = 
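# --- Illustrative sketch (not part of the patch) -------------------------------
# A data object's permanent address is the SHA-256 hex digest of its pid, which
# retrieve_object and delete_object recompute before looking the object up on disk.
import hashlib

pid = "jtao.1700.1"
object_cid = hashlib.sha256(pid.encode("utf-8")).hexdigest()
# Per the test fixtures this is:
# "a8241925740d5dcd719596639e780e0a090c9d55a5d0372b0eaf55ed711d4edf"
# --------------------------------------------------------------------------------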
s_content[1] + self._is_string_none_or_empty(pid, "pid", "retrieve_metadata") + checked_format_id = self._validate_format_id(format_id, "retrieve_metadata") + + entity = "metadata" + metadata_cid = self.get_sha256_hex_digest(pid + checked_format_id) + metadata_exists = self.exists(entity, metadata_cid) + if metadata_exists: + metadata_stream = self.open(entity, metadata_cid) else: - exception_string = f"No sysmeta found for pid: {pid}" - logging.error("FileHashStore - retrieve_sysmeta: %s", exception_string) + exception_string = ( + f"FileHashStore - retrieve_metadata: No metadata found for pid: {pid}" + ) + logging.error(exception_string) raise ValueError(exception_string) + logging.info( - "FileHashStore - retrieve_sysmeta: Retrieved sysmeta for pid: %s", pid + "FileHashStore - retrieve_metadata: Retrieved metadata for pid: %s", pid ) - return sysmeta + return metadata_stream def delete_object(self, pid): logging.debug( "FileHashStore - delete_object: Request to delete object for pid: %s", pid ) - if pid is None or pid.replace(" ", "") == "": - exception_string = f"Pid cannot be None or empty, pid: {pid}" - logging.error("FileHashStore - delete_object: %s", exception_string) - raise ValueError(exception_string) + self._is_string_none_or_empty(pid, "pid", "delete_object") entity = "objects" - ab_id = self.get_sha256_hex_digest(pid) - self.delete(entity, ab_id) + object_cid = self.get_sha256_hex_digest(pid) + self.delete(entity, object_cid) + logging.info( "FileHashStore - delete_object: Successfully deleted object for pid: %s", pid, ) return True - def delete_sysmeta(self, pid): + def delete_metadata(self, pid, format_id=None): logging.debug( - "FileHashStore - delete_sysmeta: Request to delete sysmeta for pid: %s", + "FileHashStore - delete_metadata: Request to delete metadata for pid: %s", pid, ) - if pid is None or pid.replace(" ", "") == "": - exception_string = f"Pid cannot be None or empty, pid: {pid}" - logging.error("FileHashStore - delete_sysmeta: %s", exception_string) - raise ValueError(exception_string) + self._is_string_none_or_empty(pid, "pid", "delete_metadata") + checked_format_id = self._validate_format_id(format_id, "delete_metadata") + + entity = "metadata" + metadata_cid = self.get_sha256_hex_digest(pid + checked_format_id) + self.delete(entity, metadata_cid) - entity = "sysmeta" - ab_id = self.get_sha256_hex_digest(pid) - self.delete(entity, ab_id) logging.info( - "FileHashStore - delete_sysmeta: Successfully deleted sysmeta for pid: %s", + "FileHashStore - delete_metadata: Successfully deleted metadata for pid: %s", pid, ) return True @@ -581,30 +606,26 @@ def get_hex_digest(self, pid, algorithm): "FileHashStore - get_hex_digest: Request to get hex digest for object with pid: %s", pid, ) - if pid is None or pid.replace(" ", "") == "": - exception_string = f"Pid cannot be None or empty, pid: {pid}" - logging.error("FileHashStore - get_hex_digest: %s", exception_string) - raise ValueError(exception_string) - if algorithm is None or algorithm.replace(" ", "") == "": - exception_string = f"Algorithm cannot be None or empty, pid: {pid}" - logging.error("FileHashStore - get_hex_digest: %s", exception_string) - raise ValueError(exception_string) + self._is_string_none_or_empty(pid, "pid", "get_hex_digest") + self._is_string_none_or_empty(algorithm, "algorithm", "get_hex_digest") entity = "objects" algorithm = self.clean_algorithm(algorithm) - ab_id = self.get_sha256_hex_digest(pid) - if not self.exists(entity, ab_id): - exception_string = f"No object found for pid: {pid}" - 
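# --- Illustrative sketch (not part of the patch) -------------------------------
# A metadata document's permanent address is the SHA-256 hex digest of pid +
# format_id, as retrieve_metadata and delete_metadata compute above.
import hashlib

pid = "jtao.1700.1"
format_id = "http://ns.dataone.org/service/types/v2.0"
metadata_cid = hashlib.sha256((pid + format_id).encode("utf-8")).hexdigest()
# --------------------------------------------------------------------------------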
logging.error("FileHashStore - get_hex_digest: %s", exception_string) + object_cid = self.get_sha256_hex_digest(pid) + if not self.exists(entity, object_cid): + exception_string = ( + f"FileHashStore - get_hex_digest: No object found for pid: {pid}" + ) + logging.error(exception_string) raise ValueError(exception_string) - c_stream = self.open(entity, ab_id) - hex_digest = self.computehash(c_stream, algorithm=algorithm) + cid_stream = self.open(entity, object_cid) + hex_digest = self.computehash(cid_stream, algorithm=algorithm) - logging_info_statement = ( + info_msg = ( f"FileHashStore - get_hex_digest: Successfully calculated hex digest for pid: {pid}." + f" Hex Digest: {hex_digest}", ) - logging.info(logging_info_statement) + logging.info(info_msg) return hex_digest # FileHashStore Core Methods @@ -617,6 +638,7 @@ def put_object( additional_algorithm=None, checksum=None, checksum_algorithm=None, + file_size_to_validate=None, ): """Store contents of `file` on disk using the hash of the given pid @@ -629,12 +651,12 @@ def put_object( when returning hex digests. \n checksum (str, optional): Optional checksum to validate object against hex digest before moving to permanent location. \n - checksum_algorithm (str, optional): Algorithm value of given checksum. + checksum_algorithm (str, optional): Algorithm value of given checksum. \n + file_size_to_validate (bytes, optional): Expected size of object Returns: - hash_address (HashAddress): object that contains the permanent address, - relative file path, absolute file path, duplicate file boolean and hex - digest dictionary. + object_metadata (ObjectMetadata): object that contains the object id, + object file size, duplicate file boolean and hex digest dictionary. """ stream = Stream(file) @@ -643,10 +665,8 @@ def put_object( ) with closing(stream): ( - ab_id, - rel_path, - abs_path, - is_duplicate, + object_cid, + obj_file_size, hex_digest_dict, ) = self._move_and_get_checksums( pid, @@ -655,16 +675,15 @@ def put_object( additional_algorithm, checksum, checksum_algorithm, + file_size_to_validate, ) - hash_address = HashAddress( - ab_id, rel_path, abs_path, is_duplicate, hex_digest_dict - ) + object_metadata = ObjectMetadata(object_cid, obj_file_size, hex_digest_dict) logging.debug( "FileHashStore - put_object: Successfully put object for pid: %s", pid, ) - return hash_address + return object_metadata def _move_and_get_checksums( self, @@ -674,13 +693,13 @@ def _move_and_get_checksums( additional_algorithm=None, checksum=None, checksum_algorithm=None, + file_size_to_validate=None, ): """Copy the contents of `stream` onto disk with an optional file extension appended. The copy process uses a temporary file to store the initial contents and returns a dictionary of algorithms and their hex digest values. If the file already exists, the method will immediately - return with is_duplicate: True and "None" for the remaining HashAddress - attributes. If an algorithm and checksum is provided, it will proceed to + raise an exception. If an algorithm and checksum is provided, it will proceed to validate the object (and delete the tmpFile if the hex digest stored does not match what is provided). @@ -694,33 +713,34 @@ def _move_and_get_checksums( checksum (str, optional): Optional checksum to validate object against hex digest before moving to permanent location. \n checksum_algorithm (str, optional): Algorithm value of given checksum. 
\n + file_size_to_validate (bytes, optional): Expected size of object Returns: - hash_address (HashAddress): object that contains the permanent address, - relative file path, absolute file path, duplicate file boolean and hex - digest dictionary. + object_metadata (tuple): object id, object file size, duplicate file + boolean and hex digest dictionary. """ entity = "objects" - ab_id = self.get_sha256_hex_digest(pid) - abs_file_path = self.build_abs_path(entity, ab_id, extension) - self.create_path(os.path.dirname(abs_file_path)) - # Only put file if it doesn't exist + object_cid = self.get_sha256_hex_digest(pid) + abs_file_path = self.build_abs_path(entity, object_cid, extension) + + # Only create tmp file to be moved if target destination doesn't exist if os.path.isfile(abs_file_path): - exception_string = f"File already exists for pid: {pid} at {abs_file_path}" - logging.error( - "FileHashStore - _move_and_get_checksums: %s", exception_string + exception_string = ( + "FileHashStore - _move_and_get_checksums: File already exists" + + f" for pid: {pid} at {abs_file_path}" ) + logging.error(exception_string) raise FileExistsError(exception_string) - rel_file_path = os.path.relpath(abs_file_path, self.objects) - # Create temporary file and calculate hex digests - debug_tmp_file_str = ( + debug_msg = ( "FileHashStore - _move_and_get_checksums: Creating temp" + f" file and calculating checksums for pid: {pid}" ) - logging.debug(debug_tmp_file_str) - hex_digests, tmp_file_name = self._mktempfile(stream, additional_algorithm) + logging.debug(debug_msg) + hex_digests, tmp_file_name, tmp_file_size = self._mktmpfile( + stream, additional_algorithm, checksum_algorithm + ) logging.debug( "FileHashStore - _move_and_get_checksums: Temp file created: %s", tmp_file_name, @@ -729,55 +749,53 @@ def _move_and_get_checksums( # Only move file if it doesn't exist. # Files are stored once and only once if not os.path.isfile(abs_file_path): - if checksum_algorithm is not None and checksum is not None: - hex_digest_stored = hex_digests[checksum_algorithm] - if hex_digest_stored != checksum: - self.delete(entity, tmp_file_name) - exception_string = ( - "Hex digest and checksum do not match - file not stored." - + f" Algorithm: {checksum_algorithm}." 
- + f" Checksum provided: {checksum} != Hex Digest: {hex_digest_stored}" - ) - logging.error( - "FileHashStore - _move_and_get_checksums: %s", exception_string - ) - raise ValueError(exception_string) - is_duplicate = False + self._validate_object( + pid, + checksum, + checksum_algorithm, + entity, + hex_digests, + tmp_file_name, + tmp_file_size, + file_size_to_validate, + ) + self.create_path(os.path.dirname(abs_file_path)) try: - debug_move_tmp_file_str = ( + debug_msg = ( "FileHashStore - _move_and_get_checksums: Moving temp file to permanent" + f" location: {abs_file_path}", ) - logging.debug(debug_move_tmp_file_str) + logging.debug(debug_msg) shutil.move(tmp_file_name, abs_file_path) except Exception as err: # Revert storage process - exception_string = f"Unexpected {err=}, {type(err)=}" - logging.error( - "FileHashStore - _move_and_get_checksums: %s", exception_string + exception_string = ( + "FileHashStore - _move_and_get_checksums:" + + f" Unexpected {err=}, {type(err)=}" ) + logging.error(exception_string) if os.path.isfile(abs_file_path): # Check to see if object has moved successfully before deleting - debug_file_found_exception_str = ( + debug_msg = ( "FileHashStore - _move_and_get_checksums: Permanent file" + f" found during exception, checking hex digest for pid: {pid}" ) - logging.debug(debug_file_found_exception_str) + logging.debug(debug_msg) pid_checksum = self.get_hex_digest(pid, self.algorithm) if pid_checksum == hex_digests.get(self.algorithm): # If the checksums match, return and log warning - warning_file_stored_str = ( + warning_msg = ( "FileHashStore - _move_and_get_checksums: File moved" + f" successfully but unexpected issue encountered: {exception_string}", ) - logging.warning(warning_file_stored_str) + logging.warning(warning_msg) return else: - debug_file_incomplete_state_str = ( + debug_msg = ( "FileHashStore - _move_and_get_checksums: Permanent file" + f" found but with incomplete state, deleting file: {abs_file_path}", ) - logging.debug(debug_file_incomplete_state_str) + logging.debug(debug_msg) self.delete(entity, abs_file_path) logging.debug( "FileHashStore - _move_and_get_checksums: Deleting temporary file: %s", @@ -786,46 +804,55 @@ def _move_and_get_checksums( self.delete(entity, tmp_file_name) err_msg = ( "Aborting store_object upload - an unexpected error has occurred when moving" - + f" file to: {ab_id} - Error: {err}" + + f" file to: {object_cid} - Error: {err}" ) logging.error("FileHashStore - _move_and_get_checksums: %s", err_msg) raise else: # Else delete temporary file - warning_duplicate_file_str = ( + warning_msg = ( f"FileHashStore - _move_and_get_checksums: Object exists at: {abs_file_path}," + " deleting temporary file." ) - logging.warning(warning_duplicate_file_str) - is_duplicate = True + logging.warning(warning_msg) self.delete(entity, tmp_file_name) - return ab_id, rel_file_path, abs_file_path, is_duplicate, hex_digests + return (object_cid, tmp_file_size, hex_digests) - def _mktempfile(self, stream, algorithm=None): - """Create a named temporary file from a `Stream` object and - return its filename and a dictionary of its algorithms and hex digests. - If an algorithm is provided, it will add the respective hex digest to - the dictionary. + def _mktmpfile(self, stream, additional_algorithm=None, checksum_algorithm=None): + """Create a named temporary file from a `Stream` object and return its filename + and a dictionary of its algorithms and hex digests. 
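# --- Illustrative sketch (not part of the patch) -------------------------------
# The "write to a tmp file, validate, then move" flow _move_and_get_checksums
# follows above; paths and the helper name are assumptions for this example.
import os
import shutil

def commit_object(tmp_file_name, abs_file_path):
    # Files are stored once and only once: move only if the target is absent.
    if not os.path.isfile(abs_file_path):
        os.makedirs(os.path.dirname(abs_file_path), exist_ok=True)
        shutil.move(tmp_file_name, abs_file_path)
    else:
        os.remove(tmp_file_name)   # duplicate object: discard the tmp file
# --------------------------------------------------------------------------------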
If an additionak and/or checksum + algorithm is provided, it will add the respective hex digest to the dictionary. Args: stream (io.BufferedReader): Object stream. - algorithm (string): Algorithm of additional hex digest to generate. + algorithm (string): Algorithm of additional hex digest to generate + checksum_algorithm (string): Algorithm of additional checksum algo to generate Returns: hex_digest_dict, tmp.name (tuple pack): hex_digest_dict (dictionary): Algorithms and their hex digests. tmp.name: Name of temporary file created and written into. """ - algorithm_list_to_calculate = self.default_algo_list + # Review additional hash object to digest and create new list + algorithm_list_to_calculate = self._refine_algorithm_list( + additional_algorithm, checksum_algorithm + ) - # Create temporary file in .../{store_path}/tmp tmp_root_path = self.get_store_path("objects") / "tmp" # Physically create directory if it doesn't exist if os.path.exists(tmp_root_path) is False: self.create_path(tmp_root_path) tmp = NamedTemporaryFile(dir=tmp_root_path, delete=False) + # Delete tmp file if python interpreter crashes or thread is interrupted + # when store_object is called + def delete_tmp_file(): + if os.path.exists(tmp.name): + os.remove(tmp.name) + + atexit.register(delete_tmp_file) + # Ensure tmp file is created with desired permissions if self.fmode is not None: oldmask = os.umask(0) @@ -834,112 +861,136 @@ def _mktempfile(self, stream, algorithm=None): finally: os.umask(oldmask) - # Additional hash object to digest - if algorithm is not None: - self.clean_algorithm(algorithm) - if algorithm in self.other_algo_list: - debug_additional_other_algo_str = ( - f"FileHashStore - _mktempfile: additional algorithm: {algorithm} found" - + " in other_algo_lists, adding to list of algorithms to calculate." 
- ) - logging.debug(debug_additional_other_algo_str) - algorithm_list_to_calculate.append(algorithm) - logging.debug( "FileHashStore - _mktempfile: tmp file created: %s, calculating hex digests.", tmp.name, ) - hash_algorithms = [ - hashlib.new(algorithm) for algorithm in algorithm_list_to_calculate - ] - # tmp is a file-like object that is already opened for writing by default - with tmp as tmp_file: - for data in stream: - tmp_file.write(self._to_bytes(data)) - for hash_algorithm in hash_algorithms: - hash_algorithm.update(self._to_bytes(data)) - logging.debug( - "FileHashStore - _mktempfile: Object stream successfully written to tmp file: %s", - tmp.name, - ) + tmp_file_completion_flag = False + try: + hash_algorithms = [ + hashlib.new(algorithm) for algorithm in algorithm_list_to_calculate + ] - hex_digest_list = [ - hash_algorithm.hexdigest() for hash_algorithm in hash_algorithms - ] - hex_digest_dict = dict(zip(algorithm_list_to_calculate, hex_digest_list)) + # tmp is a file-like object that is already opened for writing by default + with tmp as tmp_file: + for data in stream: + tmp_file.write(self._to_bytes(data)) + for hash_algorithm in hash_algorithms: + hash_algorithm.update(self._to_bytes(data)) + logging.debug( + "FileHashStore - _mktempfile: Object stream successfully written to tmp file: %s", + tmp.name, + ) - logging.debug("FileHashStore - _mktempfile: Hex digests calculated.") - return hex_digest_dict, tmp.name + hex_digest_list = [ + hash_algorithm.hexdigest() for hash_algorithm in hash_algorithms + ] + hex_digest_dict = dict(zip(algorithm_list_to_calculate, hex_digest_list)) + tmp_file_size = os.path.getsize(tmp.name) + # Ready for validation and atomic move + tmp_file_completion_flag = True + + logging.debug("FileHashStore - _mktempfile: Hex digests calculated.") + return hex_digest_dict, tmp.name, tmp_file_size + # pylint: disable=W0718 + except Exception as err: + exception_string = ( + f"FileHashStore - _mktempfile: Unexpected {err=}, {type(err)=}" + ) + logging.error(exception_string) + # pylint: disable=W0707,W0719 + raise Exception(exception_string) + except KeyboardInterrupt: + exception_string = ( + "FileHashStore - _mktempfile: Keyboard interruption by user." + ) + logging.error(exception_string) + if os.path.exists(tmp.name): + os.remove(tmp.name) + finally: + if not tmp_file_completion_flag: + try: + if os.path.exists(tmp.name): + os.remove(tmp.name) + # pylint: disable=W0718 + except Exception as err: + exception_string = ( + f"FileHashStore - _mktempfile: Unexpected {err=} while attempting to" + + f" delete tmp file: {tmp.name}, {type(err)=}" + ) + logging.error(exception_string) - def put_sysmeta(self, pid, sysmeta): - """Store contents of `sysmeta` on disk using the hash of the given pid + def put_metadata(self, metadata, pid, format_id): + """Store contents of metadata to `[self.root]/metadata` using the hash of the + given pid and format_id as the permanent address. Args: pid (string): Authority-based identifier. - sysmeta (mixed): String or path to sysmeta document. + format_id (string): Metadata format. + metadata (mixed): String or path to metadata document. Returns: - ab_id (string): Address of the sysmeta document. + metadata_cid (string): Address of the metadata document. 
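# --- Illustrative sketch (not part of the patch) -------------------------------
# Computing several hex digests in a single pass over the stream while writing the
# tmp file, as the loop above does; the file paths are assumptions for this example.
import hashlib

algorithm_list_to_calculate = ["md5", "sha1", "sha256", "sha384", "sha512"]
hash_algorithms = [hashlib.new(algo) for algo in algorithm_list_to_calculate]

with open("/path/to/object.data", "rb") as stream, open("/tmp/objfile", "wb") as tmp_file:
    for data in iter(lambda: stream.read(8192), b""):
        tmp_file.write(data)
        for hash_algorithm in hash_algorithms:
            hash_algorithm.update(data)

hex_digest_dict = dict(
    zip(algorithm_list_to_calculate, [h.hexdigest() for h in hash_algorithms])
)
# --------------------------------------------------------------------------------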
""" logging.debug( - "FileHashStore - put_sysmeta: Request to put sysmeta for pid: %s", pid + "FileHashStore - put_metadata: Request to put metadata for pid: %s", pid ) - - # Create tmp file and write to it - sysmeta_stream = Stream(sysmeta) - with closing(sysmeta_stream): - sysmeta_tmp = self._mktmpsysmeta(sysmeta_stream, self.sysmeta_ns) - - # Target path (permanent location) - ab_id = self.get_sha256_hex_digest(pid) - rel_path = "/".join(self.shard(ab_id)) - full_path = self.get_store_path("sysmeta") / rel_path - - # Move sysmeta to target path - if os.path.exists(sysmeta_tmp): + # Create metadata tmp file and write to it + metadata_stream = Stream(metadata) + with closing(metadata_stream): + metadata_tmp = self._mktmpmetadata(metadata_stream) + + # Get target and related paths (permanent location) + metadata_cid = self.get_sha256_hex_digest(pid + format_id) + rel_path = "/".join(self.shard(metadata_cid)) + full_path = self.get_store_path("metadata") / rel_path + + # Move metadata to target path + if os.path.exists(metadata_tmp): try: parent = full_path.parent parent.mkdir(parents=True, exist_ok=True) - # Sysmeta will be replaced if it exists - shutil.move(sysmeta_tmp, full_path) + # Metadata will be replaced if it exists + shutil.move(metadata_tmp, full_path) logging.debug( - "FileHashStore - put_sysmeta: Successfully put sysmeta for pid: %s", + "FileHashStore - put_metadata: Successfully put metadata for pid: %s", pid, ) - return ab_id + return metadata_cid except Exception as err: - exception_string = f"Unexpected {err=}, {type(err)=}" - logging.error("FileHashStore - put_sysmeta: %s", exception_string) - if os.path.exists(sysmeta_tmp): - # Remove tmp sysmeta, calling app must re-upload + exception_string = ( + f"FileHashStore - put_metadata: Unexpected {err=}, {type(err)=}" + ) + logging.error(exception_string) + if os.path.exists(metadata_tmp): + # Remove tmp metadata, calling app must re-upload logging.debug( - "FileHashStore - put_sysmeta: Deleting sysmeta for pid: %s", pid + "FileHashStore - put_metadata: Deleting metadata for pid: %s", + pid, ) - self.sysmeta.delete(sysmeta_tmp) - err_msg = f"Aborting store_sysmeta upload - an unexpected error has occurred: {err}" - logging.error("FileHashStore - put_sysmeta: %s", err_msg) + self.metadata.delete(metadata_tmp) raise else: exception_string = ( - f"Attempt to move sysmeta for pid: {pid}" - + f", but sysmeta temp file not found: {sysmeta_tmp}" + f"FileHashStore - put_metadata: Attempt to move metadata for pid: {pid}" + + f", but metadata temp file not found: {metadata_tmp}" ) - logging.error("FileHashStore - put_sysmeta: %s", exception_string) - raise FileNotFoundError() + logging.error(exception_string) + raise FileNotFoundError(exception_string) - def _mktmpsysmeta(self, stream, namespace): - """Create a named temporary file with `sysmeta` bytes and `namespace`. + def _mktmpmetadata(self, stream): + """Create a named temporary file with `stream` (metadata) and `format_id`. Args: - stream (io.BufferedReader): Sysmeta stream. - namespace (string): Format of sysmeta. + stream (io.BufferedReader): Metadata stream. + format_id (string): Format of metadata. Returns: - tmp.name (string): Name of temporary file created and written into. + tmp.name (string): Path/name of temporary file created and written into. 
""" # Create temporary file in .../{store_path}/tmp - tmp_root_path = self.get_store_path("sysmeta") / "tmp" + tmp_root_path = self.get_store_path("metadata") / "tmp" # Physically create directory if it doesn't exist if os.path.exists(tmp_root_path) is False: self.create_path(tmp_root_path) @@ -955,23 +1006,210 @@ def _mktmpsysmeta(self, stream, namespace): # tmp is a file-like object that is already opened for writing by default logging.debug( - "FileHashStore - _mktmpsysmeta: Writing stream to tmp sysmeta file: %s", + "FileHashStore - _mktmpmetadata: Writing stream to tmp metadata file: %s", tmp.name, ) with tmp as tmp_file: - tmp_file.write(namespace.encode("utf-8")) - tmp_file.write(b"\x00") for data in stream: tmp_file.write(self._to_bytes(data)) logging.debug( - "FileHashStore - _mktmpsysmeta: Successfully written to tmp sysmeta file: %s", + "FileHashStore - _mktmpmetadata: Successfully written to tmp metadata file: %s", tmp.name, ) return tmp.name # FileHashStore Utility & Supporting Methods + def _validate_data_to_store(self, data): + """Evaluates a data argument to ensure that it is either a string, path or + stream object before attempting to store it. + + Args: + data (string, path, stream): object to validate + """ + if ( + not isinstance(data, str) + and not isinstance(data, Path) + and not isinstance(data, io.BufferedIOBase) + ): + exception_string = ( + "FileHashStore - store_object: Data must be a path, string or buffered" + + f" stream type. Data type supplied: {type(data)}" + ) + logging.error(exception_string) + raise TypeError(exception_string) + if isinstance(data, str): + if data.replace(" ", "") == "": + exception_string = ( + "FileHashStore - store_object: Data string cannot be empty." + ) + logging.error(exception_string) + raise TypeError(exception_string) + + def _validate_algorithms_and_checksum( + self, additional_algorithm, checksum, checksum_algorithm + ): + """Determines whether calling app has supplied the necessary arguments to validate + an object with a checksum value + + Args: + additional_algorithm: value of additional algorithm to calculate + checksum (string): value of checksum + checksum_algorithm (string): algorithm of checksum + """ + additional_algorithm_checked = None + if additional_algorithm != self.algorithm and additional_algorithm is not None: + # Set additional_algorithm + additional_algorithm_checked = self.clean_algorithm(additional_algorithm) + checksum_algorithm_checked = None + if checksum is not None: + self._is_string_none_or_empty( + checksum_algorithm, + "checksum_algorithm", + "validate_checksum_args (store_object)", + ) + if checksum_algorithm is not None: + self._is_string_none_or_empty( + checksum, + "checksum", + "validate_checksum_args (store_object)", + ) + # Set checksum_algorithm + checksum_algorithm_checked = self.clean_algorithm(checksum_algorithm) + return additional_algorithm_checked, checksum_algorithm_checked + + def _refine_algorithm_list(self, additional_algorithm, checksum_algorithm): + """Create the final list of hash algorithms to calculate + + Args: + additional_algorithm (string) + checksum_algorithm (string) + + Return: + algorithm_list_to_calculate (set): De-duplicated list of hash algorithms + """ + algorithm_list_to_calculate = self.default_algo_list + if checksum_algorithm is not None: + self.clean_algorithm(checksum_algorithm) + if checksum_algorithm in self.other_algo_list: + debug_additional_other_algo_str = ( + f"FileHashStore - _mktempfile: checksum algorithm: {checksum_algorithm}" + + " found in 
other_algo_lists, adding to list of algorithms to calculate." + ) + logging.debug(debug_additional_other_algo_str) + algorithm_list_to_calculate.append(checksum_algorithm) + if additional_algorithm is not None: + self.clean_algorithm(additional_algorithm) + if additional_algorithm in self.other_algo_list: + debug_additional_other_algo_str = ( + f"FileHashStore - _mktempfile: additional algorithm: {additional_algorithm}" + + " found in other_algo_lists, adding to list of algorithms to calculate." + ) + logging.debug(debug_additional_other_algo_str) + algorithm_list_to_calculate.append(additional_algorithm) + + # Remove duplicates + algorithm_list_to_calculate = set(algorithm_list_to_calculate) + return algorithm_list_to_calculate + + def _validate_object( + self, + pid, + checksum, + checksum_algorithm, + entity, + hex_digests, + tmp_file_name, + tmp_file_size, + file_size_to_validate, + ): + """Evaluates an object's integrity + + Args: + pid: For logging purposes + checksum: Value of checksum + checksum_algorithm: Algorithm of checksum + entity: Type of object + hex_digests: Dictionary of hex digests to select from + tmp_file_name: Name of tmp file + tmp_file_size: Size of the tmp file + file_size_to_validate: Expected size of the object + """ + if file_size_to_validate is not None and file_size_to_validate > 0: + if file_size_to_validate != tmp_file_size: + self.delete(entity, tmp_file_name) + exception_string = ( + "FileHashStore - _move_and_get_checksums: Object file size calculated: " + + f" {tmp_file_size} does not match with expected size:" + + f"{file_size_to_validate}. Tmp file deleted and file not stored for" + + f" pid: {pid}" + ) + logging.error(exception_string) + raise ValueError(exception_string) + if checksum_algorithm is not None and checksum is not None: + hex_digest_stored = hex_digests[checksum_algorithm] + if hex_digest_stored != checksum: + self.delete(entity, tmp_file_name) + exception_string = ( + "FileHashStore - _move_and_get_checksums: Hex digest and checksum" + + f" do not match - file not stored for pid: {pid}. Algorithm:" + + f" {checksum_algorithm}. Checksum provided: {checksum} !=" + + f" HexDigest: {hex_digest_stored}. Tmp file deleted." + ) + logging.error(exception_string) + raise ValueError(exception_string) + + def _validate_metadata_to_store(self, metadata): + """Evaluates a metadata argument to ensure that it is either a string, path or + stream object before attempting to store it. + + Args: + metadata (string, path, stream): metadata to validate + """ + if isinstance(metadata, str): + if metadata.replace(" ", "") == "": + exception_string = ( + "FileHashStore - store_metadata: Given string path to" + + " metadata cannot be empty." + ) + logging.error(exception_string) + raise TypeError(exception_string) + if ( + not isinstance(metadata, str) + and not isinstance(metadata, Path) + and not isinstance(metadata, io.BufferedIOBase) + ): + exception_string = ( + "FileHashStore - store_metadata: Metadata must be a path or string" + + f" type, data type supplied: {type(metadata)}" + ) + logging.error(exception_string) + raise TypeError(exception_string) + + def _validate_format_id(self, format_id, method): + """Determines the metadata namespace (format_id) to use for storing, + retrieving and deleting metadata. 
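# --- Illustrative sketch (not part of the patch) -------------------------------
# The integrity checks _validate_object performs above before an object is moved
# to its permanent address; argument handling is simplified for this example.
def validate_object(hex_digests, tmp_file_size, checksum=None,
                    checksum_algorithm=None, file_size_to_validate=None):
    if file_size_to_validate is not None and file_size_to_validate > 0:
        if file_size_to_validate != tmp_file_size:
            raise ValueError("Object file size does not match the expected size")
    if checksum is not None and checksum_algorithm is not None:
        if hex_digests[checksum_algorithm] != checksum:
            raise ValueError("Hex digest and checksum do not match")
# --------------------------------------------------------------------------------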
+ + Args: + format_id (string): Metadata namespace to review + method (string): Calling method for logging purposes + + Returns: + checked_format_id (string): Valid metadata namespace + """ + checked_format_id = None + if format_id is not None and format_id.replace(" ", "") == "": + exception_string = f"FileHashStore - {method}: Format_id cannot be empty." + logging.error(exception_string) + raise ValueError(exception_string) + elif format_id is None: + # Use default value set by hashstore config + checked_format_id = self.sysmeta_ns + else: + checked_format_id = format_id + return checked_format_id + def clean_algorithm(self, algorithm_string): """Format a string and ensure that it is supported and compatible with the python hashlib library. @@ -995,8 +1233,11 @@ def clean_algorithm(self, algorithm_string): cleaned_string not in self.default_algo_list and cleaned_string not in self.other_algo_list ): - exception_string = f"Algorithm not supported: {cleaned_string}" - logging.error("FileHashStore: clean_algorithm: %s", exception_string) + exception_string = ( + "FileHashStore: clean_algorithm: Algorithm not supported:" + + cleaned_string + ) + logging.error(exception_string) raise ValueError(exception_string) return cleaned_string @@ -1005,7 +1246,7 @@ def computehash(self, stream, algorithm=None): or with optional algorithm supported. Args: - stream (io.BufferedReader): A buffered stream of an ab_id object. \n + stream (io.BufferedReader): A buffered stream of an object_cid object. \n algorithm (string): Algorithm of hex digest to generate. Returns: @@ -1025,22 +1266,22 @@ def get_store_path(self, entity): """Return a path object of the root directory of the store. Args: - entity (str): Desired entity type (ex. "objects", "sysmeta"). + entity (str): Desired entity type: "objects" or "metadata" """ if entity == "objects": return Path(self.objects) - elif entity == "sysmeta": - return Path(self.sysmeta) + elif entity == "metadata": + return Path(self.metadata) else: raise ValueError( - f"entity: {entity} does not exist. Do you mean 'objects' or 'sysmeta'?" + f"entity: {entity} does not exist. Do you mean 'objects' or 'metadata'?" ) def exists(self, entity, file): """Check whether a given file id or path exists on disk. Args: - entity (str): Desired entity type (ex. "objects", "sysmeta"). \n + entity (str): Desired entity type (ex. "objects", "metadata"). \n file (str): The name of the file to check. Returns: @@ -1082,7 +1323,7 @@ def open(self, entity, file, mode="rb"): for closing the stream. Args: - entity (str): Desired entity type (ex. "objects", "sysmeta"). \n + entity (str): Desired entity type (ex. "objects", "metadata"). \n file (str): Address ID or path of file. \n mode (str, optional): Mode to open file in. Defaults to 'rb'. @@ -1103,7 +1344,7 @@ def delete(self, entity, file): deleting. No exception is raised if file doesn't exist. Args: - entity (str): Desired entity type (ex. "objects", "sysmeta"). \n + entity (str): Desired entity type (ex. "objects", "metadata"). \n file (str): Address ID or path of file. """ realpath = self.get_real_path(entity, file) @@ -1152,7 +1393,7 @@ def _has_subdir(self, path): return is_subdir def create_path(self, path): - """Physically create the folder path on disk. + """Physically create the folder path (and all intermediate ones) on disk. Args: path (str): The path to create. @@ -1172,7 +1413,7 @@ def get_real_path(self, entity, file): the expected file path of the id. Args: - entity (str): desired entity type (ex. "objects", "sysmeta"). 
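# --- Illustrative sketch (not part of the patch) -------------------------------
# How the metadata namespace (format_id) defaults when the caller omits it, as in
# _validate_format_id above; `sysmeta_ns` stands in for the configured
# store_metadata_namespace.
def resolve_format_id(format_id, sysmeta_ns):
    if format_id is not None and format_id.replace(" ", "") == "":
        raise ValueError("format_id cannot be empty")
    return sysmeta_ns if format_id is None else format_id

resolve_format_id(None, "http://ns.dataone.org/service/types/v2.0")
# -> "http://ns.dataone.org/service/types/v2.0"
# --------------------------------------------------------------------------------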
\n + entity (str): desired entity type (ex. "objects", "metadata"). \n file (string): Name of file. Returns: @@ -1186,11 +1427,11 @@ def get_real_path(self, entity, file): rel_root = "" if entity == "objects": rel_root = self.objects - elif entity == "sysmeta": - rel_root = self.sysmeta + elif entity == "metadata": + rel_root = self.metadata else: raise ValueError( - f"entity: {entity} does not exist. Do you mean 'objects' or 'sysmeta'?" + f"entity: {entity} does not exist. Do you mean 'objects' or 'metadata'?" ) relpath = os.path.join(rel_root, file) if os.path.isfile(relpath): @@ -1204,18 +1445,18 @@ def get_real_path(self, entity, file): # Could not determine a match. return None - def build_abs_path(self, entity, ab_id, extension=""): + def build_abs_path(self, entity, cid, extension=""): """Build the absolute file path for a given hash id with an optional file extension. Args: - entity (str): Desired entity type (ex. "objects", "sysmeta"). \n - ab_id (str): A hash id to build a file path for. \n + entity (str): Desired entity type (ex. "objects", "metadata"). \n + cid (str): A hash id to build a file path for. \n extension (str): An optional file extension to append to the file path. Returns: absolute_path (str): An absolute file path for the specified hash id. """ - paths = self.shard(ab_id) + paths = self.shard(cid) root_dir = self.get_store_path(entity) if extension and not extension.startswith(os.extsep): @@ -1230,7 +1471,7 @@ def count(self, entity): """Return count of the number of files in the `root` directory. Args: - entity (str): Desired entity type (ex. "objects", "sysmeta"). + entity (str): Desired entity type (ex. "objects", "metadata"). Returns: count (int): Number of files in the directory. @@ -1239,11 +1480,11 @@ def count(self, entity): directory_to_count = "" if entity == "objects": directory_to_count = self.objects - elif entity == "sysmeta": - directory_to_count = self.sysmeta + elif entity == "metadata": + directory_to_count = self.metadata else: raise ValueError( - f"entity: {entity} does not exist. Do you mean 'objects' or 'sysmeta'?" + f"entity: {entity} does not exist. Do you mean 'objects' or 'metadata'?" ) for _, _, files in os.walk(directory_to_count): @@ -1253,6 +1494,45 @@ def count(self, entity): # Other Static Methods + @staticmethod + def _validate_file_size(file_size): + """Checks whether a file size is > 0 and an int and throws exception if not. + + Args: + file_size (int): file size to check + """ + if file_size is not None: + if not isinstance(file_size, int): + exception_string = ( + "FileHashStore - _is_file_size_valid: size given must be an integer." + + f" File size: {file_size}. Arg Type: {type(file_size)}." + ) + logging.error(exception_string) + raise TypeError(exception_string) + if file_size < 1 or not isinstance(file_size, int): + exception_string = ( + "FileHashStore - _is_file_size_valid: size given must be > 0" + ) + logging.error(exception_string) + raise ValueError(exception_string) + + @staticmethod + def _is_string_none_or_empty(string, arg, method): + """Checks whether a string is None or empty and throws an exception if so. + + Args: + string (string): Value to check + arg (): Name of argument to check + method (string): Calling method for logging purposes + """ + if string is None or string.replace(" ", "") == "": + exception_string = ( + f"FileHashStore - {method}: {arg} cannot be None" + + f" or empty, {arg}: {string}." 
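# --- Illustrative sketch (not part of the patch) -------------------------------
# How a content identifier shards into a relative path with the default depth (3)
# and width (2), as used by build_abs_path above; the real shard() lives on the
# store instance, so this is only an approximation of the directory layout shown
# in the hashstore.yaml comments.
def shard(cid, depth=3, width=2):
    tokens = [cid[i * width:(i + 1) * width] for i in range(depth)]
    tokens.append(cid[depth * width:])
    return tokens

rel_path = "/".join(shard("7f5cc18f0b04e812a3b4c8f686ce34e6fec558804bf61e54b176742a7f6368d6"))
# -> "7f/5c/c1/8f0b04e812a3b4c8f686ce34e6fec558804bf61e54b176742a7f6368d6"
# --------------------------------------------------------------------------------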
+ ) + logging.error(exception_string) + raise ValueError(exception_string) + @staticmethod def _to_bytes(text): """Convert text to sequence of bytes using utf-8 encoding. diff --git a/src/hashstore/hashaddress.py b/src/hashstore/hashaddress.py deleted file mode 100644 index 71cbac48..00000000 --- a/src/hashstore/hashaddress.py +++ /dev/null @@ -1,27 +0,0 @@ -"""HashAddress must be returned for all HashStore implementations""" -from collections import namedtuple - - -class HashAddress( - namedtuple( - "HashAddress", ["id", "relpath", "abspath", "is_duplicate", "hex_digests"] - ) -): - """File address containing file's path on disk and its content hash ID. - - Args: - ab_id (str): Hash ID (hexdigest) of file contents. - relpath (str): Relative path location to :attr:`HashFS.root`. - abspath (str): Absolute path location of file on disk. - is_duplicate (boolean, optional): Whether the hash address created was - a duplicate of a previously existing file. Can only be ``True`` - after a put operation. Defaults to ``False``. - hex_digests (dict, optional): A list of hex digests to validate objects - (md5, sha1, sha256, sha384, sha512) - """ - - # Default value to prevent dangerous default value - def __new__(cls, ab_id, relpath, abspath, is_duplicate=False, hex_digests=None): - return super(HashAddress, cls).__new__( - cls, ab_id, relpath, abspath, is_duplicate, hex_digests - ) diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index 25a5979e..6c704209 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -1,5 +1,6 @@ """Hashstore Interface""" from abc import ABC, abstractmethod +from collections import namedtuple import importlib.metadata @@ -22,14 +23,15 @@ def store_object( additional_algorithm, checksum, checksum_algorithm, + expected_object_size, ): """The `store_object` method is responsible for the atomic storage of objects to disk using a given InputStream and a persistent identifier (pid). Upon - successful storage, the method returns a HashAddress object containing - relevant file information, such as the file's id, relative path, absolute - path, duplicate object status, and hex digest map of algorithms and - checksums. `store_object` also ensures that an object is stored only once by - synchronizing multiple calls and rejecting calls to store duplicate objects. + successful storage, the method returns a ObjectMetadata object containing + relevant file information, such as the file's id (which can be used to locate the + object on disk), the file's size, and a hex digest map of algorithms and checksums. + `store_object` also ensures that an object is stored only once by synchronizing + multiple calls and rejecting calls to store duplicate objects. The file's id is determined by calculating the SHA-256 hex digest of the provided pid, which is also used as the permanent address of the file. The @@ -45,8 +47,8 @@ def store_object( with its corresponding hex digest. An algorithm is considered "supported" if it is recognized as a valid hash algorithm in the `hashlib` library. - Similarly, if a checksum and a checksumAlgorithm value are provided, - `store_object` validates the object to ensure it matches what is provided + Similarly, if a file size and/or checksum & checksumAlgorithm value are provided, + `store_object` validates the object to ensure it matches the given arguments before moving the file to its permanent address. Args: @@ -55,33 +57,32 @@ def store_object( additional_algorithm (string): Additional hex digest to include. 
checksum (string): Checksum to validate against. checksum_algorithm (string): Algorithm of supplied checksum. + expected_object_size (int): Size of object to verify Returns: - address (HashAddress): Object that contains the permanent address, relative - file path, absolute file path, duplicate file boolean and hex digest dictionary. + object_metadata (ObjectMetadata): Object that contains the permanent address, + file size, duplicate file boolean and hex digest dictionary. """ raise NotImplementedError() @abstractmethod - def store_sysmeta(self, pid, sysmeta): - """The `store_sysmeta` method is responsible for adding and/or updating metadata - (`sysmeta`) to disk using a given InputStream and a persistent identifier - (pid). The metadata object consists of a header and body portion. The header - is formed by writing the namespace/format (utf-8) of the metadata document - followed by a null character `\x00` and the body follows immediately after. - - Upon successful storage of sysmeta, the method returns a String that - represents the file's permanent address, and similarly to 'store_object', this - permanent address is determined by calculating the SHA-256 hex digest of the - provided pid. Finally, sysmeta are stored in parallel to objects in the - `/store_directory/sysmeta/` directory. + def store_metadata(self, pid, metadata, format_id): + """The `store_metadata` method is responsible for adding and/or updating metadata + (ex. `sysmeta`) to disk using a given path/stream, a persistent identifier `pid` + and a metadata `format_id`. The metadata object's permanent address, which is + determined by calculating the SHA-256 hex digest of the provided `pid` + `format_id`. + + Upon successful storage of metadata, `store_metadata` returns a string that + represents the file's permanent address. Lastly, the metadata objects are stored + in parallel to objects in the `/store_directory/metadata/` directory. Args: pid (string): Authority-based identifier. - sysmeta (mixed): String or path to sysmeta document. + format_id (string): Metadata format + metadata (mixed): String or path to metadata document. Returns: - sysmeta_cid (string): Address of the sysmeta document. + metadata_cid (string): Address of the metadata document. """ raise NotImplementedError() @@ -96,20 +97,24 @@ def retrieve_object(self, pid): pid (string): Authority-based identifier. Returns: - obj_stream (io.BufferedReader): A buffered stream of an ab_id object. + obj_stream (io.BufferedReader): A buffered stream of a data object. """ raise NotImplementedError() @abstractmethod - def retrieve_sysmeta(self, pid): - """The 'retrieve_sysmeta' method retrieves the metadata content from disk and - returns it in the form of a String using a given persistent identifier. + def retrieve_metadata(self, pid, format_id): + """The 'retrieve_metadata' method retrieves the metadata object from disk using + a given persistent identifier (pid) and metadata namespace (format_id). + If the object exists (determined by calculating the metadata object's permanent + address using the SHA-256 hash of the given pid+format_id), the method will open + and return a buffered metadata stream ready to read from. Args: - pid (string): Authority-based identifier. + pid (string): Authority-based identifier + format_id (string): Metadata format Returns: - sysmeta (string): Sysmeta content. + metadata_stream (io.BufferedReader): A buffered stream of a metadata object. 
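# --- Illustrative sketch (not part of the patch) -------------------------------
# The importlib-based lookup that HashStoreFactory.get_hashstore (defined further
# below) performs; module and class names follow the docstring examples.
import importlib
import importlib.util

module_name = "hashstore.filehashstore"
class_name = "FileHashStore"

if importlib.util.find_spec(module_name) is None:
    raise ModuleNotFoundError(f"No module found for '{module_name}'")
imported_module = importlib.import_module(module_name)
if not hasattr(imported_module, class_name):
    raise AttributeError(
        f"Class name '{class_name}' is not an attribute of module '{module_name}'"
    )
hashstore_class = getattr(imported_module, class_name)
# --------------------------------------------------------------------------------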
""" raise NotImplementedError() @@ -127,12 +132,13 @@ def delete_object(self, pid): raise NotImplementedError() @abstractmethod - def delete_sysmeta(self, pid): - """The 'delete_sysmeta' method deletes a metadata document (sysmeta) permanently - from disk using a given persistent identifier. + def delete_metadata(self, pid, format_id): + """The 'delete_metadata' method deletes a metadata document permanently + from disk using a given persistent identifier and format_id. Args: - pid (string): Authority-based identifier. + pid (string): Authority-based identifier + format_id (string): Metadata format Returns: boolean: `True` upon successful deletion. @@ -152,3 +158,68 @@ def get_hex_digest(self, pid, algorithm): hex_digest (string): Hex digest of the object. """ raise NotImplementedError() + + +class HashStoreFactory: + """A factory class for creating `HashStore`-like objects (classes + that implement the 'HashStore' abstract methods) + + This factory class provides a method to retrieve a `HashStore` object + based on a given module (ex. "hashstore.filehashstore.filehashstore") + and class name (ex. "FileHashStore"). + """ + + @staticmethod + def get_hashstore(module_name, class_name, properties=None): + """Get a `HashStore`-like object based on the specified `module_name` and `class_name`. + + Args: + module_name (str): Name of package (ex. "hashstore.filehashstore") \n + class_name (str): Name of class in the given module (ex. "FileHashStore") \n + properties (dict, optional): Desired HashStore properties, if 'None', default values + will be used. \n + Example Properties Dictionary: + { + "store_path": "var/metacat",\n + "store_depth": 3,\n + "store_width": 2,\n + "store_algorithm": "sha256",\n + "store_sysmeta_namespace": "http://ns.dataone.org/service/types/v2.0"\n + } + + Returns: + HashStore: A hash store object based on the given `module_name` and `class_name` + + Raises: + ModuleNotFoundError: If module is not found + AttributeError: If class does not exist within the module + """ + # Validate module + if importlib.util.find_spec(module_name) is None: + raise ModuleNotFoundError(f"No module found for '{module_name}'") + + # Get HashStore + imported_module = importlib.import_module(module_name) + + # If class is not part of module, raise error + if hasattr(imported_module, class_name): + hashstore_class = getattr(imported_module, class_name) + return hashstore_class(properties=properties) + raise AttributeError( + f"Class name '{class_name}' is not an attribute of module '{module_name}'" + ) + + +class ObjectMetadata(namedtuple("ObjectMetadata", ["id", "obj_size", "hex_digests"])): + """File address containing file's path on disk and its content hash ID. + + Args: + ab_id (str): Hash ID (hexdigest) of file contents. 
+ obj_size (bytes): Size of the object + hex_digests (dict, optional): A list of hex digests to validate objects + (md5, sha1, sha256, sha384, sha512) + """ + + # Default value to prevent dangerous default value + def __new__(cls, ab_id, obj_size, hex_digests=None): + return super(ObjectMetadata, cls).__new__(cls, ab_id, obj_size, hex_digests) diff --git a/src/hashstore/hashstore_factory.py b/src/hashstore/hashstore_factory.py deleted file mode 100644 index bcdeff0c..00000000 --- a/src/hashstore/hashstore_factory.py +++ /dev/null @@ -1,52 +0,0 @@ -"""Core module for HashStore Factory""" -import importlib - - -class HashStoreFactory: - """A factory class for creating `HashStore`-like objects (classes - that implement the 'HashStore' abstract methods) - - This factory class provides a method to retrieve a `HashStore` object - based on a given module (ex. "hashstore.filehashstore.filehashstore") - and class name (ex. "FileHashStore"). - """ - - @staticmethod - def get_hashstore(module_name, class_name, properties=None): - """Get a `HashStore`-like object based on the specified `module_name` and `class_name`. - - Args: - module_name (str): Name of package (ex. "hashstore.filehashstore.filehashstore") \n - class_name (str): Name of class in the given module (ex. "FileHashStore") \n - properties (dict, optional): Desired HashStore properties, if 'None', default values - will be used. \n - Example Properties Dictionary: - { - "store_path": "var/metacat",\n - "store_depth": 3,\n - "store_width": 2,\n - "store_algorithm": "sha256",\n - "store_sysmeta_namespace": "http://ns.dataone.org/service/types/v2.0"\n - } - - Returns: - HashStore: A hash store object based on the given `module_name` and `class_name` - - Raises: - ModuleNotFoundError: If module is not found - AttributeError: If class does not exist within the module - """ - # Validate module - if importlib.util.find_spec(module_name) is None: - raise ModuleNotFoundError(f"No module found for '{module_name}'") - - # Get HashStore - imported_module = importlib.import_module(module_name) - - # If class is not part of module, raise error - if hasattr(imported_module, class_name): - hashstore_class = getattr(imported_module, class_name) - return hashstore_class(properties=properties) - raise AttributeError( - f"Class name '{class_name}' is not an attribute of module '{module_name}'" - ) diff --git a/tests/conftest.py b/tests/conftest.py index e8c1bcf4..9b25c520 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,6 @@ """Pytest overall configuration file for fixtures""" import pytest -from hashstore.filehashstore.filehashstore import FileHashStore +from hashstore.filehashstore import FileHashStore def pytest_addoption(parser): @@ -25,8 +25,8 @@ def init_props(tmp_path): "store_path": hashstore_path, "store_depth": 3, "store_width": 2, - "store_algorithm": "sha256", - "store_sysmeta_namespace": "http://ns.dataone.org/service/types/v2.0", + "store_algorithm": "SHA-256", + "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", } return properties @@ -40,10 +40,15 @@ def init_store(props): @pytest.fixture(name="pids") def init_pids(): - """Shared test harness data.""" + """Shared test harness data. 
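+    (Illustrative note on how the cid values below are assumed to be derived, matching
+    the assertions elsewhere in this test suite: object_cid is expected to equal
+    hashlib.sha256(pid.encode("utf-8")).hexdigest(), and metadata_cid the same digest
+    computed over pid + store_metadata_namespace.)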
+ - object_cid: hex digest of the pid + - metadata_cid: hex digest of the pid + store_metadata_namespace + """ test_pids = { "doi:10.18739/A2901ZH2M": { - "ab_id": "0d555ed77052d7e166017f779cbc193357c3a5006ee8b8457230bcf7abcef65e", + "file_size_bytes": 39993, + "object_cid": "0d555ed77052d7e166017f779cbc193357c3a5006ee8b8457230bcf7abcef65e", + "metadata_cid": "323e0799524cec4c7e14d31289cefd884b563b5c052f154a066de5ec1e477da7", "md5": "db91c910a3202478c8def1071c54aae5", "sha1": "1fe86e3c8043afa4c70857ca983d740ad8501ccd", "sha224": "922b1e86f83d3ea3060fd0f7b2cf04476e8b3ddeaa3cf48c2c3cf502", @@ -52,7 +57,9 @@ def init_pids(): "sha512": "e9bcd6b91b102ef5803d1bd60c7a5d2dbec1a2baf5f62f7da60de07607ad6797d6a9b740d97a257fd2774f2c26503d455d8f2a03a128773477dfa96ab96a2e54", }, "jtao.1700.1": { - "ab_id": "a8241925740d5dcd719596639e780e0a090c9d55a5d0372b0eaf55ed711d4edf", + "file_size_bytes": 8724, + "object_cid": "a8241925740d5dcd719596639e780e0a090c9d55a5d0372b0eaf55ed711d4edf", + "metadata_cid": "ddf07952ef28efc099d10d8b682480f7d2da60015f5d8873b6e1ea75b4baf689", "md5": "f4ea2d07db950873462a064937197b0f", "sha1": "3d25436c4490b08a2646e283dada5c60e5c0539d", "sha224": "9b3a96f434f3c894359193a63437ef86fbd5a1a1a6cc37f1d5013ac1", @@ -61,7 +68,9 @@ def init_pids(): "sha512": "bf9e7f4d4e66bd082817d87659d1d57c2220c376cd032ed97cadd481cf40d78dd479cbed14d34d98bae8cebc603b40c633d088751f07155a94468aa59e2ad109", }, "urn:uuid:1b35d0a5-b17a-423b-a2ed-de2b18dc367a": { - "ab_id": "7f5cc18f0b04e812a3b4c8f686ce34e6fec558804bf61e54b176742a7f6368d6", + "file_size_bytes": 18699, + "object_cid": "7f5cc18f0b04e812a3b4c8f686ce34e6fec558804bf61e54b176742a7f6368d6", + "metadata_cid": "9a2e08c666b728e6cbd04d247b9e556df3de5b2ca49f7c5a24868eb27cddbff2", "md5": "e1932fc75ca94de8b64f1d73dc898079", "sha1": "c6d2a69a3f5adaf478ba796c114f57b990cf7ad1", "sha224": "f86491d23d25dbaf7620542f056aba8a092a70be625502a6afd1fde0", diff --git a/tests/filehashstore/test_filehashstore.py b/tests/test_filehashstore.py similarity index 55% rename from tests/filehashstore/test_filehashstore.py rename to tests/test_filehashstore.py index f36bb2ce..a2f0fdfe 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -3,22 +3,101 @@ import os from pathlib import Path import pytest -from hashstore.filehashstore.filehashstore import FileHashStore +from hashstore.filehashstore import FileHashStore -def test_init_put_properties_hashstore_yaml_exists(store): - """Verify properties file present in store root directory.""" +def test_pids_length(pids): + """Ensure test harness pids are present.""" + assert len(pids) == 3 + + +def test_init_directories_created(store): + """Confirm that object and metadata directories have been created.""" + assert os.path.exists(store.root) + assert os.path.exists(store.objects) + assert os.path.exists(store.objects + "/tmp") + assert os.path.exists(store.metadata) + assert os.path.exists(store.metadata + "/tmp") + + +def test_init_existing_store_incorrect_algorithm_format(store): + """Confirm that exception is thrown when store_algorithm is not a DataONE controlled value""" + properties = { + "store_path": store.root, + "store_depth": 3, + "store_width": 2, + "store_algorithm": "sha256", + "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", + } + with pytest.raises(ValueError): + FileHashStore(properties) + + +def test_init_existing_store_correct_algorithm_format(store): + """Confirm second instance of HashStore with DataONE controlled value""" + properties = { + "store_path": 
store.root, + "store_depth": 3, + "store_width": 2, + "store_algorithm": "SHA-256", + "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", + } + hashstore_instance = FileHashStore(properties) + assert isinstance(hashstore_instance, FileHashStore) + + +def test_init_write_properties_hashstore_yaml_exists(store): + """Verify config file present in store root directory.""" assert os.path.exists(store.hashstore_configuration_yaml) -def test_init_with_existing_hashstore_mismatched_config(store): +def test_init_with_existing_hashstore_mismatched_config_depth(store): """Test init with existing HashStore raises ValueError with mismatching properties.""" properties = { "store_path": store.root, "store_depth": 1, "store_width": 2, - "store_algorithm": "sha256", - "store_sysmeta_namespace": "http://ns.dataone.org/service/types/v2.0", + "store_algorithm": "SHA-256", + "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", + } + with pytest.raises(ValueError): + FileHashStore(properties) + + +def test_init_with_existing_hashstore_mismatched_config_width(store): + """Test init with existing HashStore raises ValueError with mismatching properties.""" + properties = { + "store_path": store.root, + "store_depth": 3, + "store_width": 1, + "store_algorithm": "SHA-256", + "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", + } + with pytest.raises(ValueError): + FileHashStore(properties) + + +def test_init_with_existing_hashstore_mismatched_config_algo(store): + """Test init with existing HashStore raises ValueError with mismatching properties.""" + properties = { + "store_path": store.root, + "store_depth": 3, + "store_width": 1, + "store_algorithm": "SHA-512", + "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", + } + with pytest.raises(ValueError): + FileHashStore(properties) + + +def test_init_with_existing_hashstore_mismatched_config_metadata_ns(store): + """Test init with existing HashStore raises ValueError with mismatching properties.""" + properties = { + "store_path": store.root, + "store_depth": 3, + "store_width": 1, + "store_algorithm": "SHA-512", + "store_metadata_namespace": "http://ns.dataone.org/service/types/v5.0", } with pytest.raises(ValueError): FileHashStore(properties) @@ -36,31 +115,30 @@ def test_init_with_existing_hashstore_missing_yaml(store, pids): "store_path": store.root, "store_depth": 3, "store_width": 2, - "store_algorithm": "sha256", - "store_sysmeta_namespace": "http://ns.dataone.org/service/types/v2.0", + "store_algorithm": "SHA-256", + "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", } with pytest.raises(FileNotFoundError): FileHashStore(properties) -def test_get_properties(store): - """Verify dictionary returned from get_properties matches initialization.""" - hashstore_yaml_dict = store.get_properties() - assert hashstore_yaml_dict.get("store_path") == store.root +def test_load_properties(store): + """Verify dictionary returned from load_properties matches initialization.""" + hashstore_yaml_dict = store.load_properties() assert hashstore_yaml_dict.get("store_depth") == 3 assert hashstore_yaml_dict.get("store_width") == 2 - assert hashstore_yaml_dict.get("store_algorithm") == "sha256" + assert hashstore_yaml_dict.get("store_algorithm") == "SHA-256" assert ( - hashstore_yaml_dict.get("store_sysmeta_namespace") + hashstore_yaml_dict.get("store_metadata_namespace") == "http://ns.dataone.org/service/types/v2.0" ) -def test_get_properties_hashstore_yaml_missing(store): +def 
test_load_properties_hashstore_yaml_missing(store): """Confirm FileNotFoundError is raised when hashstore.yaml does not exist.""" os.remove(store.hashstore_configuration_yaml) with pytest.raises(FileNotFoundError): - store.get_properties() + store.load_properties() def test_validate_properties(store): @@ -69,8 +147,8 @@ def test_validate_properties(store): "store_path": "/etc/test", "store_depth": 3, "store_width": 2, - "store_algorithm": "sha256", - "store_sysmeta_namespace": "http://ns.dataone.org/service/types/v2.0", + "store_algorithm": "SHA-256", + "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", } # pylint: disable=W0212 assert store._validate_properties(properties) @@ -82,7 +160,7 @@ def test_validate_properties_missing_key(store): "store_path": "/etc/test", "store_depth": 3, "store_width": 2, - "store_algorithm": "sha256", + "store_algorithm": "SHA-256", } with pytest.raises(KeyError): # pylint: disable=W0212 @@ -90,13 +168,13 @@ def test_validate_properties_missing_key(store): def test_validate_properties_key_value_is_none(store): - """Confirm exception raised when value from key is 'None'""" + """Confirm exception raised when value from key is 'None'.""" properties = { "store_path": "/etc/test", "store_depth": 3, "store_width": 2, - "store_algorithm": "sha256", - "store_sysmeta_namespace": None, + "store_algorithm": "SHA-256", + "store_metadata_namespace": None, } with pytest.raises(ValueError): # pylint: disable=W0212 @@ -105,15 +183,23 @@ def test_validate_properties_key_value_is_none(store): def test_validate_properties_incorrect_type(store): """Confirm exception raised when key missing in properties.""" - properties = "etc/filehashstore" + properties = "etc/filehashstore/hashstore.yaml" with pytest.raises(ValueError): # pylint: disable=W0212 store._validate_properties(properties) -def test_pids_length(pids): - """Ensure test harness pids are present.""" - assert len(pids) == 3 +def test_set_default_algorithms_missing_yaml(store, pids): + """Confirm set_default_algorithms raises FileNotFoundError when hashstore.yaml + not found.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + store.put_object(pid, path) + os.remove(store.hashstore_configuration_yaml) + with pytest.raises(FileNotFoundError): + # pylint: disable=W0212 + store._set_default_algorithms() def test_put_object_files_path(pids, store): @@ -122,9 +208,9 @@ def test_put_object_files_path(pids, store): entity = "objects" for pid in pids.keys(): path = Path(test_dir) / pid.replace("/", "_") - hash_address = store.put_object(pid, path) - hashaddress_id = hash_address.id - assert store.exists(entity, hashaddress_id) + object_metadata = store.put_object(pid, path) + object_metadata_id = object_metadata.id + assert store.exists(entity, object_metadata_id) def test_put_object_files_string(pids, store): @@ -133,9 +219,9 @@ def test_put_object_files_string(pids, store): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - hash_address = store.put_object(pid, path) - hashaddress_id = hash_address.id - assert store.exists(entity, hashaddress_id) + object_metadata = store.put_object(pid, path) + object_metadata_id = object_metadata.id + assert store.exists(entity, object_metadata_id) def test_put_object_files_stream(pids, store): @@ -145,56 +231,31 @@ def test_put_object_files_stream(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") - hash_address = 
store.put_object(pid, input_stream) + object_metadata = store.put_object(pid, input_stream) input_stream.close() - hashaddress_id = hash_address.id - assert store.exists(entity, hashaddress_id) + object_metadata_id = object_metadata.id + assert store.exists(entity, object_metadata_id) assert store.count(entity) == 3 -def test_put_object_id(pids, store): +def test_put_object_cid(pids, store): """Check put returns correct id.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - hashaddress = store.put_object(pid, path) - hashaddress_id = hashaddress.id - assert hashaddress_id == pids[pid]["ab_id"] + object_metadata = store.put_object(pid, path) + object_metadata_id = object_metadata.id + assert object_metadata_id == pids[pid]["object_cid"] -def test_put_object_relpath(pids, store): - """Check put returns correct relative path.""" +def test_put_object_file_size(pids, store): + """Check put returns correct file size.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - hashaddress = store.put_object(pid, path) - hashaddress_id = hashaddress.id - hashaddress_relpath = hashaddress.relpath - shard_id_path = "/".join(store.shard(hashaddress_id)) - assert hashaddress_relpath == shard_id_path - - -def test_put_object_abspath(pids, store): - """Check put returns correct absolute path.""" - test_dir = "tests/testdata/" - entity = "objects" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - hashaddress = store.put_object(pid, path) - hashaddress_id = hashaddress.id - hashaddress_abspath = hashaddress.abspath - id_abs_path = store.get_real_path(entity, hashaddress_id) - assert hashaddress_abspath == id_abs_path - - -def test_put_object_is_duplicate(pids, store): - """Check put returns expected is_duplicate boolean value.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - hashaddress = store.put_object(pid, path) - hashaddress_is_duplicate = hashaddress.is_duplicate - assert hashaddress_is_duplicate is False + object_metadata = store.put_object(pid, path) + object_size = object_metadata.obj_size + assert object_size == pids[pid]["file_size_bytes"] def test_put_object_hex_digests(pids, store): @@ -202,29 +263,29 @@ def test_put_object_hex_digests(pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - hashaddress = store.put_object(pid, path) - hashaddress_hex_digests = hashaddress.hex_digests - assert hashaddress_hex_digests.get("md5") == pids[pid]["md5"] - assert hashaddress_hex_digests.get("sha1") == pids[pid]["sha1"] - assert hashaddress_hex_digests.get("sha256") == pids[pid]["sha256"] - assert hashaddress_hex_digests.get("sha384") == pids[pid]["sha384"] - assert hashaddress_hex_digests.get("sha512") == pids[pid]["sha512"] + object_metadata = store.put_object(pid, path) + object_metadata_hex_digests = object_metadata.hex_digests + assert object_metadata_hex_digests.get("md5") == pids[pid]["md5"] + assert object_metadata_hex_digests.get("sha1") == pids[pid]["sha1"] + assert object_metadata_hex_digests.get("sha256") == pids[pid]["sha256"] + assert object_metadata_hex_digests.get("sha384") == pids[pid]["sha384"] + assert object_metadata_hex_digests.get("sha512") == pids[pid]["sha512"] def test_put_object_additional_algorithm(pids, store): - """Check put returns additional algorithm in hex digests.""" + """Check put_object returns additional algorithm in hex digests.""" test_dir = 
"tests/testdata/" for pid in pids.keys(): algo = "sha224" path = test_dir + pid.replace("/", "_") - hash_address = store.put_object(pid, path, additional_algorithm=algo) - hex_digests = hash_address.hex_digests + object_metadata = store.put_object(pid, path, additional_algorithm=algo) + hex_digests = object_metadata.hex_digests sha224_hash = hex_digests.get(algo) assert sha224_hash == pids[pid][algo] def test_put_object_with_correct_checksums(pids, store): - """Check put success with good checksum supplied.""" + """Check put_object success with valid checksum and checksum algorithm supplied.""" test_dir = "tests/testdata/" for pid in pids.keys(): algo = "sha224" @@ -258,16 +319,14 @@ def test_move_and_get_checksums_id(pids, store): move_id, _, _, - _, - _, ) = store._move_and_get_checksums(pid, input_stream) input_stream.close() - ab_id = store.get_sha256_hex_digest(pid) - assert move_id == ab_id + object_cid = store.get_sha256_hex_digest(pid) + assert move_id == object_cid -def test_move_and_get_checksums_hex_digests(pids, store): - """Test move returns correct hex digests.""" +def test_move_and_get_checksums_file_size(pids, store): + """Test move returns correct file size.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -275,21 +334,15 @@ def test_move_and_get_checksums_hex_digests(pids, store): # pylint: disable=W0212 ( _, + tmp_file_size, _, - _, - _, - hex_digests, ) = store._move_and_get_checksums(pid, input_stream) input_stream.close() - assert hex_digests.get("md5") == pids[pid]["md5"] - assert hex_digests.get("sha1") == pids[pid]["sha1"] - assert hex_digests.get("sha256") == pids[pid]["sha256"] - assert hex_digests.get("sha384") == pids[pid]["sha384"] - assert hex_digests.get("sha512") == pids[pid]["sha512"] + assert tmp_file_size == pids[pid]["file_size_bytes"] -def test_move_and_get_checksums_abs_path(pids, store): - """Test move returns correct absolute path.""" +def test_move_and_get_checksums_hex_digests(pids, store): + """Test move returns correct hex digests.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -298,13 +351,14 @@ def test_move_and_get_checksums_abs_path(pids, store): ( _, _, - abs_path, - _, - _, + hex_digests, ) = store._move_and_get_checksums(pid, input_stream) input_stream.close() - store.get_sha256_hex_digest(pid) - assert os.path.isfile(abs_path) is True + assert hex_digests.get("md5") == pids[pid]["md5"] + assert hex_digests.get("sha1") == pids[pid]["sha1"] + assert hex_digests.get("sha256") == pids[pid]["sha256"] + assert hex_digests.get("sha384") == pids[pid]["sha384"] + assert hex_digests.get("sha512") == pids[pid]["sha512"] def test_move_and_get_checksums_duplicates_raises_error(pids, store): @@ -327,6 +381,116 @@ def test_move_and_get_checksums_duplicates_raises_error(pids, store): assert store.count(entity) == 3 +def test_move_and_get_checksums_file_size_raises_error(pids, store): + """Test move and get checksum raises error with incorrect file size""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + with pytest.raises(ValueError): + path = test_dir + pid.replace("/", "_") + input_stream = io.open(path, "rb") + incorrect_file_size = 1000 + # pylint: disable=W0212 + ( + _, + _, + _, + _, + ) = store._move_and_get_checksums( + pid, input_stream, file_size_to_validate=incorrect_file_size + ) + input_stream.close() + + +def test_mktempfile_additional_algo(store): + """Test _mktempfile returns correct hex digests for additional algorithm.""" + 
test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid + input_stream = io.open(path, "rb") + checksum_algo = "sha3_256" + checksum_correct = ( + "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" + ) + # pylint: disable=W0212 + hex_digests, _, _ = store._mktmpfile( + input_stream, additional_algorithm=checksum_algo + ) + input_stream.close() + assert hex_digests.get("sha3_256") == checksum_correct + + +def test_mktempfile_checksum_algo(store): + """Test _mktempfile returns correct hex digests for checksum algorithm.""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid + input_stream = io.open(path, "rb") + checksum_algo = "sha3_256" + checksum_correct = ( + "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" + ) + # pylint: disable=W0212 + hex_digests, _, _ = store._mktmpfile(input_stream, checksum_algorithm=checksum_algo) + input_stream.close() + assert hex_digests.get("sha3_256") == checksum_correct + + +def test_mktempfile_checksum_and_additional_algo(store): + """Test _mktempfile returns correct hex digests for checksum algorithm.""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid + input_stream = io.open(path, "rb") + additional_algo = "sha224" + additional_algo_checksum = ( + "9b3a96f434f3c894359193a63437ef86fbd5a1a1a6cc37f1d5013ac1" + ) + checksum_algo = "sha3_256" + checksum_correct = ( + "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" + ) + # pylint: disable=W0212 + hex_digests, _, _ = store._mktmpfile( + input_stream, + additional_algorithm=additional_algo, + checksum_algorithm=checksum_algo, + ) + input_stream.close() + assert hex_digests.get("sha3_256") == checksum_correct + assert hex_digests.get("sha224") == additional_algo_checksum + + +def test_mktempfile_checksum_and_additional_algo_duplicate(store): + """Test _mktempfile succeeds with duplicate algorithms (de-duplicates).""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid + input_stream = io.open(path, "rb") + additional_algo = "sha224" + checksum_algo = "sha224" + checksum_correct = "9b3a96f434f3c894359193a63437ef86fbd5a1a1a6cc37f1d5013ac1" + # pylint: disable=W0212 + hex_digests, _, _ = store._mktmpfile( + input_stream, + additional_algorithm=additional_algo, + checksum_algorithm=checksum_algo, + ) + input_stream.close() + assert hex_digests.get("sha224") == checksum_correct + + +def test_mktempfile_file_size(pids, store): + """Test _mktempfile returns correct file size.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + input_stream = io.open(path, "rb") + # pylint: disable=W0212 + _, _, tmp_file_size = store._mktmpfile(input_stream) + input_stream.close() + assert tmp_file_size == pids[pid]["file_size_bytes"] + + def test_mktempfile_hex_digests(pids, store): """Test _mktempfile returns correct hex digests.""" test_dir = "tests/testdata/" @@ -334,7 +498,7 @@ def test_mktempfile_hex_digests(pids, store): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") # pylint: disable=W0212 - hex_digests, _ = store._mktempfile(input_stream) + hex_digests, _, _ = store._mktmpfile(input_stream) input_stream.close() assert hex_digests.get("md5") == pids[pid]["md5"] assert hex_digests.get("sha1") == pids[pid]["sha1"] @@ -343,31 +507,18 @@ def test_mktempfile_hex_digests(pids, store): assert hex_digests.get("sha512") == pids[pid]["sha512"] -def test_mktempfile_object(pids, store): +def 
test_mktempfile_tmpfile_object(pids, store): """Test _mktempfile creates file successfully.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") # pylint: disable=W0212 - _, tmp_file_name = store._mktempfile(input_stream) + _, tmp_file_name, _ = store._mktmpfile(input_stream) input_stream.close() assert os.path.isfile(tmp_file_name) is True -def test_mktempfile_with_algorithm(pids, store): - """Test _mktempfile returns additional hex digest when supplied.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - input_stream = io.open(path, "rb") - algo = "sha224" - # pylint: disable=W0212 - hex_digests, _ = store._mktempfile(input_stream, algo) - input_stream.close() - assert hex_digests.get("sha224") == pids[pid]["sha224"] - - def test_mktempfile_with_unsupported_algorithm(pids, store): """Test _mktempfile raises error when bad algorithm supplied.""" test_dir = "tests/testdata/" @@ -377,55 +528,60 @@ def test_mktempfile_with_unsupported_algorithm(pids, store): algo = "md2" with pytest.raises(ValueError): # pylint: disable=W0212 - _, _ = store._mktempfile(input_stream, algo) + _, _, _ = store._mktmpfile(input_stream, additional_algorithm=algo) + with pytest.raises(ValueError): + # pylint: disable=W0212 + _, _, _ = store._mktmpfile(input_stream, checksum_algorithm=algo) input_stream.close() -def test_put_sysmeta_with_path(pids, store): - """Test put sysmeta with path object.""" - entity = "sysmeta" +def test_put_metadata_with_path(pids, store): + """Test put_metadata with path object.""" + entity = "metadata" test_dir = "tests/testdata/" + format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - ab_id = store.store_sysmeta(pid, syspath) - assert store.exists(entity, ab_id) + metadata_cid = store.store_metadata(pid, syspath, format_id) + assert store.exists(entity, metadata_cid) assert store.count(entity) == 3 -def test_put_sysmeta_with_string(pids, store): - """Test put sysmeta with string.""" - entity = "sysmeta" +def test_put_metadata_with_string(pids, store): + """Test_put metadata with string.""" + entity = "metadata" test_dir = "tests/testdata/" + format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = str(Path(test_dir) / filename) - ab_id = store.store_sysmeta(pid, syspath) - assert store.exists(entity, ab_id) + metadata_cid = store.store_metadata(pid, syspath, format_id) + assert store.exists(entity, metadata_cid) assert store.count(entity) == 3 -def test_put_sysmeta_ab_id(pids, store): - """Test put sysmeta returns correct id.""" +def test_put_metadata_cid(pids, store): + """Test put metadata returns correct id.""" test_dir = "tests/testdata/" + format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - ab_id = store.store_sysmeta(pid, syspath) - assert ab_id == pids[pid]["ab_id"] + metadata_cid = store.store_metadata(pid, syspath, format_id) + assert metadata_cid == pids[pid]["metadata_cid"] -def test_mktmpsysmeta(pids, store): - """Test mktmpsysmeta creates tmpFile.""" +def test_mktmpmetadata(pids, store): + """Test mktmpmetadata creates tmpFile.""" test_dir = "tests/testdata/" - entity = "sysmeta" + entity = "metadata" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" 
syspath = Path(test_dir) / filename sys_stream = io.open(syspath, "rb") - namespace = "http://ns.dataone.org/service/types/v2.0" # pylint: disable=W0212 - tmp_name = store._mktmpsysmeta(sys_stream, namespace) + tmp_name = store._mktmpmetadata(sys_stream) sys_stream.close() assert store.exists(entity, tmp_name) @@ -462,34 +618,22 @@ def test_get_store_path_object(store): assert path_objects_string.endswith("/metacat/objects") -def test_get_store_path_sysmeta(store): - """Check get_store_path for sysmeta path.""" +def test_get_store_path_metadata(store): + """Check get_store_path for metadata path.""" # pylint: disable=W0212 - path_sysmeta = store.get_store_path("sysmeta") - path_sysmeta_string = str(path_sysmeta) - assert path_sysmeta_string.endswith("/metacat/sysmeta") - - -def test_exists_with_absolute_path(pids, store): - """Test exists method with an absolute file path.""" - test_dir = "tests/testdata/" - entity = "objects" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - hashaddress = store.put_object(pid, path) - hashaddress_abspath = hashaddress.abspath - assert store.exists(entity, hashaddress_abspath) + path_metadata = store.get_store_path("metadata") + path_metadata_string = str(path_metadata) + assert path_metadata_string.endswith("/metacat/metadata") -def test_exists_with_relative_path(pids, store): +def test_exists_with_object_metadata_id(pids, store): """Test exists method with an absolute file path.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - hashaddress = store.put_object(pid, path) - hashaddress_relpath = hashaddress.relpath - assert store.exists(entity, hashaddress_relpath) + object_metadata = store.put_object(pid, path) + assert store.exists(entity, object_metadata.id) def test_exists_with_sharded_path(pids, store): @@ -498,10 +642,10 @@ def test_exists_with_sharded_path(pids, store): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - hashaddress = store.put_object(pid, path) - hashaddress_shard = store.shard(hashaddress.id) - hashaddress_shard_path = "/".join(hashaddress_shard) - assert store.exists(entity, hashaddress_shard_path) + object_metadata = store.put_object(pid, path) + object_metadata_shard = store.shard(object_metadata.id) + object_metadata_shard_path = "/".join(object_metadata_shard) + assert store.exists(entity, object_metadata_shard_path) def test_exists_with_nonexistent_file(store): @@ -531,34 +675,22 @@ def test_open_objects(pids, store): entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - hash_address = store.put_object(pid, path) - hashaddress_id = hash_address.id - io_buffer = store.open(entity, hashaddress_id) + object_metadata = store.put_object(pid, path) + object_metadata_id = object_metadata.id + io_buffer = store.open(entity, object_metadata_id) assert isinstance(io_buffer, io.BufferedReader) io_buffer.close() -def test_delete_by_id(pids, store): - """Check objects are deleted after calling delete with id.""" +def test_delete_by_object_metadata_id(pids, store): + """Check objects are deleted after calling delete with hash address id.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - hash_address = store.put_object(pid, path) - hashaddress_id = hash_address.id - store.delete(entity, hashaddress_id) - assert store.count(entity) == 0 - - -def test_delete_by_path(pids, store): - """Check objects are deleted after calling 
delete with path.""" - test_dir = "tests/testdata/" - entity = "objects" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - hash_address = store.put_object(pid, path) - hashaddress_relpath = hash_address.relpath - store.delete(entity, hashaddress_relpath) + object_metadata = store.put_object(pid, path) + object_metadata_id = object_metadata.id + store.delete(entity, object_metadata_id) assert store.count(entity) == 0 @@ -605,10 +737,11 @@ def test_remove_empty_does_not_remove_nonempty_folders(pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - hash_address = store.put_object(pid, path) - hashaddress_relpath = hash_address.relpath + object_metadata = store.put_object(pid, path) + object_metadata_shard = store.shard(object_metadata.id) + object_metadata_shard_path = "/".join(object_metadata_shard) # Get parent directory of the relative path - parent_dir = os.path.dirname(hashaddress_relpath) + parent_dir = os.path.dirname(object_metadata_shard_path) # Attempt to remove the parent directory store.remove_empty(parent_dir) abs_parent_dir = store.objects + "/" + parent_dir @@ -647,7 +780,7 @@ def test_create_path(pids, store): """Test makepath creates folder successfully.""" for pid in pids: root_directory = store.root - pid_hex_digest_directory = pids[pid]["ab_id"][:2] + pid_hex_digest_directory = pids[pid]["metadata_cid"][:2] pid_directory = root_directory + pid_hex_digest_directory store.create_path(pid_directory) assert os.path.isdir(pid_directory) @@ -661,40 +794,52 @@ def test_get_real_path_file_does_not_exist(store): assert real_path_exists is None -def test_get_real_path_absolute_path(store, pids): - """Test get_real_path returns path (is truthy) when absolute path exists.""" +def test_get_real_path_with_object_id(store, pids): + """Test get_real_path returns absolute path given an object id.""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - hashaddress = store.put_object(pid, path) - hashaddress_abspath = hashaddress.abspath - abs_path = store.get_real_path(entity, hashaddress_abspath) - assert abs_path + object_metadata = store.put_object(pid, path) + obj_abs_path = store.get_real_path(entity, object_metadata.id) + assert os.path.exists(obj_abs_path) -def test_get_real_path_relative_path(store, pids): - """Test get_real_path returns path (is truthy) when rel path exists.""" +def test_get_real_path_with_object_id_sharded(pids, store): + """Test exists method with a sharded path (relative path).""" test_dir = "tests/testdata/" entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - hashaddress = store.put_object(pid, path) - hashaddress_relpath = hashaddress.relpath - rel_path = store.get_real_path(entity, hashaddress_relpath) - assert rel_path + object_metadata = store.put_object(pid, path) + object_metadata_shard = store.shard(object_metadata.id) + object_metadata_shard_path = "/".join(object_metadata_shard) + obj_abs_path = store.get_real_path(entity, object_metadata_shard_path) + assert os.path.exists(obj_abs_path) -def test_get_real_path_hex_digest_path(store, pids): - """Test get_real_path returns path (is truthy) when rel path exists.""" +def test_get_real_path_with_metadata_id(store, pids): + """Test get_real_path returns absolute path given a metadata id.""" + entity = "metadata" test_dir = "tests/testdata/" - entity = "objects" + format_id = "http://ns.dataone.org/service/types/v2.0" + for pid in pids.keys(): 
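+        # Descriptive note: store each pid's metadata document, then resolve the
+        # returned metadata_cid to an absolute path on disk via get_real_path.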
+ filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + metadata_cid = store.store_metadata(pid, syspath, format_id) + metadata_abs_path = store.get_real_path(entity, metadata_cid) + assert os.path.exists(metadata_abs_path) + + +def test_get_real_path_with_bad_entity(store, pids): + """Test get_real_path returns absolute path given an object id.""" + test_dir = "tests/testdata/" + entity = "bad_entity" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - hashaddress = store.put_object(pid, path) - hashaddress_id = hashaddress.id - hex_digest = store.get_real_path(entity, hashaddress_id) - assert hex_digest + object_metadata = store.put_object(pid, path) + with pytest.raises(ValueError): + store.get_real_path(entity, object_metadata.id) def test_build_abs_path(store, pids): @@ -705,7 +850,7 @@ def test_build_abs_path(store, pids): path = test_dir + pid.replace("/", "_") _ = store.put_object(pid, path) # pylint: disable=W0212 - abs_path = store.build_abs_path(entity, pids[pid]["ab_id"]) + abs_path = store.build_abs_path(entity, pids[pid]["object_cid"]) assert abs_path @@ -731,4 +876,4 @@ def test_get_sha256_hex_digest(pids, store): """Test for correct sha256 return value.""" for pid in pids: hash_val = store.get_sha256_hex_digest(pid) - assert hash_val == pids[pid]["ab_id"] + assert hash_val == pids[pid]["object_cid"] diff --git a/tests/filehashstore/test_filehashstore_interface.py b/tests/test_filehashstore_interface.py similarity index 51% rename from tests/filehashstore/test_filehashstore_interface.py rename to tests/test_filehashstore_interface.py index a1bca2e7..92b125cb 100644 --- a/tests/filehashstore/test_filehashstore_interface.py +++ b/tests/test_filehashstore_interface.py @@ -1,8 +1,11 @@ """Test module for FileHashStore HashStore interface methods""" import io +import os from pathlib import Path from threading import Thread import random +import threading +import time import pytest # Define a mark to be used to label slow tests @@ -18,26 +21,42 @@ def test_pids_length(pids): def test_store_address_length(pids, store): - """Test store object ab_id length is 64 characters.""" + """Test store object object_cid length is 64 characters.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - hash_address = store.store_object(pid, path) - ab_id = hash_address.id - assert len(ab_id) == 64 + object_metadata = store.store_object(pid, path) + object_cid = object_metadata.id + assert len(object_cid) == 64 + + +def test_store_object(pids, store): + """Test store object.""" + test_dir = "tests/testdata/" + entity = "objects" + format_id = "http://ns.dataone.org/service/types/v2.0" + for pid in pids.keys(): + path = Path(test_dir + pid.replace("/", "_")) + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + object_metadata = store.store_object(pid, path) + _metadata_cid = store.store_metadata(pid, syspath, format_id) + assert object_metadata.id == pids[pid]["object_cid"] + assert store.count(entity) == 3 def test_store_object_files_path(pids, store): """Test store object when given a path.""" test_dir = "tests/testdata/" entity = "objects" + format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): path = Path(test_dir + pid.replace("/", "_")) filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - _hash_address = store.store_object(pid, path) - ab_id = store.store_sysmeta(pid, syspath) - assert store.exists(entity, ab_id) + 
_object_metadata = store.store_object(pid, path) + _metadata_cid = store.store_metadata(pid, syspath, format_id) + assert store.exists(entity, pids[pid]["object_cid"]) assert store.count(entity) == 3 @@ -45,13 +64,14 @@ def test_store_object_files_string(pids, store): """Test store object when given a string.""" test_dir = "tests/testdata/" entity = "objects" + format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): path_string = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - _hash_address = store.store_object(pid, path_string) - ab_id = store.store_sysmeta(pid, syspath) - assert store.exists(entity, ab_id) + _object_metadata = store.store_object(pid, path_string) + _metadata_cid = store.store_metadata(pid, syspath, format_id) + assert store.exists(entity, pids[pid]["object_cid"]) assert store.count(entity) == 3 @@ -62,52 +82,30 @@ def test_store_object_files_input_stream(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") - _hash_address = store.store_object(pid, input_stream) + _object_metadata = store.store_object(pid, input_stream) input_stream.close() - ab_id = store.get_sha256_hex_digest(pid) - assert store.exists(entity, ab_id) + object_cid = store.get_sha256_hex_digest(pid) + assert store.exists(entity, object_cid) assert store.count(entity) == 3 def test_store_object_id(pids, store): - """Test store object returns expected id (ab_id).""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - hash_address = store.store_object(pid, path) - assert hash_address.id == pids[pid]["ab_id"] - - -def test_store_object_rel_path(pids, store): - """Test store object returns expected relative path.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - hash_address = store.store_object(pid, path) - ab_id = pids[pid]["ab_id"] - ab_id_rel_path = "/".join(store.shard(ab_id)) - assert hash_address.relpath == ab_id_rel_path - - -def test_store_object_abs_path(pids, store): - """Test store object returns expected absolute path.""" + """Test store object returns expected id (object_cid).""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - hash_address = store.store_object(pid, path) - ab_id = pids[pid]["ab_id"] - ab_id_rel_path = "/".join(store.shard(ab_id)) - ab_id_abs_path = store.objects + "/" + ab_id_rel_path - assert hash_address.abspath == ab_id_abs_path + object_metadata = store.store_object(pid, path) + assert object_metadata.id == pids[pid]["object_cid"] -def test_store_object_is_duplicate(pids, store): - """Test store object returns expected is_duplicate boolean.""" +def test_store_object_obj_size(pids, store): + """Test store object returns expected file size.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - hash_address = store.store_object(pid, path) - assert hash_address.is_duplicate is False + object_metadata = store.store_object(pid, path) + object_size = object_metadata.obj_size + assert object_size == pids[pid]["file_size_bytes"] def test_store_object_hex_digests(pids, store): @@ -115,12 +113,12 @@ def test_store_object_hex_digests(pids, store): test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - hash_address = store.store_object(pid, path) - assert hash_address.hex_digests.get("md5") == pids[pid]["md5"] - 
assert hash_address.hex_digests.get("sha1") == pids[pid]["sha1"] - assert hash_address.hex_digests.get("sha256") == pids[pid]["sha256"] - assert hash_address.hex_digests.get("sha384") == pids[pid]["sha384"] - assert hash_address.hex_digests.get("sha512") == pids[pid]["sha512"] + object_metadata = store.store_object(pid, path) + assert object_metadata.hex_digests.get("md5") == pids[pid]["md5"] + assert object_metadata.hex_digests.get("sha1") == pids[pid]["sha1"] + assert object_metadata.hex_digests.get("sha256") == pids[pid]["sha256"] + assert object_metadata.hex_digests.get("sha384") == pids[pid]["sha384"] + assert object_metadata.hex_digests.get("sha512") == pids[pid]["sha512"] def test_store_object_pid_empty(store): @@ -191,11 +189,11 @@ def test_store_object_additional_algorithm_hyphen_uppercase(pids, store): pid = "jtao.1700.1" path = test_dir + pid algorithm_with_hyphen_and_upper = "SHA-384" - hash_address = store.store_object(pid, path, algorithm_with_hyphen_and_upper) - sha256_cid = hash_address.hex_digests.get("sha384") + object_metadata = store.store_object(pid, path, algorithm_with_hyphen_and_upper) + sha256_cid = object_metadata.hex_digests.get("sha384") assert sha256_cid == pids[pid]["sha384"] - ab_id = store.get_sha256_hex_digest(pid) - assert store.exists(entity, ab_id) + object_cid = store.get_sha256_hex_digest(pid) + assert store.exists(entity, object_cid) def test_store_object_additional_algorithm_hyphen_lowercase(store): @@ -205,14 +203,14 @@ def test_store_object_additional_algorithm_hyphen_lowercase(store): pid = "jtao.1700.1" path = test_dir + pid algorithm_other = "sha3-256" - hash_address = store.store_object(pid, path, algorithm_other) - additional_sha3_256_hex_digest = hash_address.hex_digests.get("sha3_256") + object_metadata = store.store_object(pid, path, algorithm_other) + additional_sha3_256_hex_digest = object_metadata.hex_digests.get("sha3_256") sha3_256_checksum = ( "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) assert additional_sha3_256_hex_digest == sha3_256_checksum - ab_id = store.get_sha256_hex_digest(pid) - assert store.exists(entity, ab_id) + object_cid = store.get_sha256_hex_digest(pid) + assert store.exists(entity, object_cid) def test_store_object_additional_algorithm_underscore(store): @@ -222,8 +220,8 @@ def test_store_object_additional_algorithm_underscore(store): pid = "jtao.1700.1" path = test_dir + pid algorithm_other = "sha3_256" - hash_address = store.store_object(pid, path, algorithm_other) - additional_sha3_256_hex_digest = hash_address.hex_digests.get("sha3_256") + object_metadata = store.store_object(pid, path, algorithm_other) + additional_sha3_256_hex_digest = object_metadata.hex_digests.get("sha3_256") sha3_256_checksum = ( "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) @@ -238,16 +236,60 @@ def test_store_object_checksum_correct(store): entity = "objects" pid = "jtao.1700.1" path = test_dir + pid - algorithm_other = "sha3_256" + checksum_algo = "sha3_256" checksum_correct = ( "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) - _hash_address = store.store_object( - pid, path, checksum=checksum_correct, checksum_algorithm=algorithm_other + _object_metadata = store.store_object( + pid, path, checksum=checksum_correct, checksum_algorithm=checksum_algo ) assert store.count(entity) == 1 +def test_store_object_checksum_correct_and_additional_algo(store): + """Test store object successfully stores with good checksum and same additional algorithm.""" + test_dir = 
"tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid + algorithm_additional = "sha224" + sha224_additional_checksum = ( + "9b3a96f434f3c894359193a63437ef86fbd5a1a1a6cc37f1d5013ac1" + ) + algorithm_checksum = "sha3_256" + checksum_correct = ( + "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" + ) + object_metadata = store.store_object( + pid, + path, + additional_algorithm=algorithm_additional, + checksum=checksum_correct, + checksum_algorithm=algorithm_checksum, + ) + assert object_metadata.hex_digests.get("sha224") == sha224_additional_checksum + assert object_metadata.hex_digests.get("sha3_256") == checksum_correct + + +def test_store_object_checksum_correct_and_additional_algo_duplicate(store): + """Test store object successfully stores with good checksum and same additional algorithm.""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid + algorithm_additional = "sha3_256" + algorithm_checksum = "sha3_256" + checksum_correct = ( + "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" + ) + object_metadata = store.store_object( + pid, + path, + additional_algorithm=algorithm_additional, + checksum=checksum_correct, + checksum_algorithm=algorithm_checksum, + ) + assert object_metadata.hex_digests.get("sha3_256") == checksum_correct + + def test_store_object_checksum_algorithm_empty(store): """Test store object raises error when checksum supplied with no checksum_algorithm.""" test_dir = "tests/testdata/" @@ -261,7 +303,7 @@ def test_store_object_checksum_algorithm_empty(store): def test_store_object_checksum_empty(store): - """Test store object raises error when checksum_algorithm supplied and checksum is empty.""" + """Test store object raises error when checksum_algorithm supplied with empty checksum.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -321,13 +363,56 @@ def test_store_object_duplicate_raises_error(store): path = test_dir + pid entity = "objects" # Store first blob - _hash_address_one = store.store_object(pid, path) + _object_metadata_one = store.store_object(pid, path) # Store second blob with pytest.raises(FileExistsError): - _hash_address_two = store.store_object(pid, path) + _object_metadata_two = store.store_object(pid, path) assert store.count(entity) == 1 - ab_id = store.get_sha256_hex_digest(pid) - assert store.exists(entity, ab_id) + object_cid = store.get_sha256_hex_digest(pid) + assert store.exists(entity, object_cid) + + +def test_store_object_with_obj_file_size(store, pids): + """Test store object with correct file sizes.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + obj_file_size = pids[pid]["file_size_bytes"] + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object( + pid, path, expected_object_size=obj_file_size + ) + object_size = object_metadata.obj_size + assert object_size == obj_file_size + + +def test_store_object_with_obj_file_size_incorrect(store, pids): + """Test store object throws exception with incorrect file size.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + obj_file_size = 1234 + path = test_dir + pid.replace("/", "_") + with pytest.raises(ValueError): + store.store_object(pid, path, expected_object_size=obj_file_size) + + +def test_store_object_with_obj_file_size_non_integer(store, pids): + """Test store object throws exception with a non integer value as the file size.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + obj_file_size = "Bob" + path = test_dir + pid.replace("/", 
"_") + with pytest.raises(TypeError): + store.store_object(pid, path, expected_object_size=obj_file_size) + + +def test_store_object_with_obj_file_size_zero(store, pids): + """Test store object throws exception with zero as the file size.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + obj_file_size = 0 + path = test_dir + pid.replace("/", "_") + with pytest.raises(ValueError): + store.store_object(pid, path, expected_object_size=obj_file_size) def test_store_object_duplicates_threads(store): @@ -357,11 +442,57 @@ def store_object_wrapper(pid, path): thread3.join() # One thread will succeed, file count must still be 1 assert store.count(entity) == 1 - ab_id = store.get_sha256_hex_digest(pid) - assert store.exists(entity, ab_id) + object_cid = store.get_sha256_hex_digest(pid) + assert store.exists(entity, object_cid) assert file_exists_error_flag +@slow_test +def test_store_object_interrupt_process(store): + """Test that tmp file created when storing a large object (2GB) and + interrupting the process is cleaned up. + """ + file_size = 2 * 1024 * 1024 * 1024 # 2GB + file_path = store.root + "random_file_2.bin" + + pid = "Testpid" + # Generate a random file with the specified size + with open(file_path, "wb") as file: + remaining_bytes = file_size + buffer_size = 1024 * 1024 # 1MB buffer size (adjust as needed) + + while remaining_bytes > 0: + # Generate random data for the buffer + buffer = bytearray(random.getrandbits(8) for _ in range(buffer_size)) + # Write the buffer to the file + bytes_to_write = min(buffer_size, remaining_bytes) + file.write(buffer[:bytes_to_write]) + remaining_bytes -= bytes_to_write + + interrupt_flag = False + + def store_object_wrapper(pid, path): + print(store.root) + while not interrupt_flag: + store.store_object(pid, path) # Call store_object inside the thread + + # Create/start the thread + thread = threading.Thread(target=store_object_wrapper, args=(pid, file_path)) + thread.start() + + # Sleep for 5 seconds to let the thread run + time.sleep(5) + + # Interrupt the thread + interrupt_flag = True + + # Wait for the thread to finish + thread.join() + + # Confirm no tmp objects found in objects/tmp directory + assert len(os.listdir(store.root + "/objects/tmp")) == 0 + + @slow_test def test_store_object_large_file(store): """Test storing a large object (1GB). This test has also been executed with @@ -384,16 +515,17 @@ def test_store_object_large_file(store): remaining_bytes -= bytes_to_write # Store object pid = "testfile_filehashstore" - hash_address = store.store_object(pid, file_path) - hash_address_id = hash_address.id + object_metadata = store.store_object(pid, file_path) + object_metadata_id = object_metadata.id pid_sha256_hex_digest = store.get_sha256_hex_digest(pid) - assert hash_address_id == pid_sha256_hex_digest + assert object_metadata_id == pid_sha256_hex_digest @slow_test def test_store_object_sparse_large_file(store): """Test storing a large object (4GB) via sparse file. This test has also been - executed with a 10GB file and the test classes succeeded locally in 117.03s (0:01:57).""" + executed with a 10GB file and the test classes succeeded locally in 117.03s (0:01:57). 
+ """ # file_size = 10 * 1024 * 1024 * 1024 # 10GB file_size = 4 * 1024 * 1024 * 1024 # 4GB file_path = store.root + "random_file.bin" @@ -403,118 +535,164 @@ def test_store_object_sparse_large_file(store): file.write(b"\0") # Store object pid = "testfile_filehashstore" - hash_address = store.store_object(pid, file_path) - hash_address_id = hash_address.id + object_metadata = store.store_object(pid, file_path) + object_metadata_id = object_metadata.id pid_sha256_hex_digest = store.get_sha256_hex_digest(pid) - assert hash_address_id == pid_sha256_hex_digest + assert object_metadata_id == pid_sha256_hex_digest + + +def test_store_metadata(pids, store): + """Test store metadata.""" + test_dir = "tests/testdata/" + format_id = "http://ns.dataone.org/service/types/v2.0" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + _object_metadata = store.store_object(pid, path) + metadata_cid = store.store_metadata(pid, syspath, format_id) + assert metadata_cid == pids[pid]["metadata_cid"] + + +def test_store_metadata_default_format_id(pids, store): + """Test store metadata returns expected id when storing with default format_id.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + _object_metadata = store.store_object(pid, path) + metadata_cid = store.store_metadata(pid, syspath) + assert metadata_cid == pids[pid]["metadata_cid"] -def test_store_sysmeta_files_path(pids, store): - """Test store sysmeta with path.""" +def test_store_metadata_files_path(pids, store): + """Test store metadata with path.""" test_dir = "tests/testdata/" - entity = "sysmeta" + entity = "metadata" + format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - _hash_address = store.store_object(pid, path) - ab_id = store.store_sysmeta(pid, syspath) - assert store.exists(entity, ab_id) + _object_metadata = store.store_object(pid, path) + metadata_cid = store.store_metadata(pid, syspath, format_id) + assert store.exists(entity, metadata_cid) + assert metadata_cid == pids[pid]["metadata_cid"] assert store.count(entity) == 3 -def test_store_sysmeta_files_string(pids, store): - """Test store sysmeta with string.""" +def test_store_metadata_files_string(pids, store): + """Test store metadata with string.""" test_dir = "tests/testdata/" - entity = "sysmeta" + entity = "metadata" + format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): path_string = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath_string = str(Path(test_dir) / filename) - _hash_address = store.store_object(pid, path_string) - ab_id = store.store_sysmeta(pid, syspath_string) - assert store.exists(entity, ab_id) + _object_metadata = store.store_object(pid, path_string) + metadata_cid = store.store_metadata(pid, syspath_string, format_id) + assert store.exists(entity, metadata_cid) assert store.count(entity) == 3 -def test_store_sysmeta_files_input_stream(pids, store): - """Test store sysmeta with an input stream to sysmeta.""" +def test_store_metadata_files_input_stream(pids, store): + """Test store metadata with an input stream to metadata.""" test_dir = "tests/testdata/" - entity = "sysmeta" + entity = "metadata" + format_id = 
"http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - _hash_address = store.store_object(pid, path) + _object_metadata = store.store_object(pid, path) filename = pid.replace("/", "_") + ".xml" syspath_string = str(Path(test_dir) / filename) syspath_stream = io.open(syspath_string, "rb") - _ab_id = store.store_sysmeta(pid, syspath_stream) + _metadata_cid = store.store_metadata(pid, syspath_stream, format_id) syspath_stream.close() assert store.count(entity) == 3 -def test_store_sysmeta_pid_empty(store): - """Test store sysmeta raises error with empty string.""" +def test_store_metadata_pid_empty(store): + """Test store metadata raises error with empty string.""" test_dir = "tests/testdata/" + format_id = "http://ns.dataone.org/service/types/v2.0" pid = "" filename = pid.replace("/", "_") + ".xml" syspath_string = str(Path(test_dir) / filename) with pytest.raises(ValueError): - store.store_sysmeta(pid, syspath_string) + store.store_metadata(pid, syspath_string, format_id) -def test_store_sysmeta_pid_empty_spaces(store): - """Test store sysmeta raises error with empty string.""" +def test_store_metadata_pid_empty_spaces(store): + """Test store metadata raises error with empty spaces.""" test_dir = "tests/testdata/" + format_id = "http://ns.dataone.org/service/types/v2.0" pid = " " filename = pid.replace("/", "_") + ".xml" syspath_string = str(Path(test_dir) / filename) with pytest.raises(ValueError): - store.store_sysmeta(pid, syspath_string) + store.store_metadata(pid, syspath_string, format_id) -def test_store_sysmeta_sysmeta_empty(store): - """Test store sysmeta raises error with empty sysmeta string.""" +def test_store_metadata_pid_format_id_spaces(store): + """Test store metadata raises error with empty spaces.""" + test_dir = "tests/testdata/" + format_id = " " pid = "jtao.1700.1" + filename = pid.replace("/", "_") + ".xml" + syspath_string = str(Path(test_dir) / filename) + with pytest.raises(ValueError): + store.store_metadata(pid, syspath_string, format_id) + + +def test_store_metadata_metadata_empty(store): + """Test store metadata raises error with empty metadata string.""" + pid = "jtao.1700.1" + format_id = "http://ns.dataone.org/service/types/v2.0" syspath_string = " " with pytest.raises(TypeError): - store.store_sysmeta(pid, syspath_string) + store.store_metadata(pid, syspath_string, format_id) -def test_store_sysmeta_sysmeta_none(store): - """Test store sysmeta raises error with empty sysmeta string.""" +def test_store_metadata_metadata_none(store): + """Test store metadata raises error with empty None metadata.""" pid = "jtao.1700.1" + format_id = "http://ns.dataone.org/service/types/v2.0" syspath_string = None with pytest.raises(TypeError): - store.store_sysmeta(pid, syspath_string) + store.store_metadata(pid, syspath_string, format_id) -def test_store_sysmeta_ab_id(pids, store): - """Test store sysmeta returns expected ab_id.""" +def test_store_metadata_metadata_cid(pids, store): + """Test store metadata returns expected metadata_cid.""" test_dir = "tests/testdata/" + format_id = "http://ns.dataone.org/service/types/v2.0" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - _hash_address = store.store_object(pid, path) - ab_id = store.store_sysmeta(pid, syspath) - assert ab_id == pids[pid]["ab_id"] + _object_metadata = store.store_object(pid, path) + metadata_cid = store.store_metadata(pid, syspath, format_id) + assert 


-def test_store_sysmeta_thread_lock(store):
-    """Test store sysmeta thread lock."""
+def test_store_metadata_thread_lock(store):
+    """Test store metadata thread lock."""
     test_dir = "tests/testdata/"
-    entity = "sysmeta"
+    entity = "metadata"
+    format_id = "http://ns.dataone.org/service/types/v2.0"
     pid = "jtao.1700.1"
     path = test_dir + pid
     filename = pid + ".xml"
     syspath = Path(test_dir) / filename
-    _hash_address = store.store_object(pid, path)
-    store.store_sysmeta(pid, syspath)
+    _object_metadata = store.store_object(pid, path)
+    store.store_metadata(pid, syspath, format_id)
     # Start threads
-    thread1 = Thread(target=store.store_sysmeta, args=(pid, syspath))
-    thread2 = Thread(target=store.store_sysmeta, args=(pid, syspath))
-    thread3 = Thread(target=store.store_sysmeta, args=(pid, syspath))
-    thread4 = Thread(target=store.store_sysmeta, args=(pid, syspath))
+    thread1 = Thread(target=store.store_metadata, args=(pid, syspath, format_id))
+    thread2 = Thread(target=store.store_metadata, args=(pid, syspath, format_id))
+    thread3 = Thread(target=store.store_metadata, args=(pid, syspath, format_id))
+    thread4 = Thread(target=store.store_metadata, args=(pid, syspath, format_id))
     thread1.start()
     thread2.start()
     thread3.start()
@@ -529,16 +707,17 @@ def test_store_sysmeta_thread_lock(store):
 def test_retrieve_object(pids, store):
     """Test retrieve_object returns correct object data."""
     test_dir = "tests/testdata/"
+    format_id = "http://ns.dataone.org/service/types/v2.0"
     for pid in pids.keys():
         path = test_dir + pid.replace("/", "_")
         filename = pid.replace("/", "_") + ".xml"
         syspath = Path(test_dir) / filename
-        hash_address = store.store_object(pid, path)
-        store.store_sysmeta(pid, syspath)
+        object_metadata = store.store_object(pid, path)
+        store.store_metadata(pid, syspath, format_id)
         obj_stream = store.retrieve_object(pid)
         sha256_hex = store.computehash(obj_stream)
         obj_stream.close()
-        assert sha256_hex == hash_address.hex_digests.get("sha256")
+        assert sha256_hex == object_metadata.hex_digests.get("sha256")


 def test_retrieve_object_pid_empty(store):
@@ -556,45 +735,83 @@ def test_retrieve_object_pid_invalid(store):
         store.retrieve_object(pid_does_not_exist)


-def test_retrieve_sysmeta(store):
-    """Test retrieve_sysmeta returns correct sysmeta data."""
+def test_retrieve_metadata(store):
+    """Test retrieve_metadata returns correct metadata."""
     test_dir = "tests/testdata/"
+    format_id = "http://ns.dataone.org/service/types/v2.0"
     pid = "jtao.1700.1"
     path = test_dir + pid
     filename = pid + ".xml"
     syspath = Path(test_dir) / filename
-    _hash_address = store.store_object(pid, path)
-    _ab_id = store.store_sysmeta(pid, syspath)
-    sysmeta_ret = store.retrieve_sysmeta(pid)
-    sysmeta = syspath.read_bytes()
-    assert sysmeta.decode("utf-8") == sysmeta_ret
+    _object_metadata = store.store_object(pid, path)
+    _metadata_cid = store.store_metadata(pid, syspath, format_id)
+    metadata_stream = store.retrieve_metadata(pid, format_id)
+    metadata_content = metadata_stream.read().decode("utf-8")
+    metadata_stream.close()
+    metadata = syspath.read_bytes()
+    assert metadata.decode("utf-8") == metadata_content


-def test_retrieve_sysmeta_pid_invalid(store):
-    """Test retrieve_sysmeta raises error when supplied with bad pid."""
+def test_retrieve_metadata_default_format_id(store):
+    """Test retrieve_metadata retrieves expected metadata with default format_id."""
+    test_dir = "tests/testdata/"
+    pid = "jtao.1700.1"
+    path = test_dir + pid
+    filename = pid + ".xml"
+    syspath = Path(test_dir) / filename
+    _object_metadata = store.store_object(pid, path)
+    _metadata_cid = store.store_metadata(pid, syspath)
+    metadata_stream = store.retrieve_metadata(pid)
+    metadata_content = metadata_stream.read().decode("utf-8")
+    metadata_stream.close()
+    metadata = syspath.read_bytes()
+    assert metadata.decode("utf-8") == metadata_content
+
+
+def test_retrieve_metadata_bytes_pid_invalid(store):
+    """Test retrieve_metadata raises error when supplied with bad pid."""
+    format_id = "http://ns.dataone.org/service/types/v2.0"
     pid = "jtao.1700.1"
     pid_does_not_exist = pid + "test"
     with pytest.raises(ValueError):
-        store.retrieve_sysmeta(pid_does_not_exist)
+        store.retrieve_metadata(pid_does_not_exist, format_id)


-def test_retrieve_sysmeta_pid_empty(store):
-    """Test retrieve_sysmeta raises error when supplied with empty pid."""
+def test_retrieve_metadata_bytes_pid_empty(store):
+    """Test retrieve_metadata raises error when supplied with empty pid."""
+    format_id = "http://ns.dataone.org/service/types/v2.0"
     pid = " "
     with pytest.raises(ValueError):
-        store.retrieve_sysmeta(pid)
+        store.retrieve_metadata(pid, format_id)
+
+
+def test_retrieve_metadata_format_id_empty(store):
+    """Test retrieve_metadata raises error when supplied with empty format_id."""
+    format_id = ""
+    pid = "jtao.1700.1"
+    with pytest.raises(ValueError):
+        store.retrieve_metadata(pid, format_id)
+
+
+def test_retrieve_metadata_format_id_empty_spaces(store):
+    """Test retrieve_metadata raises error when supplied with empty spaces format_id."""
+    format_id = " "
+    pid = "jtao.1700.1"
+    with pytest.raises(ValueError):
+        store.retrieve_metadata(pid, format_id)


 def test_delete_objects(pids, store):
     """Test delete_object successfully deletes objects."""
     test_dir = "tests/testdata/"
     entity = "objects"
+    format_id = "http://ns.dataone.org/service/types/v2.0"
     for pid in pids.keys():
         path = test_dir + pid.replace("/", "_")
         filename = pid.replace("/", "_") + ".xml"
         syspath = Path(test_dir) / filename
-        _hash_address = store.store_object(pid, path)
-        _ab_id = store.store_sysmeta(pid, syspath)
+        _object_metadata = store.store_object(pid, path)
+        _metadata_cid = store.store_metadata(pid, syspath, format_id)
         store.delete_object(pid)
     assert store.count(entity) == 0

@@ -613,43 +830,69 @@ def test_delete_object_pid_none(store):
         store.delete_object(pid)


-def test_delete_sysmeta(pids, store):
-    """Test delete_sysmeta successfully deletes sysmeta."""
+def test_delete_metadata(pids, store):
+    """Test delete_metadata successfully deletes metadata."""
+    test_dir = "tests/testdata/"
+    entity = "metadata"
+    format_id = "http://ns.dataone.org/service/types/v2.0"
+    for pid in pids.keys():
+        path = test_dir + pid.replace("/", "_")
+        filename = pid.replace("/", "_") + ".xml"
+        syspath = Path(test_dir) / filename
+        _object_metadata = store.store_object(pid, path)
+        _metadata_cid = store.store_metadata(pid, syspath, format_id)
+        store.delete_metadata(pid, format_id)
+    assert store.count(entity) == 0
+
+
+def test_delete_metadata_default_format_id(store, pids):
+    """Test delete_metadata deletes successfully with default format_id."""
     test_dir = "tests/testdata/"
-    entity = "sysmeta"
+    entity = "metadata"
     for pid in pids.keys():
         path = test_dir + pid.replace("/", "_")
         filename = pid.replace("/", "_") + ".xml"
         syspath = Path(test_dir) / filename
-        _hash_address = store.store_object(pid, path)
-        _ab_id = store.store_sysmeta(pid, syspath)
-        store.delete_sysmeta(pid)
+        _object_metadata = store.store_object(pid, path)
+        _metadata_cid = store.store_metadata(pid, syspath)
+        store.delete_metadata(pid)
     assert store.count(entity) == 0


-def test_delete_sysmeta_pid_empty(store):
+def test_delete_metadata_pid_empty(store):
     """Test delete_object raises error when empty pid supplied."""
+    format_id = "http://ns.dataone.org/service/types/v2.0"
     pid = " "
     with pytest.raises(ValueError):
-        store.delete_sysmeta(pid)
+        store.delete_metadata(pid, format_id)


-def test_delete_sysmeta_pid_none(store):
+def test_delete_metadata_pid_none(store):
     """Test delete_object raises error when pid is 'None'."""
+    format_id = "http://ns.dataone.org/service/types/v2.0"
     pid = None
     with pytest.raises(ValueError):
-        store.delete_sysmeta(pid)
+        store.delete_metadata(pid, format_id)
+
+
+def test_delete_metadata_format_id_empty(store):
+    """Test delete_metadata raises error when empty format_id supplied."""
+    format_id = " "
+    pid = "jtao.1700.1"
+    with pytest.raises(ValueError):
+        store.delete_metadata(pid, format_id)


 def test_get_hex_digest(store):
     """Test get_hex_digest for expected value."""
     test_dir = "tests/testdata/"
+    format_id = "http://ns.dataone.org/service/types/v2.0"
     pid = "jtao.1700.1"
     path = test_dir + pid
     filename = pid + ".xml"
     syspath = Path(test_dir) / filename
-    _hash_address = store.store_object(pid, path)
-    _ab_id = store.store_sysmeta(pid, syspath)
+    _object_metadata = store.store_object(pid, path)
+    _metadata_cid = store.store_metadata(pid, syspath, format_id)
     sha3_256_hex_digest = (
         "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf"
     )
@@ -674,7 +917,7 @@ def test_get_hex_digest_pid_unsupported_algorithm(store):
     filename = pid + ".xml"
     syspath = Path(test_dir) / filename
     syspath.read_bytes()
-    _hash_address = store.store_object(pid, path)
+    _object_metadata = store.store_object(pid, path)
     algorithm = "sm3"
     with pytest.raises(ValueError):
         store.get_hex_digest(pid, algorithm)
diff --git a/tests/filehashstore/test_stream.py b/tests/test_filehashstore_stream.py
similarity index 96%
rename from tests/filehashstore/test_stream.py
rename to tests/test_filehashstore_stream.py
index a60960e7..8cf4a7d0 100644
--- a/tests/filehashstore/test_stream.py
+++ b/tests/test_filehashstore_stream.py
@@ -3,7 +3,7 @@
 import io
 from pathlib import Path
 import pytest
-from hashstore.filehashstore.filehashstore import Stream
+from hashstore.filehashstore import Stream


 def test_stream_reads_file(pids):
diff --git a/tests/test_hashaddress.py b/tests/test_hashaddress.py
deleted file mode 100644
index b7ea6971..00000000
--- a/tests/test_hashaddress.py
+++ /dev/null
@@ -1,27 +0,0 @@
-"""Test module for HashAddress"""
-from hashstore.hashaddress import HashAddress
-
-
-def test_hashaddress():
-    """Test class returns correct values via dot notation."""
-    ab_id = "hashstoretest"
-    rel_path = "rel/path/to/object"
-    abs_path = "abs/path/to/object"
-    is_duplicate = "false"
-    hex_digest_dict = {
-        "md5": "md5value",
-        "sha1": "sha1value",
-        "sha224": "sha224value",
-        "sha256": "sha256value",
-        "sha512": "sha512value",
-    }
-    hash_address = HashAddress(ab_id, rel_path, abs_path, is_duplicate, hex_digest_dict)
-    assert hash_address.id == ab_id
-    assert hash_address.relpath == rel_path
-    assert hash_address.abspath == abs_path
-    assert hash_address.is_duplicate == is_duplicate
-    assert hash_address.hex_digests.get("md5") == hex_digest_dict["md5"]
-    assert hash_address.hex_digests.get("sha1") == hex_digest_dict["sha1"]
-    assert hash_address.hex_digests.get("sha224") == hex_digest_dict["sha224"]
-    assert hash_address.hex_digests.get("sha256") == hex_digest_dict["sha256"]
hex_digest_dict["sha256"] - assert hash_address.hex_digests.get("sha512") == hex_digest_dict["sha512"] diff --git a/tests/test_hashstore.py b/tests/test_hashstore.py index 8bd6b44f..68cd195a 100644 --- a/tests/test_hashstore.py +++ b/tests/test_hashstore.py @@ -1,7 +1,8 @@ -"""Test module for HashStore (and HashStoreFactory)""" +"""Test module for HashStore Module""" +import os import pytest -from hashstore.filehashstore.filehashstore import FileHashStore -from hashstore.hashstore_factory import HashStoreFactory +from hashstore.hashstore import ObjectMetadata, HashStoreFactory +from hashstore.filehashstore import FileHashStore @pytest.fixture(name="factory") @@ -18,7 +19,7 @@ def test_init(factory): def test_factory_get_hashstore_filehashstore(factory, props): """Check factory creates instance of FileHashStore.""" - module_name = "hashstore.filehashstore.filehashstore" + module_name = "hashstore.filehashstore" class_name = "FileHashStore" # These props can be found in tests/conftest.py store = factory.get_hashstore(module_name, class_name, props) @@ -28,7 +29,7 @@ def test_factory_get_hashstore_filehashstore(factory, props): def test_factory_get_hashstore_unsupported_class(factory): """Check that AttributeError is raised when provided with unsupported class.""" with pytest.raises(AttributeError): - module_name = "hashstore.filehashstore.filehashstore" + module_name = "hashstore.filehashstore" class_name = "S3HashStore" factory.get_hashstore(module_name, class_name) @@ -36,6 +37,59 @@ def test_factory_get_hashstore_unsupported_class(factory): def test_factory_get_hashstore_unsupported_module(factory): """Check that ModuleNotFoundError is raised when provided with unsupported module.""" with pytest.raises(ModuleNotFoundError): - module_name = "hashstore.s3filestore.s3filestore" + module_name = "hashstore.s3filestore" class_name = "FileHashStore" factory.get_hashstore(module_name, class_name) + + +def test_factory_get_hashstore_filehashstore_unsupported_algorithm(factory): + """Check factory raises exception with store algorithm value that part of the default list""" + module_name = "hashstore.filehashstore" + class_name = "FileHashStore" + + properties = { + "store_path": os.getcwd() + "/metacat/test", + "store_depth": 3, + "store_width": 2, + "store_algorithm": "MD2", + "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", + } + with pytest.raises(ValueError): + factory.get_hashstore(module_name, class_name, properties) + + +def test_factory_get_hashstore_filehashstore_incorrect_algorithm_format(factory): + """Check factory raises exception with incorrectly formatted algorithm value""" + module_name = "hashstore.filehashstore" + class_name = "FileHashStore" + + properties = { + "store_path": os.getcwd() + "/metacat/test", + "store_depth": 3, + "store_width": 2, + "store_algorithm": "sha256", + "store_metadata_namespace": "http://ns.dataone.org/service/types/v2.0", + } + with pytest.raises(ValueError): + factory.get_hashstore(module_name, class_name, properties) + + +def test_objectmetadata(): + """Test class returns correct values via dot notation.""" + ab_id = "hashstoretest" + obj_size = 1234 + hex_digest_dict = { + "md5": "md5value", + "sha1": "sha1value", + "sha224": "sha224value", + "sha256": "sha256value", + "sha512": "sha512value", + } + object_metadata = ObjectMetadata(ab_id, obj_size, hex_digest_dict) + assert object_metadata.id == ab_id + assert object_metadata.obj_size == obj_size + assert object_metadata.hex_digests.get("md5") == hex_digest_dict["md5"] + assert 
object_metadata.hex_digests.get("sha1") == hex_digest_dict["sha1"] + assert object_metadata.hex_digests.get("sha224") == hex_digest_dict["sha224"] + assert object_metadata.hex_digests.get("sha256") == hex_digest_dict["sha256"] + assert object_metadata.hex_digests.get("sha512") == hex_digest_dict["sha512"] diff --git a/tests/test_hashstore_client.py b/tests/test_hashstore_client.py new file mode 100644 index 00000000..7d73e524 --- /dev/null +++ b/tests/test_hashstore_client.py @@ -0,0 +1,242 @@ +"""Test module for the Python client (Public API calls only)""" +import sys +import os +from pathlib import Path +from hashstore import client + + +def test_create_hashstore(tmp_path): + """Test creating a HashStore through the client.""" + client_directory = os.getcwd() + "/src/hashstore" + client_module_path = f"{client_directory}/client.py" + client_test_store = f"{tmp_path}/clienths" + create_hashstore_opt = "-chs" + store_depth = "-dp=3" + store_width = "-wp=2" + store_algorithm = "-ap=SHA-256" + store_namespace = "-nsp=http://www.ns.test/v1" + chs_args = [ + client_module_path, + client_test_store, + create_hashstore_opt, + store_depth, + store_width, + store_algorithm, + store_namespace, + ] + + # Add file path of HashStore to sys so modules can be discovered + sys.path.append(client_directory) + # Manually change sys args to simulate command line arguments + sys.argv = chs_args + client.main() + + hashstore_yaml = Path(client_test_store + "/hashstore.yaml") + hashstore_object_path = Path(client_test_store + "/objects") + hashstore_metadata_path = Path(client_test_store + "/metadata") + hashstore_client_python_log = Path(client_test_store + "/python_client.log") + assert os.path.exists(hashstore_yaml) + assert os.path.exists(hashstore_object_path) + assert os.path.exists(hashstore_metadata_path) + assert os.path.exists(hashstore_client_python_log) + + +def test_store_object(store, pids): + """Test storing objects to HashStore through client.""" + client_directory = os.getcwd() + "/src/hashstore" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + client_module_path = f"{client_directory}/client.py" + test_store = store.root + store_object_opt = "-storeobject" + client_pid_arg = f"-pid={pid}" + path = f'-path={test_dir + pid.replace("/", "_")}' + chs_args = [ + client_module_path, + test_store, + store_object_opt, + client_pid_arg, + path, + ] + + # Add file path of HashStore to sys so modules can be discovered + sys.path.append(client_directory) + # Manually change sys args to simulate command line arguments + sys.argv = chs_args + client.main() + + assert store.exists("objects", pids[pid]["object_cid"]) + + +def test_store_metadata(store, pids): + """Test storing metadata to HashStore through client.""" + client_directory = os.getcwd() + "/src/hashstore" + test_dir = "tests/testdata/" + namespace = "http://ns.dataone.org/service/types/v2.0" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + client_module_path = f"{client_directory}/client.py" + test_store = store.root + store_metadata_opt = "-storemetadata" + client_pid_arg = f"-pid={pid}" + path = f"-path={syspath}" + format_id = f"-formatid={namespace}" + chs_args = [ + client_module_path, + test_store, + store_metadata_opt, + client_pid_arg, + path, + format_id, + ] + + # Add file path of HashStore to sys so modules can be discovered + sys.path.append(client_directory) + # Manually change 
+        sys.argv = chs_args
+        client.main()
+
+        assert store.exists("metadata", pids[pid]["metadata_cid"])
+
+
+def test_retrieve_objects(capsys, pids, store):
+    """Test retrieving objects from a HashStore through client."""
+    client_directory = os.getcwd() + "/src/hashstore"
+    test_dir = "tests/testdata/"
+    for pid in pids.keys():
+        path = test_dir + pid.replace("/", "_")
+        _object_metadata = store.store_object(pid, path)
+
+        client_module_path = f"{client_directory}/client.py"
+        test_store = store.root
+        delete_object_opt = "-retrieveobject"
+        client_pid_arg = f"-pid={pid}"
+        chs_args = [
+            client_module_path,
+            test_store,
+            delete_object_opt,
+            client_pid_arg,
+        ]
+
+        # Add file path of HashStore to sys so modules can be discovered
+        sys.path.append(client_directory)
+        # Manually change sys args to simulate command line arguments
+        sys.argv = chs_args
+        client.main()
+
+        object_stream = store.retrieve_object(pid)
+        object_content = (
+            object_stream.read(1000).decode("utf-8")
+            + "\n"
+            + "...\n<-- Truncated for Display Purposes -->"
+            + "\n"
+        )
+        object_stream.close()
+
+        capsystext = capsys.readouterr().out
+        assert capsystext == object_content
+
+
+def test_retrieve_metadata(capsys, pids, store):
+    """Test retrieving metadata from a HashStore through client."""
+    client_directory = os.getcwd() + "/src/hashstore"
+    test_dir = "tests/testdata/"
+    namespace = "http://ns.dataone.org/service/types/v2.0"
+    for pid in pids.keys():
+        filename = pid.replace("/", "_") + ".xml"
+        syspath = Path(test_dir) / filename
+        _metadata_cid = store.store_metadata(pid, syspath, namespace)
+
+        client_module_path = f"{client_directory}/client.py"
+        test_store = store.root
+        retrieve_metadata_opt = "-retrievemetadata"
+        client_pid_arg = f"-pid={pid}"
+        format_id = f"-formatid={namespace}"
+        chs_args = [
+            client_module_path,
+            test_store,
+            retrieve_metadata_opt,
+            client_pid_arg,
+            format_id,
+        ]
+
+        # Add file path of HashStore to sys so modules can be discovered
+        sys.path.append(client_directory)
+        # Manually change sys args to simulate command line arguments
+        sys.argv = chs_args
+        client.main()
+
+        metadata_stream = store.retrieve_metadata(pid, namespace)
+        metadata_content = (
+            metadata_stream.read(1000).decode("utf-8")
+            + "\n"
+            + "...\n<-- Truncated for Display Purposes -->"
+            + "\n"
+        )
+        metadata_stream.close()
+
+        capsystext = capsys.readouterr().out
+        assert capsystext == metadata_content
+
+
+def test_delete_objects(pids, store):
+    """Test deleting objects from a HashStore through client."""
+    client_directory = os.getcwd() + "/src/hashstore"
+    test_dir = "tests/testdata/"
+    for pid in pids.keys():
+        path = test_dir + pid.replace("/", "_")
+        _object_metadata = store.store_object(pid, path)
+
+        client_module_path = f"{client_directory}/client.py"
+        test_store = store.root
+        delete_object_opt = "-deleteobject"
+        client_pid_arg = f"-pid={pid}"
+        chs_args = [
+            client_module_path,
+            test_store,
+            delete_object_opt,
+            client_pid_arg,
+        ]
+
+        # Add file path of HashStore to sys so modules can be discovered
+        sys.path.append(client_directory)
+        # Manually change sys args to simulate command line arguments
+        sys.argv = chs_args
+        client.main()
+
+        assert not store.exists("objects", pids[pid]["object_cid"])
+
+
+def test_delete_metadata(pids, store):
+    """Test deleting metadata from a HashStore through client."""
+    client_directory = os.getcwd() + "/src/hashstore"
+    test_dir = "tests/testdata/"
+    namespace = "http://ns.dataone.org/service/types/v2.0"
+    for pid in pids.keys():
+        filename = pid.replace("/", "_") + ".xml"
+        syspath = Path(test_dir) / filename
+        _metadata_cid = store.store_metadata(pid, syspath, namespace)
+
+        client_module_path = f"{client_directory}/client.py"
+        test_store = store.root
+        delete_metadata_opt = "-deletemetadata"
+        client_pid_arg = f"-pid={pid}"
+        format_id = f"-formatid={namespace}"
+        chs_args = [
+            client_module_path,
+            test_store,
+            delete_metadata_opt,
+            client_pid_arg,
+            format_id,
+        ]
+
+        # Add file path of HashStore to sys so modules can be discovered
+        sys.path.append(client_directory)
+        # Manually change sys args to simulate command line arguments
+        sys.argv = chs_args
+        client.main()
+
+        assert not store.exists("metadata", pids[pid]["metadata_cid"])