Skip to content

Commit

Permalink
Merge pull request #79 from DataONEorg/bug-75-missing-pidrefs
Browse files Browse the repository at this point in the history
Bug-75: Missing Pid Reference Files
  • Loading branch information
doulikecookiedough authored Dec 30, 2023
2 parents 1d4c820 + 66a9990 commit 29cdbe8
Show file tree
Hide file tree
Showing 2 changed files with 88 additions and 23 deletions.
32 changes: 23 additions & 9 deletions src/hashstore/filehashstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -541,29 +541,42 @@ def tag_object(self, pid, cid):
try:
pid_ref_abs_path = self.get_refs_abs_path("pid", pid)
cid_ref_abs_path = self.get_refs_abs_path("cid", cid)
# Ensure refs tmp folder exists
tmp_root_path = self.get_store_path("refs") / "tmp"
if os.path.exists(tmp_root_path) is False:
self.create_path(tmp_root_path)

# Proceed to tagging process
if os.path.exists(pid_ref_abs_path):
print("Path exists:\n")
print(pid_ref_abs_path)
# A pid reference file can only contain one cid
exception_string = (
"FileHashStore - write_pid_refs_file: pid ref file already exists for %s",
pid_ref_abs_path,
"FileHashStore - write_pid_refs_file: pid ref file already exists for"
+ pid_ref_abs_path
)
logging.error(exception_string)
raise FileExistsError(exception_string)
elif os.path.exists(cid_ref_abs_path):
# Create the pid refs file
pid_tmp_file = self._mktmpfile(tmp_root_path)
pid_tmp_file_path = pid_tmp_file.name
self._write_pid_refs_file(pid_tmp_file_path, cid)
# Create path for pid ref file in '.../refs/pid'
self.create_path(os.path.dirname(pid_ref_abs_path))
shutil.move(pid_tmp_file_path, pid_ref_abs_path)
# Update cid ref files if it already exists
self._update_cid_refs(cid_ref_abs_path, pid)
# Verify refs file content
self._verify_hashstore_references(pid, cid)
logging.info(
"FileHashStore - tag_object: Successfully updated cid: %s with pid: %s",
cid,
pid,
)
return True
else:
# All ref files begin as tmp files and get moved sequentially at once
# Ensure refs tmp folder exists
tmp_root_path = self.get_store_path("refs") / "tmp"
if os.path.exists(tmp_root_path) is False:
self.create_path(tmp_root_path)

# Then write pid_refs_file content into tmp file
pid_tmp_file = self._mktmpfile(tmp_root_path)
pid_tmp_file_path = pid_tmp_file.name
Expand Down Expand Up @@ -1227,11 +1240,12 @@ def _update_cid_refs(self, cid_ref_abs_path, pid):
for _, line in enumerate(f, start=1):
value = line.strip()
if pid == value:
err_msg = (
warning_msg = (
f"FileHashStore - update_cid_refs: pid ({pid}) already reference in"
+ f" cid reference file: {cid_ref_abs_path} "
)
raise ValueError(err_msg)
logging.warning(warning_msg)
return

with open(cid_ref_abs_path, "a+", encoding="utf8") as cid_ref_file:
fcntl.flock(cid_ref_file, fcntl.LOCK_EX)
Expand Down
79 changes: 65 additions & 14 deletions tests/test_filehashstore_references.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,20 +96,70 @@ def test_tag_object_cid_refs_file_exists(pids, store):
assert not os.path.exists(second_cid_hash)


def test_tag_object_cid_refs_update_cid_refs_updated(store):
    """Test tag object updates a cid reference file that already exists.

    Stores one object, tags it with a first pid, then tags the same cid with
    a second pid. The cid refs file must afterwards list both pids and
    nothing else.
    """
    test_dir = "tests/testdata/"
    pid = "jtao.1700.1"
    path = test_dir + pid.replace("/", "_")
    # Store data only
    object_metadata = store.store_object(None, path)
    cid = object_metadata.id
    # Tag object
    store.tag_object(pid, cid)
    # Tag the cid with another pid
    additional_pid = "dou.test.1"
    store.tag_object(additional_pid, cid)

    # Read cid file to confirm cid refs file contains the additional pid
    cid_ref_abs_path = store.get_refs_abs_path("cid", cid)
    with open(cid_ref_abs_path, "r", encoding="utf8") as f:
        cid_ref_file_pids = [line.strip() for line in f]

    # Checking each line individually would also pass for an empty file or
    # one that never received the additional pid, so require both pids to be
    # present and no unexpected entries.
    assert set(cid_ref_file_pids) == {pid, additional_pid}


def test_tag_object_cid_refs_update_pid_refs_created(store):
    """Check that tagging an already-tagged cid with a second pid writes a
    pid reference file for that second pid."""
    first_pid = "jtao.1700.1"
    second_pid = "dou.test.1"
    data_path = "tests/testdata/" + first_pid.replace("/", "_")

    # Store the data without a pid, then tag the resulting cid twice
    cid = store.store_object(None, data_path).id
    store.tag_object(first_pid, cid)
    store.tag_object(second_pid, cid)

    # The second tag call must have produced its own pid refs file
    assert os.path.exists(store.get_refs_abs_path("pid", second_pid))


def test_tag_object_cid_refs_update_pid_found_but_file_missing(store):
    """Test that tag_object creates a missing pid refs file that somehow disappeared
    when called to tag a cid that already contains the pid."""
    test_dir = "tests/testdata/"
    pid = "jtao.1700.1"
    path = test_dir + pid.replace("/", "_")
    object_metadata = store.store_object(None, path)
    cid = object_metadata.id
    store.tag_object(pid, cid)
    # Manually update the cid refs, pid refs file missing at this point
    additional_pid = "dou.test.1"
    cid_ref_abs_path = store.get_refs_abs_path("cid", cid)
    store._update_cid_refs(cid_ref_abs_path, additional_pid)

    # Confirm the pid refs file is missing
    pid_refs_file_path = store.get_refs_abs_path("pid", additional_pid)
    assert not os.path.exists(pid_refs_file_path)

    # Call tag_object, this should create the missing pid refs file
    store.tag_object(additional_pid, cid)

    # Confirm it has been created
    assert os.path.exists(pid_refs_file_path)


def test_verify_object(pids, store):
Expand Down Expand Up @@ -278,14 +328,15 @@ def test_update_cid_refs_content_multiple(pids, store):


def test_update_cid_refs_content_pid_exists(pids, store):
    """Test that update_cid_ref does not throw exception if pid already exists
    and leaves the cid refs file without a duplicate entry."""
    for pid in pids.keys():
        cid = pids[pid]["sha256"]
        cid_ref_abs_path = store.get_refs_abs_path("cid", cid)
        store.create_path(os.path.dirname(cid_ref_abs_path))
        store._write_cid_refs_file(cid_ref_abs_path, pid)
        # Exception should not be thrown
        store._update_cid_refs(cid_ref_abs_path, pid)

        # The update is expected to return early when the pid is already
        # listed, so the pid must still appear exactly once.
        with open(cid_ref_abs_path, "r", encoding="utf8") as f:
            matching_lines = [line for line in f if line.strip() == pid]
        assert len(matching_lines) == 1


def test_update_cid_refs_content_cid_refs_does_not_exist(pids, store):
Expand Down

0 comments on commit 29cdbe8

Please sign in to comment.