Skip to content

Commit

Permalink
Merge pull request #102 from DataONEorg/bug-101-tmp-refsfiles
Browse files Browse the repository at this point in the history
BugFix-101: `cid refs file` does not get cleaned up
  • Loading branch information
doulikecookiedough authored May 30, 2024
2 parents ad55464 + cf21ee2 commit bf904c1
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 6 deletions.
8 changes: 4 additions & 4 deletions src/hashstore/filehashstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -604,10 +604,6 @@ def tag_object(self, pid, cid):
tmp_root_path = self._get_store_path("refs") / "tmp"
pid_refs_path = self._resolve_path("pid", pid)
cid_refs_path = self._resolve_path("cid", cid)
# All ref files begin as tmp files and get moved sequentially at once
# Get tmp files with the expected cid and pid refs content
pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid")
cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid")
# Create paths for pid ref file in '.../refs/pid' and cid ref file in '.../refs/cid'
self._create_path(os.path.dirname(pid_refs_path))
self._create_path(os.path.dirname(cid_refs_path))
Expand Down Expand Up @@ -635,6 +631,7 @@ def tag_object(self, pid, cid):

if self._is_string_in_refs_file(cid, pid_refs_path):
# The pid correctly references the given cid, but the cid refs file is missing
cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid")
shutil.move(cid_tmp_file_path, cid_refs_path)
self._verify_hashstore_references(
pid,
Expand Down Expand Up @@ -676,6 +673,7 @@ def tag_object(self, pid, cid):
)
logging.debug(debug_msg)
# Move the pid refs file
pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid")
shutil.move(pid_tmp_file_path, pid_refs_path)
# Update cid ref files as it already exists
if not self._is_string_in_refs_file(pid, cid_refs_path):
Expand All @@ -695,6 +693,8 @@ def tag_object(self, pid, cid):
return True

# Move both files after checking the existing status of refs files
pid_tmp_file_path = self._write_refs_file(tmp_root_path, cid, "pid")
cid_tmp_file_path = self._write_refs_file(tmp_root_path, pid, "cid")
shutil.move(pid_tmp_file_path, pid_refs_path)
shutil.move(cid_tmp_file_path, cid_refs_path)
log_msg = "Reference files have been moved to their permanent location. Verifying refs."
Expand Down
57 changes: 55 additions & 2 deletions tests/test_filehashstore_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -503,8 +503,8 @@ def store_object_wrapper(obj_pid, obj_path):
# implement a multiprocessing test


def test_store_object_threads_multiple_pids_one_cid(pids, store):
"""Test store object thread lock."""
def test_store_object_threads_multiple_pids_one_cid_content(pids, store):
"""Test store object thread lock and refs files content"""
entity = "objects"
test_dir = "tests/testdata/"
path = test_dir + "jtao.1700.1"
Expand Down Expand Up @@ -550,6 +550,59 @@ def store_object_wrapper(obj_pid, obj_path):
assert number_of_pids_reffed == 6


def test_store_object_threads_multiple_pids_one_cid_files(store):
"""Test store object with threads produces the expected amount of files"""
test_dir = "tests/testdata/"
path = test_dir + "jtao.1700.1"
pid_list = ["jtao.1700.1"]
for n in range(0, 5):
pid_list.append(f"dou.test.{n}")

def store_object_wrapper(obj_pid, obj_path):
store.store_object(obj_pid, obj_path) # Call store_object inside the thread

thread1 = Thread(target=store_object_wrapper, args=(pid_list[0], path))
thread2 = Thread(target=store_object_wrapper, args=(pid_list[1], path))
thread3 = Thread(target=store_object_wrapper, args=(pid_list[2], path))
thread4 = Thread(target=store_object_wrapper, args=(pid_list[3], path))
thread5 = Thread(target=store_object_wrapper, args=(pid_list[4], path))
thread6 = Thread(target=store_object_wrapper, args=(pid_list[5], path))
thread1.start()
thread2.start()
thread3.start()
thread4.start()
thread5.start()
thread6.start()
thread1.join()
thread2.join()
thread3.join()
thread4.join()
thread5.join()
thread6.join()

# Confirm that tmp files do not remain in refs
def folder_has_files(folder_path):
# Iterate over directory contents
for _, _, files in os.walk(folder_path):
if files: # If there are any files in the folder
print(files)
return True
return False

# Confirm that tmp files do not remain in refs
def get_number_of_files(folder_path):
# Iterate over directory contents
file_count = 0
for _, _, files in os.walk(folder_path):
if files: # If there are any files in the folder
file_count += len(files)
return file_count

assert get_number_of_files(store.refs + "/pids") == 6
assert get_number_of_files(store.refs + "/cids") == 1
assert folder_has_files(store.refs + "/tmp") is False


@slow_test
def test_store_object_interrupt_process(store):
"""Test that tmp file created when storing a large object (2GB) and
Expand Down

0 comments on commit bf904c1

Please sign in to comment.