-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdataset_sanity_check.py
46 lines (36 loc) · 1.36 KB
/
dataset_sanity_check.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import os
import glob
import argparse
import shutil
def count_files(directory, extension):
    """Count files with the given extension anywhere under ``directory``.

    The tree rooted at ``directory`` is walked recursively with ``os.walk``
    and every regular file entry whose name matches ``*.<extension>`` is
    counted.

    Args:
        directory: Root directory to search.
        extension: File extension without the leading dot (e.g. ``"pt"``).
            It is inserted into a shell-style pattern, so wildcard
            characters in it are interpreted by ``fnmatch``.

    Returns:
        The number of matching files. A missing or unreadable ``directory``
        yields ``0`` because ``os.walk`` ignores listing errors by default.
    """
    # Imported locally so the block does not depend on a new file-level import.
    import fnmatch

    pattern = "*." + extension
    total_count = 0
    # os.walk already lists the files of every directory, so match against
    # that list instead of re-reading each directory through the private,
    # deprecated glob.glob1 helper. Using `filenames` also keeps
    # sub-directories whose name happens to match the pattern out of the count.
    for _dirpath, _dirnames, filenames in os.walk(directory):
        total_count += sum(
            1
            for name in fnmatch.filter(filenames, pattern)
            # Glob semantics: "*" never matches a leading dot, so hidden
            # files stay excluded exactly as before.
            if not name.startswith(".")
        )
    return total_count
def count_files_in_subfolders(directory, extension):
    """Return per-subfolder counts of files with the given extension.

    Only the immediate children of ``directory`` that are directories become
    keys; plain files directly inside ``directory`` are skipped. Each value
    is the recursive count reported by ``count_files`` for that child.

    Args:
        directory: Directory whose immediate subfolders are inspected.
        extension: File extension without the leading dot.

    Returns:
        A dict mapping subfolder name (not full path) to its file count.
    """
    return {
        name: count_files(os.path.join(directory, name), extension)
        for name in os.listdir(directory)
        if os.path.isdir(os.path.join(directory, name))
    }
# Directory holding one subfolder per transformed dataset.
data_path = './tmp'
# Extension (without the dot) of the files each dataset folder must contain.
file_extension = 'pt'
# Minimum number of matching files for a dataset folder to count as complete.
expected_size = 7


def main(argv=None):
    """Report (and optionally delete) dataset folders with too few files.

    Every immediate subfolder of ``data_path`` is checked: if it holds fewer
    than ``expected_size`` files with extension ``file_extension``, it is
    reported on stdout and, when ``--remove`` is given, deleted recursively.
    If no folder is short, ``All good!`` is printed.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Raises:
        SystemExit: If ``data_path`` is not an existing directory, or on
            argparse errors / ``--help``.
    """
    parser = argparse.ArgumentParser(
        # Previously a bare string statement with no effect; as the parser
        # description it now shows up in --help.
        description=(
            "Counts for all transformed datasets, the number of .pt files "
            "to easily see if some transformations failed."
        )
    )
    parser.add_argument("-r", "--remove", action="store_true",
                        help="Remove directories with missing files")
    args = parser.parse_args(argv)

    # Fail with a readable message instead of a FileNotFoundError traceback.
    if not os.path.isdir(data_path):
        raise SystemExit(f"Data directory {data_path!r} does not exist")

    all_good = True
    counts = count_files_in_subfolders(data_path, file_extension)
    for folder, count in counts.items():
        if count >= expected_size:
            continue
        all_good = False
        if args.remove:
            print(f"Found missing files at {folder} ({count}), removing...")
            shutil.rmtree(os.path.join(data_path, folder))
        else:
            print(f"Found missing files at {folder} ({count})")

    if all_good:
        print("All good!")


# Guarded so importing this module never parses the importer's command line
# or deletes anything as a side effect.
if __name__ == "__main__":
    main()