Skip to content

Commit

Permalink
Use uuid for filenames (#33)
Browse files (browse the repository at this point in the history)
  • Loading branch information
JosselinSomervilleRoberts authored Mar 15, 2024
1 parent 28b0f83 commit feb6848
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 20 deletions.
20 changes: 8 additions & 12 deletions src/image2structure/collect.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -9,6 +9,7 @@
import shutil
import tarfile
import time
import uuid

from .runner import Runner
from .run_specs import _RUNNER_REGISTRY
Expand Down Expand Up @@ -259,7 +260,7 @@ def run(runner: Runner, args: argparse.Namespace) -> None:
done: bool = False
for compilation_result in accepted_results:
category: str = compilation_result.category
num_id: int = 0
file_name: str = str(uuid.uuid4())
if category not in num_instances_collected:
# First time we collect this category
# Create the directories
Expand All @@ -268,30 +269,25 @@ def run(runner: Runner, args: argparse.Namespace) -> None:
os.path.join(output_path, category, dir), exist_ok=True
)
num_instances_collected[category] = 0
else:
# Increment the number of instances collected
num_id = num_files_in_dir(
os.path.join(output_path, category, "metadata")
)

# Copy shared metadata to compiled metadata
compiled_metadata: Dict[str, Any] = {
**metadata,
"assets": compilation_result.assets_path,
"category": category,
"num_id": num_id,
"uuid": file_name,
}

# Save the metadata
instance_metadata_path: str = os.path.join(
output_path, category, "metadata", f"{num_id}.json"
output_path, category, "metadata", f"{file_name}.json"
)
with open(instance_metadata_path, "w") as f:
json.dump(compiled_metadata, f, indent=4)

# Save the image
instance_image_path: str = os.path.join(
output_path, category, "images", f"{num_id}.png"
output_path, category, "images", f"{file_name}.png"
)
shutil.copy(compilation_result.rendering_path, instance_image_path)

Expand All @@ -306,7 +302,7 @@ def run(runner: Runner, args: argparse.Namespace) -> None:

# Save the text
instance_text_path: str = os.path.join(
output_path, category, "text", f"{num_id}.txt"
output_path, category, "text", f"{file_name}.txt"
)
with open(instance_text_path, "w") as f:
f.write(compilation_result.text)
Expand All @@ -318,7 +314,7 @@ def run(runner: Runner, args: argparse.Namespace) -> None:
else ""
)
instance_structure_path: str = os.path.join(
output_path, category, "structures", f"{num_id}{extension}"
output_path, category, "structures", f"{file_name}{extension}"
)
if os.path.isdir(compilation_result.data_path):
# First delete all files that we do not want to include
Expand All @@ -345,7 +341,7 @@ def run(runner: Runner, args: argparse.Namespace) -> None:
assert category in num_instances_collected
num_instances_collected[category] += 1
runner.compiler.acknowledge_compilation(category)
print(f"Instance number {num_id} of category {category} collected")
print(f"Instance {file_name} of category {category} collected")

done = True
for category in num_instances_collected.keys():
Expand Down
20 changes: 12 additions & 8 deletions src/image2structure/upload.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -110,17 +110,19 @@ def main():
# list of encoded strings and stored in the column "assets"

# Figure out the extension of the structure files
file_name: str = os.listdir(structure_path)[0]
extension: str = os.path.splitext(file_name)[-1]
if file_name.endswith(".tar.gz"):
first_file_name: str = os.listdir(structure_path)[0]
extension: str = os.path.splitext(first_file_name)[-1]
if first_file_name.endswith(".tar.gz"):
extension = ".tar.gz"

# Load the structure
df: pd.DataFrame = pd.DataFrame()
structure_set = set()
file_names: List[str] = os.listdir(structure_path)
for i in tqdm(range(num_data_points), desc="Loading data"):
try:
structure_file = os.path.join(structure_path, f"{i}{extension}")
file_name: str = file_names[i].replace(extension, "")
structure_file = os.path.join(structure_path, f"{file_name}{extension}")
structure: str
if extension == ".tar.gz" or extension == ".zip":
structure = load_archive(structure_file)
Expand All @@ -129,9 +131,9 @@ def main():
if structure in structure_set:
continue
structure_set.add(structure)
text: str = load_file(os.path.join(text_path, f"{i}.txt"))
image = os.path.join(image_path, f"{i}.png")
metadata = os.path.join(metadata_path, f"{i}.json")
text: str = load_file(os.path.join(text_path, f"{file_name}.txt"))
image = os.path.join(image_path, f"{file_name}.png")
metadata = os.path.join(metadata_path, f"{file_name}.json")
df = pd.concat(
[
df,
Expand All @@ -146,7 +148,9 @@ def main():
]
)
except FileNotFoundError as e:
print(f"Skipping {i} as it is missing one of the required files: {e}")
print(
f"Skipping {file_name} as it is missing one of the required files: {e}"
)
continue

# Remove duplicates
Expand Down

0 comments on commit feb6848

Please sign in to comment.