Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Use uuid for filenames #33

Merged
merged 1 commit into from
Mar 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 8 additions & 12 deletions src/image2structure/collect.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import shutil
import tarfile
import time
import uuid

from .runner import Runner
from .run_specs import _RUNNER_REGISTRY
Expand Down Expand Up @@ -259,7 +260,7 @@ def run(runner: Runner, args: argparse.Namespace) -> None:
done: bool = False
for compilation_result in accepted_results:
category: str = compilation_result.category
num_id: int = 0
file_name: str = str(uuid.uuid4())
if category not in num_instances_collected:
# First time we collect this category
# Create the directories
Expand All @@ -268,30 +269,25 @@ def run(runner: Runner, args: argparse.Namespace) -> None:
os.path.join(output_path, category, dir), exist_ok=True
)
num_instances_collected[category] = 0
else:
# Increment the number of instances collected
num_id = num_files_in_dir(
os.path.join(output_path, category, "metadata")
)

# Copy shared metadata to compiled metadata
compiled_metadata: Dict[str, Any] = {
**metadata,
"assets": compilation_result.assets_path,
"category": category,
"num_id": num_id,
"uuid": file_name,
}

# Save the metadata
instance_metadata_path: str = os.path.join(
output_path, category, "metadata", f"{num_id}.json"
output_path, category, "metadata", f"{file_name}.json"
)
with open(instance_metadata_path, "w") as f:
json.dump(compiled_metadata, f, indent=4)

# Save the image
instance_image_path: str = os.path.join(
output_path, category, "images", f"{num_id}.png"
output_path, category, "images", f"{file_name}.png"
)
shutil.copy(compilation_result.rendering_path, instance_image_path)

Expand All @@ -306,7 +302,7 @@ def run(runner: Runner, args: argparse.Namespace) -> None:

# Save the text
instance_text_path: str = os.path.join(
output_path, category, "text", f"{num_id}.txt"
output_path, category, "text", f"{file_name}.txt"
)
with open(instance_text_path, "w") as f:
f.write(compilation_result.text)
Expand All @@ -318,7 +314,7 @@ def run(runner: Runner, args: argparse.Namespace) -> None:
else ""
)
instance_structure_path: str = os.path.join(
output_path, category, "structures", f"{num_id}{extension}"
output_path, category, "structures", f"{file_name}{extension}"
)
if os.path.isdir(compilation_result.data_path):
# First delete all files that we do not want to include
Expand All @@ -345,7 +341,7 @@ def run(runner: Runner, args: argparse.Namespace) -> None:
assert category in num_instances_collected
num_instances_collected[category] += 1
runner.compiler.acknowledge_compilation(category)
print(f"Instance number {num_id} of category {category} collected")
print(f"Instance {file_name} of category {category} collected")

done = True
for category in num_instances_collected.keys():
Expand Down
20 changes: 12 additions & 8 deletions src/image2structure/upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,17 +110,19 @@ def main():
# list of encoded strings and stored in the column "assets"

# Figure out the extension of the structure files
file_name: str = os.listdir(structure_path)[0]
extension: str = os.path.splitext(file_name)[-1]
if file_name.endswith(".tar.gz"):
first_file_name: str = os.listdir(structure_path)[0]
extension: str = os.path.splitext(first_file_name)[-1]
if first_file_name.endswith(".tar.gz"):
extension = ".tar.gz"

# Load the structure
df: pd.DataFrame = pd.DataFrame()
structure_set = set()
file_names: List[str] = os.listdir(structure_path)
for i in tqdm(range(num_data_points), desc="Loading data"):
try:
structure_file = os.path.join(structure_path, f"{i}{extension}")
file_name: str = file_names[i].replace(extension, "")
structure_file = os.path.join(structure_path, f"{file_name}{extension}")
structure: str
if extension == ".tar.gz" or extension == ".zip":
structure = load_archive(structure_file)
Expand All @@ -129,9 +131,9 @@ def main():
if structure in structure_set:
continue
structure_set.add(structure)
text: str = load_file(os.path.join(text_path, f"{i}.txt"))
image = os.path.join(image_path, f"{i}.png")
metadata = os.path.join(metadata_path, f"{i}.json")
text: str = load_file(os.path.join(text_path, f"{file_name}.txt"))
image = os.path.join(image_path, f"{file_name}.png")
metadata = os.path.join(metadata_path, f"{file_name}.json")
df = pd.concat(
[
df,
Expand All @@ -146,7 +148,9 @@ def main():
]
)
except FileNotFoundError as e:
print(f"Skipping {i} as it is missing one of the required files: {e}")
print(
f"Skipping {file_name} as it is missing one of the required files: {e}"
)
continue

# Remove duplicates
Expand Down