fix(clone): Better error handling for clones (#162)
* fix(clone): Better handling of exports to encord

* add chunker

* fixy fix

* types

* remove http check

* comments
yogesh-encord authored Feb 10, 2023
1 parent 2c9e04c commit 231c5e9
Showing 4 changed files with 86 additions and 56 deletions.
65 changes: 34 additions & 31 deletions src/encord_active/app/actions_page/export_filter.py
@@ -128,6 +128,26 @@ def _get_columns(needs_ontology: bool, num_rows: int) -> RenderItems:
    return RenderItems(*[_get_column(col, item, num_rows) for item, col in zip(items_to_render, form_columns)])


def _get_project():
    try:
        action_utils = EncordActions(get_state().project_paths.project_dir, app_config.get_ssh_key())
        return action_utils, bool(action_utils.original_project)
    except ProjectNotFound as e:
        st.markdown(
            f"""
            ❌ No `project_meta.yaml` file in the project folder.
            Please create `project_meta.yaml` file in **{e.project_dir}** folder with the following content
            and try again:
            ``` yaml
            project_hash: <project_hash>
            ssh_key_path: /path/to/your/encord/ssh_key
            ```
            """
        )
    except Exception as e:
        st.error(str(e))


def export_filter():
    get_filtered_row_count, set_filtered_row_count = use_state(0)
    get_clone_button, set_clone_button = use_state(False)
@@ -136,7 +156,7 @@ def export_filter():
    message_placeholder = st.empty()

    st.header("Filter & Export")

    action_utils, has_original_project = _get_project()
    filtered_df = filter_dataframe(get_state().merged_metrics.copy())
    filtered_df.reset_index(inplace=True)
    row_count = filtered_df.shape[0]
@@ -188,16 +208,21 @@ def export_filter():
        help="Ensure you have generated an updated COCO file before downloading",
    )

    if get_filtered_row_count() != row_count:
        set_filtered_row_count(row_count)
        set_clone_button(False)

    action_columns[3].button(
        "🏗 Clone",
        "🏗 Clone" if has_original_project else "🏗 Export to Encord",
        on_click=lambda: set_clone_button(True),
        disabled=get_filtered_row_count() != row_count,
        help="Clone the filtered data into a new Encord dataset and project",
    )
    delete_btn = action_columns[4].button("👀 Review", help="Assign the filtered data for review on the Encord platform")
    edit_btn = action_columns[5].button(
        "🖋 Re-label", help="Assign the filtered data for relabelling on the Encord platform"
    )
    augment_btn = action_columns[6].button("➕ Augment", help="Augment your dataset based on the filered data")
    augment_btn = action_columns[6].button("➕ Augment", help="Augment your dataset based on the filtered data")

    if any([delete_btn, edit_btn, augment_btn]):
        set_clone_button(False)
@@ -214,32 +239,7 @@ def export_filter():
            unsafe_allow_html=True,
        )

    prev_row_count = get_filtered_row_count()
    if prev_row_count != row_count:
        set_filtered_row_count(row_count)
        set_clone_button(False)

    if get_clone_button():
        try:
            action_utils = EncordActions(get_state().project_paths.project_dir, app_config.get_ssh_key())
            has_original_project = bool(action_utils.original_project)
        except ProjectNotFound as e:
            st.markdown(
                f"""
                ❌ No `project_meta.yaml` file in the project folder.
                Please create `project_meta.yaml` file in **{e.project_dir}** folder with the following content
                and try again:
                ``` yaml
                project_hash: <project_hash>
                ssh_key_path: /path/to/your/encord/ssh_key
                ```
                """
            )
            return
        except Exception as e:
            st.error(str(e))
            return

        with st.form("new_project_form"):
            st.subheader("Create a new project with the selected items")

@@ -269,9 +269,12 @@ def export_filter():
                dataset_creation_result, cols.project.title, cols.project.description, ontology_hash, progress
            )

            action_utils.replace_uids(
                dataset_creation_result.lr_du_mapping, new_project.project_hash, dataset_creation_result.hash
            )
            try:
                action_utils.replace_uids(
                    dataset_creation_result.lr_du_mapping, new_project.project_hash, dataset_creation_result.hash
                )
            except Exception as e:
                st.error(str(e))
            clear()
            label.info("🎉 New project is created!")

16 changes: 15 additions & 1 deletion src/encord_active/lib/common/utils.py
@@ -6,7 +6,17 @@
from concurrent.futures import as_completed
from itertools import product
from pathlib import Path
from typing import Any, Collection, Dict, List, Optional, Tuple, TypedDict, Union
from typing import (
    Any,
    Collection,
    Dict,
    List,
    Optional,
    Sequence,
    Tuple,
    TypedDict,
    Union,
)

import av
import cv2
@@ -428,3 +438,7 @@ def download_file(
        f.flush()

    return destination


def iterate_in_batches(seq: Sequence, size: int):
    return (seq[pos : pos + size] for pos in range(0, len(seq), size))
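
For context, a quick usage sketch of the new batching helper added above (the import path matches the one used in actions.py below; the sample list and batch size are made up):

```python
from encord_active.lib.common.utils import iterate_in_batches

items = list(range(7))
for batch in iterate_in_batches(items, size=3):
    print(batch)
# [0, 1, 2]
# [3, 4, 5]
# [6]
```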
4 changes: 2 additions & 2 deletions src/encord_active/lib/db/merged_metrics.py
@@ -114,8 +114,8 @@ def replace_all(self, df: pd.DataFrame):
    def replace_identifiers(self, mappings: dict[str, str]):
        def _replace_identifiers(id: str):
            lr, du, *rest = id.split("_")
            mappedlr, mappeddu = mappings[lr], mappings[du]
            return "_".join([mappedlr, mappeddu, *rest])
            mapped_lr, mapped_du = mappings.get(lr, lr), mappings.get(du, du)
            return "_".join([mapped_lr, mapped_du, *rest])

        with DBConnection() as conn:
            df = pd.read_sql(f"SELECT * FROM {TABLE_NAME}", conn, index_col="identifier")
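
To see what the `.get(..., default)` change does, here is the inner helper in isolation with made-up hashes: mapped parts of an identifier are rewritten, while unmapped parts now pass through unchanged instead of raising `KeyError`:

```python
mappings = {"old-lr": "new-lr", "old-du": "new-du"}  # made-up hashes

def _replace_identifiers(id: str):
    lr, du, *rest = id.split("_")
    mapped_lr, mapped_du = mappings.get(lr, lr), mappings.get(du, du)
    return "_".join([mapped_lr, mapped_du, *rest])

print(_replace_identifiers("old-lr_old-du_3"))      # new-lr_new-du_3
print(_replace_identifiers("other-lr_other-du_3"))  # other-lr_other-du_3 (left unchanged)
```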
57 changes: 35 additions & 22 deletions src/encord_active/lib/encord/actions.py
@@ -13,7 +13,7 @@
from encord.utilities.label_utilities import construct_answer_dictionaries
from tqdm import tqdm

from encord_active.lib.common.utils import fetch_project_meta
from encord_active.lib.common.utils import fetch_project_meta, iterate_in_batches
from encord_active.lib.db.merged_metrics import MergedMetrics
from encord_active.lib.embeddings.utils import (
    LabelEmbedding,
@@ -257,42 +257,55 @@ def create_project(

    def update_embedding_identifiers(self, embedding_type: EmbeddingType, renaming_map: dict[str, str]):
        def _update_identifiers(embedding: LabelEmbedding, renaming_map: dict[str, str]):
            embedding["label_row"] = renaming_map[embedding["label_row"]]
            embedding["data_unit"] = renaming_map[embedding["data_unit"]]
            url_without_extension, extension = embedding["url"].split(".")
            changed_parts = [renaming_map[x] if x in renaming_map else x for x in url_without_extension.split("/")]
            embedding["url"] = "/".join(changed_parts) + "." + extension
            old_lr, old_du = embedding["label_row"], embedding["data_unit"]
            new_lr, new_du = renaming_map.get(old_lr, old_lr), renaming_map.get(old_du, old_du)
            embedding["label_row"] = new_lr
            embedding["data_unit"] = new_du
            embedding["url"] = embedding["url"].replace(old_du, new_du).replace(old_lr, new_lr)
            return embedding

        collection = load_collections(embedding_type, self.project_file_structure.embeddings)
        updated_collection = [_update_identifiers(up, renaming_map) for up in collection]
        save_collections(embedding_type, self.project_file_structure.embeddings, updated_collection)
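
With made-up hashes and a made-up URL, the new `_update_identifiers` logic behaves roughly like this; the URL is now rewritten by substring replacement rather than by splitting on `.`:

```python
renaming_map = {"old-lr": "new-lr", "old-du": "new-du"}  # made-up hashes
embedding = {"label_row": "old-lr", "data_unit": "old-du", "url": "data/old-lr/images/old-du.jpg"}

old_lr, old_du = embedding["label_row"], embedding["data_unit"]
new_lr, new_du = renaming_map.get(old_lr, old_lr), renaming_map.get(old_du, old_du)
embedding["url"] = embedding["url"].replace(old_du, new_du).replace(old_lr, new_lr)
print(embedding["url"])  # data/new-lr/images/new-du.jpg
```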

    def _rename_files(self, file_mappings: dict[LabelRowDataUnit, LabelRowDataUnit]):
        for (old_lr, old_du), (new_lr, new_du) in file_mappings.items():
            old_lr_path = self.project_file_structure.data / old_lr
            new_lr_path = self.project_file_structure.data / new_lr
            if old_lr_path.exists() and not new_lr_path.exists():
                old_lr_path.rename(new_lr_path)
            for old_du_f in new_lr_path.glob(f"**/{old_du}.*"):
                old_du_f.rename(new_lr_path / "images" / f"{new_du}.{old_du_f.suffix}")

    def _replace_in_files(self, renaming_map):
        for subs in iterate_in_batches(renaming_map.items(), 100):
            substitutions = ";".join(f"s/{old}/{new}/g" for old, new in subs)
            cmd = f" find . -type f \( -iname \*.json -o -iname \*.yaml -o -iname \*.csv \) -exec sed -i '' '{substitutions}' {{}} +"
            subprocess.run(cmd, shell=True, cwd=self.project_file_structure.project_dir)
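
To get a feel for what `_replace_in_files` feeds to `sed`, here is the substitution expression built for a hypothetical two-entry batch (hashes made up):

```python
subs = [("aaa111", "bbb222"), ("ccc333", "ddd444")]  # made-up old/new hash pairs

substitutions = ";".join(f"s/{old}/{new}/g" for old, new in subs)
print(substitutions)  # s/aaa111/bbb222/g;s/ccc333/ddd444/g
# This expression is spliced into the find/sed command above and applied to every
# .json, .yaml and .csv file under the project directory, up to 100 substitutions per call.
```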

    def replace_uids(
        self, file_mappings: dict[LabelRowDataUnit, LabelRowDataUnit], project_hash: str, dataset_hash: str
    ):
        label_row_meta = json.loads(self.project_file_structure.label_row_meta.read_text(encoding="utf-8"))
        original_dataset_hash = next(iter(label_row_meta.values()))["dataset_hash"]

        renaming_map = {self.project_meta["project_hash"]: project_hash, original_dataset_hash: dataset_hash}

        for (old_lr, old_du), (new_lr, new_du) in file_mappings.items():
            old_lr_path = self.project_file_structure.data / old_lr
            for old_du_f in old_lr_path.glob(f"**/{old_du}.*"):
                old_du_f.rename(old_lr_path / "images" / f"{new_du}.{old_du_f.suffix}")
            renaming_map[old_lr], renaming_map[old_du] = new_lr, new_du

        dir_renames = {old_lr: new_lr for (old_lr, old_du), (new_lr, new_du) in file_mappings.items()}
        for (old_lr, new_lr) in dir_renames.items():
            (self.project_file_structure.data / old_lr).rename(self.project_file_structure.data / new_lr)

        MergedMetrics().replace_identifiers(renaming_map)
        for old, new in renaming_map.items():
            cmd = f" find . -type f \( -iname \*.json -o -iname \*.yaml -o -iname \*.csv \) -exec sed -i '' 's/{old}/{new}/g' {{}} +"
            subprocess.run(cmd, shell=True, cwd=self.project_file_structure.project_dir)

        for embedding_type in [EmbeddingType.IMAGE, EmbeddingType.CLASSIFICATION, EmbeddingType.OBJECT]:
            self.update_embedding_identifiers(embedding_type, renaming_map)
        try:
            self._rename_files(file_mappings)
            self._replace_in_files(renaming_map)
            MergedMetrics().replace_identifiers(renaming_map)
            for embedding_type in [EmbeddingType.IMAGE, EmbeddingType.CLASSIFICATION, EmbeddingType.OBJECT]:
                self.update_embedding_identifiers(embedding_type, renaming_map)
        except Exception as e:
            rev_renaming_map = {v: k for k, v in renaming_map.items()}
            self._rename_files({v: k for k, v in file_mappings.items()})
            self._replace_in_files(rev_renaming_map)
            MergedMetrics().replace_identifiers(rev_renaming_map)
            for embedding_type in [EmbeddingType.IMAGE, EmbeddingType.CLASSIFICATION, EmbeddingType.OBJECT]:
                self.update_embedding_identifiers(embedding_type, rev_renaming_map)
            raise Exception("UID replacement failed")


def _find_new_row_hash(user_client: EncordUserClient, new_dataset_hash: str, out_mapping: dict) -> Optional[str]:
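
As a rough picture of the data `replace_uids` works over (all hashes below are invented): `file_mappings` pairs old `(label_row, data_unit)` tuples with their new counterparts, and `renaming_map` flattens those, plus the project and dataset hashes, into a single old-to-new lookup that the rename, replace, and rollback steps share:

```python
file_mappings = {  # made-up hashes
    ("old-lr-1", "old-du-1"): ("new-lr-1", "new-du-1"),
    ("old-lr-2", "old-du-2"): ("new-lr-2", "new-du-2"),
}
renaming_map = {"old-project-hash": "new-project-hash", "old-dataset-hash": "new-dataset-hash"}
for (old_lr, old_du), (new_lr, new_du) in file_mappings.items():
    renaming_map[old_lr], renaming_map[old_du] = new_lr, new_du

# On failure, inverting both structures ({v: k for k, v in ...}) lets the same helpers undo the renames.
```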
