From 91660a13480fb5e6e1a3f364466bad9c2238d1bf Mon Sep 17 00:00:00 2001 From: Alexander Gutkin Date: Fri, 10 Jan 2025 09:49:53 +0000 Subject: [PATCH] Glyph prediction from sketches: Results' post-processing tool. PiperOrigin-RevId: 713961954 --- protoscribe/evolution/stages/common_flags.py | 10 ++ .../stages/glyphs_from_jsonl_main.py | 53 +------ .../stages/sketches_from_jsonl_main.py | 146 ++++++++++++++++++ protoscribe/evolution/stages/utils.py | 57 +++++++ 4 files changed, 217 insertions(+), 49 deletions(-) create mode 100644 protoscribe/evolution/stages/sketches_from_jsonl_main.py create mode 100644 protoscribe/evolution/stages/utils.py diff --git a/protoscribe/evolution/stages/common_flags.py b/protoscribe/evolution/stages/common_flags.py index 253daf2..be48b63 100644 --- a/protoscribe/evolution/stages/common_flags.py +++ b/protoscribe/evolution/stages/common_flags.py @@ -48,6 +48,16 @@ "Type of the phonetic model to use." ) +SEMANTICS_XID = flags.DEFINE_string( + "semantics_xid", None, + "XManager job ID for the inference run with semantics model for this round." +) + +PHONETICS_XID = flags.DEFINE_string( + "phonetics_xid", None, + "XManager job ID for the inference run with phonetics model for this round." +) + def experiment_dir() -> str: """Returns fully-qualified experiment directory path.""" diff --git a/protoscribe/evolution/stages/glyphs_from_jsonl_main.py b/protoscribe/evolution/stages/glyphs_from_jsonl_main.py index 30b4ab1..049d545 100644 --- a/protoscribe/evolution/stages/glyphs_from_jsonl_main.py +++ b/protoscribe/evolution/stages/glyphs_from_jsonl_main.py @@ -24,7 +24,7 @@ from absl import app from absl import flags from protoscribe.evolution.stages import common_flags -from protoscribe.utils import file_utils +from protoscribe.evolution.stages import utils from protoscribe.utils import subprocess_utils _EXPERIMENT_NAME = flags.DEFINE_string( @@ -34,16 +34,6 @@ required=True ) -_SEMANTICS_XID = flags.DEFINE_string( - "semantics_xid", None, - "XManager job ID for the inference run with semantics model for this round." -) - -_PHONETICS_XID = flags.DEFINE_string( - "phonetics_xid", None, - "XManager job ID for the inference run with phonetics model for this round." -) - _JSONL_FILE_NAME_GLYPHS = flags.DEFINE_string( "jsonl_file_name_glyphs", None, "File name used for storing the outputs of glyph inference.", @@ -59,41 +49,6 @@ _MODE = "glyph" -def _setup_inference_directories( - round_data_dir: str, - experiment_name: str, - experiment_id: str | None -) -> str: - """Sets up the directory for storing the post-processed inference outputs. - - Args: - round_data_dir: Data directory for this round. - experiment_name: Symbol name for the experiment. - experiment_id: XManager Job ID (integer string). - - Returns: - Output directory where postprocessed results will be stored. - - Raises: - ValueError if output directory could not be determined. - """ - output_dir = os.path.join( - round_data_dir, f"{experiment_name}:inference_outputs" - ) - if experiment_id: - output_dir = os.path.join(output_dir, experiment_id) - else: - experiment_dirs = file_utils.list_subdirs(output_dir) - if not experiment_dirs: - raise ValueError( - f"No inference experiment directories found under {output_dir}!" 
-            )
-        output_dir = experiment_dirs[-1]
-
-    logging.info("Reading and writing output data to %s ...", output_dir)
-    return output_dir
-
-
 def _glyphs_for_model_type(
     round_data_dir: str, model_type: str, experiment_id: str
 ) -> None:
@@ -108,7 +63,7 @@ def _glyphs_for_model_type(
     experiment_name = (
         f"{_EXPERIMENT_NAME.value}:{round_id}:{_MODE}_{model_type}"
     )
-    output_dir = _setup_inference_directories(
+    output_dir = utils.setup_inference_directories(
         round_data_dir=round_data_dir,
         experiment_name=experiment_name,
         experiment_id=experiment_id
@@ -137,13 +92,13 @@ def main(argv: Sequence[str]) -> None:
     _glyphs_for_model_type(
         round_data_dir=round_data_dir,
         model_type=common_flags.SEMANTIC_MODEL.value,
-        experiment_id=_SEMANTICS_XID.value
+        experiment_id=common_flags.SEMANTICS_XID.value
     )
     # Post-process inference results for the phonetic stream.
     _glyphs_for_model_type(
         round_data_dir=round_data_dir,
         model_type=common_flags.PHONETIC_MODEL.value,
-        experiment_id=_PHONETICS_XID.value
+        experiment_id=common_flags.PHONETICS_XID.value
     )
 
 
diff --git a/protoscribe/evolution/stages/sketches_from_jsonl_main.py b/protoscribe/evolution/stages/sketches_from_jsonl_main.py
new file mode 100644
index 0000000..101fe31
--- /dev/null
+++ b/protoscribe/evolution/stages/sketches_from_jsonl_main.py
@@ -0,0 +1,146 @@
+# Copyright 2024 The Protoscribe Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Stage-specific helper for postprocessing sketch inference results.
+
+This tool is intended to be used on outputs of the recognizer run over the
+sketch generator. In other words, the outputs are produced by the following
+pipeline: [inputs] -> model_a -> [sketches] -> model_b -> [glyphs, sketches].
+"""
+
+from collections.abc import Sequence
+import logging
+import os
+import tempfile
+
+from absl import app
+from absl import flags
+from protoscribe.evolution.stages import common_flags
+from protoscribe.evolution.stages import utils
+from protoscribe.utils import file_utils
+from protoscribe.utils import subprocess_utils
+
+_MODE = flags.DEFINE_enum(
+    "mode", "sketch-token",
+    [
+        "sketch-token",
+        "sketch-token-and-glyph",
+    ],
+    "Type of sketch model. Can be 'sketch-token' for pure sketch generation or "
+    "'sketch-token-and-glyph' for combined glyph and sketch prediction. "
+    "This is a prefix part of the model configuration in 'configs' directory."
+)
+
+_EXPERIMENT_NAME = flags.DEFINE_string(
+    "experiment_name", None,
+    "An experiment name which will define the directory in which the "
+    "evolving system data is placed.",
+    required=True
+)
+
+_JSONL_FILE_NAME_SKETCHES_RECO = flags.DEFINE_string(
+    "jsonl_file_name_sketches_reco", None,
+    "File name used for storing the outputs of the recognizer run over "
+    "generated sketches.",
+    required=True
+)
+
+# Actual inference post-processing tool.
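+# It consumes the JSONL file produced during inference, deduplicates the
+# inputs, prunes numbers and writes the final TSV/JSONL results along with
+# the rendered sketch images (see the flags passed to it below).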
+_SKETCHES_TOOL = ( + "protoscribe/sketches/inference/sketches_from_jsonl" +) + + +def _sketches_and_glyphs_for_model_type( + round_data_dir: str, model_type: str, experiment_id: str +) -> None: + """Run sketch and glyph extraction from results for a given model type. + + Args: + round_data_dir: Data directory for this round. + model_type: Type of the model. + experiment_id: XManager job ID. + """ + + # Figure out directory for the outputs. + round_id = common_flags.ROUND.value + experiment_name = ( + f"{_EXPERIMENT_NAME.value}:{round_id}:{_MODE.value}_{model_type}" + ) + output_dir = utils.setup_inference_directories( + round_data_dir=round_data_dir, + experiment_name=f"{experiment_name}:reco", + experiment_id=experiment_id + ) + jsonl_file = os.path.join(output_dir, _JSONL_FILE_NAME_SKETCHES_RECO.value) + + # Run the post-processing tool. + if _MODE.value == "sketch-token": + # Outputs generated by the recognizer. + mode_flag = "--recognizer_json" + else: + # Joint sketch-and-glyph generator model. + mode_flag = "--combined_glyphs_and_strokes" + + with tempfile.TemporaryDirectory() as temp_dir_name: + subprocess_utils.run_subprocess( + _SKETCHES_TOOL, + args=[ + "--dataset_dir", round_data_dir, + "--input_jsonl_file", jsonl_file, + "--dedup_inputs", True, + "--skip_plots", False, + "--prune_numbers", True, + "--output_dir", temp_dir_name, # Images/SVGs. + "--output_tsv_file", f"{output_dir}/results.tsv", + "--output_file_for_scorer", f"{output_dir}/results.jsonl", + "--ignore_errors", True, + mode_flag, True, + ] + ) + + # Place the resulting sketches along with the rest of results. + images_dir = os.path.join(output_dir, "images") + logging.info("Copying sketches to %s ...", images_dir) + if not os.path.exists(images_dir): + os.makedirs(images_dir, exist_ok=True) + file_utils.copy_dir(temp_dir_name, images_dir) + + +def main(argv: Sequence[str]) -> None: + if len(argv) > 1: + raise app.UsageError("Too many command-line arguments.") + + round_data_dir = common_flags.round_data_dir() + logging.info("Using data location: %s", round_data_dir) + + # Post-process inference results for the semantic stream. + _sketches_and_glyphs_for_model_type( + round_data_dir=round_data_dir, + model_type=common_flags.SEMANTIC_MODEL.value, + experiment_id=common_flags.SEMANTICS_XID.value + ) + # Post-process inference results for the phonetic stream. + _sketches_and_glyphs_for_model_type( + round_data_dir=round_data_dir, + model_type=common_flags.PHONETIC_MODEL.value, + experiment_id=common_flags.PHONETICS_XID.value + ) + + +if __name__ == "__main__": + app.run(main) diff --git a/protoscribe/evolution/stages/utils.py b/protoscribe/evolution/stages/utils.py new file mode 100644 index 0000000..681c67c --- /dev/null +++ b/protoscribe/evolution/stages/utils.py @@ -0,0 +1,57 @@ +# Copyright 2024 The Protoscribe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+"""Miscellaneous stage-specific helpers."""
+
+import logging
+import os
+
+from protoscribe.utils import file_utils
+
+
+def setup_inference_directories(
+    round_data_dir: str,
+    experiment_name: str,
+    experiment_id: str | None
+) -> str:
+    """Sets up the directory for storing the post-processed inference outputs.
+
+    Args:
+        round_data_dir: Data directory for this round.
+        experiment_name: Symbolic name for the experiment.
+        experiment_id: XManager Job ID (integer string).
+
+    Returns:
+        Output directory where the post-processed results will be stored.
+
+    Raises:
+        ValueError: If the output directory could not be determined.
+    """
+    output_dir = os.path.join(
+        round_data_dir, f"{experiment_name}:inference_outputs"
+    )
+    if experiment_id:
+        output_dir = os.path.join(output_dir, experiment_id)
+    else:
+        experiment_dirs = file_utils.list_subdirs(output_dir)
+        if not experiment_dirs:
+            raise ValueError(
+                f"No inference experiment directories found under {output_dir}!"
+            )
+        # TODO: We should probably either return the latest created
+        # directory *or* not allow multiple directories at all.
+        output_dir = experiment_dirs[-1]
+
+    logging.info("Reading and writing output data to %s ...", output_dir)
+    return output_dir
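
For reference, a minimal usage sketch of the shared setup_inference_directories() helper introduced above (the paths, experiment name and job ID are illustrative, not taken from the patch):

    from protoscribe.evolution.stages import utils

    # Mirrors how sketches_from_jsonl_main.py composes the experiment name:
    # "<experiment_name>:<round>:<mode>_<model_type>:reco".
    output_dir = utils.setup_inference_directories(
        round_data_dir="/tmp/protoscribe/round_1",
        experiment_name="my_system:1:sketch-token_semantics:reco",
        experiment_id="12345678"  # XManager job ID; pass None to pick up an
                                  # already existing inference run directory.
    )
    # -> /tmp/protoscribe/round_1/my_system:1:sketch-token_semantics:reco:inference_outputs/12345678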