diff --git a/app/transcription.py b/app/transcription.py
index 9a209a9..354de20 100644
--- a/app/transcription.py
+++ b/app/transcription.py
@@ -40,12 +40,17 @@ def __init__(
         username=None,
         test_mode=False,
         working_dir=None,
-        batch_preprocessing_output=False
+        batch_preprocessing_output=False,
+        needs_review=False,
     ):
+        self.test_mode = test_mode
         self.logger = get_logger()
         self.tmp_dir = working_dir if working_dir is not None else tempfile.mkdtemp()
         self.transcript_by = "username" if test_mode else self.__get_username()
+        # during testing we need to create the markdown for validation purposes
+        self.markdown = markdown or test_mode
+        self.review_flag = self.__configure_review_flag(needs_review)
         self.open_pr = pr
         if deepgram:
             self.service = services.Deepgram(
@@ -57,10 +62,7 @@ def __init__(
         self.nocleanup = nocleanup
         # during testing we do not have/need a queuer backend
         self.queuer = Queuer(test_mode=test_mode) if queue is True else None
-        # during testing we need to create the markdown for validation purposes
-        self.markdown = markdown or test_mode
         self.existing_media = None
-        self.test_mode = test_mode
         self.preprocessing_output = [] if batch_preprocessing_output else None
 
         self.logger.info(f"Temp directory: {self.tmp_dir}")
@@ -71,6 +73,17 @@ def _create_subdirectory(self, subdir_name):
         os.makedirs(subdir_path)
         return subdir_path
 
+    def __configure_review_flag(self, needs_review):
+        # sanity check
+        if needs_review and not self.markdown:
+            raise Exception(
+                "The `--needs-review` flag is only applicable when creating a markdown")
+
+        if needs_review:
+            return " --needs-review"
+        else:
+            return ""
+
     def __get_username(self):
         try:
             if os.path.isfile(".username"):
@@ -284,7 +297,7 @@ def write_to_markdown_file(self, transcript: Transcript, output_dir):
         meta_data = (
             "---\n"
             f'title: "{transcript.title}"\n'
-            f"transcript_by: {self.transcript_by} via TBTBTC v{__version__}\n"
+            f"transcript_by: {self.transcript_by} via tstbtc v{__version__}{self.review_flag}\n"
         )
         if not transcript.source.local:
             meta_data += f"media: {transcript.source.media}\n"
@@ -310,7 +323,7 @@ def write_to_json_file(self, transcript: Transcript):
         self.logger.info("Creating JSON file with transcription...")
         output_dir = f"{self.model_output_dir}/{transcript.source.loc}"
         transcript_json = transcript.to_json()
-        transcript_json["transcript_by"] = f"{self.transcript_by} via TBTBTC v{__version__}"
+        transcript_json["transcript_by"] = f"{self.transcript_by} via tstbtc v{__version__}"
         json_file = utils.write_to_json(
             transcript_json,
             output_dir,
@@ -338,7 +351,7 @@ def postprocess(self, transcript: Transcript):
                 )
             elif not self.test_mode:
                 transcript_json = transcript.to_json()
-                transcript_json["transcript_by"] = f"{self.transcript_by} via TBTBTC v{__version__}"
+                transcript_json["transcript_by"] = f"{self.transcript_by} via tstbtc v{__version__}"
                 if self.queuer:
                     return self.queuer.push_to_queue(transcript_json)
                 else:
diff --git a/test/testAssets/payload.json b/test/testAssets/payload.json
index c008213..c44dd08 100644
--- a/test/testAssets/payload.json
+++ b/test/testAssets/payload.json
@@ -1,7 +1,7 @@
 {
     "content": {
         "title": "test_title",
-        "transcript_by": "username via TBTBTC v1.0.0",
+        "transcript_by": "username via tstbtc v1.0.0",
         "categories": ["category1", "category2"],
         "tags": [],
         "speakers": ["speaker1", "speaker2"],
diff --git a/test/test_helpers.py b/test/test_helpers.py
index bbebcb6..686cc95 100644
--- a/test/test_helpers.py
+++ b/test/test_helpers.py
@@ -35,7 +35,7 @@ def check_md_file(
         if x.startswith("##"):
             detected_chapters.append(x[3:].strip())
 
-    assert fields["transcript_by"] == f"{transcript_by} via TBTBTC v{application.__version__}"
+    assert fields["transcript_by"] == f"{transcript_by} via tstbtc v{application.__version__}"
     if not local:
         assert fields["media"] == media
diff --git a/test/test_video.py b/test/test_video.py
index 0e7b4af..2751c10 100644
--- a/test/test_video.py
+++ b/test/test_video.py
@@ -148,7 +148,7 @@ def test_generate_payload():
         source_file=source, loc=loc, title=title, date=date, tags=tags, category=category, speakers=speakers)
     transcription.start(test_transcript=transcript)
     transcript_json = transcription.transcripts[0].to_json()
-    transcript_json["transcript_by"] = f"{username} via TBTBTC v{__version__}"
+    transcript_json["transcript_by"] = f"{username} via tstbtc v{__version__}"
     payload = {
         "content": transcript_json
     }
diff --git a/transcriber.py b/transcriber.py
index 114c47b..8f51a87 100644
--- a/transcriber.py
+++ b/transcriber.py
@@ -114,6 +114,12 @@ def print_help(ctx, param, value):
     default=False,
     help="Do not push the resulting transcript to the Queuer backend",
 )
+needs_review = click.option(
+    "--needs-review",
+    is_flag=True,
+    default=False,
+    help="Add 'needs review' flag to the resulting transcript",
+)
 model_output_dir = click.option(
     "-o",
     "--model_output_dir",
@@ -193,6 +199,7 @@ def print_help(ctx, param, value):
 @upload_to_s3
 @save_to_markdown
 @noqueue
+@needs_review
 # Configuration options
 @model_output_dir
 @nocleanup
@@ -215,7 +222,8 @@ def transcribe(
     model_output_dir: str,
     nocleanup: bool,
     noqueue: bool,
-    markdown: bool
+    markdown: bool,
+    needs_review: bool,
 ) -> None:
     """Transcribe the provided sources. Suported sources include: \n
     - YouTube videos and playlists\n
@@ -246,6 +254,7 @@ def transcribe(
             nocleanup=nocleanup,
             queue=not noqueue,
             markdown=markdown,
+            needs_review=needs_review,
             working_dir=tmp_dir
         )
         if source.endswith(".json"):
@@ -348,6 +357,7 @@ def preprocess(
 @upload_to_s3
 @save_to_markdown
 @noqueue
+@needs_review
 def postprocess(
     metadata_json_file,
     service,
@@ -355,6 +365,7 @@ def postprocess(
     upload: bool,
     markdown: bool,
     noqueue: bool,
+    needs_review: bool,
 ):
     """Postprocess the output of a transcription service.
     Requires the metadata JSON file that is the output of the previous stage
@@ -363,15 +374,16 @@ def postprocess(
     try:
         configure_logger(log_level=logging.INFO)
         utils.check_if_valid_file_path(metadata_json_file)
-        logger.info(
-            f"Postprocessing {service} transcript from {metadata_json_file}")
         transcription = Transcription(
             deepgram=service == "deepgram",
             pr=pr,
             upload=upload,
             markdown=markdown,
             queue=not noqueue,
+            needs_review=needs_review,
         )
+        logger.info(
+            f"Postprocessing {service} transcript from {metadata_json_file}")
         with open(metadata_json_file, "r") as outfile:
             metadata_json = json.load(outfile)
             metadata = utils.configure_metadata_given_from_JSON(metadata_json)