Skip to content

Commit

Permalink
add --needs-review flag
Browse files Browse the repository at this point in the history
This flag is part of the new queueing pipeline
  • Loading branch information
kouloumos committed Dec 14, 2023
1 parent 522f353 commit 5fcb9f9
Show file tree
Hide file tree
Showing 5 changed files with 38 additions and 13 deletions.
27 changes: 20 additions & 7 deletions app/transcription.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,17 @@ def __init__(
username=None,
test_mode=False,
working_dir=None,
batch_preprocessing_output=False
batch_preprocessing_output=False,
needs_review=False,
):
self.test_mode = test_mode
self.logger = get_logger()
self.tmp_dir = working_dir if working_dir is not None else tempfile.mkdtemp()

self.transcript_by = "username" if test_mode else self.__get_username()
# during testing we need to create the markdown for validation purposes
self.markdown = markdown or test_mode
self.review_flag = self.__configure_review_flag(needs_review)
self.open_pr = pr
if deepgram:
self.service = services.Deepgram(
Expand All @@ -57,10 +62,7 @@ def __init__(
self.nocleanup = nocleanup
# during testing we do not have/need a queuer backend
self.queuer = Queuer(test_mode=test_mode) if queue is True else None
# during testing we need to create the markdown for validation purposes
self.markdown = markdown or test_mode
self.existing_media = None
self.test_mode = test_mode
self.preprocessing_output = [] if batch_preprocessing_output else None

self.logger.info(f"Temp directory: {self.tmp_dir}")
Expand All @@ -71,6 +73,17 @@ def _create_subdirectory(self, subdir_name):
os.makedirs(subdir_path)
return subdir_path

def __configure_review_flag(self, needs_review):
    """Return the metadata suffix used to mark a transcript for review.

    Args:
        needs_review: True when the `--needs-review` CLI flag was passed.

    Returns:
        " --needs-review" when the flag is set, otherwise "" — this string
        is appended to the `transcript_by` metadata field.

    Raises:
        ValueError: if `needs_review` is set without markdown output enabled,
            since the flag only has meaning inside a generated markdown file.
    """
    # Sanity check: the flag is written into markdown front matter, so it
    # is meaningless unless a markdown file is being created.
    # NOTE: self.markdown must already be set before this is called (it is
    # assigned earlier in __init__).
    if needs_review and not self.markdown:
        raise ValueError(
            "The `--needs-review` flag is only applicable when creating a markdown")

    # Leading space is intentional: the suffix is concatenated directly
    # after the version string (e.g. "... via tstbtc v1.0.0 --needs-review").
    return " --needs-review" if needs_review else ""

def __get_username(self):
try:
if os.path.isfile(".username"):
Expand Down Expand Up @@ -284,7 +297,7 @@ def write_to_markdown_file(self, transcript: Transcript, output_dir):
meta_data = (
"---\n"
f'title: "{transcript.title}"\n'
f"transcript_by: {self.transcript_by} via TBTBTC v{__version__}\n"
f"transcript_by: {self.transcript_by} via tstbtc v{__version__}{self.review_flag}\n"
)
if not transcript.source.local:
meta_data += f"media: {transcript.source.media}\n"
Expand All @@ -310,7 +323,7 @@ def write_to_json_file(self, transcript: Transcript):
self.logger.info("Creating JSON file with transcription...")
output_dir = f"{self.model_output_dir}/{transcript.source.loc}"
transcript_json = transcript.to_json()
transcript_json["transcript_by"] = f"{self.transcript_by} via TBTBTC v{__version__}"
transcript_json["transcript_by"] = f"{self.transcript_by} via tstbtc v{__version__}"
json_file = utils.write_to_json(
transcript_json,
output_dir,
Expand Down Expand Up @@ -338,7 +351,7 @@ def postprocess(self, transcript: Transcript):
)
elif not self.test_mode:
transcript_json = transcript.to_json()
transcript_json["transcript_by"] = f"{self.transcript_by} via TBTBTC v{__version__}"
transcript_json["transcript_by"] = f"{self.transcript_by} via tstbtc v{__version__}"
if self.queuer:
return self.queuer.push_to_queue(transcript_json)
else:
Expand Down
2 changes: 1 addition & 1 deletion test/testAssets/payload.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"content": {
"title": "test_title",
"transcript_by": "username via TBTBTC v1.0.0",
"transcript_by": "username via tstbtc v1.0.0",
"categories": ["category1", "category2"],
"tags": [],
"speakers": ["speaker1", "speaker2"],
Expand Down
2 changes: 1 addition & 1 deletion test/test_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def check_md_file(
if x.startswith("##"):
detected_chapters.append(x[3:].strip())

assert fields["transcript_by"] == f"{transcript_by} via TBTBTC v{application.__version__}"
assert fields["transcript_by"] == f"{transcript_by} via tstbtc v{application.__version__}"

if not local:
assert fields["media"] == media
Expand Down
2 changes: 1 addition & 1 deletion test/test_video.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ def test_generate_payload():
source_file=source, loc=loc, title=title, date=date, tags=tags, category=category, speakers=speakers)
transcription.start(test_transcript=transcript)
transcript_json = transcription.transcripts[0].to_json()
transcript_json["transcript_by"] = f"{username} via TBTBTC v{__version__}"
transcript_json["transcript_by"] = f"{username} via tstbtc v{__version__}"
payload = {
"content": transcript_json
}
Expand Down
18 changes: 15 additions & 3 deletions transcriber.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,12 @@ def print_help(ctx, param, value):
default=False,
help="Do not push the resulting transcript to the Queuer backend",
)
# Reusable click option decorator, part of the new queueing pipeline:
# marks the resulting transcript as needing human review. Only valid in
# combination with markdown output (enforced in Transcription.__init__).
needs_review = click.option(
    "--needs-review",
    is_flag=True,
    default=False,
    help="Add 'needs review' flag to the resulting transcript",
)
model_output_dir = click.option(
"-o",
"--model_output_dir",
Expand Down Expand Up @@ -193,6 +199,7 @@ def print_help(ctx, param, value):
@upload_to_s3
@save_to_markdown
@noqueue
@needs_review
# Configuration options
@model_output_dir
@nocleanup
Expand All @@ -215,7 +222,8 @@ def transcribe(
model_output_dir: str,
nocleanup: bool,
noqueue: bool,
markdown: bool
markdown: bool,
needs_review: bool,
) -> None:
"""Transcribe the provided sources. Supported sources include: \n
- YouTube videos and playlists\n
Expand Down Expand Up @@ -246,6 +254,7 @@ def transcribe(
nocleanup=nocleanup,
queue=not noqueue,
markdown=markdown,
needs_review=needs_review,
working_dir=tmp_dir
)
if source.endswith(".json"):
Expand Down Expand Up @@ -348,13 +357,15 @@ def preprocess(
@upload_to_s3
@save_to_markdown
@noqueue
@needs_review
def postprocess(
metadata_json_file,
service,
pr: bool,
upload: bool,
markdown: bool,
noqueue: bool,
needs_review: bool,
):
"""Postprocess the output of a transcription service.
Requires the metadata JSON file that is the output of the previous stage
Expand All @@ -363,15 +374,16 @@ def postprocess(
try:
configure_logger(log_level=logging.INFO)
utils.check_if_valid_file_path(metadata_json_file)
logger.info(
f"Postprocessing {service} transcript from {metadata_json_file}")
transcription = Transcription(
deepgram=service == "deepgram",
pr=pr,
upload=upload,
markdown=markdown,
queue=not noqueue,
needs_review=needs_review,
)
logger.info(
f"Postprocessing {service} transcript from {metadata_json_file}")
with open(metadata_json_file, "r") as outfile:
metadata_json = json.load(outfile)
metadata = utils.configure_metadata_given_from_JSON(metadata_json)
Expand Down

0 comments on commit 5fcb9f9

Please sign in to comment.