From 2764a7fc273680e2ee45fa8e9fa17d7b59913574 Mon Sep 17 00:00:00 2001 From: kouloumos Date: Wed, 6 Dec 2023 23:09:19 +0200 Subject: [PATCH] extract transcription services to distinct modules `Transcription` now initializes one of the available services for transcription based on user's choice: `Whisper` or `Deepgram` --- Readme.md | 10 +- app/application.py | 253 --------------------------------------- app/services/__init__.py | 2 + app/services/deepgram.py | 214 +++++++++++++++++++++++++++++++++ app/services/whisper.py | 113 +++++++++++++++++ app/transcript.py | 111 +---------------- app/transcription.py | 35 +++--- app/utils.py | 17 ++- test/test_video.py | 1 - transcriber.py | 31 +---- 10 files changed, 376 insertions(+), 411 deletions(-) create mode 100644 app/services/__init__.py create mode 100644 app/services/deepgram.py create mode 100644 app/services/whisper.py diff --git a/Readme.md b/Readme.md index b2ee34c..46813c3 100644 --- a/Readme.md +++ b/Readme.md @@ -15,7 +15,15 @@ This transcription tool operates through a structured four-stage process: 1. Preprocess: Gathers all the available metadata for each source (supports YouTube videos&playlists, and RSS feeds) 2. Process: Downloads and converts sources for transcription preparation -3. Transcription: Utilizes [`openai-whisper`](https://github.com/openai/whisper) or [Deepgram](https://deepgram.com/) to generate transcripts from MP3 files. +3. Transcription: Utilizes [`openai-whisper`](https://github.com/openai/whisper) or [Deepgram](https://deepgram.com/) to generate transcripts. + 1. Converts audio to text. + - Preserves raw whisper transcript in SRT + - Preserves raw deepgram output in JSON + 2. Summarize: Generates a summary of the transcript. [only available with deepgram] + 3. Upload: Saves raw transcript files in an AWS S3 Bucket [optional] + 4. Constructs the resulting transcript. + - Process diarization. [deepgram only] + - Process chapters. 4. 
Postprocess: Offers multiple options for further actions: - **Pull Request**: Opens a PR on the [bitcointranscripts](https://github.com/bitcointranscripts/bitcointranscripts) repo for the resulting transcript. - **Markdown**: Saves transcripts in a markdown format supported by bitcointranscripts. diff --git a/app/application.py b/app/application.py index 9a05649..a8fe2b3 100644 --- a/app/application.py +++ b/app/application.py @@ -1,28 +1,12 @@ """This module provides the transcript cli.""" import errno -import json import logging -import mimetypes import os -import re import shutil import subprocess -import tempfile -import time -from datetime import datetime -from urllib.parse import parse_qs, urlparse import boto3 -import pytube -import requests -import static_ffmpeg -import whisper -import yt_dlp -from clint.textui import progress -from deepgram import Deepgram from dotenv import dotenv_values -from moviepy.editor import VideoFileClip -from pytube.exceptions import PytubeError from app import __app_name__, __version__ from app.logging import get_logger @@ -44,184 +28,6 @@ def convert_wav_to_mp3(abs_path, filename, working_dir="tmp/"): return os.path.abspath(os.path.join(working_dir, filename[:-4] + ".mp3")) -def decimal_to_sexagesimal(dec): - sec = int(dec % 60) - minu = int((dec // 60) % 60) - hrs = int((dec // 60) // 60) - - return f"{hrs:02d}:{minu:02d}:{sec:02d}" - - -def combine_chapter(chapters, transcript, working_dir="tmp/"): - logger = logging.getLogger(__app_name__) - try: - chapters_pointer = 0 - transcript_pointer = 0 - result = "" - # chapters index, start time, name - # transcript start time, end time, text - - while chapters_pointer < len(chapters) and transcript_pointer < len( - transcript - ): - if ( - chapters[chapters_pointer][1] - <= transcript[transcript_pointer][0] - ): - result = ( - result + "\n\n## " + chapters[chapters_pointer][2] + "\n\n" - ) - chapters_pointer += 1 - else: - result = result + transcript[transcript_pointer][2] - 
transcript_pointer += 1 - - while transcript_pointer < len(transcript): - result = result + transcript[transcript_pointer][2] - transcript_pointer += 1 - - return result - except Exception as e: - logger.error("Error combining chapters") - logger.error(e) - - -def combine_deepgram_chapters_with_diarization(deepgram_data, chapters): - logger.info("(deepgram) Combining transcript with detected chapters...") - try: - para = "" - string = "" - curr_speaker = None - words = deepgram_data["results"]["channels"][0]["alternatives"][0][ - "words" - ] - words_pointer = 0 - chapters_pointer = 0 - while chapters_pointer < len(chapters) and words_pointer < len(words): - if chapters[chapters_pointer][1] <= words[words_pointer]["start"]: - if para != "": - para = para.strip(" ") - string = string + para + "\n\n" - para = "" - string = string + f"## {chapters[chapters_pointer][2]}\n\n" - chapters_pointer += 1 - else: - if words[words_pointer]["speaker"] != curr_speaker: - if para != "": - para = para.strip(" ") - string = string + para + "\n\n" - para = "" - string = ( - string - + f'Speaker {words[words_pointer]["speaker"]}: ' - + decimal_to_sexagesimal(words[words_pointer]["start"]) - ) - curr_speaker = words[words_pointer]["speaker"] - string = string + "\n\n" - - para = para + " " + words[words_pointer]["punctuated_word"] - words_pointer += 1 - while words_pointer < len(words): - if words[words_pointer]["speaker"] != curr_speaker: - if para != "": - para = para.strip(" ") - string = string + para + "\n\n" - para = "" - string = ( - string + f'Speaker {words[words_pointer]["speaker"]}:' - f' {decimal_to_sexagesimal(words[words_pointer]["start"])}' - ) - curr_speaker = words[words_pointer]["speaker"] - string = string + "\n\n" - - para = para + " " + words[words_pointer]["punctuated_word"] - words_pointer += 1 - para = para.strip(" ") - string = string + para - return string - except Exception as e: - logger.error("Error combining deepgram chapters") - logger.error(e) - - -def 
get_deepgram_transcript(deepgram_data, diarize): - logger = logging.getLogger(__app_name__) - try: - if diarize: - logger.info(f"(deepgram) Processing diarization...") - para = "" - string = "" - curr_speaker = None - for word in deepgram_data["results"]["channels"][0]["alternatives"][0][ - "words" - ]: - if word["speaker"] != curr_speaker: - if para != "": - para = para.strip(" ") - string = string + para + "\n\n" - para = "" - string = ( - string + f'Speaker {word["speaker"]}: ' - f'{decimal_to_sexagesimal(word["start"])}' - ) - curr_speaker = word["speaker"] - string = string + "\n\n" - - para = para + " " + word["punctuated_word"] - para = para.strip(" ") - string = string + para - return string - else: - return deepgram_data["results"]["channels"][0]["alternatives"][0][ - "transcript" - ] - except Exception as e: - raise Exception(f"Error while getting deepgram transcript: {e}") - - -def get_deepgram_summary(deepgram_data): - logger = logging.getLogger(__app_name__) - try: - summaries = deepgram_data["results"]["channels"][0]["alternatives"][0][ - "summaries" - ] - summary = "" - for x in summaries: - summary = summary + " " + x["summary"] - return summary.strip(" ") - except Exception as e: - logger.error("Error getting summary") - logger.error(e) - - -def process_mp3_deepgram(filename, summarize, diarize): - """using deepgram""" - logger = logging.getLogger(__app_name__) - logger.info("Transcribing audio to text using deepgram...") - try: - config = dotenv_values(".env") - dg_client = Deepgram(config["DEEPGRAM_API_KEY"]) - - with open(filename, "rb") as audio: - mimeType = mimetypes.MimeTypes().guess_type(filename)[0] - source = {"buffer": audio, "mimetype": mimeType} - response = dg_client.transcription.sync_prerecorded( - source, - { - "punctuate": True, - "speaker_labels": True, - "diarize": diarize, - "smart_formatting": True, - "summarize": summarize, - "model": "whisper-large", - }, - ) - audio.close() - return response - except Exception as e: - raise 
Exception(f"(deepgram) Error transcribing audio to text: {e}") - - def create_pr(absolute_path, loc, username, curr_time, title): logger = logging.getLogger(__app_name__) branch_name = loc.replace("/", "-") @@ -242,40 +48,6 @@ def create_pr(absolute_path, loc, username, curr_time, title): logger.info("Please check the PR for the transcription.") -def combine_deepgram_with_chapters(deepgram_data, chapters): - logger.info("(deepgram) Combining transcript with detected chapters...") - try: - chapters_pointer = 0 - words_pointer = 0 - result = "" - words = deepgram_data["results"]["channels"][0]["alternatives"][0][ - "words" - ] - # chapters index, start time, name - # transcript start time, end time, text - while chapters_pointer < len(chapters) and words_pointer < len(words): - if chapters[chapters_pointer][1] <= words[words_pointer]["end"]: - result = ( - result + "\n\n## " + chapters[chapters_pointer][2] + "\n\n" - ) - chapters_pointer += 1 - else: - result = result + words[words_pointer]["punctuated_word"] + " " - words_pointer += 1 - - # Append the final chapter heading and remaining content - while chapters_pointer < len(chapters): - result = result + "\n\n## " + chapters[chapters_pointer][2] + "\n\n" - chapters_pointer += 1 - while words_pointer < len(words): - result = result + words[words_pointer]["punctuated_word"] + " " - words_pointer += 1 - - return result - except Exception as e: - raise Exception(f"Error combining deepgram with chapters: {e}") - - def clean_up(tmp_dir): try: shutil.rmtree(tmp_dir) @@ -284,31 +56,6 @@ def clean_up(tmp_dir): raise -def generate_srt(data, filename, model_output_dir): - time_in_str = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") - if not os.path.isdir(model_output_dir): - os.makedirs(model_output_dir) - output_file = os.path.join( - model_output_dir, filename + "_" + time_in_str + ".srt" - ) - logger.info(f"Writing srt to {output_file}...") - with open(output_file, "w") as f: - for index, segment in enumerate(data): - 
start_time, end_time, text = segment - f.write(f"{index+1}\n") - f.write(f"{format_time(start_time)} --> {format_time(end_time)}\n") - f.write(f"{text.strip()}\n\n") - logger.info("File saved") - return output_file - - -def format_time(time): - hours = int(time / 3600) - minutes = int((time % 3600) / 60) - seconds = int(time % 60) - milliseconds = int((time % 1) * 1000) - return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}" - def upload_file_to_s3(file_path): logger = logging.getLogger(__app_name__) diff --git a/app/services/__init__.py b/app/services/__init__.py new file mode 100644 index 0000000..358cca7 --- /dev/null +++ b/app/services/__init__.py @@ -0,0 +1,2 @@ +from .whisper import Whisper +from .deepgram import Deepgram \ No newline at end of file diff --git a/app/services/deepgram.py b/app/services/deepgram.py new file mode 100644 index 0000000..4060e1d --- /dev/null +++ b/app/services/deepgram.py @@ -0,0 +1,214 @@ +import mimetypes + +import deepgram +from dotenv import dotenv_values + +from app import ( + application, + utils +) +from app.logging import get_logger +from app.transcript import Transcript + +logger = get_logger() + + +class Deepgram: + def __init__(self, summarize, diarize, upload, output_dir): + self.summarize = summarize + self.diarize = diarize + self.upload = upload + self.output_dir = output_dir + + def audio_to_text(self, audio_file): + logger.info("Transcribing audio to text using deepgram...") + try: + config = dotenv_values(".env") + dg_client = deepgram.Deepgram(config["DEEPGRAM_API_KEY"]) + + with open(audio_file, "rb") as audio: + mimeType = mimetypes.MimeTypes().guess_type(audio_file)[0] + source = {"buffer": audio, "mimetype": mimeType} + response = dg_client.transcription.sync_prerecorded( + source, + { + "punctuate": True, + "speaker_labels": True, + "diarize": self.diarize, + "smart_formatting": True, + "summarize": self.summarize, + "model": "whisper-large", + }, + ) + audio.close() + return response + 
except Exception as e: + raise Exception(f"(deepgram) Error transcribing audio to text: {e}") + + def process_with_diarization_and_chapters(self, raw_transcript, chapters): + logger.info( + "(deepgram) Processing diarization with detected chapters...") + try: + para = "" + string = "" + curr_speaker = None + words = raw_transcript["results"]["channels"][0]["alternatives"][0][ + "words" + ] + words_pointer = 0 + chapters_pointer = 0 + while chapters_pointer < len(chapters) and words_pointer < len(words): + if chapters[chapters_pointer][1] <= words[words_pointer]["start"]: + if para != "": + para = para.strip(" ") + string = string + para + "\n\n" + para = "" + string = string + f"## {chapters[chapters_pointer][2]}\n\n" + chapters_pointer += 1 + else: + if words[words_pointer]["speaker"] != curr_speaker: + if para != "": + para = para.strip(" ") + string = string + para + "\n\n" + para = "" + string = ( + string + + f'Speaker {words[words_pointer]["speaker"]}: ' + + utils.decimal_to_sexagesimal(words[words_pointer]["start"]) + ) + curr_speaker = words[words_pointer]["speaker"] + string = string + "\n\n" + + para = para + " " + words[words_pointer]["punctuated_word"] + words_pointer += 1 + while words_pointer < len(words): + if words[words_pointer]["speaker"] != curr_speaker: + if para != "": + para = para.strip(" ") + string = string + para + "\n\n" + para = "" + string = ( + string + f'Speaker {words[words_pointer]["speaker"]}:' + f' {utils.decimal_to_sexagesimal(words[words_pointer]["start"])}' + ) + curr_speaker = words[words_pointer]["speaker"] + string = string + "\n\n" + + para = para + " " + words[words_pointer]["punctuated_word"] + words_pointer += 1 + para = para.strip(" ") + string = string + para + return string + except Exception as e: + raise Exception(f"Error combining deepgram chapters: {e}") + + def process_with_diarization(self, raw_transcript): + logger.info(f"(deepgram) Processing diarization...") + para = "" + string = "" + curr_speaker = None + 
for word in raw_transcript["results"]["channels"][0]["alternatives"][0][ + "words" + ]: + if word["speaker"] != curr_speaker: + if para != "": + para = para.strip(" ") + string = string + para + "\n\n" + para = "" + string = ( + string + f'Speaker {word["speaker"]}: ' + f'{utils.decimal_to_sexagesimal(word["start"])}' + ) + curr_speaker = word["speaker"] + string = string + "\n\n" + + para = para + " " + word["punctuated_word"] + para = para.strip(" ") + string = string + para + return string + + def process_with_chapters(self, raw_transcript, chapters): + logger.info("(deepgram) Combining transcript with detected chapters...") + try: + chapters_pointer = 0 + words_pointer = 0 + result = "" + words = raw_transcript["results"]["channels"][0]["alternatives"][0][ + "words" + ] + # chapters index, start time, name + # transcript start time, end time, text + while chapters_pointer < len(chapters) and words_pointer < len(words): + if chapters[chapters_pointer][1] <= words[words_pointer]["end"]: + result = ( + result + "\n\n## " + + chapters[chapters_pointer][2] + "\n\n" + ) + chapters_pointer += 1 + else: + result = result + \ + words[words_pointer]["punctuated_word"] + " " + words_pointer += 1 + + # Append the final chapter heading and remaining content + while chapters_pointer < len(chapters): + result = result + "\n\n## " + \ + chapters[chapters_pointer][2] + "\n\n" + chapters_pointer += 1 + while words_pointer < len(words): + result = result + words[words_pointer]["punctuated_word"] + " " + words_pointer += 1 + + return result + except Exception as e: + raise Exception(f"Error combining deepgram with chapters: {e}") + + def process_summary(self, raw_transcript): + try: + summaries = raw_transcript["results"]["channels"][0]["alternatives"][0][ + "summaries" + ] + summary = "" + for x in summaries: + summary = summary + " " + x["summary"] + return summary.strip(" ") + except Exception as e: + logger.error(f"Error getting summary: {e}") + + def 
construct_transcript(self, raw_transcript, chapters): + if len(chapters) > 0: + # With chapters + if self.diarize: + # With diarization + return self.process_with_diarization_and_chapters(raw_transcript, chapters) + else: + # Without diarization + return self.process_with_chapters(raw_transcript, chapters) + else: + # Without chapters + if self.diarize: + # With diarization + return self.process_with_diarization(raw_transcript) + else: + # Without diarization + return raw_transcript["results"]["channels"][0]["alternatives"][0]["transcript"] + + return result + + def transcribe(self, transcript: Transcript): + try: + raw_transcript = self.audio_to_text(transcript.audio_file) + raw_transcript_file = utils.write_to_json( + raw_transcript, f"{self.output_dir}/{transcript.source.loc}", transcript.title, is_metadata=True) + logger.info( + f"(deepgram) Model stored at: {raw_transcript_file}") + if self.upload: + application.upload_file_to_s3(raw_transcript_file) + if self.summarize: + transcript.summary = self.process_summary(raw_transcript) + transcript.result = self.construct_transcript( + raw_transcript, transcript.source.chapters) + + return transcript + except Exception as e: + raise Exception(f"(deepgram) Error while transcribing: {e}") diff --git a/app/services/whisper.py b/app/services/whisper.py new file mode 100644 index 0000000..358523b --- /dev/null +++ b/app/services/whisper.py @@ -0,0 +1,113 @@ +import whisper + +from app import ( + application, + utils +) +from app.logging import get_logger +from app.transcript import Transcript + +logger = get_logger() + + +class Whisper: + def __init__(self, model, upload, output_dir): + self.model = model + self.upload = upload + self.output_dir = output_dir + + def audio_to_text(self, audio_file): + logger.info( + f"Transcribing audio to text using whisper ({self.model}) ...") + try: + my_model = whisper.load_model(self.model) + result = my_model.transcribe(audio_file) + data = [] + for x in result["segments"]: + 
data.append(tuple((x["start"], x["end"], x["text"]))) + return data + except Exception as e: + logger.error( + f"(wisper,{service}) Error transcribing audio to text: {e}") + return + + def generate_srt(self, data, filename, loc): + def format_time(time): + hours = int(time / 3600) + minutes = int((time % 3600) / 60) + seconds = int(time % 60) + milliseconds = int((time % 1) * 1000) + return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}" + + output_file = f"{utils.configure_output_file_path(f'{self.output_dir}/{loc}', filename, is_metadata=True)}.srt" + logger.info(f"(whisper) Writing srt to {output_file}...") + with open(output_file, "w") as f: + for index, segment in enumerate(data): + start_time, end_time, text = segment + f.write(f"{index+1}\n") + f.write( + f"{format_time(start_time)} --> {format_time(end_time)}\n") + f.write(f"{text.strip()}\n\n") + return output_file + + def process_with_chapters(self, raw_transcript, chapters): + try: + chapters_pointer = 0 + transcript_pointer = 0 + result = "" + # chapters index, start time, name + # transcript start time, end time, text + + while chapters_pointer < len(chapters) and transcript_pointer < len( + raw_transcript + ): + if ( + chapters[chapters_pointer][1] + <= raw_transcript[transcript_pointer][0] + ): + result = ( + result + "\n\n## " + + chapters[chapters_pointer][2] + "\n\n" + ) + chapters_pointer += 1 + else: + result = result + raw_transcript[transcript_pointer][2] + transcript_pointer += 1 + + while transcript_pointer < len(raw_transcript): + result = result + raw_transcript[transcript_pointer][2] + transcript_pointer += 1 + + return result + except Exception as e: + logger.error("Error combining chapters") + logger.error(e) + + def process_default(self): + result = "" + for x in self.result: + result = result + x[2] + " " + + return result + + def construct_transcript(self, raw_transcript, chapters): + if len(chapters) > 0: + # Source has chapters, add them to transcript + return 
self.process_with_chapters(raw_transcript, chapters) + else: + return self.process_default(raw_transcript) + + def transcribe(self, transcript: Transcript): + try: + raw_transcript = self.audio_to_text(transcript.audio_file) + raw_transcript_file = self.generate_srt( + raw_transcript, transcript.title, transcript.source.loc) + if self.upload: + application.upload_file_to_s3(raw_transcript_file) + + transcript.result = construct_transcript( + raw_transcript, transcript.source.chapters) + + return transcript + except Exception as e: + raise Exception(f"(whisper) Error while transcribing: {e}") diff --git a/app/transcript.py b/app/transcript.py index f27bea9..6535efe 100644 --- a/app/transcript.py +++ b/app/transcript.py @@ -26,16 +26,10 @@ class Transcript: def __init__(self, source, test_mode=False): self.source = source + self.summary = None self.test_mode = test_mode self.logger = get_logger() - def create_transcript(self): - result = "" - for x in self.result: - result = result + x[2] + " " - - return result - def process_source(self, tmp_dir=None): tmp_dir = tmp_dir if tmp_dir is not None else tempfile.mkdtemp() self.audio_file = self.source.process(tmp_dir) @@ -43,109 +37,6 @@ def process_source(self, tmp_dir=None): self.audio_file)[:-4] return self.audio_file, tmp_dir - def transcribe(self, working_dir, generate_chapters, summarize_transcript, service, diarize, upload, model_output_dir, test_transcript=None): - - def process_mp3(): - """using whisper""" - self.logger.info("Transcribing audio to text using whisper ...") - try: - my_model = whisper.load_model(service) - result = my_model.transcribe(self.audio_file) - data = [] - for x in result["segments"]: - data.append(tuple((x["start"], x["end"], x["text"]))) - data_path = application.generate_srt( - data, self.title, model_output_dir) - if upload: - application.upload_file_to_s3(data_path) - return data - except Exception as e: - self.logger.error( - f"(wisper,{service}) Error transcribing audio to text: 
{e}") - return - - def write_chapters_file(): - """Write out the chapter file based on simple MP4 format (OGM)""" - try: - if generate_chapters and len(self.source.chapters) > 0: - self.logger.info("Chapters detected") - chapters_file = os.path.join(working_dir, os.path.basename( - self.audio_file)[:-4] + ".chapters") - - with open(chapters_file, "w") as fo: - for current_chapter in self.source.chapters: - fo.write( - f"CHAPTER{current_chapter[0]}=" - f"{current_chapter[1]}\n" - f"CHAPTER{current_chapter[0]}NAME=" - f"{current_chapter[2]}\n" - ) - fo.close() - return True - else: - return False - except Exception as e: - raise Exception(f"Error writing chapters file: {e}") - - try: - self.summary = None - if self.test_mode: - self.result = test_transcript if test_transcript is not None else "test-mode" - return self.result - if not self.audio_file: - # TODO give audio file path as argument - raise Exception( - "audio file is missing, you need to process_source() first") - - has_chapters = len(self.source.chapters) > 0 - self.result = None - if service == "deepgram" or summarize_transcript: - # process mp3 using deepgram - deepgram_resp = application.process_mp3_deepgram( - self.audio_file, summarize_transcript, diarize) - # store deepgram output - deepgram_output_file_path = write_to_json( - deepgram_resp, model_output_dir, self.title, is_metadata=True) - self.logger.info( - f"(deepgram) Model stored at: {deepgram_output_file_path}") - if upload: - application.upload_file_to_s3(deepgram_output_file_path) - self.result = application.get_deepgram_transcript( - deepgram_resp, diarize) - - if summarize_transcript: - self.summary = application.get_deepgram_summary( - deepgram_resp) - - if service == "deepgram" and has_chapters: - if diarize: - self.result = application.combine_deepgram_chapters_with_diarization( - deepgram_data=deepgram_resp, chapters=self.source.chapters - ) - else: - self.result = application.combine_deepgram_with_chapters( - 
deepgram_data=deepgram_resp, chapters=self.source.chapters - ) - - if not service == "deepgram": - # whisper - self.result = process_mp3() - if has_chapters: - # this is only available for videos, for now - self.result = application.combine_chapter( - chapters=self.source.chapters, - transcript=self.result, - working_dir=working_dir - ) - else: - # finalize transcript - self.result = self.create_transcript() - - return self.result - - except Exception as e: - raise Exception(f"Error while transcribing audio source: {e}") - def write_to_file(self, working_dir, transcript_by): """Writes transcript to a markdown file and returns its absolute path This file is submitted as part of the Pull Request to the diff --git a/app/transcription.py b/app/transcription.py index adf68d3..4d01b92 100644 --- a/app/transcription.py +++ b/app/transcription.py @@ -13,7 +13,12 @@ import yt_dlp from app.transcript import Transcript, Source, Audio, Video, Playlist, RSS -from app import __app_name__, __version__, application +from app import ( + __app_name__, + __version__, + application, + services +) from app.utils import ( check_if_valid_file_path, check_if_valid_json, @@ -29,7 +34,6 @@ class Transcription: def __init__( self, model="tiny", - chapters=False, pr=False, summarize=False, deepgram=False, @@ -47,14 +51,13 @@ def __init__( self.logger = get_logger() self.tmp_dir = working_dir if working_dir is not None else tempfile.mkdtemp() - self.model = model self.transcript_by = "username" if test_mode else self.__get_username() - self.generate_chapters = chapters self.open_pr = pr - self.summarize_transcript = summarize - self.service = "deepgram" if deepgram else model - self.diarize = diarize - self.upload = upload + if deepgram: + self.service = services.Deepgram( + summarize, diarize, upload, model_output_dir) + else: + self.service = services.Whisper(model, upload, model_output_dir) self.model_output_dir = model_output_dir self.transcripts = [] self.nocleanup = nocleanup @@ -248,16 
+251,10 @@ def start(self, test_transcript=None): transcript.tmp_dir = self._create_subdirectory( f"transcript{len(self.result) + 1}") transcript.process_source(transcript.tmp_dir) - result = transcript.transcribe( - transcript.tmp_dir, - self.generate_chapters, - self.summarize_transcript, - self.service, - self.diarize, - self.upload, - output_dir, - test_transcript=test_transcript - ) + if self.test_mode: + transcript.result = test_transcript if test_transcript is not None else "test-mode" + else: + transcript = self.service.transcribe(transcript) postprocessed_transcript = self.postprocess(transcript) self.result.append(postprocessed_transcript) @@ -299,7 +296,7 @@ def postprocess(self, transcript: Transcript): result = payload_json_file return result except Exception as e: - raise Exception(f"Error with postprocessing: {e}") from e + raise Exception(f"Error with postprocessing: {e}") from e def clean_up(self): self.logger.info("Cleaning up...") diff --git a/app/utils.py b/app/utils.py index 49963a9..dcda1c5 100644 --- a/app/utils.py +++ b/app/utils.py @@ -14,7 +14,7 @@ def slugify(text): return re.sub(r'\W+', '-', text).strip('-').lower() -def write_to_json(json_data, output_dir, filename, add_timestamp=True, is_metadata=False): +def configure_output_file_path(output_dir, filename, add_timestamp=True, is_metadata=False): if is_metadata: # subdirectory for metadata output_dir = os.path.join(output_dir, "metadata") @@ -22,13 +22,26 @@ def write_to_json(json_data, output_dir, filename, add_timestamp=True, is_metada os.makedirs(output_dir) time_in_str = f'_{datetime.now().strftime("%Y-%m-%d-%H-%M-%S")}' if add_timestamp else "" file_path = os.path.join( - output_dir, f"{slugify(filename)}{time_in_str}.json" + output_dir, f"{slugify(filename)}{time_in_str}" ) + return file_path + + +def write_to_json(json_data, output_dir, filename, add_timestamp=True, is_metadata=False): + file_path = f"{configure_output_file_path(output_dir, filename, add_timestamp, 
is_metadata)}.json" with open(file_path, "w") as json_file: json.dump(json_data, json_file, indent=4) return file_path +def decimal_to_sexagesimal(dec): + sec = int(dec % 60) + minu = int((dec // 60) % 60) + hrs = int((dec // 60) // 60) + + return f"{hrs:02d}:{minu:02d}:{sec:02d}" + + def check_if_valid_json(file_path): try: with open(file_path) as file: diff --git a/test/test_video.py b/test/test_video.py index dcb55fc..5d11a4e 100644 --- a/test/test_video.py +++ b/test/test_video.py @@ -172,7 +172,6 @@ def test_video_with_chapters(): transcription = Transcription( username=username, - chapters=True, test_mode=True, ) transcription.add_transcription_source( diff --git a/transcriber.py b/transcriber.py index 19fa501..0afaa60 100644 --- a/transcriber.py +++ b/transcriber.py @@ -84,13 +84,6 @@ def print_help(ctx, param, value): default=False, help="Summarize the transcript [only available with deepgram]", ) -use_youtube_chapters = click.option( - "-C", - "--chapters", - is_flag=True, - default=False, - help="For YouTube videos, include the YouTube chapters and timestamps in the resulting transcript.", -) open_pr = click.option( "-p", "--PR", @@ -191,7 +184,6 @@ def print_help(ctx, param, value): # Options for configuring the transcription process @diarize @summarize -@use_youtube_chapters @open_pr @upload_to_s3 @save_to_markdown @@ -208,7 +200,6 @@ def transcribe( tags: list, speakers: list, category: list, - chapters: bool, pr: bool, deepgram: bool, summarize: bool, @@ -240,7 +231,6 @@ def transcribe( try: transcription = Transcription( model=model, - chapters=chapters, pr=pr, summarize=summarize, deepgram=deepgram, @@ -332,8 +322,7 @@ def preprocess( write_to_json([preprocessed_source for preprocessed_source in transcription.preprocessing_output], transcription.model_output_dir, "preprocessed_sources") except Exception as e: - logger.error(e) - logger.info(f"Exited with error") + logger.info(f"Exited with error: {e}") @cli.command() @@ -352,7 +341,8 @@ def 
postprocess_deepgram_transcript( check_if_valid_file_path(deepgram_json_file) check_if_valid_file_path(preprocess_json_file) logger.info(f"Processing deepgram output from {deepgram_json_file}") - transcription = Transcription(queue=False) + transcription = Transcription( + deepgram=True, queue=False, diarize=diarize) with open(deepgram_json_file, "r") as outfile: deepgram_output = json.load(outfile) outfile.close() @@ -373,20 +363,11 @@ def postprocess_deepgram_transcript( link=metadata["media"], preprocess=False ) - # Postprocess deepgram transcript - has_chapters = len(metadata["chapters"]) > 0 + # Process raw deepgram transcript transcript_from_deepgram = transcription.transcripts[0] transcript_from_deepgram.title = metadata["title"] - transcript_from_deepgram.result = application.get_deepgram_transcript( - deepgram_output, diarize) - if has_chapters: - if diarize: - transcript_from_deepgram.result = application.combine_deepgram_chapters_with_diarization( - deepgram_data=deepgram_output, chapters=metadata["chapters"]) - else: - transcript_from_deepgram.result = application.combine_deepgram_with_chapters( - deepgram_data=deepgram_output, chapters=metadata["chapters"]) - + transcript_from_deepgram.result = transcription.service.construct_transcript( + deepgram_output, metadata["chapters"]) transcription.postprocess(transcript_from_deepgram) except Exception as e: logger.error(e)