From 73199eb9d112ca52a54f11a599e986e27bebe19b Mon Sep 17 00:00:00 2001
From: Ed Summers
Date: Mon, 28 Oct 2024 18:21:12 -0400
Subject: [PATCH] Mocked AWS

This PR mocks AWS resources in the test with moto. It also includes a
GitHub Actions workflow that runs linting and a format check with ruff
before running the tests.

Closes #12
Refs #36
---
 .github/workflows/test.yml   | 39 ++++++++++++++++
 .gitignore                   |  1 +
 README.md                    |  2 +
 pytest.ini                   |  2 +
 requirements.txt             |  1 +
 speech_to_text.py            | 26 ++++-------
 tests/test_speech_to_text.py | 91 +++++++++++++++++++++++-------------
 7 files changed, 114 insertions(+), 48 deletions(-)
 create mode 100644 .github/workflows/test.yml

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 0000000..c04508a
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,39 @@
+name: Test
+on:
+  - push
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: [3.11]
+    steps:
+
+      - name: checkout
+        uses: actions/checkout@v3
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Lint
+        uses: chartboost/ruff-action@v1
+        # Default action is 'check'
+        # it may move, see https://github.com/astral-sh/ruff/issues/8400
+
+      - name: Format
+        uses: chartboost/ruff-action@v1
+        with:
+          args: 'format --check'
+
+      - name: Install dependencies
+        run: |
+          pip install -r requirements.txt
+
+      - name: Install ffmpeg
+        run: |
+          wget -O - https://raw.githubusercontent.com/jontybrook/ffmpeg-install-script/main/install-ffmpeg-static.sh | bash -s -- --stable --force
+
+      - name: Run tests
+        run: pytest
diff --git a/.gitignore b/.gitignore
index fe88abd..dc05fae 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@
 .env
 __pycache__/
 whisper_models
+*.log
diff --git a/README.md b/README.md
index 87359c3..27b3d06 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,7 @@
 # speech-to-text
 
+[![Test](https://github.com/sul-dlss/speech-to-text/actions/workflows/test.yml/badge.svg)](https://github.com/sul-dlss/speech-to-text/actions/workflows/test.yml)
+
 This repository contains a Docker configuration for performing serverless speech-to-text processing with Whisper using an Amazon Simple Storage Service (S3) bucket for media files, and Amazon Simple Queue Service (SQS) for coordinating work.
 
 ## Build
diff --git a/pytest.ini b/pytest.ini
index a635c5c..d9e3dc9 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,2 +1,4 @@
 [pytest]
+log_level = INFO
+log_file = test.log
 pythonpath = .
diff --git a/requirements.txt b/requirements.txt
index 0801cc8..db91d28 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,4 @@ boto3
 openai-whisper
 python-dotenv
 pytest
+moto[s3,sqs]
diff --git a/speech_to_text.py b/speech_to_text.py
index f2e9546..4eb931b 100755
--- a/speech_to_text.py
+++ b/speech_to_text.py
@@ -18,13 +18,6 @@
 import whisper
 from whisper.utils import get_writer
 
-dotenv.load_dotenv()
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s :: %(levelname)s :: %(message)s",
-    datefmt="%Y-%m-%dT%H:%M:%S%z",
-)
-
 
 def main(daemon=True):
     # loop forever looking for jobs unless daemon says not to
@@ -95,7 +88,7 @@ def download_media(job):
     output_dir.mkdir()
 
     for media_file in job["media"]:
-        # note the media_file is expected to be the full path in the bucket 
+        # note the media_file is expected to be the full path in the bucket
         # e.g. pg879tb2706-v2/video_1.mp4
         bucket.download_file(media_file, media_file)
 
@@ -155,7 +148,6 @@
     job["output"] = []
     output_dir = get_output_dir(job)
     for path in output_dir.iterdir():
-        # ignore non output files
         if path.suffix not in [".vtt", ".srt", ".json", ".txt", ".tsv"]:
             continue
 
@@ -280,13 +272,7 @@ def create_job(media_path: Path, job_id: str = None, options={}):
     job_id = str(uuid.uuid4()) if job_id is None else job_id
     add_media(media_path, job_id)
 
-    job = {
-        "id": job_id,
-        "media": [
-            f"{job_id}/{media_path.name}"
-        ],
-        "options": options
-    }
+    job = {"id": job_id, "media": [f"{job_id}/{media_path.name}"], "options": options}
 
     add_job(job)
     return job_id
@@ -345,6 +331,14 @@ def load_whisper_model(model_name):
 
 
 if __name__ == "__main__":
+    dotenv.load_dotenv()
+
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s :: %(levelname)s :: %(message)s",
+        datefmt="%Y-%m-%dT%H:%M:%S%z",
+    )
+
     check_env()
 
     parser = argparse.ArgumentParser(prog="speech_to_text")
diff --git a/tests/test_speech_to_text.py b/tests/test_speech_to_text.py
index 0536a49..c4059de 100644
--- a/tests/test_speech_to_text.py
+++ b/tests/test_speech_to_text.py
@@ -1,25 +1,66 @@
+import os
+import boto3
 import json
 from pathlib import Path
 
-import dotenv
+import moto
 import pytest
 
 import speech_to_text
 
-# set AWS_PROFILE from .env in the environment
-dotenv.load_dotenv()
+BUCKET = "bucket"
+TODO_QUEUE = "todo"
+DONE_QUEUE = "done"
+
+
+@pytest.fixture(scope="function")
+def aws_credentials():
+    """Mocked AWS Credentials for moto."""
+    os.environ["AWS_ACCESS_KEY_ID"] = "testing"
+    os.environ["AWS_SECRET_ACCESS_KEY"] = "testing"
+    os.environ["AWS_SECURITY_TOKEN"] = "testing"
+    os.environ["AWS_SESSION_TOKEN"] = "testing"
+    os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
+    os.environ["SPEECH_TO_TEXT_TODO_SQS_QUEUE"] = TODO_QUEUE
+    os.environ["SPEECH_TO_TEXT_DONE_SQS_QUEUE"] = DONE_QUEUE
+    os.environ["SPEECH_TO_TEXT_S3_BUCKET"] = BUCKET
+
+
+@pytest.fixture(scope="function")
+def sts(aws_credentials):
+    with moto.mock_aws():
+        yield boto3.client("sts")
+
+
+@pytest.fixture(scope="function")
+def s3(aws_credentials):
+    with moto.mock_aws():
+        yield boto3.client("s3")
+
+
+@pytest.fixture(scope="function")
+def sqs(aws_credentials):
+    with moto.mock_aws():
+        yield boto3.resource("sqs")
+
+
+@pytest.fixture(scope="function")
+def bucket(s3):
+    return s3.create_bucket(Bucket=BUCKET)
+
+
+@pytest.fixture(scope="function")
+def queues(sqs):
+    sqs.create_queue(QueueName=TODO_QUEUE)
+    sqs.create_queue(QueueName=DONE_QUEUE)
 
 
 # ignore utcnow warning until https://github.com/boto/boto3/issues/3889 is resolved
 @pytest.mark.filterwarnings("ignore:datetime.datetime.utcnow")
-def test_speech_to_text():
-    clean()
-
-    job_id = speech_to_text.create_job(Path("tests/data/en.wav"), options={
-        "model": "small",
-        "writer": {
-            "max_line_width": 90
-        }
-    })
+def test_speech_to_text(bucket, queues):
+    job_id = speech_to_text.create_job(
+        Path("tests/data/en.wav"),
+        options={"model": "small", "writer": {"max_line_width": 42}},
+    )
 
     speech_to_text.main(daemon=False)
@@ -41,7 +82,12 @@
     assert f"{job_id}/output/en.json" in job["output"]
 
     # check that max_line_width took effect on the writer options that were used
-    assert job["extraction_technical_metadata"]["effective_writer_options"]["max_line_width"] == 90
+    assert (
+        job["extraction_technical_metadata"]["effective_writer_options"][
+            "max_line_width"
+        ]
+        == 42
+    )
 
     # make sure there's a message in the "done" queue
     queue = speech_to_text.get_done_queue()
@@ -62,22 +108,3 @@
 
     jobs = queue.receive_messages(MaxNumberOfMessages=1)
     assert len(jobs) == 0, "queue empty"
-
-
-def clean():
-    """
-    Ensure that the bucket and queues are empty.
-    """
-    todo = speech_to_text.get_todo_queue()
-    while messages := todo.receive_messages():
-        for m in messages:
-            m.delete()
-
-    done = speech_to_text.get_done_queue()
-    while messages := done.receive_messages():
-        for m in messages:
-            m.delete()
-
-    bucket = speech_to_text.get_bucket()
-    for obj in bucket.objects.all():
-        obj.delete()
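
A rough sketch of running the same checks locally that the workflow above runs in
CI. This simply mirrors the workflow steps and assumes ruff and an ffmpeg binary
are already installed and on PATH; it is not a documented project convention.

    ruff check .              # lint; 'check' is the default action of chartboost/ruff-action
    ruff format --check .     # formatting check, matching the 'format --check' args above
    pip install -r requirements.txt
    pytest                    # the moto fixtures set fake AWS credentials, so no
                              # real S3 bucket or SQS queues are needed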