Mocked AWS
This PR mocks AWS resources in the tests with moto. It also includes a GitHub Actions
configuration that runs linting and a format check with ruff before running the tests.

Closes #12
Refs #36
edsu committed Oct 29, 2024
1 parent 864a7d8 commit a75dd3a
Showing 7 changed files with 116 additions and 48 deletions.
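
For orientation before the diff: the tests below wrap their boto3 clients in moto's `mock_aws()` context manager, so S3/SQS/STS calls hit moto's in-memory backends rather than real AWS. A minimal, standalone sketch of that pattern (illustrative only, not part of this commit):

```python
import boto3
import moto

# Inside mock_aws(), boto3 calls are intercepted by moto's in-memory backends,
# so no real AWS credentials or resources are needed.
with moto.mock_aws():
    s3 = boto3.client("s3", region_name="us-east-1")
    s3.create_bucket(Bucket="bucket")
    names = [b["Name"] for b in s3.list_buckets()["Buckets"]]
    assert names == ["bucket"]
```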
39 changes: 39 additions & 0 deletions .github/workflows/test.yml
@@ -0,0 +1,39 @@
name: Test
on:
- push
jobs:
build:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.11]
steps:

- name: checkout
uses: actions/checkout@v3

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}

- name: Lint
uses: chartboost/ruff-action@v1
# Default action is 'check'
# it may move, see https://github.com/astral-sh/ruff/issues/8400

- name: Format
uses: chartboost/ruff-action@v1
with:
args: 'format --check'

- name: Install ffmpeg
run: |
wget -O - https://raw.githubusercontent.com/jontybrook/ffmpeg-install-script/main/install-ffmpeg-static.sh | bash -s -- --stable --force
- name: Install Python dependencies
run: |
pip install -r requirements.txt
- name: Run tests
run: pytest
1 change: 1 addition & 0 deletions .gitignore
@@ -2,3 +2,4 @@
.env
__pycache__/
whisper_models
*.log
4 changes: 4 additions & 0 deletions README.md
@@ -1,5 +1,7 @@
# speech-to-text

[![Test](https://github.com/sul-dlss/speech-to-text/actions/workflows/test.yml/badge.svg)](https://github.com/sul-dlss/speech-to-text/actions/workflows/test.yml)

This repository contains a Docker configuration for performing serverless speech-to-text processing with Whisper, using an Amazon Simple Storage Service (S3) bucket for media files and Amazon Simple Queue Service (SQS) for coordinating work.

## Build
@@ -153,3 +155,5 @@ source .venv/bin/activate
pip install -r requirements.txt
pytest
```

Note: the tests use the [moto](https://docs.getmoto.org/en/latest/) library to mock out AWS resources. If you want to test against live AWS, you can follow the steps above to create a job, run it, and then receive the done message.
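
For reference, testing against live AWS amounts to roughly the sketch below, assuming your environment is configured for real S3 and SQS resources as described above (illustrative only; the option keys mirror the ones used in the test):

```python
from pathlib import Path

import speech_to_text

# Create a job for a local media file and queue it for processing.
job_id = speech_to_text.create_job(
    Path("tests/data/en.wav"),
    options={"model": "small"},
)

# Process pending jobs once instead of looping forever as a daemon.
speech_to_text.main(daemon=False)

# Receive (and delete) the resulting message from the "done" queue.
queue = speech_to_text.get_done_queue()
for message in queue.receive_messages(MaxNumberOfMessages=1):
    print(message.body)
    message.delete()
```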
2 changes: 2 additions & 0 deletions pytest.ini
@@ -1,2 +1,4 @@
[pytest]
log_level = INFO
log_file = test.log
pythonpath = .
1 change: 1 addition & 0 deletions requirements.txt
@@ -2,3 +2,4 @@ boto3
openai-whisper
python-dotenv
pytest
moto[s3,sqs,sts]
26 changes: 10 additions & 16 deletions speech_to_text.py
@@ -18,13 +18,6 @@
import whisper
from whisper.utils import get_writer

dotenv.load_dotenv()
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s :: %(levelname)s :: %(message)s",
datefmt="%Y-%m-%dT%H:%M:%S%z",
)


def main(daemon=True):
# loop forever looking for jobs unless daemon says not to
@@ -95,7 +88,7 @@ def download_media(job):
output_dir.mkdir()

for media_file in job["media"]:
# note the media_file is expected to be the full path in the bucket
# note the media_file is expected to be the full path in the bucket
# e.g. pg879tb2706-v2/video_1.mp4
bucket.download_file(media_file, media_file)

@@ -155,7 +148,6 @@ def upload_results(job):
job["output"] = []
output_dir = get_output_dir(job)
for path in output_dir.iterdir():

# ignore non output files
if path.suffix not in [".vtt", ".srt", ".json", ".txt", ".tsv"]:
continue
@@ -280,13 +272,7 @@ def create_job(media_path: Path, job_id: str = None, options={}):
job_id = str(uuid.uuid4()) if job_id is None else job_id
add_media(media_path, job_id)

job = {
"id": job_id,
"media": [
f"{job_id}/{media_path.name}"
],
"options": options
}
job = {"id": job_id, "media": [f"{job_id}/{media_path.name}"], "options": options}
add_job(job)

return job_id
@@ -345,6 +331,14 @@ def load_whisper_model(model_name):


if __name__ == "__main__":
dotenv.load_dotenv()

logging.basicConfig(
level=logging.INFO,
format="%(asctime)s :: %(levelname)s :: %(message)s",
datefmt="%Y-%m-%dT%H:%M:%S%z",
)

check_env()

parser = argparse.ArgumentParser(prog="speech_to_text")
91 changes: 59 additions & 32 deletions tests/test_speech_to_text.py
@@ -1,25 +1,66 @@
import os
import boto3
import json
from pathlib import Path

import dotenv
import moto
import pytest
import speech_to_text

# set AWS_PROFILE from .env in the environment
dotenv.load_dotenv()
BUCKET = "bucket"
TODO_QUEUE = "todo"
DONE_QUEUE = "done"


@pytest.fixture
def aws_credentials():
"""Mocked AWS Credentials for moto."""
os.environ["AWS_ACCESS_KEY_ID"] = "testing"
os.environ["AWS_SECRET_ACCESS_KEY"] = "testing"
os.environ["AWS_SECURITY_TOKEN"] = "testing"
os.environ["AWS_SESSION_TOKEN"] = "testing"
os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
os.environ["SPEECH_TO_TEXT_TODO_SQS_QUEUE"] = TODO_QUEUE
os.environ["SPEECH_TO_TEXT_DONE_SQS_QUEUE"] = DONE_QUEUE
os.environ["SPEECH_TO_TEXT_S3_BUCKET"] = BUCKET


@pytest.fixture
def sts(aws_credentials):
with moto.mock_aws():
yield boto3.client("sts")


@pytest.fixture
def s3(aws_credentials):
with moto.mock_aws():
yield boto3.client("s3")


@pytest.fixture
def sqs(aws_credentials):
with moto.mock_aws():
yield boto3.resource("sqs")


@pytest.fixture
def bucket(s3):
return s3.create_bucket(Bucket=BUCKET)


@pytest.fixture
def queues(sqs):
sqs.create_queue(QueueName=TODO_QUEUE)
sqs.create_queue(QueueName=DONE_QUEUE)


# ignore utcnow warning until https://github.com/boto/boto3/issues/3889 is resolved
@pytest.mark.filterwarnings("ignore:datetime.datetime.utcnow")
def test_speech_to_text():
clean()

job_id = speech_to_text.create_job(Path("tests/data/en.wav"), options={
"model": "small",
"writer": {
"max_line_width": 90
}
})
def test_speech_to_text(bucket, queues):
job_id = speech_to_text.create_job(
Path("tests/data/en.wav"),
options={"model": "small", "writer": {"max_line_width": 42}},
)

speech_to_text.main(daemon=False)

@@ -41,7 +82,12 @@ def test_speech_to_text():
assert f"{job_id}/output/en.json" in job["output"]

# check that max_line_width took effect on the writer options that were used
assert job["extraction_technical_metadata"]["effective_writer_options"]["max_line_width"] == 90
assert (
job["extraction_technical_metadata"]["effective_writer_options"][
"max_line_width"
]
== 42
)

# make sure there's a message in the "done" queue
queue = speech_to_text.get_done_queue()
@@ -62,22 +108,3 @@

jobs = queue.receive_messages(MaxNumberOfMessages=1)
assert len(jobs) == 0, "queue empty"


def clean():
"""
Ensure that the bucket and queues are empty.
"""
todo = speech_to_text.get_todo_queue()
while messages := todo.receive_messages():
for m in messages:
m.delete()

done = speech_to_text.get_done_queue()
while messages := done.receive_messages():
for m in messages:
m.delete()

bucket = speech_to_text.get_bucket()
for obj in bucket.objects.all():
obj.delete()
