Mocked AWS
This PR mocks AWS resources in the tests with moto. It also includes a GitHub Actions
configuration that runs linting and a format check with ruff before running the tests.

Closes #12
Refs #36
edsu committed Oct 29, 2024
1 parent 864a7d8 commit a75dd3a
Showing 7 changed files with 116 additions and 48 deletions.
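
For orientation before the diff: the tests below wrap their boto3 clients in moto's `mock_aws()` context manager, so S3/SQS/STS calls hit moto's in-memory backends rather than real AWS. A minimal, standalone sketch of that pattern (illustrative only, not part of this commit):

```python
import boto3
import moto

# Inside mock_aws(), boto3 calls are intercepted by moto's in-memory backends,
# so no real AWS credentials or resources are needed.
with moto.mock_aws():
    s3 = boto3.client("s3", region_name="us-east-1")
    s3.create_bucket(Bucket="bucket")
    names = [b["Name"] for b in s3.list_buckets()["Buckets"]]
    assert names == ["bucket"]
```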
39 changes: 39 additions & 0 deletions .github/workflows/test.yml
@@ -0,0 +1,39 @@
name: Test
on:
- push
jobs:
build:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.11]
steps:

- name: checkout
uses: actions/checkout@v3

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}

- name: Lint
uses: chartboost/ruff-action@v1
# Default action is 'check'
# it may move, see https://github.com/astral-sh/ruff/issues/8400

- name: Format
uses: chartboost/ruff-action@v1
with:
args: 'format --check'

- name: Install ffmpeg
run: |
wget -O - https://raw.githubusercontent.com/jontybrook/ffmpeg-install-script/main/install-ffmpeg-static.sh | bash -s -- --stable --force
- name: Install Python dependencies
run: |
pip install -r requirements.txt
- name: Run tests
run: pytest
1 change: 1 addition & 0 deletions .gitignore
@@ -2,3 +2,4 @@
.env
__pycache__/
whisper_models
*.log
4 changes: 4 additions & 0 deletions README.md
@@ -1,5 +1,7 @@
# speech-to-text

[![Test](https://github.com/sul-dlss/speech-to-text/actions/workflows/test.yml/badge.svg)](https://github.com/sul-dlss/speech-to-text/actions/workflows/test.yml)

This repository contains a Docker configuration for performing serverless speech-to-text processing with Whisper, using an Amazon Simple Storage Service (S3) bucket for media files and Amazon Simple Queue Service (SQS) for coordinating work.

## Build
@@ -153,3 +155,5 @@ source .venv/bin/activate
pip install -r requirements.txt
pytest
```

Note: the tests use the [moto](https://docs.getmoto.org/en/latest/) library to mock out AWS resources. If you want to test against live AWS, you can follow the steps above to create a job, run it, and then receive the done message.
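
For reference, testing against live AWS amounts to roughly the sketch below, assuming your environment is configured for real S3 and SQS resources as described above (illustrative only; the option keys mirror the ones used in the test):

```python
from pathlib import Path

import speech_to_text

# Create a job for a local media file and queue it for processing.
job_id = speech_to_text.create_job(
    Path("tests/data/en.wav"),
    options={"model": "small"},
)

# Process pending jobs once instead of looping forever as a daemon.
speech_to_text.main(daemon=False)

# Receive (and delete) the resulting message from the "done" queue.
queue = speech_to_text.get_done_queue()
for message in queue.receive_messages(MaxNumberOfMessages=1):
    print(message.body)
    message.delete()
```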
2 changes: 2 additions & 0 deletions pytest.ini
@@ -1,2 +1,4 @@
[pytest]
log_level = INFO
log_file = test.log
pythonpath = .
1 change: 1 addition & 0 deletions requirements.txt
@@ -2,3 +2,4 @@ boto3
openai-whisper
python-dotenv
pytest
moto[s3,sqs,sts]
26 changes: 10 additions & 16 deletions speech_to_text.py
@@ -18,13 +18,6 @@
import whisper
from whisper.utils import get_writer

dotenv.load_dotenv()
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s :: %(levelname)s :: %(message)s",
datefmt="%Y-%m-%dT%H:%M:%S%z",
)


def main(daemon=True):
# loop forever looking for jobs unless daemon says not to
@@ -95,7 +88,7 @@ def download_media(job):
output_dir.mkdir()

for media_file in job["media"]:
# note the media_file is expected to be the full path in the bucket
# note the media_file is expected to be the full path in the bucket
# e.g. pg879tb2706-v2/video_1.mp4
bucket.download_file(media_file, media_file)

@@ -155,7 +148,6 @@ def upload_results(job):
job["output"] = []
output_dir = get_output_dir(job)
for path in output_dir.iterdir():

# ignore non output files
if path.suffix not in [".vtt", ".srt", ".json", ".txt", ".tsv"]:
continue
@@ -280,13 +272,7 @@ def create_job(media_path: Path, job_id: str = None, options={}):
job_id = str(uuid.uuid4()) if job_id is None else job_id
add_media(media_path, job_id)

job = {
"id": job_id,
"media": [
f"{job_id}/{media_path.name}"
],
"options": options
}
job = {"id": job_id, "media": [f"{job_id}/{media_path.name}"], "options": options}
add_job(job)

return job_id
@@ -345,6 +331,14 @@ def load_whisper_model(model_name):


if __name__ == "__main__":
dotenv.load_dotenv()

logging.basicConfig(
level=logging.INFO,
format="%(asctime)s :: %(levelname)s :: %(message)s",
datefmt="%Y-%m-%dT%H:%M:%S%z",
)

check_env()

parser = argparse.ArgumentParser(prog="speech_to_text")
91 changes: 59 additions & 32 deletions tests/test_speech_to_text.py
@@ -1,25 +1,66 @@
import os
import boto3
import json
from pathlib import Path

import dotenv
import moto
import pytest
import speech_to_text

# set AWS_PROFILE from .env in the environment
dotenv.load_dotenv()
BUCKET = "bucket"
TODO_QUEUE = "todo"
DONE_QUEUE = "done"


@pytest.fixture
def aws_credentials():
"""Mocked AWS Credentials for moto."""
os.environ["AWS_ACCESS_KEY_ID"] = "testing"
os.environ["AWS_SECRET_ACCESS_KEY"] = "testing"
os.environ["AWS_SECURITY_TOKEN"] = "testing"
os.environ["AWS_SESSION_TOKEN"] = "testing"
os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
os.environ["SPEECH_TO_TEXT_TODO_SQS_QUEUE"] = TODO_QUEUE
os.environ["SPEECH_TO_TEXT_DONE_SQS_QUEUE"] = DONE_QUEUE
os.environ["SPEECH_TO_TEXT_S3_BUCKET"] = BUCKET


@pytest.fixture
def sts(aws_credentials):
with moto.mock_aws():
yield boto3.client("sts")


@pytest.fixture
def s3(aws_credentials):
with moto.mock_aws():
yield boto3.client("s3")


@pytest.fixture
def sqs(aws_credentials):
with moto.mock_aws():
yield boto3.resource("sqs")


@pytest.fixture
def bucket(s3):
return s3.create_bucket(Bucket=BUCKET)


@pytest.fixture
def queues(sqs):
sqs.create_queue(QueueName=TODO_QUEUE)
sqs.create_queue(QueueName=DONE_QUEUE)


# ignore utcnow warning until https://github.com/boto/boto3/issues/3889 is resolved
@pytest.mark.filterwarnings("ignore:datetime.datetime.utcnow")
def test_speech_to_text():
clean()

job_id = speech_to_text.create_job(Path("tests/data/en.wav"), options={
"model": "small",
"writer": {
"max_line_width": 90
}
})
def test_speech_to_text(bucket, queues):
job_id = speech_to_text.create_job(
Path("tests/data/en.wav"),
options={"model": "small", "writer": {"max_line_width": 42}},
)

speech_to_text.main(daemon=False)

@@ -41,7 +82,12 @@ def test_speech_to_text():
assert f"{job_id}/output/en.json" in job["output"]

# check that max_line_width took effect on the writer options that were used
assert job["extraction_technical_metadata"]["effective_writer_options"]["max_line_width"] == 90
assert (
job["extraction_technical_metadata"]["effective_writer_options"][
"max_line_width"
]
== 42
)

# make sure there's a message in the "done" queue
queue = speech_to_text.get_done_queue()
@@ -62,22 +108,3 @@

jobs = queue.receive_messages(MaxNumberOfMessages=1)
assert len(jobs) == 0, "queue empty"


def clean():
"""
Ensure that the bucket and queues are empty.
"""
todo = speech_to_text.get_todo_queue()
while messages := todo.receive_messages():
for m in messages:
m.delete()

done = speech_to_text.get_done_queue()
while messages := done.receive_messages():
for m in messages:
m.delete()

bucket = speech_to_text.get_bucket()
for obj in bucket.objects.all():
obj.delete()
