diff --git a/Dockerfile b/Dockerfile
index ee3084c..a7f49fc 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -11,7 +11,7 @@ RUN apt-get install -y ffmpeg nodejs=18.* \
   && rm -rf /var/lib/apt/lists/*
 
 # install python dependencies
-RUN pip install -U openai-whisper
+RUN pip install -U openai-whisper whisper-ctranslate2
 
 # add source files
 COPY src /app
diff --git a/Makefile b/Makefile
index e169519..497023d 100644
--- a/Makefile
+++ b/Makefile
@@ -32,7 +32,7 @@ test-clean:
 whisper:
 	docker run -it --rm \
 		-v ${DIR}:/app \
-		-v ${DIR}/.whisper:/root/.cache/whisper \
+		-v ${DIR}/.whisper:/app/.whisper \
 		${IMAGE_TAG} \
 		whisper ${VIDEO_FILE} \
 		--model ${MODEL} \
@@ -43,11 +43,12 @@ whisper:
 node:
 	docker run -it --rm \
 		-v ${DIR}:/app \
-		-v ${DIR}/.whisper:/root/.cache/whisper \
+		-v ${DIR}/.whisper:/app/.whisper \
 		${IMAGE_TAG} \
 		node /app/src/cut-video.js -t test -v test.mp4
 
 bash:
 	docker run -it --rm \
 		-v ${DIR}:/data \
+		-v ${DIR}/.whisper:/app/.whisper \
 		${IMAGE_TAG} /bin/bash
diff --git a/README.md b/README.md
index 0961e25..8ce4eec 100644
--- a/README.md
+++ b/README.md
@@ -35,6 +35,7 @@ docker run --rm -it \
 - `--input` - path to video file
 - `--model` - whisper model name - `tiny`, `tiny.en` (default), `base`, `base.en`, `small`, `small.en`, `medium`, `medium.en`, `large`. View [official docs](https://github.com/openai/whisper#available-models-and-languages) for a breakdown of model size and performance
 - `--language` - language code. Setting the language typically improves transcription compared to allowing Whisper to auto-detect it.
+- `--engine` - transcription engine: `whisper-ctranslate2` (default) or `whisper`. `whisper` is likely to be removed in the near future if/when `whisper-ctranslate2` proves to be just as good but 4x faster.
 
 ### Known Issues
 - `Error: Command "whisper" exited with code null` - this is likely caused by the container needing more allocated memory. Allocating at least 4 GB of memory for the `small.en` model usually resolves the issue, but your mileage may vary.
@@ -47,7 +48,7 @@ Allows you to manually create a list of timestamps to cut the video.
 
 Usage:
 ```shell
-docker run --rm -it -v $(pwd):/data video-swear-jar \
+docker run --rm -it -v $(pwd):/data video-swear-jar:v1 \
   cut-video --timestamp timestamps.txt --video video.mkv
 ```
 
@@ -60,7 +61,7 @@ This is the `whisper` CLI if you need to further customize the command. Visit ht
 
 Usage:
 ```shell
-docker run --rm -it -v $(pwd):/data video-swear-jar \
+docker run --rm -it -v $(pwd):/data video-swear-jar:v1 \
   whisper my-video.mp4 \
   --model tiny.en \
   --language en \
@@ -71,7 +72,7 @@ docker run --rm -it -v $(pwd):/data video-swear-jar \
 
 ### ffmpeg
 Usage:
 ```shell
-docker run --rm -it -v $(pwd):/data video-swear-jar \
+docker run --rm -it -v $(pwd):/data video-swear-jar:v1 \
   ffmpeg -i input.mp4 output.avi
 ```
diff --git a/docs/notes.md b/docs/notes.md
index 13ed9be..aa6c33b 100644
--- a/docs/notes.md
+++ b/docs/notes.md
@@ -7,4 +7,9 @@ ffmpeg \
   -i input.mkv \
   -vcodec copy \
   -acodec copy output.mkv
-```
\ No newline at end of file
+```
+
+## Whisper Research
+- [Making OpenAI Whisper faster](https://nikolas.blog/making-openai-whisper-faster/)
+- https://github.com/guillaumekln/faster-whisper - faster Whisper transcription with CTranslate2
+- https://github.com/Softcatala/whisper-ctranslate2 - Whisper command-line client compatible with the original OpenAI client, based on CTranslate2
diff --git a/src/clean.js b/src/clean.js
index 25f27bb..bbd9491 100755
--- a/src/clean.js
+++ b/src/clean.js
@@ -4,18 +4,24 @@ const log = require('./log')
 const utils = require('./utils')
 const video = require('./video')
 
 const argv = yargs.usage('clean')
   .options({
     input: {
       description: 'Input video filename',
       demandOption: true,
       alias: 'i'
     },
+    engine: {
+      description: 'Transcription engine',
+      alias: 'e',
+      default: 'whisper-ctranslate2',
+      choices: ['whisper', 'whisper-ctranslate2']
+    },
     model: {
       description: 'Whisper model name',
       alias: 'm',
       default: 'tiny.en',
-      choices: ['tiny.en', 'tiny', 'base.en', 'base', 'small.en', 'small']
+      choices: ['tiny.en', 'tiny', 'base.en', 'base', 'small.en', 'small', 'medium.en', 'medium', 'large-v1', 'large-v2', 'large']
     },
     language: {
       description: 'Video file language',
@@ -41,7 +47,7 @@ const run = async () => {
   try {
     log.info('[1 of 4] Starting video transcribe...')
     const { model, language } = argv
-    await video.transcribe({ inputFile: paths.inputFile, model, language, outputDir: argv['output-dir'] })
+    await video.transcribe({ engine: argv.engine, inputFile: paths.inputFile, model, language, outputDir: argv['output-dir'] })
   } catch (err) {
     log.error(`Unable to transcribe ${paths.inputFile}`, err)
     throw err
diff --git a/src/video.js b/src/video.js
index 2e4ef24..0622d12 100644
--- a/src/video.js
+++ b/src/video.js
@@ -2,17 +2,27 @@ const fs = require('fs')
 const swearWords = require('./swear-words.json')
 const utils = require('./utils')
 
-const transcribe = async ({ inputFile, model = 'tiny.en', language = 'en', outputDir = '.' }) => {
+const transcribe = async ({ engine = 'whisper-ctranslate2', inputFile, model = 'tiny.en', language = 'en', outputDir = '.' }) => {
   const args = [
     inputFile,
     '--model', model,
     '--model_dir', '/app/.whisper',
     '--language', language,
     '--output_format', 'json',
-    '--output_dir', outputDir,
-    '--fp16', 'False' // TODO: make CLI argument to use GPU
+    '--output_dir', outputDir
   ]
-  await utils.asyncSpawn('whisper', args)
+
+  // engine-specific args
+  switch (engine) {
+    case 'whisper':
+      args.push('--fp16', 'False')
+      break
+    case 'whisper-ctranslate2':
+      args.push('--compute_type', 'int8')
+      break
+  }
+
+  await utils.asyncSpawn(engine, args)
 }
 
 const cut = async ({ cutFile, outputFile }) => {
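
With this change, `video.transcribe()` builds the common argument list and then spawns whichever engine the `--engine` flag selects: `whisper-ctranslate2` gets `--compute_type int8` (quantized CPU inference), while the original `whisper` keeps `--fp16 False`. The `utils.asyncSpawn` helper is not part of this diff; the sketch below is a hypothetical minimal version, assuming it is a promise wrapper around Node's `child_process.spawn` that rejects with the `Command "..." exited with code ...` message quoted in the README's known issues.

```js
// Hypothetical sketch of utils.asyncSpawn (not shown in this diff): a promise
// wrapper around child_process.spawn so transcribe() can await either engine.
const { spawn } = require('child_process')

const asyncSpawn = (command, args = []) =>
  new Promise((resolve, reject) => {
    // inherit stdio so engine progress output is visible in the container logs
    const child = spawn(command, args, { stdio: 'inherit' })
    child.on('error', reject) // e.g. the engine binary is missing from the image
    child.on('close', (code) => {
      if (code === 0) return resolve(code)
      reject(new Error(`Command "${command}" exited with code ${code}`))
    })
  })

module.exports = { asyncSpawn }
```

Under those assumptions, the default run inside the container would execute roughly `whisper-ctranslate2 <input> --model tiny.en --model_dir /app/.whisper --language en --output_format json --output_dir . --compute_type int8`.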