Allow passing no_timestamps to predict.whisper #76

bnosac · Dec 23, 2024 · ca36a07 · ca36a07
1 parent 4b5c6a2
commit ca36a07
Show file tree

Hide file tree

Showing 6 changed files with 20 additions and 14 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: audio.whisper
 Type: Package
 Title: Transcribe Audio Files using the "Whisper" Automatic Speech Recognition Model
-Version: 0.4.1
+Version: 0.4.2
 Maintainer: Jan Wijffels <jwijffels@bnosac.be>
 Authors@R: c(
     person('Jan', 'Wijffels', role = c('aut', 'cre', 'cph'), email = 'jwijffels@bnosac.be', comment = "R wrapper"), 
@@ -29,6 +29,5 @@ Suggests:
     audio.vadwebrtc (>= 0.2.0)
 LinkingTo: Rcpp
 SystemRequirements: GNU make
-RoxygenNote: 7.1.2
+RoxygenNote: 7.2.3
 Remotes: bnosac/audio.vadwebrtc
-
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,7 @@
+## CHANGES IN audio.whisper VERSION 0.4.2
+
+- Allow passing no_timestamps to predict.whisper
+
 ## CHANGES IN audio.whisper VERSION 0.4.1
 
 - Added function predict.whisper_transcription, which allows assigning a transcription segment to either the left or the right channel based on Voice Activity Detection

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -5,8 +5,8 @@ whisper_load_model <- function(model, use_gpu = FALSE) {
     .Call('_audio_whisper_whisper_load_model', PACKAGE = 'audio.whisper', model, use_gpu)
 }
 
-whisper_encode <- function(model, path, language, token_timestamps = FALSE, translate = FALSE, duration = 0L, offset = 0L, trace = 1L, n_threads = 1L, n_processors = 1L, entropy_thold = 2.40, logprob_thold = -1.00, beam_size = -1L, best_of = 5L, split_on_word = FALSE, max_context = -1L, prompt = "", print_special = FALSE, diarize = FALSE, diarize_percent = 1.1) {
-    .Call('_audio_whisper_whisper_encode', PACKAGE = 'audio.whisper', model, path, language, token_timestamps, translate, duration, offset, trace, n_threads, n_processors, entropy_thold, logprob_thold, beam_size, best_of, split_on_word, max_context, prompt, print_special, diarize, diarize_percent)
+whisper_encode <- function(model, path, language, token_timestamps = FALSE, translate = FALSE, duration = 0L, offset = 0L, trace = 1L, n_threads = 1L, n_processors = 1L, entropy_thold = 2.40, logprob_thold = -1.00, beam_size = -1L, best_of = 5L, split_on_word = FALSE, max_context = -1L, prompt = "", print_special = FALSE, diarize = FALSE, diarize_percent = 1.1, no_timestamps = FALSE) {
+    .Call('_audio_whisper_whisper_encode', PACKAGE = 'audio.whisper', model, path, language, token_timestamps, translate, duration, offset, trace, n_threads, n_processors, entropy_thold, logprob_thold, beam_size, best_of, split_on_word, max_context, prompt, print_special, diarize, diarize_percent, no_timestamps)
 }
 
 whisper_print_benchmark <- function(model, n_threads = 1L) {

diff --git a/man/whisper_download_model.Rd b/man/whisper_download_model.Rd
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
@@ -18,8 +18,8 @@ BEGIN_RCPP
 END_RCPP
 }
 // whisper_encode
-Rcpp::List whisper_encode(SEXP model, std::string path, std::string language, bool token_timestamps, bool translate, Rcpp::IntegerVector duration, Rcpp::IntegerVector offset, int trace, int n_threads, int n_processors, float entropy_thold, float logprob_thold, int beam_size, int best_of, bool split_on_word, int max_context, std::string prompt, bool print_special, bool diarize, float diarize_percent);
-RcppExport SEXP _audio_whisper_whisper_encode(SEXP modelSEXP, SEXP pathSEXP, SEXP languageSEXP, SEXP token_timestampsSEXP, SEXP translateSEXP, SEXP durationSEXP, SEXP offsetSEXP, SEXP traceSEXP, SEXP n_threadsSEXP, SEXP n_processorsSEXP, SEXP entropy_tholdSEXP, SEXP logprob_tholdSEXP, SEXP beam_sizeSEXP, SEXP best_ofSEXP, SEXP split_on_wordSEXP, SEXP max_contextSEXP, SEXP promptSEXP, SEXP print_specialSEXP, SEXP diarizeSEXP, SEXP diarize_percentSEXP) {
+Rcpp::List whisper_encode(SEXP model, std::string path, std::string language, bool token_timestamps, bool translate, Rcpp::IntegerVector duration, Rcpp::IntegerVector offset, int trace, int n_threads, int n_processors, float entropy_thold, float logprob_thold, int beam_size, int best_of, bool split_on_word, int max_context, std::string prompt, bool print_special, bool diarize, float diarize_percent, bool no_timestamps);
+RcppExport SEXP _audio_whisper_whisper_encode(SEXP modelSEXP, SEXP pathSEXP, SEXP languageSEXP, SEXP token_timestampsSEXP, SEXP translateSEXP, SEXP durationSEXP, SEXP offsetSEXP, SEXP traceSEXP, SEXP n_threadsSEXP, SEXP n_processorsSEXP, SEXP entropy_tholdSEXP, SEXP logprob_tholdSEXP, SEXP beam_sizeSEXP, SEXP best_ofSEXP, SEXP split_on_wordSEXP, SEXP max_contextSEXP, SEXP promptSEXP, SEXP print_specialSEXP, SEXP diarizeSEXP, SEXP diarize_percentSEXP, SEXP no_timestampsSEXP) {
 BEGIN_RCPP
     Rcpp::RObject rcpp_result_gen;
     Rcpp::RNGScope rcpp_rngScope_gen;
@@ -43,7 +43,8 @@ BEGIN_RCPP
     Rcpp::traits::input_parameter< bool >::type print_special(print_specialSEXP);
     Rcpp::traits::input_parameter< bool >::type diarize(diarizeSEXP);
     Rcpp::traits::input_parameter< float >::type diarize_percent(diarize_percentSEXP);
-    rcpp_result_gen = Rcpp::wrap(whisper_encode(model, path, language, token_timestamps, translate, duration, offset, trace, n_threads, n_processors, entropy_thold, logprob_thold, beam_size, best_of, split_on_word, max_context, prompt, print_special, diarize, diarize_percent));
+    Rcpp::traits::input_parameter< bool >::type no_timestamps(no_timestampsSEXP);
+    rcpp_result_gen = Rcpp::wrap(whisper_encode(model, path, language, token_timestamps, translate, duration, offset, trace, n_threads, n_processors, entropy_thold, logprob_thold, beam_size, best_of, split_on_word, max_context, prompt, print_special, diarize, diarize_percent, no_timestamps));
     return rcpp_result_gen;
 END_RCPP
 }
@@ -71,7 +72,7 @@ END_RCPP
 
 static const R_CallMethodDef CallEntries[] = {
     {"_audio_whisper_whisper_load_model", (DL_FUNC) &_audio_whisper_whisper_load_model, 2},
-    {"_audio_whisper_whisper_encode", (DL_FUNC) &_audio_whisper_whisper_encode, 20},
+    {"_audio_whisper_whisper_encode", (DL_FUNC) &_audio_whisper_whisper_encode, 21},
     {"_audio_whisper_whisper_print_benchmark", (DL_FUNC) &_audio_whisper_whisper_print_benchmark, 2},
     {"_audio_whisper_whisper_language_info", (DL_FUNC) &_audio_whisper_whisper_language_info, 0},
     {NULL, NULL, 0}

diff --git a/src/rcpp_whisper.cpp b/src/rcpp_whisper.cpp
@@ -228,7 +228,8 @@ Rcpp::List whisper_encode(SEXP model, std::string path, std::string language,
                           std::string prompt = "",
                           bool print_special = false,
                           bool diarize = false,
-                          float diarize_percent = 1.1) {
+                          float diarize_percent = 1.1,
+                          bool no_timestamps = false) {
     float audio_duration=0;
 
     whisper_params params;
@@ -249,6 +250,7 @@ Rcpp::List whisper_encode(SEXP model, std::string path, std::string language,
     params.max_context = max_context;
     params.prompt = prompt;
     params.diarize = diarize;
+    params.no_timestamps = no_timestamps;
     if (params.fname_inp.empty()) {
         Rcpp::stop("error: no input files specified");
     }