Skip to content

Commit c8615bb

Browse files
committed
add offset/duration as arguments in predict.whisper, include new column called segment_offset in output by default, make sure examples on stereo are run in language es
1 parent 235178f commit c8615bb

File tree

4 files changed

+38
-17
lines changed

4 files changed

+38
-17
lines changed

NEWS.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
## CHANGES IN audio.whisper VERSION 0.4
22

33
- Allow to pass on multiple offset/durations
4-
- Allow to give sections in the audio (e.g. detected with a voice acitivy detector) to filter out these (voiced) data, make the transcription and make sure to add the amount of time which was cut out such that the resulting timepoints in from/to are aligned to the original audio file
4+
- Allow to give sections in the audio (e.g. detected with a voice activity detector) to filter out these (voiced) data, make the transcription and make sure to add the amount of time which was cut out to the from/to timestamps such that the resulting timepoints in from/to are aligned to the original audio file
5+
- The data element of the predict.whisper now includes a column called segment_offset indicating the offset of the provided sections or offsets
56

67
## CHANGES IN audio.whisper VERSION 0.3.3
78

R/whisper.R

+24-10
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,13 @@
88
#' @param language the language of the audio. Defaults to 'auto'. For a list of all languages the model can handle: see \code{\link{whisper_languages}}.
99
#' @param sections a data.frame with columns start and duration (measured in milliseconds) indicating voice segments to transcribe. This will make a new audio file with
1010
#' these sections, do the transcription and make sure the from/to timestamps are aligned to the original audio file. Defaults to transcribing the full audio file.
11+
#' @param offset an integer vector of offsets in milliseconds to start the transcription. Defaults to 0 - indicating to transcribe the full audio file.
12+
#' @param duration an integer vector of durations in milliseconds indicating how many milliseconds need to be transcribed from the corresponding \code{offset} onwards. Defaults to 0 - indicating to transcribe the full audio file.
1113
#' @param trim logical indicating to trim leading/trailing white space from the transcription using \code{\link{trimws}}. Defaults to \code{FALSE}.
1214
#' @param trace logical indicating to print the trace of the evolution of the transcription. Defaults to \code{TRUE}
1315
#' @param ... further arguments, directly passed on to the C++ function, for expert usage only and subject to naming changes. See the details.
1416
#' @details
1517
#' \itemize{
16-
#' \item{offset: milliseconds indicating to start transcribing from that timepoint onwards. Defaults to 0.}
17-
#' \item{duration: how many milliseconds need to be transcribed. Defaults to the whole audio file.}
1818
#' \item{token_timestamps: logical indicating to get the timepoints of each token}
1919
#' \item{n_threads: how many threads to use to make the prediction. Defaults to 1}
2020
#' \item{prompt: the initial prompt to pass on to the model. Defaults to ''}
@@ -25,10 +25,12 @@
2525
#' \item{max_context: maximum number of text context tokens to store. Defaults to -1}
2626
#' \item{diarize: logical indicating to perform speaker diarization for audio with more than 1 channel}
2727
#' }
28+
#' If sections are provided, a new audio file is created containing only these sections, the transcription is done on that file and the resulting from/to timestamps are realigned to the original audio file.
29+
#' If multiple offsets/durations are provided, each offset/duration pair is transcribed and the segment_offset column in the resulting data element indicates the offset each segment belongs to.
2830
#' @return an object of class \code{whisper_transcription} which is a list with the following elements:
2931
#' \itemize{
3032
#' \item{n_segments: the number of audio segments}
31-
#' \item{data: a data.frame with the transcription with columns segment, text, from, to and optionally speaker if diarize=TRUE}
33+
#' \item{data: a data.frame with the transcription with columns segment, segment_offset, text, from, to and optionally speaker if diarize=TRUE}
3234
#' \item{tokens: a data.frame with the transcription tokens with columns segment, token_id, token, token_prob indicating the token probability given the context}
3335
#' \item{params: a list with parameters used for inference}
3436
#' \item{timing: a list with elements start, end and duration indicating how long it took to do the transcription}
@@ -69,15 +71,22 @@
6971
#' trans <- predict(model, newdata = audio, language = "auto", diarize = TRUE)
7072
predict.whisper <- function(object, newdata, type = c("transcribe", "translate"), language = "auto",
7173
sections = data.frame(start = integer(), duration = integer()),
74+
offset = 0L, duration = 0L,
7275
trim = FALSE, trace = TRUE, ...){
7376
type <- match.arg(type)
7477
stopifnot(length(newdata) == 1)
7578
stopifnot(file.exists(newdata))
7679
stopifnot(is.data.frame(sections) && all(c("start", "duration") %in% colnames(sections)))
80+
path <- newdata
81+
##
7782
## If specific audio sections are requested
83+
##
7884
if(nrow(sections) > 0){
85+
if(length(offset) > 1 || length(duration) > 1 || any(offset != 0) || any(duration != 0)){
86+
stop("sections can not be combined with offset/duration")
87+
}
7988
voiced <- subset.wav(newdata, offset = sections$start, duration = sections$duration)
80-
newdata <- voiced$file
89+
path <- voiced$file
8190
on.exit({
8291
if(file.exists(voiced$file)) file.remove(voiced$file)
8392
})
@@ -87,9 +96,9 @@ predict.whisper <- function(object, newdata, type = c("transcribe", "translate")
8796
}
8897
start <- Sys.time()
8998
if(type == "transcribe"){
90-
out <- whisper_encode(model = object$model, path = newdata, language = language, translate = FALSE, trace = as.integer(trace), ...)
99+
out <- whisper_encode(model = object$model, path = path, language = language, translate = FALSE, trace = as.integer(trace), offset = offset, duration = duration, ...)
91100
}else if(type == "translate"){
92-
out <- whisper_encode(model = object$model, path = newdata, language = language, translate = TRUE, trace = as.integer(trace), ...)
101+
out <- whisper_encode(model = object$model, path = path, language = language, translate = TRUE, trace = as.integer(trace), offset = offset, duration = duration, ...)
93102
}
94103
Encoding(out$data$text) <- "UTF-8"
95104
Encoding(out$tokens$token) <- "UTF-8"
@@ -98,11 +107,14 @@ predict.whisper <- function(object, newdata, type = c("transcribe", "translate")
98107
out$tokens$token <- trimws(out$tokens$token)
99108
}
100109
end <- Sys.time()
110+
##
101111
## If specific audio sections are requested - make sure timestamps are correct
112+
##
102113
if(nrow(sections) > 0){
114+
out$params$audio <- newdata
103115
## Align timestamps for out$data
104116
sentences <- align_skipped(sentences = out$data, skipped = skipped, from = "from", to = "to")
105-
sentences <- subset(sentences, sentences$grp == "voiced", select = intersect(c("segment", "from", "to", "text", "speaker"), colnames(sentences)))
117+
sentences <- subset(sentences, sentences$grp == "voiced", select = intersect(c("segment", "segment_offset", "from", "to", "text", "speaker"), colnames(sentences)))
106118
out$data <- sentences
107119
## Align timestamps for out$tokens if they are requested
108120
if("token_from" %in% colnames(out$tokens)){
@@ -143,6 +155,8 @@ align_skipped <- function(sentences, skipped, from = "from", to = "to"){
143155
sentences$end <- sentences$end + sentences$add
144156
sentences[[from]] <- format(as.POSIXct("1970-01-01 00:00:00", tz = "UTC") + sentences$start / 1000, "%H:%M:%OS")
145157
sentences[[to]] <- format(as.POSIXct("1970-01-01 00:00:00", tz = "UTC") + sentences$end / 1000, "%H:%M:%OS")
158+
sentences$segment_offset <- data.table::nafill(ifelse(sentences$grp == "skipped", sentences$start, NA_integer_), type = "locf")
159+
sentences$segment_offset <- ifelse(is.na(sentences$segment_offset), 0L, sentences$segment_offset)
146160
sentences <- data.table::setDF(sentences)
147161
sentences
148162
}
@@ -198,15 +212,15 @@ align_skipped <- function(sentences, skipped, from = "from", to = "to"){
198212
#'
199213
#' ## Add diarization
200214
#' trans <- predict(model, newdata = system.file(package = "audio.whisper", "samples", "stereo.wav"),
201-
#' language = "auto", diarize = TRUE)
215+
#' language = "es", diarize = TRUE)
202216
#' ## Provide multiple offsets and durations to get the segments in there
203217
#' trans <- predict(model, newdata = system.file(package = "audio.whisper", "samples", "stereo.wav"),
204-
#' language = "auto", diarize = TRUE,
218+
#' language = "es", diarize = TRUE,
205219
#' offset = c( 650, 6060, 10230), duration = c(4990, 3830, 11650))
206220
#' ## Provide sections - this will make a new audio file and next do the transcription
207221
#' if(require(data.table) && require(audio)){
208222
#' trans <- predict(model, newdata = system.file(package = "audio.whisper", "samples", "stereo.wav"),
209-
#' language = "auto", diarize = TRUE,
223+
#' language = "es", diarize = TRUE,
210224
#' sections = data.frame(start = c( 650, 6060, 10230),
211225
#' duration = c(4990, 3830, 11650)))
212226
#' }

man/predict.whisper.Rd

+9-3
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/whisper.Rd

+3-3
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)