
Commit dad52b7

add predict.whisper_transcription, which allows assigning a transcription segment to either the left or right channel based on a Voice Activity Detection

1 parent: cbf7c00

5 files changed: +152 −2 lines

DESCRIPTION

+3 −2

@@ -1,7 +1,7 @@
 Package: audio.whisper
 Type: Package
 Title: Transcribe Audio Files using the "Whisper" Automatic Speech Recognition Model
-Version: 0.4
+Version: 0.4.1
 Maintainer: Jan Wijffels <jwijffels@bnosac.be>
 Authors@R: c(
     person('Jan', 'Wijffels', role = c('aut', 'cre', 'cph'), email = 'jwijffels@bnosac.be', comment = "R wrapper"),
@@ -25,7 +25,8 @@ Imports:
 Suggests:
     tinytest,
     audio,
-    data.table (>= 1.12.4)
+    data.table (>= 1.12.4),
+    audio.vadwebrtc (>= 0.2.0)
 LinkingTo: Rcpp
 SystemRequirements: GNU make
 RoxygenNote: 7.1.2
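
Because audio.vadwebrtc lands in Suggests rather than Imports, any code path that uses it needs a runtime guard. A minimal sketch of the standard idiom, with the VAD_channel arguments copied from the package example in R/vad-channel.R below; the error message is illustrative:

    if(requireNamespace("audio.vadwebrtc", quietly = TRUE)){
      ## suggested package is installed: run the Voice Activity Detection per channel
      vad <- audio.vadwebrtc::VAD_channel(audio, channels = "all", mode = "veryaggressive", milliseconds = 30)
    }else{
      ## fail gracefully when the suggested package is missing
      stop("please install.packages('audio.vadwebrtc') to use Voice Activity Detection by channel")
    }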

NAMESPACE

+1 −0

@@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand

 S3method(predict,whisper)
+S3method(predict,whisper_transcription)
 export(whisper)
 export(whisper_benchmark)
 export(whisper_download_model)
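
The S3method registration means the existing predict() generic now dispatches on whisper_transcription objects, so no new exported function name is needed. A small sketch, with trans and vad as produced in the example further below:

    inherits(trans, "whisper_transcription")   # TRUE: predict.whisper returns this class
    out <- predict(trans, vad)                 # dispatches to predict.whisper_transcription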

NEWS.md

+4 −0

@@ -1,3 +1,7 @@
+## CHANGES IN audio.whisper VERSION 0.4.1
+
+- Added function predict.whisper_transcription, which assigns a transcription segment to either the left or right channel based on a Voice Activity Detection
+
 ## CHANGES IN audio.whisper VERSION 0.4

 - Allow to pass on multiple offset/durations
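
Condensed from the package example further below, the new method slots in as a post-processing step after predict.whisper; the stereo sample file and "tiny" model are the ones used in that example:

    library(audio.whisper)
    library(audio.vadwebrtc)
    model <- whisper("tiny")
    audio <- system.file(package = "audio.whisper", "samples", "stereo.wav")
    trans <- predict(model, audio, language = "es")                  # whisper_transcription
    vad   <- VAD_channel(audio, channels = "all", mode = "veryaggressive", milliseconds = 30)
    out   <- predict(trans, vad, type = "channel", threshold = 0)    # left/right/both per segment
    out$data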

R/vad-channel.R

+86 −0 (new file)

#' @title Predict to which channel a transcription section belongs
#' @description For audio files containing 2 channels which were transcribed with \code{\link{predict.whisper}},
#' you can use the results of a Voice Activity Detection by channel (either with R packages
#' \code{audio.vadwebrtc} or \code{audio.vadsilero}) to assign the text segments to each of the channels.\cr
#' This is done by computing, for each text segment, how many seconds of overlap there are with the voiced
#' sections identified by the Voice Activity Detection.
#' @param object an object of class \code{whisper_transcription} as returned by \code{\link{predict.whisper}}
#' @param vad an object of class \code{webrtc-gmm-bychannel} as returned by function \code{VAD_channel} from R package
#' \code{audio.vadwebrtc} with information on the detected voice in at least channels 1 and 2,
#' or a list with element vad_segments containing a data.frame with columns channel, start, end and has_voice
#' indicating at which seconds there was voice in the audio
#' @param type character string with currently only possible value 'channel', which does a 2-speaker channel assignment
#' @param threshold numeric in the 0-1 range: if the difference between the probability that the segment came from
#' the left channel (1) and the probability that it came from the right channel (2) is smaller than this amount,
#' the column \code{channel} is set to 'both'. Defaults to 0.
#' @param ... not used
#' @return an object of class \code{whisper_transcription} as documented in \code{\link{predict.whisper}},
#' where element \code{data} contains the following extra columns indicating which channel the transcription is probably from
#' \itemize{
#' \item{channel: either 'left', 'right' or 'both', indicating the transcription segment was from the left channel (1),
#' the right channel (2) or probably from both, as identified by the Voice Activity Detection}
#' \item{channel_probability: a number between 0 and 1 giving, for that specific segment, the ratio of the amount of
#' voiced seconds in the most probable channel to the sum of the voiced seconds in the left + right channel}
#' \item{duration: how long (in seconds) the from-to segment is}
#' \item{duration_voiced_left: how many seconds there was a voiced signal on the left channel (channel 1) as identified by \code{vad}}
#' \item{duration_voiced_right: how many seconds there was a voiced signal on the right channel (channel 2) as identified by \code{vad}}
#' }
#' @export
#' @seealso \code{\link{predict.whisper}}
#' @examples
#' library(audio.whisper)
#' model <- whisper("tiny")
#' audio <- system.file(package = "audio.whisper", "samples", "stereo.wav")
#' trans <- predict(model, audio, language = "es")
#' \dontrun{
#' library(audio.vadwebrtc)
#' vad <- VAD_channel(audio, channels = "all", mode = "veryaggressive", milliseconds = 30)
#' }
#' vad <- list(vad_segments = rbind(
#'   data.frame(channel = 1, start = c(0, 5, 15, 22), end = c(5, 9, 18, 23), has_voice = TRUE),
#'   data.frame(channel = 2, start = c(2, 9.5, 19, 22), end = c(2.5, 13.5, 21, 23), has_voice = TRUE)))
#' out <- predict(trans, vad, type = "channel", threshold = 0)
#' out$data
predict.whisper_transcription <- function(object, vad, type = "channel", threshold = 0, ...){
  stopifnot(inherits(object, "whisper_transcription"))
  ## vad is either a webrtc-gmm-bychannel object or a plain list, both carry element vad_segments
  stopifnot(is.list(vad), "vad_segments" %in% names(vad))
  fields    <- colnames(object$data)
  sentences <- object$data
  ## convert the HH:MM:SS.mmm from/to timestamps to seconds since the start of the audio
  today    <- Sys.Date()
  midnight <- as.POSIXct(paste(today, "00:00:00.000"), format = "%Y-%m-%d %H:%M:%OS")
  sentences$start    <- as.numeric(difftime(as.POSIXct(paste(today, sentences$from), format = "%Y-%m-%d %H:%M:%OS"), midnight, units = "secs"))
  sentences$end      <- as.numeric(difftime(as.POSIXct(paste(today, sentences$to),   format = "%Y-%m-%d %H:%M:%OS"), midnight, units = "secs"))
  sentences$duration <- sentences$end - sentences$start
  ## voiced sections by channel as identified by the Voice Activity Detection
  left  <- vad$vad_segments[vad$vad_segments$channel %in% 1 & vad$vad_segments$has_voice, ]
  right <- vad$vad_segments[vad$vad_segments$channel %in% 2 & vad$vad_segments$has_voice, ]
  ## number of seconds of [start, end] overlapping with the voiced sections
  overlap <- function(start, end, voiced){
    voiced      <- voiced[voiced$end >= start & voiced$start <= end, ]
    voiced$from <- ifelse(voiced$start > start, voiced$start, start)
    voiced$to   <- ifelse(voiced$end > end, end, voiced$end)
    voiced      <- voiced[voiced$from <= voiced$to, ]
    sum(voiced$to - voiced$from, na.rm = TRUE)
  }
  sentences$duration_voiced_left  <- as.numeric(mapply(start = sentences$start, end = sentences$end, FUN = overlap,
                                                       MoreArgs = list(voiced = left), SIMPLIFY = TRUE, USE.NAMES = FALSE))
  sentences$duration_voiced_right <- as.numeric(mapply(start = sentences$start, end = sentences$end, FUN = overlap,
                                                       MoreArgs = list(voiced = right), SIMPLIFY = TRUE, USE.NAMES = FALSE))
  ## assign each segment to the channel with the most voiced seconds;
  ## if the rounded left/right probabilities differ by less than the threshold, mark it as 'both'
  sentences$channel <- ifelse(sentences$duration_voiced_left > sentences$duration_voiced_right, "left", "right")
  sentences$channel_left_probability  <- sentences$duration_voiced_left  / (sentences$duration_voiced_left + sentences$duration_voiced_right)
  sentences$channel_right_probability <- sentences$duration_voiced_right / (sentences$duration_voiced_left + sentences$duration_voiced_right)
  sentences$channel_probability <- ifelse(sentences$channel_left_probability > sentences$channel_right_probability,
                                          sentences$channel_left_probability, sentences$channel_right_probability)
  sentences$left_pct  <- round(sentences$channel_left_probability,  digits = 2)
  sentences$right_pct <- round(sentences$channel_right_probability, digits = 2)
  sentences$channel   <- ifelse(abs(sentences$left_pct - sentences$right_pct) < threshold, "both", sentences$channel)
  ## keep the original columns plus the new channel information
  object$data <- sentences[, unique(c(fields, "channel", "channel_probability", "duration", "duration_voiced_left", "duration_voiced_right"))]
  object
}
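
To make the channel_probability arithmetic concrete, here is a worked example using the vad segments from the example above and a hypothetical transcription segment running from second 4 to second 10; the pmax/pmin clamping is a condensed re-implementation of the overlap computation, for illustration only:

    start <- 4; end <- 10
    left  <- data.frame(start = c(0, 5, 15, 22),   end = c(5, 9, 18, 23))       # channel 1
    right <- data.frame(start = c(2, 9.5, 19, 22), end = c(2.5, 13.5, 21, 23))  # channel 2
    voiced_overlap <- function(start, end, voiced){
      ## clamp each voiced section to [start, end] and sum the remaining durations
      sum(pmax(pmin(voiced$end, end) - pmax(voiced$start, start), 0))
    }
    voiced_overlap(start, end, left)   # 5.0 seconds voiced on the left channel
    voiced_overlap(start, end, right)  # 0.5 seconds voiced on the right channel
    5 / (5 + 0.5)                      # channel_probability of 0.91 -> channel = 'left'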

man/predict.whisper_transcription.Rd

+58 −0

Generated Rd documentation file; contents not rendered by default.
