Skip to content

Commit

Permalink
feat: translate long text
Browse files Browse the repository at this point in the history
  • Loading branch information
Tomeriko96 committed Jul 27, 2024
1 parent 6d2036c commit ee85fd1
Show file tree
Hide file tree
Showing 6 changed files with 104 additions and 18 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ export(create_transliteration_table)
export(google_get_supported_languages)
export(google_is_valid_language_code)
export(google_translate)
export(google_translate_long_text)
export(google_transliterate)
export(language_detect)
export(linguee_external_sources)
Expand Down
63 changes: 63 additions & 0 deletions R/google_translate_long_text.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#' Translate long text using Google Translate
#'
#' Translates a single (possibly long) string from one language to another
#' using the Google Translate mobile web page. Text longer than `chunk_size`
#' characters is split into chunks, every chunk is translated with its own
#' HTTP request, and the translated chunks are joined with a single space.
#'
#' Chunks are cut at whitespace whenever the next `chunk_size` characters
#' contain any, so words are not split across requests. Only a run of more
#' than `chunk_size` characters without whitespace is cut at a fixed offset.
#'
#' @param text The long text to translate. Must be a single, non-missing
#'   string. A string that is empty or only whitespace is returned as `""`
#'   without contacting the service.
#' @param target_language The language to translate the text into. Default is
#'   "en" for English.
#' @param source_language The language of the input text. Default is "auto"
#'   for automatic detection.
#' @param chunk_size The maximum number of characters to send in a single
#'   translation request. Must be a single number >= 1. Default is 1000.
#'
#' @return A single string containing the translated text.
#' @export
#'
#' @examples
#' \donttest{
#' long_text <- paste(rep("This is a long text to translate.", 100), collapse = " ")
#' google_translate_long_text(
#'   long_text,
#'   target_language = "de",
#'   source_language = "en",
#'   chunk_size = 500
#' )
#' }
google_translate_long_text <- function(text,
                                       target_language = "en",
                                       source_language = "auto",
                                       chunk_size = 1000) {
  if (!is.character(text) || length(text) != 1L || is.na(text)) {
    stop("`text` must be a single, non-missing character string.",
         call. = FALSE)
  }
  if (!is.numeric(chunk_size) || length(chunk_size) != 1L ||
        !is.finite(chunk_size) || chunk_size < 1) {
    stop("`chunk_size` must be a single number greater than or equal to 1.",
         call. = FALSE)
  }
  if (!google_is_valid_language_code(target_language)) {
    stop("Invalid target language code.")
  }
  if (!google_is_valid_language_code(source_language)) {
    stop("Invalid source language code.")
  }

  # Nothing to translate: avoid a pointless request whose response would
  # contain no result node.
  if (!nzchar(trimws(text))) {
    return("")
  }

  # A chunk never needs to be longer than the text itself; capping also keeps
  # the index arithmetic below within integer range.
  chunk_size <- as.integer(min(floor(chunk_size), nchar(text)))

  # Split `text` into pieces of at most `chunk_size` characters, preferring
  # to cut at the last whitespace character that fits.
  split_text <- function(text, chunk_size) {
    total <- nchar(text)
    pieces <- list()
    start <- 1L
    while (start <= total) {
      if (total - start + 1L <= chunk_size) {
        piece <- substr(text, start, total)
        start <- total + 1L
      } else {
        # Look one character past the limit: if that character is whitespace
        # the full `chunk_size` characters can be used.
        window <- substr(text, start, start + chunk_size)
        cut <- max(gregexpr("[[:space:]]", window)[[1]]) # -1 when none
        if (cut > 1L) {
          piece <- substr(window, 1L, cut - 1L)
          start <- start + cut # skip the whitespace we cut at
        } else {
          # No usable whitespace: fall back to a fixed-width cut.
          piece <- substr(window, 1L, chunk_size)
          start <- start + chunk_size
        }
      }
      piece <- trimws(piece)
      if (nzchar(piece)) {
        pieces[[length(pieces) + 1L]] <- piece
      }
    }
    unlist(pieces, use.names = FALSE)
  }

  # Translate one chunk and return exactly one string.
  translate_chunk <- function(chunk) {
    formatted_link <- paste0(
      "https://translate.google.com/m?tl=",
      target_language, "&sl=", source_language,
      "&q=", urltools::url_encode(chunk)
    )

    response <- httr::GET(formatted_link)
    # Fail loudly on HTTP errors instead of scraping an error page.
    httr::stop_for_status(response)

    result_nodes <- rvest::html_nodes(
      httr::content(response),
      "div.result-container"
    )
    translation <- rvest::html_text(result_nodes)
    if (length(translation) == 0L) {
      stop("No translation found in the Google Translate response.",
           call. = FALSE)
    }

    # NOTE(review): the scraped text is URL-decoded as before; confirm this is
    # needed, since a translation containing literal percent-escapes would be
    # altered by it.
    translation <- urltools::url_decode(translation)
    gsub("\n", "", paste(translation, collapse = " "), fixed = TRUE)
  }

  text_chunks <- split_text(text, chunk_size)

  # vapply guarantees a character vector with one entry per chunk.
  translations <- vapply(
    text_chunks,
    translate_chunk,
    character(1),
    USE.NAMES = FALSE
  )

  # Combine translated chunks
  paste(translations, collapse = " ")
}
2 changes: 1 addition & 1 deletion R/wmcloud_translate.R
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ wmcloud_translate <- function(content,
valid_formats <- c("json", "markdown", "text", "webpage")

# List of valid models
valid_models <- c("nllb200-600M") # Add more models here
valid_models <- c("nllb200-600M", "nllb-wikipedia", "opusmt-en-bi", "opusmt-en-bcl", "opusmt-en-to", "opusmt-en-chr", "opusmt-en-guw", "opusmt-en-srn", "opusmt-en-ty", "opusmt-en-ve", "opusmt-sv-fi", "softcatala", "indictrans2-indic-en", "indictrans2-en-indic", "indictrans2-indic-indic", "madlad-400")

# Check if format and model are valid
if (!format %in% valid_formats) {
Expand Down
1 change: 1 addition & 0 deletions _pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ reference:
- google_is_valid_language_code
- google_supported_languages
- google_translate
- google_translate_long_text
- google_transliterate
- language_detect
- translate_file
Expand Down
38 changes: 38 additions & 0 deletions man/google_translate_long_text.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 0 additions & 17 deletions tests/testthat/test-google_translate.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,24 +25,7 @@ test_that("google_translate returns correct translation for unvectorized input",
expect_equal(translation, expected_translation)
})

# Unit test for special characters (accents, decimal comma) from French to
# English. The description names the language pair so that a failure can be
# told apart from the other special-character tests in this file.
test_that("google_translate handles special characters from French to English", {
  # The expected value depends on the live Google Translate service, so the
  # test must not run on CRAN.
  skip_on_cran()

  text_to_translate <- "La Saône prend sa source à Vioménil dans les pré-Vosges à 405 m d'altitude. La rivière conflue avec le Rhône 473,3 km plus loin."
  translation <- google_translate(text_to_translate, target_language = "en")

  expected_translation <- "The Saône has its source at Vioménil in the pre-Vosges at an altitude of 405 m. The river confluences with the Rhône 473.3 km further."
  expect_equal(translation, expected_translation)
})


# Unit test for special characters (right-to-left script with diacritics,
# parentheses and an ellipsis) from Arabic to English. The description names
# the language pair so that a failure can be told apart from the other
# special-character tests in this file.
test_that("google_translate handles special characters from Arabic to English", {
  # The expected value depends on the live Google Translate service, so the
  # test must not run on CRAN.
  skip_on_cran()

  text_to_translate <- "يتدفقُ النيل عبر الصحراء السودانية إلى مصر باتجاه الشمال ويمر في مدينةُ القاهرة الواقعة على دلتا النهر الكبيرة (دلتا النيل)، ثم يعبر النهر مدينتي دمياط ورشيد ويصب ..."
  translation <- google_translate(text_to_translate, target_language = "en")

  expected_translation <- "The Nile flows through the Sudanese desert to Egypt towards the north and passes through the city of Cairo, located on the large river delta (Nile Delta), then the river crosses the cities of Damietta and Rosetta and flows..."
  expect_equal(translation, expected_translation)
})

# Unit tests for checking supported languages in Google Translate
test_that("google_is_valid_language_code: valid codes", {
Expand Down

0 comments on commit ee85fd1

Please sign in to comment.