From c8d7bfd35cb2639fe7a762e0746db8df212355ac Mon Sep 17 00:00:00 2001 From: jogrue Date: Sun, 20 Dec 2020 00:40:38 +0100 Subject: [PATCH] Release accompanying the publication of the paper in New Media & Society. --- .Rbuildignore | 3 + .gitignore | 4 + DESCRIPTION | 24 + LICENSE.md | 43 ++ NAMESPACE | 11 + R/apply-multidict.R | 145 ++++++ R/make-compounds.R | 222 +++++++++ R/misc-util-functions.R | 219 +++++++++ R/pattern-stats.R | 121 +++++ R/run-multidict.R | 474 +++++++++++++++++++ R/run-non-multidict.R | 422 +++++++++++++++++ README.md | 45 +- man/check_pattern_performance.Rd | 33 ++ man/dot-create_single_compound.Rd | 28 ++ man/dot-make_compounds_for_single_pattern.Rd | 39 ++ man/every_term_a_dict.Rd | 19 + man/get_pattern_stats.Rd | 56 +++ man/make_compounds.Rd | 65 +++ man/run_kwic_stepwise.Rd | 41 ++ man/run_multidict.Rd | 128 +++++ man/run_non_multidict.Rd | 123 +++++ man/run_tokens_compound_stewpise.Rd | 45 ++ multidictR.Rproj | 22 + 23 files changed, 2331 insertions(+), 1 deletion(-) create mode 100644 .Rbuildignore create mode 100644 .gitignore create mode 100644 DESCRIPTION create mode 100644 LICENSE.md create mode 100644 NAMESPACE create mode 100644 R/apply-multidict.R create mode 100644 R/make-compounds.R create mode 100644 R/misc-util-functions.R create mode 100644 R/pattern-stats.R create mode 100644 R/run-multidict.R create mode 100644 R/run-non-multidict.R create mode 100644 man/check_pattern_performance.Rd create mode 100644 man/dot-create_single_compound.Rd create mode 100644 man/dot-make_compounds_for_single_pattern.Rd create mode 100644 man/every_term_a_dict.Rd create mode 100644 man/get_pattern_stats.Rd create mode 100644 man/make_compounds.Rd create mode 100644 man/run_kwic_stepwise.Rd create mode 100644 man/run_multidict.Rd create mode 100644 man/run_non_multidict.Rd create mode 100644 man/run_tokens_compound_stewpise.Rd create mode 100644 multidictR.Rproj diff --git a/.Rbuildignore b/.Rbuildignore new file mode 100644 index 0000000..b4b19e1 --- /dev/null +++ b/.Rbuildignore @@ -0,0 +1,3 @@ +^multidictR\.Rproj$ +^\.Rproj\.user$ +^LICENSE\.md$ diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5b6a065 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.Rproj.user +.Rhistory +.RData +.Ruserdata diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..1170646 --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,24 @@ +Package: multidictR +Title: Deals with dictionaries containing multi word terms +Version: 0.9.0 +Authors@R: + person(given = "Johann", + family = "Gründl", + role = c("aut", "cre"), + email = "mail@johanngruendl.at", + comment = c(ORCID = "0000-0001-6497-8188")) +Description: Quanteda does not allow for more complex multi-word dictionaries + that for example include wildcards spanning multiple words. This + package provides some functions to deal with such more complex + dictionaries. It is used with the populism dictionary in popdictR. +License: CC0 +Imports: + stringr, + regexhelpeR, + quanteda, + dplyr, + rlang +Encoding: UTF-8 +LazyData: true +Roxygen: list(markdown = TRUE) +RoxygenNote: 7.1.1 diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..139c68e --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,43 @@ +## creative commons + +# CC0 1.0 Universal + +CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN "AS-IS" BASIS. 
CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED HEREUNDER. + +### Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer exclusive Copyright and Related Rights (defined below) upon the creator and subsequent owner(s) (each and all, an "owner") of an original work of authorship and/or a database (each, a "Work"). + +Certain owners wish to permanently relinquish those rights to a Work for the purpose of contributing to a commons of creative, cultural and scientific works ("Commons") that the public can reliably and without fear of later claims of infringement build upon, modify, incorporate in other works, reuse and redistribute as freely as possible in any form whatsoever and for any purposes, including without limitation commercial purposes. These owners may contribute to the Commons to promote the ideal of a free culture and the further production of creative, cultural and scientific works, or to gain reputation or greater distribution for their Work in part through the use and efforts of others. + +For these and/or other purposes and motivations, and without any expectation of additional consideration or compensation, the person associating CC0 with a Work (the "Affirmer"), to the extent that he or she is an owner of Copyright and Related Rights in the Work, voluntarily elects to apply CC0 to the Work and publicly distribute the Work under its terms, with knowledge of his or her Copyright and Related Rights in the Work and the meaning and intended legal effect of CC0 on those rights. + +1. __Copyright and Related Rights.__ A Work made available under CC0 may be protected by copyright and related or neighboring rights ("Copyright and Related Rights"). Copyright and Related Rights include, but are not limited to, the following: + + i. the right to reproduce, adapt, distribute, perform, display, communicate, and translate a Work; + + ii. moral rights retained by the original author(s) and/or performer(s); + + iii. publicity and privacy rights pertaining to a person's image or likeness depicted in a Work; + + iv. rights protecting against unfair competition in regards to a Work, subject to the limitations in paragraph 4(a), below; + + v. rights protecting the extraction, dissemination, use and reuse of data in a Work; + + vi. database rights (such as those arising under Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, and under any national implementation thereof, including any amended or successor version of such directive); and + + vii. other similar, equivalent or corresponding rights throughout the world based on applicable law or treaty, and any national implementations thereof. + +2. 
__Waiver.__ To the greatest extent permitted by, but not in contravention of, applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and unconditionally waives, abandons, and surrenders all of Affirmer's Copyright and Related Rights and associated claims and causes of action, whether now known or unknown (including existing as well as future claims and causes of action), in the Work (i) in all territories worldwide, (ii) for the maximum duration provided by applicable law or treaty (including future time extensions), (iii) in any current or future medium and for any number of copies, and (iv) for any purpose whatsoever, including without limitation commercial, advertising or promotional purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each member of the public at large and to the detriment of Affirmer's heirs and successors, fully intending that such Waiver shall not be subject to revocation, rescission, cancellation, termination, or any other legal or equitable action to disrupt the quiet enjoyment of the Work by the public as contemplated by Affirmer's express Statement of Purpose. + +3. __Public License Fallback.__ Should any part of the Waiver for any reason be judged legally invalid or ineffective under applicable law, then the Waiver shall be preserved to the maximum extent permitted taking into account Affirmer's express Statement of Purpose. In addition, to the extent the Waiver is so judged Affirmer hereby grants to each affected person a royalty-free, non transferable, non sublicensable, non exclusive, irrevocable and unconditional license to exercise Affirmer's Copyright and Related Rights in the Work (i) in all territories worldwide, (ii) for the maximum duration provided by applicable law or treaty (including future time extensions), (iii) in any current or future medium and for any number of copies, and (iv) for any purpose whatsoever, including without limitation commercial, advertising or promotional purposes (the "License"). The License shall be deemed effective as of the date CC0 was applied by Affirmer to the Work. Should any part of the License for any reason be judged legally invalid or ineffective under applicable law, such partial invalidity or ineffectiveness shall not invalidate the remainder of the License, and in such case Affirmer hereby affirms that he or she will not (i) exercise any of his or her remaining Copyright and Related Rights in the Work or (ii) assert any associated claims and causes of action with respect to the Work, in either case contrary to Affirmer's express Statement of Purpose. + +4. __Limitations and Disclaimers.__ + + a. No trademark or patent rights held by Affirmer are waived, abandoned, surrendered, licensed or otherwise affected by this document. + + b. Affirmer offers the Work as-is and makes no representations or warranties of any kind concerning the Work, express, implied, statutory or otherwise, including without limitation warranties of title, merchantability, fitness for a particular purpose, non infringement, or the absence of latent or other defects, accuracy, or the present or absence of errors, whether or not discoverable, all to the greatest extent permissible under applicable law. + + c. Affirmer disclaims responsibility for clearing rights of other persons that may apply to the Work or any use thereof, including without limitation any person's Copyright and Related Rights in the Work. 
Further, Affirmer disclaims responsibility for obtaining any necessary consents, permissions or other rights required for any use of the Work. + + d. Affirmer understands and acknowledges that Creative Commons is not a party to this document and has no duty or obligation with respect to this CC0 or use of the Work. diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..c85474b --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,11 @@ +# Generated by roxygen2: do not edit by hand + +export(check_pattern_performance) +export(every_term_a_dict) +export(get_pattern_stats) +export(make_compounds) +export(run_kwic_stepwise) +export(run_multidict) +export(run_non_multidict) +export(run_tokens_compound_stewpise) +importFrom(rlang,.data) diff --git a/R/apply-multidict.R b/R/apply-multidict.R new file mode 100644 index 0000000..3f9c2ea --- /dev/null +++ b/R/apply-multidict.R @@ -0,0 +1,145 @@ +# ------------------------------------------------------------------------------ +# +# Script name: apply_multidict.R +# +# Purpose of script: Apply a dictionary containing complex multi-word patterns +# +# Author: Johann Gründl +# Email: mail@johanngruendl.at +# +# Date created: 2020-06-14 +# Date updated: 2020-06-14 +# +# ****************************************************************************** + +# apply_multidict <- function( +# text, +# dictionary, +# glob = FALSE, +# ignore_case = TRUE +# ) { +# +# quanteda::dfm_lookup( +# x = dfm, +# dictionary = dict, +# valuetype = valuetype, +# ) +# } +# +# # Replace dictionary terms +# smcorp_et <- readRDS("output/social-media-corpus-sentences.rds") +# # Possible terms +# patterns <- popdictR::gruendl_dictionary_complete %>% +# # filter(regex_sort <= 2) %>% +# pull(Word) +# +# # Prepare dictionary terms +# set.seed(20050617) +# all_terms <- tibble( +# # pattern = popdictR::gruendl_terms, +# pattern = patterns, +# replacement = as.character(NA)) %>% +# mutate(pattern = optimize_regex(pattern)) %>% +# mutate(pattern = switch_regex_greedy_lazy(pattern)) +# all_terms$replacement <- +# str_c( +# round( +# runif( +# n = nrow(all_terms), +# min = 10000, +# max = 99999 +# ), +# 0 +# ), +# replicate( +# n = nrow(all_terms), +# expr = str_c(sample(letters, 10, replace = TRUE), collapse = ""), +# ) +# ) +# saveRDS(all_terms, "output/all-terms-randomized-replacements.rds") +# +# # Replace dictionary hits with replacement strings +# for (i in 1:nrow(all_terms)) { +# loop_start <- Sys.time() +# texts(smcorp_et) <- str_replace_all(texts(smcorp_et), +# pattern = regex(pull(all_terms, pattern)[i], +# ignore_case = TRUE), +# replacement = pull(all_terms, +# replacement)[i]) +# # print(str_c("Loop ran for pattern ", pull(all_terms, pattern)[i], ".")) +# # print(Sys.time() - loop_start) +# } +# # Return corpus to original shape and save +# saveRDS(smcorp_et, +# "output/social-media-corpus-sentences-every-term-replaced.rds") +# smcorp_et <- readRDS("output/social-media-corpus-sentences-every-term-replaced.rds") +# all_terms <- readRDS("output/all-terms-randomized-replacements.rds") +# all_terms <- all_terms %>% +# mutate(replacement = str_c("*", replacement, "*")) +# replaced_dict <- sapply(pull(all_terms, replacement), list) +# names(replaced_dict) <- pull(all_terms, pattern) +# replaced_dict <- dictionary(replaced_dict) +# +# smdfm_et <- dfm( +# smcorp_et, +# tolower = TRUE, +# stem = FALSE, +# remove_numbers = TRUE, +# remove_punct = TRUE, +# remove_symbols = TRUE, +# remove_separators = FALSE, +# remove_twitter = TRUE, +# remove_hyphens = FALSE, +# remove_url = TRUE, +# 
verbose = TRUE
+#   )
+# # Reduce memory need by subsetting
+# smdfm_et <- smdfm_et %>%
+#   dfm_subset(date >= date("2014-01-01") &
+#                date < date("2020-02-29"))
+# # Lookup terms
+# results <- dfm_lookup(smdfm_et,
+#                       replaced_dict,
+#                       case_insensitive = TRUE,
+#                       valuetype = "glob")
+# rm(all_terms, smdfm_et, replaced_dict)
+# # Reduce memory need by returning to document level
+# results <- ceiling(results/topfeatures(results)[1])
+# docvars(results, "tmp_docid") <- docvars(results, "_docid")
+# results <- dfm_group(results, "tmp_docid")
+# # (results/docvars(results, "doc_sentences"))[sample(1:1000, 10), "\\bwir\\b"]
+# # # Prepare for combining with corpus again.
+# # results <- results %>%
+# #   convert(to = "data.frame") %>%
+# #   #select(document) %>%
+# #   rename_at(vars(everything()), ~paste0("term_", .))
+# # results <- smcorp_et$documents %>%
+# #   bind_cols(results)
+# # rm(smcorp_et)
+# # # Select variables and save
+# # results <- results %>%
+# #   filter(date >= date("2014-01-01") &
+# #            date < date("2020-02-29")) %>%
+# #   select(id = `_docid`, date, actor_country, party,
+# #          popu_list,
+# #          ends_with("_std"),
+# #          starts_with("term_"),
+# #          ) %>%
+# #   group_by(id) %>%
+# #   mutate(
+# #     date = unique(date),
+# #     actor_country = unique(actor_country),
+# #     party = unique(party),
+# #     sentences = n(),
+# #     popu_list = unique(popu_list)) %>%
+# #   mutate_at(
+# #     .vars = vars(ends_with("_std")),
+# #     .funs = unique
+# #   ) %>%
+# #   mutate_at(
+# #     .vars = vars(starts_with("term_")),
+# #     .funs = sum
+# #   ) %>%
+# #   ungroup %>%
+# #   distinct
+# saveRDS(results, "output/results-every-term.rds")
diff --git a/R/make-compounds.R b/R/make-compounds.R
new file mode 100644
index 0000000..682a5b9
--- /dev/null
+++ b/R/make-compounds.R
@@ -0,0 +1,222 @@
+# ------------------------------------------------------------------------------
+#
+# Script name: make-compounds.R
+#
+# Purpose of script: Functions to create compounds in text.
+#
+# Author: Johann Gründl
+# Email: mail@johanngruendl.at
+#
+# Date created: 2020-06-09
+# Date updated: 2020-06-09
+#
+# ******************************************************************************
+
+
+#' Creates a single compound
+#'
+#' @description This is an internal function used in the
+#' .make_compounds_for_single_pattern function. It is basically a wrapper around
+#' stringr::str_replace_all. By default, it replaces all space characters (" ")
+#' with the underscore character ("_").
+#'
+#' @param compound A simple text (= character vector).
+#' @param wordsep The word separator to look for, usually spaces. Defaults to
+#' " ".
+#' @param concatenator The replacement for wordsep characters (e.g., " ").
+#' Defaults to "_".
+#'
+#' @return The provided text where all occurrences of the wordsep character
+#' (by default, " ") were replaced by the concatenator character (by default,
+#' "_").
+.create_single_compound <- function(compound, wordsep = " ", concatenator = "_") {
+  return(stringr::str_replace_all(compound, wordsep, concatenator))
+}
+
+
+#' Replace matches for a single pattern with compounds
+#'
+#' @description This is an internal function used in the make_compounds
+#' function.
+#'
+#' @param text Text should be provided as a character vector.
+#' @param pattern A single pattern with possible regular expressions.
+#' @param wordsep The word separator to look for, usually spaces. Defaults to
+#' " ".
+#' @param concatenator The replacement for wordsep characters (e.g., " ").
+#' Defaults to "_".
+#' @param lazy Should regular expressions be transformed to lazy versions? +#' Defaults to TRUE to return the shortest possible compounds. +#' @param ignore_case Should the case be ignored when searching for pattern +#' matches? Defaults to TRUE. +#' +#' @return The provided text where all occurrences of the pattern have been +#' converted to multi-word compounds. +.make_compounds_for_single_pattern <- function( + text, + pattern, + wordsep = " ", + concatenator = "_", + lazy = TRUE, + ignore_case = TRUE +) { + if (lazy) { + pattern <- regexhelpeR::make_all_regex_lazy(pattern) + } + matchindices <- stringr::str_locate_all( + text, + stringr::regex(pattern, ignore_case = ignore_case) + ) + if (length(matchindices) == 1) { + matchindices <- matchindices[[1]] + } else { + stop("Not a single text provided for make_compound_text.") + } + if (nrow(matchindices) < 1) {return(text)} + for (i in 1:nrow(matchindices)) { + ind <- matchindices[i, ] + stringr::str_sub(text, ind[1], ind[2]) <- + .create_single_compound(stringr::str_sub(text, ind[1], ind[2]), + wordsep, concatenator) + } + return(text) +} + + +#' Create compounds for complex patterns +#' +#' @description For a list of (multi-word) patterns compounds are created. The +#' function works with quanteda corpus objects and regular texts (character +#' vectors; in general text that can be transformed to a quanteda corpus object +#' by quanteda's corpus() function should work). It expects regular expression +#' patterns but can work with glob expressions as well (if parameter glob is +#' set, internally the patterns are transformed to regular expressions). +#' quanteda has built-in functionality for this; however, it does not +#' allow for patterns which include wildcards that stand for multiple words. +#' For example, something like "the * people" could capture "the honest +#' people", "the hard-working, common people", or "the singer sings songs about +#' people". Such patterns would not work as expected in quanteda. With regular +#' expressions, a lot more sophisticated patterns become possible. +#' +#' @param text A quanteda corpus object or something that can be transformed to +#' a corpus by quanteda::corpus(), for example, a simple character vector +#' @param patterns A character vector where each element is a pattern or a +#' quanteda dictionary object. Patterns are expected to be regular expressions +#' (if glob parameter is not set) or only include glob-style wildcards (if glob +#' parameter is set to TRUE). +#' @param wordsep The word seperator, usually simply a space. Defaults to " ". +#' @param concatenator The character for creating multi-word compounds, defaults +#' to "_". +#' @param at_level At which level should patterns be applied. Possible values +#' are "documents", "sentences", or "paragraphs". Defaults to "sentences". +#' @param glob Do the provided patterns use glob-style wildcards instead of +#' regular expressions? Defaults to FALSE. +#' @param lazy Should regular expressions be transformed to lazy versions? +#' Defaults to TRUE to return the shortest possible compounds. +#' @param ignore_case Should the case be ignored when searching for pattern +#' matches? Defaults to TRUE. +#' @param optimize_regex Should the regular expressions be optimized to allow +#' for quicker lookups (see regexhelpeR package)? Defaults to TRUE. +#' +#' @return The corpus or text object where matched multi-word terms are now +#' replaced by multi-word compounds. 
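+#'
+#' @examples
+#' \dontrun{
+#' # Minimal illustrative sketch: the texts and the pattern below are
+#' # invented for this example and are not shipped with the package.
+#' txt <- c(d1 = "Politicians ignore the common people.",
+#'          d2 = "The honest people deserve better.")
+#' # "the (.*?) people" can span several tokens; every match is turned
+#' # into a single underscore-joined compound:
+#' make_compounds(txt, patterns = "the (.*?) people")
+#' # d1: "Politicians ignore the_common_people."
+#' # d2: "The_honest_people deserve better."
+#' }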
+#'
+#' @export
+make_compounds <- function(
+  text,
+  patterns,
+  wordsep = " ",
+  concatenator = "_",
+  at_level = "sentences",
+  glob = FALSE,
+  lazy = TRUE,
+  ignore_case = TRUE,
+  optimize_regex = TRUE
+) {
+  is_quanteda_corpus <- quanteda::is.corpus(text)
+  if (!is_quanteda_corpus) {
+    text <- quanteda::corpus(text)
+  }
+  # Prepare patterns
+  if (quanteda::is.dictionary(patterns)) {
+    patterns <- unlist(patterns)
+  }
+  if (glob) {
+    message("Provided glob patterns are replaced with regex patterns.")
+    patterns <- regexhelpeR::glob_to_regex(patterns)
+  }
+  if (optimize_regex) {
+    patterns <- regexhelpeR::optimize_regex_patterns(patterns)
+  }
+  if (length(patterns) < 1) {
+    warning("No pattern was provided.")
+    return(text)
+  }
+  patterns <- patterns[order(stringr::str_length(patterns), patterns)]
+  patterns <- patterns[stringr::str_detect(patterns, wordsep)]
+  patterns <- unique(patterns)
+  if (length(patterns) < 1) {
+    warning("No multi-word pattern was provided.")
+    return(text)
+  }
+
+  # Reshape corpus to apply patterns at specified level
+  old_level <- quanteda::meta(text, type = "all")$unit
+  if (is.null(old_level)) {
+    # Older corpus objects do not have the unit meta field -> update corpus
+    warning(paste0('Corpus did not include the "unit" meta field. ',
+                   'Possibly an old corpus generated with quanteda ',
+                   'version < 2.0. The corpus was recast to a new corpus ',
+                   'object.'))
+    text <- quanteda::corpus(text)
+    old_level <- quanteda::meta(text, type = "all")$unit
+  }
+  if (at_level != old_level) {
+    message(paste0("Corpus is reshaped to level ", at_level))
+    text <- quanteda::corpus_reshape(text, to = at_level)
+  }
+
+  # Make compounds for the whole corpus for every pattern
+  patterntimes <- rep(NA_real_, length(patterns))
+  names(patterntimes) <- patterns
+  for (i in 1:length(patterns)) {
+    time <- Sys.time()
+    message(paste0("Run compound generation for whole text with pattern: ",
+                   i, "/", length(patterns)))
+    quanteda::texts(text) <- sapply(
+      quanteda::texts(text),
+      .make_compounds_for_single_pattern,
+      pattern = patterns[i],
+      wordsep = wordsep,
+      concatenator = concatenator,
+      lazy = lazy,
+      ignore_case = ignore_case
+    )
+    time <- as.numeric(Sys.time() - time, units = "secs")
+    patterntimes[i] <- time
+    message(
+      paste0(
+        patterns[i], " took ", round(time, 1), " seconds to complete. ",
+        "The average time for a pattern is now: ",
+        round(mean(patterntimes, na.rm = TRUE), 1),
+        " seconds. The remaining ",
+        length(patterns) - i,
+        " patterns should complete in ",
+        round(
+          mean(patterntimes, na.rm = TRUE)*(length(patterns) - i),
+          1
+        ),
+        " seconds."
+      )
+    )
+  }
+  # print(patterntimes)
+  # Return corpus to original shape
+  if (old_level != at_level) {
+    message(paste0("Corpus is returned to level ", old_level))
+    text <- quanteda::corpus_reshape(text, to = old_level)
+  }
+  # If plain texts instead of a quanteda corpus were provided, return a
+  # character vector again instead of a corpus object
+  if (!is_quanteda_corpus) {
+    text <- quanteda::texts(text)
+  }
+  # Return finished corpus (or character vector)
+  return(text)
+}
diff --git a/R/misc-util-functions.R b/R/misc-util-functions.R
new file mode 100644
index 0000000..26346e9
--- /dev/null
+++ b/R/misc-util-functions.R
@@ -0,0 +1,219 @@
+# ------------------------------------------------------------------------------
+#
+# Script name: misc-util-functions.R
+#
+# Purpose of script: Includes some functions to work with dictionaries.
+#
+# Author: Johann Gründl
+# Email: mail@johanngruendl.at
+#
+# Date created: 2020-06-09
+# Date updated: 2020-06-09
+#
+# ******************************************************************************
+
+
+#' Make every term a dictionary
+#'
+#' @description This function transforms a dictionary (or a list of terms) to
+#' a quanteda dictionary object where each term is its own dictionary.
+#'
+#' @param dict A quanteda dictionary or a simple list of dictionary terms.
+#'
+#' @return A quanteda dictionary object where each term is its own dictionary.
+#' Dictionary names are simply the provided terms.
+#'
+#' @export
+#'
+every_term_a_dict <- function(dict) {
+  dict <- unlist(dict)
+  names(dict) <- dict
+  dict <- sapply(dict, list)
+  return(quanteda::dictionary(dict))
+}
+
+
+#' Stepwise locate keywords-in-context
+#'
+#' @description With large amounts of text and complex (regex) patterns you
+#' might run into problems with memory when running quanteda's kwic function.
+#' Thus, this function takes a stepwise approach: the tokens are split into
+#' chunks (of the size specified by step), and kwic is then run on these
+#' smaller chunks.
+#'
+#' @param tokens A quanteda tokens object.
+#' @param pattern A character vector, list of character vectors, dictionary, or
+#' collocations object.
+#' @param window The number of context words to be displayed around the keyword.
+#' @param valuetype The type of pattern matching: "glob" for "glob"-style
+#' wildcard expressions; "regex" for regular expressions; or "fixed" for exact
+#' matching. Defaults to "regex".
+#' @param step How many tokens should be processed at once? Defaults to 10000.
+#' @param ... Additional arguments passed to the kwic function.
+#'
+#' @return A data.frame of class "kwic".
+#'
+#' @export
+run_kwic_stepwise <- function(
+  tokens,
+  pattern,
+  window = 25,
+  valuetype = "regex",
+  step = 10000,
+  ...
+) {
+
+  if (!quanteda::is.tokens(tokens)) {
+    stop(paste0("Provided tokens object does not appear to be a quanteda",
+                " tokens object."))
+  }
+
+  # kwics are created in a loop
+  max <- length(tokens)
+  # step <- 10000
+  first <- 1 - step
+  kwic <- NULL
+  repeat {
+    first <- first + step
+    last <- first + step - 1
+    if (last > max) {
+      last <- max
+    }
+    tmp <- tokens[first:last]
+    print(paste0("Creating kwic for elements ", first, " to ", last, "."))
+    tmp <- quanteda::kwic(tmp,
+                          pattern = pattern,
+                          window = window,
+                          valuetype = valuetype,
+                          ...)
+    if (is.null(kwic)) {
+      kwic <- tmp
+    } else {
+      kwic <- dplyr::bind_rows(kwic, tmp)
+    }
+    if (last == max) {
+      break
+    }
+  }
+
+  return(kwic)
+}
+
+
+#' Get the number of possible pattern combinations
+#'
+#' @description Wildcards in patterns can potentially lead to a huge number
+#' of possible combinations in tokens. This in turn leads to problems with
+#' memory usage in some quanteda functions. With this function you can check
+#' which patterns in a dictionary are the most problematic (i.e., produce the
+#' highest number of possible combinations). This was suggested by Kohei
+#' Watanabe here:
+#'
+#'
+#' @param text A quanteda corpus object, text that can be transformed to a
+#' corpus object, or a quanteda tokens object.
+#' @param pattern A quanteda pattern, i.e., a character vector, list of
+#' character vectors, dictionary, or collocations object.
+#' @param glob Are patterns glob-style? Defaults to FALSE (for regex patterns).
+#' @param ... Additional parameters passed to quanteda:::pattern2list.
+#' +#' @return A table containing the patterns and how many possible combinations +#' they produce for a given text or tokens object. On top are the patterns that +#' produce the most combinations and, thus, need the most memory. +#' +#' @export +check_pattern_performance <- function(text, pattern, glob = FALSE, ...) { + if (quanteda::is.corpus(text)) { + text <- quanteda::tokens(text) + } + if (!quanteda::is.tokens(text)) { + text <- quanteda::tokens(quanteda::corpus(text)) + } + if (!glob) { + valuetype = "regex" + } else { + valuetype = "glob" + } + return( + sort( + table( + names( + quanteda:::pattern2list( + x = pattern, + types = quanteda::types(text), + valuetype = valuetype, + case_insensitive = TRUE, + ... + ) + ) + ), + decreasing = TRUE + ) + ) +} + + +#' Stepwise generate compounds from tokens +#' +#' @description With large amounts of text and complex (regex) patterns you +#' might run into problems with memory when running quanteda's tokens_compound +#' function. Thus this function takes a stepwise approach. The tokens are split +#' into certain chunks (specified by step) and then tokens_compound is run on +#' these smaller chunks. See also +#' and the +#' check_pattern_performance function. +#' +#' @param tokens A quanteda tokens object. +#' @param pattern A character vector, list of character vectors, dictionary, or +#' collocations object. +#' @param concatenator The concatenation character that will connect the words +#' making up the multi-word sequences. Defaults to "_". +#' @param valuetype The type of pattern matching: "glob" for "glob"-style +#' wildcard expressions; "regex" for regular expressions; or "fixed" for exact +#' matching. Defaults to "regex". +#' @param step How many tokens should be processed at once? Defaults to 10000. +#' @param ... Additonal arguments passed to the tokens_compound function. +#' +#' @return A tokens object in which the token sequences matching pattern have +#' been replaced by new compounded "tokens" joined by the concatenator. +#' +#' @export +run_tokens_compound_stewpise <- function( + tokens, + pattern, + step = 10000, + concatenator = "_", + valuetype = "regex", + ... +) { + + # Compound tokens are created in a loop + max <- length(tokens) + first <- 1 - step + comp_toks <- NULL + repeat { + first <- first + step + last <- first + step - 1 + if (last > max) { + last <- max + } + tmp <- tokens[first:last] + print(paste0("Creating compounds for elements ", first, " to ", last, ".")) + tmp <- quanteda::tokens_compound( + x = tmp, + pattern = pattern, + concatenator = concatenator, + valuetype = valuetype, + ... + ) + if (is.null(comp_toks)) { + comp_toks <- tmp + } else { + comp_toks <- c(comp_toks, tmp) + } + if (last == max) { + break + } + } + return(comp_toks) +} diff --git a/R/pattern-stats.R b/R/pattern-stats.R new file mode 100644 index 0000000..778cfe0 --- /dev/null +++ b/R/pattern-stats.R @@ -0,0 +1,121 @@ +# ------------------------------------------------------------------------------ +# +# Script name: pattern-stats.R +# +# Purpose of script: Functions to return stats such as the lookup time and how +# often a pattern occurs. +# +# Author: Johann Gründl +# Email: mail@johanngruendl.at +# +# Date created: 2020-06-12 +# Date updated: 2020-06-12 +# +# ****************************************************************************** + + +#' Gather statistics for dictionary patterns +#' +#' @description For a list of (possibly multi-word) patterns statistics are +#' generated. 
Specifically, the number of matches and the time needed to run each
+#' pattern on the whole corpus are recorded. By default, patterns are applied
+#' at the sentence level. Thus, the match_count is the total count of sentences
+#' where the pattern was detected.
+#'
+#' The function works with quanteda corpus objects and regular texts (character
+#' vectors; in general, text that can be transformed to a quanteda corpus object
+#' by quanteda's corpus() function should work). It expects regular expression
+#' patterns but can work with glob expressions as well (if the glob parameter is
+#' set, the patterns are internally transformed to regular expressions).
+#'
+#' @param text A quanteda corpus object or something that can be transformed to
+#' a corpus by quanteda::corpus(), for example, a simple character vector.
+#' @param patterns A character vector where each element is a pattern or a
+#' quanteda dictionary object. Patterns are expected to be regular expressions
+#' (if the glob parameter is not set) or to only include glob-style wildcards
+#' (if the glob parameter is set to TRUE).
+#' @param at_level At which level should patterns be applied? Possible values
+#' are "documents", "sentences", or "paragraphs". Defaults to "sentences".
+#' @param glob Do the provided patterns use glob-style wildcards instead of
+#' regular expressions? Defaults to FALSE.
+#' @param ignore_case Should the case be ignored when searching for pattern
+#' matches? Defaults to TRUE.
+#' @param optimize_regex Should the regular expressions be optimized to allow
+#' for quicker lookups (see the regexhelpeR package)? Defaults to TRUE.
+#'
+#' @return A data frame containing a row for each pattern. It includes the
+#' provided pattern (`original_pattern`), the version that was actually tested
+#' after pattern transformations (e.g., the switch from glob to regex, or
+#' optimizations) were applied (`applied_pattern`), the `match_count`, and the
+#' `lookup_time` in seconds.
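+#'
+#' @examples
+#' \dontrun{
+#' # Minimal illustrative sketch; the texts and patterns are invented here.
+#' txt <- c("The political elite ignores us.",
+#'          "We, the people, demand honest politics.")
+#' stats <- get_pattern_stats(txt, patterns = c("\\bpeople\\b", "elite.*"))
+#' # One row per pattern: original_pattern, applied_pattern,
+#' # match_count (sentences with a hit), and lookup_time (seconds)
+#' stats[order(-stats$match_count), ]
+#' }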
+#' +#' @export +get_pattern_stats <- function( + text, + patterns, + at_level = "sentences", + glob = FALSE, + ignore_case = TRUE, + optimize_regex = TRUE +) { + is_quanteda_corpus <- quanteda::is.corpus(text) + if (!is_quanteda_corpus) { + message("Quanteda corpus is created") + text <- quanteda::corpus(text) + } + # Prepare patterns + if (quanteda::is.dictionary(patterns)) { + patterns <- unlist(patterns) + } + patterns_orig <- patterns + if (glob) { + message("Provided glob patterns are replaced with regex patterns.") + patterns <- regexhelpeR::glob_to_regex(patterns) + } + if (optimize_regex) { + message("Patterns are replaced with optimized regex patterns.") + patterns <- regexhelpeR::optimize_regex_patterns(patterns) + } + if (length(patterns) < 1) { + warning("No pattern was provided.") + return(text) + } + message(paste0('Reshape corpus to ', + '"', at_level, '"', + ' level')) + text <- quanteda::corpus_reshape(text, to = at_level) + # Create empty data frame + patternstats <- data.frame(original_pattern = patterns_orig, + applied_pattern = patterns, + match_count = NA_real_, + lookup_time = NA_real_, + stringsAsFactors = FALSE) + # Loop through all patterns + for (i in 1:nrow(patternstats)) { + pattern <- patternstats[i, "applied_pattern"] + time <- Sys.time() + patternstats[i, "match_count"] <- + sum( + as.numeric( + stringr::str_detect( + quanteda::texts(text), + stringr::regex(pattern, ignore_case = ignore_case) + ) + ) + ) + time <- as.numeric(Sys.time() - time, units = "secs") + patternstats[i, "lookup_time"] <- time + if (i == 1) { + print(paste0('First pattern was checked in ', + round(time, 1), + ' seconds.')) + } + if (i %in% ceiling(nrow(patternstats)/100*seq(10, 90, 10))) { + print(paste0( + floor(i/nrow(patternstats)*100), + "% of the patterns have been checked" + )) + } + } + return(patternstats) +} diff --git a/R/run-multidict.R b/R/run-multidict.R new file mode 100644 index 0000000..c2dc95d --- /dev/null +++ b/R/run-multidict.R @@ -0,0 +1,474 @@ +# ------------------------------------------------------------------------------ +# +# Script name: run-multidict.R +# +# Purpose of script: Function to run a multi term dictionary. +# +# Author: Johann Gründl +# Email: mail@johanngruendl.at +# +# Date created: 2020-11-20 +# Date updated: 2020-11-20 +# +# ****************************************************************************** + + +#' Run a dictionary with patterns that might match more than one term (or token) +#' +#' Quanteda's dictionary function does not allow for patterns with wildcards to +#' match more than one token. For example, a regular expression such as +#' "the (.*) people" would not work as expected if you wanted to match +#' expressions such as "the honest people" and "the good people". This +#' function facilitates the usage of dictionaries including such terms. It +#' could be considered the main function for the multidictR package. +#' Internally, the package uses stringr::str_replace_all to replace pattern +#' matches with a random string before then using quanteda to look this string +#' up in the corpus. +#' +#' @param corpus A quanteda corpus object or something that can be transformed +#' to a corpus by quanteda::corpus(), for example, a simple character vector +#' @param dict A character vector where each element is a pattern or a +#' quanteda dictionary object with one dictionary. +#' @param at_level At which level should patterns be applied. Possible values +#' are "documents", "sentences", or "paragraphs". Defaults to "sentences". 
+#' @param return_value How should the value be returned? Possible values
+#' include "count", "binary", "prop", "count_at_level", or "prop_at_level". You
+#' get the results from the dictionary at the document level. "count" (the
+#' default) gives the simple frequency of dictionary hits in each document.
+#' "count_at_level" gives you the number of sentences or paragraphs in a
+#' document where there was at least one pattern match. Together with the
+#' include_totals parameter, "count" and "count_at_level" give you the most
+#' flexibility to work with the results. "binary" returns 0 or 1, depending on
+#' whether there was at least one pattern match in the document. "prop" is the
+#' proportion of pattern matches relative to the total number of tokens in the
+#' document. "prop_at_level" gives you the proportion of sentences or
+#' paragraphs (in a document) where a pattern match was found.
+#' @param include_totals Should the number of sentences (as "n_sentences") and
+#' the number of tokens (as "n_tokens") per document also be returned? Defaults
+#' to TRUE.
+#' @param return_result_only If TRUE, a data.frame containing the results will
+#' be returned. If FALSE (the default), you will get the provided corpus with
+#' the results attached as new columns.
+#' @param pattern_type The type of pattern included in the dictionary. Defaults
+#' to "regex" for regular expressions. "glob" is also possible for glob-style
+#' wildcards. Internally, glob patterns are transformed to regex patterns.
+#' Other, usually not needed, possible values include "coll" and "fixed". See
+#' the stringr package for details on pattern types.
+#' @param case_insensitive Should the case be ignored when searching for
+#' pattern matches? Defaults to TRUE.
+#' @param regex_optimize Should the regular expressions be optimized by adding
+#' word boundaries and removing open wildcards at word boundaries? This is
+#' intended for using regular expression dictionary patterns the way I use
+#' them in the popdictR package. It then allows for quicker lookups (see
+#' regexhelpeR::optimize_regex_patterns). Defaults to FALSE, so your patterns
+#' are not changed.
+#' @param regex_make_greedy Should regular expressions be transformed to greedy
+#' versions? Defaults to FALSE. Usually not needed. If you switch this to TRUE
+#' while at the same time setting regex_make_lazy to TRUE as well, you will get
+#' inverted patterns (i.e., lazy patterns become greedy and greedy patterns
+#' become lazy).
+#' @param regex_make_lazy Should regular expressions be transformed to lazy
+#' versions? Defaults to FALSE, so your patterns are not changed. However, you
+#' should probably use lazy regex patterns to replace the shortest possible
+#' compounds.
+#' @param dict_name You can set a custom name for your dictionary. This is also
+#' the name of the variable that contains the results in the return value. If
+#' you provided a quanteda dictionary, the name of the first dictionary
+#' included will be used. Otherwise, dict_name defaults to "dict".
+#' @param custom_replacement Internally, this function replaces pattern matches
+#' with a random string (containing 40 random letters and 10 random digits)
+#' before running quanteda's dictionary lookup function on the corpus. The
+#' random string should be unique, and there is usually no need to set a custom
+#' string.
+#' @param tolower Forwarded to quanteda's dfm function; converts all features
+#' to lowercase. Defaults to the value of "case_insensitive".
+#' @param stem Forwarded to quanteda's dfm function. If TRUE, quanteda stems +#' words. Defaults to FALSE. +#' @param remove Forwarded to quanteda's dfm function. A list of stopwords +#' which are removed from the dfm before running the dictionary. +#' @param ... Additional arguments passed on to quanteda's dfm function (and +#' there to the tokens function). Includes things such as "remove_punct", +#' "remove_symbols", "remove_numbers", etc. See quanteda's tokens function for +#' details. +#' +#' @return Returns the results of running the dictionary. If return_result_only +#' is set, you will get a data.frame with only the results. Otherwise, you the +#' results will be bound to the corpus as new columns. If you only provided +#' texts, the only other column will be these texts of course (variable x). If +#' you provided a quanteda corpus, the results will be stored as variables in +#' the docvars. +#' +#' @export +#' @importFrom rlang .data +run_multidict <- function( + corpus, + dict, + at_level = c("sentences", "paragraphs", "documents"), + return_value = c( + "count", + "binary", + "prop", + "count_at_level", + "prop_at_level" + ), + include_totals = TRUE, + return_result_only = FALSE, + pattern_type = c("regex", "glob", "coll", "fixed"), + case_insensitive = TRUE, + regex_optimize = FALSE, + regex_make_greedy = FALSE, + regex_make_lazy = FALSE, + dict_name, + custom_replacement, + tolower = case_insensitive, + stem = FALSE, + remove = NULL, + ... +) { + if (missing(corpus)) { + stop("You need to provide a text corpus.") + } + if (missing(dict)) { + stop("You need to provide a dictionary.") + } + # Prepare vector arguments + at_level <- at_level[1] + return_value <- return_value[1] + pattern_type <- pattern_type[1] + if (!(at_level %in% c("sentences", "paragraphs", "documents"))) { + stop('at_level has to be one of "sentences", "paragraphs", or "documents".') + } + if (!(return_value %in% c( + "prop_at_level", + "prop", + "count_at_level", + "count", + "binary" + ))) { + stop(paste0('return_value has to be one of "prop_at_level", "prop", ', + '"count_at_level", "count", or "binary".')) + } + if (!(pattern_type %in% c("regex", "glob", "coll", "fixed"))) { + stop('pattern_type has to be one of "regex", "glob", "coll", or "fixed".') + } + # Check return_value + if (at_level == "documents" & return_value == "prop_at_level") { + warning(paste0('return_value "prop_at_level" does not make sense with ', + 'analysis at the level of the documents. return_value was ', + 'changed to "prop" instead.')) + return_value <- "prop" + } + if (at_level == "documents" & return_value == "count_at_level") { + warning(paste0('return_value "count_at_level" does not make sense with ', + 'analysis at the level of the documents. return_value was ', + 'changed to "count" instead.')) + return_value <- "count" + } + # Check if paragraph was used with "at_level" results (not implemented, yet) + if (return_value == "prop_at_level" & at_level == "paragraphs") { + stop(paste0('The combination of return_value "', return_value, '" and ', + 'at_level "', at_level, '" has not been implemented yet.')) + } + # Quanteda corpus is prepared and original corpus stored if needed + if (quanteda::is.corpus(corpus)) { + old_level <- quanteda::meta(corpus, field = "unit", type = "object") + if ((old_level != at_level) & (old_level != "documents")) { + warning(paste0("You provided a quanteda corpus object which was not at ", + "the documents level and also not at the level you wanted", + "to do analysis for. 
Thus, it is transfered back to ", + "documents before running the rest of the operations.")) + corpus <- quanteda::corpus_reshape(corpus, "documents") + old_level <- "documents" + } + if (!return_result_only) { + corpus_orig <- corpus + if (old_level != "documents") { + corpus_orig <- quanteda::corpus_reshape(corpus, "documents") + } else { + corpus_orig <- corpus + } + } + } else { + if (!return_result_only) { + corpus_orig <- corpus + } + old_level <- "documents" + corpus <- quanteda::corpus(corpus) + } + # get the IDs for all documents + if (old_level != "documents") { + doc_ids <- quanteda::docid(corpus_orig) + } else { + doc_ids <- quanteda::docid(corpus) + } + # Prepare replacement + if (missing(custom_replacement)) { + set.seed(17062005) + replacement <- paste0( + paste0(sample(letters, 20, replace = TRUE), collapse = ""), + round(stats::runif(1, 1000000000, 9999999999), 0), + paste0(sample(letters, 20, replace = TRUE), collapse = "") + ) + } else { + replacement <- custom_replacement + } + if (tolower) { + replacement <- stringr::str_to_lower(replacement) + } + if (missing(dict_name)) { + dict_name <- "dict" + if (quanteda::is.dictionary(dict)) { + dict_name <- names(dict[1]) + } + } + # Prepare dictionary + if (quanteda::is.dictionary(dict)) { + if (length(dict) > 1) { + warning(paste0( + "Only dictionaries of length 1 are supported (i.e., only one ", + "dictionary in the quanteda dictionary object). Results are only ", + 'based on the first dictionary ("', names(dict[1]), '"). Others are ', + "dropped." + )) + dict <- dict[1] + } + dict <- unlist(dict) + } + # Prepare dictionary patterns if patterns are of type "regex" + if ((regex_optimize | regex_make_greedy | regex_make_lazy) & + (pattern_type == "coll" | pattern_type == "fixed")) { + message('Regex options are ignored for pattern_type "fixed" or "coll".') + } + if (pattern_type == "glob") { + dict <- regexhelpeR::glob_to_regex(dict) + pattern_type <- "regex" + if (regex_optimize) { + warning(paste0('You should not use the regex_optimize ', + 'argument with "glob" style patterns. They already ', + 'come with word boundaries and do not include ', + 'catch all wildcards at the boundaries. 
You might get ', + 'unexpected results.')) + } + } + if (pattern_type == "regex") { + if (regex_optimize) { + dict <- regexhelpeR::optimize_regex_patterns(dict) + } + if (regex_make_lazy & !regex_make_greedy) { + dict <- regexhelpeR::make_all_regex_lazy(dict) + } + if (!regex_make_lazy & regex_make_greedy) { + dict <- regexhelpeR::make_all_regex_greedy(dict) + } + if (regex_make_lazy & regex_make_greedy) { + dict <- regexhelpeR::switch_regex_greedy_lazy(dict) + } + } + # Prepare corpus for replacements + if ((at_level == "sentences" & return_value == "prop_at_level") | + include_totals ) { + if (old_level != "documents") { + tmp_corpus <- quanteda::corpus_reshape(corpus, to = "documents") + n_sentences <- data.frame(doc_id = quanteda::docid(tmp_corpus), + n_sentences = quanteda::nsentence(tmp_corpus)) + if (include_totals) { + n_tokens <- data.frame(doc_id = quanteda::docid(tmp_corpus), + n_tokens = quanteda::ntoken(tmp_corpus, ...)) + } + rm(tmp_corpus) + } else { + n_sentences <- data.frame(doc_id = quanteda::docid(corpus), + n_sentences = quanteda::nsentence(corpus)) + n_tokens <- data.frame(doc_id = quanteda::docid(corpus), + n_tokens = quanteda::ntoken(corpus, ...)) + # no function for paragraphs in quanteda + } + } + reshaped <- FALSE + if (old_level != at_level) { + corpus <- quanteda::corpus_reshape(corpus, to = at_level) + reshaped <- TRUE + } + for (i in dict) { + if (pattern_type == "regex") { + quanteda::texts(corpus) <- stringr::str_replace_all( + quanteda::texts(corpus), + pattern = stringr::regex(i, ignore_case = case_insensitive), + replacement = replacement + ) + } + if (pattern_type == "coll") { + quanteda::texts(corpus) <- stringr::str_replace_all( + quanteda::texts(corpus), + pattern = stringr::coll(i, ignore_case = case_insensitive), + replacement = replacement + ) + } + if (pattern_type == "fixed") { + quanteda::texts(corpus) <- stringr::str_replace_all( + quanteda::texts(corpus), + pattern = stringr::fixed(i, ignore_case = case_insensitive), + replacement = replacement + ) + } + } + # Make dictionary from replacement + lookup_dict <- list(paste0("*", replacement, "*")) + names(lookup_dict) <- dict_name + lookup_dict <- quanteda::dictionary(lookup_dict) + # Prepare dfm + dfm <- quanteda::dfm( + corpus, + tolower = tolower, + stem = stem, + remove = remove, + ... = ... 
+ ) + # Lookup dictionary and prepare results + results <- switch( + return_value, + "prop_at_level" = { + # Lookup dictionary + dfm <- quanteda::dfm_lookup( + dfm, + dictionary = lookup_dict, + valuetype = "glob", + case_insensitive = case_insensitive, + ) + # Weight and group for results + if (at_level != "documents") { + dfm <- quanteda::dfm_weight(dfm, "boolean") + dfm <- quanteda::dfm_group( + dfm, + groups = quanteda::docid(dfm), + fill = TRUE, + force = TRUE + ) + } + dfm <- quanteda::convert(dfm, to = "data.frame") + dfm <- dplyr::left_join(dfm, n_sentences, by = "doc_id") + dfm[, dict_name] <- dfm[, dict_name]/dfm[, "n_sentences"] + dfm <- dplyr::select(dfm, -.data$n_sentences) + dfm + }, + "prop" = { + # Group and weight for results + if (at_level != "documents") { + dfm <- quanteda::dfm_group( + dfm, + groups = quanteda::docid(dfm), + fill = TRUE, + force = TRUE + ) + } + dfm <- quanteda::dfm_weight(dfm, "prop") + # Lookup dictionary + dfm <- quanteda::dfm_lookup( + dfm, + dictionary = lookup_dict, + valuetype = "glob", + case_insensitive = case_insensitive + ) + quanteda::convert(dfm, to = "data.frame") + }, + "count_at_level" = { + # Lookup dictionary + dfm <- quanteda::dfm_lookup( + dfm, + dictionary = lookup_dict, + valuetype = "glob", + case_insensitive = case_insensitive, + ) + # Weight and group for results + if (at_level != "documents") { + dfm <- quanteda::dfm_weight(dfm, "boolean") + dfm <- quanteda::dfm_group( + dfm, + groups = quanteda::docid(dfm), + fill = TRUE, + force = TRUE + ) + } + quanteda::convert(dfm, to = "data.frame") + }, + "count" = { + # Lookup dictionary + dfm <- quanteda::dfm_lookup( + dfm, + dictionary = lookup_dict, + valuetype = "glob", + case_insensitive = case_insensitive, + ) + # Weight and group for results + if (at_level != "documents") { + dfm <- quanteda::dfm_group( + dfm, + groups = quanteda::docid(dfm), + fill = TRUE + ) + } + quanteda::convert(dfm, to = "data.frame") + }, + "binary" = { + # Lookup dictionary + dfm <- quanteda::dfm_lookup( + dfm, + dictionary = lookup_dict, + valuetype = "glob", + case_insensitive = case_insensitive, + ) + # Weight and group for results + if (at_level != "documents") { + dfm <- quanteda::dfm_group( + dfm, + groups = quanteda::docid(dfm), + fill = TRUE + ) + } + dfm <- quanteda::dfm_weight(dfm, "boolean") + quanteda::convert(dfm, to = "data.frame") + } + ) + # Finish results (also the number of sentences and tokens per doc?) 
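+  # The per-document scores are joined back onto the original vector of
+  # document IDs so that the rows of the output line up with every document
+  # in the input corpus.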
+  results <- dplyr::left_join(
+    data.frame(doc_id = doc_ids),
+    results,
+    by = "doc_id"
+  )
+  if (include_totals) {
+    results <- dplyr::left_join(
+      results,
+      n_sentences,
+      by = "doc_id"
+    )
+    results <- dplyr::left_join(
+      results,
+      n_tokens,
+      by = "doc_id"
+    )
+  }
+  results[, "doc_id"] <- NULL
+  if (!return_result_only) {
+    if (quanteda::is.corpus(corpus_orig)) {
+      quanteda::docvars(corpus_orig) <- dplyr::bind_cols(
+        quanteda::docvars(corpus_orig),
+        results,
+        .name_repair = "unique"
+      )
+      results <- corpus_orig
+    } else if (is.data.frame(corpus_orig)) {
+      results <- dplyr::bind_cols(
+        corpus_orig,
+        results,
+        .name_repair = "unique"
+      )
+    } else {
+      results <- dplyr::bind_cols(
+        data.frame(x = corpus_orig),
+        results,
+        .name_repair = "unique"
+      )
+    }
+  }
+  return(results)
+}
diff --git a/R/run-non-multidict.R b/R/run-non-multidict.R
new file mode 100644
index 0000000..544645c
--- /dev/null
+++ b/R/run-non-multidict.R
@@ -0,0 +1,422 @@
+# ------------------------------------------------------------------------------
+#
+# Script name: run-non-multidict.R
+#
+# Purpose of script: Function to run a regular dictionary in the same way as
+# the run_multidict function.
+#
+# Author: Johann Gründl
+# Email: mail@johanngruendl.at
+#
+# Date created: 2020-11-20
+# Date updated: 2020-11-20
+#
+# ******************************************************************************
+
+
+#' Run a regular dictionary
+#'
+#' This function mimics the behavior of the run_multidict function but runs
+#' dictionaries with quanteda's regular features. This function exists just
+#' for consistency when running regular dictionaries alongside multi-term
+#' dictionaries through the run_multidict function.
+#'
+#' Quanteda's dictionary function does not allow for patterns with wildcards to
+#' match more than one token. For example, a regular expression such as
+#' "the (.*) people" would not work as expected if you wanted to match
+#' expressions such as "the honest people" and "the good people". The
+#' run_multidict function facilitates the usage of dictionaries including such
+#' terms.
+#'
+#' @param corpus A quanteda corpus object or something that can be transformed
+#' to a corpus by quanteda::corpus(), for example, a simple character vector.
+#' @param dict A character vector where each element is a pattern or a
+#' quanteda dictionary object with one dictionary.
+#' @param at_level At which level should patterns be applied? Possible values
+#' are "documents", "sentences", or "paragraphs". Defaults to "sentences".
+#' @param return_value How should the value be returned? Possible values
+#' include "count", "binary", "prop", "count_at_level", or "prop_at_level". You
+#' get the results from the dictionary at the document level. "count" (the
+#' default) gives the simple frequency of dictionary hits in each document.
+#' "count_at_level" gives you the number of sentences or paragraphs in a
+#' document where there was at least one pattern match. Together with the
+#' include_totals parameter, "count" and "count_at_level" give you the most
+#' flexibility to work with the results. "binary" returns 0 or 1, depending on
+#' whether there was at least one pattern match in the document. "prop" is the
+#' proportion of pattern matches relative to the total number of tokens in the
+#' document. "prop_at_level" gives you the proportion of sentences or
+#' paragraphs (in a document) where a pattern match was found.
+#' @param include_totals Should the number of sentences (as "n_sentences") and
+#' the number of tokens (as "n_tokens") per document also be returned? Defaults
+#' to TRUE.
+#' @param return_result_only If TRUE, a data.frame containing the results will
+#' be returned. If FALSE (the default), you will get the provided corpus with
+#' the results attached as new columns.
+#' @param pattern_type The type of pattern included in the dictionary. Defaults
+#' to "regex" for regular expressions. "glob" is also possible for glob-style
+#' wildcards. Furthermore, "fixed" is possible. See the quanteda package for
+#' details on pattern types.
+#' @param case_insensitive Should the case be ignored when searching for
+#' pattern matches? Defaults to TRUE.
+#' @param regex_optimize Should the regular expressions be optimized by adding
+#' word boundaries and removing open wildcards at word boundaries? This is
+#' intended for using regular expression dictionary patterns the way I use
+#' them in the popdictR package. It then allows for quicker lookups (see
+#' regexhelpeR::optimize_regex_patterns). Defaults to FALSE, so your patterns
+#' are not changed.
+#' @param regex_make_greedy Should regular expressions be transformed to greedy
+#' versions? Defaults to FALSE. Usually not needed. If you switch this to TRUE
+#' while at the same time setting regex_make_lazy to TRUE as well, you will get
+#' inverted patterns (i.e., lazy patterns become greedy and greedy patterns
+#' become lazy).
+#' @param regex_make_lazy Should regular expressions be transformed to lazy
+#' versions? Defaults to FALSE, so your patterns are not changed. However, you
+#' should probably use lazy regex patterns to replace the shortest possible
+#' compounds.
+#' @param dict_name You can set a custom name for your dictionary. This is also
+#' the name of the variable that contains the results in the return value. If
+#' you provided a quanteda dictionary, the name of the first dictionary
+#' included will be used. Otherwise, dict_name defaults to "dict".
+#' @param tolower Forwarded to quanteda's dfm function; converts all features
+#' to lowercase. Defaults to the value of "case_insensitive".
+#' @param stem Forwarded to quanteda's dfm function. If TRUE, quanteda stems
+#' words. Defaults to FALSE.
+#' @param remove Forwarded to quanteda's dfm function. A list of stopwords
+#' which are removed from the dfm before running the dictionary.
+#' @param ... Additional arguments passed on to quanteda's dfm function (and
+#' from there to the tokens function). Includes things such as "remove_punct",
+#' "remove_symbols", "remove_numbers", etc. See quanteda's tokens function for
+#' details.
+#'
+#' @return Returns the results of running the dictionary. If return_result_only
+#' is set, you will get a data.frame with only the results. Otherwise, the
+#' results will be bound to the corpus as new columns. If you only provided
+#' texts, the only other column will be these texts (variable x). If you
+#' provided a quanteda corpus, the results will be stored as variables in
+#' the docvars.
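+#'
+#' @examples
+#' \dontrun{
+#' # Minimal illustrative sketch; the texts and patterns are invented here.
+#' txt <- c("The corrupt elite betrayed the people. We are angry.",
+#'          "A perfectly ordinary sentence.")
+#' res <- run_non_multidict(txt, dict = c("\\belite\\b", "\\bpeople\\b"),
+#'                          return_value = "count", dict_name = "populism")
+#' # res is a data.frame with the texts (x) plus the new columns
+#' # populism, n_sentences, and n_tokens
+#' }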
+#' +#' @export +#' @importFrom rlang .data +run_non_multidict <- function( + corpus, + dict, + at_level = c("sentences", "paragraphs", "documents"), + return_value = c( + "count", + "binary", + "prop", + "count_at_level", + "prop_at_level" + ), + include_totals = TRUE, + return_result_only = FALSE, + pattern_type = c("regex", "glob", "fixed"), + case_insensitive = TRUE, + regex_optimize = FALSE, + regex_make_greedy = FALSE, + regex_make_lazy = FALSE, + dict_name, + tolower = case_insensitive, + stem = FALSE, + remove = NULL, + ... +) { + if (missing(corpus)) { + stop("You need to provide a text corpus.") + } + if (missing(dict)) { + stop("You need to provide a dictionary.") + } + # Prepare vector arguments + at_level <- at_level[1] + return_value <- return_value[1] + pattern_type <- pattern_type[1] + if (!(at_level %in% c("sentences", "paragraphs", "documents"))) { + stop('at_level has to be one of "sentences", "paragraphs", or "documents".') + } + if (!(return_value %in% c( + "prop_at_level", + "prop", + "count_at_level", + "count", + "binary" + ))) { + stop(paste0('return_value has to be one of "prop_at_level", "prop", ', + '"count_at_level", "count", or "binary".')) + } + if (!(pattern_type %in% c("regex", "glob", "fixed"))) { + stop('pattern_type has to be one of "regex", "glob", or "fixed".') + } + # Check return_value + if (at_level == "documents" & return_value == "prop_at_level") { + warning(paste0('return_value "prop_at_level" does not make sense with ', + 'analysis at the level of the documents. return_value was ', + 'changed to "prop" instead.')) + return_value <- "prop" + } + if (at_level == "documents" & return_value == "count_at_level") { + warning(paste0('return_value "count_at_level" does not make sense with ', + 'analysis at the level of the documents. return_value was ', + 'changed to "count" instead.')) + return_value <- "count" + } + # Check if paragraph was used with "at_level" results (not implemented, yet) + if (return_value == "prop_at_level" & at_level == "paragraphs") { + stop(paste0('The combination of return_value "', return_value, '" and ', + 'at_level "', at_level, '" has not been implemented yet.')) + } + # Quanteda corpus is prepared and original corpus stored if needed + if (quanteda::is.corpus(corpus)) { + old_level <- quanteda::meta(corpus, field = "unit", type = "object") + if ((old_level != at_level) & (old_level != "documents")) { + warning(paste0("You provided a quanteda corpus object which was not at ", + "the documents level and also not at the level you wanted", + "to do analysis for. 
Thus, it is transfered back to ", + "documents before running the rest of the operations.")) + corpus <- quanteda::corpus_reshape(corpus, "documents") + old_level <- "documents" + } + if (!return_result_only) { + corpus_orig <- corpus + if (old_level != "documents") { + corpus_orig <- quanteda::corpus_reshape(corpus, "documents") + } else { + corpus_orig <- corpus + } + } + } else { + if (!return_result_only) { + corpus_orig <- corpus + } + old_level <- "documents" + corpus <- quanteda::corpus(corpus) + } + # get the IDs for all documents + if (old_level != "documents") { + doc_ids <- quanteda::docid(corpus_orig) + } else { + doc_ids <- quanteda::docid(corpus) + } + # Prepare dictionary + if (missing(dict_name)) { + dict_name <- "dict" + if (quanteda::is.dictionary(dict)) { + dict_name <- names(dict[1]) + } + } + if (quanteda::is.dictionary(dict)) { + if (length(dict) > 1) { + warning(paste0( + "Only dictionaries of length 1 are supported (i.e., only one ", + "dictionary in the quanteda dictionary object). Results are only ", + 'based on the first dictionary ("', names(dict[1]), '"). Others are ', + "dropped." + )) + dict <- dict[1] + } + dict <- unlist(dict) + } + # Prepare dictionary patterns if patterns are of type "regex" + if ((regex_optimize | regex_make_greedy | regex_make_lazy) & + (pattern_type == "glob" | pattern_type == "fixed")) { + message('Regex options are ignored for pattern_type "fixed " or "glob".') + } + if (pattern_type == "regex") { + if (regex_optimize) { + dict <- regexhelpeR::optimize_regex_patterns(dict) + } + if (regex_make_lazy & !regex_make_greedy) { + dict <- regexhelpeR::make_all_regex_lazy(dict) + } + if (!regex_make_lazy & regex_make_greedy) { + dict <- regexhelpeR::make_all_regex_greedy(dict) + } + if (regex_make_lazy & regex_make_greedy) { + dict <- regexhelpeR::switch_regex_greedy_lazy(dict) + } + } + # Prepare corpus for analysis + if ((at_level == "sentences" & return_value == "prop_at_level") | + include_totals ) { + if (old_level != "documents") { + tmp_corpus <- quanteda::corpus_reshape(corpus, to = "documents") + n_sentences <- data.frame(doc_id = quanteda::docid(tmp_corpus), + n_sentences = quanteda::nsentence(tmp_corpus)) + if (include_totals) { + n_tokens <- data.frame(doc_id = quanteda::docid(tmp_corpus), + n_tokens = quanteda::ntoken(tmp_corpus, ...)) + } + rm(tmp_corpus) + } else { + n_sentences <- data.frame(doc_id = quanteda::docid(corpus), + n_sentences = quanteda::nsentence(corpus)) + n_tokens <- data.frame(doc_id = quanteda::docid(corpus), + n_tokens = quanteda::ntoken(corpus, ...)) + # no function for paragraphs in quanteda + } + } + reshaped <- FALSE + if (old_level != at_level) { + corpus <- quanteda::corpus_reshape(corpus, to = at_level) + reshaped <- TRUE + } + # Make dictionary from replacement + lookup_dict <- list(dict) + names(lookup_dict) <- dict_name + lookup_dict <- quanteda::dictionary(lookup_dict) + # Prepare dfm + dfm <- quanteda::dfm( + corpus, + tolower = tolower, + stem = stem, + remove = remove, + ... = ... 
+  )
+  # Look up the dictionary and prepare the results
+  results <- switch(
+    return_value,
+    "prop_at_level" = {
+      # Look up the dictionary
+      dfm <- quanteda::dfm_lookup(
+        dfm,
+        dictionary = lookup_dict,
+        valuetype = pattern_type,
+        case_insensitive = case_insensitive
+      )
+      # Weight and group for the results
+      if (at_level != "documents") {
+        dfm <- quanteda::dfm_weight(dfm, "boolean")
+        dfm <- quanteda::dfm_group(
+          dfm,
+          groups = quanteda::docid(dfm),
+          fill = TRUE,
+          force = TRUE
+        )
+      }
+      dfm <- quanteda::convert(dfm, to = "data.frame")
+      dfm <- dplyr::left_join(dfm, n_sentences, by = "doc_id")
+      dfm[, dict_name] <- dfm[, dict_name] / dfm[, "n_sentences"]
+      dfm <- dplyr::select(dfm, -.data$n_sentences)
+      dfm
+    },
+    "prop" = {
+      # Group and weight for the results
+      if (at_level != "documents") {
+        dfm <- quanteda::dfm_group(
+          dfm,
+          groups = quanteda::docid(dfm),
+          fill = TRUE,
+          force = TRUE
+        )
+      }
+      dfm <- quanteda::dfm_weight(dfm, "prop")
+      # Look up the dictionary
+      dfm <- quanteda::dfm_lookup(
+        dfm,
+        dictionary = lookup_dict,
+        valuetype = pattern_type,
+        case_insensitive = case_insensitive
+      )
+      quanteda::convert(dfm, to = "data.frame")
+    },
+    "count_at_level" = {
+      # Look up the dictionary
+      dfm <- quanteda::dfm_lookup(
+        dfm,
+        dictionary = lookup_dict,
+        valuetype = pattern_type,
+        case_insensitive = case_insensitive
+      )
+      # Weight and group for the results
+      if (at_level != "documents") {
+        dfm <- quanteda::dfm_weight(dfm, "boolean")
+        dfm <- quanteda::dfm_group(
+          dfm,
+          groups = quanteda::docid(dfm),
+          fill = TRUE,
+          force = TRUE
+        )
+      }
+      quanteda::convert(dfm, to = "data.frame")
+    },
+    "count" = {
+      # Look up the dictionary
+      dfm <- quanteda::dfm_lookup(
+        dfm,
+        dictionary = lookup_dict,
+        valuetype = pattern_type,
+        case_insensitive = case_insensitive
+      )
+      # Group for the results
+      if (at_level != "documents") {
+        dfm <- quanteda::dfm_group(
+          dfm,
+          groups = quanteda::docid(dfm),
+          fill = TRUE
+        )
+      }
+      quanteda::convert(dfm, to = "data.frame")
+    },
+    "binary" = {
+      # Look up the dictionary
+      dfm <- quanteda::dfm_lookup(
+        dfm,
+        dictionary = lookup_dict,
+        valuetype = pattern_type,
+        case_insensitive = case_insensitive
+      )
+      # Group and weight for the results
+      if (at_level != "documents") {
+        dfm <- quanteda::dfm_group(
+          dfm,
+          groups = quanteda::docid(dfm),
+          fill = TRUE
+        )
+      }
+      dfm <- quanteda::dfm_weight(dfm, "boolean")
+      quanteda::convert(dfm, to = "data.frame")
+    }
+  )
+  # Finish the results
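+  # Join the per-document results onto the full vector of document IDs so that
+  # every input document keeps exactly one row, in the original order. The
+  # totals (n_sentences, n_tokens) are attached afterwards if requested.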
+  results <- dplyr::left_join(
+    data.frame(doc_id = doc_ids),
+    results,
+    by = "doc_id"
+  )
+  if (include_totals) {
+    results <- dplyr::left_join(
+      results,
+      n_sentences,
+      by = "doc_id"
+    )
+    results <- dplyr::left_join(
+      results,
+      n_tokens,
+      by = "doc_id"
+    )
+  }
+  results[, "doc_id"] <- NULL
+  if (!return_result_only) {
+    if (quanteda::is.corpus(corpus_orig)) {
+      quanteda::docvars(corpus_orig) <- dplyr::bind_cols(
+        quanteda::docvars(corpus_orig),
+        results,
+        .name_repair = "unique"
+      )
+      results <- corpus_orig
+    } else if (is.data.frame(corpus_orig)) {
+      results <- dplyr::bind_cols(
+        corpus_orig,
+        results,
+        .name_repair = "unique"
+      )
+    } else {
+      results <- dplyr::bind_cols(
+        data.frame(x = corpus_orig),
+        results,
+        .name_repair = "unique"
+      )
+    }
+  }
+  return(results)
+}
diff --git a/README.md b/README.md
index bb2362c..871840d 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,45 @@
 # multidictR
-Functions that help to work with dictionaries that include search strings (and wildcards) consisting of multiple words
+
+Functions that help to work with dictionaries that include search strings (and
+wildcards) consisting of multiple words. Quanteda does not allow for more
+complex multi-word dictionaries that, for example, include wildcards spanning
+multiple words. This package provides functions to deal with such
+dictionaries.
+
+Functions from this package are used with the populism dictionary in my
+[popdictR](https://github.com/jogrue/popdictR) project.
+
+
+## Status
+
+The package has worked for my particular use case, and all functions are
+already documented. However, the package has not been tested extensively, so I
+am glad about any feedback or issues. A new version (1.0) is planned for the
+end of March 2021. Some of the issues I plan on addressing:
+
+* More thorough testing
+* Better documentation
+* Highlighting use cases for this package
+
+
+## Install
+
+This package requires my package
+[regexhelpeR](https://github.com/jogrue/regexhelpeR), which should be
+installed before this package.
+
+You can install everything from within R using [devtools](https://github.com/hadley/devtools):
+
+```R
+library(devtools)
+
+# Install the dependency regexhelpeR from GitHub
+devtools::install_github("jogrue/regexhelpeR")
+
+# Install the multidictR package from GitHub
+devtools::install_github("jogrue/multidictR")
+```
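+
+## Use
+
+A minimal, illustrative sketch (untested; the texts and patterns are invented
+here): run a small regex dictionary that includes a wildcard spanning
+multiple words.
+
+```R
+library(multidictR)
+
+texts <- c(
+  "We are the honest people.",
+  "The elites betray the ordinary people.",
+  "Nice weather today."
+)
+
+# "the (.*) people" may match more than one token between "the" and "people"
+result <- run_multidict(
+  texts,
+  dict = c("elite.*", "the (.*) people"),
+  dict_name = "populism",
+  at_level = "sentences",
+  return_value = "count"
+)
+```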
+
+## Cite
+
+Gründl, J. (2020). Populist ideas on social media: A dictionary-based measurement of populist communication. _New Media & Society_. Advance online publication. [https://doi.org/10.1177/1461444820976970](https://doi.org/10.1177/1461444820976970)
+
+Gründl, J. (2020). _multidictR_ (R package). [https://github.com/jogrue/multidictR](https://github.com/jogrue/multidictR)
diff --git a/man/check_pattern_performance.Rd b/man/check_pattern_performance.Rd
new file mode 100644
index 0000000..9d7996e
--- /dev/null
+++ b/man/check_pattern_performance.Rd
@@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/misc-util-functions.R
+\name{check_pattern_performance}
+\alias{check_pattern_performance}
+\title{Get the number of possible pattern combinations}
+\usage{
+check_pattern_performance(text, pattern, glob = FALSE, ...)
+}
+\arguments{
+\item{text}{A quanteda corpus object, text that can be transformed to a
+corpus object, or a quanteda tokens object}
+
+\item{pattern}{A quanteda pattern, i.e., a character vector, list of
+character vectors, dictionary, or collocations object.}
+
+\item{glob}{Are patterns glob-style? Defaults to FALSE (for regex patterns).}
+
+\item{...}{Additional parameters passed to quanteda:::pattern2list.}
+}
+\value{
+A table containing the patterns and how many possible combinations
+they produce for a given text or tokens object. On top are the patterns that
+produce the most combinations and, thus, need the most memory.
+}
+\description{
+Wildcards in patterns can potentially lead to a huge number
+of possible combinations in tokens. This, in turn, leads to problems with
+memory usage in some quanteda functions. With this function, you can check
+which patterns in a dictionary are the most problematic (= produce the
+highest number of possible combinations). This was suggested by Kohei
+Watanabe here:
+\url{https://github.com/quanteda/quanteda/issues/1539#issuecomment-451588580}
+}
diff --git a/man/dot-create_single_compound.Rd b/man/dot-create_single_compound.Rd
new file mode 100644
index 0000000..ae7c0f7
--- /dev/null
+++ b/man/dot-create_single_compound.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/make-compounds.R
+\name{.create_single_compound}
+\alias{.create_single_compound}
+\title{Creates a single compound}
+\usage{
+.create_single_compound(compound, wordsep = " ", concatenator = "_")
+}
+\arguments{
+\item{compound}{A simple text (= character vector).}
+
+\item{wordsep}{The word separator to look for, usually spaces. Defaults to
+" ".}
+
+\item{concatenator}{The replacement for wordsep characters (e.g., " ").
+Defaults to "_".}
+}
+\value{
+The provided text where all occurrences of the wordsep character
+(by default, " ") were replaced by the concatenator character (by default,
+"_").
+}
+\description{
+This is an internal function used in the
+.make_compounds_for_single_pattern function. It is basically a wrapper around
+stringr::str_replace_all. By default, it replaces all space characters (" ")
+with the underscore character ("_").
+}
diff --git a/man/dot-make_compounds_for_single_pattern.Rd b/man/dot-make_compounds_for_single_pattern.Rd
new file mode 100644
index 0000000..baff508
--- /dev/null
+++ b/man/dot-make_compounds_for_single_pattern.Rd
@@ -0,0 +1,39 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/make-compounds.R
+\name{.make_compounds_for_single_pattern}
+\alias{.make_compounds_for_single_pattern}
+\title{Replace matches for a single pattern with compounds}
+\usage{
+.make_compounds_for_single_pattern(
+  text,
+  pattern,
+  wordsep = " ",
+  concatenator = "_",
+  lazy = TRUE,
+  ignore_case = TRUE
+)
+}
+\arguments{
+\item{text}{Text should be provided as a character vector.}
+
+\item{pattern}{A single pattern with possible regular expressions.}
+
+\item{wordsep}{The word separator to look for, usually spaces. Defaults to
+" ".}
+
+\item{concatenator}{The replacement for wordsep characters (e.g., " ").
+Defaults to "_".}
+
+\item{lazy}{Should regular expressions be transformed to lazy versions?
+Defaults to TRUE to return the shortest possible compounds.}
+
+\item{ignore_case}{Should the case be ignored when searching for pattern
+matches? Defaults to TRUE.}
+}
+\value{
+The provided text where all occurrences of the pattern have been
+converted to multi-word compounds.
+}
+\description{
+This is an internal function used in the make_compounds function.
+}
diff --git a/man/every_term_a_dict.Rd b/man/every_term_a_dict.Rd
new file mode 100644
index 0000000..c290587
--- /dev/null
+++ b/man/every_term_a_dict.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/misc-util-functions.R
+\name{every_term_a_dict}
+\alias{every_term_a_dict}
+\title{Make every term a dictionary}
+\usage{
+every_term_a_dict(dict)
+}
+\arguments{
+\item{dict}{A quanteda dictionary or a simple list of dictionary terms}
+}
+\value{
+A quanteda dictionary object where each term is its own dictionary.
+Dictionary names are simply the provided terms.
+}
+\description{
+This function transforms a dictionary (or a list of terms) to
+a quanteda dictionary object where each term is its own dictionary.
+}
diff --git a/man/get_pattern_stats.Rd b/man/get_pattern_stats.Rd
new file mode 100644
index 0000000..e6cbbd7
--- /dev/null
+++ b/man/get_pattern_stats.Rd
@@ -0,0 +1,56 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/pattern-stats.R
+\name{get_pattern_stats}
+\alias{get_pattern_stats}
+\title{Gather statistics for dictionary patterns}
+\usage{
+get_pattern_stats(
+  text,
+  patterns,
+  at_level = "sentences",
+  glob = FALSE,
+  ignore_case = TRUE,
+  optimize_regex = TRUE
+)
+}
+\arguments{
+\item{text}{A quanteda corpus object or something that can be transformed to
+a corpus by quanteda::corpus(), for example, a simple character vector.}
+
+\item{patterns}{A character vector where each element is a pattern or a
+quanteda dictionary object. Patterns are expected to be regular expressions
+(if the glob parameter is not set) or to only include glob-style wildcards
+(if the glob parameter is set to TRUE).}
+
+\item{at_level}{At which level should patterns be applied? Possible values
+are "documents", "sentences", or "paragraphs". Defaults to "sentences".}
+
+\item{glob}{Do the provided patterns use glob-style wildcards instead of
+regular expressions? Defaults to FALSE.}
+
+\item{ignore_case}{Should the case be ignored when searching for pattern
+matches? Defaults to TRUE.}
+
+\item{optimize_regex}{Should the regular expressions be optimized to allow
+for quicker lookups (see the regexhelpeR package)? Defaults to TRUE.}
+}
+\value{
+A data frame containing a row for each pattern. It includes the
+provided pattern (\code{original_pattern}), the version that was actually
+tested after pattern transformations (e.g., the switch from glob to regex,
+optimizations) were applied (\code{applied_pattern}), the
+\code{match_count}, and the \code{lookup_time}.
+}
+\description{
+For a list of (possibly multi-word) patterns, statistics are
+generated. Basically, the number of matches and the time needed to run the
+pattern on the whole corpus are recorded. By default, patterns are applied
+at the sentence level. Thus, the match_count is the total count of sentences
+where the pattern was detected.
+
+The function works with quanteda corpus objects and regular texts (character
+vectors; in general, text that can be transformed to a quanteda corpus
+object by quanteda's corpus() function should work). It expects regular
+expression patterns but can work with glob expressions as well (if the glob
+parameter is set, the patterns are internally transformed to regular
+expressions).
+}
diff --git a/man/make_compounds.Rd b/man/make_compounds.Rd
new file mode 100644
index 0000000..a1a42c0
--- /dev/null
+++ b/man/make_compounds.Rd
@@ -0,0 +1,65 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/make-compounds.R
+\name{make_compounds}
+\alias{make_compounds}
+\title{Create compounds for complex patterns}
+\usage{
+make_compounds(
+  text,
+  patterns,
+  wordsep = " ",
+  concatenator = "_",
+  at_level = "sentences",
+  glob = FALSE,
+  lazy = TRUE,
+  ignore_case = TRUE,
+  optimize_regex = TRUE
+)
+}
+\arguments{
+\item{text}{A quanteda corpus object or something that can be transformed to
+a corpus by quanteda::corpus(), for example, a simple character vector}
+
+\item{patterns}{A character vector where each element is a pattern or a
+quanteda dictionary object. Patterns are expected to be regular expressions
+(if the glob parameter is not set) or to only include glob-style wildcards
+(if the glob parameter is set to TRUE).}
+
+\item{wordsep}{The word separator, usually simply a space. Defaults to " ".}
+
+\item{concatenator}{The character for creating multi-word compounds, defaults
+to "_".}
+
+\item{at_level}{At which level should patterns be applied? Possible values
+are "documents", "sentences", or "paragraphs". Defaults to "sentences".}
+
+\item{glob}{Do the provided patterns use glob-style wildcards instead of
+regular expressions? Defaults to FALSE.}
+
+\item{lazy}{Should regular expressions be transformed to lazy versions?
+Defaults to TRUE to return the shortest possible compounds.}
+
+\item{ignore_case}{Should the case be ignored when searching for pattern
+matches? Defaults to TRUE.}
+
+\item{optimize_regex}{Should the regular expressions be optimized to allow
+for quicker lookups (see the regexhelpeR package)? Defaults to TRUE.}
+}
+\value{
+The corpus or text object where matched multi-word terms are now
+replaced by multi-word compounds.
+}
+\description{
+For a list of (multi-word) patterns, compounds are created. The
+function works with quanteda corpus objects and regular texts (character
+vectors; in general, text that can be transformed to a quanteda corpus
+object by quanteda's corpus() function should work). It expects regular
+expression patterns but can work with glob expressions as well (if the glob
+parameter is set, the patterns are internally transformed to regular
+expressions).
+quanteda has built-in functionality for compounding; however, it does not
+allow for patterns which include wildcards that stand for multiple words.
+For example, something like "the * people" should capture "the honest
+people", "the hard-working, common people", or "the singer sings songs about
+people". Such patterns would not work as expected in quanteda. With regular
+expressions, much more sophisticated patterns become possible.
+}
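+\examples{
+# A minimal, illustrative sketch (not run): the texts and the pattern are
+# invented for illustration only.
+\dontrun{
+texts <- c("We are the honest people.",
+           "Listen to the hard-working, common people.")
+make_compounds(texts, patterns = "the (.*) people")
+# e.g., "the honest people" should become "the_honest_people"
+}
+}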
diff --git a/man/run_kwic_stepwise.Rd b/man/run_kwic_stepwise.Rd
new file mode 100644
index 0000000..a4c8bc9
--- /dev/null
+++ b/man/run_kwic_stepwise.Rd
@@ -0,0 +1,41 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/misc-util-functions.R
+\name{run_kwic_stepwise}
+\alias{run_kwic_stepwise}
+\title{Stepwise locate keywords-in-context}
+\usage{
+run_kwic_stepwise(
+  tokens,
+  pattern,
+  window = 25,
+  valuetype = "regex",
+  step = 10000,
+  ...
+)
+}
+\arguments{
+\item{tokens}{A quanteda tokens object.}
+
+\item{pattern}{A character vector, list of character vectors, dictionary, or
+collocations object.}
+
+\item{window}{The number of context words to be displayed around the
+keyword.}
+
+\item{valuetype}{The type of pattern matching: "glob" for "glob"-style
+wildcard expressions; "regex" for regular expressions; or "fixed" for exact
+matching. Defaults to "regex".}
+
+\item{step}{How many tokens should be processed at once? Defaults to 10000.}
+
+\item{...}{Additional arguments passed to the kwic function.}
+}
+\value{
+A kwic-classed data.frame.
+}
+\description{
+With large amounts of text and complex (regex) patterns, you
+might run into problems with memory when running quanteda's kwic function.
+Thus, this function takes a stepwise approach. The tokens are split into
+chunks (of a size specified by step), and kwic is then run on these smaller
+chunks.
+}
diff --git a/man/run_multidict.Rd b/man/run_multidict.Rd
new file mode 100644
index 0000000..38b1045
--- /dev/null
+++ b/man/run_multidict.Rd
@@ -0,0 +1,128 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/run-multidict.R
+\name{run_multidict}
+\alias{run_multidict}
+\title{Run a dictionary with patterns that might match more than one term (or token)}
+\usage{
+run_multidict(
+  corpus,
+  dict,
+  at_level = c("sentences", "paragraphs", "documents"),
+  return_value = c("count", "binary", "prop", "count_at_level", "prop_at_level"),
+  include_totals = TRUE,
+  return_result_only = FALSE,
+  pattern_type = c("regex", "glob", "coll", "fixed"),
+  case_insensitive = TRUE,
+  regex_optimize = FALSE,
+  regex_make_greedy = FALSE,
+  regex_make_lazy = FALSE,
+  dict_name,
+  custom_replacement,
+  tolower = case_insensitive,
+  stem = FALSE,
+  remove = NULL,
+  ...
+)
+}
+\arguments{
+\item{corpus}{A quanteda corpus object or something that can be transformed
+to a corpus by quanteda::corpus(), for example, a simple character vector}
+
+\item{dict}{A character vector where each element is a pattern or a
+quanteda dictionary object with one dictionary.}
+
+\item{at_level}{At which level should patterns be applied? Possible values
+are "documents", "sentences", or "paragraphs". Defaults to "sentences".}
+
+\item{return_value}{How should the value be returned? Possible values
+include "count", "binary", "prop", "count_at_level", or "prop_at_level". You
+get the results from the dictionary at the document level. "count" (the
+default) gives the simple frequency of dictionary hits in each document.
+"count_at_level" gives you the number of sentences or paragraphs in a
+document where there was at least one pattern match. Together with the
+include_totals parameter, "count" and "count_at_level" give you the most
+flexibility to work with the results. "binary" returns 0 or 1, depending on
+whether there was at least one pattern match in the document. "prop" is the
+proportion of pattern matches relative to the total number of tokens in the
+document. "prop_at_level" gives you the proportion of sentences or
+paragraphs (in a document) where a pattern match was found.}
+
+\item{include_totals}{Should the number of sentences (as "n_sentences") and
+the number of tokens (as "n_tokens") per document also be returned? Defaults
+to TRUE.}
+
+\item{return_result_only}{If TRUE, a data.frame containing the results will
If FALSE (the default), you will get the provided corpus with +the results attached as new columns.} + +\item{pattern_type}{The type of pattern included in the dictionary. Defaults +to "regex" for regular expressions. "glob" is also possible for glob style +wildcards. Internally, glob patterns are transformed to regex patterns. +Other, usually not needed, possible values include "coll" and "fixed". See +the stringr package for details on pattern types.} + +\item{case_insensitive}{Should the case be ignored when searching for +pattern matches? Defaults to TRUE.} + +\item{regex_optimize}{Should the regular expressions be optimized by adding +word boundaries and removing open wildcards at word boundaries? This is +intended for using regular expression dictionary patterns the way I use +them in the popdictR package. It then allows for quicker lookups (see +regexhelpeR::optimize_regex_patterns)? Defaults to FALSE, so your patterns +are not changed.} + +\item{regex_make_greedy}{Should regular expressions be transformed to greedy +versions? Defaults to FALSE. Usually not needed. If you switch this to TRUE, +while at the same time setting regex_make_lazy to TRUE as well, you will get +inverted patterns (i.e., lazy patterns become greedy and greedy patterns +become lazy).} + +\item{regex_make_lazy}{Should regular expressions be transformed to lazy +versions? Defaults to FALSE, so your patterns are not changed. However, you +should probably use lazy regex patterns to replace the shortest possible +compounds.} + +\item{dict_name}{You can set a custom name for your dictionary. This is also +the name of the variable that contains the results in the return value. If +you provided a quanteda dictionary, the name of the first dictionary +included will be used. Otherwise, the dict_name defaults to "dict".} + +\item{custom_replacement}{Internally, this function replaces pattern matches +with a random string (containing 40 random letters and 10 random numbers) +before running quanteda's dictionary lookup function on the corpus. The +random string should be unique and there is usually no need to set a custom +string.} + +\item{tolower}{Forwarded to quanteda's dfm function, converts all features +to lowercase. Defaults to the value for "case_insensitive."} + +\item{stem}{Forwarded to quanteda's dfm function. If TRUE, quanteda stems +words. Defaults to FALSE.} + +\item{remove}{Forwarded to quanteda's dfm function. A list of stopwords +which are removed from the dfm before running the dictionary.} + +\item{...}{Additional arguments passed on to quanteda's dfm function (and +there to the tokens function). Includes things such as "remove_punct", +"remove_symbols", "remove_numbers", etc. See quanteda's tokens function for +details.} +} +\value{ +Returns the results of running the dictionary. If return_result_only +is set, you will get a data.frame with only the results. Otherwise, you the +results will be bound to the corpus as new columns. If you only provided +texts, the only other column will be these texts of course (variable x). If +you provided a quanteda corpus, the results will be stored as variables in +the docvars. +} +\description{ +Quanteda's dictionary function does not allow for patterns with wildcards to +match more than one token. For example, a regular expression such as +"the (.*) people" would not work as expected if you wanted to match +expressions such as "the honest people" and "the good people". This +function facilitates the usage of dictionaries including such terms. 
+It could be considered the main function for the multidictR package.
+Internally, the package uses stringr::str_replace_all to replace pattern
+matches with a random string before using quanteda to look this string up
+in the corpus.
+}
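+\examples{
+# A minimal, illustrative sketch (not run): the texts and the pattern are
+# invented for illustration only.
+\dontrun{
+texts <- c("We are the honest people.",
+           "They always ignore the hard-working, common people.")
+run_multidict(texts, dict = "the (.*) people", dict_name = "populism")
+}
+}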
diff --git a/man/run_non_multidict.Rd b/man/run_non_multidict.Rd
new file mode 100644
index 0000000..01ef934
--- /dev/null
+++ b/man/run_non_multidict.Rd
@@ -0,0 +1,123 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/run-non-multidict.R
+\name{run_non_multidict}
+\alias{run_non_multidict}
+\title{Run a regular dictionary}
+\usage{
+run_non_multidict(
+  corpus,
+  dict,
+  at_level = c("sentences", "paragraphs", "documents"),
+  return_value = c("count", "binary", "prop", "count_at_level", "prop_at_level"),
+  include_totals = TRUE,
+  return_result_only = FALSE,
+  pattern_type = c("regex", "glob", "fixed"),
+  case_insensitive = TRUE,
+  regex_optimize = FALSE,
+  regex_make_greedy = FALSE,
+  regex_make_lazy = FALSE,
+  dict_name,
+  tolower = case_insensitive,
+  stem = FALSE,
+  remove = NULL,
+  ...
+)
+}
+\arguments{
+\item{corpus}{A quanteda corpus object or something that can be transformed
+to a corpus by quanteda::corpus(), for example, a simple character vector}
+
+\item{dict}{A character vector where each element is a pattern or a
+quanteda dictionary object with one dictionary.}
+
+\item{at_level}{At which level should patterns be applied? Possible values
+are "documents", "sentences", or "paragraphs". Defaults to "sentences".}
+
+\item{return_value}{How should the value be returned? Possible values
+include "count", "binary", "prop", "count_at_level", or "prop_at_level". You
+get the results from the dictionary at the document level. "count" (the
+default) gives the simple frequency of dictionary hits in each document.
+"count_at_level" gives you the number of sentences or paragraphs in a
+document where there was at least one pattern match. Together with the
+include_totals parameter, "count" and "count_at_level" give you the most
+flexibility to work with the results. "binary" returns 0 or 1, depending on
+whether there was at least one pattern match in the document. "prop" is the
+proportion of pattern matches relative to the total number of tokens in the
+document. "prop_at_level" gives you the proportion of sentences or
+paragraphs (in a document) where a pattern match was found.}
+
+\item{include_totals}{Should the number of sentences (as "n_sentences") and
+the number of tokens (as "n_tokens") per document also be returned? Defaults
+to TRUE.}
+
+\item{return_result_only}{If TRUE, a data.frame containing the results will
+be returned. If FALSE (the default), you will get the provided corpus with
+the results attached as new columns.}
+
+\item{pattern_type}{The type of pattern included in the dictionary. Defaults
+to "regex" for regular expressions. "glob" is also possible for glob-style
+wildcards. Furthermore, "fixed" is possible. See the quanteda package for
+details on pattern types.}
+
+\item{case_insensitive}{Should the case be ignored when searching for
+pattern matches? Defaults to TRUE.}
+
+\item{regex_optimize}{Should the regular expressions be optimized by adding
+word boundaries and removing open wildcards at word boundaries? This is
+intended for regular expression dictionary patterns used the way I use them
+in the popdictR package and allows for quicker lookups (see
+regexhelpeR::optimize_regex_patterns). Defaults to FALSE, so your patterns
+are not changed.}
+
+\item{regex_make_greedy}{Should regular expressions be transformed to greedy
+versions? Defaults to FALSE and is usually not needed. If you set this to
+TRUE while also setting regex_make_lazy to TRUE, you will get inverted
+patterns (i.e., lazy patterns become greedy and greedy patterns become
+lazy).}
+
+\item{regex_make_lazy}{Should regular expressions be transformed to lazy
+versions? Defaults to FALSE, so your patterns are not changed. However, you
+should probably use lazy regex patterns to replace the shortest possible
+compounds.}
+
+\item{dict_name}{You can set a custom name for your dictionary. This is also
+the name of the variable that contains the results in the return value. If
+you provided a quanteda dictionary, the name of the first dictionary
+included will be used. Otherwise, dict_name defaults to "dict".}
+
+\item{tolower}{Forwarded to quanteda's dfm function, converts all features
+to lowercase. Defaults to the value of "case_insensitive".}
+
+\item{stem}{Forwarded to quanteda's dfm function. If TRUE, quanteda stems
+words. Defaults to FALSE.}
+
+\item{remove}{Forwarded to quanteda's dfm function. A list of stopwords
+which are removed from the dfm before running the dictionary.}
+
+\item{...}{Additional arguments passed on to quanteda's dfm function (and
+from there to the tokens function). Includes things such as "remove_punct",
+"remove_symbols", "remove_numbers", etc. See quanteda's tokens function for
+details.}
+}
+\value{
+Returns the results of running the dictionary. If return_result_only
+is set, you will get a data.frame with only the results. Otherwise, the
+results will be bound to the corpus as new columns. If you only provided
+texts, the only other column will contain these texts (variable x). If you
+provided a quanteda corpus, the results will be stored as variables in
+the docvars.
+}
+\description{
+This function mimics the behavior of the run_multidict function, but runs
+dictionaries with the regular quanteda features. This function exists just
+for consistency when running regular dictionaries alongside multi-term
+dictionaries through the run_multidict function.
+}
+\details{
+Quanteda's dictionary function does not allow for patterns with wildcards to
+match more than one token. For example, a regular expression such as
+"the (.*) people" would not work as expected if you wanted to match
+expressions such as "the honest people" and "the good people". The
+run_multidict function facilitates the usage of dictionaries including such
+terms.
+}
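+\examples{
+# A minimal, illustrative sketch (not run): the texts and patterns below
+# are invented for illustration only.
+\dontrun{
+texts <- c("We are the honest people.", "The elite ignores us.")
+run_non_multidict(texts, dict = c("people", "elite.*"),
+                  dict_name = "populism")
+}
+}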
Defaults to "_".} + +\item{valuetype}{The type of pattern matching: "glob" for "glob"-style +wildcard expressions; "regex" for regular expressions; or "fixed" for exact +matching. Defaults to "regex".} + +\item{...}{Additonal arguments passed to the tokens_compound function.} +} +\value{ +A tokens object in which the token sequences matching pattern have +been replaced by new compounded "tokens" joined by the concatenator. +} +\description{ +With large amounts of text and complex (regex) patterns you +might run into problems with memory when running quanteda's tokens_compound +function. Thus this function takes a stepwise approach. The tokens are split +into certain chunks (specified by step) and then tokens_compound is run on +these smaller chunks. See also +\url{https://github.com/quanteda/quanteda/issues/1539} and the +check_pattern_performance function. +} diff --git a/multidictR.Rproj b/multidictR.Rproj new file mode 100644 index 0000000..7f1b52b --- /dev/null +++ b/multidictR.Rproj @@ -0,0 +1,22 @@ +Version: 1.0 + +RestoreWorkspace: No +SaveWorkspace: No +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: XeLaTeX + +AutoAppendNewline: Yes +StripTrailingWhitespace: Yes +LineEndingConversion: Posix + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source +PackageRoxygenize: rd,collate,namespace