From 975a6675d369bea7fce713ad94ec7cea69378dc7 Mon Sep 17 00:00:00 2001 From: Sebastian Gibb Date: Sun, 17 Apr 2022 17:35:20 +0200 Subject: [PATCH] feat: add rosranger, smote, cv, rcv, gridsearch --- DESCRIPTION | 4 +- NAMESPACE | 8 ++ NEWS.md | 4 + R/cv.R | 75 +++++++++++ R/cv_rusranger.R | 171 ++++++++++++++++++++++---- R/gridsearch.R | 87 ++++++------- R/rosranger.R | 49 ++++++++ R/smote.R | 124 +++++++++++++++++++ R/utils-rosranger.R | 13 ++ R/utils-rusranger.R | 15 ++- man/cv.Rd | 39 ++++++ man/gridsearch.Rd | 45 +++++++ man/gs_rusranger.Rd | 2 +- man/nested_gridsearch.Rd | 74 +++++++++++ man/nrcv_rusranger.Rd | 2 +- man/rcv.Rd | 31 +++++ man/rosranger.Rd | 56 +++++++++ man/smote.Rd | 31 +++++ tests/testthat/test_smote.R | 36 ++++++ tests/testthat/test_utils-rosranger.R | 9 ++ tests/testthat/test_utils-rusranger.R | 7 +- 21 files changed, 804 insertions(+), 78 deletions(-) create mode 100644 R/cv.R create mode 100644 R/rosranger.R create mode 100644 R/smote.R create mode 100644 R/utils-rosranger.R create mode 100644 man/cv.Rd create mode 100644 man/gridsearch.Rd create mode 100644 man/nested_gridsearch.Rd create mode 100644 man/rcv.Rd create mode 100644 man/rosranger.Rd create mode 100644 man/smote.Rd create mode 100644 tests/testthat/test_smote.R create mode 100644 tests/testthat/test_utils-rosranger.R diff --git a/DESCRIPTION b/DESCRIPTION index a29cb7e..4b1840b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: rusranger Title: Modified ranger implementation to support random-under-sampling -Version: 0.0.2 -Date: 2022-03-24 +Version: 0.0.3 +Date: 2022-04-17 Description: The random forest implementation of the ranger package is modified to support random-under-sampling. Additional helper functions for diff --git a/NAMESPACE b/NAMESPACE index df39108..d684454 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,16 +1,24 @@ # Generated by roxygen2: do not edit by hand +export(cv) export(cv_rusranger) +export(gridsearch) export(gs_rusranger) +export(nested_gridsearch) export(nrcv_rusranger) +export(rcv) export(rcv_rusranger) +export(rosranger) export(rusranger) +export(smote) import(future) import(ranger) importFrom(ROCR,performance) importFrom(ROCR,prediction) importFrom(future.apply,future_lapply) +importFrom(stats,dist) importFrom(stats,median) importFrom(stats,predict) importFrom(stats,quantile) +importFrom(stats,runif) importFrom(stats,setNames) diff --git a/NEWS.md b/NEWS.md index cfcab17..acc2798 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,9 @@ # rusranger 0.0 +## Changes in 0.0.3 + +- Add `rosranger`, `smote`, `cv`, `rcv`, `gridsearch`. + ## Changes in 0.0.2 - Fix `.caseweights` for 0/1 binary class coding (instead of 1/2). diff --git a/R/cv.R b/R/cv.R new file mode 100644 index 0000000..4f68e27 --- /dev/null +++ b/R/cv.R @@ -0,0 +1,75 @@ +#' Cross Validation +#' +#' Runs a cross validation for train and prediction function. +#' +#' @inheritParams rusranger +#' @param FUN `function` function to optimize. +#' @param nfolds `integer(1)` number of cross validation folds. +#' @param \ldots further arguments passed to `FUN`. +#' +#' @return `double(1)` median AUC across all cross validation splits +#' +#' @note +#' The function to optimize has to accept five arguments: xtrain, ytrain, xtest, +#' ytest and \ldots. +#' +#' @import future +#' @importFrom future.apply future_lapply +#' @importFrom ROCR performance prediction +#' @export +#' @examples +#' .rusranger <- function(xtrain, ytrain, xtest, ytest, ...) { +#' rngr <- rusranger(x = xtrain, y = ytrain, ...) +#' pred <- as.numeric(predict(rngr, xtest)$predictions[, 2L]) +#' performance(prediction(pred, ytest), measure = "auc")@y.values[[1L]] +#' } +#' cv(iris[-5], as.numeric(iris$Species == "versicolor"), .rusranger, nfolds = 3) +cv <- function(x, y, FUN, nfolds = 5, ...) { + folds <- .bfolds(y, nfolds = nfolds) + xl <- split(x, folds) + yl <- split(y, folds) + r <- unlist(future.apply::future_lapply( + seq_len(nfolds), + function(i) { + xtrain <- do.call(rbind, xl[-i]) + xtest <- xl[[i]] + ytrain <- do.call(c, yl[-i]) + ytest <- yl[[i]] + do.call( + FUN, + list( + xtrain = xtrain, ytrain = ytrain, + xtest = xtest, ytest = ytest, + ... + ) + ) + }, + future.seed = TRUE + )) + median(r) +} + +#' Repeated Cross Validation +#' +#' Runs a repeated cross validation for a function. +#' See also [`cv()`]. +#' +#' @inheritParams cv +#' @param nrepcv `integer(1)` number of repeats. +#' @param \ldots further arguments passed to `FUN`. +#' +#' @return `double(5)`, minimal, 25 % quartiel, median, 75 % quartile and +#' maximal results across the repeated cross validations. +#' @importFrom stats median predict quantile setNames +#' @export +rcv <- function(x, y, nfolds = 5, nrepcv = 2, FUN, ...) { + FUN <- match.fun(FUN) + r <- unlist(future.apply::future_lapply( + seq_len(nrepcv), function(i) + cv(x = x, y = y, nfolds = nfolds, FUN = FUN, ...), + future.seed = TRUE + )) + setNames( + quantile(r, names = FALSE), c("Min", "Q1", "Median", "Q3", "Max") + ) +} diff --git a/R/cv_rusranger.R b/R/cv_rusranger.R index f535149..ec90351 100644 --- a/R/cv_rusranger.R +++ b/R/cv_rusranger.R @@ -13,23 +13,7 @@ #' @importFrom ROCR performance prediction #' @export cv_rusranger <- function(x, y, nfolds = 5, ...) { - folds <- .bfolds(y, nfolds = nfolds) - xl <- split(x, folds) - yl <- split(y, folds) - auc <- unlist(future.apply::future_lapply( - seq_len(nfolds), - function(i) { - xtrain <- do.call(rbind, xl[-i]) - xtest <- xl[[i]] - ytrain <- do.call(c, yl[-i]) - ytest <- yl[[i]] - rngr <- rusranger(x = xtrain, y = ytrain, ...) - pred <- as.numeric(predict(rngr, xtest)$predictions[, 2L]) - performance(prediction(pred, ytest), measure = "auc")@y.values[[1L]] - }, - future.seed = TRUE - )) - median(auc) + cv(x = x, y = y, FUN = .rusranger, nfolds = nfolds, ...) } #' Repeated Cross Validation for rusranger @@ -53,12 +37,153 @@ cv_rusranger <- function(x, y, nfolds = 5, ...) { #' nfolds = 3, nrepcv = 3 #' ) rcv_rusranger <- function(x, y, nfolds = 5, nrepcv = 2, ...) { - auc <- unlist(future.apply::future_lapply( - seq_len(nrepcv), - function(i)cv_rusranger(x = x, y = y, nfolds = nfolds, ...), + rcv(x = x, y = y, nfolds = nfolds, nrepcv = nrepcv, FUN = .rusranger, ...) +} + +#' Grid Search +#' +#' Grid search to optimise hyperparameters for `rusranger()` +#' +#' @inheritParams rusranger +#' @inheritParams rcv_rusranger +#' @param searchspace `data.frame`, hyperparameters to tune. Column names have +#' to match the argument names of [`ranger()`]/[`rusranger()`]. +#' @param \ldots further arguments passed to [`rcv_rusranger()`]. +#' @return `data.frame` with tested hyperparameters and AUCs +#' @export +#' @examples +#' iris <- subset(iris, Species != "setosa") +#' searchspace <- expand.grid( +#' mtry = c(2, 3), +#' num.trees = c(500, 1000) +#' ) +#' ## nfolds and nrepcv are too low for real world applications, and are just +#' ## used for demonstration and to keep the run time of the examples low +#' gs_rusranger( +#' iris[-5], as.numeric(iris$Species == "versicolor"), +#' searchspace = searchspace, nfolds = 3, nrepcv = 1 +#' ) +gs_rusranger <- function(x, y, searchspace, nfolds = 5, nrepcv = 2, ...) { + gridsearch( + x = x, y = y, searchspace = searchspace, + FUN = .rusranger, nfolds = nfolds, nrepcv = nrepcv, ... + ) +} + +#' helper function to provide cv_rusranger/rcv_rusranger for backwards +#' compatibility +#' +#' @noRd +.rusranger <- function(xtrain, ytrain, xtest, ytest, ...) { + rngr <- rusranger(x = xtrain, y = ytrain, ...) + pred <- as.numeric(predict(rngr, xtest)$predictions[, 2L]) + performance(prediction(pred, ytest), measure = "auc")@y.values[[1L]] +} + +#' Nested Cross Validation for Hyperparameter Search +#' +#' Run a grid search in a nested cross validation. +#' +#' @note +#' The reported performance could slightly differ from the median performance +#' in the reported gridsearch. After the gridsearch `rusranger` is trained again +#' with the best hyperparameters which results in a new subsampling. +#' +#' @inheritParams gs_rusranger +#' @param nouterfolds `integer(1)`, number of outer cross validation folds. +#' @param ninnerfolds `integer(1)`, number of inner cross validation folds. +#' @param nrepcv `integer(1)`, number repeats of inner cross validations. +#' @param \ldots further arguments passed to [`gs_rusranger()`]. +#' @return `list`, with an element per `nouterfolds` containing the following +#' subelements: +#' * model selected `ranger` model. +#' * indextrain index of the used training items. +#' * indextest index of the used test items. +#' * prediction predictions results. +#' * truth original labels/classes. +#' * performance resulting performance (AUC). +#' * selectedparams select hyperparameters. +#' * gridsearch `data.frame`, results of the grid search. +#' * nouterfolds `integer(1)`. +#' * ninnerfolds `integer(1)`. +#' * nrepcv `integer(1)`. +#' @export +#' @examples +#' set.seed(20220324) +#' iris <- subset(iris, Species != "setosa") +#' searchspace <- expand.grid( +#' mtry = c(2, 3), +#' num.trees = c(500, 1000) +#' ) +#' ## n(outer|inner) folds and nrepcv are too low for real world applications, +#' ## and are just used for demonstration and to keep the run time of the examples +#' ## low +#' nrcv_rusranger( +#' iris[-5], as.numeric(iris$Species == "versicolor"), +#' searchspace = searchspace, nouterfolds = 3, ninnerfolds = 3, nrepcv = 1 +#' ) +nrcv_rusranger <- function(x, y, searchspace, + nouterfolds = 5, ninnerfolds = 5, nrepcv = 2, + ...) { + + folds <- .bfolds(y, nfolds = nouterfolds) + xl <- split(x, folds) + yl <- split(y, folds) + indices <- split(seq_along(y), folds) + + nrcv <- future.apply::future_lapply( + seq_len(nouterfolds), + function(i) { + xtrain <- do.call(rbind, xl[-i]) + xtest <- xl[[i]] + ytrain <- do.call(c, yl[-i]) + ytest <- yl[[i]] + + gs <- gs_rusranger( + xtrain, ytrain, searchspace, + nfolds = ninnerfolds, nrepcv = nrepcv, ... + ) + + top <- which.max(gs$Median) + selparms <- + gs[top, + !colnames(gs) %in% c("Min", "Q1", "Median", "Q3", "Max"), + drop = FALSE + ] + + ## additional call of an already calculated tree, ... + ## could be avoided if we would store the results of the trees + ## but this would take alot of memory + ## this could slightly change the results because of new + ## resampling + rngr <- do.call( + rusranger, + c( + list( + x = xtrain, + y = ytrain + ), list(...), selparms + ) + ) + pred <- as.numeric(predict(rngr, xtest)$predictions[, 2L]) + + list( + model = rngr, + indextrain = unlist(indices[-i]), + indextest = unlist(indices[i]), + prediction = pred, + truth = ytest, + performance = performance( + prediction(pred, ytest), measure = "auc" + )@y.values[[1L]], + selectedparams = selparms, + gridsearch = gs, + nouterfolds = nouterfolds, + ninnerfolds = ninnerfolds, + nrepcv = nrepcv + ) + }, future.seed = TRUE - )) - setNames( - quantile(auc, names = FALSE), c("Min", "Q1", "Median", "Q3", "Max") ) + nrcv } diff --git a/R/gridsearch.R b/R/gridsearch.R index af29af3..5a17ea6 100644 --- a/R/gridsearch.R +++ b/R/gridsearch.R @@ -1,13 +1,13 @@ #' Grid Search #' -#' Grid search to optimise hyperparameters for `rusranger()` +#' Grid search to optimise hyperparameters for `FUN` #' -#' @inheritParams rusranger -#' @inheritParams rcv_rusranger +#' @inheritParams cv +#' @inheritParams rcv #' @param searchspace `data.frame`, hyperparameters to tune. Column names have -#' to match the argument names of [`ranger()`]/[`rusranger()`]. -#' @param \ldots further arguments passed to [`rcv_rusranger()`]. -#' @return `data.frame` with tested hyperparameters and AUCs +#' to match the argument names of `FUN`. +#' @param \ldots further arguments passed to `FUN` +#' @return `data.frame` with tested hyperparameters and metric #' @export #' @examples #' iris <- subset(iris, Species != "setosa") @@ -21,21 +21,25 @@ #' iris[-5], as.numeric(iris$Species == "versicolor"), #' searchspace = searchspace, nfolds = 3, nrepcv = 1 #' ) -gs_rusranger <- function(x, y, searchspace, nfolds = 5, nrepcv = 2, ...) { - auc <- future.apply::future_lapply( +gridsearch <- function(x, y, searchspace, FUN, nfolds = 5, nrepcv = 2, ...) { + r <- future.apply::future_lapply( seq_len(nrow(searchspace)), function(i) { do.call( - rcv_rusranger, + rcv, c( - list(x = x, y = y, nfolds = nfolds, nrepcv = nrepcv), - list(...), searchspace[i, ] + list( + x = x, y = y, FUN = FUN, + nfolds = nfolds, nrepcv = nrepcv + ), + list(...), + searchspace[i, ] ) ) }, future.seed = TRUE ) - cbind.data.frame(searchspace, do.call(rbind, auc)) + cbind.data.frame(searchspace, do.call(rbind, r)) } #' Nested Cross Validation for Hyperparameter Search @@ -44,21 +48,20 @@ gs_rusranger <- function(x, y, searchspace, nfolds = 5, nrepcv = 2, ...) { #' #' @note #' The reported performance could slightly differ from the median performance -#' in the reported gridsearch. After the gridsearch `rusranger` is trained again +#' in the reported gridsearch. After the gridsearch `FUN` is trained again #' with the best hyperparameters which results in a new subsampling. #' -#' @inheritParams gs_rusranger +#' @inheritParams cv +#' @inheritParams rcv +#' @inheritParams gridsearch #' @param nouterfolds `integer(1)`, number of outer cross validation folds. #' @param ninnerfolds `integer(1)`, number of inner cross validation folds. #' @param nrepcv `integer(1)`, number repeats of inner cross validations. #' @param \ldots further arguments passed to [`gs_rusranger()`]. #' @return `list`, with an element per `nouterfolds` containing the following #' subelements: -#' * model selected `ranger` model. #' * indextrain index of the used training items. #' * indextest index of the used test items. -#' * prediction predictions results. -#' * truth original labels/classes. #' * performance resulting performance (AUC). #' * selectedparams select hyperparameters. #' * gridsearch `data.frame`, results of the grid search. @@ -80,16 +83,15 @@ gs_rusranger <- function(x, y, searchspace, nfolds = 5, nrepcv = 2, ...) { #' iris[-5], as.numeric(iris$Species == "versicolor"), #' searchspace = searchspace, nouterfolds = 3, ninnerfolds = 3, nrepcv = 1 #' ) -nrcv_rusranger <- function(x, y, searchspace, - nouterfolds = 5, ninnerfolds = 5, nrepcv = 2, - ...) { - +nested_gridsearch <- function(x, y, searchspace, FUN, + nouterfolds = 5, ninnerfolds = 5, nrepcv = 2, + ...) { folds <- .bfolds(y, nfolds = nouterfolds) xl <- split(x, folds) yl <- split(y, folds) indices <- split(seq_along(y), folds) - nrcv <- future.apply::future_lapply( + future.apply::future_lapply( seq_len(nouterfolds), function(i) { xtrain <- do.call(rbind, xl[-i]) @@ -97,45 +99,35 @@ nrcv_rusranger <- function(x, y, searchspace, ytrain <- do.call(c, yl[-i]) ytest <- yl[[i]] - gs <- gs_rusranger( - xtrain, ytrain, searchspace, - nfolds = ninnerfolds, nrepcv = nrepcv, ... + g <- gridsearch( + x = xtrain, y = ytrain, searchspace = searchspace, + FUN = FUN, nfolds = ninnerfolds, nrepcv = nrepcv, ... ) - top <- which.max(gs$Median) + top <- which.max(g$Median) selparms <- - gs[top, - !colnames(gs) %in% c("Min", "Q1", "Median", "Q3", "Max"), + g[top, + !colnames(g) %in% c("Min", "Q1", "Median", "Q3", "Max"), drop = FALSE ] - ## additional call of an already calculated tree, ... - ## could be avoided if we would store the results of the trees - ## but this would take alot of memory - ## this could slightly change the results because of new - ## resampling - rngr <- do.call( - rusranger, + r <- do.call( + FUN, c( - list( - x = xtrain, - y = ytrain - ), list(...), selparms + list( + xtrain = xtrain, ytrain = ytrain, + xtest = xtest, ytest = ytest, + ... + ), selparms ) ) - pred <- as.numeric(predict(rngr, xtest)$predictions[, 2L]) list( - model = rngr, indextrain = unlist(indices[-i]), indextest = unlist(indices[i]), - prediction = pred, - truth = ytest, - performance = performance( - prediction(pred, ytest), measure = "auc" - )@y.values[[1L]], + performance = r, selectedparams = selparms, - gridsearch = gs, + gridsearch = g, nouterfolds = nouterfolds, ninnerfolds = ninnerfolds, nrepcv = nrepcv @@ -143,5 +135,4 @@ nrcv_rusranger <- function(x, y, searchspace, }, future.seed = TRUE ) - nrcv } diff --git a/R/rosranger.R b/R/rosranger.R new file mode 100644 index 0000000..1fbac15 --- /dev/null +++ b/R/rosranger.R @@ -0,0 +1,49 @@ +#' ROS ranger +#' +#' Adapted default settings to the [`ranger()`] to support +#' random-over-sampling. Additionally the default settings are modified to the +#' most common classification settings used in the AMPEL project. +#' +#' @details +#' In contrast to [`ranger()`] `rusranger()` currently just supports binary +#' classifications. +#' +#' @param x `matrix`/`data.frame`, feature matrix, see [`ranger()`] for +#' details. +#' @param y `numeric`/`factor`, classification labels, see [`ranger()`] for +#' details. +#' @param probability `logical(1)`, grow probability trees, see [`ranger()`] +#' for details. +#' @param classification `logical(1)`, run classification even if `y` is +#' `numeric`, see [`ranger()`] for details. +#' @param min.node.size, same as in [`ranger()`] +#' @param ndups `numeric(1)`, times of duplication of minority class. +#' @param \ldots further arguments passed to [`ranger()`]. +#' +#' @return `ranger` object, see [`ranger()`] for details. +#' +#' @seealso +#' [`ranger()`] +#' +#' @references +#' *AMPEL* project: +#' Analysis and Reporting System for the Improvement of Patient Safety +#' through Real-Time Integration of Laboratory Findings, +#' https://ampel.care. +#' +#' @import ranger +#' @export +rosranger <- function(x, y, probability = TRUE, classification = !probability, + min.node.size = if (probability) 10 else 1, ndups = 1, + ...) { + ranger( + x = as.data.frame(x), y = y, + probability = probability, + classification = classification, + min.node.size = min.node.size, + replace = TRUE, + case.weights = .roscaseweights(y, ndups), + ..., + keep.inbag = FALSE + ) +} diff --git a/R/smote.R b/R/smote.R new file mode 100644 index 0000000..3333cf7 --- /dev/null +++ b/R/smote.R @@ -0,0 +1,124 @@ +#' SMOTE +#' +#' SMOTE: Synthetic Minority Over-sampling Technique. +#' +#' @param x `matrix`, features +#' @param y `factor`, class (length has to be equal to number of rows in `x`). +#' @param k `numeric(1)`, number of nearest neighbours. +#' @param ndups `numeric(1)`, number of synthetic duplicates to upsample +#' minority. Default is `0` which tries to upsample to majority. +#' @return `list` with elements `x` features with SMOTEd samples and `y` with +#' added minority class elements. +#' +#' @importFrom stats dist runif +#' @export +#' +#' @references +#' Chawla, Bowyer, Hall and Kegelmeyer. 2002. +#' SMOTE: Synthetic Minority Over-sampling Technique. +#' Journal of Artificial Intelligence Research 16. 321–357. +#' \doi{10.1613/jair.953} +smote <- function(x, y, k = 5L, ndups = 0L) { + if (nrow(x) != length(y)) + stop("nrow(x) has to be equal to length(y).") + my <- .minority(y) + sel <- y == my + if (!ndups) + ndups <- .ndups(nrow(x), .nmin(y)) + sm <- .smote(x[sel,, drop = FALSE], k = k, ndups = ndups) + x <- rbind(x, sm) + rownames(x) <- NULL + list(x = x, y = c(y, rep.int(my, nrow(sm)))) +} + +.smote <- function(x, k = 5L, ndups = 1L) { + if (!is.matrix(x)) + stop("'x' has to be a matrix.") + if (!length(k) == 1 || !is.integer(k) || k < 2L) + stop("'k' has to be an integer of length 1 and greater or equal 2.") + if (!length(ndups) == 1 || !is.integer(ndups) || ndups < 1L) + stop("'ndups' has to be an integer of length 1 and greater 0.") + nr <- nrow(x) + nc <- ncol(x) + iknn <- .knn(x, k) + s <- matrix(, nrow = nr * ndups, ncol = nc) + sdups <- seq_len(ndups) + + for (i in seq_len(nr)) { + ir <- rep.int(i, ndups) + # difference to nearest random nearest neighbour + j <- iknn[i, sample.int(k, ndups, replace = TRUE)] + d <- x[ir, ] - x[j, , drop = FALSE] + # difference multiplied with random gap in range 0..1 + s[(i - 1L) * ndups + sdups,] <- x[ir, ] + runif(ndups) * d + } + s +} + +#' K-nearest neighbour algrorithm +#' +#' Simple (not fastest) implementation of knn +#' +#' @param x `matrix` +#' @param k `integer(1)`, number of nearest neighbours +#' @return `matrix`, `nrow(x)` times `k`, +#' for each row in `x` the row indices of the nearest neighbours. +#' @noRd +.knn <- function(x, k = 5) { + d <- dist(x, method = "euclidean") + nr <- nrow(x) + m <- matrix(, nrow = nr, ncol = k) + k <- seq_len(k) + for (i in seq_len(nrow(x))) { + m[i, ] <- order(.neighbours(d, i))[k] + } + m +} + +#' Subsetting dist +#' +#' Return a column/all neighbours/competitors of an element of a `dist` without +#' turning the `dist` object into a `matrix` before. +#' +#' @param d `dist` +#' @param j `integer`, column index. +#' @return column/neighbour/competitor values. +#' @noRd +.neighbours <- function(d, j) { + if (!inherits(d, "dist")) + stop("'d' is not a 'dist' object.") + if (length(j) != 1L || !is.integer(j)) + stop("'j' has to be an integer of length 1.") + n <- attr(d, "Size") + i <- seq_len(n)[-j] + k <- ifelse(i < j, + n * (i - 1L) - i * (i - 1L) / 2L + j - i, + (2 * n - j) * (j - 1) / 2 + i - j + ) + d[k] +} + +#' Calculate SMOTE duplicates +#' +#' Calculate SMOTE duplicates to upsample to majority class +#' +#' @param n `numeric(1)`, total number of samples. +#' @param nmin `numeric(1)`, number of minority samples. +#' @return `numeric(1)`, number of duplicates necessary to upsample to majority +#' class. +#' @noRd +.ndups <- function(n, nmin) { + as.integer(floor((n - 2L * nmin) / nmin)) +} + +#' Find minority class +#' +#' @param y binary class vector. +#' @return minority class +#' @noRd +.minority <- function(y) { + tbl <- table(y) + r <- names(if (tbl[1] > tbl[2]) tbl[2] else tbl[1]) + mode(r) <- mode(y) + r +} diff --git a/R/utils-rosranger.R b/R/utils-rosranger.R new file mode 100644 index 0000000..585f9a5 --- /dev/null +++ b/R/utils-rosranger.R @@ -0,0 +1,13 @@ +#' Case weights for ROS +#' +#' @param y `factor`/`numeric`, classes +#' @param ndups `numeric(1)`, times of duplication of minority class. +#' @return `double(length(y))` with case weights +#' @noRd +.roscaseweights <- function(y, ndups) { + w <- rep_len(1, length(y)) + my <- .minority(y) + nm <- .nmin(y) + w[y == my] <- (length(y) + nm * ndups) / nm + w +} diff --git a/R/utils-rusranger.R b/R/utils-rusranger.R index 3f0b1bf..d9c0723 100644 --- a/R/utils-rusranger.R +++ b/R/utils-rusranger.R @@ -42,6 +42,18 @@ } } +#' N minority +#' +#' Find number of minority class samples. +#' +#' @param y vector of binary class labels +#' @return `integer(1)`, number of minority class samples +#' @noRd +.nmin <- function(y) { + tbl <- table(y) + as.vector(if (tbl[1] > tbl[2]) tbl[2] else tbl[1]) +} + #' Sample fraction #' #' Calculate sample fraction for 50:50 selection (two times the minority class) @@ -50,6 +62,5 @@ #' @return `double(1)`, sample fraction #' @noRd .samplefraction <- function(y) { - tbl <- table(y) - as.vector(2L * if (tbl[1] > tbl[2]) tbl[2] else tbl[1]) / length(y) + 2L * .nmin(y) / length(y) } diff --git a/man/cv.Rd b/man/cv.Rd new file mode 100644 index 0000000..c0e0a6b --- /dev/null +++ b/man/cv.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cv.R +\name{cv} +\alias{cv} +\title{Cross Validation} +\usage{ +cv(x, y, FUN, nfolds = 5, ...) +} +\arguments{ +\item{x}{\code{matrix}/\code{data.frame}, feature matrix, see \code{\link[=ranger]{ranger()}} for +details.} + +\item{y}{\code{numeric}/\code{factor}, classification labels, see \code{\link[=ranger]{ranger()}} for +details.} + +\item{FUN}{\code{function} function to optimize.} + +\item{nfolds}{\code{integer(1)} number of cross validation folds.} + +\item{\ldots}{further arguments passed to \code{FUN}.} +} +\value{ +\code{double(1)} median AUC across all cross validation splits +} +\description{ +Runs a cross validation for train and prediction function. +} +\note{ +The function to optimize has to accept five arguments: xtrain, ytrain, xtest, +ytest and \ldots. +} +\examples{ +.rusranger <- function(xtrain, ytrain, xtest, ytest, ...) { + rngr <- rusranger(x = xtrain, y = ytrain, ...) + pred <- as.numeric(predict(rngr, xtest)$predictions[, 2L]) + performance(prediction(pred, ytest), measure = "auc")@y.values[[1L]] +} +cv(iris[-5], as.numeric(iris$Species == "versicolor"), .rusranger, nfolds = 3) +} diff --git a/man/gridsearch.Rd b/man/gridsearch.Rd new file mode 100644 index 0000000..9f39aec --- /dev/null +++ b/man/gridsearch.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gridsearch.R +\name{gridsearch} +\alias{gridsearch} +\title{Grid Search} +\usage{ +gridsearch(x, y, searchspace, FUN, nfolds = 5, nrepcv = 2, ...) +} +\arguments{ +\item{x}{\code{matrix}/\code{data.frame}, feature matrix, see \code{\link[=ranger]{ranger()}} for +details.} + +\item{y}{\code{numeric}/\code{factor}, classification labels, see \code{\link[=ranger]{ranger()}} for +details.} + +\item{searchspace}{\code{data.frame}, hyperparameters to tune. Column names have +to match the argument names of \code{FUN}.} + +\item{FUN}{\code{function} function to optimize.} + +\item{nfolds}{\code{integer(1)} number of cross validation folds.} + +\item{nrepcv}{\code{integer(1)} number of repeats.} + +\item{\ldots}{further arguments passed to \code{FUN}} +} +\value{ +\code{data.frame} with tested hyperparameters and metric +} +\description{ +Grid search to optimise hyperparameters for \code{FUN} +} +\examples{ +iris <- subset(iris, Species != "setosa") +searchspace <- expand.grid( + mtry = c(2, 3), + num.trees = c(500, 1000) +) +## nfolds and nrepcv are too low for real world applications, and are just +## used for demonstration and to keep the run time of the examples low +gs_rusranger( + iris[-5], as.numeric(iris$Species == "versicolor"), + searchspace = searchspace, nfolds = 3, nrepcv = 1 +) +} diff --git a/man/gs_rusranger.Rd b/man/gs_rusranger.Rd index ab7aad2..3b064ab 100644 --- a/man/gs_rusranger.Rd +++ b/man/gs_rusranger.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/gridsearch.R +% Please edit documentation in R/cv_rusranger.R \name{gs_rusranger} \alias{gs_rusranger} \title{Grid Search} diff --git a/man/nested_gridsearch.Rd b/man/nested_gridsearch.Rd new file mode 100644 index 0000000..f238251 --- /dev/null +++ b/man/nested_gridsearch.Rd @@ -0,0 +1,74 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gridsearch.R +\name{nested_gridsearch} +\alias{nested_gridsearch} +\title{Nested Cross Validation for Hyperparameter Search} +\usage{ +nested_gridsearch( + x, + y, + searchspace, + FUN, + nouterfolds = 5, + ninnerfolds = 5, + nrepcv = 2, + ... +) +} +\arguments{ +\item{x}{\code{matrix}/\code{data.frame}, feature matrix, see \code{\link[=ranger]{ranger()}} for +details.} + +\item{y}{\code{numeric}/\code{factor}, classification labels, see \code{\link[=ranger]{ranger()}} for +details.} + +\item{searchspace}{\code{data.frame}, hyperparameters to tune. Column names have +to match the argument names of \code{FUN}.} + +\item{FUN}{\code{function} function to optimize.} + +\item{nouterfolds}{\code{integer(1)}, number of outer cross validation folds.} + +\item{ninnerfolds}{\code{integer(1)}, number of inner cross validation folds.} + +\item{nrepcv}{\code{integer(1)}, number repeats of inner cross validations.} + +\item{\ldots}{further arguments passed to \code{\link[=gs_rusranger]{gs_rusranger()}}.} +} +\value{ +\code{list}, with an element per \code{nouterfolds} containing the following +subelements: +\itemize{ +\item indextrain index of the used training items. +\item indextest index of the used test items. +\item performance resulting performance (AUC). +\item selectedparams select hyperparameters. +\item gridsearch \code{data.frame}, results of the grid search. +\item nouterfolds \code{integer(1)}. +\item ninnerfolds \code{integer(1)}. +\item nrepcv \code{integer(1)}. +} +} +\description{ +Run a grid search in a nested cross validation. +} +\note{ +The reported performance could slightly differ from the median performance +in the reported gridsearch. After the gridsearch \code{FUN} is trained again +with the best hyperparameters which results in a new subsampling. +} +\examples{ +set.seed(20220324) +iris <- subset(iris, Species != "setosa") +searchspace <- expand.grid( + mtry = c(2, 3), + num.trees = c(500, 1000) +) +## n(outer|inner) folds and nrepcv are too low for real world applications, +## and are just used for demonstration and to keep the run time of the examples +## low +nrcv_rusranger( + iris[-5], as.numeric(iris$Species == "versicolor"), + searchspace = searchspace, nouterfolds = 3, ninnerfolds = 3, nrepcv = 1 +) +} diff --git a/man/nrcv_rusranger.Rd b/man/nrcv_rusranger.Rd index 46735a6..084fb07 100644 --- a/man/nrcv_rusranger.Rd +++ b/man/nrcv_rusranger.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/gridsearch.R +% Please edit documentation in R/cv_rusranger.R \name{nrcv_rusranger} \alias{nrcv_rusranger} \title{Nested Cross Validation for Hyperparameter Search} diff --git a/man/rcv.Rd b/man/rcv.Rd new file mode 100644 index 0000000..4aa6d0f --- /dev/null +++ b/man/rcv.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cv.R +\name{rcv} +\alias{rcv} +\title{Repeated Cross Validation} +\usage{ +rcv(x, y, nfolds = 5, nrepcv = 2, FUN, ...) +} +\arguments{ +\item{x}{\code{matrix}/\code{data.frame}, feature matrix, see \code{\link[=ranger]{ranger()}} for +details.} + +\item{y}{\code{numeric}/\code{factor}, classification labels, see \code{\link[=ranger]{ranger()}} for +details.} + +\item{nfolds}{\code{integer(1)} number of cross validation folds.} + +\item{nrepcv}{\code{integer(1)} number of repeats.} + +\item{FUN}{\code{function} function to optimize.} + +\item{\ldots}{further arguments passed to \code{FUN}.} +} +\value{ +\code{double(5)}, minimal, 25 \% quartiel, median, 75 \% quartile and +maximal results across the repeated cross validations. +} +\description{ +Runs a repeated cross validation for a function. +See also \code{\link[=cv]{cv()}}. +} diff --git a/man/rosranger.Rd b/man/rosranger.Rd new file mode 100644 index 0000000..9f8e22b --- /dev/null +++ b/man/rosranger.Rd @@ -0,0 +1,56 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rosranger.R +\name{rosranger} +\alias{rosranger} +\title{ROS ranger} +\usage{ +rosranger( + x, + y, + probability = TRUE, + classification = !probability, + min.node.size = if (probability) 10 else 1, + ndups = 1, + ... +) +} +\arguments{ +\item{x}{\code{matrix}/\code{data.frame}, feature matrix, see \code{\link[=ranger]{ranger()}} for +details.} + +\item{y}{\code{numeric}/\code{factor}, classification labels, see \code{\link[=ranger]{ranger()}} for +details.} + +\item{probability}{\code{logical(1)}, grow probability trees, see \code{\link[=ranger]{ranger()}} +for details.} + +\item{classification}{\code{logical(1)}, run classification even if \code{y} is +\code{numeric}, see \code{\link[=ranger]{ranger()}} for details.} + +\item{min.node.size, }{same as in \code{\link[=ranger]{ranger()}}} + +\item{ndups}{\code{numeric(1)}, times of duplication of minority class.} + +\item{\ldots}{further arguments passed to \code{\link[=ranger]{ranger()}}.} +} +\value{ +\code{ranger} object, see \code{\link[=ranger]{ranger()}} for details. +} +\description{ +Adapted default settings to the \code{\link[=ranger]{ranger()}} to support +random-over-sampling. Additionally the default settings are modified to the +most common classification settings used in the AMPEL project. +} +\details{ +In contrast to \code{\link[=ranger]{ranger()}} \code{rusranger()} currently just supports binary +classifications. +} +\references{ +\emph{AMPEL} project: +Analysis and Reporting System for the Improvement of Patient Safety +through Real-Time Integration of Laboratory Findings, +https://ampel.care. +} +\seealso{ +\code{\link[=ranger]{ranger()}} +} diff --git a/man/smote.Rd b/man/smote.Rd new file mode 100644 index 0000000..c8c796b --- /dev/null +++ b/man/smote.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/smote.R +\name{smote} +\alias{smote} +\title{SMOTE} +\usage{ +smote(x, y, k = 5L, ndups = 0L) +} +\arguments{ +\item{x}{\code{matrix}, features} + +\item{y}{\code{factor}, class (length has to be equal to number of rows in \code{x}).} + +\item{k}{\code{numeric(1)}, number of nearest neighbours.} + +\item{ndups}{\code{numeric(1)}, number of synthetic duplicates to upsample +minority. Default is \code{0} which tries to upsample to majority.} +} +\value{ +\code{list} with elements \code{x} features with SMOTEd samples and \code{y} with +added minority class elements. +} +\description{ +SMOTE: Synthetic Minority Over-sampling Technique. +} +\references{ +Chawla, Bowyer, Hall and Kegelmeyer. 2002. +SMOTE: Synthetic Minority Over-sampling Technique. +Journal of Artificial Intelligence Research 16. 321–357. +\doi{10.1613/jair.953} +} diff --git a/tests/testthat/test_smote.R b/tests/testthat/test_smote.R new file mode 100644 index 0000000..4c67b11 --- /dev/null +++ b/tests/testthat/test_smote.R @@ -0,0 +1,36 @@ +test_that("smote", { + x <- matrix(1:8, nrow = 4) + expect_error(smote(x, 1), "length") +}) + +test_that(".smote", { + expect_error(.smote(data.frame(a = 1:2)), "matrix") + x <- matrix(1:8, nrow = 4) + expect_error(.smote(x, k = 1:2), "length") + expect_error(.smote(x, k = 1.2), "integer") + expect_error(.smote(x, k = 1L), "2") + expect_error(.smote(x, ndups = 1:2), "length") + expect_error(.smote(x, ndups = 1.2), "integer") + expect_error(.smote(x, ndups = 0L), "0") + expect_identical(nrow(.smote(x, k = 2L, ndups = 1L)), 4L) + expect_identical(nrow(.smote(x, k = 2L, ndups = 2L)), 8L) +}) + +test_that(".neighbours", { + expect_error(.neighbours(1:10), "dist") + d <- dist(1:9) + expect_error(.neighbours(d, 1:2), "length") + expect_error(.neighbours(d, 1.2), "integer") + expect_equal(.neighbours(d, 3L), c(2:1, 1:6)) + expect_equal(.neighbours(d, 5L), c(4:1, 1:4)) +}) + +test_that(".ndups", { + expect_equal(.ndups(100, 20), 3) + expect_equal(.ndups(100, 10), 8) +}) + +test_that(".minority", { + expect_equal(.minority(rep(1:2, c(4, 2))), 2) + expect_equal(.minority(rep(c("A", "B"), c(2, 5))), "A") +}) diff --git a/tests/testthat/test_utils-rosranger.R b/tests/testthat/test_utils-rosranger.R new file mode 100644 index 0000000..66b1d2d --- /dev/null +++ b/tests/testthat/test_utils-rosranger.R @@ -0,0 +1,9 @@ +test_that(".caseweights", { + cl <- rep(1:2, c(2, 4)) + expect_equal( + .roscaseweights(cl, 2), rep(c(5, 1), c(2, 4)) + ) + expect_equal( + .roscaseweights(cl, 4), rep(c(7, 1), c(2, 4)) + ) +}) diff --git a/tests/testthat/test_utils-rusranger.R b/tests/testthat/test_utils-rusranger.R index e3117f7..4735823 100644 --- a/tests/testthat/test_utils-rusranger.R +++ b/tests/testthat/test_utils-rusranger.R @@ -27,10 +27,15 @@ test_that(".caseweights", { cl <- rep(0:1, c(2, 4)) expect_equal( unname(.caseweights(cl, replace = FALSE)), - unname(.caseweights(cl + 1, replace = FALSE)), + unname(.caseweights(cl + 1, replace = FALSE)) ) }) +test_that(".nmin", { + expect_equal(.nmin(rep(1:2, c(2, 4))), 2) + expect_equal(.nmin(rep(1:2, c(7, 4))), 4) +}) + test_that(".samplefraction", { cl <- rep(1:2, c(2, 4)) expect_equal(.samplefraction(cl), 2/3)