From 975a6675d369bea7fce713ad94ec7cea69378dc7 Mon Sep 17 00:00:00 2001
From: Sebastian Gibb <mail@sebastiangibb.de>
Date: Sun, 17 Apr 2022 17:35:20 +0200
Subject: [PATCH] feat: add rosranger, smote, cv, rcv, gridsearch

---
 DESCRIPTION                           |   4 +-
 NAMESPACE                             |   8 ++
 NEWS.md                               |   4 +
 R/cv.R                                |  75 +++++++++++
 R/cv_rusranger.R                      | 171 ++++++++++++++++++++++----
 R/gridsearch.R                        |  87 ++++++-------
 R/rosranger.R                         |  49 ++++++++
 R/smote.R                             | 124 +++++++++++++++++++
 R/utils-rosranger.R                   |  13 ++
 R/utils-rusranger.R                   |  15 ++-
 man/cv.Rd                             |  39 ++++++
 man/gridsearch.Rd                     |  45 +++++++
 man/gs_rusranger.Rd                   |   2 +-
 man/nested_gridsearch.Rd              |  74 +++++++++++
 man/nrcv_rusranger.Rd                 |   2 +-
 man/rcv.Rd                            |  31 +++++
 man/rosranger.Rd                      |  56 +++++++++
 man/smote.Rd                          |  31 +++++
 tests/testthat/test_smote.R           |  36 ++++++
 tests/testthat/test_utils-rosranger.R |   9 ++
 tests/testthat/test_utils-rusranger.R |   7 +-
 21 files changed, 804 insertions(+), 78 deletions(-)
 create mode 100644 R/cv.R
 create mode 100644 R/rosranger.R
 create mode 100644 R/smote.R
 create mode 100644 R/utils-rosranger.R
 create mode 100644 man/cv.Rd
 create mode 100644 man/gridsearch.Rd
 create mode 100644 man/nested_gridsearch.Rd
 create mode 100644 man/rcv.Rd
 create mode 100644 man/rosranger.Rd
 create mode 100644 man/smote.Rd
 create mode 100644 tests/testthat/test_smote.R
 create mode 100644 tests/testthat/test_utils-rosranger.R

diff --git a/DESCRIPTION b/DESCRIPTION
index a29cb7e..4b1840b 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: rusranger
 Title:
     Modified ranger implementation to support random-under-sampling
-Version: 0.0.2
-Date: 2022-03-24
+Version: 0.0.3
+Date: 2022-04-17
 Description:
     The random forest implementation of the ranger package is modified to
     support random-under-sampling. Additional helper functions for
diff --git a/NAMESPACE b/NAMESPACE
index df39108..d684454 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,16 +1,24 @@
 # Generated by roxygen2: do not edit by hand
 
+export(cv)
 export(cv_rusranger)
+export(gridsearch)
 export(gs_rusranger)
+export(nested_gridsearch)
 export(nrcv_rusranger)
+export(rcv)
 export(rcv_rusranger)
+export(rosranger)
 export(rusranger)
+export(smote)
 import(future)
 import(ranger)
 importFrom(ROCR,performance)
 importFrom(ROCR,prediction)
 importFrom(future.apply,future_lapply)
+importFrom(stats,dist)
 importFrom(stats,median)
 importFrom(stats,predict)
 importFrom(stats,quantile)
+importFrom(stats,runif)
 importFrom(stats,setNames)
diff --git a/NEWS.md b/NEWS.md
index cfcab17..acc2798 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,9 @@
 # rusranger 0.0
 
+## Changes in 0.0.3
+
+- Add `rosranger`, `smote`, `cv`, `rcv`, `gridsearch`.
+
 ## Changes in 0.0.2
 
 - Fix `.caseweights` for 0/1 binary class coding (instead of 1/2).
diff --git a/R/cv.R b/R/cv.R
new file mode 100644
index 0000000..4f68e27
--- /dev/null
+++ b/R/cv.R
@@ -0,0 +1,75 @@
+#' Cross Validation
+#'
+#' Runs a cross validation for train and prediction function.
+#'
+#' @inheritParams rusranger
+#' @param FUN `function` function to optimize.
+#' @param nfolds `integer(1)` number of cross validation folds.
+#' @param \ldots further arguments passed to `FUN`.
+#'
+#' @return `double(1)` median AUC across all cross validation splits
+#'
+#' @note
+#' The function to optimize has to accept five arguments: xtrain, ytrain, xtest,
+#' ytest and \ldots.
+#'
+#' @import future
+#' @importFrom future.apply future_lapply
+#' @importFrom ROCR performance prediction
+#' @export
+#' @examples
+#' .rusranger <- function(xtrain, ytrain, xtest, ytest, ...) {
+#'     rngr <- rusranger(x = xtrain, y = ytrain, ...)
+#'     pred <- as.numeric(predict(rngr, xtest)$predictions[, 2L])
+#'     performance(prediction(pred, ytest), measure = "auc")@y.values[[1L]]
+#' }
+#' cv(iris[-5], as.numeric(iris$Species == "versicolor"), .rusranger, nfolds = 3)
+cv <- function(x, y, FUN, nfolds = 5, ...) {
+    folds <- .bfolds(y, nfolds = nfolds)
+    xl <- split(x, folds)
+    yl <- split(y, folds)
+    r <- unlist(future.apply::future_lapply(
+        seq_len(nfolds),
+        function(i) {
+            xtrain <- do.call(rbind, xl[-i])
+            xtest <- xl[[i]]
+            ytrain <- do.call(c, yl[-i])
+            ytest <- yl[[i]]
+            do.call(
+                FUN,
+                list(
+                    xtrain = xtrain, ytrain = ytrain,
+                    xtest = xtest, ytest = ytest,
+                    ...
+                )
+            )
+        },
+        future.seed = TRUE
+    ))
+    median(r)
+}
+
+#' Repeated Cross Validation
+#'
+#' Runs a repeated cross validation for a function.
+#' See also [`cv()`].
+#'
+#' @inheritParams cv
+#' @param nrepcv `integer(1)` number of repeats.
+#' @param \ldots further arguments passed to `FUN`.
+#'
+#' @return `double(5)`, minimal, 25 % quartiel, median, 75 % quartile and
+#' maximal results across the repeated cross validations.
+#' @importFrom stats median predict quantile setNames
+#' @export
+rcv <- function(x, y, nfolds = 5, nrepcv = 2, FUN, ...) {
+    FUN <- match.fun(FUN)
+    r <- unlist(future.apply::future_lapply(
+        seq_len(nrepcv), function(i)
+            cv(x = x, y = y, nfolds = nfolds, FUN = FUN, ...),
+        future.seed = TRUE
+    ))
+    setNames(
+        quantile(r, names = FALSE), c("Min", "Q1", "Median", "Q3", "Max")
+    )
+}
diff --git a/R/cv_rusranger.R b/R/cv_rusranger.R
index f535149..ec90351 100644
--- a/R/cv_rusranger.R
+++ b/R/cv_rusranger.R
@@ -13,23 +13,7 @@
 #' @importFrom ROCR performance prediction
 #' @export
 cv_rusranger <- function(x, y, nfolds = 5, ...) {
-    folds <- .bfolds(y, nfolds = nfolds)
-    xl <- split(x, folds)
-    yl <- split(y, folds)
-    auc <- unlist(future.apply::future_lapply(
-        seq_len(nfolds),
-        function(i) {
-            xtrain <- do.call(rbind, xl[-i])
-            xtest <- xl[[i]]
-            ytrain <- do.call(c, yl[-i])
-            ytest <- yl[[i]]
-            rngr <- rusranger(x = xtrain, y = ytrain, ...)
-            pred <- as.numeric(predict(rngr, xtest)$predictions[, 2L])
-            performance(prediction(pred, ytest), measure = "auc")@y.values[[1L]]
-        },
-        future.seed = TRUE
-    ))
-    median(auc)
+    cv(x = x, y = y, FUN = .rusranger, nfolds = nfolds, ...)
 }
 
 #' Repeated Cross Validation for rusranger
@@ -53,12 +37,153 @@ cv_rusranger <- function(x, y, nfolds = 5, ...) {
 #'     nfolds = 3, nrepcv = 3
 #' )
 rcv_rusranger <- function(x, y, nfolds = 5, nrepcv = 2, ...) {
-    auc <- unlist(future.apply::future_lapply(
-        seq_len(nrepcv),
-        function(i)cv_rusranger(x = x, y = y, nfolds = nfolds, ...),
+    rcv(x = x, y = y, nfolds = nfolds, nrepcv = nrepcv, FUN = .rusranger, ...)
+}
+
+#' Grid Search
+#'
+#' Grid search to optimise hyperparameters for `rusranger()`
+#'
+#' @inheritParams rusranger
+#' @inheritParams rcv_rusranger
+#' @param searchspace `data.frame`, hyperparameters to tune. Column names have
+#' to match the argument names of [`ranger()`]/[`rusranger()`].
+#' @param \ldots further arguments passed to [`rcv_rusranger()`].
+#' @return `data.frame` with tested hyperparameters and AUCs
+#' @export
+#' @examples
+#' iris <- subset(iris, Species != "setosa")
+#' searchspace <- expand.grid(
+#'    mtry = c(2, 3),
+#'    num.trees = c(500, 1000)
+#' )
+#' ## nfolds and nrepcv are too low for real world applications, and are just
+#' ## used for demonstration and to keep the run time of the examples low
+#' gs_rusranger(
+#'     iris[-5], as.numeric(iris$Species == "versicolor"),
+#'     searchspace = searchspace, nfolds = 3, nrepcv = 1
+#' )
+gs_rusranger <- function(x, y, searchspace, nfolds = 5, nrepcv = 2, ...) {
+    gridsearch(
+        x = x, y = y, searchspace = searchspace,
+        FUN = .rusranger, nfolds = nfolds, nrepcv = nrepcv, ...
+    )
+}
+
+#' helper function to provide cv_rusranger/rcv_rusranger for backwards
+#' compatibility
+#'
+#' @noRd
+.rusranger <- function(xtrain, ytrain, xtest, ytest, ...) {
+    rngr <- rusranger(x = xtrain, y = ytrain, ...)
+    pred <- as.numeric(predict(rngr, xtest)$predictions[, 2L])
+    performance(prediction(pred, ytest), measure = "auc")@y.values[[1L]]
+}
+
+#' Nested Cross Validation for Hyperparameter Search
+#'
+#' Run a grid search in a nested cross validation.
+#'
+#' @note
+#' The reported performance could slightly differ from the median performance
+#' in the reported gridsearch. After the gridsearch `rusranger` is trained again
+#' with the best hyperparameters which results in a new subsampling.
+#'
+#' @inheritParams gs_rusranger
+#' @param nouterfolds `integer(1)`, number of outer cross validation folds.
+#' @param ninnerfolds `integer(1)`, number of inner cross validation folds.
+#' @param nrepcv `integer(1)`, number repeats of inner cross validations.
+#' @param \ldots further arguments passed to [`gs_rusranger()`].
+#' @return `list`, with an element per `nouterfolds` containing the following
+#' subelements:
+#' * model selected `ranger` model.
+#' * indextrain index of the used training items.
+#' * indextest index of the used test items.
+#' * prediction predictions results.
+#' * truth original labels/classes.
+#' * performance resulting performance (AUC).
+#' * selectedparams select hyperparameters.
+#' * gridsearch `data.frame`, results of the grid search.
+#' * nouterfolds `integer(1)`.
+#' * ninnerfolds `integer(1)`.
+#' * nrepcv `integer(1)`.
+#' @export
+#' @examples
+#' set.seed(20220324)
+#' iris <- subset(iris, Species != "setosa")
+#' searchspace <- expand.grid(
+#'    mtry = c(2, 3),
+#'    num.trees = c(500, 1000)
+#' )
+#' ## n(outer|inner) folds and nrepcv are too low for real world applications,
+#' ## and are just used for demonstration and to keep the run time of the examples
+#' ## low
+#' nrcv_rusranger(
+#'     iris[-5], as.numeric(iris$Species == "versicolor"),
+#'     searchspace = searchspace, nouterfolds = 3, ninnerfolds = 3, nrepcv = 1
+#' )
+nrcv_rusranger <- function(x, y, searchspace,
+                           nouterfolds = 5, ninnerfolds = 5, nrepcv = 2,
+                           ...) {
+
+    folds <- .bfolds(y, nfolds = nouterfolds)
+    xl <- split(x, folds)
+    yl <- split(y, folds)
+    indices <- split(seq_along(y), folds)
+
+    nrcv <- future.apply::future_lapply(
+        seq_len(nouterfolds),
+        function(i) {
+            xtrain <- do.call(rbind, xl[-i])
+            xtest <- xl[[i]]
+            ytrain <- do.call(c, yl[-i])
+            ytest <- yl[[i]]
+
+            gs <- gs_rusranger(
+                xtrain, ytrain, searchspace,
+                nfolds = ninnerfolds, nrepcv = nrepcv, ...
+            )
+
+            top <- which.max(gs$Median)
+            selparms <-
+                gs[top,
+                   !colnames(gs) %in% c("Min", "Q1", "Median", "Q3", "Max"),
+                   drop = FALSE
+                ]
+
+            ## additional call of an already calculated tree, ...
+            ## could be avoided if we would store the results of the trees
+            ## but this would take alot of memory
+            ## this could slightly change the results because of new
+            ## resampling
+            rngr <- do.call(
+                rusranger,
+                c(
+                  list(
+                    x = xtrain,
+                    y = ytrain
+                  ), list(...), selparms
+                )
+            )
+            pred <- as.numeric(predict(rngr, xtest)$predictions[, 2L])
+
+            list(
+                model = rngr,
+                indextrain = unlist(indices[-i]),
+                indextest = unlist(indices[i]),
+                prediction = pred,
+                truth = ytest,
+                performance = performance(
+                    prediction(pred, ytest), measure = "auc"
+                )@y.values[[1L]],
+                selectedparams = selparms,
+                gridsearch = gs,
+                nouterfolds = nouterfolds,
+                ninnerfolds = ninnerfolds,
+                nrepcv = nrepcv
+            )
+        },
         future.seed = TRUE
-    ))
-    setNames(
-        quantile(auc, names = FALSE), c("Min", "Q1", "Median", "Q3", "Max")
     )
+    nrcv
 }
diff --git a/R/gridsearch.R b/R/gridsearch.R
index af29af3..5a17ea6 100644
--- a/R/gridsearch.R
+++ b/R/gridsearch.R
@@ -1,13 +1,13 @@
 #' Grid Search
 #'
-#' Grid search to optimise hyperparameters for `rusranger()`
+#' Grid search to optimise hyperparameters for `FUN`
 #'
-#' @inheritParams rusranger
-#' @inheritParams rcv_rusranger
+#' @inheritParams cv
+#' @inheritParams rcv
 #' @param searchspace `data.frame`, hyperparameters to tune. Column names have
-#' to match the argument names of [`ranger()`]/[`rusranger()`].
-#' @param \ldots further arguments passed to [`rcv_rusranger()`].
-#' @return `data.frame` with tested hyperparameters and AUCs
+#' to match the argument names of `FUN`.
+#' @param \ldots further arguments passed to `FUN`
+#' @return `data.frame` with tested hyperparameters and metric
 #' @export
 #' @examples
 #' iris <- subset(iris, Species != "setosa")
@@ -21,21 +21,25 @@
 #'     iris[-5], as.numeric(iris$Species == "versicolor"),
 #'     searchspace = searchspace, nfolds = 3, nrepcv = 1
 #' )
-gs_rusranger <- function(x, y, searchspace, nfolds = 5, nrepcv = 2, ...) {
-    auc <- future.apply::future_lapply(
+gridsearch <- function(x, y, searchspace, FUN, nfolds = 5, nrepcv = 2, ...) {
+    r <- future.apply::future_lapply(
         seq_len(nrow(searchspace)),
         function(i) {
             do.call(
-                rcv_rusranger,
+                rcv,
                 c(
-                    list(x = x, y = y, nfolds = nfolds, nrepcv = nrepcv),
-                    list(...), searchspace[i, ]
+                    list(
+                        x = x, y = y, FUN = FUN,
+                        nfolds = nfolds, nrepcv = nrepcv
+                    ),
+                    list(...),
+                    searchspace[i, ]
                 )
             )
         },
         future.seed = TRUE
     )
-    cbind.data.frame(searchspace, do.call(rbind, auc))
+    cbind.data.frame(searchspace, do.call(rbind, r))
 }
 
 #' Nested Cross Validation for Hyperparameter Search
@@ -44,21 +48,20 @@ gs_rusranger <- function(x, y, searchspace, nfolds = 5, nrepcv = 2, ...) {
 #'
 #' @note
 #' The reported performance could slightly differ from the median performance
-#' in the reported gridsearch. After the gridsearch `rusranger` is trained again
+#' in the reported gridsearch. After the gridsearch `FUN` is trained again
 #' with the best hyperparameters which results in a new subsampling.
 #'
-#' @inheritParams gs_rusranger
+#' @inheritParams cv
+#' @inheritParams rcv
+#' @inheritParams gridsearch
 #' @param nouterfolds `integer(1)`, number of outer cross validation folds.
 #' @param ninnerfolds `integer(1)`, number of inner cross validation folds.
 #' @param nrepcv `integer(1)`, number repeats of inner cross validations.
 #' @param \ldots further arguments passed to [`gs_rusranger()`].
 #' @return `list`, with an element per `nouterfolds` containing the following
 #' subelements:
-#' * model selected `ranger` model.
 #' * indextrain index of the used training items.
 #' * indextest index of the used test items.
-#' * prediction predictions results.
-#' * truth original labels/classes.
 #' * performance resulting performance (AUC).
 #' * selectedparams select hyperparameters.
 #' * gridsearch `data.frame`, results of the grid search.
@@ -80,16 +83,15 @@ gs_rusranger <- function(x, y, searchspace, nfolds = 5, nrepcv = 2, ...) {
 #'     iris[-5], as.numeric(iris$Species == "versicolor"),
 #'     searchspace = searchspace, nouterfolds = 3, ninnerfolds = 3, nrepcv = 1
 #' )
-nrcv_rusranger <- function(x, y, searchspace,
-                           nouterfolds = 5, ninnerfolds = 5, nrepcv = 2,
-                           ...) {
-
+nested_gridsearch <- function(x, y, searchspace, FUN,
+                              nouterfolds = 5, ninnerfolds = 5, nrepcv = 2,
+                              ...) {
     folds <- .bfolds(y, nfolds = nouterfolds)
     xl <- split(x, folds)
     yl <- split(y, folds)
     indices <- split(seq_along(y), folds)
 
-    nrcv <- future.apply::future_lapply(
+    future.apply::future_lapply(
         seq_len(nouterfolds),
         function(i) {
             xtrain <- do.call(rbind, xl[-i])
@@ -97,45 +99,35 @@ nrcv_rusranger <- function(x, y, searchspace,
             ytrain <- do.call(c, yl[-i])
             ytest <- yl[[i]]
 
-            gs <- gs_rusranger(
-                xtrain, ytrain, searchspace,
-                nfolds = ninnerfolds, nrepcv = nrepcv, ...
+            g <- gridsearch(
+                x = xtrain, y = ytrain, searchspace = searchspace,
+                FUN = FUN, nfolds = ninnerfolds, nrepcv = nrepcv, ...
             )
 
-            top <- which.max(gs$Median)
+            top <- which.max(g$Median)
             selparms <-
-                gs[top,
-                   !colnames(gs) %in% c("Min", "Q1", "Median", "Q3", "Max"),
+                g[top,
+                   !colnames(g) %in% c("Min", "Q1", "Median", "Q3", "Max"),
                    drop = FALSE
                 ]
 
-            ## additional call of an already calculated tree, ...
-            ## could be avoided if we would store the results of the trees
-            ## but this would take alot of memory
-            ## this could slightly change the results because of new
-            ## resampling
-            rngr <- do.call(
-                rusranger,
+            r <- do.call(
+                FUN,
                 c(
-                  list(
-                    x = xtrain,
-                    y = ytrain
-                  ), list(...), selparms
+                    list(
+                        xtrain = xtrain, ytrain = ytrain,
+                        xtest = xtest, ytest = ytest,
+                        ...
+                    ), selparms
                 )
             )
-            pred <- as.numeric(predict(rngr, xtest)$predictions[, 2L])
 
             list(
-                model = rngr,
                 indextrain = unlist(indices[-i]),
                 indextest = unlist(indices[i]),
-                prediction = pred,
-                truth = ytest,
-                performance = performance(
-                    prediction(pred, ytest), measure = "auc"
-                )@y.values[[1L]],
+                performance = r,
                 selectedparams = selparms,
-                gridsearch = gs,
+                gridsearch = g,
                 nouterfolds = nouterfolds,
                 ninnerfolds = ninnerfolds,
                 nrepcv = nrepcv
@@ -143,5 +135,4 @@ nrcv_rusranger <- function(x, y, searchspace,
         },
         future.seed = TRUE
     )
-    nrcv
 }
diff --git a/R/rosranger.R b/R/rosranger.R
new file mode 100644
index 0000000..1fbac15
--- /dev/null
+++ b/R/rosranger.R
@@ -0,0 +1,49 @@
+#' ROS ranger
+#'
+#' Adapted default settings to the [`ranger()`] to support
+#' random-over-sampling. Additionally the default settings are modified to the
+#' most common classification settings used in the AMPEL project.
+#'
+#' @details
+#' In contrast to [`ranger()`] `rusranger()` currently just supports binary
+#' classifications.
+#'
+#' @param x `matrix`/`data.frame`, feature matrix, see [`ranger()`] for
+#' details.
+#' @param y `numeric`/`factor`, classification labels, see [`ranger()`] for
+#' details.
+#' @param probability `logical(1)`, grow probability trees, see [`ranger()`]
+#' for details.
+#' @param classification `logical(1)`, run classification even if `y` is
+#' `numeric`, see [`ranger()`] for details.
+#' @param min.node.size, same as in [`ranger()`]
+#' @param ndups `numeric(1)`, times of duplication of minority class.
+#' @param \ldots further arguments passed to [`ranger()`].
+#'
+#' @return `ranger` object, see [`ranger()`] for details.
+#'
+#' @seealso
+#' [`ranger()`]
+#'
+#' @references
+#' *AMPEL* project:
+#' Analysis and Reporting System for the Improvement of Patient Safety
+#' through Real-Time Integration of Laboratory Findings,
+#' https://ampel.care.
+#'
+#' @import ranger
+#' @export
+rosranger <- function(x, y, probability = TRUE, classification = !probability,
+                      min.node.size = if (probability) 10 else 1, ndups = 1,
+                      ...) {
+    ranger(
+        x = as.data.frame(x), y = y,
+        probability = probability,
+        classification = classification,
+        min.node.size = min.node.size,
+        replace = TRUE,
+        case.weights = .roscaseweights(y, ndups),
+        ...,
+        keep.inbag = FALSE
+    )
+}
diff --git a/R/smote.R b/R/smote.R
new file mode 100644
index 0000000..3333cf7
--- /dev/null
+++ b/R/smote.R
@@ -0,0 +1,124 @@
+#' SMOTE
+#'
+#' SMOTE: Synthetic Minority Over-sampling Technique.
+#'
+#' @param x `matrix`, features
+#' @param y `factor`, class (length has to be equal to number of rows in `x`).
+#' @param k `numeric(1)`, number of nearest neighbours.
+#' @param ndups `numeric(1)`, number of synthetic duplicates to upsample
+#' minority. Default is `0` which tries to upsample to majority.
+#' @return `list` with elements `x` features with SMOTEd samples and `y` with
+#' added minority class elements.
+#'
+#' @importFrom stats dist runif
+#' @export
+#'
+#' @references
+#' Chawla, Bowyer, Hall and Kegelmeyer. 2002.
+#' SMOTE: Synthetic Minority Over-sampling Technique.
+#' Journal of Artificial Intelligence Research 16. 321–357.
+#' \doi{10.1613/jair.953}
+smote <- function(x, y, k = 5L, ndups = 0L) {
+    if (nrow(x) != length(y))
+        stop("nrow(x) has to be equal to length(y).")
+    my <- .minority(y)
+    sel <- y == my
+    if (!ndups)
+        ndups <- .ndups(nrow(x), .nmin(y))
+    sm <- .smote(x[sel,, drop = FALSE], k = k, ndups = ndups)
+    x <- rbind(x, sm)
+    rownames(x) <- NULL
+    list(x = x, y = c(y, rep.int(my, nrow(sm))))
+}
+
+.smote <- function(x, k = 5L, ndups = 1L) {
+    if (!is.matrix(x))
+        stop("'x' has to be a matrix.")
+    if (!length(k) == 1 || !is.integer(k) || k < 2L)
+        stop("'k' has to be an integer of length 1 and greater or equal 2.")
+    if (!length(ndups) == 1 || !is.integer(ndups) || ndups < 1L)
+        stop("'ndups' has to be an integer of length 1 and greater 0.")
+    nr <- nrow(x)
+    nc <- ncol(x)
+    iknn <- .knn(x, k)
+    s <- matrix(, nrow = nr * ndups, ncol = nc)
+    sdups <- seq_len(ndups)
+
+    for (i in seq_len(nr)) {
+        ir <- rep.int(i, ndups)
+        # difference to nearest random nearest neighbour
+        j <- iknn[i, sample.int(k, ndups, replace = TRUE)]
+        d <- x[ir, ] - x[j, , drop = FALSE]
+        # difference multiplied with random gap in range 0..1
+        s[(i - 1L) * ndups + sdups,] <- x[ir, ] + runif(ndups) * d
+    }
+    s
+}
+
+#' K-nearest neighbour algrorithm
+#'
+#' Simple (not fastest) implementation of knn
+#'
+#' @param x `matrix`
+#' @param k `integer(1)`, number of nearest neighbours
+#' @return `matrix`, `nrow(x)` times `k`,
+#' for each row in `x` the row indices of the nearest neighbours.
+#' @noRd
+.knn <- function(x, k = 5) {
+    d <- dist(x, method = "euclidean")
+    nr <- nrow(x)
+    m <- matrix(, nrow = nr, ncol = k)
+    k <- seq_len(k)
+    for (i in seq_len(nrow(x))) {
+        m[i, ] <- order(.neighbours(d, i))[k]
+    }
+    m
+}
+
+#' Subsetting dist
+#'
+#' Return a column/all neighbours/competitors of an element of a `dist` without
+#' turning the `dist` object into a `matrix` before.
+#'
+#' @param d `dist`
+#' @param j `integer`, column index.
+#' @return column/neighbour/competitor values.
+#' @noRd
+.neighbours <- function(d, j) {
+    if (!inherits(d, "dist"))
+        stop("'d' is not a 'dist' object.")
+    if (length(j) != 1L || !is.integer(j))
+        stop("'j' has to be an integer of length 1.")
+    n <- attr(d, "Size")
+    i <- seq_len(n)[-j]
+    k <- ifelse(i < j,
+        n * (i - 1L) - i * (i - 1L) / 2L +  j - i,
+        (2 * n - j) * (j - 1) / 2 + i - j
+    )
+    d[k]
+}
+
+#' Calculate SMOTE duplicates
+#'
+#' Calculate SMOTE duplicates to upsample to majority class
+#'
+#' @param n `numeric(1)`, total number of samples.
+#' @param nmin `numeric(1)`, number of minority samples.
+#' @return `numeric(1)`, number of duplicates necessary to upsample to majority
+#' class.
+#' @noRd
+.ndups <- function(n, nmin) {
+    as.integer(floor((n - 2L * nmin) / nmin))
+}
+
+#' Find minority class
+#'
+#' @param y binary class vector.
+#' @return minority class
+#' @noRd
+.minority <- function(y) {
+    tbl <- table(y)
+    r <- names(if (tbl[1] > tbl[2]) tbl[2] else tbl[1])
+    mode(r) <- mode(y)
+    r
+}
diff --git a/R/utils-rosranger.R b/R/utils-rosranger.R
new file mode 100644
index 0000000..585f9a5
--- /dev/null
+++ b/R/utils-rosranger.R
@@ -0,0 +1,13 @@
+#' Case weights for ROS
+#'
+#' @param y `factor`/`numeric`, classes
+#' @param ndups `numeric(1)`, times of duplication of minority class.
+#' @return `double(length(y))` with case weights
+#' @noRd
+.roscaseweights <- function(y, ndups) {
+    w <- rep_len(1, length(y))
+    my <- .minority(y)
+    nm <- .nmin(y)
+    w[y == my] <- (length(y) + nm * ndups) / nm
+    w
+}
diff --git a/R/utils-rusranger.R b/R/utils-rusranger.R
index 3f0b1bf..d9c0723 100644
--- a/R/utils-rusranger.R
+++ b/R/utils-rusranger.R
@@ -42,6 +42,18 @@
     }
 }
 
+#' N minority
+#'
+#' Find number of minority class samples.
+#'
+#' @param y vector of binary class labels
+#' @return `integer(1)`, number of minority class samples
+#' @noRd
+.nmin <- function(y) {
+    tbl <- table(y)
+    as.vector(if (tbl[1] > tbl[2]) tbl[2] else tbl[1])
+}
+
 #' Sample fraction
 #'
 #' Calculate sample fraction for 50:50 selection (two times the minority class)
@@ -50,6 +62,5 @@
 #' @return `double(1)`, sample fraction
 #' @noRd
 .samplefraction <- function(y) {
-    tbl <- table(y)
-    as.vector(2L * if (tbl[1] > tbl[2]) tbl[2] else tbl[1]) / length(y)
+    2L * .nmin(y) / length(y)
 }
diff --git a/man/cv.Rd b/man/cv.Rd
new file mode 100644
index 0000000..c0e0a6b
--- /dev/null
+++ b/man/cv.Rd
@@ -0,0 +1,39 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/cv.R
+\name{cv}
+\alias{cv}
+\title{Cross Validation}
+\usage{
+cv(x, y, FUN, nfolds = 5, ...)
+}
+\arguments{
+\item{x}{\code{matrix}/\code{data.frame}, feature matrix, see \code{\link[=ranger]{ranger()}} for
+details.}
+
+\item{y}{\code{numeric}/\code{factor}, classification labels, see \code{\link[=ranger]{ranger()}} for
+details.}
+
+\item{FUN}{\code{function} function to optimize.}
+
+\item{nfolds}{\code{integer(1)} number of cross validation folds.}
+
+\item{\ldots}{further arguments passed to \code{FUN}.}
+}
+\value{
+\code{double(1)} median AUC across all cross validation splits
+}
+\description{
+Runs a cross validation for train and prediction function.
+}
+\note{
+The function to optimize has to accept five arguments: xtrain, ytrain, xtest,
+ytest and \ldots.
+}
+\examples{
+.rusranger <- function(xtrain, ytrain, xtest, ytest, ...) {
+    rngr <- rusranger(x = xtrain, y = ytrain, ...)
+    pred <- as.numeric(predict(rngr, xtest)$predictions[, 2L])
+    performance(prediction(pred, ytest), measure = "auc")@y.values[[1L]]
+}
+cv(iris[-5], as.numeric(iris$Species == "versicolor"), .rusranger, nfolds = 3)
+}
diff --git a/man/gridsearch.Rd b/man/gridsearch.Rd
new file mode 100644
index 0000000..9f39aec
--- /dev/null
+++ b/man/gridsearch.Rd
@@ -0,0 +1,45 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/gridsearch.R
+\name{gridsearch}
+\alias{gridsearch}
+\title{Grid Search}
+\usage{
+gridsearch(x, y, searchspace, FUN, nfolds = 5, nrepcv = 2, ...)
+}
+\arguments{
+\item{x}{\code{matrix}/\code{data.frame}, feature matrix, see \code{\link[=ranger]{ranger()}} for
+details.}
+
+\item{y}{\code{numeric}/\code{factor}, classification labels, see \code{\link[=ranger]{ranger()}} for
+details.}
+
+\item{searchspace}{\code{data.frame}, hyperparameters to tune. Column names have
+to match the argument names of \code{FUN}.}
+
+\item{FUN}{\code{function} function to optimize.}
+
+\item{nfolds}{\code{integer(1)} number of cross validation folds.}
+
+\item{nrepcv}{\code{integer(1)} number of repeats.}
+
+\item{\ldots}{further arguments passed to \code{FUN}}
+}
+\value{
+\code{data.frame} with tested hyperparameters and metric
+}
+\description{
+Grid search to optimise hyperparameters for \code{FUN}
+}
+\examples{
+iris <- subset(iris, Species != "setosa")
+searchspace <- expand.grid(
+   mtry = c(2, 3),
+   num.trees = c(500, 1000)
+)
+## nfolds and nrepcv are too low for real world applications, and are just
+## used for demonstration and to keep the run time of the examples low
+gs_rusranger(
+    iris[-5], as.numeric(iris$Species == "versicolor"),
+    searchspace = searchspace, nfolds = 3, nrepcv = 1
+)
+}
diff --git a/man/gs_rusranger.Rd b/man/gs_rusranger.Rd
index ab7aad2..3b064ab 100644
--- a/man/gs_rusranger.Rd
+++ b/man/gs_rusranger.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/gridsearch.R
+% Please edit documentation in R/cv_rusranger.R
 \name{gs_rusranger}
 \alias{gs_rusranger}
 \title{Grid Search}
diff --git a/man/nested_gridsearch.Rd b/man/nested_gridsearch.Rd
new file mode 100644
index 0000000..f238251
--- /dev/null
+++ b/man/nested_gridsearch.Rd
@@ -0,0 +1,74 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/gridsearch.R
+\name{nested_gridsearch}
+\alias{nested_gridsearch}
+\title{Nested Cross Validation for Hyperparameter Search}
+\usage{
+nested_gridsearch(
+  x,
+  y,
+  searchspace,
+  FUN,
+  nouterfolds = 5,
+  ninnerfolds = 5,
+  nrepcv = 2,
+  ...
+)
+}
+\arguments{
+\item{x}{\code{matrix}/\code{data.frame}, feature matrix, see \code{\link[=ranger]{ranger()}} for
+details.}
+
+\item{y}{\code{numeric}/\code{factor}, classification labels, see \code{\link[=ranger]{ranger()}} for
+details.}
+
+\item{searchspace}{\code{data.frame}, hyperparameters to tune. Column names have
+to match the argument names of \code{FUN}.}
+
+\item{FUN}{\code{function} function to optimize.}
+
+\item{nouterfolds}{\code{integer(1)}, number of outer cross validation folds.}
+
+\item{ninnerfolds}{\code{integer(1)}, number of inner cross validation folds.}
+
+\item{nrepcv}{\code{integer(1)}, number repeats of inner cross validations.}
+
+\item{\ldots}{further arguments passed to \code{\link[=gs_rusranger]{gs_rusranger()}}.}
+}
+\value{
+\code{list}, with an element per \code{nouterfolds} containing the following
+subelements:
+\itemize{
+\item indextrain index of the used training items.
+\item indextest index of the used test items.
+\item performance resulting performance (AUC).
+\item selectedparams select hyperparameters.
+\item gridsearch \code{data.frame}, results of the grid search.
+\item nouterfolds \code{integer(1)}.
+\item ninnerfolds \code{integer(1)}.
+\item nrepcv \code{integer(1)}.
+}
+}
+\description{
+Run a grid search in a nested cross validation.
+}
+\note{
+The reported performance could slightly differ from the median performance
+in the reported gridsearch. After the gridsearch \code{FUN} is trained again
+with the best hyperparameters which results in a new subsampling.
+}
+\examples{
+set.seed(20220324)
+iris <- subset(iris, Species != "setosa")
+searchspace <- expand.grid(
+   mtry = c(2, 3),
+   num.trees = c(500, 1000)
+)
+## n(outer|inner) folds and nrepcv are too low for real world applications,
+## and are just used for demonstration and to keep the run time of the examples
+## low
+nrcv_rusranger(
+    iris[-5], as.numeric(iris$Species == "versicolor"),
+    searchspace = searchspace, nouterfolds = 3, ninnerfolds = 3, nrepcv = 1
+)
+}
diff --git a/man/nrcv_rusranger.Rd b/man/nrcv_rusranger.Rd
index 46735a6..084fb07 100644
--- a/man/nrcv_rusranger.Rd
+++ b/man/nrcv_rusranger.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/gridsearch.R
+% Please edit documentation in R/cv_rusranger.R
 \name{nrcv_rusranger}
 \alias{nrcv_rusranger}
 \title{Nested Cross Validation for Hyperparameter Search}
diff --git a/man/rcv.Rd b/man/rcv.Rd
new file mode 100644
index 0000000..4aa6d0f
--- /dev/null
+++ b/man/rcv.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/cv.R
+\name{rcv}
+\alias{rcv}
+\title{Repeated Cross Validation}
+\usage{
+rcv(x, y, nfolds = 5, nrepcv = 2, FUN, ...)
+}
+\arguments{
+\item{x}{\code{matrix}/\code{data.frame}, feature matrix, see \code{\link[=ranger]{ranger()}} for
+details.}
+
+\item{y}{\code{numeric}/\code{factor}, classification labels, see \code{\link[=ranger]{ranger()}} for
+details.}
+
+\item{nfolds}{\code{integer(1)} number of cross validation folds.}
+
+\item{nrepcv}{\code{integer(1)} number of repeats.}
+
+\item{FUN}{\code{function} function to optimize.}
+
+\item{\ldots}{further arguments passed to \code{FUN}.}
+}
+\value{
+\code{double(5)}, minimal, 25 \% quartiel, median, 75 \% quartile and
+maximal results across the repeated cross validations.
+}
+\description{
+Runs a repeated cross validation for a function.
+See also \code{\link[=cv]{cv()}}.
+}
diff --git a/man/rosranger.Rd b/man/rosranger.Rd
new file mode 100644
index 0000000..9f8e22b
--- /dev/null
+++ b/man/rosranger.Rd
@@ -0,0 +1,56 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/rosranger.R
+\name{rosranger}
+\alias{rosranger}
+\title{ROS ranger}
+\usage{
+rosranger(
+  x,
+  y,
+  probability = TRUE,
+  classification = !probability,
+  min.node.size = if (probability) 10 else 1,
+  ndups = 1,
+  ...
+)
+}
+\arguments{
+\item{x}{\code{matrix}/\code{data.frame}, feature matrix, see \code{\link[=ranger]{ranger()}} for
+details.}
+
+\item{y}{\code{numeric}/\code{factor}, classification labels, see \code{\link[=ranger]{ranger()}} for
+details.}
+
+\item{probability}{\code{logical(1)}, grow probability trees, see \code{\link[=ranger]{ranger()}}
+for details.}
+
+\item{classification}{\code{logical(1)}, run classification even if \code{y} is
+\code{numeric}, see \code{\link[=ranger]{ranger()}} for details.}
+
+\item{min.node.size, }{same as in \code{\link[=ranger]{ranger()}}}
+
+\item{ndups}{\code{numeric(1)}, times of duplication of minority class.}
+
+\item{\ldots}{further arguments passed to \code{\link[=ranger]{ranger()}}.}
+}
+\value{
+\code{ranger} object, see \code{\link[=ranger]{ranger()}} for details.
+}
+\description{
+Adapted default settings to the \code{\link[=ranger]{ranger()}} to support
+random-over-sampling. Additionally the default settings are modified to the
+most common classification settings used in the AMPEL project.
+}
+\details{
+In contrast to \code{\link[=ranger]{ranger()}} \code{rusranger()} currently just supports binary
+classifications.
+}
+\references{
+\emph{AMPEL} project:
+Analysis and Reporting System for the Improvement of Patient Safety
+through Real-Time Integration of Laboratory Findings,
+https://ampel.care.
+}
+\seealso{
+\code{\link[=ranger]{ranger()}}
+}
diff --git a/man/smote.Rd b/man/smote.Rd
new file mode 100644
index 0000000..c8c796b
--- /dev/null
+++ b/man/smote.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/smote.R
+\name{smote}
+\alias{smote}
+\title{SMOTE}
+\usage{
+smote(x, y, k = 5L, ndups = 0L)
+}
+\arguments{
+\item{x}{\code{matrix}, features}
+
+\item{y}{\code{factor}, class (length has to be equal to number of rows in \code{x}).}
+
+\item{k}{\code{numeric(1)}, number of nearest neighbours.}
+
+\item{ndups}{\code{numeric(1)}, number of synthetic duplicates to upsample
+minority. Default is \code{0} which tries to upsample to majority.}
+}
+\value{
+\code{list} with elements \code{x} features with SMOTEd samples and \code{y} with
+added minority class elements.
+}
+\description{
+SMOTE: Synthetic Minority Over-sampling Technique.
+}
+\references{
+Chawla, Bowyer, Hall and Kegelmeyer. 2002.
+SMOTE: Synthetic Minority Over-sampling Technique.
+Journal of Artificial Intelligence Research 16. 321–357.
+\doi{10.1613/jair.953}
+}
diff --git a/tests/testthat/test_smote.R b/tests/testthat/test_smote.R
new file mode 100644
index 0000000..4c67b11
--- /dev/null
+++ b/tests/testthat/test_smote.R
@@ -0,0 +1,36 @@
+test_that("smote", {
+    x <- matrix(1:8, nrow = 4)
+    expect_error(smote(x, 1), "length")
+})
+
+test_that(".smote", {
+    expect_error(.smote(data.frame(a = 1:2)), "matrix")
+    x <- matrix(1:8, nrow = 4)
+    expect_error(.smote(x, k = 1:2), "length")
+    expect_error(.smote(x, k = 1.2), "integer")
+    expect_error(.smote(x, k = 1L), "2")
+    expect_error(.smote(x, ndups = 1:2), "length")
+    expect_error(.smote(x, ndups = 1.2), "integer")
+    expect_error(.smote(x, ndups = 0L), "0")
+    expect_identical(nrow(.smote(x, k = 2L, ndups = 1L)), 4L)
+    expect_identical(nrow(.smote(x, k = 2L, ndups = 2L)), 8L)
+})
+
+test_that(".neighbours", {
+    expect_error(.neighbours(1:10), "dist")
+    d <- dist(1:9)
+    expect_error(.neighbours(d, 1:2), "length")
+    expect_error(.neighbours(d, 1.2), "integer")
+    expect_equal(.neighbours(d, 3L), c(2:1, 1:6))
+    expect_equal(.neighbours(d, 5L), c(4:1, 1:4))
+})
+
+test_that(".ndups", {
+    expect_equal(.ndups(100, 20), 3)
+    expect_equal(.ndups(100, 10), 8)
+})
+
+test_that(".minority", {
+    expect_equal(.minority(rep(1:2, c(4, 2))), 2)
+    expect_equal(.minority(rep(c("A", "B"), c(2, 5))), "A")
+})
diff --git a/tests/testthat/test_utils-rosranger.R b/tests/testthat/test_utils-rosranger.R
new file mode 100644
index 0000000..66b1d2d
--- /dev/null
+++ b/tests/testthat/test_utils-rosranger.R
@@ -0,0 +1,9 @@
+test_that(".caseweights", {
+    cl <- rep(1:2, c(2, 4))
+    expect_equal(
+        .roscaseweights(cl, 2), rep(c(5, 1), c(2, 4))
+    )
+    expect_equal(
+        .roscaseweights(cl, 4), rep(c(7, 1), c(2, 4))
+    )
+})
diff --git a/tests/testthat/test_utils-rusranger.R b/tests/testthat/test_utils-rusranger.R
index e3117f7..4735823 100644
--- a/tests/testthat/test_utils-rusranger.R
+++ b/tests/testthat/test_utils-rusranger.R
@@ -27,10 +27,15 @@ test_that(".caseweights", {
     cl <- rep(0:1, c(2, 4))
     expect_equal(
         unname(.caseweights(cl, replace = FALSE)),
-        unname(.caseweights(cl + 1, replace = FALSE)),
+        unname(.caseweights(cl + 1, replace = FALSE))
     )
 })
 
+test_that(".nmin", {
+    expect_equal(.nmin(rep(1:2, c(2, 4))), 2)
+    expect_equal(.nmin(rep(1:2, c(7, 4))), 4)
+})
+
 test_that(".samplefraction", {
     cl <- rep(1:2, c(2, 4))
     expect_equal(.samplefraction(cl), 2/3)