Merge branch 'main' into add_offset

mlr-org · Jan 16, 2025 · 8e3e5d6 · 8e3e5d6
2 parents adbbcbf + a54679e
commit 8e3e5d6
Show file tree

Hide file tree

Showing 71 changed files with 379 additions and 184 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -42,6 +42,7 @@ S3method(as_resampling,Resampling)
 S3method(as_resamplings,default)
 S3method(as_resamplings,list)
 S3method(as_task,Task)
+S3method(as_task,default)
 S3method(as_task_classif,DataBackend)
 S3method(as_task_classif,Matrix)
 S3method(as_task_classif,TaskClassif)
@@ -200,6 +201,7 @@ export(as_tasks)
 export(as_tasks_unsupervised)
 export(assert_backend)
 export(assert_benchmark_result)
+export(assert_empty_ellipsis)
 export(assert_learnable)
 export(assert_learner)
 export(assert_learners)

diff --git a/NEWS.md b/NEWS.md
@@ -1,6 +1,12 @@
 # mlr3 (development version)
 
 * feat: add new `col_role` offset in `Task`.
+* fix: the `$predict_newdata()` method of `Learner` now automatically conducts type conversions (#685)
+* BREAKING CHANGE: Predicting on a `task` with the wrong column information is now an error and not a warning.
+* Column names with UTF-8 characters are now allowed by default.
+  The option `mlr3.allow_utf8_names` is removed.
+* BREAKING CHANGE: `Learner$predict_types` is read-only now.
+* docs: Clear up behavior of `Learner$predict_type` after training.
 
 # mlr3 0.22.1
 

diff --git a/R/BenchmarkResult.R b/R/BenchmarkResult.R
@@ -175,7 +175,7 @@ BenchmarkResult = R6Class("BenchmarkResult",
     #'
     #' @return [data.table::data.table()].
     score = function(measures = NULL, ids = TRUE, conditions = FALSE, predictions = TRUE) {
-      measures = as_measures(measures, task_type = self$task_type)
+      measures = assert_measures(as_measures(measures, task_type = self$task_type))
       assert_flag(ids)
       assert_flag(conditions)
       assert_flag(predictions)
@@ -230,7 +230,7 @@ BenchmarkResult = R6Class("BenchmarkResult",
     #' @param predict_sets (`character()`)\cr
     #'   The predict sets.
     obs_loss = function(measures = NULL, predict_sets = "test") {
-      measures = as_measures(measures, task_type = private$.data$task_type)
+      measures = assert_measures(as_measures(measures, task_type = self$task_type))
       map_dtr(self$resample_results$resample_result,
         function(rr) {
           rr$obs_loss(measures, predict_sets)
@@ -276,7 +276,11 @@ BenchmarkResult = R6Class("BenchmarkResult",
     #'
     #' @return [data.table::data.table()].
     aggregate = function(measures = NULL, ids = TRUE, uhashes = FALSE, params = FALSE, conditions = FALSE) {
-      measures = assert_measures(as_measures(measures, task_type = self$task_type))
+      measures = if (is.null(measures)) {
+        default_measures(self$task_type)
+      } else {
+        assert_measures(as_measures(measures))
+      }
       assert_flag(ids)
       assert_flag(uhashes)
       assert_flag(params)

diff --git a/R/DataBackendRename.R b/R/DataBackendRename.R
@@ -9,13 +9,13 @@ DataBackendRename = R6Class("DataBackendRename", inherit = DataBackend, cloneabl
       assert_character(old, any.missing = FALSE, unique = TRUE)
       assert_subset(old, b$colnames)
       assert_character(new, any.missing = FALSE, len = length(old))
-      assert_names(new, if (allow_utf8_names()) "unique" else "strict")
+      assert_names(new, "unique")
 
       ii = old != new
       old = old[ii]
       new = new[ii]
 
-      if (self$primary_key %in% old) {
+      if (self$primary_key %chin% old) {
         stopf("Renaming the primary key is not supported")
       }
 

diff --git a/R/HotstartStack.R b/R/HotstartStack.R
@@ -202,9 +202,9 @@ calculate_cost = function(start_learner, learner, hotstart_id) {
   cost = learner$param_set$values[[hotstart_id]] - start_learner$param_set$values[[hotstart_id]]
   if (cost == 0) return(-1)
 
-  if ("hotstart_backward" %in% learner$properties && "hotstart_forward" %in% learner$properties) {
+  if ("hotstart_backward" %chin% learner$properties && "hotstart_forward" %chin% learner$properties) {
     if (cost < 0) 0 else cost
-  } else if ("hotstart_backward" %in% learner$properties) {
+  } else if ("hotstart_backward" %chin% learner$properties) {
     if (cost < 0) 0 else NA_real_
   } else {
     if (cost > 0) cost else NA_real_

diff --git a/R/Learner.R b/R/Learner.R
@@ -72,10 +72,10 @@
 #' All information about hyperparameters is stored in the slot `param_set` which is a [paradox::ParamSet].
 #' The printer gives an overview about the ids of available hyperparameters, their storage type, lower and upper bounds,
 #' possible levels (for factors), default values and assigned values.
-#' To set hyperparameters, assign a named list to the subslot `values`:
+#' To set hyperparameters, call the `set_values()` method on the `param_set`:
 #' ```
 #' lrn = lrn("classif.rpart")
-#' lrn$param_set$values = list(minsplit = 3, cp = 0.01)
+#' lrn$param_set$set_values(minsplit = 3, cp = 0.01)
 #' ```
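-#' Note that this operation replaces all previously set hyperparameter values.
+#' Note that `set_values()` keeps previously set hyperparameter values that are not overwritten, whereas assigning a named list to `$values` replaces all of them.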
 #' If you only intend to change one specific hyperparameter value and leave the others as-is, you can use the helper function [mlr3misc::insert_named()]:
@@ -157,11 +157,6 @@ Learner = R6Class("Learner",
     #' @template field_task_type
     task_type = NULL,
 
-    #' @field predict_types (`character()`)\cr
-    #' Stores the possible predict types the learner is capable of.
-    #' A complete list of candidate predict types, grouped by task type, is stored in [`mlr_reflections$learner_predict_types`][mlr_reflections].
-    predict_types = NULL,
-
     #' @field feature_types (`character()`)\cr
     #' Stores the feature types the learner can handle, e.g. `"logical"`, `"numeric"`, or `"factor"`.
     #' A complete list of candidate feature types, grouped by task type, is stored in [`mlr_reflections$task_feature_types`][mlr_reflections].
@@ -214,7 +209,7 @@ Learner = R6Class("Learner",
       self$task_type = assert_choice(task_type, mlr_reflections$task_types$type)
       private$.param_set = assert_param_set(param_set)
       self$feature_types = assert_ordered_set(feature_types, mlr_reflections$task_feature_types, .var.name = "feature_types")
-      self$predict_types = assert_ordered_set(predict_types, names(mlr_reflections$learner_predict_types[[task_type]]),
+      private$.predict_types = assert_ordered_set(predict_types, names(mlr_reflections$learner_predict_types[[task_type]]),
         empty.ok = FALSE, .var.name = "predict_types")
       private$.predict_type = predict_types[1L]
       self$properties = sort(assert_subset(properties, mlr_reflections$learner_properties[[task_type]]))
@@ -375,6 +370,8 @@ Learner = R6Class("Learner",
     #' of the training task stored in the learner.
     #' If the learner has been fitted via [resample()] or [benchmark()], you need to pass the corresponding task stored
     #' in the [ResampleResult] or [BenchmarkResult], respectively.
+    #' Further, [`auto_convert`] is used for type-conversions to ensure compatibility
+    #' of features between `$train()` and `$predict()`.
     #'
     #' @param newdata (any object supported by [as_data_backend()])\cr
     #'   New data to predict on.
@@ -404,16 +401,32 @@ Learner = R6Class("Learner",
       # the following columns are automatically set to NA if missing
       impute = unlist(task$col_roles[c("target", "name", "order", "stratum", "group", "weight")], use.names = FALSE)
       impute = setdiff(impute, newdata$colnames)
-      if (length(impute)) {
+      tab1 = if (length(impute)) {
         # create list with correct NA types and cbind it to the backend
         ci = insert_named(task$col_info[list(impute), c("id", "type", "levels"), on = "id", with = FALSE], list(value = NA))
         na_cols = set_names(pmap(ci, function(..., nrow) rep(auto_convert(...), nrow), nrow = newdata$nrow), ci$id)
-        tab = invoke(data.table, .args = insert_named(na_cols, set_names(list(newdata$rownames), newdata$primary_key)))
+        invoke(data.table, .args = insert_named(na_cols, set_names(list(newdata$rownames), newdata$primary_key)))
+      }
+
+      # Perform type conversion where necessary
+      keep_cols = intersect(newdata$colnames, task$col_info$id)
+      ci = task$col_info[list(keep_cols), ][
+        get("type") != col_info(newdata)[list(keep_cols), on = "id"]$type]
+      tab2 = do.call(data.table, Map(auto_convert,
+        value = as.list(newdata$data(rows = newdata$rownames, cols = ci$id)),
+        id = ci$id, type = ci$type, levels = ci$levels))
+
+      tab = cbind(tab1, tab2)
+      if (ncol(tab)) {
+        tab[[newdata$primary_key]] = newdata$rownames
         newdata = DataBackendCbind$new(newdata, DataBackendDataTable$new(tab, primary_key = newdata$primary_key))
       }
 
-      # do some type conversions if necessary
+      prevci = task$col_info
       task$backend = newdata
+      task$col_info = col_info(task$backend)
+      task$col_info[, c("label", "fix_factor_levels")] = prevci[list(task$col_info$id), on = "id", c("label", "fix_factor_levels")]
+      task$col_info$fix_factor_levels[is.na(task$col_info$fix_factor_levels)] = FALSE
       task$row_roles$use = task$backend$rownames
       self$predict(task)
     },
@@ -627,6 +640,8 @@ Learner = R6Class("Learner",
     #' @field predict_type (`character(1)`)\cr
     #' Stores the currently active predict type, e.g. `"response"`.
     #' Must be an element of `$predict_types`.
+    #' A few learners already use the predict type during training,
+    #' so changing it after training may have no effect or may even lead to errors.
     predict_type = function(rhs) {
       if (missing(rhs)) {
         return(private$.predict_type)
@@ -648,8 +663,6 @@ Learner = R6Class("Learner",
       private$.param_set
     },
 
-
-
     #' @field fallback ([Learner])\cr
     #' Returns the fallback learner set with `$encapsulate()`.
     fallback = function(rhs) {
@@ -672,13 +685,23 @@ Learner = R6Class("Learner",
       }
       assert_r6(rhs, "HotstartStack", null.ok = TRUE)
       private$.hotstart_stack = rhs
+    },
+
+    #' @field predict_types (`character()`)\cr
+    #' Stores the possible predict types the learner is capable of.
+    #' A complete list of candidate predict types, grouped by task type, is stored in [`mlr_reflections$learner_predict_types`][mlr_reflections].
+    #' This field is read-only.
+    predict_types = function(rhs) {
+      assert_ro_binding(rhs)
+      return(private$.predict_types)
     }
   ),
 
   private = list(
     .encapsulation = c(train = "none", predict = "none"),
     .fallback = NULL,
     .predict_type = NULL,
+    .predict_types = NULL,
     .param_set = NULL,
     .hotstart_stack = NULL,
 

diff --git a/R/LearnerClassifDebug.R b/R/LearnerClassifDebug.R
@@ -37,7 +37,7 @@
 #' @export
 #' @examples
 #' learner = lrn("classif.debug")
-#' learner$param_set$values = list(message_train = 1, save_tasks = TRUE)
+#' learner$param_set$set_values(message_train = 1, save_tasks = TRUE)
 #'
 #' # this should signal a message
 #' task = tsk("penguins")
@@ -163,7 +163,7 @@ LearnerClassifDebug = R6Class("LearnerClassifDebug", inherit = LearnerClassif,
       pv = self$param_set$get_values(tags = "train")
       pv$count_marshaling = pv$count_marshaling %??% FALSE
       roll = function(name) {
-        name %in% names(pv) && pv[[name]] > runif(1L)
+        name %chin% names(pv) && pv[[name]] > runif(1L)
       }
 
       if (!is.null(pv$sleep_train)) {
@@ -248,7 +248,7 @@ LearnerClassifDebug = R6Class("LearnerClassifDebug", inherit = LearnerClassif,
       n = task$nrow
       pv = self$param_set$get_values(tags = "predict")
       roll = function(name) {
-        name %in% names(pv) && pv[[name]] > runif(1L)
+        name %chin% names(pv) && pv[[name]] > runif(1L)
       }
 
       if (!is.null(pv$sleep_predict)) {
@@ -281,7 +281,7 @@ LearnerClassifDebug = R6Class("LearnerClassifDebug", inherit = LearnerClassif,
       response = prob = NULL
       missing_type = pv$predict_missing_type %??% "na"
 
-      if ("response" %in% self$predict_type) {
+      if ("response" %chin% self$predict_type) {
         response = rep.int(unclass(model$response), n)
         if (!is.null(pv$predict_missing)) {
           ii = sample.int(n, n * pv$predict_missing)
@@ -292,7 +292,7 @@ LearnerClassifDebug = R6Class("LearnerClassifDebug", inherit = LearnerClassif,
         }
       }
 
-      if ("prob" %in% self$predict_type) {
+      if ("prob" %chin% self$predict_type) {
         cl = task$class_names
         prob = matrix(runif(n * length(cl)), nrow = n)
         prob = prob / rowSums(prob)

diff --git a/R/LearnerClassifFeatureless.R b/R/LearnerClassifFeatureless.R
@@ -34,7 +34,7 @@ LearnerClassifFeatureless = R6Class("LearnerClassifFeatureless", inherit = Learn
       ps = ps(
         method = p_fct(c("mode", "sample", "weighted.sample"), default = "mode", tags = "predict")
       )
-      ps$values = list(method = "mode")
+      ps$set_values(method = "mode")
       super$initialize(
         id = "classif.featureless",
         feature_types = mlr_reflections$task_feature_types,

diff --git a/R/LearnerClassifRpart.R b/R/LearnerClassifRpart.R
@@ -37,7 +37,7 @@ LearnerClassifRpart = R6Class("LearnerClassifRpart", inherit = LearnerClassif,
         usesurrogate   = p_int(0L, 2L, default = 2L, tags = "train"),
         xval           = p_int(0L, default = 10L, tags = "train")
       )
-      ps$values = list(xval = 0L)
+      ps$set_values(xval = 0L)
 
       super$initialize(
         id = "classif.rpart",
@@ -77,7 +77,7 @@ LearnerClassifRpart = R6Class("LearnerClassifRpart", inherit = LearnerClassif,
     .train = function(task) {
       pv = self$param_set$get_values(tags = "train")
       names(pv) = replace(names(pv), names(pv) == "keep_model", "model")
-      if ("weights" %in% task$properties) {
+      if ("weights" %chin% task$properties) {
         pv = insert_named(pv, list(weights = task$weights$weight))
       }
 
@@ -89,11 +89,11 @@ LearnerClassifRpart = R6Class("LearnerClassifRpart", inherit = LearnerClassif,
       newdata = task$data(cols = task$feature_names)
       response = prob = NULL
 
-      if ("response" %in% self$predict_type) {
+      if ("response" %chin% self$predict_type) {
         response = invoke(predict, self$model, newdata = newdata, type = "class",
           .opts = allow_partial_matching, .args = pv)
         response = unname(response)
-      } else if ("prob" %in% self$predict_type) {
+      } else if ("prob" %chin% self$predict_type) {
         prob = invoke(predict, self$model, newdata = newdata, type = "prob",
           .opts = allow_partial_matching, .args = pv)
         rownames(prob) = NULL

diff --git a/R/LearnerRegrFeatureless.R b/R/LearnerRegrFeatureless.R
@@ -23,7 +23,7 @@ LearnerRegrFeatureless = R6Class("LearnerRegrFeatureless", inherit = LearnerRegr
       ps = ps(
         robust = p_lgl(default = TRUE, tags = "train")
       )
-      ps$values = list(robust = FALSE)
+      ps$set_values(robust = FALSE)
 
       super$initialize(
         id = "regr.featureless",

diff --git a/R/LearnerRegrRpart.R b/R/LearnerRegrRpart.R
@@ -37,7 +37,7 @@ LearnerRegrRpart = R6Class("LearnerRegrRpart", inherit = LearnerRegr,
         usesurrogate   = p_int(0L, 2L, default = 2L, tags = "train"),
         xval           = p_int(0L, default = 10L, tags = "train")
       )
-      ps$values = list(xval = 0L)
+      ps$set_values(xval = 0L)
 
       super$initialize(
         id = "regr.rpart",
@@ -77,7 +77,7 @@ LearnerRegrRpart = R6Class("LearnerRegrRpart", inherit = LearnerRegr,
     .train = function(task) {
       pv = self$param_set$get_values(tags = "train")
       names(pv) = replace(names(pv), names(pv) == "keep_model", "model")
-      if ("weights" %in% task$properties) {
+      if ("weights" %chin% task$properties) {
         pv = insert_named(pv, list(weights = task$weights$weight))
       }
 

diff --git a/R/Measure.R b/R/Measure.R
@@ -200,19 +200,19 @@ Measure = R6Class("Measure",
       # check should be added to assert_measure()
       # except when the checks are superfluous for rr$score() and bmr$score()
-      # these checks should be added bellow
+      # these checks should be added below
-      if ("requires_task" %in% self$properties && is.null(task)) {
+      if ("requires_task" %chin% self$properties && is.null(task)) {
         stopf("Measure '%s' requires a task", self$id)
       }
 
-      if ("requires_learner" %in% self$properties && is.null(learner)) {
+      if ("requires_learner" %chin% self$properties && is.null(learner)) {
         stopf("Measure '%s' requires a learner", self$id)
       }
 
       if (!is_scalar_na(self$task_type) && self$task_type != prediction$task_type) {
         stopf("Measure '%s' incompatible with task type '%s'", self$id, prediction$task_type)
       }
 
-      if ("requires_train_set" %in% self$properties && is.null(train_set)) {
+      if ("requires_train_set" %chin% self$properties && is.null(train_set)) {
         stopf("Measure '%s' requires the train_set", self$id)
       }
 
@@ -258,7 +258,7 @@ Measure = R6Class("Measure",
     #' @template field_predict_sets
     predict_sets = function(rhs) {
       if (!missing(rhs)) {
-        private$.predict_sets = assert_subset(rhs, mlr_reflections$predict_sets, empty.ok = "requires_no_prediction" %in% self$properties)
+        private$.predict_sets = assert_subset(rhs, mlr_reflections$predict_sets, empty.ok = "requires_no_prediction" %chin% self$properties)
       }
       private$.predict_sets
     },
@@ -385,7 +385,7 @@ score_single_measure = function(measure, task, learner, train_set, prediction) {
 #' @noRd
 score_measures = function(obj, measures, reassemble = TRUE, view = NULL, iters = NULL) {
   reassemble_learners = reassemble ||
-    some(measures, function(m) any(c("requires_learner", "requires_model") %in% m$properties))
+    some(measures, function(m) any(c("requires_learner", "requires_model") %chin% m$properties))
   tab = get_private(obj)$.data$as_data_table(view = view, reassemble_learners = reassemble_learners, convert_predictions = FALSE)
 
   if (!is.null(iters)) {

diff --git a/R/MeasureClassifCosts.R b/R/MeasureClassifCosts.R
@@ -45,7 +45,7 @@ MeasureClassifCosts = R6Class("MeasureClassifCosts",
     #' Creates a new instance of this [R6][R6::R6Class] class.
     initialize = function() {
       param_set = ps(normalize = p_lgl(tags = "required"))
-      param_set$values = list(normalize = TRUE)
+      param_set$set_values(normalize = TRUE)
 
       super$initialize(
         id = "classif.costs",

diff --git a/R/MeasureDebug.R b/R/MeasureDebug.R
@@ -27,7 +27,7 @@ MeasureDebugClassif = R6Class("MeasureDebugClassif",
     #' Creates a new instance of this [R6][R6::R6Class] class.
     initialize = function() {
       param_set = ps(na_ratio = p_dbl(0, 1, tags = "required"))
-      param_set$values = list(na_ratio = 0)
+      param_set$set_values(na_ratio = 0)
       super$initialize(
         id = "debug_classif",
         param_set = param_set,

diff --git a/R/MeasureSelectedFeatures.R b/R/MeasureSelectedFeatures.R
@@ -31,7 +31,7 @@ MeasureSelectedFeatures = R6Class("MeasureSelectedFeatures",
     #' Creates a new instance of this [R6][R6::R6Class] class.
     initialize = function() {
       param_set = ps(normalize = p_lgl(tags = "required"))
-      param_set$values = list(normalize = FALSE)
+      param_set$set_values(normalize = FALSE)
 
       super$initialize(
         id = "selected_features",