From 22ef4e359d964ef3785908f0d3af10cc0836980a Mon Sep 17 00:00:00 2001 From: tiemvanderdeure Date: Tue, 26 Nov 2024 12:58:48 +0100 Subject: [PATCH 1/7] maxnet is registered --- docs/src/usage/quickstart.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/src/usage/quickstart.md b/docs/src/usage/quickstart.md index 8f46e73..86982b6 100644 --- a/docs/src/usage/quickstart.md +++ b/docs/src/usage/quickstart.md @@ -3,10 +3,10 @@ CurrentModule = Maxnet ``` ## Installation -Maxnet.jl is not yet registered - install by running +Install the latest version of Maxnet.jl by running ```julia ] -add https://github.com/tiemvanderdeure/Maxnet.jl +add Maxnet ``` ## Basic usage @@ -31,7 +31,7 @@ There are numerous settings that can be tweaked to change the model fit. These a ### Model settings The two most important settings to change when running Maxnet is the feature classes selected and the regularization factor. -By default, the feature classes selected depends on the number of presence points, see [Maxnet.default_features](@ref). To set them manually, specify the `features` keyword using either a `Vector` of `AbstractFeatureClass`, or a `string`, where `l` represents `LinearFeature` and `CategoricalFeature`, `q` represents `QuadraticFeature`, `p` represents `ProductFeature`, `t` represents `ThresholdFeature` and `h` represents `HingeFeature`. +By default, the feature classes selected depends on the number of presence points, see [default_features](@ref). To set them manually, specify the `features` keyword using either a `Vector` of `AbstractFeatureClass`, or a `string`, where `l` represents `LinearFeature` and `CategoricalFeature`, `q` represents `QuadraticFeature`, `p` represents `ProductFeature`, `t` represents `ThresholdFeature` and `h` represents `HingeFeature`. 
For example: ```julia From 1742c96a67cf0438145f24c54d6775e95f82c289 Mon Sep 17 00:00:00 2001 From: Tiem van der Deure Date: Tue, 3 Dec 2024 13:22:41 +0100 Subject: [PATCH 2/7] fully implement MLJ (#22) * add mlj docstring * test with MLJTestInterface * throw a helpful error if input data only has one class * mljtestinterface is not a dep (oops) * move allequal error to main function * fix allequal error * fix tests * add MLJBase as docs dep * fix mlj doctest * attempt fix of multiclass printing * use @example instead of jldoctest * test for no failures in mlj interface test --- Project.toml | 3 ++- docs/Project.toml | 1 + src/maxnet_function.jl | 5 +++++ src/mlj_interface.jl | 44 +++++++++++++++++++----------------------- test/runtests.jl | 15 ++++++++++++-- 5 files changed, 41 insertions(+), 27 deletions(-) diff --git a/Project.toml b/Project.toml index 034d148..aa30d6f 100644 --- a/Project.toml +++ b/Project.toml @@ -36,7 +36,8 @@ julia = "1.9" [extras] DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d" +MLJTestInterface = "72560011-54dd-4dc2-94f3-c5de45b75ecd" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["DelimitedFiles", "MLJBase", "Test"] +test = ["DelimitedFiles", "MLJBase", "MLJTestInterface", "Test"] diff --git a/docs/Project.toml b/docs/Project.toml index 491d2dd..3f5ef3c 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,3 +1,4 @@ [deps] Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d" Maxnet = "81f79f80-22f2-4e41-ab86-00c11cf0f26f" diff --git a/src/maxnet_function.jl b/src/maxnet_function.jl index 9e87636..fd414ac 100644 --- a/src/maxnet_function.jl +++ b/src/maxnet_function.jl @@ -49,6 +49,11 @@ function maxnet( n_knots::Int = 50, kw...) + if allequal(presences) + pa = first(presences) ? "presences" : "absences" + throw(ArgumentError("All data points are $pa. 
Maxnet will only work with at least some presences and some absences.")) + end + _maxnet( presences, predictors, diff --git a/src/mlj_interface.jl b/src/mlj_interface.jl index 878a21d..e081801 100644 --- a/src/mlj_interface.jl +++ b/src/mlj_interface.jl @@ -24,30 +24,6 @@ function MaxnetBinaryClassifier(; ) end -""" - MaxnetBinaryClassifier - - A model type for fitting a maxnet model using `MLJ`. - - Use `MaxnetBinaryClassifier()` to create an instance with default parameters, or use keyword arguments to specify parameters. - - The keywords `link`, and `clamp` are passed to [`Maxnet.predict`](@ref), while all other keywords are passed to [`maxnet`](@ref). - See the documentation of these functions for the meaning of these parameters and their defaults. - - # Example - ```jldoctest - using Maxnet, MLJBase - p_a, env = Maxnet.bradypus() - - mach = machine(MaxnetBinaryClassifier(features = "lqp"), env, categorical(p_a)) - fit!(mach) - yhat = MLJBase.predict(mach, env) - # output - ``` - -""" -MaxnetBinaryClassifier - MMI.metadata_pkg( MaxnetBinaryClassifier; name = "Maxnet", @@ -67,6 +43,26 @@ MMI.metadata_model( reports_feature_importances=false ) +""" +$(MMI.doc_header(MaxnetBinaryClassifier)) + +The keywords `link`, and `clamp` are passed to [`predict`](@ref), while all other keywords are passed to [`maxnet`](@ref). +See the documentation of these functions for the meaning of these parameters and their defaults. 
+ +# Example +```@example +using MLJBase +p_a, env = Maxnet.bradypus() + +mach = machine(MaxnetBinaryClassifier(features = "lqp"), env, categorical(p_a), scitype_check_level = 0) +fit!(mach, verbosity = 0) +yhat = MLJBase.predict(mach, env) + +``` + +""" +MaxnetBinaryClassifier + function MMI.fit(m::MaxnetBinaryClassifier, verbosity::Int, X, y) # convert categorical to boolean y_boolean = Bool.(MMI.int(y) .- 1) diff --git a/test/runtests.jl b/test/runtests.jl index ed8e29a..e776b16 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,5 +1,7 @@ -using Maxnet, Test, Statistics, CategoricalArrays +using Maxnet, Statistics, CategoricalArrays, MLJTestInterface +using Test +# read in Bradypus data p_a, env = Maxnet.bradypus() # Make the levels in ecoreg string to make sure that that works env = merge(env, (; ecoreg = recode(env.ecoreg, (l => string(l) for l in levels(env.ecoreg))...))) @@ -82,9 +84,18 @@ end m = maxnet(p_a, env; features = "lq", addsamplestobackground = false) @test m_w.entropy > m.entropy end -m = maxnet(p_a, env; features = "lq", addsamplestobackground = false) @testset "MLJ" begin + data = MLJTestInterface.make_binary() + failures, summary = MLJTestInterface.test( + [MaxnetBinaryClassifier], + data...; + mod=@__MODULE__, + verbosity=0, # bump to debug + throw=false, # set to true to debug + ) + @test isempty(failures) + using MLJBase mn = Maxnet.MaxnetBinaryClassifier From 9a224f3a568530984bef97c39790a6ef31de15e8 Mon Sep 17 00:00:00 2001 From: tiemvanderdeure Date: Tue, 26 Nov 2024 12:58:48 +0100 Subject: [PATCH 3/7] maxnet is registered --- docs/src/usage/quickstart.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/src/usage/quickstart.md b/docs/src/usage/quickstart.md index 8f46e73..86982b6 100644 --- a/docs/src/usage/quickstart.md +++ b/docs/src/usage/quickstart.md @@ -3,10 +3,10 @@ CurrentModule = Maxnet ``` ## Installation -Maxnet.jl is not yet registered - install by running +Install the latest version of 
Maxnet.jl by running ```julia ] -add https://github.com/tiemvanderdeure/Maxnet.jl +add Maxnet ``` ## Basic usage @@ -31,7 +31,7 @@ There are numerous settings that can be tweaked to change the model fit. These a ### Model settings The two most important settings to change when running Maxnet is the feature classes selected and the regularization factor. -By default, the feature classes selected depends on the number of presence points, see [Maxnet.default_features](@ref). To set them manually, specify the `features` keyword using either a `Vector` of `AbstractFeatureClass`, or a `string`, where `l` represents `LinearFeature` and `CategoricalFeature`, `q` represents `QuadraticFeature`, `p` represents `ProductFeature`, `t` represents `ThresholdFeature` and `h` represents `HingeFeature`. +By default, the feature classes selected depends on the number of presence points, see [default_features](@ref). To set them manually, specify the `features` keyword using either a `Vector` of `AbstractFeatureClass`, or a `string`, where `l` represents `LinearFeature` and `CategoricalFeature`, `q` represents `QuadraticFeature`, `p` represents `ProductFeature`, `t` represents `ThresholdFeature` and `h` represents `HingeFeature`. For example: ```julia From 739b243ef6658f10f7ff7606624e0c97e3b7a0db Mon Sep 17 00:00:00 2001 From: tiemvanderdeure Date: Thu, 23 Jan 2025 16:59:44 +0100 Subject: [PATCH 4/7] more MLJ docs --- src/mlj_interface.jl | 59 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 54 insertions(+), 5 deletions(-) diff --git a/src/mlj_interface.jl b/src/mlj_interface.jl index e081801..83ab240 100644 --- a/src/mlj_interface.jl +++ b/src/mlj_interface.jl @@ -46,16 +46,65 @@ MMI.metadata_model( """ $(MMI.doc_header(MaxnetBinaryClassifier)) -The keywords `link`, and `clamp` are passed to [`predict`](@ref), while all other keywords are passed to [`maxnet`](@ref). -See the documentation of these functions for the meaning of these parameters and their defaults. 
+# Training data + +In MLJ or MLJBase, bind an instance `model` to data with + + mach = machine(model, X, y) + +where + +- `X`: any table of input features (eg, a `DataFrame`) whose columns + each have one of the following element scitypes: `Continuous` or `<:Multiclass`. + +- `y`: is the target, which can be any `AbstractVector` whose element + scitype is `<:Binary`. The first class should refer to background values, + and the second class to presence values. + +# Hyper-parameters + +- `features`: Specifies which feature classes to use in the model, e.g. "lqh" for linear, quadratic and hinge features. + See also [Maxnet.maxnet](@ref) +- `regularization_multiplier = 1.0`: Adjust how tightly the model will fit. Increasing this will reduce overfitting. +- `regularization_function`: A function to compute the regularization of each feature class. Defaults to `Maxnet.default_regularization` +- `addsamplestobackground = true`: Controls wether to add presence values to the background. +- `n_knots = 50`: The number of knots used for Threshold and Hinge features. A higher number gives more flexibility for these features. +- `weight_factor = 100.0`: A `Float64` value to adjust the weight of the background samples. +- `link = CloglogLink()`: The link function to use when predicting. See `Maxnet.predict` +- `clamp = false`: Clamp values passed to `MLJBase.predict` to the range the model was trained on. + +# Operations + +- `predict(mach, Xnew)`: return predictions of the target given + features `Xnew` having the same scitype as `X` above. Predictions are + probabilistic and can be interpreted as the probability of presence. 
+ +# Fitted Parameters + +The fields of `fitted_params(mach)` are: + +- `fitresult`: A `Tuple` where the first entry is the `Maxnet.MaxnetModel` returned by the Maxnet algorithm + and the second entry is the classes of `y` + +# Report + +The fields of `report(mach)` are: + +- `selected_variables`: A `Vector` of `Symbols` of the variables that were selected. +- `selected_features`: A `Vector` of `Maxnet.ModelMatrixColumn` with the features that were selected. +- `complexity`: the number of selected features in the model. + # Example + ```@example -using MLJBase +using MLJBase, Maxnet p_a, env = Maxnet.bradypus() +y = coerce(p_a, Binary) +X = coerce(env, Count => Continuous) -mach = machine(MaxnetBinaryClassifier(features = "lqp"), env, categorical(p_a), scitype_check_level = 0) -fit!(mach, verbosity = 0) +mach = machine(MaxnetBinaryClassifier(features = "lqp"), X, y) +fit!(mach) yhat = MLJBase.predict(mach, env) ``` From 46b8b387d0c1c86e4e6badfac653301f6870dd94 Mon Sep 17 00:00:00 2001 From: tiemvanderdeure Date: Thu, 23 Jan 2025 17:01:36 +0100 Subject: [PATCH 5/7] small tweaks to core function docs --- src/maxnet_function.jl | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/maxnet_function.jl b/src/maxnet_function.jl index fd414ac..b4b8578 100644 --- a/src/maxnet_function.jl +++ b/src/maxnet_function.jl @@ -16,9 +16,11 @@ - `features`: Either a `Vector` of `AbstractFeatureClass` to be used in the model, or a `String` where "l" = linear and categorical, "q" = quadratic, "p" = product, "t" = threshold, "h" = hinge (e.g. "lqh"); or By default, the features are based on the number of presences are used. See [`default_features`](@ref) -- `regularization_multiplier`: A constant to adjust regularization, where a higher `regularization_multiplier` results in a higher penalization for features -- `regularization_function`: A function to compute a regularization for each feature. A default `regularization_function` is built in. 
-- `addsamplestobackground`: A boolean, where `true` adds the background samples to the predictors. Defaults to `true`. +- `regularization_multiplier`: A constant to adjust regularization, where a higher `regularization_multiplier` results in a higher + penalization for features and therefore less overfitting. +- `regularization_function`: A function to compute a regularization for each feature. A default `regularization_function` is built in + and should be used in most cases. +- `addsamplestobackground`: Whether to add presence values to the background. Defaults to `true`. - `n_knots`: the number of knots used for Threshold and Hinge features. Defaults to 50. Ignored if there are neither Threshold nor Hinge features - `weight_factor`: A `Float64` value to adjust the weight of the background samples. Defaults to 100.0. - `kw...`: Further arguments to be passed to `GLMNet.glmnet` @@ -32,6 +34,7 @@ using Maxnet p_a, env = Maxnet.bradypus(); bradypus_model = maxnet(p_a, env; features = "lq") +# Output Fit Maxnet model Features classes: Maxnet.AbstractFeatureClass[LinearFeature(), CategoricalFeature(), QuadraticFeature()] Entropy: 6.114650341746531 From fa121e219743e14bdcc026da0ffe0cab921e63cb Mon Sep 17 00:00:00 2001 From: Tiem van der Deure Date: Mon, 27 Jan 2025 10:00:59 +0100 Subject: [PATCH 6/7] add check scitypes Co-authored-by: Anthony Blaom, PhD --- src/mlj_interface.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mlj_interface.jl b/src/mlj_interface.jl index 83ab240..990a549 100644 --- a/src/mlj_interface.jl +++ b/src/mlj_interface.jl @@ -55,7 +55,8 @@ In MLJ or MLJBase, bind an instance `model` to data with where - `X`: any table of input features (eg, a `DataFrame`) whose columns - each have one of the following element scitypes: `Continuous` or `<:Multiclass`. + each have one of the following element scitypes: `Continuous` or `<:Multiclass`. Check + `scitypes` with `schema(X)`. 
- `y`: is the target, which can be any `AbstractVector` whose element scitype is `<:Binary`. The first class should refer to background values, From e71981207cc892ca2ae5b5526ab6f53547dbb072 Mon Sep 17 00:00:00 2001 From: Tiem van der Deure Date: Mon, 27 Jan 2025 10:01:48 +0100 Subject: [PATCH 7/7] Clogloglink is from Maxnet Co-authored-by: Anthony Blaom, PhD --- src/mlj_interface.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mlj_interface.jl b/src/mlj_interface.jl index 990a549..f98129c 100644 --- a/src/mlj_interface.jl +++ b/src/mlj_interface.jl @@ -71,7 +71,7 @@ where - `addsamplestobackground = true`: Controls wether to add presence values to the background. - `n_knots = 50`: The number of knots used for Threshold and Hinge features. A higher number gives more flexibility for these features. - `weight_factor = 100.0`: A `Float64` value to adjust the weight of the background samples. -- `link = CloglogLink()`: The link function to use when predicting. See `Maxnet.predict` +- `link = Maxnet.CloglogLink()`: The link function to use when predicting. See `Maxnet.predict` - `clamp = false`: Clamp values passed to `MLJBase.predict` to the range the model was trained on. # Operations