From aa613cba161b839bd57d35071d39c464d114d13a Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Thu, 27 Jun 2024 13:35:50 +1200 Subject: [PATCH 1/4] bump [compat] MLJModelInterface = "1.11", StatisticalTraits = "3.4" --- Project.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index 792ffb6a..d39569e1 100644 --- a/Project.toml +++ b/Project.toml @@ -47,7 +47,7 @@ DelimitedFiles = "1" Distributions = "0.25.3" InvertedIndices = "1" LearnAPI = "0.1" -MLJModelInterface = "1.10" +MLJModelInterface = "1.11" Missings = "0.4, 1" OrderedCollections = "1.1" Parameters = "0.12" @@ -58,7 +58,7 @@ Reexport = "1.2" ScientificTypes = "3" StatisticalMeasures = "0.1.1" StatisticalMeasuresBase = "0.1.1" -StatisticalTraits = "3.3" +StatisticalTraits = "3.4" Statistics = "1" StatsBase = "0.32, 0.33, 0.34" Tables = "0.2, 1.0" From cb152085d66fbd8847130af091657254e9dff2c1 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Thu, 27 Jun 2024 13:37:07 +1200 Subject: [PATCH 2/4] bump 1.6 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index d39569e1..80203bc0 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "MLJBase" uuid = "a7f614a8-145f-11e9-1d2a-a57a1082229d" authors = ["Anthony D. Blaom "] -version = "1.5.0" +version = "1.6" [deps] CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" From 4d37eedd5dce1d3f43409880181ae25b2394eb4e Mon Sep 17 00:00:00 2001 From: "Anthony D. 
Blaom" Date: Thu, 27 Jun 2024 15:15:25 +1200 Subject: [PATCH 3/4] make pipelines support `Unsupervised` with target in fit --- src/composition/models/pipelines.jl | 18 ++++++++++++-- test/composition/models/pipelines.jl | 35 ++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 2 deletions(-) diff --git a/src/composition/models/pipelines.jl b/src/composition/models/pipelines.jl index 0ea85297..fb7682a9 100644 --- a/src/composition/models/pipelines.jl +++ b/src/composition/models/pipelines.jl @@ -225,6 +225,15 @@ implements it (some clustering models). Similarly, calling `transform` on a supervised pipeline calls `transform` on the supervised component. +### Transformers that need a target in training + +Some transformers that have type `Unsupervised` (so that the output of `transform` is +propagated in pipelines) also see a target variable in training. An example are so-called +target encoders (which transform categorical input features, based on some target +observations). Provided they appear before any `Supervised` component in the pipelines, +such models are supported. Of course a target must be provided whenever training such a +pipeline, whether or not it contains a `Supervised` component. + ### Optional key-word arguments - `prediction_type` - @@ -444,9 +453,13 @@ function extend(front::Front{Pred}, ::Static, name, cache, args...) Front(transform(mach, active(front)), front.transform, Pred()) end -function extend(front::Front{Trans}, component::Unsupervised, name, cache, args...) +function extend(front::Front{Trans}, component::Unsupervised, name, cache, ::Any, sources...) 
a = active(front) - mach = machine(name, a; cache=cache) + if target_in_fit(component) + mach = machine(name, a, first(sources); cache=cache) + else + mach = machine(name, a; cache=cache) + end Front(predict(mach, a), transform(mach, a), Trans()) end @@ -598,6 +611,7 @@ function MMI.iteration_parameter(pipe::SupervisedPipeline) end MMI.target_scitype(p::SupervisedPipeline) = target_scitype(supervised_component(p)) +MMI.target_in_fit(p::SomePipeline) = any(target_in_fit, components(p)) MMI.package_name(::Type{<:SomePipeline}) = "MLJBase" MMI.load_path(::Type{<:SomePipeline}) = "MLJBase.Pipeline" diff --git a/test/composition/models/pipelines.jl b/test/composition/models/pipelines.jl index e213cdc2..c90143a7 100644 --- a/test/composition/models/pipelines.jl +++ b/test/composition/models/pipelines.jl @@ -544,6 +544,7 @@ end # inverse transform: p = Pipeline(UnivariateBoxCoxTransformer, UnivariateStandardizer) + @test !target_in_fit(p) xtrain = rand(rng, 10) mach = machine(p, xtrain) fit!(mach, verbosity=0) @@ -702,6 +703,40 @@ end @test Set(features) == Set(keys(X)) end +struct SupervisedTransformer <: Unsupervised end + +MLJBase.fit(::SupervisedTransformer, verbosity, X, y) = (mean(y), nothing, nothing) +MLJBase.transform(::SupervisedTransformer, fitresult, X) = + fitresult*MLJBase.matrix(X) |> MLJBase.table +MLJBase.target_in_fit(::Type{<:SupervisedTransformer}) = true + +struct DummyTransformer <: Unsupervised end +MLJBase.fit(::DummyTransformer, verbosity, X) = (nothing, nothing, nothing) +MLJBase.transform(::DummyTransformer, fitresult, X) = X + +@testset "supervised transformers in a pipeline" begin + X = MLJBase.table((a=fill(10.0, 3),)) + y = fill(2, 3) + pipe = SupervisedTransformer() |> DeterministicConstantRegressor() + @test target_in_fit(pipe) + mach = machine(pipe, X, y) + fit!(mach, verbosity=0) + @test predict(mach, X) == fill(2.0, 3) + + pipe2 = DummyTransformer |> pipe + @test target_in_fit(pipe2) + mach = machine(pipe2, X, y) + fit!(mach, 
verbosity=0) + @test predict(mach, X) == fill(2.0, 3) + + pipe3 = DummyTransformer |> SupervisedTransformer |> DummyTransformer + @test target_in_fit(pipe3) + mach = machine(pipe3, X, y) + fit!(mach, verbosity=0) + @test transform(mach, X).x1 == fill(20.0, 3) +end + + end # module true From 63344b7471a640b3ea8b64126fd616670bbf9e21 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Wed, 3 Jul 2024 08:01:59 +1200 Subject: [PATCH 4/4] tweak docstring --- src/composition/models/pipelines.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/composition/models/pipelines.jl b/src/composition/models/pipelines.jl index fb7682a9..0d3c4c6f 100644 --- a/src/composition/models/pipelines.jl +++ b/src/composition/models/pipelines.jl @@ -228,11 +228,11 @@ component. ### Transformers that need a target in training Some transformers that have type `Unsupervised` (so that the output of `transform` is -propagated in pipelines) also see a target variable in training. An example are so-called -target encoders (which transform categorical input features, based on some target -observations). Provided they appear before any `Supervised` component in the pipelines, -such models are supported. Of course a target must be provided whenever training such a -pipeline, whether or not it contains a `Supervised` component. +propagated in pipelines) may require a target variable for training. Examples are +so-called target encoders (which transform categorical input features, based on some +target observations). Provided they appear before any `Supervised` component in the +pipeline, such models are supported. Of course, a target must be provided whenever +training such a pipeline, whether or not it contains a `Supervised` component. ### Optional key-word arguments