Add support in pipelines for Unsupervised models for which target_in_fit is true #984

Merged 4 commits on Jul 2, 2024
6 changes: 3 additions & 3 deletions Project.toml
@@ -1,7 +1,7 @@
name = "MLJBase"
uuid = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
authors = ["Anthony D. Blaom <[email protected]>"]
version = "1.5.0"
version = "1.6"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
@@ -47,7 +47,7 @@ DelimitedFiles = "1"
Distributions = "0.25.3"
InvertedIndices = "1"
LearnAPI = "0.1"
MLJModelInterface = "1.10"
MLJModelInterface = "1.11"
Missings = "0.4, 1"
OrderedCollections = "1.1"
Parameters = "0.12"
@@ -58,7 +58,7 @@ Reexport = "1.2"
ScientificTypes = "3"
StatisticalMeasures = "0.1.1"
StatisticalMeasuresBase = "0.1.1"
StatisticalTraits = "3.3"
StatisticalTraits = "3.4"
Statistics = "1"
StatsBase = "0.32, 0.33, 0.34"
Tables = "0.2, 1.0"
18 changes: 16 additions & 2 deletions src/composition/models/pipelines.jl
@@ -225,6 +225,15 @@ implements it (some clustering models). Similarly, calling `transform`
on a supervised pipeline calls `transform` on the supervised
component.

### Transformers that need a target in training

Some transformers that have type `Unsupervised` (so that the output of `transform` is
propagated in pipelines) may nevertheless require a target variable for training. An
example is the class of so-called target encoders, which transform categorical input
features based on observations of the target. Such models are supported, provided they
appear before any `Supervised` component in the pipeline. Of course, a target must then
be provided whenever training such a pipeline, whether or not it also contains a
`Supervised` component.
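
For example, assuming `FakeTargetEncoder` is an `Unsupervised` model with
`target_in_fit == true`, and `RidgeRegressor` is some `Supervised` model (both names are
purely illustrative), such a pipeline is built and trained in the usual way, given
training data `X`, `y` and new input `Xnew`:

```julia
pipe = FakeTargetEncoder() |> RidgeRegressor()
mach = machine(pipe, X, y)  # `y` is needed to train the encoder, not just the regressor
fit!(mach)
yhat = predict(mach, Xnew)
```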

### Optional key-word arguments

- `prediction_type` -
@@ -444,9 +453,13 @@ function extend(front::Front{Pred}, ::Static, name, cache, args...)
Front(transform(mach, active(front)), front.transform, Pred())
end

function extend(front::Front{Trans}, component::Unsupervised, name, cache, args...)
function extend(front::Front{Trans}, component::Unsupervised, name, cache, ::Any, sources...)
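    # components declaring `target_in_fit` also need the target (assumed to be the
    # first of the remaining `sources`) bound to their training machine: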
    a = active(front)
    mach = machine(name, a; cache=cache)
    if target_in_fit(component)
        mach = machine(name, a, first(sources); cache=cache)
    else
        mach = machine(name, a; cache=cache)
    end
    Front(predict(mach, a), transform(mach, a), Trans())
end

@@ -598,6 +611,7 @@ function MMI.iteration_parameter(pipe::SupervisedPipeline)
end

MMI.target_scitype(p::SupervisedPipeline) = target_scitype(supervised_component(p))
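# a pipeline requires a target in training precisely when at least one of its components does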
MMI.target_in_fit(p::SomePipeline) = any(target_in_fit, components(p))

MMI.package_name(::Type{<:SomePipeline}) = "MLJBase"
MMI.load_path(::Type{<:SomePipeline}) = "MLJBase.Pipeline"
35 changes: 35 additions & 0 deletions test/composition/models/pipelines.jl
@@ -544,6 +544,7 @@ end
# inverse transform:
p = Pipeline(UnivariateBoxCoxTransformer,
             UnivariateStandardizer)
@test !target_in_fit(p)
xtrain = rand(rng, 10)
mach = machine(p, xtrain)
fit!(mach, verbosity=0)
@@ -702,6 +703,40 @@ end
@test Set(features) == Set(keys(X))
end

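# a minimal transformer that uses the target during training: its `transform` scales
# the input features by the mean of the training target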
struct SupervisedTransformer <: Unsupervised end

MLJBase.fit(::SupervisedTransformer, verbosity, X, y) = (mean(y), nothing, nothing)
MLJBase.transform(::SupervisedTransformer, fitresult, X) =
    fitresult*MLJBase.matrix(X) |> MLJBase.table
MLJBase.target_in_fit(::Type{<:SupervisedTransformer}) = true

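# a transformer that ignores the target; its `transform` is the identity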
struct DummyTransformer <: Unsupervised end
MLJBase.fit(::DummyTransformer, verbosity, X) = (nothing, nothing, nothing)
MLJBase.transform(::DummyTransformer, fitresult, X) = X

@testset "supervised transformers in a pipeline" begin
X = MLJBase.table((a=fill(10.0, 3),))
y = fill(2, 3)
pipe = SupervisedTransformer() |> DeterministicConstantRegressor()
@test target_in_fit(pipe)
mach = machine(pipe, X, y)
fit!(mach, verbosity=0)
@test predict(mach, X) == fill(2.0, 3)

pipe2 = DummyTransformer |> pipe
@test target_in_fit(pipe2)
mach = machine(pipe2, X, y)
fit!(mach, verbosity=0)
@test predict(mach, X) == fill(2.0, 3)

pipe3 = DummyTransformer |> SupervisedTransformer |> DummyTransformer
@test target_in_fit(pipe3)
mach = machine(pipe3, X, y)
fit!(mach, verbosity=0)
@test transform(mach, X).x1 == fill(20.0, 3)
end


end # module

true