From b42c1bf894e37098f682f93a8ca3ed0ce56efc23 Mon Sep 17 00:00:00 2001 From: Olivier Labayle Date: Mon, 7 Mar 2022 17:30:13 +0000 Subject: [PATCH 1/6] make the stack's resampling scheme more general --- src/composition/models/stacking.jl | 9 +++------ test/composition/models/stacking.jl | 4 ++-- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/src/composition/models/stacking.jl b/src/composition/models/stacking.jl index 14492c15..bb7bb6b9 100644 --- a/src/composition/models/stacking.jl +++ b/src/composition/models/stacking.jl @@ -249,11 +249,8 @@ MLJBase.package_license(::Type{<:Stack}) = "MIT" ########################################################### -getfolds(y::AbstractNode, cv::CV, n::Int) = - source(train_test_pairs(cv, 1:n)) - -getfolds(y::AbstractNode, cv::StratifiedCV, n::Int) = - node(YY->train_test_pairs(cv, 1:n, YY), y) +getfolds(X::AbstractNode, y::AbstractNode, cv, n::Int) = + node((XX, YY) -> train_test_pairs(cv, 1:n, XX, YY), X, y) trainrows(X::AbstractNode, folds::AbstractNode, nfold) = node((XX, ff) -> selectrows(XX, ff[nfold][1]), X, folds) @@ -435,7 +432,7 @@ function fit(m::Stack, verbosity::Int, X, y) Xs = source(X) ys = source(y) - folds = getfolds(ys, m.resampling, n) + folds = getfolds(Xs, ys, m.resampling, n) Zval, yval, folds_evaluations = oos_set(m, folds, Xs, ys) diff --git a/test/composition/models/stacking.jl b/test/composition/models/stacking.jl index f8e26f6d..909724cd 100644 --- a/test/composition/models/stacking.jl +++ b/test/composition/models/stacking.jl @@ -247,11 +247,11 @@ end stack = Stack(metalearner=judge, model1=model1, model2=model2, - resampling=CV(;nfolds=3, shuffle=true, rng=rng)) + resampling=CV(;nfolds=3, shuffle=false)) Xs = source(X) ys = source(y) - folds = MLJBase.getfolds(ys, stack.resampling, n) + folds = MLJBase.getfolds(Xs, ys, stack.resampling, n) Zval, yval, folds_evaluations = MLJBase.oos_set(stack, folds, Xs, ys) From 92e15345e3333cf80dc1b32af879203c3156c9c9 Mon Sep 17 00:00:00 2001 From: 
Olivier Labayle Date: Tue, 8 Mar 2022 09:23:31 +0000 Subject: [PATCH 2/6] update oos_set to work for arbitrary resampling implementing the train_test_pairs --- src/composition/models/stacking.jl | 33 ++++++++++------------------- test/composition/models/stacking.jl | 13 ++++++------ 2 files changed, 18 insertions(+), 28 deletions(-) diff --git a/src/composition/models/stacking.jl b/src/composition/models/stacking.jl index bb7bb6b9..5b47c76f 100644 --- a/src/composition/models/stacking.jl +++ b/src/composition/models/stacking.jl @@ -248,16 +248,8 @@ MLJBase.package_license(::Type{<:Stack}) = "MIT" ################# Node operations Methods ################# ########################################################### - -getfolds(X::AbstractNode, y::AbstractNode, cv, n::Int) = - node((XX, YY) -> train_test_pairs(cv, 1:n, XX, YY), X, y) - -trainrows(X::AbstractNode, folds::AbstractNode, nfold) = - node((XX, ff) -> selectrows(XX, ff[nfold][1]), X, folds) - -testrows(X::AbstractNode, folds::AbstractNode, nfold) = - node((XX, ff) -> selectrows(XX, ff[nfold][2]), X, folds) - +selectrows(X::AbstractNode, idx) = + node(X-> selectrows(X, idx), X) pre_judge_transform(ŷ::Node, ::Type{<:Probabilistic}, ::Type{<:AbstractArray{<:Finite}}) = node(ŷ -> pdf(ŷ, levels(first(ŷ))), ŷ) @@ -382,16 +374,16 @@ This function is building the out-of-sample dataset that is later used by the `j for its own training. It also returns the folds_evaluations object if internal cross-validation results are requested. """ -function oos_set(m::Stack, folds::AbstractNode, Xs::Source, ys::Source) +function oos_set(m::Stack, Xs::Source, ys::Source, ttp) Zval = [] yval = [] folds_evaluations = [] # Loop over the cross validation folds to build a training set for the metalearner. 
- for nfold in 1:m.resampling.nfolds - Xtrain = trainrows(Xs, folds, nfold) - ytrain = trainrows(ys, folds, nfold) - Xtest = testrows(Xs, folds, nfold) - ytest = testrows(ys, folds, nfold) + for (training_rows, test_rows) in ttp + Xtrain = selectrows(Xs, training_rows) + ytrain = selectrows(ys, training_rows) + Xtest = selectrows(Xs, test_rows) + ytest = selectrows(ys, test_rows) # Train each model on the train fold and predict on the validation fold # predictions are subsequently used as an input to the metalearner @@ -426,15 +418,12 @@ end """ function fit(m::Stack, verbosity::Int, X, y) check_stack_measures(m, verbosity, m.measures, y) - - n = nrows(y) + ttp = train_test_pairs(m.resampling, 1:nrows(y), X, y) Xs = source(X) ys = source(y) - - folds = getfolds(Xs, ys, m.resampling, n) - - Zval, yval, folds_evaluations = oos_set(m, folds, Xs, ys) + + Zval, yval, folds_evaluations = oos_set(m, Xs, ys, ttp) metamach = machine(m.metalearner, Zval, yval) diff --git a/test/composition/models/stacking.jl b/test/composition/models/stacking.jl index 909724cd..b8f493a3 100644 --- a/test/composition/models/stacking.jl +++ b/test/composition/models/stacking.jl @@ -247,13 +247,14 @@ end stack = Stack(metalearner=judge, model1=model1, model2=model2, - resampling=CV(;nfolds=3, shuffle=false)) + resampling=CV(;nfolds=3, shuffle=true, rng=StableRNG(123))) Xs = source(X) ys = source(y) - folds = MLJBase.getfolds(Xs, ys, stack.resampling, n) + + ttp = MLJBase.train_test_pairs(stack.resampling, 1:n, X, y) - Zval, yval, folds_evaluations = MLJBase.oos_set(stack, folds, Xs, ys) + Zval, yval, folds_evaluations = MLJBase.oos_set(stack, Xs, ys, ttp) # No internal measure has been provided so the resulting # folds_evaluations contain nothing @@ -270,19 +271,19 @@ end # The lines of yval should match the reordering indexes # of the original y (reordering given by the folds node) - reordering = vcat([x[2] for x in folds()]...) + reordering = vcat([x[2] for x in ttp]...) 
@test yval() == y[reordering] # And the same is true for Zval, let's check this for model1's output # on the first fold, ie (2 first rows, 3 first columns) # First we need to train the model - trainingrows = folds()[1][1] + trainingrows = ttp[1][1] Xtrain = selectrows(X, trainingrows) ytrain = selectrows(y, trainingrows) mach = machine(model1, Xtrain, ytrain) fit!(mach, verbosity=0) # Then predict on the validation rows - Xpred = selectrows(X, folds()[1][2]) + Xpred = selectrows(X, ttp[1][2]) Zval_expected_dist = predict(mach, Xpred) # This is a distribution, we need to apply the appropriate transformation Zval_expected = pdf(Zval_expected_dist, levels(first(Zval_expected_dist))) From adeac7bea078b3f38c40bcf9a9a9b19d4d910e96 Mon Sep 17 00:00:00 2001 From: Olivier Labayle Date: Tue, 8 Mar 2022 09:45:24 +0000 Subject: [PATCH 3/6] add Holdout test and update internal_stack_report --- src/composition/models/stacking.jl | 18 +++++++++--------- test/composition/models/stacking.jl | 27 +++++++++++++++++++++++---- 2 files changed, 32 insertions(+), 13 deletions(-) diff --git a/src/composition/models/stacking.jl b/src/composition/models/stacking.jl index 5b47c76f..56d4b5d3 100644 --- a/src/composition/models/stacking.jl +++ b/src/composition/models/stacking.jl @@ -272,7 +272,7 @@ end When measure/measures is a Nothing, the folds_evaluation won't have been filled by `store_for_evaluation` and we thus return an empty NamedTuple. 
""" -internal_stack_report(m::Stack, verbosity::Int, y::AbstractNode, folds_evaluations::Vararg{Nothing}) = NamedTuple{}() +internal_stack_report(m::Stack, verbosity::Int, ttp, folds_evaluations::Vararg{Nothing}) = NamedTuple{}() """ internal_stack_report(m::Stack, verbosity::Int, y::AbstractNode, folds_evaluations::Vararg{AbstractNode}) @@ -281,10 +281,10 @@ When measure/measures is provided, the folds_evaluation will have been filled by not doing any heavy work (not constructing nodes corresponding to measures) but just unpacking all the folds_evaluations in a single node that can be evaluated later. """ -function internal_stack_report(m::Stack, verbosity::Int, y::AbstractNode, folds_evaluations::Vararg{AbstractNode}) - _internal_stack_report(y, folds_evaluations...) = - internal_stack_report(m, verbosity, y, folds_evaluations...) - return (report=(cv_report=node(_internal_stack_report, y, folds_evaluations...),),) +function internal_stack_report(m::Stack, verbosity::Int, ttp, folds_evaluations::Vararg{AbstractNode}) + _internal_stack_report(folds_evaluations...) = + internal_stack_report(m, verbosity, ttp, folds_evaluations...) + return (report=(cv_report=node(_internal_stack_report, folds_evaluations...),),) end """ @@ -294,10 +294,10 @@ Returns a `NamedTuple` of `PerformanceEvaluation` objects, one for each model. T are built in a flatten array respecting the order given by: (fold_1:(model_1:[mach, Xtest, ytest], model_2:[mach, Xtest, ytest], ...), fold_2:(model_1, model_2, ...), ...) """ -function internal_stack_report(stack::Stack{modelnames,}, verbosity::Int, y, folds_evaluations...) where modelnames +function internal_stack_report(stack::Stack{modelnames,}, verbosity::Int, ttp, folds_evaluations...) 
where modelnames n_measures = length(stack.measures) - nfolds = stack.resampling.nfolds + nfolds = length(ttp) # For each model we record the results mimicking the fields PerformanceEvaluation results = NamedTuple{modelnames}([ @@ -308,7 +308,7 @@ function internal_stack_report(stack::Stack{modelnames,}, verbosity::Int, y, fol per_observation=Vector{Union{Missing, Vector{Any}}}(missing, n_measures), fitted_params_per_fold=[], report_per_fold=[], - train_test_pairs=train_test_pairs(stack.resampling, 1:nrows(y), y) + train_test_pairs=ttp ) for model in getfield(stack, :models)] ) @@ -439,7 +439,7 @@ function fit(m::Stack, verbosity::Int, X, y) Zpred = MLJBase.table(hcat(Zpred...)) ŷ = predict(metamach, Zpred) - internal_report = internal_stack_report(m, verbosity, ys, folds_evaluations...) + internal_report = internal_stack_report(m, verbosity, ttp, folds_evaluations...) # We can infer the Surrogate by two calls to supertype mach = machine(supertype(supertype(typeof(m)))(), Xs, ys; predict=ŷ, internal_report...) diff --git a/test/composition/models/stacking.jl b/test/composition/models/stacking.jl index b8f493a3..4ad24320 100644 --- a/test/composition/models/stacking.jl +++ b/test/composition/models/stacking.jl @@ -399,10 +399,9 @@ end constant=evaluate(constant, X, y, resampling=resampling, measures=measures, verbosity=0), ridge=evaluate(ridge, X, y, resampling=resampling, measures=measures, verbosity=0) ) - + ttp = MLJBase.train_test_pairs(resampling, 1:nrows(y), X, y) # Testing internal_stack_report default with nothing - ys = source(y) - @test MLJBase.internal_stack_report(mystack, 0, ys, nothing, nothing) == NamedTuple{}() + @test MLJBase.internal_stack_report(mystack, 0, ttp, nothing, nothing) == NamedTuple{}() # Simulate the evaluation nodes which consist of # - The fold machine @@ -422,7 +421,7 @@ end internalreport = MLJBase.internal_stack_report( mystack, 0, - ys, + ttp, evaluation_nodes... 
).report.cv_report() @@ -496,5 +495,25 @@ end end +@testset "Test Holdout CV" begin + X, y = make_regression(100, 3; rng=rng) + resampling = Holdout() + constant = ConstantRegressor() + ridge = FooBarRegressor() + mystack = Stack(;metalearner=FooBarRegressor(), + resampling=resampling, + measures=[rmse], + ridge=ridge, + constant=constant) + + mach = machine(mystack, X, y) + fit!(mach, verbosity=0) + for modelname in (:ridge, :constant) + model_perf = getproperty(report(mach).cv_report, modelname) + @test length(model_perf.per_fold) == 1 + @test length(model_perf.train_test_rows) == 1 + end +end + end true \ No newline at end of file From 4fbc3b684cbfe4d36fb2aa73e4c6b94a3ee24da5 Mon Sep 17 00:00:00 2001 From: Olivier Labayle Date: Tue, 8 Mar 2022 11:43:46 +0000 Subject: [PATCH 4/6] remove selectrows redefinition --- src/composition/models/stacking.jl | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/composition/models/stacking.jl b/src/composition/models/stacking.jl index 56d4b5d3..83ef925a 100644 --- a/src/composition/models/stacking.jl +++ b/src/composition/models/stacking.jl @@ -248,9 +248,6 @@ MLJBase.package_license(::Type{<:Stack}) = "MIT" ################# Node operations Methods ################# ########################################################### -selectrows(X::AbstractNode, idx) = - node(X-> selectrows(X, idx), X) - pre_judge_transform(ŷ::Node, ::Type{<:Probabilistic}, ::Type{<:AbstractArray{<:Finite}}) = node(ŷ -> pdf(ŷ, levels(first(ŷ))), ŷ) From 018a116013b955ba930cba3ff6ef8575e59973cf Mon Sep 17 00:00:00 2001 From: Olivier Labayle Date: Thu, 10 Mar 2022 08:56:18 +0000 Subject: [PATCH 5/6] update ttp name to tt_pairs and update docstring --- src/composition/models/stacking.jl | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/src/composition/models/stacking.jl b/src/composition/models/stacking.jl index 83ef925a..6221aabf 100644 --- a/src/composition/models/stacking.jl +++ 
b/src/composition/models/stacking.jl @@ -90,8 +90,6 @@ When training a machine bound to such an instance: - `resampling`: The resampling strategy used to prepare out-of-sample predictions of the base learners. - It can be a user-defined strategy, the only - caveat being that it should have a `nfolds` attribute. - `measures`: A measure or iterable over measures, to perform an internal evaluation of the learners in the Stack while training. This is not for the @@ -269,7 +267,7 @@ end When measure/measures is a Nothing, the folds_evaluation won't have been filled by `store_for_evaluation` and we thus return an empty NamedTuple. """ -internal_stack_report(m::Stack, verbosity::Int, ttp, folds_evaluations::Vararg{Nothing}) = NamedTuple{}() +internal_stack_report(m::Stack, verbosity::Int, tt_pairs, folds_evaluations::Vararg{Nothing}) = NamedTuple{}() """ internal_stack_report(m::Stack, verbosity::Int, y::AbstractNode, folds_evaluations::Vararg{AbstractNode}) @@ -278,9 +276,9 @@ When measure/measures is provided, the folds_evaluation will have been filled by not doing any heavy work (not constructing nodes corresponding to measures) but just unpacking all the folds_evaluations in a single node that can be evaluated later. """ -function internal_stack_report(m::Stack, verbosity::Int, ttp, folds_evaluations::Vararg{AbstractNode}) +function internal_stack_report(m::Stack, verbosity::Int, tt_pairs, folds_evaluations::Vararg{AbstractNode}) _internal_stack_report(folds_evaluations...) = - internal_stack_report(m, verbosity, ttp, folds_evaluations...) + internal_stack_report(m, verbosity, tt_pairs, folds_evaluations...) return (report=(cv_report=node(_internal_stack_report, folds_evaluations...),),) end @@ -291,10 +289,10 @@ Returns a `NamedTuple` of `PerformanceEvaluation` objects, one for each model. T are built in a flatten array respecting the order given by: (fold_1:(model_1:[mach, Xtest, ytest], model_2:[mach, Xtest, ytest], ...), fold_2:(model_1, model_2, ...), ...) 
""" -function internal_stack_report(stack::Stack{modelnames,}, verbosity::Int, ttp, folds_evaluations...) where modelnames +function internal_stack_report(stack::Stack{modelnames,}, verbosity::Int, tt_pairs, folds_evaluations...) where modelnames n_measures = length(stack.measures) - nfolds = length(ttp) + nfolds = length(tt_pairs) # For each model we record the results mimicking the fields PerformanceEvaluation results = NamedTuple{modelnames}([ @@ -305,7 +303,7 @@ function internal_stack_report(stack::Stack{modelnames,}, verbosity::Int, ttp, f per_observation=Vector{Union{Missing, Vector{Any}}}(missing, n_measures), fitted_params_per_fold=[], report_per_fold=[], - train_test_pairs=ttp + train_test_pairs=tt_pairs ) for model in getfield(stack, :models)] ) @@ -371,12 +369,12 @@ This function is building the out-of-sample dataset that is later used by the `j for its own training. It also returns the folds_evaluations object if internal cross-validation results are requested. """ -function oos_set(m::Stack, Xs::Source, ys::Source, ttp) +function oos_set(m::Stack, Xs::Source, ys::Source, tt_pairs) Zval = [] yval = [] folds_evaluations = [] # Loop over the cross validation folds to build a training set for the metalearner. 
- for (training_rows, test_rows) in ttp + for (training_rows, test_rows) in tt_pairs Xtrain = selectrows(Xs, training_rows) ytrain = selectrows(ys, training_rows) Xtest = selectrows(Xs, test_rows) @@ -415,12 +413,12 @@ end """ function fit(m::Stack, verbosity::Int, X, y) check_stack_measures(m, verbosity, m.measures, y) - ttp = train_test_pairs(m.resampling, 1:nrows(y), X, y) + tt_pairs = train_test_pairs(m.resampling, 1:nrows(y), X, y) Xs = source(X) ys = source(y) - Zval, yval, folds_evaluations = oos_set(m, Xs, ys, ttp) + Zval, yval, folds_evaluations = oos_set(m, Xs, ys, tt_pairs) metamach = machine(m.metalearner, Zval, yval) @@ -436,7 +434,7 @@ function fit(m::Stack, verbosity::Int, X, y) Zpred = MLJBase.table(hcat(Zpred...)) ŷ = predict(metamach, Zpred) - internal_report = internal_stack_report(m, verbosity, ttp, folds_evaluations...) + internal_report = internal_stack_report(m, verbosity, tt_pairs, folds_evaluations...) # We can infer the Surrogate by two calls to supertype mach = machine(supertype(supertype(typeof(m)))(), Xs, ys; predict=ŷ, internal_report...) From 94c3d10b72884551b72c2a3d1a7d037964fb651e Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Fri, 11 Mar 2022 12:17:01 +1300 Subject: [PATCH 6/6] bump 0.19.8 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 2488cd53..bf6bfa7d 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "MLJBase" uuid = "a7f614a8-145f-11e9-1d2a-a57a1082229d" authors = ["Anthony D. Blaom "] -version = "0.19.7" +version = "0.19.8" [deps] CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"