Merge pull request #746 from JuliaAI/dev

For a 0.19.8 release

ablaom authored Mar 10, 2022
2 parents bc96340 + 94c3d10 commit 0cb045c
Showing 3 changed files with 49 additions and 48 deletions.
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "MLJBase"
uuid = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
authors = ["Anthony D. Blaom <[email protected]>"]
version = "0.19.7"
version = "0.19.8"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
55 changes: 18 additions & 37 deletions src/composition/models/stacking.jl
@@ -90,8 +90,6 @@ When training a machine bound to such an instance:
- `resampling`: The resampling strategy used
to prepare out-of-sample predictions of the base learners.
-It can be a user-defined strategy, the only
-caveat being that it should have a `nfolds` attribute.
- `measures`: A measure or iterable over measures, to perform an internal
evaluation of the learners in the Stack while training. This is not for the
@@ -248,20 +246,6 @@ MLJBase.package_license(::Type{<:Stack}) = "MIT"
################# Node operations Methods #################
###########################################################


-getfolds(y::AbstractNode, cv::CV, n::Int) =
-    source(train_test_pairs(cv, 1:n))
-
-getfolds(y::AbstractNode, cv::StratifiedCV, n::Int) =
-    node(YY->train_test_pairs(cv, 1:n, YY), y)
-
-trainrows(X::AbstractNode, folds::AbstractNode, nfold) =
-    node((XX, ff) -> selectrows(XX, ff[nfold][1]), X, folds)
-
-testrows(X::AbstractNode, folds::AbstractNode, nfold) =
-    node((XX, ff) -> selectrows(XX, ff[nfold][2]), X, folds)

pre_judge_transform(ŷ::Node, ::Type{<:Probabilistic}, ::Type{<:AbstractArray{<:Finite}}) =
node(ŷ -> pdf(ŷ, levels(first(ŷ))), ŷ)
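
The four deleted helpers above wrapped fold computation in learning-network nodes and were the reason for the `nfolds` caveat removed from the docstring. Their replacement, `train_test_pairs`, computes the (train, test) row pairs eagerly, once, and is implemented by every resampling strategy. A minimal sketch of what it returns (row partitions shown approximately, under default options):

```julia
using MLJBase

# CV yields one (train_rows, test_rows) pair per fold:
train_test_pairs(CV(nfolds=3), 1:6)
# 3-element Vector, approximately:
#   ([3, 4, 5, 6], [1, 2])
#   ([1, 2, 5, 6], [3, 4])
#   ([1, 2, 3, 4], [5, 6])

# Holdout has no `nfolds` attribute but produces pairs all the same,
# which is what this change exploits:
train_test_pairs(Holdout(fraction_train=0.7), 1:10)
# 1-element Vector: ([1, 2, 3, 4, 5, 6, 7], [8, 9, 10])
```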

@@ -283,7 +267,7 @@ end
When measure/measures is a Nothing, the folds_evaluation won't have been filled by `store_for_evaluation`
and we thus return an empty NamedTuple.
"""
-internal_stack_report(m::Stack, verbosity::Int, y::AbstractNode, folds_evaluations::Vararg{Nothing}) = NamedTuple{}()
+internal_stack_report(m::Stack, verbosity::Int, tt_pairs, folds_evaluations::Vararg{Nothing}) = NamedTuple{}()

"""
internal_stack_report(m::Stack, verbosity::Int, y::AbstractNode, folds_evaluations::Vararg{AbstractNode})
@@ -292,10 +276,10 @@ When measure/measures is provided, the folds_evaluation will have been filled by
not doing any heavy work (not constructing nodes corresponding to measures) but just unpacking all the folds_evaluations in a single node that
can be evaluated later.
"""
-function internal_stack_report(m::Stack, verbosity::Int, y::AbstractNode, folds_evaluations::Vararg{AbstractNode})
-    _internal_stack_report(y, folds_evaluations...) =
-        internal_stack_report(m, verbosity, y, folds_evaluations...)
-    return (report=(cv_report=node(_internal_stack_report, y, folds_evaluations...),),)
+function internal_stack_report(m::Stack, verbosity::Int, tt_pairs, folds_evaluations::Vararg{AbstractNode})
+    _internal_stack_report(folds_evaluations...) =
+        internal_stack_report(m, verbosity, tt_pairs, folds_evaluations...)
+    return (report=(cv_report=node(_internal_stack_report, folds_evaluations...),),)
end
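
The report computation stays lazy after this rewrite: the freshly defined closure is wrapped in a `node`, so nothing is evaluated until the learning network is called. The same deferral pattern in isolation, with `sum` standing in for the report computation:

```julia
using MLJBase

ys = source([1, 2, 3])
total = node(sum, ys)  # builds a Node; no computation happens here
total()                # calling the node evaluates it, returning 6
```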

"""
Expand All @@ -305,10 +289,10 @@ Returns a `NamedTuple` of `PerformanceEvaluation` objects, one for each model. T
are built in a flatten array respecting the order given by:
(fold_1:(model_1:[mach, Xtest, ytest], model_2:[mach, Xtest, ytest], ...), fold_2:(model_1, model_2, ...), ...)
"""
-function internal_stack_report(stack::Stack{modelnames,}, verbosity::Int, y, folds_evaluations...) where modelnames
+function internal_stack_report(stack::Stack{modelnames,}, verbosity::Int, tt_pairs, folds_evaluations...) where modelnames

n_measures = length(stack.measures)
-nfolds = stack.resampling.nfolds
+nfolds = length(tt_pairs)

# For each model we record the results mimicking the fields PerformanceEvaluation
results = NamedTuple{modelnames}([
@@ -319,7 +303,7 @@ function internal_stack_report(stack::Stack{modelnames,}, verbosity::Int, y, folds_evaluations...) where modelnames
per_observation=Vector{Union{Missing, Vector{Any}}}(missing, n_measures),
fitted_params_per_fold=[],
report_per_fold=[],
-train_test_pairs=train_test_pairs(stack.resampling, 1:nrows(y), y)
+train_test_pairs=tt_pairs
)
for model in getfield(stack, :models)]
)
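
The per-model containers are keyed by the stack's model names using the standard `NamedTuple{names}(values)` constructor, exactly as the diff shows, so each entry is later reachable as `cv_report.<modelname>`. In isolation:

```julia
# Keying a vector of values by a tuple of names:
NamedTuple{(:ridge, :constant)}(["ridge results", "constant results"])
# (ridge = "ridge results", constant = "constant results")
```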
@@ -385,16 +369,16 @@ This function is building the out-of-sample dataset that is later used by the `judge`
for its own training. It also returns the folds_evaluations object if internal
cross-validation results are requested.
"""
-function oos_set(m::Stack, folds::AbstractNode, Xs::Source, ys::Source)
+function oos_set(m::Stack, Xs::Source, ys::Source, tt_pairs)
Zval = []
yval = []
folds_evaluations = []
# Loop over the cross validation folds to build a training set for the metalearner.
-for nfold in 1:m.resampling.nfolds
-    Xtrain = trainrows(Xs, folds, nfold)
-    ytrain = trainrows(ys, folds, nfold)
-    Xtest = testrows(Xs, folds, nfold)
-    ytest = testrows(ys, folds, nfold)
+for (training_rows, test_rows) in tt_pairs
+    Xtrain = selectrows(Xs, training_rows)
+    ytrain = selectrows(ys, training_rows)
+    Xtest = selectrows(Xs, test_rows)
+    ytest = selectrows(ys, test_rows)

# Train each model on the train fold and predict on the validation fold
# predictions are subsequently used as an input to the metalearner
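
Note that `Xs` and `ys` are source nodes, not raw data: `selectrows` applied to an `AbstractNode` returns another node whose evaluation is deferred, which is what lets plain integer row indices replace the old node-wrapped folds. A toy illustration, assuming a column-table source:

```julia
using MLJBase

Xs = source((x1 = [10, 20, 30, 40],))  # a Source node wrapping a column table
Xtrain = selectrows(Xs, [1, 2])        # a Node, not a table
Xtrain()                               # evaluates to (x1 = [10, 20],)
```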
@@ -429,15 +413,12 @@ end
"""
function fit(m::Stack, verbosity::Int, X, y)
check_stack_measures(m, verbosity, m.measures, y)

-n = nrows(y)
+tt_pairs = train_test_pairs(m.resampling, 1:nrows(y), X, y)

Xs = source(X)
ys = source(y)

-folds = getfolds(ys, m.resampling, n)
-
-Zval, yval, folds_evaluations = oos_set(m, folds, Xs, ys)
+Zval, yval, folds_evaluations = oos_set(m, Xs, ys, tt_pairs)

metamach = machine(m.metalearner, Zval, yval)

@@ -453,7 +434,7 @@ function fit(m::Stack, verbosity::Int, X, y)
Zpred = MLJBase.table(hcat(Zpred...))
ŷ = predict(metamach, Zpred)

-internal_report = internal_stack_report(m, verbosity, ys, folds_evaluations...)
+internal_report = internal_stack_report(m, verbosity, tt_pairs, folds_evaluations...)

# We can infer the Surrogate by two calls to supertype
mach = machine(supertype(supertype(typeof(m)))(), Xs, ys; predict=ŷ, internal_report...)
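Taken together: `fit` now computes the (train, test) pairs once, up front, and threads them through `oos_set` and the internal report, never touching `resampling.nfolds`. Any strategy supporting `train_test_pairs` can therefore drive a stack. A hedged end-to-end sketch mirroring the new test added below (`FooBarRegressor` and `ConstantRegressor` are stand-in models from the test suite, not MLJBase exports):

```julia
using MLJBase

X, y = make_regression(100, 3)

stack = Stack(metalearner=FooBarRegressor(),
              resampling=Holdout(),  # a single train/test split; no `nfolds` needed
              measures=[rmse],
              ridge=FooBarRegressor(),
              constant=ConstantRegressor())

mach = machine(stack, X, y)
fit!(mach, verbosity=0)
report(mach).cv_report.ridge  # internal evaluation over the single Holdout fold
```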
40 changes: 30 additions & 10 deletions test/composition/models/stacking.jl
@@ -247,13 +247,14 @@ end
stack = Stack(metalearner=judge,
model1=model1,
model2=model2,
-resampling=CV(;nfolds=3, shuffle=true, rng=rng))
+resampling=CV(;nfolds=3, shuffle=true, rng=StableRNG(123)))

Xs = source(X)
ys = source(y)
-folds = MLJBase.getfolds(ys, stack.resampling, n)
+ttp = MLJBase.train_test_pairs(stack.resampling, 1:n, X, y)

-Zval, yval, folds_evaluations = MLJBase.oos_set(stack, folds, Xs, ys)
+Zval, yval, folds_evaluations = MLJBase.oos_set(stack, Xs, ys, ttp)

# No internal measure has been provided so the resulting
# folds_evaluations contain nothing
Expand All @@ -270,19 +271,19 @@ end

# The lines of yval should match the reordering indexes
# of the original y (reordering given by the folds node)
-reordering = vcat([x[2] for x in folds()]...)
+reordering = vcat([x[2] for x in ttp]...)
@test yval() == y[reordering]
# And the same is true for Zval, let's check this for model1's output
# on the first fold, ie (2 first rows, 3 first columns)
# First we need to train the model
-trainingrows = folds()[1][1]
+trainingrows = ttp[1][1]
Xtrain = selectrows(X, trainingrows)
ytrain = selectrows(y, trainingrows)
mach = machine(model1, Xtrain, ytrain)
fit!(mach, verbosity=0)

# Then predict on the validation rows
-Xpred = selectrows(X, folds()[1][2])
+Xpred = selectrows(X, ttp[1][2])
Zval_expected_dist = predict(mach, Xpred)
# This is a distribution, we need to apply the appropriate transformation
Zval_expected = pdf(Zval_expected_dist, levels(first(Zval_expected_dist)))
@@ -398,10 +399,9 @@ end
constant=evaluate(constant, X, y, resampling=resampling, measures=measures, verbosity=0),
ridge=evaluate(ridge, X, y, resampling=resampling, measures=measures, verbosity=0)
)

+ttp = MLJBase.train_test_pairs(resampling, 1:nrows(y), X, y)
# Testing internal_stack_report default with nothing
-ys = source(y)
-@test MLJBase.internal_stack_report(mystack, 0, ys, nothing, nothing) == NamedTuple{}()
+@test MLJBase.internal_stack_report(mystack, 0, ttp, nothing, nothing) == NamedTuple{}()

# Simulate the evaluation nodes which consist of
# - The fold machine
@@ -421,7 +421,7 @@ end
internalreport = MLJBase.internal_stack_report(
mystack,
0,
-ys,
+ttp,
evaluation_nodes...
).report.cv_report()

@@ -495,5 +495,25 @@

end

@testset "Test Holdout CV" begin
X, y = make_regression(100, 3; rng=rng)
resampling = Holdout()
constant = ConstantRegressor()
ridge = FooBarRegressor()
mystack = Stack(;metalearner=FooBarRegressor(),
resampling=resampling,
measures=[rmse],
ridge=ridge,
constant=constant)

mach = machine(mystack, X, y)
fit!(mach, verbosity=0)
for modelname in (:ridge, :constant)
model_perf = getproperty(report(mach).cv_report, modelname)
@test length(model_perf.per_fold) == 1
@test length(model_perf.train_test_rows) == 1
end
end

end
true
