From 9a20a7edd0109d92d9d26f44c6403a13c4551d85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=BAlio=20Hoffimann?= Date: Sun, 12 Apr 2020 15:08:14 -0300 Subject: [PATCH 01/13] Update Project.toml --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 4c43f2e1..4cf9cd46 100644 --- a/Project.toml +++ b/Project.toml @@ -38,7 +38,7 @@ HTTP = "^0.8" InvertedIndices = "^1" JLSO = "^2.1,^2.2" JSON = "^0.21" -LossFunctions = "^0.5" +LossFunctions = "0.5, 0.6" MLJModelInterface = "^0.2" MLJScientificTypes = "^0.1,^0.2" Missings = "^0.4" From 05b97250ff2110abbb67e711985e7e3e71f0f5c9 Mon Sep 17 00:00:00 2001 From: Okon Samuel <39421418+OkonSamuel@users.noreply.github.com> Date: Sun, 12 Apr 2020 23:49:51 +0100 Subject: [PATCH 02/13] Update loss_functions_interface.jl --- src/measures/loss_functions_interface.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/measures/loss_functions_interface.jl b/src/measures/loss_functions_interface.jl index 1f4e3735..2a56025e 100644 --- a/src/measures/loss_functions_interface.jl +++ b/src/measures/loss_functions_interface.jl @@ -1,7 +1,7 @@ # implementation of MLJ measure interface for LossFunctions.jl # Supervised Loss -- measure traits - +const LSF = LossFunctions is_measure_type(::Type{<:SupervisedLoss}) = true orientation(::Type{<:SupervisedLoss}) = :loss reports_each_observation(::Type{<:SupervisedLoss}) = true @@ -42,7 +42,7 @@ function value(measure::MarginLoss, yhat, X, y, ::Nothing, ::Val{false}, ::Val{true}) check_pools(yhat, y) probs_of_observed = broadcast(pdf, yhat, y) - return broadcast(measure, _scale.(probs_of_observed), 1) + return value.(measure, _scale.(probs_of_observed), 1) end function value(measure::MarginLoss, yhat, X, y, w, From 64d9d39a471274ff2a0db07debedf4b1ea06c442 Mon Sep 17 00:00:00 2001 From: Cameron Bieganek <8310743+CameronBieganek@users.noreply.github.com> Date: Sun, 12 Apr 2020 17:58:58 -0500 Subject: [PATCH 03/13] Add 
.vscode to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 0087f9b6..0e1b98c8 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ sandbox/ /docs/build/ /docs/site/ /docs/Manifest.toml +.vscode From 99369a2782897de2cdaf5f693ad1142ab5c87f37 Mon Sep 17 00:00:00 2001 From: Cameron Bieganek <8310743+CameronBieganek@users.noreply.github.com> Date: Sun, 12 Apr 2020 17:59:50 -0500 Subject: [PATCH 04/13] Modify stratified CV unit test so that the count for each level is a multiple of 3. --- test/resampling.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/resampling.jl b/test/resampling.jl index f17c52f4..268fd97a 100644 --- a/test/resampling.jl +++ b/test/resampling.jl @@ -213,13 +213,13 @@ end # check in explicit example: y = categorical(['c', 'a', 'b', 'a', 'c', 'x', 'c', 'a', 'a', 'b', 'b', 'b', 'b', 'b']) - rows = [14, 13, 12, 11, 10, 9, 8, 7, 5, 4, 3, 2, 1] - @test y[rows] == collect("bbbbbaaccabac") + rows = [14, 13, 12, 11, 10, 9, 8, 7, 5, 4, 3, 1] + @test y[rows] == collect("bbbbbaaccabc") scv = StratifiedCV(nfolds=3) pairs = MLJBase.train_test_pairs(scv, rows, nothing, y) - @test pairs == [([12, 11, 10, 8, 5, 4, 3, 2, 1], [14, 13, 9, 7]), - ([14, 13, 10, 9, 7, 4, 3, 2, 1], [12, 11, 8, 5]), - ([14, 13, 12, 11, 9, 8, 7, 5], [10, 4, 3, 2, 1])] + @test pairs == [([12, 11, 10, 8, 5, 4, 3, 1], [14, 13, 9, 7]), + ([14, 13, 10, 9, 7, 4, 3, 1], [12, 11, 8, 5]), + ([14, 13, 12, 11, 9, 8, 7, 5], [10, 4, 3, 1])] scv_random = StratifiedCV(nfolds=3, shuffle=true) pairs_random = MLJBase.train_test_pairs(scv_random, rows, nothing, y) @test pairs != pairs_random From 430e5e473727e8ec14ba43131405a13bddd36ccd Mon Sep 17 00:00:00 2001 From: Okon Samuel <39421418+OkonSamuel@users.noreply.github.com> Date: Mon, 13 Apr 2020 00:05:18 +0100 Subject: [PATCH 05/13] fixed typo --- src/measures/loss_functions_interface.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/measures/loss_functions_interface.jl b/src/measures/loss_functions_interface.jl index 2a56025e..74444ab5 100644 --- a/src/measures/loss_functions_interface.jl +++ b/src/measures/loss_functions_interface.jl @@ -42,7 +42,7 @@ function value(measure::MarginLoss, yhat, X, y, ::Nothing, ::Val{false}, ::Val{true}) check_pools(yhat, y) probs_of_observed = broadcast(pdf, yhat, y) - return value.(measure, _scale.(probs_of_observed), 1) + return (LSF.value).(measure, _scale.(probs_of_observed), 1) end function value(measure::MarginLoss, yhat, X, y, w, From b0cb7bd215a06d3e2c7fe55116e2206d5919227d Mon Sep 17 00:00:00 2001 From: Cameron Bieganek <8310743+CameronBieganek@users.noreply.github.com> Date: Sun, 12 Apr 2020 18:21:02 -0500 Subject: [PATCH 06/13] Add a unit test for train_test_pairs(::CV, ...). --- test/resampling.jl | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/test/resampling.jl b/test/resampling.jl index 268fd97a..b95c430f 100644 --- a/test/resampling.jl +++ b/test/resampling.jl @@ -64,6 +64,18 @@ end @test MLJBase.train_test_pairs(Holdout(), 1:10) != MLJBase.train_test_pairs(Holdout(shuffle=true), 1:10) +@testset "train test pairs" begin + cv = CV(nfolds=5) + pairs = MLJBase.train_test_pairs(cv, 1:24) + @test pairs == [ + (6:24, 1:5), + ([1:5..., 11:24...], 6:10), + ([1:10..., 16:24...], 11:15), + ([1:15..., 21:24...], 16:20), + (1:20, 21:24) + ] +end + @testset "checking measure/model compatibility" begin model = ConstantRegressor() y = rand(4) From 49bcd25ee8ff1fbcf2475b04d0bedeb8b18f2724 Mon Sep 17 00:00:00 2001 From: Okon Samuel <39421418+OkonSamuel@users.noreply.github.com> Date: Mon, 13 Apr 2020 00:29:15 +0100 Subject: [PATCH 07/13] fixed broken tests --- test/measures/loss_functions_interface.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/measures/loss_functions_interface.jl b/test/measures/loss_functions_interface.jl index 9aef8f77..9b5d7956 100644 --- a/test/measures/loss_functions_interface.jl +++
b/test/measures/loss_functions_interface.jl @@ -29,7 +29,7 @@ seed!(1234) for m in [ZeroOneLoss(), L1HingeLoss(), L2HingeLoss(), LogitMarginLoss(), ModifiedHuberLoss(), PerceptronLoss(), SmoothedL1HingeLoss(0.9), L2MarginLoss(), ExpLoss(), SigmoidLoss(), DWDMarginLoss(0.9)] - @test MLJBase.value(m, yhat, X, y, nothing) ≈ m(yhatm, ym) + @test MLJBase.value(m, yhat, X, y, nothing) ≈ LossFunctions.value(m, yhatm, ym) @test mean(MLJBase.value(m, yhat, X, y, w)) ≈ LossFunctions.value(m, yhatm, ym, AggMode.WeightedMean(w)) end @@ -47,7 +47,7 @@ end HuberLoss(0.9), EpsilonInsLoss(0.9), L1EpsilonInsLoss(0.9), L2EpsilonInsLoss(0.9), LogitDistLoss(), QuantileLoss(0.7)] - @test MLJBase.value(m, yhat, X, y, nothing) ≈ m(yhat, y) + @test MLJBase.value(m, yhat, X, y, nothing) ≈ LossFunctions.value(m, yhat, y) @test mean(MLJBase.value(m, yhat, X, y, w)) ≈ LossFunctions.value(m, yhat, y, AggMode.WeightedMean(w)) end From b18f34c38d7115cba3c226bc91413cde660787f0 Mon Sep 17 00:00:00 2001 From: Okon Samuel <39421418+OkonSamuel@users.noreply.github.com> Date: Mon, 13 Apr 2020 00:48:12 +0100 Subject: [PATCH 08/13] Update loss_functions_interface.jl --- src/measures/loss_functions_interface.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/measures/loss_functions_interface.jl b/src/measures/loss_functions_interface.jl index 74444ab5..ae03750b 100644 --- a/src/measures/loss_functions_interface.jl +++ b/src/measures/loss_functions_interface.jl @@ -18,12 +18,12 @@ MMI.target_scitype(::Type{<:DistanceLoss}) = AbstractArray{<:Continuous} function value(measure::DistanceLoss, yhat, X, y, ::Nothing, ::Val{false}, ::Val{true}) - return measure(yhat, y) + return LSF.value(measure, yhat, y) end function value(measure::DistanceLoss, yhat, X, y, w, ::Val{false}, ::Val{true}) - return w .* measure(yhat, y) ./ (sum(w)/length(y)) + return w .* value(measure, yhat, X, y, nothing) ./ (sum(w)/length(y)) end ## MARGIN BASED LOSS FUNCTIONS From 
fb0e2ac821e24f9484a32eaca0e3bfb5f096a140 Mon Sep 17 00:00:00 2001 From: Cameron Bieganek <8310743+CameronBieganek@users.noreply.github.com> Date: Sun, 12 Apr 2020 20:27:26 -0500 Subject: [PATCH 09/13] Update train_test_pairs(cv::CV, rows). --- src/resampling.jl | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/src/resampling.jl b/src/resampling.jl index 4e828df4..e90a12f7 100644 --- a/src/resampling.jl +++ b/src/resampling.jl @@ -132,31 +132,33 @@ CV(; nfolds::Int=6, shuffle=nothing, rng=nothing) = function train_test_pairs(cv::CV, rows) - n_observations = length(rows) - nfolds = cv.nfolds + n_obs = length(rows) + n_folds = cv.nfolds if cv.shuffle rows=shuffle!(cv.rng, collect(rows)) end - # number of observations per fold - k = floor(Int, n_observations/nfolds) - k > 0 || error("Inusufficient data for $nfolds-fold cross-validation.\n"* + n, r = divrem(n_obs, n_folds) + n > 0 || error("Inusufficient data for $n_folds-fold cross-validation.\n"* "Try reducing nfolds. 
") - # define the (trainrows, testrows) pairs: - firsts = 1:k:((nfolds - 1)*k + 1) # itr of first `test` rows index - seconds = k:k:(nfolds*k) # itr of last `test` rows index + m = n + 1 # number of observations in first r folds - ret = map(1:nfolds) do k - f = firsts[k] - s = seconds[k] - k < nfolds || (s = n_observations) - return (vcat(rows[1:(f - 1)], rows[(s + 1):end]), # trainrows - rows[f:s]) # testrows - end + itr1 = Iterators.partition( 1 : m*r , m) + itr2 = Iterators.partition( m*r+1 : n_obs , n) + test_folds = Iterators.flatten((itr1, itr2)) - return ret + return map(test_folds) do test_indices + test_rows = rows[test_indices] + + train_rows = vcat( + rows[ 1 : first(test_indices)-1 ], + rows[ last(test_indices)+1 : end ] + ) + + (train_rows, test_rows) + end end # ---------------------------------------------------------------- From 9fec81aab6291169af0dd1076f9b2db726a2b885 Mon Sep 17 00:00:00 2001 From: Cameron Bieganek <8310743+CameronBieganek@users.noreply.github.com> Date: Sun, 12 Apr 2020 20:28:55 -0500 Subject: [PATCH 10/13] Remove unused n_observations = length(rows) statement in train_test_rows(::StratifiedCV, ...). 
--- src/resampling.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/resampling.jl b/src/resampling.jl index e90a12f7..dca23a32 100644 --- a/src/resampling.jl +++ b/src/resampling.jl @@ -218,7 +218,6 @@ StratifiedCV(; nfolds::Int=6, shuffle=nothing, rng=nothing) = function train_test_pairs(stratified_cv::StratifiedCV, rows, X, y) - n_observations = length(rows) nfolds = stratified_cv.nfolds if stratified_cv.shuffle @@ -841,7 +840,7 @@ function MLJBase.update(resampler::Resampler{Holdout}, reusable = !resampler.resampling.shuffle && resampler.repeats == 1 && old_resampling.fraction_train == - resampler.resampling.fraction_train + resampler.resampling.fraction_train if reusable mach = old_mach From 45b9ac197d911a5aae94ee2d61deb89be0208098 Mon Sep 17 00:00:00 2001 From: Cameron Bieganek <8310743+CameronBieganek@users.noreply.github.com> Date: Sun, 12 Apr 2020 20:46:19 -0500 Subject: [PATCH 11/13] Update the docstring for train_test_pairs(::CV, ...). --- src/resampling.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/resampling.jl b/src/resampling.jl index dca23a32..4e74a884 100644 --- a/src/resampling.jl +++ b/src/resampling.jl @@ -104,7 +104,9 @@ of `rows`. The `test` vectors are mutually exclusive and exhaust `test` vector. With no row pre-shuffling, the order of `rows` is preserved, in the sense that `rows` coincides precisely with the concatenation of the `test` vectors, in the order they are -generated. All but the last `test` vector have equal length. +generated. The first `r` test vectors have length `n + 1`, where +`n, r = divrem(length(rows), nfolds)`, and the remaining test vectors +have length `n`. Pre-shuffling of `rows` is controlled by `rng` and `shuffle`. 
If `rng` is an integer, then the `CV` keyword constructor resets it to From 0e1f4044d5be2eb78bb5c0e9958b9c89c493d766 Mon Sep 17 00:00:00 2001 From: "Anthony Blaom, PhD" Date: Tue, 14 Apr 2020 11:28:08 +1200 Subject: [PATCH 12/13] Revert "Spread remainder evenly among folds in train_test_pairs(::CV, ...)" --- .gitignore | 1 - src/resampling.jl | 41 +++++++++++++++++++---------------------- test/resampling.jl | 22 +++++----------------- 3 files changed, 24 insertions(+), 40 deletions(-) diff --git a/.gitignore b/.gitignore index 0e1b98c8..0087f9b6 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,3 @@ sandbox/ /docs/build/ /docs/site/ /docs/Manifest.toml -.vscode diff --git a/src/resampling.jl b/src/resampling.jl index 4e74a884..4e828df4 100644 --- a/src/resampling.jl +++ b/src/resampling.jl @@ -104,9 +104,7 @@ of `rows`. The `test` vectors are mutually exclusive and exhaust `test` vector. With no row pre-shuffling, the order of `rows` is preserved, in the sense that `rows` coincides precisely with the concatenation of the `test` vectors, in the order they are -generated. The first `r` test vectors have length `n + 1`, where -`n, r = divrem(length(rows), nfolds)`, and the remaining test vectors -have length `n`. +generated. All but the last `test` vector have equal length. Pre-shuffling of `rows` is controlled by `rng` and `shuffle`. If `rng` is an integer, then the `CV` keyword constructor resets it to @@ -134,33 +132,31 @@ CV(; nfolds::Int=6, shuffle=nothing, rng=nothing) = function train_test_pairs(cv::CV, rows) - n_obs = length(rows) - n_folds = cv.nfolds + n_observations = length(rows) + nfolds = cv.nfolds if cv.shuffle rows=shuffle!(cv.rng, collect(rows)) end - n, r = divrem(n_obs, n_folds) - n > 0 || error("Inusufficient data for $n_folds-fold cross-validation.\n"* + # number of observations per fold + k = floor(Int, n_observations/nfolds) + k > 0 || error("Inusufficient data for $nfolds-fold cross-validation.\n"* "Try reducing nfolds. 
") - m = n + 1 # number of observations in first r folds + # define the (trainrows, testrows) pairs: + firsts = 1:k:((nfolds - 1)*k + 1) # itr of first `test` rows index + seconds = k:k:(nfolds*k) # itr of last `test` rows index - itr1 = Iterators.partition( 1 : m*r , m) - itr2 = Iterators.partition( m*r+1 : n_obs , n) - test_folds = Iterators.flatten((itr1, itr2)) - - return map(test_folds) do test_indices - test_rows = rows[test_indices] - - train_rows = vcat( - rows[ 1 : first(test_indices)-1 ], - rows[ last(test_indices)+1 : end ] - ) - - (train_rows, test_rows) + ret = map(1:nfolds) do k + f = firsts[k] + s = seconds[k] + k < nfolds || (s = n_observations) + return (vcat(rows[1:(f - 1)], rows[(s + 1):end]), # trainrows + rows[f:s]) # testrows end + + return ret end # ---------------------------------------------------------------- @@ -220,6 +216,7 @@ StratifiedCV(; nfolds::Int=6, shuffle=nothing, rng=nothing) = function train_test_pairs(stratified_cv::StratifiedCV, rows, X, y) + n_observations = length(rows) nfolds = stratified_cv.nfolds if stratified_cv.shuffle @@ -842,7 +839,7 @@ function MLJBase.update(resampler::Resampler{Holdout}, reusable = !resampler.resampling.shuffle && resampler.repeats == 1 && old_resampling.fraction_train == - resampler.resampling.fraction_train + resampler.resampling.fraction_train if reusable mach = old_mach diff --git a/test/resampling.jl b/test/resampling.jl index b95c430f..f17c52f4 100644 --- a/test/resampling.jl +++ b/test/resampling.jl @@ -64,18 +64,6 @@ end @test MLJBase.train_test_pairs(Holdout(), 1:10) != MLJBase.train_test_pairs(Holdout(shuffle=true), 1:10) -@testset "train test pairs" begin - cv = CV(nfolds=5) - pairs = MLJBase.train_test_pairs(cv, 1:24) - @test pairs == [ - (6:24, 1:5), - ([1:5..., 11:24...], 6:10), - ([1:10..., 16:24...], 11:15), - ([1:15..., 21:24...], 16:20), - (1:20, 21:24) - ] -end - @testset "checking measure/model compatibility" begin model = ConstantRegressor() y = rand(4) @@ -225,13 +213,13 @@ 
end # check in explicit example: y = categorical(['c', 'a', 'b', 'a', 'c', 'x', 'c', 'a', 'a', 'b', 'b', 'b', 'b', 'b']) - rows = [14, 13, 12, 11, 10, 9, 8, 7, 5, 4, 3, 1] - @test y[rows] == collect("bbbbbaaccabc") + rows = [14, 13, 12, 11, 10, 9, 8, 7, 5, 4, 3, 2, 1] + @test y[rows] == collect("bbbbbaaccabac") scv = StratifiedCV(nfolds=3) pairs = MLJBase.train_test_pairs(scv, rows, nothing, y) - @test pairs == [([12, 11, 10, 8, 5, 4, 3, 1], [14, 13, 9, 7]), - ([14, 13, 10, 9, 7, 4, 3, 1], [12, 11, 8, 5]), - ([14, 13, 12, 11, 9, 8, 7, 5], [10, 4, 3, 1])] + @test pairs == [([12, 11, 10, 8, 5, 4, 3, 2, 1], [14, 13, 9, 7]), + ([14, 13, 10, 9, 7, 4, 3, 2, 1], [12, 11, 8, 5]), + ([14, 13, 12, 11, 9, 8, 7, 5], [10, 4, 3, 2, 1])] scv_random = StratifiedCV(nfolds=3, shuffle=true) pairs_random = MLJBase.train_test_pairs(scv_random, rows, nothing, y) @test pairs != pairs_random From 9eea178dfe6bce7ba17cb176ea62e6fbe911353d Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Tue, 14 Apr 2020 12:11:19 +1200 Subject: [PATCH 13/13] bump version = "0.12.7" --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 4cf9cd46..2d71375b 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "MLJBase" uuid = "a7f614a8-145f-11e9-1d2a-a57a1082229d" authors = ["Anthony D. Blaom "] -version = "0.12.6" +version = "0.12.7" [deps] CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"