Merge pull request #202 from alan-turing-institute/dev
For a 0.11.8 release
ablaom authored Feb 19, 2020
2 parents 3b6fd2b + 39c7058 commit 2b87398
Showing 9 changed files with 313 additions and 104 deletions.
2 changes: 2 additions & 0 deletions .travis.yml
@@ -2,6 +2,8 @@
language: julia
os:
- linux
env:
- JULIA_NUM_THREADS=30
julia:
- 1.1
- 1.2
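The new `JULIA_NUM_THREADS=30` entry gives the CI runs enough threads to exercise the multithreaded evaluation paths added below. As an illustrative check (not part of the diff), a test suite can confirm and guard on the thread count:

# Illustrative sketch only; not part of the commit.
using Test, Base.Threads

@info "Test suite running with $(Threads.nthreads()) thread(s)"
if Threads.nthreads() == 1
    @warn "Multithreaded code paths will not be exercised."
end
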
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "MLJBase"
uuid = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
authors = ["Anthony D. Blaom <[email protected]>"]
version = "0.11.7"
version = "0.11.8"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
1 change: 1 addition & 0 deletions src/MLJBase.jl
@@ -28,6 +28,7 @@ using Distributed
using ComputationalResources
using ComputationalResources: CPUProcesses
using ProgressMeter
import .Threads

# Operations & extensions
import LossFunctions
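The added `import .Threads` binds Base's `Threads` submodule inside `MLJBase`, so the resampling code below can write `Threads.@spawn`, `Threads.threadid()`, and `Threads.nthreads()` with the `Threads.` qualifier. A minimal illustration of the idiom (hypothetical module name):

# Illustrative sketch only; not part of the commit.
module MyPkg
import .Threads   # `.Threads` resolves relative to this module, finding Base's binding

nthreads_in_use() = Threads.nthreads()
end

MyPkg.nthreads_in_use()
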
169 changes: 125 additions & 44 deletions src/resampling.jl
@@ -1,3 +1,4 @@
# ==================================================================
## RESAMPLING STRATEGIES

abstract type ResamplingStrategy <: MLJType end
@@ -9,15 +10,14 @@ function ==(s1::S, s2::S) where S <: ResamplingStrategy
return all(getfield(s1, fld) == getfield(s2, fld) for fld in fieldnames(S))
end

# fallbacks:
# fallbacks for the method to be implemented by each new strategy:
train_test_pairs(s::ResamplingStrategy, rows, X, y, w) =
train_test_pairs(s, rows, X, y)
train_test_pairs(s::ResamplingStrategy, rows, X, y) =
train_test_pairs(s, rows, y)
train_test_pairs(s::ResamplingStrategy, rows, y) =
train_test_pairs(s, rows)


# Helper to interpret rng, shuffle in case either is `nothing` or if
# `rng` is an integer:
function shuffle_and_rng(shuffle, rng)
@@ -36,6 +36,9 @@ function shuffle_and_rng(shuffle, rng)
return shuffle, rng
end

# ----------------------------------------------------------------
# Holdout

"""
holdout = Holdout(; fraction_train=0.7,
shuffle=nothing,
@@ -83,6 +86,8 @@ function train_test_pairs(holdout::Holdout, rows)

end

# ----------------------------------------------------------------
# Cross-validation (vanilla)

"""
cv = CV(; nfolds=6, shuffle=nothing, rng=nothing)
@@ -154,6 +159,9 @@ function train_test_pairs(cv::CV, rows)
return ret
end

# ----------------------------------------------------------------
# Cross-validation (stratified; for `Finite` targets)

"""
stratified_cv = StratifiedCV(; nfolds=6,
shuffle=false,
@@ -265,8 +273,8 @@ function train_test_pairs(stratified_cv::StratifiedCV, rows, X, y)

end


## EVALUATION TYPE
# ================================================================
## EVALUATION RESULT TYPE

const PerformanceEvaluation = NamedTuple{(:measure, :measurement,
:per_fold, :per_observation)}
@@ -291,9 +299,20 @@ function Base.show(io::IO, e::PerformanceEvaluation)
print(io, "PerformanceEvaluation$summary")
end


# ===============================================================
## EVALUATION METHODS

# ---------------------------------------------------------------
# Helpers

function actual_rows(rows, N, verbosity)
unspecified_rows = (rows === nothing)
_rows = unspecified_rows ? (1:N) : rows
unspecified_rows ||
@info "Creating subsamples from a subset of all rows. "
return _rows
end

function _check_measure(model, measure, y, operation, override)

override && (return nothing)
@@ -377,6 +396,9 @@ function _process_weights_measures(weights, measures, mach,

end

# --------------------------------------------------------------
# User interface points: `evaluate!` and `evaluate`

"""
evaluate!(mach,
resampling=CV(),
@@ -517,30 +539,65 @@ See the machine version `evaluate!` for the complete list of options.
evaluate(model::Supervised, args...; kwargs...) =
evaluate!(machine(model, args...); kwargs...)

const AbstractRow = Union{AbstractVector{<:Integer}, Colon}
const TrainTestPair = Tuple{AbstractRow,AbstractRow}
const TrainTestPairs = AbstractVector{<:TrainTestPair}

function _evaluate!(func::Function, res::CPU1, nfolds, verbosity)
p = Progress(nfolds + 1, dt=0, desc="Evaluating over $nfolds folds: ",
barglyphs=BarGlyphs("[=> ]"), barlen=25, color=:yellow)
verbosity > 0 && next!(p)
return reduce(vcat, (func(k, p, verbosity) for k in 1:nfolds))
# -------------------------------------------------------------------
# Resource-specific methods to distribute a function parameterized by
# fold number `k` over processes/threads.

# Here `func` is always going to be `get_measurements`; see later

# machines has only one element:
function _evaluate!(func, machines, ::CPU1, nfolds, channel)
generator = (begin
r = func(machines[1], k)
put!(channel, true)
r
end for k in 1:nfolds)
ret = reduce(vcat, generator)
put!(channel, false)
return ret
end
function _evaluate!(func::Function, res::CPUProcesses, nfolds, verbosity)
# TODO: use pmap here ?:
return @distributed vcat for k in 1:nfolds
func(k)

# machines has only one element:
function _evaluate!(func, machines, ::CPUProcesses, nfolds, channel)
ret = @distributed vcat for k in 1:nfolds
r = func(machines[1], k)
put!(channel, true)
r
end
put!(channel, false)
return ret
end

@static if VERSION >= v"1.3.0-DEV.573"
function _evaluate!(func::Function, res::CPUThreads, nfolds, verbosity)
task_vec = [Threads.@spawn func(k) for k in 1:nfolds]
return reduce(vcat, fetch.(task_vec))
# one machine for each thread; cycle through available threads:
function _evaluate!(func, machines, ::CPUThreads, nfolds, channel)
nthreads = Threads.nthreads()
tasks = map(1:nfolds) do k
Threads.@spawn begin
id = Threads.threadid()
if !haskey(machines, id)
machines[id] =
machine(machines[1].model, machines[1].args...)
end
r = func(machines[id], k)
put!(channel, true)
r
end
end
ret = reduce(vcat, fetch.(tasks))
put!(channel, false)
return ret
end
end
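
The threaded method above lazily clones the machine the first time each thread does work, keyed by `Threads.threadid()`, with thread 1 seeded up front. A self-contained sketch of that pattern, under assumed names (`state` standing in for the `machines` dictionary, a plain vector standing in for a machine):

# Illustrative sketch only; not part of the commit.
using Base.Threads

function threaded_collect(f, n)
    state = Dict{Int,Vector{Float64}}(1 => Float64[])  # seed thread 1 up front
    lk = ReentrantLock()            # guard Dict mutation across threads
    tasks = map(1:n) do k
        Threads.@spawn begin
            id = Threads.threadid()
            r = f(k)                # do the real work in parallel
            lock(lk) do             # serialize only the bookkeeping
                v = get!(state, id, Float64[])   # lazy per-thread slot
                push!(v, r)
            end
        end
    end
    foreach(wait, tasks)
    return reduce(vcat, values(state))  # order across threads is unspecified
end

The diff itself mutates `machines` without a lock, relying on each clone being created before any concurrent access; `Dict` is not thread-safe under concurrent mutation, so the lock above is a conservative variation.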

# Evaluation when resampling is a TrainTestPairs (core evaluator):
# ------------------------------------------------------------
# Core `evaluation` method, operating on train-test pairs

const AbstractRow = Union{AbstractVector{<:Integer}, Colon}
const TrainTestPair = Tuple{AbstractRow,AbstractRow}
const TrainTestPairs = AbstractVector{<:TrainTestPair}

# Evaluation when resampling is a TrainTestPairs (CORE EVALUATOR):
function evaluate!(mach::Machine, resampling, weights,
rows, verbosity, repeats,
measures, operation, acceleration, force)
@@ -559,7 +616,21 @@ function evaluate!(mach::Machine, resampling, weights,

nmeasures = length(measures)

function get_measurements(k)
# For multithreading we need a clone of `mach` for each thread
# doing work. These are instantiated as needed except for
# threadid=1.
machines = Dict(1 => mach)

# set up progress meter and a remote channel for communication
p = Progress(nfolds,
dt=0,
desc="Evaluating over $nfolds folds: ",
barglyphs=BarGlyphs("[=> ]"),
barlen=25,
color=:yellow)
channel = RemoteChannel(()->Channel{Bool}(nfolds) , 1)

function get_measurements(mach, k)
train, test = resampling[k]
fit!(mach; rows=train, verbosity=verbosity-1, force=force)
Xtest = selectrows(X, test)
@@ -572,22 +643,31 @@
yhat = operation(mach, Xtest)
return [value(m, yhat, Xtest, ytest, wtest)
for m in measures]
end
function get_measurements(k, p, verbosity) # p = progress meter
ret = get_measurements(k)
verbosity > 0 && next!(p)
return ret
put!(channel, true)
end

measurements_flat = if acceleration isa CPUProcesses
## TODO: progress meter for distributed case
if acceleration isa CPUProcesses
if verbosity > 0
@info "Distributing cross-validation computation " *
@info "Distributing evaluations " *
"among $(nworkers()) workers."
end
end
measurements_flat =
_evaluate!(get_measurements, acceleration, nfolds, verbosity)

@sync begin
# printing the progress bar
@async while take!(channel)
verbosity < 1 || next!(p)
end

@async global measurements_flat =
_evaluate!(get_measurements,
machines,
acceleration,
nfolds,
channel)
end

close(channel)

# in the following rows=folds, columns=measures:
measurements_matrix = permutedims(
@@ -628,14 +708,9 @@ function evaluate!(mach::Machine, resampling, weights,

end
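
The `@sync` block above pairs a consumer task, which advances the progress bar, with the evaluation task, which signals `true` on each completed fold and `false` once all folds are done. A standalone sketch of that handshake (assumed names; single-process here, though `RemoteChannel` also works across workers):

# Illustrative sketch only; not part of the commit.
using Distributed, ProgressMeter

function run_with_progress(work, nfolds)
    p = Progress(nfolds, dt=0, desc="Evaluating over $nfolds folds: ")
    channel = RemoteChannel(() -> Channel{Bool}(nfolds), 1)
    results = nothing
    @sync begin
        @async while take!(channel)     # consume `true` signals until `false`
            next!(p)
        end
        @async begin
            results = map(1:nfolds) do k
                r = work(k)
                put!(channel, true)     # one tick per completed fold
                r
            end
            put!(channel, false)        # tell the consumer to stop
        end
    end
    close(channel)
    return results
end

# e.g. run_with_progress(k -> (sleep(0.1); k^2), 6)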

function actual_rows(rows, N, verbosity)
unspecified_rows = (rows === nothing)
_rows = unspecified_rows ? (1:N) : rows
unspecified_rows || @info "Creating subsamples from a subset of all rows. "
return _rows
end
# ----------------------------------------------------------------
# Evaluation when `resampling` is a ResamplingStrategy

# Evaluation when resampling is a ResamplingStrategy:
function evaluate!(mach::Machine, resampling::ResamplingStrategy,
weights, rows, verbosity, repeats, args...)

@@ -652,7 +727,7 @@ function evaluate!(mach::Machine, resampling::ResamplingStrategy,

end


# ====================================================================
## RESAMPLER - A MODEL WRAPPER WITH `evaluate` OPERATION

"""
@@ -751,14 +826,20 @@ function MLJBase.fit(resampler::Resampler, verbosity::Int, args...)

end

# in special case of holdout, we can reuse the underlying model's
# machine, provided the training_fraction has not changed:
# in special case of non-shuffled, non-repeated holdout, we can reuse
# the underlying model's machine, provided the training_fraction has
# not changed:
function MLJBase.update(resampler::Resampler{Holdout},
verbosity::Int, fitresult, cache, args...)

old_mach, old_resampling = cache

if old_resampling.fraction_train == resampler.resampling.fraction_train
reusable = !resampler.resampling.shuffle &&
resampler.repeats == 1 &&
old_resampling.fraction_train ==
resampler.resampling.fraction_train

if reusable
mach = old_mach
else
mach = machine(resampler.model, args...)
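With these changes, evaluation can be multithreaded by passing a `CPUThreads()` resource. A hedged usage sketch (assumes a supervised `model` and data `X`, `y` are already defined, e.g. as in the tests below; the `acceleration` keyword matches the argument threaded through `evaluate!` in this diff):

# Illustrative sketch only; requires starting Julia with JULIA_NUM_THREADS > 1.
using MLJBase
using ComputationalResources   # provides CPU1(), CPUProcesses(), CPUThreads()

e = evaluate(model, X, y,
             resampling=CV(nfolds=6),
             measures=[misclassification_rate],
             acceleration=CPUThreads())
e.measurement
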
76 changes: 76 additions & 0 deletions test/measures/confusion_matrix.jl
@@ -0,0 +1,76 @@
using Test
using MLJBase
include(joinpath("..", "..", "test", "_models", "models.jl"))
using .Models

@testset "basics" begin
y = categorical(['m', 'f', 'n', 'f', 'm', 'n', 'n', 'm', 'f'])
ŷ = categorical(['f', 'f', 'm', 'f', 'n', 'm', 'n', 'm', 'f'])
l = levels(y) # f, m, n
cm = confmat(ŷ, y; warn=false)
e(l,i,j) = sum((ŷ .== l[i]) .& (y .== l[j]))
for i in 1:3, j in 1:3
@test cm[i,j] == e(l,i,j)
end
perm = [3, 1, 2]
l2 = l[perm]
cm2 = confmat(ŷ, y; perm=perm) # no warning because permutation is given
for i in 1:3, j in 1:3
@test cm2[i,j] == e(l2,i,j)
end
@test_logs (:warn, "The classes are un-ordered,\nusing order: ['f', 'm', 'n'].\nTo suppress this warning, consider coercing to OrderedFactor.") confmat(ŷ, y)
ŷc = coerce(ŷ, OrderedFactor)
yc = coerce(y, OrderedFactor)
@test confmat(ŷc, yc).mat == cm.mat

y = categorical(['a','b','a','b'])
ŷ = categorical(['b','b','a','a'])
@test_logs (:warn, "The classes are un-ordered,\nusing: negative='a' and positive='b'.\nTo suppress this warning, consider coercing to OrderedFactor.") confmat(ŷ, y)

# more tests for coverage
y = categorical([1,2,3,1,2,3,1,2,3])
ŷ = categorical([1,2,3,1,2,3,1,2,3])
@test_throws ArgumentError confmat(ŷ, y, rev=true)

# silly test for display
ŷ = coerce(y, OrderedFactor)
y = coerce(y, OrderedFactor)
iob = IOBuffer()
Base.show(iob, MIME("text/plain"), confmat(ŷ, y))
siob = String(take!(iob))
@test strip(siob) == strip("""
┌─────────────────────────────────────────┐
│ Ground Truth │
┌─────────────┼─────────────┬─────────────┬─────────────┤
│ Predicted │ 1 │ 2 │ 3 │
├─────────────┼─────────────┼─────────────┼─────────────┤
│ 1 │ 3 │ 0 │ 0 │
├─────────────┼─────────────┼─────────────┼─────────────┤
│ 2 │ 0 │ 3 │ 0 │
├─────────────┼─────────────┼─────────────┼─────────────┤
│ 3 │ 0 │ 0 │ 3 │
└─────────────┴─────────────┴─────────────┴─────────────┘""")

end

@testset "confmat as measure" begin

@test info(confmat).orientation == :other
model = DeterministicConstantClassifier()

X = (x=rand(10),)
long = categorical(collect("abbaacaabbbbababcbac"), ordered=true)
y = long[1:10]
yhat = long[11:20]

@test confmat(yhat, y).mat == [1 2 0; 3 1 1; 1 1 0]

MLJBase.value(confmat, yhat, X, y, nothing)

e = evaluate(model, X, y,
measures=[misclassification_rate, confmat],
resampling=Holdout(fraction_train=0.5))
cm = e.measurement[2]
@test cm.labels == ["a", "b", "c"]
@test cm.mat == [2 2 1; 0 0 0; 0 0 0]
end