-
Notifications
You must be signed in to change notification settings - Fork 18
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adaptive proposals #39
base: master
Are you sure you want to change the base?
Changes from all commits
e9fd602
5f0ddfa
f42b784
a188780
d8989aa
565f12a
cc16195
802ec67
c7623c4
a33937e
4007bd0
71e010b
2d59ede
279aea7
93f17c5
16715e1
046c21b
b91fcc0
387eff4
8fb1048
4071675
a63262d
afd3ed1
fe8562c
f66f647
2988ff2
e5ad041
7aa8631
4999349
d4b3f6b
cb52c7f
5a2a175
b2be967
518aab1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
""" | ||
Adaptor(; tune=25, target=0.44, bound=10., δmax=0.2) | ||
|
||
A helper struct for univariate adaptive proposal kernels. This tracks the | ||
number of accepted proposals and the total number of attempted proposals. The | ||
proposal kernel is tuned every `tune` proposals, such that the scale (log(σ) in | ||
the case of a Normal kernel, log(b) for a Uniform kernel) of the proposal is | ||
increased (decreased) by `δ(n) = min(δmax, 1/√n)` at tuning step `n` if the | ||
estimated acceptance probability is higher (lower) than `target`. The target | ||
acceptance probability defaults to 0.44 which is supposedly optimal for 1D | ||
proposals. To ensure ergodicity, the scale of the proposal has to be bounded | ||
(by `bound`), although this is often not required in practice. | ||
""" | ||
mutable struct Adaptor | ||
accepted::Int | ||
total::Int | ||
tune::Int # tuning interval | ||
target::Float64 # target acceptance rate | ||
bound::Float64 # bound on logσ of Gaussian kernel | ||
δmax::Float64 # maximum adaptation step | ||
end | ||
|
||
function Adaptor(; tune=25, target=0.44, bound=10., δmax=0.2) | ||
return Adaptor(0, 0, tune, target, bound, δmax) | ||
end | ||
|
||
""" | ||
AdaptiveProposal{P} | ||
|
||
An adaptive Metropolis-Hastings proposal. In order for this to work, the | ||
proposal kernel should implement the `adapted(proposal, δ)` method, where `δ` | ||
is the increment/decrement applied to the scale of the proposal distribution | ||
during adaptation (e.g. for a Normal distribution the scale is `log(σ)`, so | ||
that after adaptation the proposal is `Normal(0, exp(log(σ) + δ))`). | ||
|
||
# Example | ||
```julia | ||
julia> p = AdaptiveProposal(Uniform(-0.2, 0.2)); | ||
|
||
julia> rand(p) | ||
0.07975590594518434 | ||
``` | ||
|
||
# References | ||
|
||
Roberts, Gareth O., and Jeffrey S. Rosenthal. "Examples of adaptive MCMC." | ||
Journal of Computational and Graphical Statistics 18.2 (2009): 349-367. | ||
""" | ||
mutable struct AdaptiveProposal{P} <: Proposal{P} | ||
proposal::P | ||
adaptor::Adaptor | ||
end | ||
|
||
function AdaptiveProposal(p; kwargs...) | ||
return AdaptiveProposal(p, Adaptor(; kwargs...)) | ||
end | ||
|
||
# Adaptive proposals are only defined for symmetric proposal distributions | ||
is_symmetric_proposal(::AdaptiveProposal) = true | ||
|
||
accepted!(p::AdaptiveProposal) = p.adaptor.accepted += 1 | ||
accepted!(p::Vector{<:AdaptiveProposal}) = map(accepted!, p) | ||
accepted!(p::NamedTuple{names}) where names = map(x->accepted!(getfield(p, x)), names) | ||
|
||
# this is defined because the first draw has no transition yet (I think) | ||
function propose(rng::Random.AbstractRNG, p::AdaptiveProposal, m::DensityModel) | ||
return rand(rng, p.proposal) | ||
end | ||
|
||
# the actual proposal happens here | ||
function propose( | ||
rng::Random.AbstractRNG, | ||
proposal::AdaptiveProposal{<:Union{Distribution,Proposal}}, | ||
model::DensityModel, | ||
t | ||
) | ||
consider_adaptation!(proposal) | ||
return t + rand(rng, proposal.proposal) | ||
end | ||
|
||
function q(proposal::AdaptiveProposal, t, t_cond) | ||
return logpdf(proposal, t - t_cond) | ||
end | ||
|
||
Comment on lines
+81
to
+84
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this actually needed? It seems like the default for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Currently there seems to be no default |
||
function consider_adaptation!(p) | ||
(p.adaptor.total % p.adaptor.tune == 0) && adapt!(p) | ||
p.adaptor.total += 1 | ||
end | ||
|
||
function adapt!(p::AdaptiveProposal) | ||
a = p.adaptor | ||
a.total == 0 && return | ||
δ = min(a.δmax, sqrt(a.tune / a.total)) # diminishing adaptation | ||
α = a.accepted / a.tune # acceptance ratio | ||
p_ = adapted(p.proposal, α > a.target ? δ : -δ, a.bound) | ||
a.accepted = 0 | ||
p.proposal = p_ | ||
end | ||
|
||
function adapted(d::Normal, δ, bound=Inf) | ||
_lσ = log(d.σ) + δ | ||
lσ = sign(_lσ) * min(bound, abs(_lσ)) | ||
return Normal(d.μ, exp(lσ)) | ||
end | ||
|
||
function adapted(d::Uniform, δ, bound=Inf) | ||
lσ = log(d.b) + δ | ||
σ = exp(sign(lσ) * min(bound, abs(lσ))) | ||
return Uniform(-σ, σ) | ||
end |
Original file line number | Diff line number | Diff line change | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,82 @@ | ||||||||||||
""" | ||||||||||||
AdaptiveMvNormal(constant_component::MvNormal; σ=2.38, β=0.05) | ||||||||||||
|
||||||||||||
Adaptive multivariate normal mixture proposal as described in Haario et al. and | ||||||||||||
Roberts & Rosenthal (2009). Uses a two-component mixture of MvNormal | ||||||||||||
distributions. One of the components (with mixture weight `β`) remains | ||||||||||||
constant, while the other component is adapted to the target covariance | ||||||||||||
structure. The proposal is initialized by providing the constant component to | ||||||||||||
the constructor. | ||||||||||||
|
||||||||||||
`σ` is the scale factor for the covariance matrix, where 2.38 is supposedly | ||||||||||||
optimal in a high-dimensional context according to Roberts & Rosenthal. | ||||||||||||
|
||||||||||||
# References | ||||||||||||
|
||||||||||||
- Haario, Heikki, Eero Saksman, and Johanna Tamminen. | ||||||||||||
"An adaptive Metropolis algorithm." Bernoulli 7.2 (2001): 223-242. | ||||||||||||
- Roberts, Gareth O., and Jeffrey S. Rosenthal. "Examples of adaptive MCMC." | ||||||||||||
Journal of Computational and Graphical Statistics 18.2 (2009): 349-367. | ||||||||||||
""" | ||||||||||||
mutable struct AdaptiveMvNormal{T1,T2,V} <: Proposal{T1} | ||||||||||||
d::Int # dimensionality | ||||||||||||
n::Int # iteration | ||||||||||||
β::Float64 # constant component mixture weight | ||||||||||||
σ::Float64 # scale factor for adapted covariance matrix | ||||||||||||
constant::T1 | ||||||||||||
adaptive::T2 | ||||||||||||
Ex::Vector{V} # rolling mean vector | ||||||||||||
EX::Matrix{V} # scatter matrix of previous draws | ||||||||||||
end | ||||||||||||
|
||||||||||||
function AdaptiveMvNormal(dist::MvNormal; σ=2.38, β=0.05) | ||||||||||||
n = length(dist) | ||||||||||||
adaptive = MvNormal(cov(dist)) | ||||||||||||
AdaptiveMvNormal(n, -1, β, σ, dist, adaptive, zeros(n), zeros(n,n)) | ||||||||||||
end | ||||||||||||
|
||||||||||||
is_symmetric_proposal(::AdaptiveMvNormal) = true | ||||||||||||
|
||||||||||||
""" | ||||||||||||
adapt!(p::AdaptiveMvNormal, x::AbstractVector) | ||||||||||||
|
||||||||||||
Adaptation for the adaptive multivariate normal mixture proposal as described | ||||||||||||
in Haario et al. (2001) and Roberts & Rosenthal (2009). Will perform an online | ||||||||||||
estimation of the target covariance matrix and mean. The code for this routine | ||||||||||||
is largely based on `Mamba.jl`. | ||||||||||||
""" | ||||||||||||
function adapt!(p::AdaptiveMvNormal, x::AbstractVector) | ||||||||||||
p.n += 1 | ||||||||||||
# adapt mean vector and scatter matrix | ||||||||||||
f = p.n / (p.n + 1) | ||||||||||||
p.Ex = f * p.Ex + (1 - f) * x | ||||||||||||
p.EX = f * p.EX + (1 - f) * x * x' | ||||||||||||
# compute adapted covariance matrix | ||||||||||||
Σ = (p.σ^2 / (p.d * f)) * (p.EX - p.Ex * p.Ex') | ||||||||||||
F = cholesky(Hermitian(Σ), check=false) | ||||||||||||
if rank(F.L) == p.d | ||||||||||||
p.adaptive = MvNormal(PDMat(Σ, F)) | ||||||||||||
end | ||||||||||||
Comment on lines
+56
to
+59
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also here, can't we just use the default constructors of
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I implemented it this way (inspired by the Mamba implementation) to check whether the empirical estimate of the covariance matrix is a proper covariance matrix. If it is we can adapt the proposal distribution without additional computation by feeding the cholesky decomposition to MvNormal this way. How would you do it otherwise, I guess you could use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, I don't think a
IMO the first alternative would clearly be the best, if possible. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I just had a quick look at Haario et al. and the update formula of the covariance matrix in eq (3). It seems that indeed the update could be performed by only I think this would be the preferable method for updating the Cholesky decomposition. Probably it would also be cheaper to save just the Cholesky decomposition and not wrap it in a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That sounds interesting and worthwhile, have to look at it. FWIW I just based my code on the description in the Roberts & Rosenthal paper and the Mamba implementation, since I simply wanted to use this for a problem I'm having. I had the impression from the code that there is a preference to use Distributions.jl distributions for sampling, that's why I wrapped them in MvNormal objects instead of using the Cholesky factor directly (when both the full matrix and cholesky factors are available, it is as cheap to put them in an MvNormal). When updating the cholesky factor directly I agree it would be better to to do as you suggest. |
||||||||||||
end | ||||||||||||
|
||||||||||||
function Base.rand(rng::Random.AbstractRNG, p::AdaptiveMvNormal) | ||||||||||||
return if p.n > 2 * p.d | ||||||||||||
p.β * rand(rng, p.constant) + (1 - p.β) * rand(rng, p.adaptive) | ||||||||||||
else | ||||||||||||
rand(rng, p.constant) | ||||||||||||
end | ||||||||||||
end | ||||||||||||
|
||||||||||||
function propose(rng::Random.AbstractRNG, proposal::AdaptiveMvNormal, m::DensityModel) | ||||||||||||
return rand(rng, proposal) | ||||||||||||
end | ||||||||||||
|
||||||||||||
Comment on lines
+70
to
+73
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this not covered by the default for |
||||||||||||
function propose( | ||||||||||||
rng::Random.AbstractRNG, | ||||||||||||
proposal::AdaptiveMvNormal, | ||||||||||||
model::DensityModel, | ||||||||||||
t | ||||||||||||
) | ||||||||||||
adapt!(proposal, t) | ||||||||||||
return t + rand(rng, proposal) | ||||||||||||
end |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -103,4 +103,4 @@ function q( | |
t_cond | ||
) | ||
return q(proposal(t_cond), t, t_cond) | ||
end | ||
end |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,7 +18,7 @@ | |
Random.seed!(100) | ||
sampler = Ensemble(1_000, StretchProposal([InverseGamma(2, 3), Normal(0, 1)])) | ||
chain = sample(model, sampler, 1_000; | ||
param_names = ["s", "m"], chain_type = Chains) | ||
param_names = ["s", "m"], chain_type = Chains, progress = false) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
|
||
@test mean(chain["s"]) ≈ 49/24 atol=0.1 | ||
@test mean(chain["m"]) ≈ 7/6 atol=0.1 | ||
|
@@ -43,7 +43,7 @@ | |
Random.seed!(100) | ||
sampler = Ensemble(1_000, StretchProposal(MvNormal(2, 1))) | ||
chain = sample(model, sampler, 1_000; | ||
param_names = ["logs", "m"], chain_type = Chains) | ||
param_names = ["logs", "m"], chain_type = Chains, progress = false) | ||
|
||
@test mean(exp, chain["logs"]) ≈ 49/24 atol=0.1 | ||
@test mean(chain["m"]) ≈ 7/6 atol=0.1 | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If this is the case, it would be good to check this property in an inner constructor.